agentshield-sdk 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/CHANGELOG.md +191 -0
  2. package/LICENSE +21 -0
  3. package/README.md +975 -0
  4. package/bin/agent-shield.js +680 -0
  5. package/package.json +118 -0
  6. package/src/adaptive.js +330 -0
  7. package/src/agent-protocol.js +998 -0
  8. package/src/alert-tuning.js +480 -0
  9. package/src/allowlist.js +603 -0
  10. package/src/audit-immutable.js +914 -0
  11. package/src/audit-streaming.js +469 -0
  12. package/src/badges.js +196 -0
  13. package/src/behavior-profiling.js +289 -0
  14. package/src/benchmark-harness.js +804 -0
  15. package/src/canary.js +271 -0
  16. package/src/certification.js +563 -0
  17. package/src/circuit-breaker.js +321 -0
  18. package/src/compliance.js +617 -0
  19. package/src/confidence-tuning.js +324 -0
  20. package/src/confused-deputy.js +624 -0
  21. package/src/context-scoring.js +360 -0
  22. package/src/conversation.js +494 -0
  23. package/src/cost-optimizer.js +1024 -0
  24. package/src/ctf.js +462 -0
  25. package/src/detector-core.js +1999 -0
  26. package/src/distributed.js +359 -0
  27. package/src/document-scanner.js +795 -0
  28. package/src/embedding.js +307 -0
  29. package/src/encoding.js +429 -0
  30. package/src/enterprise.js +405 -0
  31. package/src/errors.js +100 -0
  32. package/src/eu-ai-act.js +523 -0
  33. package/src/fuzzer.js +764 -0
  34. package/src/honeypot.js +328 -0
  35. package/src/i18n-patterns.js +523 -0
  36. package/src/index.js +430 -0
  37. package/src/integrations.js +528 -0
  38. package/src/llm-redteam.js +670 -0
  39. package/src/main.js +741 -0
  40. package/src/main.mjs +38 -0
  41. package/src/mcp-bridge.js +542 -0
  42. package/src/mcp-certification.js +846 -0
  43. package/src/mcp-sdk-integration.js +355 -0
  44. package/src/mcp-security-runtime.js +741 -0
  45. package/src/mcp-server.js +740 -0
  46. package/src/middleware.js +208 -0
  47. package/src/model-finetuning.js +884 -0
  48. package/src/model-fingerprint.js +1042 -0
  49. package/src/multi-agent-trust.js +453 -0
  50. package/src/multi-agent.js +404 -0
  51. package/src/multimodal.js +296 -0
  52. package/src/nist-mapping.js +505 -0
  53. package/src/observability.js +330 -0
  54. package/src/openclaw.js +450 -0
  55. package/src/otel.js +544 -0
  56. package/src/owasp-2025.js +483 -0
  57. package/src/pii.js +390 -0
  58. package/src/plugin-marketplace.js +628 -0
  59. package/src/plugin-system.js +349 -0
  60. package/src/policy-dsl.js +775 -0
  61. package/src/policy-extended.js +635 -0
  62. package/src/policy.js +443 -0
  63. package/src/presets.js +409 -0
  64. package/src/production.js +557 -0
  65. package/src/prompt-leakage.js +321 -0
  66. package/src/rag-vulnerability.js +579 -0
  67. package/src/redteam.js +475 -0
  68. package/src/response-handler.js +429 -0
  69. package/src/scanners.js +357 -0
  70. package/src/self-healing.js +363 -0
  71. package/src/semantic.js +339 -0
  72. package/src/shield-score.js +250 -0
  73. package/src/sso-saml.js +897 -0
  74. package/src/stream-scanner.js +806 -0
  75. package/src/testing.js +505 -0
  76. package/src/threat-encyclopedia.js +629 -0
  77. package/src/threat-intel-network.js +1017 -0
  78. package/src/token-analysis.js +467 -0
  79. package/src/tool-guard.js +412 -0
  80. package/src/tool-output-validator.js +354 -0
  81. package/src/utils.js +83 -0
  82. package/src/watermark.js +235 -0
  83. package/src/worker-scanner.js +601 -0
  84. package/types/index.d.ts +2088 -0
@@ -0,0 +1,804 @@
1
'use strict';

/**
 * Agent Shield — Standardized Benchmark Harness
 *
 * Provides a reproducible framework for evaluating detection engines.
 * Supports dataset loading (JSON, BIPIA, Garak formats), metric computation
 * (precision, recall, F1, MCC, per-category breakdowns), regression tracking,
 * and multi-engine comparison.
 *
 * All processing is local — no data leaves the environment.
 *
 * @module benchmark-harness
 */

const fs = require('fs');
const path = require('path');

// =========================================================================
// CONSTANTS
// =========================================================================

/** Required fields for each dataset entry (see DatasetLoader.validate). */
const REQUIRED_ENTRY_FIELDS = ['id', 'text', 'category', 'expected_detection', 'severity', 'difficulty'];

/** Valid severity levels for a dataset entry. */
const VALID_SEVERITIES = ['critical', 'high', 'medium', 'low'];

/** Valid difficulty levels for a dataset entry. */
const VALID_DIFFICULTIES = ['easy', 'medium', 'hard'];

/** Default F1 regression threshold (absolute drop, e.g. 0.02 = 2 points). */
const DEFAULT_F1_REGRESSION_THRESHOLD = 0.02;

/** Default latency regression threshold (relative increase, e.g. 0.20 = +20%). */
const DEFAULT_LATENCY_REGRESSION_THRESHOLD = 0.20;
38
// =========================================================================
// DatasetLoader
// =========================================================================

/**
 * Validates and loads benchmark datasets from various formats.
 */
class DatasetLoader {
  /**
   * Load a dataset from a JSON file.
   * Accepts either a bare array of entries, or an object with an
   * `entries` (or legacy `dataset`) array plus an optional `meta` object.
   * @param {string} filePath — absolute or relative path to the JSON dataset
   * @returns {{ entries: Array<Object>, meta: Object }}
   * @throws {Error} if the file is missing, is not valid JSON, or fails validation
   */
  load(filePath) {
    const resolved = path.resolve(filePath);
    if (!fs.existsSync(resolved)) {
      throw new Error(`[Agent Shield] Dataset file not found: ${resolved}`);
    }
    const raw = fs.readFileSync(resolved, 'utf-8');
    let data;
    try {
      data = JSON.parse(raw);
    } catch (err) {
      throw new Error(`[Agent Shield] Invalid JSON in dataset: ${err.message}`);
    }

    const entries = Array.isArray(data) ? data : (data.entries || data.dataset || []);
    const meta = Array.isArray(data) ? {} : (data.meta || {});

    this.validate(entries);
    return { entries, meta };
  }

  /**
   * Validate the structure of dataset entries.
   * @param {Array<Object>} entries
   * @throws {Error} if any entry is missing required fields or has invalid values
   */
  validate(entries) {
    if (!Array.isArray(entries) || entries.length === 0) {
      throw new Error('[Agent Shield] Dataset must be a non-empty array of entries');
    }

    for (let i = 0; i < entries.length; i++) {
      const entry = entries[i];
      // Guard against null/primitive entries: `field in entry` below would
      // otherwise throw a raw TypeError instead of a descriptive error.
      if (entry === null || typeof entry !== 'object') {
        throw new Error(`[Agent Shield] Entry ${i} must be an object, got ${entry === null ? 'null' : typeof entry}`);
      }
      for (const field of REQUIRED_ENTRY_FIELDS) {
        if (!(field in entry)) {
          throw new Error(`[Agent Shield] Entry ${i} missing required field: "${field}"`);
        }
      }
      if (typeof entry.expected_detection !== 'boolean') {
        throw new Error(`[Agent Shield] Entry ${i} (${entry.id}): expected_detection must be boolean`);
      }
      if (!VALID_SEVERITIES.includes(entry.severity)) {
        throw new Error(`[Agent Shield] Entry ${i} (${entry.id}): invalid severity "${entry.severity}"`);
      }
      if (!VALID_DIFFICULTIES.includes(entry.difficulty)) {
        throw new Error(`[Agent Shield] Entry ${i} (${entry.id}): invalid difficulty "${entry.difficulty}"`);
      }
    }
  }

  /**
   * Convert BIPIA-format entries to the standard format.
   * BIPIA entries have: { prompt, label, attack_type }
   * Missing severity/difficulty default to 'high'/'medium'.
   * @param {Array<Object>} entries
   * @returns {Array<Object>} entries in the standard schema
   */
  fromBIPIA(entries) {
    if (!Array.isArray(entries)) {
      throw new Error('[Agent Shield] BIPIA entries must be an array');
    }
    return entries.map((e, i) => ({
      id: e.id || `bipia-${i}`,
      text: e.prompt || e.text || '',
      category: e.attack_type || 'prompt_injection',
      // BIPIA labels appear as 'attack', 1, or true depending on export.
      expected_detection: e.label === 'attack' || e.label === 1 || e.label === true,
      severity: e.severity || 'high',
      difficulty: e.difficulty || 'medium'
    }));
  }

  /**
   * Convert Garak-format entries to the standard format.
   * Garak entries have: { prompt, detector_results, probe_name }
   * Missing severity/difficulty default to 'medium'.
   * @param {Array<Object>} entries
   * @returns {Array<Object>} entries in the standard schema
   */
  fromGarak(entries) {
    if (!Array.isArray(entries)) {
      throw new Error('[Agent Shield] Garak entries must be an array');
    }
    return entries.map((e, i) => ({
      id: e.id || `garak-${i}`,
      text: e.prompt || e.text || '',
      category: e.probe_name || 'unknown',
      // A 'fail' detector result in Garak means the attack succeeded,
      // i.e. a detection is expected here.
      expected_detection: e.detector_results === 'fail' || e.expected_detection === true,
      severity: e.severity || 'medium',
      difficulty: e.difficulty || 'medium'
    }));
  }
}
140
// =========================================================================
// BenchmarkMetrics
// =========================================================================

/**
 * Computes classification metrics from benchmark results.
 */
class BenchmarkMetrics {
  /**
   * Compute comprehensive metrics from benchmark results.
   * @param {Array<Object>} results — array of { entry, detected, expected, latencyMs }
   * @returns {Object} metrics: { total, precision, recall, f1, accuracy, mcc,
   *   confusionMatrix, perCategory, perDifficulty, latency }
   */
  compute(results) {
    let tp = 0, fp = 0, tn = 0, fn = 0;
    const latencies = [];
    const perCategory = {};
    const perDifficulty = { easy: { tp: 0, fp: 0, tn: 0, fn: 0 }, medium: { tp: 0, fp: 0, tn: 0, fn: 0 }, hard: { tp: 0, fp: 0, tn: 0, fn: 0 } };

    for (const r of results) {
      const expected = r.expected;
      const detected = r.detected;
      const cat = r.entry.category;
      const diff = r.entry.difficulty;

      if (!perCategory[cat]) {
        perCategory[cat] = { tp: 0, fp: 0, tn: 0, fn: 0 };
      }

      // Bucket the outcome into the global, per-category and per-difficulty
      // confusion matrices. Unknown difficulties are skipped (guarded below).
      if (expected && detected) {
        tp++;
        perCategory[cat].tp++;
        if (perDifficulty[diff]) perDifficulty[diff].tp++;
      } else if (!expected && detected) {
        fp++;
        perCategory[cat].fp++;
        if (perDifficulty[diff]) perDifficulty[diff].fp++;
      } else if (!expected && !detected) {
        tn++;
        perCategory[cat].tn++;
        if (perDifficulty[diff]) perDifficulty[diff].tn++;
      } else {
        fn++;
        perCategory[cat].fn++;
        if (perDifficulty[diff]) perDifficulty[diff].fn++;
      }

      if (typeof r.latencyMs === 'number') {
        latencies.push(r.latencyMs);
      }
    }

    // All ratios guard against zero denominators (empty or one-sided runs).
    const precision = (tp + fp) > 0 ? tp / (tp + fp) : 0;
    const recall = (tp + fn) > 0 ? tp / (tp + fn) : 0;
    const f1 = (precision + recall) > 0 ? (2 * precision * recall) / (precision + recall) : 0;
    const accuracy = results.length > 0 ? (tp + tn) / results.length : 0;
    const mcc = this._computeMCC(tp, tn, fp, fn);

    const confusionMatrix = { tp, fp, tn, fn };

    // Per-category metrics
    const perCategoryMetrics = {};
    for (const [cat, cm] of Object.entries(perCategory)) {
      perCategoryMetrics[cat] = this._metricsFromCM(cm);
    }

    // Per-difficulty metrics — only difficulties that actually occurred.
    const perDifficultyMetrics = {};
    for (const [diff, cm] of Object.entries(perDifficulty)) {
      if (cm.tp + cm.fp + cm.tn + cm.fn > 0) {
        perDifficultyMetrics[diff] = this._metricsFromCM(cm);
      }
    }

    // Latency stats
    const latency = this._computeLatencyStats(latencies);

    return {
      total: results.length,
      precision,
      recall,
      f1,
      accuracy,
      mcc,
      confusionMatrix,
      perCategory: perCategoryMetrics,
      perDifficulty: perDifficultyMetrics,
      latency
    };
  }

  /**
   * Compute Matthews Correlation Coefficient.
   * MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
   * Returns 0 when any marginal is empty (denominator would be 0).
   * @param {number} tp
   * @param {number} tn
   * @param {number} fp
   * @param {number} fn
   * @returns {number}
   * @private
   */
  _computeMCC(tp, tn, fp, fn) {
    const numerator = (tp * tn) - (fp * fn);
    const denominator = Math.sqrt(
      (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    );
    if (denominator === 0) return 0;
    return numerator / denominator;
  }

  /**
   * Derive precision/recall/f1 from a confusion matrix bucket.
   * The raw counts are spread into the result for downstream reporting.
   * @param {{ tp: number, fp: number, tn: number, fn: number }} cm
   * @returns {{ precision: number, recall: number, f1: number, total: number }}
   * @private
   */
  _metricsFromCM(cm) {
    const precision = (cm.tp + cm.fp) > 0 ? cm.tp / (cm.tp + cm.fp) : 0;
    const recall = (cm.tp + cm.fn) > 0 ? cm.tp / (cm.tp + cm.fn) : 0;
    const f1 = (precision + recall) > 0 ? (2 * precision * recall) / (precision + recall) : 0;
    return { precision, recall, f1, total: cm.tp + cm.fp + cm.tn + cm.fn, ...cm };
  }

  /**
   * Compute latency statistics from an array of timings.
   * All values are rounded to 2 decimal places. Percentiles use the
   * floor-index (nearest-rank) method on the sorted sample.
   * @param {number[]} latencies
   * @returns {{ mean: number, median: number, p95: number, p99: number, min: number, max: number }}
   * @private
   */
  _computeLatencyStats(latencies) {
    if (latencies.length === 0) {
      return { mean: 0, median: 0, p95: 0, p99: 0, min: 0, max: 0 };
    }
    const sorted = [...latencies].sort((a, b) => a - b);
    const sum = sorted.reduce((a, b) => a + b, 0);
    const mean = sum / sorted.length;
    const mid = Math.floor(sorted.length / 2);
    // Proper sample median: average the two middle values for even-length
    // samples (previously the upper-middle element alone was returned).
    const median = sorted.length % 2 === 0
      ? (sorted[mid - 1] + sorted[mid]) / 2
      : sorted[mid];
    const p95 = sorted[Math.floor(sorted.length * 0.95)];
    const p99 = sorted[Math.floor(sorted.length * 0.99)];
    return {
      mean: Math.round(mean * 100) / 100,
      median: Math.round(median * 100) / 100,
      p95: Math.round(p95 * 100) / 100,
      p99: Math.round(p99 * 100) / 100,
      min: Math.round(sorted[0] * 100) / 100,
      max: Math.round(sorted[sorted.length - 1] * 100) / 100
    };
  }
}
290
// =========================================================================
// RegressionTracker
// =========================================================================

/**
 * Tracks performance baselines and detects regressions between runs.
 */
class RegressionTracker {
  /**
   * @param {Object} [options]
   * @param {number} [options.f1Threshold=0.02] — F1 drop threshold to flag regression
   * @param {number} [options.latencyThreshold=0.20] — latency increase ratio threshold
   */
  constructor(options = {}) {
    // `??` (not `||`) so an explicit threshold of 0 ("flag any drop") is
    // honoured instead of being silently replaced by the default.
    this.f1Threshold = options.f1Threshold ?? DEFAULT_F1_REGRESSION_THRESHOLD;
    this.latencyThreshold = options.latencyThreshold ?? DEFAULT_LATENCY_REGRESSION_THRESHOLD;
  }

  /**
   * Save a baseline to a JSON file (creates parent directories as needed).
   * @param {Object} metrics — output from BenchmarkMetrics.compute()
   * @param {string} filePath — path to write the baseline
   */
  saveBaseline(metrics, filePath) {
    const resolved = path.resolve(filePath);
    const dir = path.dirname(resolved);
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }
    const baseline = {
      timestamp: new Date().toISOString(),
      metrics
    };
    fs.writeFileSync(resolved, JSON.stringify(baseline, null, 2), 'utf-8');
  }

  /**
   * Load a baseline from a JSON file.
   * @param {string} filePath
   * @returns {Object} baseline object with { timestamp, metrics }
   * @throws {Error} if the file does not exist
   */
  loadBaseline(filePath) {
    const resolved = path.resolve(filePath);
    if (!fs.existsSync(resolved)) {
      throw new Error(`[Agent Shield] Baseline file not found: ${resolved}`);
    }
    const raw = fs.readFileSync(resolved, 'utf-8');
    return JSON.parse(raw);
  }

  /**
   * Compare current metrics against a baseline and flag regressions.
   * Checks overall F1/precision/recall/MCC (absolute drop vs f1Threshold),
   * mean latency (relative increase vs latencyThreshold), and per-category F1.
   * @param {Object} current — current metrics from BenchmarkMetrics.compute()
   * @param {Object} baseline — baseline object from loadBaseline() or { metrics }
   * @returns {{ passed: boolean, regressions: Array<Object>, improvements: Array<Object> }}
   */
  compare(current, baseline) {
    // Accept either a saved baseline ({ timestamp, metrics }) or bare metrics.
    const baseMetrics = baseline.metrics || baseline;
    const regressions = [];
    const improvements = [];

    // F1 score check — the only metric that also reports improvements.
    const f1Delta = current.f1 - baseMetrics.f1;
    if (f1Delta < -this.f1Threshold) {
      regressions.push({
        metric: 'f1',
        baseline: baseMetrics.f1,
        current: current.f1,
        delta: f1Delta,
        message: `F1 dropped by ${Math.abs(f1Delta).toFixed(4)} (threshold: ${this.f1Threshold})`
      });
    } else if (f1Delta > this.f1Threshold) {
      improvements.push({
        metric: 'f1',
        baseline: baseMetrics.f1,
        current: current.f1,
        delta: f1Delta,
        message: `F1 improved by ${f1Delta.toFixed(4)}`
      });
    }

    // Precision check
    const precDelta = current.precision - baseMetrics.precision;
    if (precDelta < -this.f1Threshold) {
      regressions.push({
        metric: 'precision',
        baseline: baseMetrics.precision,
        current: current.precision,
        delta: precDelta,
        message: `Precision dropped by ${Math.abs(precDelta).toFixed(4)}`
      });
    }

    // Recall check
    const recDelta = current.recall - baseMetrics.recall;
    if (recDelta < -this.f1Threshold) {
      regressions.push({
        metric: 'recall',
        baseline: baseMetrics.recall,
        current: current.recall,
        delta: recDelta,
        message: `Recall dropped by ${Math.abs(recDelta).toFixed(4)}`
      });
    }

    // MCC check — older baselines may predate MCC, hence the type guard.
    if (typeof baseMetrics.mcc === 'number') {
      const mccDelta = current.mcc - baseMetrics.mcc;
      if (mccDelta < -this.f1Threshold) {
        regressions.push({
          metric: 'mcc',
          baseline: baseMetrics.mcc,
          current: current.mcc,
          delta: mccDelta,
          message: `MCC dropped by ${Math.abs(mccDelta).toFixed(4)}`
        });
      }
    }

    // Latency check (mean, relative increase); skipped when no timing data.
    if (baseMetrics.latency && baseMetrics.latency.mean > 0 && current.latency) {
      const latencyRatio = (current.latency.mean - baseMetrics.latency.mean) / baseMetrics.latency.mean;
      if (latencyRatio > this.latencyThreshold) {
        regressions.push({
          metric: 'latency_mean',
          baseline: baseMetrics.latency.mean,
          current: current.latency.mean,
          delta: latencyRatio,
          message: `Mean latency increased by ${(latencyRatio * 100).toFixed(1)}% (threshold: ${(this.latencyThreshold * 100).toFixed(0)}%)`
        });
      }
    }

    // Per-category regression check (categories missing from the current
    // run are skipped rather than treated as regressions).
    if (baseMetrics.perCategory && current.perCategory) {
      for (const [cat, baseStats] of Object.entries(baseMetrics.perCategory)) {
        if (current.perCategory[cat]) {
          const catF1Delta = current.perCategory[cat].f1 - baseStats.f1;
          if (catF1Delta < -this.f1Threshold) {
            regressions.push({
              metric: `category:${cat}:f1`,
              baseline: baseStats.f1,
              current: current.perCategory[cat].f1,
              delta: catF1Delta,
              message: `Category "${cat}" F1 dropped by ${Math.abs(catF1Delta).toFixed(4)}`
            });
          }
        }
      }
    }

    return {
      passed: regressions.length === 0,
      regressions,
      improvements
    };
  }
}
449
// =========================================================================
// BenchmarkReportGenerator
// =========================================================================

/**
 * Formats benchmark results into text, JSON, and markdown reports.
 *
 * Pure formatting: each method takes a metrics object produced by
 * BenchmarkMetrics.compute() and returns a string; nothing is written
 * to disk and no input is mutated.
 */
class BenchmarkReportGenerator {
  /**
   * Generate a plain text report.
   * @param {Object} metrics — output from BenchmarkMetrics.compute()
   * @param {Object} [options]
   * @param {string} [options.title='Benchmark Report']
   * @returns {string}
   */
  text(metrics, options = {}) {
    const title = options.title || 'Benchmark Report';
    const lines = [];
    const sep = '='.repeat(60);

    lines.push(sep);
    lines.push(` ${title}`);
    lines.push(sep);
    lines.push('');
    lines.push(` Total entries: ${metrics.total}`);
    lines.push(` Precision: ${(metrics.precision * 100).toFixed(2)}%`);
    lines.push(` Recall: ${(metrics.recall * 100).toFixed(2)}%`);
    lines.push(` F1 Score: ${(metrics.f1 * 100).toFixed(2)}%`);
    lines.push(` Accuracy: ${(metrics.accuracy * 100).toFixed(2)}%`);
    lines.push(` MCC: ${metrics.mcc.toFixed(4)}`);
    lines.push('');

    // Confusion matrix
    const cm = metrics.confusionMatrix;
    lines.push(' Confusion Matrix:');
    lines.push(` TP: ${cm.tp} FP: ${cm.fp}`);
    lines.push(` FN: ${cm.fn} TN: ${cm.tn}`);
    lines.push('');

    // Per-category breakdown (padEnd/padStart produce aligned columns)
    if (Object.keys(metrics.perCategory).length > 0) {
      lines.push(' Per-Category Breakdown:');
      lines.push(` ${'Category'.padEnd(25)} ${'Prec'.padStart(7)} ${'Rec'.padStart(7)} ${'F1'.padStart(7)} ${'N'.padStart(5)}`);
      lines.push(' ' + '-'.repeat(51));
      for (const [cat, m] of Object.entries(metrics.perCategory)) {
        lines.push(` ${cat.padEnd(25)} ${(m.precision * 100).toFixed(1).padStart(6)}% ${(m.recall * 100).toFixed(1).padStart(6)}% ${(m.f1 * 100).toFixed(1).padStart(6)}% ${String(m.total).padStart(5)}`);
      }
      lines.push('');
    }

    // Per-difficulty breakdown
    if (Object.keys(metrics.perDifficulty).length > 0) {
      lines.push(' Per-Difficulty Breakdown:');
      for (const [diff, m] of Object.entries(metrics.perDifficulty)) {
        lines.push(` ${diff.padEnd(10)} F1: ${(m.f1 * 100).toFixed(1)}% (n=${m.total})`);
      }
      lines.push('');
    }

    // Latency — only shown when timing data was collected (mean > 0)
    if (metrics.latency && metrics.latency.mean > 0) {
      lines.push(' Latency (ms):');
      lines.push(` Mean: ${metrics.latency.mean}`);
      lines.push(` Median: ${metrics.latency.median}`);
      lines.push(` P95: ${metrics.latency.p95}`);
      lines.push(` P99: ${metrics.latency.p99}`);
      lines.push(` Min: ${metrics.latency.min}`);
      lines.push(` Max: ${metrics.latency.max}`);
      lines.push('');
    }

    lines.push(sep);
    return lines.join('\n');
  }

  /**
   * Generate a JSON report (pretty-printed, 2-space indent).
   * @param {Object} metrics
   * @returns {string}
   */
  json(metrics) {
    return JSON.stringify(metrics, null, 2);
  }

  /**
   * Generate a markdown table report.
   * @param {Object} metrics — output from BenchmarkMetrics.compute()
   * @param {Object} [options]
   * @param {string} [options.title='Benchmark Report']
   * @returns {string}
   */
  markdown(metrics, options = {}) {
    const title = options.title || 'Benchmark Report';
    const lines = [];

    lines.push(`# ${title}`);
    lines.push('');
    lines.push('## Overall Metrics');
    lines.push('');
    lines.push('| Metric | Value |');
    lines.push('|--------|-------|');
    lines.push(`| Total | ${metrics.total} |`);
    lines.push(`| Precision | ${(metrics.precision * 100).toFixed(2)}% |`);
    lines.push(`| Recall | ${(metrics.recall * 100).toFixed(2)}% |`);
    lines.push(`| F1 Score | ${(metrics.f1 * 100).toFixed(2)}% |`);
    lines.push(`| Accuracy | ${(metrics.accuracy * 100).toFixed(2)}% |`);
    lines.push(`| MCC | ${metrics.mcc.toFixed(4)} |`);
    lines.push('');

    // Confusion matrix
    const cm = metrics.confusionMatrix;
    lines.push('## Confusion Matrix');
    lines.push('');
    lines.push('| | Predicted Positive | Predicted Negative |');
    lines.push('|---|---|---|');
    lines.push(`| **Actual Positive** | TP: ${cm.tp} | FN: ${cm.fn} |`);
    lines.push(`| **Actual Negative** | FP: ${cm.fp} | TN: ${cm.tn} |`);
    lines.push('');

    // Per-category
    if (Object.keys(metrics.perCategory).length > 0) {
      lines.push('## Per-Category Breakdown');
      lines.push('');
      lines.push('| Category | Precision | Recall | F1 | N |');
      lines.push('|----------|-----------|--------|-----|---|');
      for (const [cat, m] of Object.entries(metrics.perCategory)) {
        lines.push(`| ${cat} | ${(m.precision * 100).toFixed(1)}% | ${(m.recall * 100).toFixed(1)}% | ${(m.f1 * 100).toFixed(1)}% | ${m.total} |`);
      }
      lines.push('');
    }

    // Latency — only shown when timing data was collected (mean > 0)
    if (metrics.latency && metrics.latency.mean > 0) {
      lines.push('## Latency');
      lines.push('');
      lines.push('| Stat | ms |');
      lines.push('|------|-----|');
      lines.push(`| Mean | ${metrics.latency.mean} |`);
      lines.push(`| Median | ${metrics.latency.median} |`);
      lines.push(`| P95 | ${metrics.latency.p95} |`);
      lines.push(`| P99 | ${metrics.latency.p99} |`);
      lines.push('');
    }

    return lines.join('\n');
  }

  /**
   * Generate a comparison table for multiple engines.
   * @param {Object} comparison — output from BenchmarkHarness.compare():
   *   a map of engine name → metrics object
   * @returns {string}
   */
  comparisonText(comparison) {
    const lines = [];
    const sep = '='.repeat(70);

    lines.push(sep);
    lines.push(' Engine Comparison');
    lines.push(sep);
    lines.push('');
    lines.push(` ${'Engine'.padEnd(20)} ${'Prec'.padStart(7)} ${'Rec'.padStart(7)} ${'F1'.padStart(7)} ${'MCC'.padStart(7)} ${'Lat(ms)'.padStart(8)}`);
    lines.push(' ' + '-'.repeat(56));

    for (const [name, m] of Object.entries(comparison)) {
      // 'N/A' when the engine run produced no timing data
      const lat = m.latency && m.latency.mean > 0 ? m.latency.mean.toFixed(1) : 'N/A';
      lines.push(` ${name.padEnd(20)} ${(m.precision * 100).toFixed(1).padStart(6)}% ${(m.recall * 100).toFixed(1).padStart(6)}% ${(m.f1 * 100).toFixed(1).padStart(6)}% ${m.mcc.toFixed(3).padStart(7)} ${lat.padStart(8)}`);
    }

    lines.push('');
    lines.push(sep);
    return lines.join('\n');
  }
}
623
// =========================================================================
// BenchmarkHarness
// =========================================================================

/**
 * Main benchmark harness for evaluating detection engines.
 *
 * @example
 * const { BenchmarkHarness } = require('./benchmark-harness');
 * const { scanText } = require('./detector-core');
 *
 * const harness = new BenchmarkHarness();
 * harness.loadDataset('datasets/attack-corpus.json');
 * const results = harness.run((text) => {
 *   const result = scanText(text);
 *   return result.threats.length > 0;
 * });
 * console.log(harness.formatReport(results));
 */
class BenchmarkHarness {
  /**
   * @param {Object} [options]
   * @param {number} [options.warmupRuns=1] — number of warmup iterations
   *   (not measured); pass 0 to disable warmup entirely
   */
  constructor(options = {}) {
    /** @private */
    this._entries = [];
    /** @private */
    this._meta = {};
    /** @private */
    this._loader = new DatasetLoader();
    /** @private */
    this._metrics = new BenchmarkMetrics();
    /** @private */
    this._reporter = new BenchmarkReportGenerator();
    /** @private — `??` (not `||`) so warmupRuns: 0 is honoured */
    this._warmupRuns = options.warmupRuns ?? 1;
  }

  /**
   * Load a benchmark dataset from a JSON file.
   * @param {string} filePath — path to the dataset JSON
   * @returns {BenchmarkHarness} this (for chaining)
   */
  loadDataset(filePath) {
    const { entries, meta } = this._loader.load(filePath);
    this._entries = entries;
    this._meta = meta;
    return this;
  }

  /**
   * Load entries directly (must conform to the standard schema).
   * @param {Array<Object>} entries
   * @returns {BenchmarkHarness} this (for chaining)
   */
  loadEntries(entries) {
    this._loader.validate(entries);
    this._entries = entries;
    return this;
  }

  /**
   * Run a detector function against all loaded entries.
   *
   * The detector function receives the text and must return a boolean
   * (true = threat detected, false = no threat). A detector that throws
   * is treated as "no detection" and the error message is recorded on
   * the result record as `error`.
   *
   * @param {function(string): boolean} detectorFn
   * @returns {{ results: Array<Object>, metrics: Object }}
   * @throws {Error} if no entries are loaded or detectorFn is not a function
   */
  run(detectorFn) {
    if (this._entries.length === 0) {
      throw new Error('[Agent Shield] No entries loaded. Call loadDataset() or loadEntries() first.');
    }
    if (typeof detectorFn !== 'function') {
      throw new Error('[Agent Shield] detectorFn must be a function');
    }

    // Warmup runs (not measured). Errors are swallowed here just as they
    // are in the measured loop, so a throwing detector cannot abort warmup
    // (previously an exception during warmup propagated out of run()).
    for (let w = 0; w < this._warmupRuns; w++) {
      for (const entry of this._entries) {
        try {
          detectorFn(entry.text);
        } catch (err) {
          // intentionally ignored — warmup only primes caches/JIT state
        }
      }
    }

    // Measured run
    const results = [];
    for (const entry of this._entries) {
      const start = this._now();
      let detected;
      let detectorError;
      try {
        detected = Boolean(detectorFn(entry.text));
      } catch (err) {
        // Count a throwing detector as "no detection", but surface the
        // error on the record instead of silently discarding it.
        detected = false;
        detectorError = err && err.message ? err.message : String(err);
      }
      const latencyMs = this._now() - start;

      const record = {
        entry,
        detected,
        expected: entry.expected_detection,
        correct: detected === entry.expected_detection,
        latencyMs
      };
      if (detectorError !== undefined) {
        record.error = detectorError;
      }
      results.push(record);
    }

    const metrics = this._metrics.compute(results);
    return { results, metrics };
  }

  /**
   * Compare multiple detector functions side-by-side.
   * Each detector gets its own full run (including warmup).
   *
   * @param {Object<string, function(string): boolean>} detectors — name → detectorFn
   * @returns {Object<string, Object>} name → metrics
   */
  compare(detectors) {
    if (!detectors || typeof detectors !== 'object') {
      throw new Error('[Agent Shield] detectors must be an object mapping names to functions');
    }
    const comparison = {};
    for (const [name, fn] of Object.entries(detectors)) {
      const { metrics } = this.run(fn);
      comparison[name] = metrics;
    }
    return comparison;
  }

  /**
   * Generate a human-readable text report from run results.
   * @param {{ results: Array, metrics: Object }} results — output from run()
   * @returns {string}
   */
  formatReport(results) {
    return this._reporter.text(results.metrics, { title: 'Agent Shield Benchmark Report' });
  }

  /**
   * Generate a comparison table from compare() output.
   * @param {Object} comparison — output from compare()
   * @returns {string}
   */
  formatComparison(comparison) {
    return this._reporter.comparisonText(comparison);
  }

  /**
   * Generate a markdown report from run results.
   * @param {{ results: Array, metrics: Object }} results — output from run()
   * @returns {string}
   */
  formatMarkdown(results) {
    return this._reporter.markdown(results.metrics, { title: 'Agent Shield Benchmark Report' });
  }

  /**
   * High-resolution timer in milliseconds (performance.now when available,
   * process.hrtime otherwise).
   * @returns {number}
   * @private
   */
  _now() {
    if (typeof performance !== 'undefined' && performance.now) {
      return performance.now();
    }
    const hr = process.hrtime();
    return hr[0] * 1e3 + hr[1] / 1e6;
  }
}
793
// =========================================================================
// EXPORTS
// =========================================================================

// Public API: the harness plus its composable parts (loader, metrics,
// regression tracker, report generator) for custom pipelines.
module.exports = {
  BenchmarkHarness,
  DatasetLoader,
  BenchmarkMetrics,
  RegressionTracker,
  BenchmarkReportGenerator
};