@panguard-ai/atr 1.4.3 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/action-executor.d.ts +44 -0
- package/dist/action-executor.d.ts.map +1 -0
- package/dist/action-executor.js +130 -0
- package/dist/action-executor.js.map +1 -0
- package/dist/adapters/default-adapter.d.ts +24 -0
- package/dist/adapters/default-adapter.d.ts.map +1 -0
- package/dist/adapters/default-adapter.js +51 -0
- package/dist/adapters/default-adapter.js.map +1 -0
- package/dist/adapters/stdio-adapter.d.ts +30 -0
- package/dist/adapters/stdio-adapter.d.ts.map +1 -0
- package/dist/adapters/stdio-adapter.js +128 -0
- package/dist/adapters/stdio-adapter.js.map +1 -0
- package/dist/badge.d.ts +42 -0
- package/dist/badge.d.ts.map +1 -0
- package/dist/badge.js +163 -0
- package/dist/badge.js.map +1 -0
- package/dist/capability-extractor.d.ts +35 -0
- package/dist/capability-extractor.d.ts.map +1 -0
- package/dist/capability-extractor.js +91 -0
- package/dist/capability-extractor.js.map +1 -0
- package/dist/cli/scan-handler.d.ts +21 -0
- package/dist/cli/scan-handler.d.ts.map +1 -0
- package/dist/cli/scan-handler.js +276 -0
- package/dist/cli/scan-handler.js.map +1 -0
- package/dist/cli/tc-pipeline.d.ts +18 -0
- package/dist/cli/tc-pipeline.d.ts.map +1 -0
- package/dist/cli/tc-pipeline.js +295 -0
- package/dist/cli/tc-pipeline.js.map +1 -0
- package/dist/cli.d.ts +12 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +894 -0
- package/dist/cli.js.map +1 -0
- package/dist/content-hash.d.ts +7 -0
- package/dist/content-hash.d.ts.map +1 -0
- package/dist/content-hash.js +10 -0
- package/dist/content-hash.js.map +1 -0
- package/dist/converters/elastic.d.ts +36 -0
- package/dist/converters/elastic.d.ts.map +1 -0
- package/dist/converters/elastic.js +125 -0
- package/dist/converters/elastic.js.map +1 -0
- package/dist/converters/generic-regex.d.ts +37 -0
- package/dist/converters/generic-regex.d.ts.map +1 -0
- package/dist/converters/generic-regex.js +59 -0
- package/dist/converters/generic-regex.js.map +1 -0
- package/dist/converters/index.d.ts +32 -0
- package/dist/converters/index.d.ts.map +1 -0
- package/dist/converters/index.js +38 -0
- package/dist/converters/index.js.map +1 -0
- package/dist/converters/sarif.d.ts +18 -0
- package/dist/converters/sarif.d.ts.map +1 -0
- package/dist/converters/sarif.js +142 -0
- package/dist/converters/sarif.js.map +1 -0
- package/dist/converters/splunk.d.ts +19 -0
- package/dist/converters/splunk.d.ts.map +1 -0
- package/dist/converters/splunk.js +148 -0
- package/dist/converters/splunk.js.map +1 -0
- package/dist/coverage-analyzer.d.ts +43 -0
- package/dist/coverage-analyzer.d.ts.map +1 -0
- package/dist/coverage-analyzer.js +329 -0
- package/dist/coverage-analyzer.js.map +1 -0
- package/dist/embedding/build-corpus.d.ts +15 -0
- package/dist/embedding/build-corpus.d.ts.map +1 -0
- package/dist/embedding/build-corpus.js +105 -0
- package/dist/embedding/build-corpus.js.map +1 -0
- package/dist/embedding/model-loader.d.ts +41 -0
- package/dist/embedding/model-loader.d.ts.map +1 -0
- package/dist/embedding/model-loader.js +90 -0
- package/dist/embedding/model-loader.js.map +1 -0
- package/dist/embedding/vector-store.d.ts +41 -0
- package/dist/embedding/vector-store.d.ts.map +1 -0
- package/dist/embedding/vector-store.js +70 -0
- package/dist/embedding/vector-store.js.map +1 -0
- package/dist/engine.d.ts +222 -0
- package/dist/engine.d.ts.map +1 -0
- package/dist/engine.js +1185 -0
- package/dist/engine.js.map +1 -0
- package/dist/eval/corpus.d.ts +42 -0
- package/dist/eval/corpus.d.ts.map +1 -0
- package/dist/eval/corpus.js +427 -0
- package/dist/eval/corpus.js.map +1 -0
- package/dist/eval/eval-harness.d.ts +44 -0
- package/dist/eval/eval-harness.d.ts.map +1 -0
- package/dist/eval/eval-harness.js +296 -0
- package/dist/eval/eval-harness.js.map +1 -0
- package/dist/eval/index.d.ts +13 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +9 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/metrics.d.ts +74 -0
- package/dist/eval/metrics.d.ts.map +1 -0
- package/dist/eval/metrics.js +108 -0
- package/dist/eval/metrics.js.map +1 -0
- package/dist/eval/pint-corpus.d.ts +34 -0
- package/dist/eval/pint-corpus.d.ts.map +1 -0
- package/dist/eval/pint-corpus.js +113 -0
- package/dist/eval/pint-corpus.js.map +1 -0
- package/dist/eval/rule-corpus.d.ts +9 -0
- package/dist/eval/rule-corpus.d.ts.map +1 -0
- package/dist/eval/rule-corpus.js +4780 -0
- package/dist/eval/rule-corpus.js.map +1 -0
- package/dist/eval/rule-metrics.d.ts +34 -0
- package/dist/eval/rule-metrics.d.ts.map +1 -0
- package/dist/eval/rule-metrics.js +92 -0
- package/dist/eval/rule-metrics.js.map +1 -0
- package/dist/eval/run-eval.d.ts +7 -0
- package/dist/eval/run-eval.d.ts.map +1 -0
- package/dist/eval/run-eval.js +11 -0
- package/dist/eval/run-eval.js.map +1 -0
- package/dist/eval/run-pint-benchmark.d.ts +18 -0
- package/dist/eval/run-pint-benchmark.d.ts.map +1 -0
- package/dist/eval/run-pint-benchmark.js +159 -0
- package/dist/eval/run-pint-benchmark.js.map +1 -0
- package/dist/eval/skill-benchmark.d.ts +66 -0
- package/dist/eval/skill-benchmark.d.ts.map +1 -0
- package/dist/eval/skill-benchmark.js +194 -0
- package/dist/eval/skill-benchmark.js.map +1 -0
- package/dist/flywheel.d.ts +54 -0
- package/dist/flywheel.d.ts.map +1 -0
- package/dist/flywheel.js +121 -0
- package/dist/flywheel.js.map +1 -0
- package/dist/hook-handler.d.ts +61 -0
- package/dist/hook-handler.d.ts.map +1 -0
- package/dist/hook-handler.js +178 -0
- package/dist/hook-handler.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/{src/index.ts → dist/index.js} +1 -0
- package/dist/index.js.map +1 -0
- package/dist/layer-integration.d.ts +55 -0
- package/dist/layer-integration.d.ts.map +1 -0
- package/dist/layer-integration.js +187 -0
- package/dist/layer-integration.js.map +1 -0
- package/dist/loader.d.ts +18 -0
- package/dist/loader.d.ts.map +1 -0
- package/dist/loader.js +129 -0
- package/dist/loader.js.map +1 -0
- package/dist/mcp-server.d.ts +13 -0
- package/dist/mcp-server.d.ts.map +1 -0
- package/dist/mcp-server.js +246 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/mcp-tools/coverage-gaps.d.ts +13 -0
- package/dist/mcp-tools/coverage-gaps.d.ts.map +1 -0
- package/dist/mcp-tools/coverage-gaps.js +55 -0
- package/dist/mcp-tools/coverage-gaps.js.map +1 -0
- package/dist/mcp-tools/list-rules.d.ts +17 -0
- package/dist/mcp-tools/list-rules.d.ts.map +1 -0
- package/dist/mcp-tools/list-rules.js +45 -0
- package/dist/mcp-tools/list-rules.js.map +1 -0
- package/dist/mcp-tools/scan-skill.d.ts +17 -0
- package/dist/mcp-tools/scan-skill.d.ts.map +1 -0
- package/dist/mcp-tools/scan-skill.js +65 -0
- package/dist/mcp-tools/scan-skill.js.map +1 -0
- package/dist/mcp-tools/scan.d.ts +24 -0
- package/dist/mcp-tools/scan.d.ts.map +1 -0
- package/dist/mcp-tools/scan.js +94 -0
- package/dist/mcp-tools/scan.js.map +1 -0
- package/dist/mcp-tools/submit-proposal.d.ts +12 -0
- package/dist/mcp-tools/submit-proposal.d.ts.map +1 -0
- package/dist/mcp-tools/submit-proposal.js +103 -0
- package/dist/mcp-tools/submit-proposal.js.map +1 -0
- package/dist/mcp-tools/threat-summary.d.ts +12 -0
- package/dist/mcp-tools/threat-summary.d.ts.map +1 -0
- package/dist/mcp-tools/threat-summary.js +74 -0
- package/dist/mcp-tools/threat-summary.js.map +1 -0
- package/dist/mcp-tools/validate.d.ts +15 -0
- package/dist/mcp-tools/validate.d.ts.map +1 -0
- package/dist/mcp-tools/validate.js +51 -0
- package/dist/mcp-tools/validate.js.map +1 -0
- package/dist/modules/embedding.d.ts +71 -0
- package/dist/modules/embedding.d.ts.map +1 -0
- package/dist/modules/embedding.js +141 -0
- package/dist/modules/embedding.js.map +1 -0
- package/dist/modules/index.d.ts +144 -0
- package/dist/modules/index.d.ts.map +1 -0
- package/dist/modules/index.js +82 -0
- package/dist/modules/index.js.map +1 -0
- package/dist/modules/semantic.d.ts +106 -0
- package/dist/modules/semantic.d.ts.map +1 -0
- package/dist/modules/semantic.js +359 -0
- package/dist/modules/semantic.js.map +1 -0
- package/dist/modules/session.d.ts +70 -0
- package/dist/modules/session.d.ts.map +1 -0
- package/dist/modules/session.js +128 -0
- package/dist/modules/session.js.map +1 -0
- package/dist/quality/adapters/atr.d.ts +65 -0
- package/dist/quality/adapters/atr.d.ts.map +1 -0
- package/dist/quality/adapters/atr.js +154 -0
- package/dist/quality/adapters/atr.js.map +1 -0
- package/dist/quality/adapters/index.d.ts +10 -0
- package/dist/quality/adapters/index.d.ts.map +1 -0
- package/dist/quality/adapters/index.js +10 -0
- package/dist/quality/adapters/index.js.map +1 -0
- package/dist/quality/compute-confidence.d.ts +45 -0
- package/dist/quality/compute-confidence.d.ts.map +1 -0
- package/dist/quality/compute-confidence.js +133 -0
- package/dist/quality/compute-confidence.js.map +1 -0
- package/dist/quality/index.d.ts +36 -0
- package/dist/quality/index.d.ts.map +1 -0
- package/dist/quality/index.js +39 -0
- package/dist/quality/index.js.map +1 -0
- package/dist/quality/quality-gate.d.ts +86 -0
- package/dist/quality/quality-gate.d.ts.map +1 -0
- package/dist/quality/quality-gate.js +187 -0
- package/dist/quality/quality-gate.js.map +1 -0
- package/dist/quality/types.d.ts +129 -0
- package/dist/quality/types.d.ts.map +1 -0
- package/dist/quality/types.js +10 -0
- package/dist/quality/types.js.map +1 -0
- package/dist/quality/validate-maturity.d.ts +51 -0
- package/dist/quality/validate-maturity.d.ts.map +1 -0
- package/dist/quality/validate-maturity.js +134 -0
- package/dist/quality/validate-maturity.js.map +1 -0
- package/dist/quality.d.ts +8 -0
- package/dist/quality.d.ts.map +1 -0
- package/dist/quality.js +8 -0
- package/dist/quality.js.map +1 -0
- package/dist/rule-scaffolder.d.ts +53 -0
- package/dist/rule-scaffolder.d.ts.map +1 -0
- package/dist/rule-scaffolder.js +301 -0
- package/dist/rule-scaffolder.js.map +1 -0
- package/dist/session-tracker.d.ts +58 -0
- package/dist/session-tracker.d.ts.map +1 -0
- package/dist/session-tracker.js +176 -0
- package/dist/session-tracker.js.map +1 -0
- package/dist/shadow-evaluator.d.ts +48 -0
- package/dist/shadow-evaluator.d.ts.map +1 -0
- package/dist/shadow-evaluator.js +129 -0
- package/dist/shadow-evaluator.js.map +1 -0
- package/dist/skill-fingerprint.d.ts +85 -0
- package/dist/skill-fingerprint.d.ts.map +1 -0
- package/dist/skill-fingerprint.js +284 -0
- package/dist/skill-fingerprint.js.map +1 -0
- package/dist/tc-reporter.d.ts +50 -0
- package/dist/tc-reporter.d.ts.map +1 -0
- package/dist/tc-reporter.js +164 -0
- package/dist/tc-reporter.js.map +1 -0
- package/dist/tier0-invariant.d.ts +49 -0
- package/dist/tier0-invariant.d.ts.map +1 -0
- package/dist/tier0-invariant.js +185 -0
- package/dist/tier0-invariant.js.map +1 -0
- package/dist/tier1-blacklist.d.ts +48 -0
- package/dist/tier1-blacklist.d.ts.map +1 -0
- package/dist/tier1-blacklist.js +92 -0
- package/dist/tier1-blacklist.js.map +1 -0
- package/dist/types.d.ts +232 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/verdict.d.ts +26 -0
- package/dist/verdict.d.ts.map +1 -0
- package/dist/verdict.js +127 -0
- package/dist/verdict.js.map +1 -0
- package/package.json +16 -4
- package/.github/ISSUE_TEMPLATE/evasion-report.yml +0 -75
- package/.github/ISSUE_TEMPLATE/false-positive.yml +0 -31
- package/.github/ISSUE_TEMPLATE/mirofish-prediction.yml +0 -128
- package/.github/ISSUE_TEMPLATE/new-rule.yml +0 -37
- package/.github/PULL_REQUEST_TEMPLATE.md +0 -23
- package/.github/workflows/rule-quality.yml +0 -203
- package/.github/workflows/validate.yml +0 -42
- package/CHANGELOG.md +0 -30
- package/CONTRIBUTING.md +0 -168
- package/CONTRIBUTORS.md +0 -28
- package/COVERAGE.md +0 -135
- package/LIMITATIONS.md +0 -154
- package/SECURITY.md +0 -48
- package/THREAT-MODEL.md +0 -243
- package/docs/contribution-paths.md +0 -202
- package/docs/mirofish-prediction-guide.md +0 -304
- package/docs/quick-start.md +0 -245
- package/docs/rule-writing-guide.md +0 -647
- package/docs/schema-spec.md +0 -594
- package/examples/how-to-write-a-rule.md +0 -251
- package/tsconfig.json +0 -17
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Harness -- orchestrates running the corpus through the ATR engine
|
|
3
|
+
* and produces a structured EvalReport.
|
|
4
|
+
*
|
|
5
|
+
* Supports:
|
|
6
|
+
* - Regex-only evaluation (Tier 2)
|
|
7
|
+
* - Regex + Embedding evaluation (Tier 2 + 2.5)
|
|
8
|
+
* - Full pipeline evaluation (all tiers)
|
|
9
|
+
* - Per-sample latency measurement
|
|
10
|
+
* - Regression check against baseline thresholds
|
|
11
|
+
*
|
|
12
|
+
* @module agent-threat-rules/eval/eval-harness
|
|
13
|
+
*/
|
|
14
|
+
import { existsSync, readFileSync, writeFileSync } from 'node:fs';
import { resolve, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { ATREngine } from '../engine.js';
import { EmbeddingModule } from '../modules/embedding.js';
import { EVAL_CORPUS, getCorpusStats } from './corpus.js';
import { computeEvalReport, checkRegression } from './metrics.js';
import { computeRuleQuality } from './rule-metrics.js';
|
|
21
|
+
/**
|
|
22
|
+
* Convert a corpus sample to an AgentEvent.
|
|
23
|
+
*/
|
|
24
|
+
function sampleToEvent(sample) {
|
|
25
|
+
return {
|
|
26
|
+
type: sample.eventType,
|
|
27
|
+
content: sample.text,
|
|
28
|
+
timestamp: new Date().toISOString(),
|
|
29
|
+
fields: sample.fields,
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Run a single sample through the engine (regex only) and measure results.
|
|
34
|
+
* Catches engine errors so a single bad sample doesn't abort the entire eval.
|
|
35
|
+
*/
|
|
36
|
+
function evaluateSampleRegex(engine, sample) {
|
|
37
|
+
const event = sampleToEvent(sample);
|
|
38
|
+
try {
|
|
39
|
+
const start = performance.now();
|
|
40
|
+
const matches = engine.evaluate(event);
|
|
41
|
+
const latencyMs = performance.now() - start;
|
|
42
|
+
const detected = matches.length > 0;
|
|
43
|
+
const topMatch = matches[0];
|
|
44
|
+
return {
|
|
45
|
+
id: sample.id,
|
|
46
|
+
category: sample.category,
|
|
47
|
+
expectedDetection: sample.expectedDetection,
|
|
48
|
+
actualDetection: detected,
|
|
49
|
+
matchedRules: matches.map((m) => m.rule.id),
|
|
50
|
+
confidence: topMatch?.confidence ?? 0,
|
|
51
|
+
latencyMs,
|
|
52
|
+
difficulty: sample.difficulty,
|
|
53
|
+
tier: sample.tier,
|
|
54
|
+
};
|
|
55
|
+
}
|
|
56
|
+
catch {
|
|
57
|
+
return {
|
|
58
|
+
id: sample.id,
|
|
59
|
+
category: sample.category,
|
|
60
|
+
expectedDetection: sample.expectedDetection,
|
|
61
|
+
actualDetection: false,
|
|
62
|
+
matchedRules: [],
|
|
63
|
+
confidence: 0,
|
|
64
|
+
latencyMs: 0,
|
|
65
|
+
difficulty: sample.difficulty,
|
|
66
|
+
tier: sample.tier,
|
|
67
|
+
};
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* Run a single sample through the full pipeline (regex + embedding) and measure results.
|
|
72
|
+
*/
|
|
73
|
+
async function evaluateSampleFull(engine, sample) {
|
|
74
|
+
const event = sampleToEvent(sample);
|
|
75
|
+
try {
|
|
76
|
+
const start = performance.now();
|
|
77
|
+
const { verdict } = await engine.evaluateWithVerdict(event);
|
|
78
|
+
const latencyMs = performance.now() - start;
|
|
79
|
+
const detected = verdict.matchCount > 0;
|
|
80
|
+
return {
|
|
81
|
+
id: sample.id,
|
|
82
|
+
category: sample.category,
|
|
83
|
+
expectedDetection: sample.expectedDetection,
|
|
84
|
+
actualDetection: detected,
|
|
85
|
+
matchedRules: verdict.matches.map((m) => m.rule.id),
|
|
86
|
+
confidence: verdict.highestConfidence,
|
|
87
|
+
latencyMs,
|
|
88
|
+
difficulty: sample.difficulty,
|
|
89
|
+
tier: sample.tier,
|
|
90
|
+
};
|
|
91
|
+
}
|
|
92
|
+
catch {
|
|
93
|
+
return {
|
|
94
|
+
id: sample.id,
|
|
95
|
+
category: sample.category,
|
|
96
|
+
expectedDetection: sample.expectedDetection,
|
|
97
|
+
actualDetection: false,
|
|
98
|
+
matchedRules: [],
|
|
99
|
+
confidence: 0,
|
|
100
|
+
latencyMs: 0,
|
|
101
|
+
difficulty: sample.difficulty,
|
|
102
|
+
tier: sample.tier,
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Try to load the embedding module. Returns null if unavailable.
|
|
108
|
+
*/
|
|
109
|
+
async function tryLoadEmbedding(embeddingsPath) {
|
|
110
|
+
if (!existsSync(embeddingsPath))
|
|
111
|
+
return null;
|
|
112
|
+
try {
|
|
113
|
+
const data = JSON.parse(readFileSync(embeddingsPath, 'utf-8'));
|
|
114
|
+
const module = new EmbeddingModule({
|
|
115
|
+
attackVectorsData: data,
|
|
116
|
+
similarityThreshold: 0.65,
|
|
117
|
+
});
|
|
118
|
+
await module.initialize();
|
|
119
|
+
return module.isAvailable() ? module : null;
|
|
120
|
+
}
|
|
121
|
+
catch {
|
|
122
|
+
return null;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
/**
|
|
126
|
+
* Run the full evaluation harness.
|
|
127
|
+
* Returns the EvalReport and RegressionCheck.
|
|
128
|
+
*/
|
|
129
|
+
export async function runEval(config) {
|
|
130
|
+
const corpus = config.corpus ?? EVAL_CORPUS;
|
|
131
|
+
const base = resolve(config.rulesDir, '..');
|
|
132
|
+
const embeddingsPath = config.embeddingsPath ?? join(base, 'data', 'attack-embeddings.json');
|
|
133
|
+
// Try to load embedding module
|
|
134
|
+
const shouldEmbed = config.enableEmbedding !== false;
|
|
135
|
+
let embeddingModule = null;
|
|
136
|
+
const tiersUsed = ['tier2-regex'];
|
|
137
|
+
if (shouldEmbed) {
|
|
138
|
+
embeddingModule = await tryLoadEmbedding(embeddingsPath);
|
|
139
|
+
if (embeddingModule) {
|
|
140
|
+
tiersUsed.push('tier2.5-embedding');
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
// Initialize engine
|
|
144
|
+
const engine = new ATREngine({
|
|
145
|
+
rulesDir: config.rulesDir,
|
|
146
|
+
embeddingModule: embeddingModule ?? undefined,
|
|
147
|
+
});
|
|
148
|
+
const ruleCount = await engine.loadRules();
|
|
149
|
+
if (ruleCount === 0) {
|
|
150
|
+
throw new Error(`No rules loaded from ${config.rulesDir}`);
|
|
151
|
+
}
|
|
152
|
+
// Run all samples
|
|
153
|
+
const results = [];
|
|
154
|
+
const useFullPipeline = embeddingModule !== null;
|
|
155
|
+
for (const sample of corpus) {
|
|
156
|
+
const result = useFullPipeline
|
|
157
|
+
? await evaluateSampleFull(engine, sample)
|
|
158
|
+
: evaluateSampleRegex(engine, sample);
|
|
159
|
+
results.push(result);
|
|
160
|
+
}
|
|
161
|
+
// Cleanup embedding module
|
|
162
|
+
if (embeddingModule) {
|
|
163
|
+
await embeddingModule.destroy();
|
|
164
|
+
}
|
|
165
|
+
// Compute report
|
|
166
|
+
const report = computeEvalReport(results);
|
|
167
|
+
const regression = checkRegression(report, config.thresholds);
|
|
168
|
+
const corpusStats = getCorpusStats();
|
|
169
|
+
// Compute per-rule quality
|
|
170
|
+
const loadedRuleIds = engine.getRules().map((r) => r.id);
|
|
171
|
+
const ruleQuality = computeRuleQuality(results, loadedRuleIds);
|
|
172
|
+
// Save report if output path specified
|
|
173
|
+
if (config.outputPath) {
|
|
174
|
+
const output = {
|
|
175
|
+
report,
|
|
176
|
+
regression,
|
|
177
|
+
corpusStats,
|
|
178
|
+
ruleQuality,
|
|
179
|
+
ruleCount,
|
|
180
|
+
engine: 'ATREngine',
|
|
181
|
+
tiers: tiersUsed,
|
|
182
|
+
};
|
|
183
|
+
writeFileSync(config.outputPath, JSON.stringify(output, null, 2));
|
|
184
|
+
}
|
|
185
|
+
return { report, regression, corpusStats, tiersUsed, ruleQuality };
|
|
186
|
+
}
|
|
187
|
+
// ---------------------------------------------------------------------------
|
|
188
|
+
// CLI entry point
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
/**
 * Render a ratio as a percentage string with one decimal place.
 *
 * @param n Ratio to format (e.g. 0.5).
 * @returns The value multiplied by 100, fixed to one decimal, followed by `%`.
 */
function formatPercent(n) {
    const scaled = n * 100;
    return scaled.toFixed(1) + '%';
}
|
|
193
|
+
/**
 * Render a duration in milliseconds with two decimal places.
 *
 * @param n Duration in milliseconds.
 * @returns The value fixed to two decimals, followed by `ms`.
 */
function formatMs(n) {
    const fixed = n.toFixed(2);
    return fixed + 'ms';
}
|
|
196
|
+
/**
 * CLI entry point: run the evaluation with default paths and print a
 * human-readable summary to stdout.
 *
 * Paths are resolved relative to this module: the base directory is two levels
 * above the module's directory, rules are read from `<base>/rules` and the
 * JSON report is written to `<base>/data/eval-report.json`.
 *
 * Sets `process.exitCode = 1` when the regression check fails; it does not
 * call `process.exit`.
 *
 * @returns Promise that resolves once the summary has been printed.
 */
export async function runEvalCLI() {
    // `import.meta.dirname` is missing on older Node releases. Fall back to
    // the directory derived from `import.meta.url` rather than the current
    // working directory, so the paths stay relative to the module itself.
    const moduleDir = import.meta.dirname ?? fileURLToPath(new URL('.', import.meta.url));
    const base = resolve(join(moduleDir, '..', '..'));
    const rulesDir = join(base, 'rules');
    const outputPath = join(base, 'data', 'eval-report.json');
    console.log('\n=== ATR Evaluation Harness ===\n');
    const { report, regression, corpusStats, tiersUsed, ruleQuality } = await runEval({
        rulesDir,
        outputPath,
    });
    // Corpus stats
    console.log(`Corpus: ${corpusStats.total} samples (${corpusStats.attacks} attacks, ${corpusStats.benign} benign)`);
    console.log(`Categories: ${Object.keys(corpusStats.byCategory).join(', ')}`);
    console.log(`Tiers: ${tiersUsed.join(' + ')}`);
    // Overall metrics
    console.log(`\n--- Overall ---`);
    console.log(` Precision: ${formatPercent(report.overall.precision)}`);
    console.log(` Recall: ${formatPercent(report.overall.recall)}`);
    console.log(` F1: ${formatPercent(report.overall.f1)}`);
    console.log(` Accuracy: ${formatPercent(report.overall.accuracy)}`);
    console.log(` FP Rate: ${formatPercent(report.overall.fpRate)}`);
    console.log(` Confusion: TP=${report.overall.confusion.tp} FP=${report.overall.confusion.fp} TN=${report.overall.confusion.tn} FN=${report.overall.confusion.fn}`);
    // Latency
    console.log(`\n--- Latency ---`);
    console.log(` P50: ${formatMs(report.latency.p50)}`);
    console.log(` P95: ${formatMs(report.latency.p95)}`);
    console.log(` P99: ${formatMs(report.latency.p99)}`);
    console.log(` Mean: ${formatMs(report.latency.mean)}`);
    console.log(` Max: ${formatMs(report.latency.max)}`);
    // Per category
    console.log(`\n--- By Category ---`);
    for (const cat of report.byCategory) {
        const missed = cat.missedSamples.length > 0 ? ` (missed: ${cat.missedSamples.join(', ')})` : '';
        const fps = cat.falsePositives.length > 0 ? ` (FP: ${cat.falsePositives.join(', ')})` : '';
        console.log(` ${cat.category}: recall=${formatPercent(cat.metrics.recall)} precision=${formatPercent(cat.metrics.precision)} f1=${formatPercent(cat.metrics.f1)}${missed}${fps}`);
    }
    // Per difficulty
    console.log(`\n--- By Difficulty ---`);
    for (const diff of report.byDifficulty) {
        console.log(` ${diff.difficulty}: recall=${formatPercent(diff.metrics.recall)} precision=${formatPercent(diff.metrics.precision)} f1=${formatPercent(diff.metrics.f1)}`);
    }
    // Missed attacks (section omitted when there are none)
    if (report.missedAttacks.length > 0) {
        console.log(`\n--- Missed Attacks (${report.missedAttacks.length}) ---`);
        for (const m of report.missedAttacks) {
            console.log(` [${m.id}] ${m.category}/${m.difficulty}/${m.tier}`);
        }
    }
    // False positives (section omitted when there are none)
    if (report.falsePositives.length > 0) {
        console.log(`\n--- False Positives (${report.falsePositives.length}) ---`);
        for (const fp of report.falsePositives) {
            console.log(` [${fp.id}] rules: ${fp.matchedRules.join(', ')}`);
        }
    }
    // Rule quality
    console.log(`\n--- Rule Quality ---`);
    console.log(` Total rules loaded: ${ruleQuality.totalRulesEvaluated}`);
    console.log(` Rules fired: ${ruleQuality.rulesFired}`);
    console.log(` Rules never fired: ${ruleQuality.rulesNeverFired}`);
    if (ruleQuality.topRules.length > 0) {
        console.log(`\n Top 10 rules by match count:`);
        for (const rule of ruleQuality.topRules.slice(0, 10)) {
            // Guard the division: a rule with no matches has no precision.
            const precision = rule.matchCount > 0
                ? formatPercent(rule.tpCount / rule.matchCount)
                : 'N/A';
            console.log(` ${rule.ruleId}: matches=${rule.matchCount} TP=${rule.tpCount} FP=${rule.fpCount} precision=${precision} avgConf=${rule.avgConfidence.toFixed(2)}`);
        }
    }
    if (ruleQuality.weakRules.length > 0) {
        console.log(`\n Weak rules (FP > 0 or matchCount <= 1):`);
        for (const rule of ruleQuality.weakRules.slice(0, 10)) {
            console.log(` ${rule.ruleId}: matches=${rule.matchCount} TP=${rule.tpCount} FP=${rule.fpCount} categories=[${rule.categories.join(', ')}]`);
        }
    }
    if (ruleQuality.neverFiredRuleIds.length > 0) {
        console.log(`\n Never-fired rules (${ruleQuality.neverFiredRuleIds.length}):`);
        // Only the first 20 ids are listed; the remainder is summarized.
        for (const id of ruleQuality.neverFiredRuleIds.slice(0, 20)) {
            console.log(` ${id}`);
        }
        if (ruleQuality.neverFiredRuleIds.length > 20) {
            console.log(` ... and ${ruleQuality.neverFiredRuleIds.length - 20} more`);
        }
    }
    // Regression check
    console.log(`\n--- Regression Check ---`);
    if (regression.passed) {
        console.log(' PASSED');
    }
    else {
        console.log(' FAILED:');
        for (const v of regression.violations) {
            console.log(` - ${v}`);
        }
    }
    console.log(`\nReport saved to: ${outputPath}`);
    console.log('Done.\n');
    // Signal failure through the exit code without cutting off pending output.
    if (!regression.passed) {
        process.exitCode = 1;
    }
}
|
|
296
|
+
//# sourceMappingURL=eval-harness.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-harness.js","sourceRoot":"","sources":["../../src/eval/eval-harness.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AAClE,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAG1D,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAE1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAElE,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AAiBvD;;GAEG;AACH,SAAS,aAAa,CAAC,MAAoB;IACzC,OAAO;QACL,IAAI,EAAE,MAAM,CAAC,SAAS;QACtB,OAAO,EAAE,MAAM,CAAC,IAAI;QACpB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,MAAM,EAAE,MAAM,CAAC,MAAM;KACtB,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,SAAS,mBAAmB,CAC1B,MAAiB,EACjB,MAAoB;IAEpB,MAAM,KAAK,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IAEpC,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAChC,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;QACvC,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;QAE5C,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QACpC,MAAM,QAAQ,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QAE5B,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;YAC3C,eAAe,EAAE,QAAQ;YACzB,YAAY,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;YAC3C,UAAU,EAAE,QAAQ,EAAE,UAAU,IAAI,CAAC;YACrC,SAAS;YACT,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,IAAI,EAAE,MAAM,CAAC,IAAI;SAClB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;YAC3C,eAAe,EAAE,KAAK;YACtB,YAAY,EAAE,EAAE;YAChB,UAAU,EAAE,CAAC;YACb,SAAS,EAAE,CAAC;YACZ,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,IAAI,EAAE,MAAM,CAAC,IAAI;SAClB,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,kBAAkB,CAC/B,MAAiB,EACjB,MAAoB;IAEpB,MAAM,KAAK,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IAEpC,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAChC,MAAM,EAAE,OAAO,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,KAAK,CAAC,CAAC;QAC5D,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,
CAAC;QAE5C,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,GAAG,CAAC,CAAC;QAExC,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;YAC3C,eAAe,EAAE,QAAQ;YACzB,YAAY,EAAE,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;YACnD,UAAU,EAAE,OAAO,CAAC,iBAAiB;YACrC,SAAS;YACT,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,IAAI,EAAE,MAAM,CAAC,IAAI;SAClB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;YAC3C,eAAe,EAAE,KAAK;YACtB,YAAY,EAAE,EAAE;YAChB,UAAU,EAAE,CAAC;YACb,SAAS,EAAE,CAAC;YACZ,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,IAAI,EAAE,MAAM,CAAC,IAAI;SAClB,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,gBAAgB,CAAC,cAAsB;IACpD,IAAI,CAAC,UAAU,CAAC,cAAc,CAAC;QAAE,OAAO,IAAI,CAAC;IAE7C,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC,CAAC;QAC/D,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC;YACjC,iBAAiB,EAAE,IAAI;YACvB,mBAAmB,EAAE,IAAI;SAC1B,CAAC,CAAC;QACH,MAAM,MAAM,CAAC,UAAU,EAAE,CAAC;QAC1B,OAAO,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC;IAC9C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,MAAkB;IAO9C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,WAAW,CAAC;IAC5C,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC;IAC5C,MAAM,cAAc,GAAG,MAAM,CAAC,cAAc,IAAI,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,wBAAwB,CAAC,CAAC;IAE7F,+BAA+B;IAC/B,MAAM,WAAW,GAAG,MAAM,CAAC,eAAe,KAAK,KAAK,CAAC;IACrD,IAAI,eAAe,GAA2B,IAAI,CAAC;IACnD,MAAM,SAAS,GAAa,CAAC,aAAa,CAAC,CAAC;IAE5C,IAAI,WAAW,EAAE,CAAC;QAChB,eAAe,GAAG,MAAM,gBAAgB,CAAC,cAAc,CAAC,CAAC;QACzD,IAAI,eAAe,EAAE,CAAC;YACpB,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAED,oBAAoB;IACpB,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,eAAe,EAAE,eAAe,IAAI,SAAS;KAC9C,CAAC,CAAC;IACH,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,SAAS,EAAE,CAAC;IAE3C,IAAI,SAAS,KAAK,CAAC,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,wBAAwB,MAAM,CAAC,QAAQ,EAAE,
CAAC,CAAC;IAC7D,CAAC;IAED,kBAAkB;IAClB,MAAM,OAAO,GAAmB,EAAE,CAAC;IACnC,MAAM,eAAe,GAAG,eAAe,KAAK,IAAI,CAAC;IAEjD,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC5B,MAAM,MAAM,GAAG,eAAe;YAC5B,CAAC,CAAC,MAAM,kBAAkB,CAAC,MAAM,EAAE,MAAM,CAAC;YAC1C,CAAC,CAAC,mBAAmB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,CAAC;IAED,2BAA2B;IAC3B,IAAI,eAAe,EAAE,CAAC;QACpB,MAAM,eAAe,CAAC,OAAO,EAAE,CAAC;IAClC,CAAC;IAED,iBAAiB;IACjB,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;IAC1C,MAAM,UAAU,GAAG,eAAe,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IAC9D,MAAM,WAAW,GAAG,cAAc,EAAE,CAAC;IAErC,2BAA2B;IAC3B,MAAM,aAAa,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACzD,MAAM,WAAW,GAAG,kBAAkB,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IAE/D,uCAAuC;IACvC,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,MAAM,MAAM,GAAG;YACb,MAAM;YACN,UAAU;YACV,WAAW;YACX,WAAW;YACX,SAAS;YACT,MAAM,EAAE,WAAW;YACnB,KAAK,EAAE,SAAS;SACjB,CAAC;QACF,aAAa,CAAC,MAAM,CAAC,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IACpE,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC;AACrE,CAAC;AAED,8EAA8E;AAC9E,kBAAkB;AAClB,8EAA8E;AAE9E,SAAS,aAAa,CAAC,CAAS;IAC9B,OAAO,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;AACpC,CAAC;AAED,SAAS,QAAQ,CAAC,CAAS;IACzB,OAAO,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;AAC7B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC;IACnE,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACrC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,kBAAkB,CAAC,CAAC;IAE1D,OAAO,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;IAElD,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,WAAW,EAAE,GAAG,MAAM,OAAO,CAAC;QAChF,QAAQ;QACR,UAAU;KACX,CAAC,CAAC;IAEH,eAAe;IACf,OAAO,CAAC,GAAG,CAAC,WAAW,WAAW,CAAC,KAAK,aAAa,WAAW,CAAC,OAAO,aAAa,WAAW,CAAC,MAAM,UAAU,CAAC,CAAC;IACnH,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CA
AC,CAAC;IAC7E,OAAO,CAAC,GAAG,CAAC,UAAU,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;IAE/C,kBAAkB;IAClB,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;IACjC,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IACrE,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;IACjE,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvE,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IACrE,OAAO,CAAC,GAAG,CAAC,oBAAoB,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,OAAO,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,OAAO,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,OAAO,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,EAAE,CAAC,CAAC;IAErK,UAAU;IACV,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;IACjC,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEvD,eAAe;IACf,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;IACrC,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACpC,MAAM,MAAM,GAAG,GAAG,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;QAChG,MAAM,GAAG,GAAG,GAAG,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3F,OAAO,CAAC,GAAG,CAAC,KAAK,GAAG,CAAC,QAAQ,YAAY,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,cAAc,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,CAAC,OAAO,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,GAAG,MAAM,GAAG,GAAG,EAAE,CAAC,CAAC;IACrL,CAAC;IAED,iBAAiB;IACjB,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;QACvC
,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,UAAU,YAAY,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,cAAc,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,OAAO,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;IAC5K,CAAC;IAED,iBAAiB;IACjB,IAAI,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,yBAAyB,MAAM,CAAC,aAAa,CAAC,MAAM,OAAO,CAAC,CAAC;QACzE,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,kBAAkB;IAClB,IAAI,MAAM,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,0BAA0B,MAAM,CAAC,cAAc,CAAC,MAAM,OAAO,CAAC,CAAC;QAC3E,KAAK,MAAM,EAAE,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;YACvC,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnE,CAAC;IACH,CAAC;IAED,eAAe;IACf,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;IACtC,OAAO,CAAC,GAAG,CAAC,yBAAyB,WAAW,CAAC,mBAAmB,EAAE,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,kBAAkB,WAAW,CAAC,UAAU,EAAE,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,wBAAwB,WAAW,CAAC,eAAe,EAAE,CAAC,CAAC;IAEnE,IAAI,WAAW,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,kCAAkC,CAAC,CAAC;QAChD,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;YACrD,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,GAAG,CAAC;gBACnC,CAAC,CAAC,aAAa,CAAC,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC;gBAC/C,CAAC,CAAC,KAAK,CAAC;YACV,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,CAAC,MAAM,aAAa,IAAI,CAAC,UAAU,OAAO,IAAI,CAAC,OAAO,OAAO,IAAI,CAAC,OAAO,cAAc,SAAS,YAAY,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACtK,CAAC;IACH,CAAC;IAED,IAAI,WAAW,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,6CAA6C,CAAC,CAAC;QAC3D,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;YACtD,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,CAAC,MAAM,aAAa,IAAI,CAAC,UAAU,OAAO,IAAI,CAAC,OAAO,OAAO,IAAI,CAAC,OAAO,gBAAgB,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjJ,CAAC;IACH,CAAC;IAED,IAA
I,WAAW,CAAC,iBAAiB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7C,OAAO,CAAC,GAAG,CAAC,0BAA0B,WAAW,CAAC,iBAAiB,CAAC,MAAM,IAAI,CAAC,CAAC;QAChF,KAAK,MAAM,EAAE,IAAI,WAAW,CAAC,iBAAiB,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;YAC5D,OAAO,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAC3B,CAAC;QACD,IAAI,WAAW,CAAC,iBAAiB,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAC9C,OAAO,CAAC,GAAG,CAAC,eAAe,WAAW,CAAC,iBAAiB,CAAC,MAAM,GAAG,EAAE,OAAO,CAAC,CAAC;QAC/E,CAAC;IACH,CAAC;IAED,mBAAmB;IACnB,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAC1C,IAAI,UAAU,CAAC,MAAM,EAAE,CAAC;QACtB,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAC1B,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QACzB,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,UAAU,EAAE,CAAC;YACtC,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,sBAAsB,UAAU,EAAE,CAAC,CAAC;IAChD,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IAEvB,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC;QACvB,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAC;IACvB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation framework public API
|
|
3
|
+
* @module agent-threat-rules/eval
|
|
4
|
+
*/
|
|
5
|
+
export { EVAL_CORPUS, getAttackSamples, getBenignSamples, getSamplesByCategory, getSamplesByDifficulty, getCorpusStats } from './corpus.js';
|
|
6
|
+
export type { CorpusSample } from './corpus.js';
|
|
7
|
+
export { computeEvalReport, checkRegression } from './metrics.js';
|
|
8
|
+
export type { SampleResult, ConfusionMatrix, ClassMetrics, LatencyStats, CategoryBreakdown, DifficultyBreakdown, EvalReport, RegressionCheck, BaselineThresholds, } from './metrics.js';
|
|
9
|
+
export { runEval, runEvalCLI } from './eval-harness.js';
|
|
10
|
+
export type { EvalConfig } from './eval-harness.js';
|
|
11
|
+
export { computeRuleQuality } from './rule-metrics.js';
|
|
12
|
+
export type { RuleQuality, RuleQualityReport } from './rule-metrics.js';
|
|
13
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAC5I,YAAY,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAEhD,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAClE,YAAY,EACV,YAAY,EACZ,eAAe,EACf,YAAY,EACZ,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,UAAU,EACV,eAAe,EACf,kBAAkB,GACnB,MAAM,cAAc,CAAC;AAEtB,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACxD,YAAY,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAEpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,YAAY,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation framework public API
|
|
3
|
+
* @module agent-threat-rules/eval
|
|
4
|
+
*/
|
|
5
|
+
export { EVAL_CORPUS, getAttackSamples, getBenignSamples, getSamplesByCategory, getSamplesByDifficulty, getCorpusStats } from './corpus.js';
|
|
6
|
+
export { computeEvalReport, checkRegression } from './metrics.js';
|
|
7
|
+
export { runEval, runEvalCLI } from './eval-harness.js';
|
|
8
|
+
export { computeRuleQuality } from './rule-metrics.js';
|
|
9
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAG5I,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAalE,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAGxD,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Metrics -- computes precision, recall, F1, confusion matrix,
|
|
3
|
+
* per-category breakdowns, and latency percentiles.
|
|
4
|
+
*
|
|
5
|
+
* All functions are pure (no side effects, no mutation).
|
|
6
|
+
*
|
|
7
|
+
* @module agent-threat-rules/eval/metrics
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Outcome of evaluating a single corpus sample.
 *
 * - id: identifier of the sample; collected into the report's missed / false-positive lists.
 * - category: grouping key for the per-category breakdown.
 * - expectedDetection: whether the sample should have been detected.
 * - actualDetection: whether the sample was detected.
 * - matchedRules: presumably the ids of the rules that fired -- TODO confirm against the harness.
 * - confidence: NOTE(review): scale/range is not visible in the metrics code -- confirm.
 * - latencyMs: evaluation latency for this sample; feeds the latency statistics.
 * - difficulty: grouping key for the per-difficulty breakdown.
 * - tier: NOTE(review): not read by the metrics code -- meaning to be confirmed.
 */
export interface SampleResult {
|
|
10
|
+
readonly id: string;
|
|
11
|
+
readonly category: string;
|
|
12
|
+
readonly expectedDetection: boolean;
|
|
13
|
+
readonly actualDetection: boolean;
|
|
14
|
+
readonly matchedRules: readonly string[];
|
|
15
|
+
readonly confidence: number;
|
|
16
|
+
readonly latencyMs: number;
|
|
17
|
+
readonly difficulty: string;
|
|
18
|
+
readonly tier: string;
|
|
19
|
+
}
|
|
20
|
+
/**
 * Binary confusion-matrix counts over a set of sample results.
 *
 * - tp: detection expected and detected.
 * - fp: detection not expected but detected.
 * - tn: detection not expected and not detected.
 * - fn: detection expected but not detected.
 */
export interface ConfusionMatrix {
|
|
21
|
+
readonly tp: number;
|
|
22
|
+
readonly fp: number;
|
|
23
|
+
readonly tn: number;
|
|
24
|
+
readonly fn: number;
|
|
25
|
+
}
|
|
26
|
+
/**
 * Classification metrics derived from a confusion matrix. All ratios are fractions, not percentages.
 *
 * - precision: tp / (tp + fp); 1 when there are no positive predictions.
 * - recall: tp / (tp + fn); 1 when there are no expected detections.
 * - f1: harmonic mean of precision and recall; 0 when both are 0.
 * - accuracy: (tp + tn) / sampleCount; 1 when sampleCount is 0.
 * - fpRate: fp / (fp + tn); 0 when there are no expected-negative samples.
 * - confusion: the confusion matrix the ratios were computed from.
 * - sampleCount: number of samples the metrics cover.
 */
export interface ClassMetrics {
|
|
27
|
+
readonly precision: number;
|
|
28
|
+
readonly recall: number;
|
|
29
|
+
readonly f1: number;
|
|
30
|
+
readonly accuracy: number;
|
|
31
|
+
readonly fpRate: number;
|
|
32
|
+
readonly confusion: ConfusionMatrix;
|
|
33
|
+
readonly sampleCount: number;
|
|
34
|
+
}
|
|
35
|
+
/**
 * Summary of the per-sample `latencyMs` values (same unit as the inputs).
 * Every field is 0 when the result set is empty.
 *
 * - p50 / p95 / p99: 50th, 95th and 99th percentile latency.
 * - mean: arithmetic mean latency.
 * - max: largest observed latency.
 */
export interface LatencyStats {
|
|
36
|
+
readonly p50: number;
|
|
37
|
+
readonly p95: number;
|
|
38
|
+
readonly p99: number;
|
|
39
|
+
readonly mean: number;
|
|
40
|
+
readonly max: number;
|
|
41
|
+
}
|
|
42
|
+
/**
 * Metrics restricted to the samples of one category.
 *
 * - category: the category value shared by the samples.
 * - metrics: classification metrics over those samples only.
 * - missedSamples: ids of samples that were expected to be detected but were not.
 * - falsePositives: ids of samples that were detected although no detection was expected.
 */
export interface CategoryBreakdown {
|
|
43
|
+
readonly category: string;
|
|
44
|
+
readonly metrics: ClassMetrics;
|
|
45
|
+
readonly missedSamples: readonly string[];
|
|
46
|
+
readonly falsePositives: readonly string[];
|
|
47
|
+
}
|
|
48
|
+
/**
 * Metrics restricted to the samples of one difficulty level.
 *
 * - difficulty: the difficulty value shared by the samples.
 * - metrics: classification metrics over those samples only.
 */
export interface DifficultyBreakdown {
|
|
49
|
+
readonly difficulty: string;
|
|
50
|
+
readonly metrics: ClassMetrics;
|
|
51
|
+
}
|
|
52
|
+
/**
 * Full evaluation report produced by computeEvalReport().
 *
 * - timestamp: ISO-8601 string taken when the report was built.
 * - corpusSize: number of sample results the report covers.
 * - overall: metrics over all samples.
 * - latency: latency summary over all samples.
 * - byCategory: one entry per distinct category, in order of first appearance.
 * - byDifficulty: one entry per distinct difficulty, in order of first appearance.
 * - missedAttacks: results that were expected to be detected but were not.
 * - falsePositives: results that were detected although no detection was expected.
 */
export interface EvalReport {
|
|
53
|
+
readonly timestamp: string;
|
|
54
|
+
readonly corpusSize: number;
|
|
55
|
+
readonly overall: ClassMetrics;
|
|
56
|
+
readonly latency: LatencyStats;
|
|
57
|
+
readonly byCategory: readonly CategoryBreakdown[];
|
|
58
|
+
readonly byDifficulty: readonly DifficultyBreakdown[];
|
|
59
|
+
readonly missedAttacks: readonly SampleResult[];
|
|
60
|
+
readonly falsePositives: readonly SampleResult[];
|
|
61
|
+
}
|
|
62
|
+
/**
 * Build an evaluation report (overall metrics, latency summary, per-category and
 * per-difficulty breakdowns, missed attacks and false positives) from sample results.
 *
 * @param results - Per-sample evaluation outcomes; not modified.
 * @returns The aggregated report, stamped with the current time.
 */
export declare function computeEvalReport(results: readonly SampleResult[]): EvalReport;
|
|
63
|
+
/**
 * Result of comparing a report against baseline thresholds.
 *
 * - passed: true exactly when `violations` is empty.
 * - violations: one human-readable message per threshold that was breached.
 */
export interface RegressionCheck {
|
|
64
|
+
readonly passed: boolean;
|
|
65
|
+
readonly violations: readonly string[];
|
|
66
|
+
}
|
|
67
|
+
/**
 * Limits a report must satisfy to pass checkRegression(). Ratios are fractions (e.g. 0.05 = 5%).
 *
 * - minRecall: overall recall below this value is a violation.
 * - maxFpRate: overall false-positive rate above this value is a violation.
 * - minF1: overall F1 below this value is a violation.
 * - maxP95LatencyMs: p95 latency above this value is a violation.
 */
export interface BaselineThresholds {
|
|
68
|
+
readonly minRecall: number;
|
|
69
|
+
readonly maxFpRate: number;
|
|
70
|
+
readonly minF1: number;
|
|
71
|
+
readonly maxP95LatencyMs: number;
|
|
72
|
+
}
|
|
73
|
+
/**
 * Compare a report's overall recall, false-positive rate, F1 and p95 latency against thresholds.
 *
 * @param report - Report to check.
 * @param thresholds - Limits to enforce; when omitted the built-in defaults are used
 *   (minRecall 0.60, maxFpRate 0.05, minF1 0.70, maxP95LatencyMs 50).
 * @returns Whether every limit held, plus a message for each one that did not.
 */
export declare function checkRegression(report: EvalReport, thresholds?: BaselineThresholds): RegressionCheck;
|
|
74
|
+
//# sourceMappingURL=metrics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metrics.d.ts","sourceRoot":"","sources":["../../src/eval/metrics.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IACpC,QAAQ,CAAC,eAAe,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,YAAY,EAAE,SAAS,MAAM,EAAE,CAAC;IACzC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,eAAe,CAAC;IACpC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC;IAC/B,QAAQ,CAAC,aAAa,EAAE,SAAS,MAAM,EAAE,CAAC;IAC1C,QAAQ,CAAC,cAAc,EAAE,SAAS,MAAM,EAAE,CAAC;CAC5C;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC;CAChC;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC;IAC/B,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC;IAC/B,QAAQ,CAAC,UAAU,EAAE,SAAS,iBAAiB,EAAE,CAAC;IAClD,QAAQ,CAAC,YAAY,EAAE,SAAS,mBAAmB,EAAE,CAAC;IACtD,QAAQ,CAAC,aAAa,EAAE,SAAS,YAAY,EAAE,CAAC;IAChD,QAAQ,CAAC,cAAc,EAAE,SAAS,YAAY,EAAE,CAAC;CAClD;AAkDD,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,SAAS,YAAY,EAAE,GAAG,UAAU,CAsC9E;AAMD,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,UAAU,EAAE,SAAS,MAAM,EAAE,CAAC;CACxC;AAED,MAAM,WAAW,kBAAkB;IACjC,Q
AAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;CAClC;AASD,wBAAgB,eAAe,CAC7B,MAAM,EAAE,UAAU,EAClB,UAAU,GAAE,kBAAuC,GAClD,eAAe,CA4BjB"}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Metrics -- computes precision, recall, F1, confusion matrix,
|
|
3
|
+
* per-category breakdowns, and latency percentiles.
|
|
4
|
+
*
|
|
5
|
+
* All functions are pure (no side effects, no mutation).
|
|
6
|
+
*
|
|
7
|
+
* @module agent-threat-rules/eval/metrics
|
|
8
|
+
*/
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
// Core metric calculations
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
function buildConfusionMatrix(results) {
|
|
13
|
+
let tp = 0, fp = 0, tn = 0, fn = 0;
|
|
14
|
+
for (const r of results) {
|
|
15
|
+
if (r.expectedDetection && r.actualDetection)
|
|
16
|
+
tp++;
|
|
17
|
+
else if (!r.expectedDetection && r.actualDetection)
|
|
18
|
+
fp++;
|
|
19
|
+
else if (!r.expectedDetection && !r.actualDetection)
|
|
20
|
+
tn++;
|
|
21
|
+
else
|
|
22
|
+
fn++;
|
|
23
|
+
}
|
|
24
|
+
return { tp, fp, tn, fn };
|
|
25
|
+
}
|
|
26
|
+
/**
 * Derive precision, recall, F1, accuracy and false-positive rate from a confusion matrix.
 *
 * Ratios whose denominator is not positive fall back to a fixed value:
 * precision, recall and accuracy to 1, F1 and the false-positive rate to 0.
 *
 * @param cm - Confusion matrix `{ tp, fp, tn, fn }`; returned by reference as `confusion`.
 * @param sampleCount - Number of samples the matrix covers (denominator for accuracy).
 * @returns The metrics together with `confusion` and `sampleCount`.
 */
function computeClassMetrics(cm, sampleCount) {
    // Guarded division: use `fallback` when the denominator is not positive.
    const ratio = (numerator, denominator, fallback) => (denominator > 0 ? numerator / denominator : fallback);
    const { tp, fp, tn, fn } = cm;
    const precision = ratio(tp, tp + fp, 1);
    const recall = ratio(tp, tp + fn, 1);
    const f1 = ratio(2 * precision * recall, precision + recall, 0);
    const accuracy = ratio(tp + tn, sampleCount, 1);
    const fpRate = ratio(fp, fp + tn, 0);
    return { precision, recall, f1, accuracy, fpRate, confusion: cm, sampleCount };
}
|
|
34
|
+
function computeLatency(results) {
|
|
35
|
+
if (results.length === 0) {
|
|
36
|
+
return { p50: 0, p95: 0, p99: 0, mean: 0, max: 0 };
|
|
37
|
+
}
|
|
38
|
+
const sorted = [...results].map((r) => r.latencyMs).sort((a, b) => a - b);
|
|
39
|
+
const len = sorted.length;
|
|
40
|
+
return {
|
|
41
|
+
p50: sorted[Math.floor(len * 0.5)] ?? 0,
|
|
42
|
+
p95: sorted[Math.floor(len * 0.95)] ?? 0,
|
|
43
|
+
p99: sorted[Math.floor(len * 0.99)] ?? 0,
|
|
44
|
+
mean: sorted.reduce((a, b) => a + b, 0) / len,
|
|
45
|
+
max: sorted[len - 1] ?? 0,
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
// Report generation
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
/**
 * Aggregate per-sample results into an evaluation report.
 *
 * Produces overall metrics, a latency summary, one breakdown per distinct category and
 * per distinct difficulty (each in order of first appearance), and the lists of missed
 * attacks and false positives.
 *
 * @param results - Per-sample evaluation outcomes; not modified.
 * @returns The report, with `timestamp` set to the current time as an ISO string.
 */
export function computeEvalReport(results) {
    // Distinct values of a field, in order of first appearance.
    const distinct = (pick) => Array.from(new Set(results.map(pick)));
    const metricsFor = (subset) => computeClassMetrics(buildConfusionMatrix(subset), subset.length);
    const isMiss = (r) => r.expectedDetection && !r.actualDetection;
    const isFalseAlarm = (r) => !r.expectedDetection && r.actualDetection;
    const idOf = (r) => r.id;

    const byCategory = distinct((r) => r.category).map((category) => {
        const subset = results.filter((r) => r.category === category);
        return {
            category,
            metrics: metricsFor(subset),
            missedSamples: subset.filter(isMiss).map(idOf),
            falsePositives: subset.filter(isFalseAlarm).map(idOf),
        };
    });

    const byDifficulty = distinct((r) => r.difficulty).map((difficulty) => {
        const subset = results.filter((r) => r.difficulty === difficulty);
        return { difficulty, metrics: metricsFor(subset) };
    });

    return {
        timestamp: new Date().toISOString(),
        corpusSize: results.length,
        overall: metricsFor(results),
        latency: computeLatency(results),
        byCategory,
        byDifficulty,
        missedAttacks: results.filter(isMiss),
        falsePositives: results.filter(isFalseAlarm),
    };
}
|
|
86
|
+
const DEFAULT_THRESHOLDS = {
|
|
87
|
+
minRecall: 0.60,
|
|
88
|
+
maxFpRate: 0.05,
|
|
89
|
+
minF1: 0.70,
|
|
90
|
+
maxP95LatencyMs: 50,
|
|
91
|
+
};
|
|
92
|
+
export function checkRegression(report, thresholds = DEFAULT_THRESHOLDS) {
|
|
93
|
+
const violations = [];
|
|
94
|
+
if (report.overall.recall < thresholds.minRecall) {
|
|
95
|
+
violations.push(`Recall ${(report.overall.recall * 100).toFixed(1)}% < minimum ${(thresholds.minRecall * 100).toFixed(1)}%`);
|
|
96
|
+
}
|
|
97
|
+
if (report.overall.fpRate > thresholds.maxFpRate) {
|
|
98
|
+
violations.push(`FP rate ${(report.overall.fpRate * 100).toFixed(3)}% > maximum ${(thresholds.maxFpRate * 100).toFixed(3)}%`);
|
|
99
|
+
}
|
|
100
|
+
if (report.overall.f1 < thresholds.minF1) {
|
|
101
|
+
violations.push(`F1 ${(report.overall.f1 * 100).toFixed(1)}% < minimum ${(thresholds.minF1 * 100).toFixed(1)}%`);
|
|
102
|
+
}
|
|
103
|
+
if (report.latency.p95 > thresholds.maxP95LatencyMs) {
|
|
104
|
+
violations.push(`P95 latency ${report.latency.p95.toFixed(1)}ms > maximum ${thresholds.maxP95LatencyMs}ms`);
|
|
105
|
+
}
|
|
106
|
+
return { passed: violations.length === 0, violations };
|
|
107
|
+
}
|
|
108
|
+
//# sourceMappingURL=metrics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metrics.js","sourceRoot":"","sources":["../../src/eval/metrics.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AA8DH,8EAA8E;AAC9E,2BAA2B;AAC3B,8EAA8E;AAE9E,SAAS,oBAAoB,CAAC,OAAgC;IAC5D,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;IAEnC,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,eAAe;YAAE,EAAE,EAAE,CAAC;aAC9C,IAAI,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,eAAe;YAAE,EAAE,EAAE,CAAC;aACpD,IAAI,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,eAAe;YAAE,EAAE,EAAE,CAAC;;YACrD,EAAE,EAAE,CAAC;IACZ,CAAC;IAED,OAAO,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,CAAC;AAC5B,CAAC;AAED,SAAS,mBAAmB,CAAC,EAAmB,EAAE,WAAmB;IACnE,MAAM,SAAS,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAClE,MAAM,MAAM,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/D,MAAM,EAAE,GAAG,SAAS,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,SAAS,GAAG,MAAM,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACtF,MAAM,QAAQ,GAAG,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IACrE,MAAM,MAAM,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAE/D,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,SAAS,EAAE,EAAE,EAAE,WAAW,EAAE,CAAC;AACjF,CAAC;AAED,SAAS,cAAc,CAAC,OAAgC;IACtD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC;IACrD,CAAC;IAED,MAAM,MAAM,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1E,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM
,CAAC;IAE1B,OAAO;QACL,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC;QACvC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC;QACxC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC;QACxC,IAAI,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,GAAG;QAC7C,GAAG,EAAE,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC;KAC1B,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,oBAAoB;AACpB,8EAA8E;AAE9E,MAAM,UAAU,iBAAiB,CAAC,OAAgC;IAChE,MAAM,SAAS,GAAG,oBAAoB,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,mBAAmB,CAAC,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;IAC/D,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IAExC,cAAc;IACd,MAAM,UAAU,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;IAChE,MAAM,UAAU,GAAwB,UAAU,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;QAC7D,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,GAAG,CAAC,CAAC;QAC7D,MAAM,EAAE,GAAG,oBAAoB,CAAC,UAAU,CAAC,CAAC;QAC5C,MAAM,OAAO,GAAG,mBAAmB,CAAC,EAAE,EAAE,UAAU,CAAC,MAAM,CAAC,CAAC;QAC3D,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACpG,MAAM,GAAG,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACjG,OAAO,EAAE,QAAQ,EAAE,GAAG,EAAE,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,cAAc,EAAE,GAAG,EAAE,CAAC;IAChF,CAAC,CAAC,CAAC;IAEH,gBAAgB;IAChB,MAAM,YAAY,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;IACpE,MAAM,YAAY,GAA0B,YAAY,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACpE,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,IAAI,CAAC,CAAC;QACjE,MAAM,EAAE,GAAG,oBAAoB,CAAC,WAAW,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,mBAAmB,CAAC,EAAE,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC;QAC5D,OAAO,EAAE,UAA
U,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC;IACvF,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,eAAe,CAAC,CAAC;IAExF,OAAO;QACL,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,UAAU,EAAE,OAAO,CAAC,MAAM;QAC1B,OAAO;QACP,OAAO;QACP,UAAU;QACV,YAAY;QACZ,aAAa;QACb,cAAc;KACf,CAAC;AACJ,CAAC;AAkBD,MAAM,kBAAkB,GAAuB;IAC7C,SAAS,EAAE,IAAI;IACf,SAAS,EAAE,IAAI;IACf,KAAK,EAAE,IAAI;IACX,eAAe,EAAE,EAAE;CACpB,CAAC;AAEF,MAAM,UAAU,eAAe,CAC7B,MAAkB,EAClB,aAAiC,kBAAkB;IAEnD,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,UAAU,CAAC,SAAS,EAAE,CAAC;QACjD,UAAU,CAAC,IAAI,CACb,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,UAAU,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAC5G,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,UAAU,CAAC,SAAS,EAAE,CAAC;QACjD,UAAU,CAAC,IAAI,CACb,WAAW,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,UAAU,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAC7G,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,EAAE,GAAG,UAAU,CAAC,KAAK,EAAE,CAAC;QACzC,UAAU,CAAC,IAAI,CACb,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,UAAU,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAChG,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,GAAG,UAAU,CAAC,eAAe,EAAE,CAAC;QACpD,UAAU,CAAC,IAAI,CACb,eAAe,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,gBAAgB,UAAU,CAAC,eAAe,IAAI,CAC3F,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;AACzD,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PINT Benchmark Corpus Loader
|
|
3
|
+
*
|
|
4
|
+
* Reads the PINT-format dataset (JSON with text/category/label/source/language)
|
|
5
|
+
* built from publicly available prompt injection datasets:
|
|
6
|
+
* - deepset/prompt-injections (HuggingFace)
|
|
7
|
+
* - Lakera/gandalf_ignore_instructions (HuggingFace)
|
|
8
|
+
*
|
|
9
|
+
* Converts each sample into the CorpusSample interface used by the ATR eval
|
|
10
|
+
* harness, allowing the PINT corpus to be evaluated alongside or instead of
|
|
11
|
+
* the built-in hand-crafted corpus.
|
|
12
|
+
*
|
|
13
|
+
* @module agent-threat-rules/eval/pint-corpus
|
|
14
|
+
*/
|
|
15
|
+
import type { CorpusSample } from './corpus.js';
|
|
16
|
+
/**
|
|
17
|
+
* Load the PINT benchmark corpus from a JSON file on disk.
|
|
18
|
+
*
|
|
19
|
+
* @param dataPath - Absolute path to pint-corpus.json
|
|
20
|
+
* @returns Readonly array of CorpusSample for use with runEval()
* NOTE(review): behaviour for a missing file or malformed JSON is not visible from this declaration -- confirm against the implementation.
|
|
21
|
+
*/
|
|
22
|
+
export declare function loadPintCorpus(dataPath: string): readonly CorpusSample[];
|
|
23
|
+
/**
|
|
24
|
+
* Get summary statistics for the loaded PINT corpus.
*
* @param corpus - Samples to summarise (e.g. the array returned by loadPintCorpus).
* @returns Summary object. NOTE(review): from the declared shape, `total` / `attacks` / `benign`
* look like sample counts and the `by*` records like per-key counts -- confirm against the
* implementation, which is not visible here.
|
|
25
|
+
*/
|
|
26
|
+
export declare function getPintCorpusStats(corpus: readonly CorpusSample[]): {
|
|
27
|
+
readonly total: number;
|
|
28
|
+
readonly attacks: number;
|
|
29
|
+
readonly benign: number;
|
|
30
|
+
readonly byCategory: Readonly<Record<string, number>>;
|
|
31
|
+
readonly byDifficulty: Readonly<Record<string, number>>;
|
|
32
|
+
readonly byLanguage: Readonly<Record<string, number>>;
|
|
33
|
+
};
|
|
34
|
+
//# sourceMappingURL=pint-corpus.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pint-corpus.d.ts","sourceRoot":"","sources":["../../src/eval/pint-corpus.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAGH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAsDhD;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,SAAS,YAAY,EAAE,CAyCxE;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,SAAS,YAAY,EAAE,GAAG;IACnE,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,UAAU,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACtD,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACxD,QAAQ,CAAC,UAAU,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;CACvD,CAoBA"}
|