agent-threat-rules 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +152 -642
- package/dist/capability-extractor.d.ts +35 -0
- package/dist/capability-extractor.d.ts.map +1 -0
- package/dist/capability-extractor.js +91 -0
- package/dist/capability-extractor.js.map +1 -0
- package/dist/cli.js +56 -2
- package/dist/cli.js.map +1 -1
- package/dist/converters/elastic.d.ts +36 -0
- package/dist/converters/elastic.d.ts.map +1 -0
- package/dist/converters/elastic.js +125 -0
- package/dist/converters/elastic.js.map +1 -0
- package/dist/converters/index.d.ts +28 -0
- package/dist/converters/index.d.ts.map +1 -0
- package/dist/converters/index.js +36 -0
- package/dist/converters/index.js.map +1 -0
- package/dist/converters/splunk.d.ts +19 -0
- package/dist/converters/splunk.d.ts.map +1 -0
- package/dist/converters/splunk.js +148 -0
- package/dist/converters/splunk.js.map +1 -0
- package/dist/embedding/build-corpus.d.ts +15 -0
- package/dist/embedding/build-corpus.d.ts.map +1 -0
- package/dist/embedding/build-corpus.js +105 -0
- package/dist/embedding/build-corpus.js.map +1 -0
- package/dist/embedding/model-loader.d.ts +41 -0
- package/dist/embedding/model-loader.d.ts.map +1 -0
- package/dist/embedding/model-loader.js +90 -0
- package/dist/embedding/model-loader.js.map +1 -0
- package/dist/embedding/vector-store.d.ts +41 -0
- package/dist/embedding/vector-store.d.ts.map +1 -0
- package/dist/embedding/vector-store.js +70 -0
- package/dist/embedding/vector-store.js.map +1 -0
- package/dist/engine.d.ts +23 -20
- package/dist/engine.d.ts.map +1 -1
- package/dist/engine.js +173 -24
- package/dist/engine.js.map +1 -1
- package/dist/eval/corpus.d.ts +42 -0
- package/dist/eval/corpus.d.ts.map +1 -0
- package/dist/eval/corpus.js +427 -0
- package/dist/eval/corpus.js.map +1 -0
- package/dist/eval/eval-harness.d.ts +44 -0
- package/dist/eval/eval-harness.d.ts.map +1 -0
- package/dist/eval/eval-harness.js +296 -0
- package/dist/eval/eval-harness.js.map +1 -0
- package/dist/eval/index.d.ts +13 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +9 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/metrics.d.ts +74 -0
- package/dist/eval/metrics.d.ts.map +1 -0
- package/dist/eval/metrics.js +108 -0
- package/dist/eval/metrics.js.map +1 -0
- package/dist/eval/pint-corpus.d.ts +34 -0
- package/dist/eval/pint-corpus.d.ts.map +1 -0
- package/dist/eval/pint-corpus.js +109 -0
- package/dist/eval/pint-corpus.js.map +1 -0
- package/dist/eval/rule-corpus.d.ts +9 -0
- package/dist/eval/rule-corpus.d.ts.map +1 -0
- package/dist/eval/rule-corpus.js +4780 -0
- package/dist/eval/rule-corpus.js.map +1 -0
- package/dist/eval/rule-metrics.d.ts +34 -0
- package/dist/eval/rule-metrics.d.ts.map +1 -0
- package/dist/eval/rule-metrics.js +92 -0
- package/dist/eval/rule-metrics.js.map +1 -0
- package/dist/eval/run-eval.d.ts +7 -0
- package/dist/eval/run-eval.d.ts.map +1 -0
- package/dist/eval/run-eval.js +11 -0
- package/dist/eval/run-eval.js.map +1 -0
- package/dist/eval/run-pint-benchmark.d.ts +18 -0
- package/dist/eval/run-pint-benchmark.d.ts.map +1 -0
- package/dist/eval/run-pint-benchmark.js +157 -0
- package/dist/eval/run-pint-benchmark.js.map +1 -0
- package/dist/flywheel.d.ts +54 -0
- package/dist/flywheel.d.ts.map +1 -0
- package/dist/flywheel.js +121 -0
- package/dist/flywheel.js.map +1 -0
- package/dist/index.d.ts +21 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +19 -2
- package/dist/index.js.map +1 -1
- package/dist/modules/embedding.d.ts +71 -0
- package/dist/modules/embedding.d.ts.map +1 -0
- package/dist/modules/embedding.js +141 -0
- package/dist/modules/embedding.js.map +1 -0
- package/dist/modules/semantic.d.ts +1 -0
- package/dist/modules/semantic.d.ts.map +1 -1
- package/dist/modules/semantic.js +77 -1
- package/dist/modules/semantic.js.map +1 -1
- package/dist/rule-scaffolder.d.ts +14 -0
- package/dist/rule-scaffolder.d.ts.map +1 -1
- package/dist/rule-scaffolder.js +123 -6
- package/dist/rule-scaffolder.js.map +1 -1
- package/dist/session-tracker.d.ts +2 -0
- package/dist/session-tracker.d.ts.map +1 -1
- package/dist/session-tracker.js +1 -0
- package/dist/session-tracker.js.map +1 -1
- package/dist/shadow-evaluator.d.ts +48 -0
- package/dist/shadow-evaluator.d.ts.map +1 -0
- package/dist/shadow-evaluator.js +128 -0
- package/dist/shadow-evaluator.js.map +1 -0
- package/dist/skill-fingerprint.d.ts.map +1 -1
- package/dist/skill-fingerprint.js +10 -52
- package/dist/skill-fingerprint.js.map +1 -1
- package/dist/tier0-invariant.d.ts +49 -0
- package/dist/tier0-invariant.d.ts.map +1 -0
- package/dist/tier0-invariant.js +184 -0
- package/dist/tier0-invariant.js.map +1 -0
- package/dist/tier1-blacklist.d.ts +48 -0
- package/dist/tier1-blacklist.d.ts.map +1 -0
- package/dist/tier1-blacklist.js +91 -0
- package/dist/tier1-blacklist.js.map +1 -0
- package/package.json +7 -1
- package/rules/agent-manipulation/ATR-2026-108-consensus-sybil-attack.yaml +103 -0
- package/rules/context-exfiltration/ATR-2026-102-disguised-analytics-exfiltration.yaml +69 -0
- package/rules/privilege-escalation/ATR-2026-107-delayed-execution-bypass.yaml +67 -0
- package/rules/prompt-injection/ATR-2026-001-direct-prompt-injection.yaml +181 -94
- package/rules/prompt-injection/ATR-2026-003-jailbreak-attempt.yaml +23 -12
- package/rules/prompt-injection/ATR-2026-004-system-prompt-override.yaml +3 -3
- package/rules/prompt-injection/ATR-2026-081-semantic-multi-turn.yaml +2 -2
- package/rules/prompt-injection/ATR-2026-093-gradual-escalation.yaml +1 -1
- package/rules/prompt-injection/ATR-2026-104-persona-hijacking.yaml +72 -0
- package/rules/tool-poisoning/ATR-2026-100-consent-bypass-instruction.yaml +80 -0
- package/rules/tool-poisoning/ATR-2026-101-trust-escalation-override.yaml +66 -0
- package/rules/tool-poisoning/ATR-2026-103-hidden-safety-bypass-instruction.yaml +71 -0
- package/rules/tool-poisoning/ATR-2026-105-silent-action-concealment.yaml +67 -0
- package/rules/tool-poisoning/ATR-2026-106-schema-description-contradiction.yaml +66 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Harness -- orchestrates running the corpus through the ATR engine
|
|
3
|
+
* and produces a structured EvalReport.
|
|
4
|
+
*
|
|
5
|
+
* Supports:
|
|
6
|
+
* - Regex-only evaluation (Tier 2)
|
|
7
|
+
* - Regex + Embedding evaluation (Tier 2 + 2.5)
|
|
8
|
+
* - Full pipeline evaluation (all tiers)
|
|
9
|
+
* - Per-sample latency measurement
|
|
10
|
+
* - Regression check against baseline thresholds
|
|
11
|
+
*
|
|
12
|
+
* @module agent-threat-rules/eval/eval-harness
|
|
13
|
+
*/
|
|
14
|
+
import { resolve, join } from 'node:path';
|
|
15
|
+
import { existsSync, readFileSync, writeFileSync } from 'node:fs';
|
|
16
|
+
import { ATREngine } from '../engine.js';
|
|
17
|
+
import { EmbeddingModule } from '../modules/embedding.js';
|
|
18
|
+
import { EVAL_CORPUS, getCorpusStats } from './corpus.js';
|
|
19
|
+
import { computeEvalReport, checkRegression } from './metrics.js';
|
|
20
|
+
import { computeRuleQuality } from './rule-metrics.js';
|
|
21
|
+
/**
 * Build the event object handed to the engine from one corpus sample.
 *
 * The sample's `eventType`, `text` and `fields` become the event's `type`,
 * `content` and `fields`; `timestamp` is the ISO string for the moment the
 * conversion runs.
 */
function sampleToEvent(sample) {
    const { eventType, text } = sample;
    const timestamp = new Date().toISOString();
    return { type: eventType, content: text, timestamp, fields: sample.fields };
}
|
|
32
|
+
/**
 * Run a single sample through the engine (regex only) and measure results.
 *
 * Errors thrown while evaluating are caught so that one bad sample does not
 * abort the whole run; such a sample is recorded as "not detected" with no
 * matched rules, zero confidence and zero latency.
 *
 * @param engine - Object exposing a synchronous `evaluate(event)` that
 *   returns an array of matches, each carrying `rule.id` and `confidence`.
 * @param sample - Corpus sample; `id`, `category`, `expectedDetection`,
 *   `difficulty` and `tier` are copied into the result unchanged.
 * @returns A result record with the detection outcome, the ids of all
 *   matched rules, the highest confidence among the matches (0 when there
 *   are none) and the elapsed time of the `evaluate` call in milliseconds.
 */
function evaluateSampleRegex(engine, sample) {
    const event = sampleToEvent(sample);
    try {
        const start = performance.now();
        const matches = engine.evaluate(event);
        const latencyMs = performance.now() - start;
        // Report the strongest confidence across all matches instead of the
        // confidence of whichever match is first in the array. This keeps the
        // regex-only path comparable with the full pipeline, which reports
        // `verdict.highestConfidence`.
        const confidence = matches.reduce((best, m) => Math.max(best, m.confidence ?? 0), 0);
        return {
            id: sample.id,
            category: sample.category,
            expectedDetection: sample.expectedDetection,
            actualDetection: matches.length > 0,
            matchedRules: matches.map((m) => m.rule.id),
            confidence,
            latencyMs,
            difficulty: sample.difficulty,
            tier: sample.tier,
        };
    }
    catch {
        // Deliberate best-effort: a failing sample counts as a miss.
        return {
            id: sample.id,
            category: sample.category,
            expectedDetection: sample.expectedDetection,
            actualDetection: false,
            matchedRules: [],
            confidence: 0,
            latencyMs: 0,
            difficulty: sample.difficulty,
            tier: sample.tier,
        };
    }
}
|
|
70
|
+
/**
 * Evaluate one sample with `engine.evaluateWithVerdict` (regex + embedding)
 * and record the outcome together with the elapsed time.
 *
 * If anything throws while evaluating, the sample is recorded as a miss:
 * not detected, no matched rules, confidence 0 and latency 0.
 */
async function evaluateSampleFull(engine, sample) {
    const event = sampleToEvent(sample);
    // Defaults describe the failure outcome; they are overwritten only after
    // the whole evaluation has succeeded.
    let actualDetection = false;
    let matchedRules = [];
    let confidence = 0;
    let latencyMs = 0;
    try {
        const startedAt = performance.now();
        const { verdict } = await engine.evaluateWithVerdict(event);
        const elapsed = performance.now() - startedAt;
        const hit = verdict.matchCount > 0;
        const ruleIds = verdict.matches.map((match) => match.rule.id);
        const top = verdict.highestConfidence;
        actualDetection = hit;
        matchedRules = ruleIds;
        confidence = top;
        latencyMs = elapsed;
    }
    catch {
        // Keep the failure defaults set above.
    }
    return {
        id: sample.id,
        category: sample.category,
        expectedDetection: sample.expectedDetection,
        actualDetection,
        matchedRules,
        confidence,
        latencyMs,
        difficulty: sample.difficulty,
        tier: sample.tier,
    };
}
|
|
106
|
+
/**
 * Attempt to construct and initialize an EmbeddingModule from the JSON file
 * at `embeddingsPath`.
 *
 * Resolves to the module when the file exists, parses, initializes and
 * reports itself available; resolves to null in every other case, including
 * any error thrown along the way.
 */
async function tryLoadEmbedding(embeddingsPath) {
    if (existsSync(embeddingsPath)) {
        try {
            const raw = readFileSync(embeddingsPath, 'utf-8');
            const embedder = new EmbeddingModule({
                attackVectorsData: JSON.parse(raw),
                similarityThreshold: 0.65,
            });
            await embedder.initialize();
            if (embedder.isAvailable()) {
                return embedder;
            }
        }
        catch {
            // Unreadable file, invalid JSON or failed initialization: fall
            // through and report the module as unavailable.
        }
    }
    return null;
}
|
|
125
|
+
/**
 * Run the full evaluation harness.
 *
 * Steps: optionally load the embedding module, load rules into an ATREngine,
 * evaluate every corpus sample, then compute the report, the regression
 * check and the per-rule quality summary. When `config.outputPath` is set,
 * the combined results are also written there as pretty-printed JSON.
 *
 * @param config - Evaluation settings. Fields read here: `rulesDir`,
 *   `corpus` (defaults to EVAL_CORPUS), `embeddingsPath` (defaults to
 *   `<rulesDir>/../data/attack-embeddings.json`), `enableEmbedding`
 *   (embedding is attempted unless this is exactly `false`), `thresholds`
 *   and `outputPath`.
 * @returns `{ report, regression, corpusStats, tiersUsed, ruleQuality }`.
 * @throws Error when the engine loads zero rules from `config.rulesDir`.
 */
export async function runEval(config) {
    const corpus = config.corpus ?? EVAL_CORPUS;
    const base = resolve(config.rulesDir, '..');
    const embeddingsPath = config.embeddingsPath ?? join(base, 'data', 'attack-embeddings.json');
    // Try to load embedding module
    const shouldEmbed = config.enableEmbedding !== false;
    let embeddingModule = null;
    const tiersUsed = ['tier2-regex'];
    if (shouldEmbed) {
        embeddingModule = await tryLoadEmbedding(embeddingsPath);
        if (embeddingModule) {
            tiersUsed.push('tier2.5-embedding');
        }
    }
    const results = [];
    let engine;
    let ruleCount;
    try {
        // Initialize engine
        engine = new ATREngine({
            rulesDir: config.rulesDir,
            embeddingModule: embeddingModule ?? undefined,
        });
        ruleCount = await engine.loadRules();
        if (ruleCount === 0) {
            throw new Error(`No rules loaded from ${config.rulesDir}`);
        }
        // Run all samples; the full pipeline is used only when the embedding
        // module actually loaded.
        const useFullPipeline = embeddingModule !== null;
        for (const sample of corpus) {
            const result = useFullPipeline
                ? await evaluateSampleFull(engine, sample)
                : evaluateSampleRegex(engine, sample);
            results.push(result);
        }
    }
    finally {
        // Cleanup embedding module. Done in `finally` so the module is also
        // released when engine construction or rule loading fails.
        if (embeddingModule) {
            await embeddingModule.destroy();
        }
    }
    // Compute report
    const report = computeEvalReport(results);
    const regression = checkRegression(report, config.thresholds);
    // NOTE(review): getCorpusStats() receives no arguments, so it presumably
    // describes the built-in corpus even when `config.corpus` overrides it --
    // confirm against corpus.js.
    const corpusStats = getCorpusStats();
    // Compute per-rule quality
    const loadedRuleIds = engine.getRules().map((r) => r.id);
    const ruleQuality = computeRuleQuality(results, loadedRuleIds);
    // Save report if output path specified
    if (config.outputPath) {
        const output = {
            report,
            regression,
            corpusStats,
            ruleQuality,
            ruleCount,
            engine: 'ATREngine',
            tiers: tiersUsed,
        };
        writeFileSync(config.outputPath, JSON.stringify(output, null, 2));
    }
    return { report, regression, corpusStats, tiersUsed, ruleQuality };
}
|
|
187
|
+
// ---------------------------------------------------------------------------
|
|
188
|
+
// CLI entry point
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
/** Multiply `n` by 100 and render it with one decimal place and a trailing '%'. */
function formatPercent(n) {
    const scaled = n * 100;
    return scaled.toFixed(1) + '%';
}
|
|
193
|
+
/** Render `n` with two decimal places followed by the unit suffix 'ms'. */
function formatMs(n) {
    const fixed = n.toFixed(2);
    return fixed + 'ms';
}
|
|
196
|
+
/**
 * CLI entry point: run the evaluation with default paths and print a
 * human-readable summary to stdout.
 *
 * Rules are read from `<base>/rules` and the JSON report is written to
 * `<base>/data/eval-report.json`, where `<base>` is two directories above
 * `import.meta.dirname` (or above '.' when that is undefined).
 * Sets `process.exitCode` to 1 when the regression check did not pass.
 */
export async function runEvalCLI() {
    const here = import.meta.dirname ?? '.';
    const base = resolve(join(here, '..', '..'));
    const rulesDir = join(base, 'rules');
    const outputPath = join(base, 'data', 'eval-report.json');
    // Small printing helpers local to the CLI.
    const say = (line) => console.log(line);
    const heading = (title) => say(`\n--- ${title} ---`);
    const scoreLine = (m) => `recall=${formatPercent(m.recall)} precision=${formatPercent(m.precision)} f1=${formatPercent(m.f1)}`;
    const ruleCounts = (rule) => ` ${rule.ruleId}: matches=${rule.matchCount} TP=${rule.tpCount} FP=${rule.fpCount}`;
    say('\n=== ATR Evaluation Harness ===\n');
    const { report, regression, corpusStats, tiersUsed, ruleQuality } = await runEval({
        rulesDir,
        outputPath,
    });
    // Corpus stats
    say(`Corpus: ${corpusStats.total} samples (${corpusStats.attacks} attacks, ${corpusStats.benign} benign)`);
    say(`Categories: ${Object.keys(corpusStats.byCategory).join(', ')}`);
    say(`Tiers: ${tiersUsed.join(' + ')}`);
    // Overall metrics
    const overall = report.overall;
    heading('Overall');
    say(` Precision: ${formatPercent(overall.precision)}`);
    say(` Recall: ${formatPercent(overall.recall)}`);
    say(` F1: ${formatPercent(overall.f1)}`);
    say(` Accuracy: ${formatPercent(overall.accuracy)}`);
    say(` FP Rate: ${formatPercent(overall.fpRate)}`);
    say(` Confusion: TP=${overall.confusion.tp} FP=${overall.confusion.fp} TN=${overall.confusion.tn} FN=${overall.confusion.fn}`);
    // Latency
    const latency = report.latency;
    heading('Latency');
    say(` P50: ${formatMs(latency.p50)}`);
    say(` P95: ${formatMs(latency.p95)}`);
    say(` P99: ${formatMs(latency.p99)}`);
    say(` Mean: ${formatMs(latency.mean)}`);
    say(` Max: ${formatMs(latency.max)}`);
    // Per category
    heading('By Category');
    for (const cat of report.byCategory) {
        let missed = '';
        if (cat.missedSamples.length > 0) {
            missed = ` (missed: ${cat.missedSamples.join(', ')})`;
        }
        let fps = '';
        if (cat.falsePositives.length > 0) {
            fps = ` (FP: ${cat.falsePositives.join(', ')})`;
        }
        say(` ${cat.category}: ${scoreLine(cat.metrics)}${missed}${fps}`);
    }
    // Per difficulty
    heading('By Difficulty');
    for (const diff of report.byDifficulty) {
        say(` ${diff.difficulty}: ${scoreLine(diff.metrics)}`);
    }
    // Missed attacks
    if (report.missedAttacks.length > 0) {
        heading(`Missed Attacks (${report.missedAttacks.length})`);
        for (const m of report.missedAttacks) {
            say(` [${m.id}] ${m.category}/${m.difficulty}/${m.tier}`);
        }
    }
    // False positives
    if (report.falsePositives.length > 0) {
        heading(`False Positives (${report.falsePositives.length})`);
        for (const fp of report.falsePositives) {
            say(` [${fp.id}] rules: ${fp.matchedRules.join(', ')}`);
        }
    }
    // Rule quality
    heading('Rule Quality');
    say(` Total rules loaded: ${ruleQuality.totalRulesEvaluated}`);
    say(` Rules fired: ${ruleQuality.rulesFired}`);
    say(` Rules never fired: ${ruleQuality.rulesNeverFired}`);
    if (ruleQuality.topRules.length > 0) {
        say(`\n Top 10 rules by match count:`);
        for (const rule of ruleQuality.topRules.slice(0, 10)) {
            let precision = 'N/A';
            if (rule.matchCount > 0) {
                precision = formatPercent(rule.tpCount / rule.matchCount);
            }
            say(`${ruleCounts(rule)} precision=${precision} avgConf=${rule.avgConfidence.toFixed(2)}`);
        }
    }
    if (ruleQuality.weakRules.length > 0) {
        say(`\n Weak rules (FP > 0 or matchCount <= 1):`);
        for (const rule of ruleQuality.weakRules.slice(0, 10)) {
            say(`${ruleCounts(rule)} categories=[${rule.categories.join(', ')}]`);
        }
    }
    const neverFired = ruleQuality.neverFiredRuleIds;
    if (neverFired.length > 0) {
        say(`\n Never-fired rules (${neverFired.length}):`);
        // Only the first 20 ids are listed; the remainder is summarized.
        for (const id of neverFired.slice(0, 20)) {
            say(` ${id}`);
        }
        if (neverFired.length > 20) {
            say(` ... and ${neverFired.length - 20} more`);
        }
    }
    // Regression check
    heading('Regression Check');
    if (!regression.passed) {
        say(' FAILED:');
        for (const v of regression.violations) {
            say(` - ${v}`);
        }
    }
    else {
        say(' PASSED');
    }
    say(`\nReport saved to: ${outputPath}`);
    say('Done.\n');
    // Signal failure to the calling process without aborting pending output.
    if (!regression.passed) {
        process.exitCode = 1;
    }
}
|
|
296
|
+
//# sourceMappingURL=eval-harness.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-harness.js","sourceRoot":"","sources":["../../src/eval/eval-harness.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AAClE,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAG1D,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAE1D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAElE,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AAiBvD;;GAEG;AACH,SAAS,aAAa,CAAC,MAAoB;IACzC,OAAO;QACL,IAAI,EAAE,MAAM,CAAC,SAAS;QACtB,OAAO,EAAE,MAAM,CAAC,IAAI;QACpB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,MAAM,EAAE,MAAM,CAAC,MAAM;KACtB,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,SAAS,mBAAmB,CAC1B,MAAiB,EACjB,MAAoB;IAEpB,MAAM,KAAK,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IAEpC,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAChC,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;QACvC,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;QAE5C,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QACpC,MAAM,QAAQ,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QAE5B,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;YAC3C,eAAe,EAAE,QAAQ;YACzB,YAAY,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;YAC3C,UAAU,EAAE,QAAQ,EAAE,UAAU,IAAI,CAAC;YACrC,SAAS;YACT,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,IAAI,EAAE,MAAM,CAAC,IAAI;SAClB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;YAC3C,eAAe,EAAE,KAAK;YACtB,YAAY,EAAE,EAAE;YAChB,UAAU,EAAE,CAAC;YACb,SAAS,EAAE,CAAC;YACZ,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,IAAI,EAAE,MAAM,CAAC,IAAI;SAClB,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,kBAAkB,CAC/B,MAAiB,EACjB,MAAoB;IAEpB,MAAM,KAAK,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IAEpC,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAChC,MAAM,EAAE,OAAO,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,KAAK,CAAC,CAAC;QAC5D,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,
CAAC;QAE5C,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,GAAG,CAAC,CAAC;QAExC,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;YAC3C,eAAe,EAAE,QAAQ;YACzB,YAAY,EAAE,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;YACnD,UAAU,EAAE,OAAO,CAAC,iBAAiB;YACrC,SAAS;YACT,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,IAAI,EAAE,MAAM,CAAC,IAAI;SAClB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;YAC3C,eAAe,EAAE,KAAK;YACtB,YAAY,EAAE,EAAE;YAChB,UAAU,EAAE,CAAC;YACb,SAAS,EAAE,CAAC;YACZ,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,IAAI,EAAE,MAAM,CAAC,IAAI;SAClB,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,gBAAgB,CAAC,cAAsB;IACpD,IAAI,CAAC,UAAU,CAAC,cAAc,CAAC;QAAE,OAAO,IAAI,CAAC;IAE7C,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC,CAAC;QAC/D,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC;YACjC,iBAAiB,EAAE,IAAI;YACvB,mBAAmB,EAAE,IAAI;SAC1B,CAAC,CAAC;QACH,MAAM,MAAM,CAAC,UAAU,EAAE,CAAC;QAC1B,OAAO,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC;IAC9C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,MAAkB;IAO9C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,WAAW,CAAC;IAC5C,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC;IAC5C,MAAM,cAAc,GAAG,MAAM,CAAC,cAAc,IAAI,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,wBAAwB,CAAC,CAAC;IAE7F,+BAA+B;IAC/B,MAAM,WAAW,GAAG,MAAM,CAAC,eAAe,KAAK,KAAK,CAAC;IACrD,IAAI,eAAe,GAA2B,IAAI,CAAC;IACnD,MAAM,SAAS,GAAa,CAAC,aAAa,CAAC,CAAC;IAE5C,IAAI,WAAW,EAAE,CAAC;QAChB,eAAe,GAAG,MAAM,gBAAgB,CAAC,cAAc,CAAC,CAAC;QACzD,IAAI,eAAe,EAAE,CAAC;YACpB,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAED,oBAAoB;IACpB,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,eAAe,EAAE,eAAe,IAAI,SAAS;KAC9C,CAAC,CAAC;IACH,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,SAAS,EAAE,CAAC;IAE3C,IAAI,SAAS,KAAK,CAAC,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,wBAAwB,MAAM,CAAC,QAAQ,EAAE,
CAAC,CAAC;IAC7D,CAAC;IAED,kBAAkB;IAClB,MAAM,OAAO,GAAmB,EAAE,CAAC;IACnC,MAAM,eAAe,GAAG,eAAe,KAAK,IAAI,CAAC;IAEjD,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC5B,MAAM,MAAM,GAAG,eAAe;YAC5B,CAAC,CAAC,MAAM,kBAAkB,CAAC,MAAM,EAAE,MAAM,CAAC;YAC1C,CAAC,CAAC,mBAAmB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,CAAC;IAED,2BAA2B;IAC3B,IAAI,eAAe,EAAE,CAAC;QACpB,MAAM,eAAe,CAAC,OAAO,EAAE,CAAC;IAClC,CAAC;IAED,iBAAiB;IACjB,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;IAC1C,MAAM,UAAU,GAAG,eAAe,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IAC9D,MAAM,WAAW,GAAG,cAAc,EAAE,CAAC;IAErC,2BAA2B;IAC3B,MAAM,aAAa,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACzD,MAAM,WAAW,GAAG,kBAAkB,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IAE/D,uCAAuC;IACvC,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,MAAM,MAAM,GAAG;YACb,MAAM;YACN,UAAU;YACV,WAAW;YACX,WAAW;YACX,SAAS;YACT,MAAM,EAAE,WAAW;YACnB,KAAK,EAAE,SAAS;SACjB,CAAC;QACF,aAAa,CAAC,MAAM,CAAC,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IACpE,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC;AACrE,CAAC;AAED,8EAA8E;AAC9E,kBAAkB;AAClB,8EAA8E;AAE9E,SAAS,aAAa,CAAC,CAAS;IAC9B,OAAO,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;AACpC,CAAC;AAED,SAAS,QAAQ,CAAC,CAAS;IACzB,OAAO,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;AAC7B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC;IACnE,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACrC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,kBAAkB,CAAC,CAAC;IAE1D,OAAO,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;IAElD,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,WAAW,EAAE,GAAG,MAAM,OAAO,CAAC;QAChF,QAAQ;QACR,UAAU;KACX,CAAC,CAAC;IAEH,eAAe;IACf,OAAO,CAAC,GAAG,CAAC,WAAW,WAAW,CAAC,KAAK,aAAa,WAAW,CAAC,OAAO,aAAa,WAAW,CAAC,MAAM,UAAU,CAAC,CAAC;IACnH,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CA
AC,CAAC;IAC7E,OAAO,CAAC,GAAG,CAAC,UAAU,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;IAE/C,kBAAkB;IAClB,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;IACjC,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IACrE,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;IACjE,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvE,OAAO,CAAC,GAAG,CAAC,iBAAiB,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IACrE,OAAO,CAAC,GAAG,CAAC,oBAAoB,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,OAAO,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,OAAO,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,OAAO,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,EAAE,CAAC,CAAC;IAErK,UAAU;IACV,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;IACjC,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEvD,eAAe;IACf,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;IACrC,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACpC,MAAM,MAAM,GAAG,GAAG,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;QAChG,MAAM,GAAG,GAAG,GAAG,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3F,OAAO,CAAC,GAAG,CAAC,KAAK,GAAG,CAAC,QAAQ,YAAY,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,cAAc,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,CAAC,OAAO,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,GAAG,MAAM,GAAG,GAAG,EAAE,CAAC,CAAC;IACrL,CAAC;IAED,iBAAiB;IACjB,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;QACvC
,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,UAAU,YAAY,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,cAAc,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,OAAO,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;IAC5K,CAAC;IAED,iBAAiB;IACjB,IAAI,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,yBAAyB,MAAM,CAAC,aAAa,CAAC,MAAM,OAAO,CAAC,CAAC;QACzE,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,kBAAkB;IAClB,IAAI,MAAM,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,0BAA0B,MAAM,CAAC,cAAc,CAAC,MAAM,OAAO,CAAC,CAAC;QAC3E,KAAK,MAAM,EAAE,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;YACvC,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnE,CAAC;IACH,CAAC;IAED,eAAe;IACf,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;IACtC,OAAO,CAAC,GAAG,CAAC,yBAAyB,WAAW,CAAC,mBAAmB,EAAE,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,kBAAkB,WAAW,CAAC,UAAU,EAAE,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,wBAAwB,WAAW,CAAC,eAAe,EAAE,CAAC,CAAC;IAEnE,IAAI,WAAW,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,kCAAkC,CAAC,CAAC;QAChD,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;YACrD,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,GAAG,CAAC;gBACnC,CAAC,CAAC,aAAa,CAAC,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC;gBAC/C,CAAC,CAAC,KAAK,CAAC;YACV,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,CAAC,MAAM,aAAa,IAAI,CAAC,UAAU,OAAO,IAAI,CAAC,OAAO,OAAO,IAAI,CAAC,OAAO,cAAc,SAAS,YAAY,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACtK,CAAC;IACH,CAAC;IAED,IAAI,WAAW,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,6CAA6C,CAAC,CAAC;QAC3D,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;YACtD,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,CAAC,MAAM,aAAa,IAAI,CAAC,UAAU,OAAO,IAAI,CAAC,OAAO,OAAO,IAAI,CAAC,OAAO,gBAAgB,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjJ,CAAC;IACH,CAAC;IAED,IAA
I,WAAW,CAAC,iBAAiB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7C,OAAO,CAAC,GAAG,CAAC,0BAA0B,WAAW,CAAC,iBAAiB,CAAC,MAAM,IAAI,CAAC,CAAC;QAChF,KAAK,MAAM,EAAE,IAAI,WAAW,CAAC,iBAAiB,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;YAC5D,OAAO,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAC3B,CAAC;QACD,IAAI,WAAW,CAAC,iBAAiB,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAC9C,OAAO,CAAC,GAAG,CAAC,eAAe,WAAW,CAAC,iBAAiB,CAAC,MAAM,GAAG,EAAE,OAAO,CAAC,CAAC;QAC/E,CAAC;IACH,CAAC;IAED,mBAAmB;IACnB,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAC1C,IAAI,UAAU,CAAC,MAAM,EAAE,CAAC;QACtB,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAC1B,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QACzB,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,UAAU,EAAE,CAAC;YACtC,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,sBAAsB,UAAU,EAAE,CAAC,CAAC;IAChD,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IAEvB,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC;QACvB,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAC;IACvB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation framework public API
|
|
3
|
+
* @module agent-threat-rules/eval
|
|
4
|
+
*/
|
|
5
|
+
export { EVAL_CORPUS, getAttackSamples, getBenignSamples, getSamplesByCategory, getSamplesByDifficulty, getCorpusStats } from './corpus.js';
|
|
6
|
+
export type { CorpusSample } from './corpus.js';
|
|
7
|
+
export { computeEvalReport, checkRegression } from './metrics.js';
|
|
8
|
+
export type { SampleResult, ConfusionMatrix, ClassMetrics, LatencyStats, CategoryBreakdown, DifficultyBreakdown, EvalReport, RegressionCheck, BaselineThresholds, } from './metrics.js';
|
|
9
|
+
export { runEval, runEvalCLI } from './eval-harness.js';
|
|
10
|
+
export type { EvalConfig } from './eval-harness.js';
|
|
11
|
+
export { computeRuleQuality } from './rule-metrics.js';
|
|
12
|
+
export type { RuleQuality, RuleQualityReport } from './rule-metrics.js';
|
|
13
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAC5I,YAAY,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAEhD,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAClE,YAAY,EACV,YAAY,EACZ,eAAe,EACf,YAAY,EACZ,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,UAAU,EACV,eAAe,EACf,kBAAkB,GACnB,MAAM,cAAc,CAAC;AAEtB,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACxD,YAAY,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAEpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,YAAY,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation framework public API
|
|
3
|
+
* @module agent-threat-rules/eval
|
|
4
|
+
*/
|
|
5
|
+
export { EVAL_CORPUS, getAttackSamples, getBenignSamples, getSamplesByCategory, getSamplesByDifficulty, getCorpusStats } from './corpus.js';
|
|
6
|
+
export { computeEvalReport, checkRegression } from './metrics.js';
|
|
7
|
+
export { runEval, runEvalCLI } from './eval-harness.js';
|
|
8
|
+
export { computeRuleQuality } from './rule-metrics.js';
|
|
9
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAG5I,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAalE,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAGxD,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Metrics -- computes precision, recall, F1, confusion matrix,
|
|
3
|
+
* per-category breakdowns, and latency percentiles.
|
|
4
|
+
*
|
|
5
|
+
* All functions are pure (no side effects, no mutation).
|
|
6
|
+
*
|
|
7
|
+
* @module agent-threat-rules/eval/metrics
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Outcome of evaluating one corpus sample.
 *
 * `expectedDetection` / `actualDetection` feed the confusion matrix,
 * `category` and `difficulty` are the grouping keys of the report
 * breakdowns, and `latencyMs` feeds the latency statistics.
 */
export interface SampleResult {
    /** Sample identifier; listed in per-category missed / false-positive id arrays. */
    readonly id: string;
    /** Grouping key for the per-category breakdown. */
    readonly category: string;
    /** Whether the sample should have been flagged (ground truth). */
    readonly expectedDetection: boolean;
    /** Whether the sample was actually flagged. */
    readonly actualDetection: boolean;
    /** Rules that matched the sample (presumably rule ids -- confirm with the harness). */
    readonly matchedRules: ReadonlyArray<string>;
    /** Confidence attached to the detection (scale not visible here -- confirm with the harness). */
    readonly confidence: number;
    /** Time taken to evaluate the sample, in milliseconds. */
    readonly latencyMs: number;
    /** Grouping key for the per-difficulty breakdown. */
    readonly difficulty: string;
    /** Tier label carried through from the harness (not interpreted by the metrics code). */
    readonly tier: string;
}
|
|
20
|
+
/**
 * Binary confusion matrix over a set of sample results, keyed by
 * (expected detection, actual detection).
 */
export interface ConfusionMatrix {
    /** True positives: expected a detection and got one. */
    readonly tp: number;
    /** False positives: no detection expected, but one was raised. */
    readonly fp: number;
    /** True negatives: no detection expected and none raised. */
    readonly tn: number;
    /** False negatives: a detection was expected but none was raised. */
    readonly fn: number;
}
|
|
26
|
+
/**
 * Classification metrics derived from a confusion matrix.
 *
 * Ratios whose denominator is zero fall back to a fixed value rather than
 * NaN (see the individual fields).
 */
export interface ClassMetrics {
    /** tp / (tp + fp); 1 when nothing was flagged. */
    readonly precision: number;
    /** tp / (tp + fn); 1 when there were no expected detections. */
    readonly recall: number;
    /** Harmonic mean of precision and recall; 0 when both are 0. */
    readonly f1: number;
    /** (tp + tn) / sampleCount; 1 when sampleCount is 0. */
    readonly accuracy: number;
    /** fp / (fp + tn); 0 when there were no expected negatives. */
    readonly fpRate: number;
    /** The confusion matrix the ratios were computed from. */
    readonly confusion: ConfusionMatrix;
    /** Number of samples the metrics cover. */
    readonly sampleCount: number;
}
|
|
35
|
+
/**
 * Latency summary over a result set. Values are in the same unit as
 * `SampleResult.latencyMs` (milliseconds); every field is 0 for an empty
 * result set.
 */
export interface LatencyStats {
    /** 50th-percentile (median) latency. */
    readonly p50: number;
    /** 95th-percentile latency. */
    readonly p95: number;
    /** 99th-percentile latency. */
    readonly p99: number;
    /** Arithmetic mean latency. */
    readonly mean: number;
    /** Largest observed latency. */
    readonly max: number;
}
|
|
42
|
+
/**
 * Metrics restricted to the samples of a single category, plus the ids of
 * the samples that went wrong in that category.
 */
export interface CategoryBreakdown {
    /** The category these figures describe. */
    readonly category: string;
    /** Metrics computed over this category's samples only. */
    readonly metrics: ClassMetrics;
    /** Ids of samples in this category that should have been detected but were not. */
    readonly missedSamples: ReadonlyArray<string>;
    /** Ids of samples in this category that were flagged although no detection was expected. */
    readonly falsePositives: ReadonlyArray<string>;
}
|
|
48
|
+
/** Metrics restricted to the samples of a single difficulty level. */
export interface DifficultyBreakdown {
    /** The difficulty label these figures describe. */
    readonly difficulty: string;
    /** Metrics computed over this difficulty's samples only. */
    readonly metrics: ClassMetrics;
}
|
|
52
|
+
/**
 * Full evaluation report: overall metrics, latency summary, per-category and
 * per-difficulty breakdowns, and the individual failing samples.
 */
export interface EvalReport {
    /** ISO-8601 time at which the report was generated. */
    readonly timestamp: string;
    /** Number of sample results the report was built from. */
    readonly corpusSize: number;
    /** Metrics over every sample. */
    readonly overall: ClassMetrics;
    /** Latency summary over every sample. */
    readonly latency: LatencyStats;
    /** One entry per distinct category, in order of first appearance. */
    readonly byCategory: ReadonlyArray<CategoryBreakdown>;
    /** One entry per distinct difficulty, in order of first appearance. */
    readonly byDifficulty: ReadonlyArray<DifficultyBreakdown>;
    /** Samples that should have been detected but were not. */
    readonly missedAttacks: ReadonlyArray<SampleResult>;
    /** Samples that were flagged although no detection was expected. */
    readonly falsePositives: ReadonlyArray<SampleResult>;
}
|
|
62
|
+
/**
 * Builds an evaluation report from per-sample results: overall metrics,
 * latency statistics, per-category and per-difficulty breakdowns, and the
 * lists of missed attacks and false positives.
 *
 * @param results - Per-sample evaluation outcomes.
 * @returns The assembled report, timestamped at the moment of the call.
 */
export declare function computeEvalReport(results: readonly SampleResult[]): EvalReport;
|
|
63
|
+
/** Result of comparing an evaluation report against baseline thresholds. */
export interface RegressionCheck {
    /** True when no threshold was violated. */
    readonly passed: boolean;
    /** One human-readable message per violated threshold; empty when `passed` is true. */
    readonly violations: ReadonlyArray<string>;
}
|
|
67
|
+
/**
 * Limits an evaluation report must respect to pass the regression check.
 * Rates are fractions in the 0..1 range (they are rendered as percentages in
 * violation messages).
 */
export interface BaselineThresholds {
    /** Overall recall below this value is a violation. */
    readonly minRecall: number;
    /** Overall false-positive rate above this value is a violation. */
    readonly maxFpRate: number;
    /** Overall F1 below this value is a violation. */
    readonly minF1: number;
    /** P95 latency above this value, in milliseconds, is a violation. */
    readonly maxP95LatencyMs: number;
}
|
|
73
|
+
/**
 * Compares a report's overall recall, false-positive rate, F1 and P95 latency
 * against baseline thresholds.
 *
 * @param report - The evaluation report to check.
 * @param thresholds - Limits to enforce; built-in defaults are used when omitted.
 * @returns Whether the report passed, plus one message per violated threshold.
 */
export declare function checkRegression(report: EvalReport, thresholds?: BaselineThresholds): RegressionCheck;
|
|
74
|
+
//# sourceMappingURL=metrics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metrics.d.ts","sourceRoot":"","sources":["../../src/eval/metrics.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IACpC,QAAQ,CAAC,eAAe,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,YAAY,EAAE,SAAS,MAAM,EAAE,CAAC;IACzC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,eAAe,CAAC;IACpC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC;IAC/B,QAAQ,CAAC,aAAa,EAAE,SAAS,MAAM,EAAE,CAAC;IAC1C,QAAQ,CAAC,cAAc,EAAE,SAAS,MAAM,EAAE,CAAC;CAC5C;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC;CAChC;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC;IAC/B,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC;IAC/B,QAAQ,CAAC,UAAU,EAAE,SAAS,iBAAiB,EAAE,CAAC;IAClD,QAAQ,CAAC,YAAY,EAAE,SAAS,mBAAmB,EAAE,CAAC;IACtD,QAAQ,CAAC,aAAa,EAAE,SAAS,YAAY,EAAE,CAAC;IAChD,QAAQ,CAAC,cAAc,EAAE,SAAS,YAAY,EAAE,CAAC;CAClD;AAkDD,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,SAAS,YAAY,EAAE,GAAG,UAAU,CAsC9E;AAMD,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,UAAU,EAAE,SAAS,MAAM,EAAE,CAAC;CACxC;AAED,MAAM,WAAW,kBAAkB;IACjC,Q
AAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;CAClC;AASD,wBAAgB,eAAe,CAC7B,MAAM,EAAE,UAAU,EAClB,UAAU,GAAE,kBAAuC,GAClD,eAAe,CA4BjB"}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Metrics -- computes precision, recall, F1, confusion matrix,
|
|
3
|
+
* per-category breakdowns, and latency percentiles.
|
|
4
|
+
*
|
|
5
|
+
* All functions are pure (no side effects, no mutation).
|
|
6
|
+
*
|
|
7
|
+
* @module agent-threat-rules/eval/metrics
|
|
8
|
+
*/
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
// Core metric calculations
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
function buildConfusionMatrix(results) {
|
|
13
|
+
let tp = 0, fp = 0, tn = 0, fn = 0;
|
|
14
|
+
for (const r of results) {
|
|
15
|
+
if (r.expectedDetection && r.actualDetection)
|
|
16
|
+
tp++;
|
|
17
|
+
else if (!r.expectedDetection && r.actualDetection)
|
|
18
|
+
fp++;
|
|
19
|
+
else if (!r.expectedDetection && !r.actualDetection)
|
|
20
|
+
tn++;
|
|
21
|
+
else
|
|
22
|
+
fn++;
|
|
23
|
+
}
|
|
24
|
+
return { tp, fp, tn, fn };
|
|
25
|
+
}
|
|
26
|
+
/**
 * Derives precision, recall, F1, accuracy and false-positive rate from a
 * confusion matrix.
 *
 * Every ratio with a non-positive denominator falls back to a fixed value
 * instead of producing NaN: precision, recall and accuracy fall back to 1,
 * F1 and the false-positive rate fall back to 0.
 *
 * @param cm - Confusion matrix `{ tp, fp, tn, fn }`.
 * @param sampleCount - Number of samples the matrix covers (accuracy denominator).
 * @returns The metrics, with `cm` attached as `confusion`.
 */
function computeClassMetrics(cm, sampleCount) {
    // Division that yields `fallback` when the denominator is not positive.
    const ratioOr = (numerator, denominator, fallback) => {
        if (denominator > 0) {
            return numerator / denominator;
        }
        return fallback;
    };
    const { tp, fp, tn, fn } = cm;
    const precision = ratioOr(tp, tp + fp, 1);
    const recall = ratioOr(tp, tp + fn, 1);
    const f1 = ratioOr(2 * precision * recall, precision + recall, 0);
    const accuracy = ratioOr(tp + tn, sampleCount, 1);
    const fpRate = ratioOr(fp, fp + tn, 0);
    return { precision, recall, f1, accuracy, fpRate, confusion: cm, sampleCount };
}
|
|
34
|
+
function computeLatency(results) {
|
|
35
|
+
if (results.length === 0) {
|
|
36
|
+
return { p50: 0, p95: 0, p99: 0, mean: 0, max: 0 };
|
|
37
|
+
}
|
|
38
|
+
const sorted = [...results].map((r) => r.latencyMs).sort((a, b) => a - b);
|
|
39
|
+
const len = sorted.length;
|
|
40
|
+
return {
|
|
41
|
+
p50: sorted[Math.floor(len * 0.5)] ?? 0,
|
|
42
|
+
p95: sorted[Math.floor(len * 0.95)] ?? 0,
|
|
43
|
+
p99: sorted[Math.floor(len * 0.99)] ?? 0,
|
|
44
|
+
mean: sorted.reduce((a, b) => a + b, 0) / len,
|
|
45
|
+
max: sorted[len - 1] ?? 0,
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
// Report generation
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
/**
 * Assembles the full evaluation report for a set of sample results.
 *
 * The report holds overall metrics, latency statistics, one breakdown per
 * distinct category and per distinct difficulty (both in order of first
 * appearance), and the individual missed attacks and false positives.
 *
 * @param results - Per-sample results; not mutated.
 * @returns The report, timestamped with the current time in ISO-8601 form.
 */
export function computeEvalReport(results) {
    // Metrics for an arbitrary subset of the results.
    const summarize = (subset) => computeClassMetrics(buildConfusionMatrix(subset), subset.length);
    // A detection was expected but none was raised.
    const wasMissed = (r) => r.expectedDetection && !r.actualDetection;
    // A detection was raised although none was expected.
    const wasFalseAlarm = (r) => !r.expectedDetection && r.actualDetection;
    const idOf = (r) => r.id;

    const byCategory = [];
    for (const category of new Set(results.map((r) => r.category))) {
        const inCategory = results.filter((r) => r.category === category);
        byCategory.push({
            category,
            metrics: summarize(inCategory),
            missedSamples: inCategory.filter(wasMissed).map(idOf),
            falsePositives: inCategory.filter(wasFalseAlarm).map(idOf),
        });
    }

    const byDifficulty = [];
    for (const difficulty of new Set(results.map((r) => r.difficulty))) {
        const atDifficulty = results.filter((r) => r.difficulty === difficulty);
        byDifficulty.push({ difficulty, metrics: summarize(atDifficulty) });
    }

    return {
        timestamp: new Date().toISOString(),
        corpusSize: results.length,
        overall: summarize(results),
        latency: computeLatency(results),
        byCategory,
        byDifficulty,
        missedAttacks: results.filter(wasMissed),
        falsePositives: results.filter(wasFalseAlarm),
    };
}
|
|
86
|
+
/** Baseline limits used by `checkRegression` when the caller passes none. */
const DEFAULT_THRESHOLDS = {
    minRecall: 0.60,
    maxFpRate: 0.05,
    minF1: 0.70,
    maxP95LatencyMs: 50,
};
/**
 * Checks a report's overall recall, false-positive rate, F1 and P95 latency
 * against baseline thresholds.
 *
 * A metric that exactly equals its threshold passes; only strictly worse
 * values are reported.
 *
 * @param report - Evaluation report; `overall.recall`, `overall.fpRate`, `overall.f1` and `latency.p95` are read.
 * @param thresholds - Limits to enforce; defaults to `DEFAULT_THRESHOLDS`.
 * @returns `{ passed, violations }` with one message per violated threshold, in the order recall, FP rate, F1, latency.
 */
export function checkRegression(report, thresholds = DEFAULT_THRESHOLDS) {
    // Renders a 0..1 fraction as a percentage with a fixed number of decimals.
    const asPercent = (fraction, digits) => (fraction * 100).toFixed(digits);
    // Each rule pairs a violation test with the message to emit; messages are
    // only rendered for rules that actually fail.
    const rules = [
        {
            violated: () => report.overall.recall < thresholds.minRecall,
            message: () => `Recall ${asPercent(report.overall.recall, 1)}% < minimum ${asPercent(thresholds.minRecall, 1)}%`,
        },
        {
            violated: () => report.overall.fpRate > thresholds.maxFpRate,
            message: () => `FP rate ${asPercent(report.overall.fpRate, 3)}% > maximum ${asPercent(thresholds.maxFpRate, 3)}%`,
        },
        {
            violated: () => report.overall.f1 < thresholds.minF1,
            message: () => `F1 ${asPercent(report.overall.f1, 1)}% < minimum ${asPercent(thresholds.minF1, 1)}%`,
        },
        {
            violated: () => report.latency.p95 > thresholds.maxP95LatencyMs,
            message: () => `P95 latency ${report.latency.p95.toFixed(1)}ms > maximum ${thresholds.maxP95LatencyMs}ms`,
        },
    ];
    const violations = rules.filter((rule) => rule.violated()).map((rule) => rule.message());
    return { passed: violations.length === 0, violations };
}
|
|
108
|
+
//# sourceMappingURL=metrics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metrics.js","sourceRoot":"","sources":["../../src/eval/metrics.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AA8DH,8EAA8E;AAC9E,2BAA2B;AAC3B,8EAA8E;AAE9E,SAAS,oBAAoB,CAAC,OAAgC;IAC5D,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;IAEnC,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,eAAe;YAAE,EAAE,EAAE,CAAC;aAC9C,IAAI,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,eAAe;YAAE,EAAE,EAAE,CAAC;aACpD,IAAI,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,eAAe;YAAE,EAAE,EAAE,CAAC;;YACrD,EAAE,EAAE,CAAC;IACZ,CAAC;IAED,OAAO,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,CAAC;AAC5B,CAAC;AAED,SAAS,mBAAmB,CAAC,EAAmB,EAAE,WAAmB;IACnE,MAAM,SAAS,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAClE,MAAM,MAAM,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/D,MAAM,EAAE,GAAG,SAAS,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,SAAS,GAAG,MAAM,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACtF,MAAM,QAAQ,GAAG,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IACrE,MAAM,MAAM,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAE/D,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,SAAS,EAAE,EAAE,EAAE,WAAW,EAAE,CAAC;AACjF,CAAC;AAED,SAAS,cAAc,CAAC,OAAgC;IACtD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC;IACrD,CAAC;IAED,MAAM,MAAM,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1E,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM
,CAAC;IAE1B,OAAO;QACL,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC;QACvC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC;QACxC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC;QACxC,IAAI,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,GAAG;QAC7C,GAAG,EAAE,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC;KAC1B,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,oBAAoB;AACpB,8EAA8E;AAE9E,MAAM,UAAU,iBAAiB,CAAC,OAAgC;IAChE,MAAM,SAAS,GAAG,oBAAoB,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,mBAAmB,CAAC,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;IAC/D,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IAExC,cAAc;IACd,MAAM,UAAU,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;IAChE,MAAM,UAAU,GAAwB,UAAU,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;QAC7D,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,GAAG,CAAC,CAAC;QAC7D,MAAM,EAAE,GAAG,oBAAoB,CAAC,UAAU,CAAC,CAAC;QAC5C,MAAM,OAAO,GAAG,mBAAmB,CAAC,EAAE,EAAE,UAAU,CAAC,MAAM,CAAC,CAAC;QAC3D,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACpG,MAAM,GAAG,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACjG,OAAO,EAAE,QAAQ,EAAE,GAAG,EAAE,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,cAAc,EAAE,GAAG,EAAE,CAAC;IAChF,CAAC,CAAC,CAAC;IAEH,gBAAgB;IAChB,MAAM,YAAY,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;IACpE,MAAM,YAAY,GAA0B,YAAY,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACpE,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,IAAI,CAAC,CAAC;QACjE,MAAM,EAAE,GAAG,oBAAoB,CAAC,WAAW,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,mBAAmB,CAAC,EAAE,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC;QAC5D,OAAO,EAAE,UAA
U,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC;IACvF,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,CAAC,eAAe,CAAC,CAAC;IAExF,OAAO;QACL,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,UAAU,EAAE,OAAO,CAAC,MAAM;QAC1B,OAAO;QACP,OAAO;QACP,UAAU;QACV,YAAY;QACZ,aAAa;QACb,cAAc;KACf,CAAC;AACJ,CAAC;AAkBD,MAAM,kBAAkB,GAAuB;IAC7C,SAAS,EAAE,IAAI;IACf,SAAS,EAAE,IAAI;IACf,KAAK,EAAE,IAAI;IACX,eAAe,EAAE,EAAE;CACpB,CAAC;AAEF,MAAM,UAAU,eAAe,CAC7B,MAAkB,EAClB,aAAiC,kBAAkB;IAEnD,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,UAAU,CAAC,SAAS,EAAE,CAAC;QACjD,UAAU,CAAC,IAAI,CACb,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,UAAU,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAC5G,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,UAAU,CAAC,SAAS,EAAE,CAAC;QACjD,UAAU,CAAC,IAAI,CACb,WAAW,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,UAAU,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAC7G,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,EAAE,GAAG,UAAU,CAAC,KAAK,EAAE,CAAC;QACzC,UAAU,CAAC,IAAI,CACb,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,UAAU,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAChG,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,GAAG,UAAU,CAAC,eAAe,EAAE,CAAC;QACpD,UAAU,CAAC,IAAI,CACb,eAAe,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,gBAAgB,UAAU,CAAC,eAAe,IAAI,CAC3F,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;AACzD,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PINT Benchmark Corpus Loader
|
|
3
|
+
*
|
|
4
|
+
* Reads the PINT-format dataset (JSON with text/category/label/source/language)
|
|
5
|
+
* built from publicly available prompt injection datasets:
|
|
6
|
+
* - deepset/prompt-injections (HuggingFace)
|
|
7
|
+
* - Lakera/gandalf_ignore_instructions (HuggingFace)
|
|
8
|
+
*
|
|
9
|
+
* Converts each sample into the CorpusSample interface used by the ATR eval
|
|
10
|
+
* harness, allowing the PINT corpus to be evaluated alongside or instead of
|
|
11
|
+
* the built-in hand-crafted corpus.
|
|
12
|
+
*
|
|
13
|
+
* @module agent-threat-rules/eval/pint-corpus
|
|
14
|
+
*/
|
|
15
|
+
import type { CorpusSample } from './corpus.js';
|
|
16
|
+
/**
 * Reads a PINT benchmark corpus JSON file from disk and returns its samples
 * as CorpusSample entries.
 *
 * @param dataPath - Absolute path to pint-corpus.json
 * @returns Read-only array of CorpusSample, suitable for passing to runEval()
 */
export declare function loadPintCorpus(dataPath: string): ReadonlyArray<CorpusSample>;
|
|
23
|
+
/**
 * Summarises a loaded PINT corpus.
 *
 * @param corpus - Samples to summarise (for example the result of loadPintCorpus()).
 * @returns Sample totals plus per-category, per-difficulty and per-language
 *          tallies keyed by label. NOTE(review): `attacks` / `benign` are
 *          presumably the counts of attack and benign samples -- the
 *          implementation is not visible here, confirm against it.
 */
export declare function getPintCorpusStats(corpus: ReadonlyArray<CorpusSample>): Readonly<{
    total: number;
    attacks: number;
    benign: number;
    byCategory: Readonly<Record<string, number>>;
    byDifficulty: Readonly<Record<string, number>>;
    byLanguage: Readonly<Record<string, number>>;
}>;
|
|
34
|
+
//# sourceMappingURL=pint-corpus.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pint-corpus.d.ts","sourceRoot":"","sources":["../../src/eval/pint-corpus.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAGH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAsDhD;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,SAAS,YAAY,EAAE,CAqCxE;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,SAAS,YAAY,EAAE,GAAG;IACnE,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,UAAU,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACtD,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACxD,QAAQ,CAAC,UAAU,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;CACvD,CAoBA"}
|