veto-sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. package/README.md +209 -0
  2. package/dist/benchmark/cli.d.ts +22 -0
  3. package/dist/benchmark/cli.d.ts.map +1 -0
  4. package/dist/benchmark/cli.js +238 -0
  5. package/dist/benchmark/cli.js.map +1 -0
  6. package/dist/benchmark/index.d.ts +10 -0
  7. package/dist/benchmark/index.d.ts.map +1 -0
  8. package/dist/benchmark/index.js +10 -0
  9. package/dist/benchmark/index.js.map +1 -0
  10. package/dist/benchmark/loader.d.ts +19 -0
  11. package/dist/benchmark/loader.d.ts.map +1 -0
  12. package/dist/benchmark/loader.js +321 -0
  13. package/dist/benchmark/loader.js.map +1 -0
  14. package/dist/benchmark/metrics.d.ts +35 -0
  15. package/dist/benchmark/metrics.d.ts.map +1 -0
  16. package/dist/benchmark/metrics.js +195 -0
  17. package/dist/benchmark/metrics.js.map +1 -0
  18. package/dist/benchmark/runner.d.ts +39 -0
  19. package/dist/benchmark/runner.d.ts.map +1 -0
  20. package/dist/benchmark/runner.js +279 -0
  21. package/dist/benchmark/runner.js.map +1 -0
  22. package/dist/benchmark/types.d.ts +188 -0
  23. package/dist/benchmark/types.d.ts.map +1 -0
  24. package/dist/benchmark/types.js +24 -0
  25. package/dist/benchmark/types.js.map +1 -0
  26. package/dist/cli/bin.d.ts +8 -0
  27. package/dist/cli/bin.d.ts.map +1 -0
  28. package/dist/cli/bin.js +120 -0
  29. package/dist/cli/bin.js.map +1 -0
  30. package/dist/cli/config.d.ts +126 -0
  31. package/dist/cli/config.d.ts.map +1 -0
  32. package/dist/cli/config.js +137 -0
  33. package/dist/cli/config.js.map +1 -0
  34. package/dist/cli/index.d.ts +9 -0
  35. package/dist/cli/index.d.ts.map +1 -0
  36. package/dist/cli/index.js +9 -0
  37. package/dist/cli/index.js.map +1 -0
  38. package/dist/cli/init.d.ts +64 -0
  39. package/dist/cli/init.d.ts.map +1 -0
  40. package/dist/cli/init.js +160 -0
  41. package/dist/cli/init.js.map +1 -0
  42. package/dist/cli/templates.d.ts +22 -0
  43. package/dist/cli/templates.d.ts.map +1 -0
  44. package/dist/cli/templates.js +132 -0
  45. package/dist/cli/templates.js.map +1 -0
  46. package/dist/core/history.d.ts +104 -0
  47. package/dist/core/history.d.ts.map +1 -0
  48. package/dist/core/history.js +148 -0
  49. package/dist/core/history.js.map +1 -0
  50. package/dist/core/index.d.ts +10 -0
  51. package/dist/core/index.d.ts.map +1 -0
  52. package/dist/core/index.js +10 -0
  53. package/dist/core/index.js.map +1 -0
  54. package/dist/core/interceptor.d.ts +96 -0
  55. package/dist/core/interceptor.d.ts.map +1 -0
  56. package/dist/core/interceptor.js +227 -0
  57. package/dist/core/interceptor.js.map +1 -0
  58. package/dist/core/validator.d.ts +107 -0
  59. package/dist/core/validator.d.ts.map +1 -0
  60. package/dist/core/validator.js +263 -0
  61. package/dist/core/validator.js.map +1 -0
  62. package/dist/core/veto.d.ts +265 -0
  63. package/dist/core/veto.d.ts.map +1 -0
  64. package/dist/core/veto.js +681 -0
  65. package/dist/core/veto.js.map +1 -0
  66. package/dist/index.d.ts +43 -0
  67. package/dist/index.d.ts.map +1 -0
  68. package/dist/index.js +40 -0
  69. package/dist/index.js.map +1 -0
  70. package/dist/kernel/client.d.ts +82 -0
  71. package/dist/kernel/client.d.ts.map +1 -0
  72. package/dist/kernel/client.js +162 -0
  73. package/dist/kernel/client.js.map +1 -0
  74. package/dist/kernel/index.d.ts +9 -0
  75. package/dist/kernel/index.d.ts.map +1 -0
  76. package/dist/kernel/index.js +9 -0
  77. package/dist/kernel/index.js.map +1 -0
  78. package/dist/kernel/prompt.d.ts +27 -0
  79. package/dist/kernel/prompt.d.ts.map +1 -0
  80. package/dist/kernel/prompt.js +127 -0
  81. package/dist/kernel/prompt.js.map +1 -0
  82. package/dist/kernel/types.d.ts +85 -0
  83. package/dist/kernel/types.d.ts.map +1 -0
  84. package/dist/kernel/types.js +52 -0
  85. package/dist/kernel/types.js.map +1 -0
  86. package/dist/providers/adapters.d.ts +167 -0
  87. package/dist/providers/adapters.d.ts.map +1 -0
  88. package/dist/providers/adapters.js +244 -0
  89. package/dist/providers/adapters.js.map +1 -0
  90. package/dist/providers/index.d.ts +11 -0
  91. package/dist/providers/index.d.ts.map +1 -0
  92. package/dist/providers/index.js +11 -0
  93. package/dist/providers/index.js.map +1 -0
  94. package/dist/providers/types.d.ts +92 -0
  95. package/dist/providers/types.d.ts.map +1 -0
  96. package/dist/providers/types.js +10 -0
  97. package/dist/providers/types.js.map +1 -0
  98. package/dist/rules/api-client.d.ts +103 -0
  99. package/dist/rules/api-client.d.ts.map +1 -0
  100. package/dist/rules/api-client.js +241 -0
  101. package/dist/rules/api-client.js.map +1 -0
  102. package/dist/rules/index.d.ts +10 -0
  103. package/dist/rules/index.d.ts.map +1 -0
  104. package/dist/rules/index.js +10 -0
  105. package/dist/rules/index.js.map +1 -0
  106. package/dist/rules/loader.d.ts +116 -0
  107. package/dist/rules/loader.d.ts.map +1 -0
  108. package/dist/rules/loader.js +300 -0
  109. package/dist/rules/loader.js.map +1 -0
  110. package/dist/rules/rule-validator.d.ts +135 -0
  111. package/dist/rules/rule-validator.d.ts.map +1 -0
  112. package/dist/rules/rule-validator.js +239 -0
  113. package/dist/rules/rule-validator.js.map +1 -0
  114. package/dist/rules/types.d.ts +162 -0
  115. package/dist/rules/types.d.ts.map +1 -0
  116. package/dist/rules/types.js +16 -0
  117. package/dist/rules/types.js.map +1 -0
  118. package/dist/types/config.d.ts +171 -0
  119. package/dist/types/config.d.ts.map +1 -0
  120. package/dist/types/config.js +31 -0
  121. package/dist/types/config.js.map +1 -0
  122. package/dist/types/index.d.ts +8 -0
  123. package/dist/types/index.d.ts.map +1 -0
  124. package/dist/types/index.js +8 -0
  125. package/dist/types/index.js.map +1 -0
  126. package/dist/types/tool.d.ts +156 -0
  127. package/dist/types/tool.d.ts.map +1 -0
  128. package/dist/types/tool.js +27 -0
  129. package/dist/types/tool.js.map +1 -0
  130. package/dist/utils/glob.d.ts +21 -0
  131. package/dist/utils/glob.d.ts.map +1 -0
  132. package/dist/utils/glob.js +147 -0
  133. package/dist/utils/glob.js.map +1 -0
  134. package/dist/utils/id.d.ts +28 -0
  135. package/dist/utils/id.d.ts.map +1 -0
  136. package/dist/utils/id.js +43 -0
  137. package/dist/utils/id.js.map +1 -0
  138. package/dist/utils/index.d.ts +9 -0
  139. package/dist/utils/index.d.ts.map +1 -0
  140. package/dist/utils/index.js +9 -0
  141. package/dist/utils/index.js.map +1 -0
  142. package/dist/utils/logger.d.ts +97 -0
  143. package/dist/utils/logger.d.ts.map +1 -0
  144. package/dist/utils/logger.js +153 -0
  145. package/dist/utils/logger.js.map +1 -0
  146. package/package.json +90 -0
@@ -0,0 +1,321 @@
1
+ /**
2
+ * Dataset loader for benchmark samples.
3
+ *
4
+ * Parses JSONL training data files and extracts benchmark samples
5
+ * with tool calls, rules, and expected decisions.
6
+ *
7
+ * @module benchmark/loader
8
+ */
9
+ import { readFileSync, existsSync } from 'node:fs';
10
+ import { basename, dirname } from 'node:path';
11
+ import { glob } from '../utils/glob.js';
12
/**
 * Collect benchmark samples from every JSONL file matched by a glob pattern.
 *
 * Each non-blank line of each file is parsed as JSON and handed to
 * `parseTrainingExample`; lines that fail to parse (or throw while being
 * converted) are skipped, as are lines that do not yield a sample.
 *
 * @param pattern - Glob pattern for JSONL files
 * @param maxSamples - Maximum samples to return (values <= 0 mean "all")
 * @param shuffle - Whether to shuffle the samples before truncating
 * @param seed - Optional seed forwarded to the shuffle
 * @returns The loaded (optionally shuffled and truncated) samples
 * @throws Error when the pattern matches no files or no valid sample is found
 */
export async function loadBenchmarkSamples(pattern, maxSamples = 0, shuffle = false, seed) {
    const matchedPaths = await glob(pattern);
    if (matchedPaths.length === 0) {
        throw new Error(`No files found matching pattern: ${pattern}`);
    }
    const collected = [];
    // Incremented for every line that parses as JSON, even if it yields no sample.
    let nextId = 0;
    for (const path of matchedPaths) {
        if (existsSync(path)) {
            const category = extractCategory(path);
            const text = readFileSync(path, 'utf-8');
            for (const rawLine of text.trim().split('\n')) {
                if (!rawLine.trim()) {
                    continue;
                }
                let parsedSample = null;
                try {
                    const record = JSON.parse(rawLine);
                    const currentId = nextId;
                    nextId += 1;
                    parsedSample = parseTrainingExample(record, currentId, path, category);
                }
                catch {
                    // Malformed line: ignore it and move on.
                    parsedSample = null;
                }
                if (parsedSample) {
                    collected.push(parsedSample);
                }
            }
        }
    }
    if (collected.length === 0) {
        throw new Error('No valid samples found in dataset');
    }
    const ordered = shuffle ? shuffleArray(collected, seed) : collected;
    const needsTruncation = maxSamples > 0 && ordered.length > maxSamples;
    return needsTruncation ? ordered.slice(0, maxSamples) : ordered;
}
58
/**
 * Convert one training example into a benchmark sample.
 *
 * The example must carry at least three messages, including one with role
 * `user` (tool call + rules) and one with role `assistant` (expected verdict).
 *
 * @param example - Parsed JSONL record with a `messages` list
 * @param id - Numeric id used to build the `sample-<id>` identifier
 * @param sourceFile - Path of the file the example came from
 * @param category - Category label attached to the sample
 * @returns The benchmark sample, or `null` when any required part is missing
 */
function parseTrainingExample(example, id, sourceFile, category) {
    const conversation = example.messages;
    if (!conversation || conversation.length < 3) {
        return null;
    }
    const firstWithRole = role => conversation.find(message => message.role === role);
    const userTurn = firstWithRole('user');
    const assistantTurn = firstWithRole('assistant');
    if (!(userTurn && assistantTurn)) {
        return null;
    }
    // The user turn holds the tool call and the rules to evaluate it against.
    const request = parseUserContent(userTurn.content);
    if (!(request.tool && request.rules)) {
        return null;
    }
    // The assistant turn holds the expected decision and weights.
    const expectation = parseExpectedResponse(assistantTurn.content);
    if (!expectation) {
        return null;
    }
    return {
        id: `sample-${id}`,
        tool: request.tool,
        arguments: request.arguments,
        rules: request.rules,
        expectedDecision: expectation.decision,
        expectedPassWeight: expectation.pass_weight,
        expectedBlockWeight: expectation.block_weight,
        sourceFile,
        category,
    };
}
92
/**
 * Split a user message into its `TOOL CALL:` and `RULES:` sections.
 *
 * @param content - Text of the user message
 * @returns `{ tool, arguments, rules }`; `tool` and `rules` stay `null` and
 *   `arguments` stays `{}` when the corresponding section is not found
 */
function parseUserContent(content) {
    const toolCallMatch = content.match(/TOOL CALL:\s*\n([\s\S]*?)(?=\nRULES:|$)/);
    const rulesMatch = content.match(/RULES:\s*\n([\s\S]*?)$/);
    let tool = null;
    let args = {};
    if (toolCallMatch) {
        const section = toolCallMatch[1];
        // Tool name: first non-whitespace token after `tool:`.
        const nameMatch = section.match(/tool:\s*(\S+)/);
        if (nameMatch) {
            tool = nameMatch[1];
        }
        // Everything after the `arguments:` line is YAML-like key/value text.
        const argsMatch = section.match(/arguments:\s*\n([\s\S]*)/);
        if (argsMatch) {
            args = parseYamlLikeArgs(argsMatch[1]);
        }
    }
    const rules = rulesMatch ? parseRulesSection(rulesMatch[1]) : null;
    return { tool, arguments: args, rules };
}
122
/**
 * Parse YAML-like `key: value` argument lines.
 *
 * Only lines indented by exactly two whitespace characters before the key
 * are recognised; every other line is ignored.
 *
 * @param content - Text following the `arguments:` line
 * @returns Object mapping each key to its value as converted by `parseValue`
 */
function parseYamlLikeArgs(content) {
    return content.split('\n').reduce((parsed, row) => {
        const hit = row.match(/^\s{2}(\w+):\s*(.*)$/);
        if (hit) {
            parsed[hit[1]] = parseValue(hit[2].trim());
        }
        return parsed;
    }, {});
}
137
/**
 * Parse a YAML-like scalar or inline array into a JavaScript value.
 *
 * Conversion order: empty quoted string, quoted string, boolean, null
 * (`null` / `~`), number (anything `Number()` accepts), inline array
 * (`[a, b]`), and finally the raw string.
 *
 * Inline arrays are split on top-level commas only: commas inside a quoted
 * item (`["a,b", c]`) or inside a nested array (`[[1, 2], [3]]`) do not
 * separate items. An array containing only whitespace parses as `[]`.
 *
 * @param value - Raw value text (callers pass it already trimmed)
 * @returns string, number, boolean, null, or an array of those
 */
function parseValue(value) {
    // Empty string
    if (value === '""' || value === "''")
        return '';
    // Quoted string. The length check stops a lone `"` or `'` from being
    // read as both the opening and the closing quote.
    if (value.length >= 2 &&
        ((value.startsWith('"') && value.endsWith('"')) ||
            (value.startsWith("'") && value.endsWith("'")))) {
        return value.slice(1, -1);
    }
    // Boolean
    if (value === 'true')
        return true;
    if (value === 'false')
        return false;
    // Null
    if (value === 'null' || value === '~')
        return null;
    // Number
    const num = Number(value);
    if (!isNaN(num) && value !== '')
        return num;
    // Array (inline)
    if (value.startsWith('[') && value.endsWith(']')) {
        const inner = value.slice(1, -1);
        if (inner.trim() === '')
            return [];
        return splitInlineArrayItems(inner).map(item => parseValue(item.trim()));
    }
    // Default to string
    return value;
}
/**
 * Split the inside of an inline array on commas that are neither inside a
 * quoted item nor inside a nested `[...]`.
 *
 * A quote only opens a quoted item when it is the first non-blank character
 * of that item, so apostrophes inside bare words (`don't`) are plain text.
 * If quotes or brackets turn out to be unbalanced, the text is split on
 * every comma instead.
 */
function splitInlineArrayItems(inner) {
    const items = [];
    let current = '';
    let openQuote = ''; // '' means "not inside a quoted item"
    let depth = 0;
    for (const ch of inner) {
        if (openQuote !== '') {
            if (ch === openQuote)
                openQuote = '';
        }
        else if ((ch === '"' || ch === "'") && current.trim() === '') {
            openQuote = ch;
        }
        else if (ch === '[') {
            depth += 1;
        }
        else if (ch === ']') {
            depth -= 1;
        }
        else if (ch === ',' && depth === 0) {
            items.push(current);
            current = '';
            continue;
        }
        current += ch;
    }
    items.push(current);
    if (openQuote !== '' || depth !== 0) {
        // Malformed input: fall back to the plain comma split.
        return inner.split(',');
    }
    return items;
}
171
/**
 * Turn the text of a RULES section into a list of rule objects.
 *
 * The text is cut at every line starting with `- id:`; each non-empty piece
 * is handed to `parseRuleBlock`, and pieces that do not produce a rule are
 * dropped.
 *
 * @param content - Text following the `RULES:` line
 * @returns The rules that parsed successfully, in source order
 */
function parseRulesSection(content) {
    return content
        .split(/^- id:/m)
        .filter(Boolean)
        .map(fragment => parseRuleBlock('- id:' + fragment))
        .filter(rule => Boolean(rule));
}
185
/**
 * Parse one rule block (text starting at a `- id:` line) into a rule object.
 *
 * Recognised layout (indentation is significant):
 * - `- id: <id>` at column 0
 * - two-space fields: `name`, `enabled`, `severity`, `action`, `tools`,
 *   `conditions`, `condition_groups`
 * - under `conditions:`, items introduced by `- field:` (4-6 spaces) followed
 *   by `operator:` / `value:` lines (6-8 spaces)
 * - `tools` given either inline (`tools: [a, b]`) or as `- name` items
 *   (4-6 spaces) under a bare `tools:` header
 *
 * `condition_groups` is recognised as a section header but its contents are
 * not parsed.
 *
 * @param block - Raw text of a single rule
 * @returns The parsed rule, or `null` when `id`, `name`, `enabled`,
 *   `severity` or `action` is missing
 */
function parseRuleBlock(block) {
    const rule = {};
    let currentSection = null;
    let conditions = [];
    let currentCondition = {};
    for (const line of block.split('\n')) {
        // Rule ID
        const idMatch = line.match(/^- id:\s*(.+)$/);
        if (idMatch) {
            rule.id = idMatch[1].trim();
            continue;
        }
        // Two-space fields. The value group is `(.*)`, not `(.+)`: section
        // headers such as `conditions:` have nothing after the colon, and
        // requiring a value meant they never matched, so the section was
        // never entered and its conditions were dropped.
        const fieldMatch = line.match(/^\s{2}(\w+):\s*(.*)$/);
        if (fieldMatch) {
            const key = fieldMatch[1];
            // Trimmed so trailing whitespace cannot turn `true ` into false.
            const value = fieldMatch[2].trim();
            switch (key) {
                case 'name':
                    if (value !== '')
                        rule.name = value;
                    break;
                case 'enabled':
                    // Left undefined when empty so validation below rejects the rule.
                    if (value !== '')
                        rule.enabled = value === 'true';
                    break;
                case 'severity':
                    if (value !== '')
                        rule.severity = value;
                    break;
                case 'action':
                    if (value !== '')
                        rule.action = value;
                    break;
                case 'tools':
                    // Inline array: [tool1, tool2]; `[]` yields an empty list.
                    if (value.startsWith('[')) {
                        const inner = value.endsWith(']') ? value.slice(1, -1) : value.slice(1);
                        rule.tools = inner
                            .split(',')
                            .map(t => t.trim())
                            .filter(t => t !== '');
                    }
                    currentSection = 'tools';
                    break;
                case 'conditions':
                    currentSection = 'conditions';
                    conditions = [];
                    currentCondition = {};
                    break;
                case 'condition_groups':
                    currentSection = 'condition_groups';
                    break;
            }
            continue;
        }
        // Block-style tool list items under a bare `tools:` header
        if (currentSection === 'tools') {
            const toolItemMatch = line.match(/^\s{4,6}-\s+(.+)$/);
            if (toolItemMatch) {
                if (!rule.tools) {
                    rule.tools = [];
                }
                rule.tools.push(toolItemMatch[1].trim());
                continue;
            }
        }
        // Condition fields
        if (currentSection === 'conditions') {
            const condFieldMatch = line.match(/^\s{4,6}- field:\s*(.+)$/);
            if (condFieldMatch) {
                // A new `- field:` starts the next condition; flush the previous one.
                if (Object.keys(currentCondition).length > 0) {
                    conditions.push(currentCondition);
                }
                currentCondition = { field: condFieldMatch[1].trim() };
                continue;
            }
            const operatorMatch = line.match(/^\s{6,8}operator:\s*(.+)$/);
            if (operatorMatch) {
                currentCondition.operator = operatorMatch[1].trim();
                continue;
            }
            const valueMatch = line.match(/^\s{6,8}value:\s*(.+)$/);
            if (valueMatch) {
                currentCondition.value = parseValue(valueMatch[1].trim());
                continue;
            }
        }
    }
    // Add last condition
    if (Object.keys(currentCondition).length > 0) {
        conditions.push(currentCondition);
    }
    if (conditions.length > 0) {
        rule.conditions = conditions;
    }
    // Validate required fields
    if (!rule.id || !rule.name || rule.enabled === undefined || !rule.severity || !rule.action) {
        return null;
    }
    return rule;
}
270
/**
 * Extract the expected verdict from an assistant message.
 *
 * The span from the first `{` to the last `}` is parsed as JSON. The result
 * is accepted only if `pass_weight` and `block_weight` are numbers and
 * `decision` is `'pass'` or `'block'`.
 *
 * @param content - Text of the assistant message (may surround the JSON)
 * @returns The parsed object, or `null` when absent, malformed or invalid
 */
function parseExpectedResponse(content) {
    try {
        const braced = content.match(/\{[\s\S]*\}/);
        if (braced === null) {
            return null;
        }
        const candidate = JSON.parse(braced[0]);
        const weightsAreNumeric = typeof candidate.pass_weight === 'number' &&
            typeof candidate.block_weight === 'number';
        const decisionIsKnown = candidate.decision === 'pass' || candidate.decision === 'block';
        return weightsAreNumeric && decisionIsKnown ? candidate : null;
    }
    catch {
        // Unparseable JSON (or non-string content) counts as "no expectation".
        return null;
    }
}
291
/**
 * Build a category label of the form `<parent directory>/<file name>`.
 *
 * A trailing `.jsonl` extension is removed from the file name; any other
 * extension is kept.
 *
 * @param filePath - Path of the dataset file
 * @returns The `<dir>/<file>` label
 */
function extractCategory(filePath) {
    const parentName = basename(dirname(filePath));
    const stem = basename(filePath, '.jsonl');
    return [parentName, stem].join('/');
}
299
/**
 * Return a shuffled copy of `array` (Fisher-Yates); the input is not modified.
 *
 * @param array - Items to shuffle
 * @param seed - When provided, randomness comes from `seededRandom(seed)`;
 *   otherwise from `Math.random`
 * @returns A new array holding the same items in shuffled order
 */
function shuffleArray(array, seed) {
    const shuffled = Array.from(array);
    const nextRandom = seed === undefined ? Math.random : seededRandom(seed);
    let last = shuffled.length - 1;
    while (last > 0) {
        // Pick a position in [0, last] and swap it into the tail slot.
        const pick = Math.floor(nextRandom() * (last + 1));
        const held = shuffled[last];
        shuffled[last] = shuffled[pick];
        shuffled[pick] = held;
        last -= 1;
    }
    return shuffled;
}
311
/**
 * Create a deterministic pseudo-random generator from a seed.
 *
 * Each call advances the state with
 * `state = (state * 1103515245 + 12345) & 0x7fffffff` and returns
 * `state / 0x7fffffff`.
 *
 * @param seed - Initial state
 * @returns A zero-argument function producing the next value of the sequence
 */
function seededRandom(seed) {
    const MULTIPLIER = 1103515245;
    const INCREMENT = 12345;
    const MASK = 0x7fffffff;
    let current = seed;
    return function next() {
        current = (current * MULTIPLIER + INCREMENT) & MASK;
        return current / MASK;
    };
}
321
+ //# sourceMappingURL=loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/benchmark/loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC9C,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AA8BxC;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAe,EACf,aAAqB,CAAC,EACtB,UAAmB,KAAK,EACxB,IAAa;IAEb,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,CAAC;IAElC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,oCAAoC,OAAO,EAAE,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,IAAI,QAAQ,GAAG,CAAC,CAAC;IAEjB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;YAAE,SAAS;QAEhC,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAC5C,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;QAErE,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAoB,CAAC;gBACpD,MAAM,MAAM,GAAG,oBAAoB,CAAC,OAAO,EAAE,QAAQ,EAAE,EAAE,IAAI,EAAE,QAAQ,CAAC,CAAC;gBAEzE,IAAI,MAAM,EAAE,CAAC;oBACX,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACvB,CAAC;YACH,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,uBAAuB;gBACvB,SAAS;YACX,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;IACvD,CAAC;IAED,uBAAuB;IACvB,IAAI,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,YAAY,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;IAE7D,6BAA6B;IAC7B,IAAI,UAAU,GAAG,CAAC,IAAI,MAAM,CAAC,MAAM,GAAG,UAAU,EAAE,CAAC;QACjD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;IACvC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,oBAAoB,CAC3B,OAAwB,EACxB,EAAU,EACV,UAAkB,EAClB,QAAgB;IAEhB,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,WAAW,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC;IAClE,MAAM,gBAAgB,GAAG,OAAO,
CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,WAAW,CAAC,CAAC;IAE5E,IAAI,CAAC,WAAW,IAAI,CAAC,gBAAgB,EAAE,CAAC;QACtC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,oDAAoD;IACpD,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,gBAAgB,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE/E,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QACpB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,qBAAqB,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;IAEjE,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO;QACL,EAAE,EAAE,UAAU,EAAE,EAAE;QAClB,IAAI;QACJ,SAAS,EAAE,IAAI;QACf,KAAK;QACL,gBAAgB,EAAE,QAAQ,CAAC,QAAQ;QACnC,kBAAkB,EAAE,QAAQ,CAAC,WAAW;QACxC,mBAAmB,EAAE,QAAQ,CAAC,YAAY;QAC1C,UAAU;QACV,QAAQ;KACT,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,OAAe;IAKvC,MAAM,MAAM,GAIR;QACF,IAAI,EAAE,IAAI;QACV,SAAS,EAAE,EAAE;QACb,KAAK,EAAE,IAAI;KACZ,CAAC;IAEF,0CAA0C;IAC1C,MAAM,aAAa,GAAG,OAAO,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;IAC/E,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC;IAE3D,IAAI,aAAa,EAAE,CAAC;QAClB,MAAM,WAAW,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;QAErC,oBAAoB;QACpB,MAAM,aAAa,GAAG,WAAW,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC;QACzD,IAAI,aAAa,EAAE,CAAC;YAClB,MAAM,CAAC,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;QACjC,CAAC;QAED,uCAAuC;QACvC,MAAM,SAAS,GAAG,WAAW,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAChE,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,CAAC,SAAS,GAAG,iBAAiB,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;QACrD,CAAC;IACH,CAAC;IAED,IAAI,UAAU,EAAE,CAAC;QACf,MAAM,CAAC,KAAK,GAAG,iBAAiB,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;IAClD,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,OAAe;IACxC,MAAM,IAAI,GAA4B,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;QACjD,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,CAAC,EAAE,GAAG,EAAE,QAAQ,CAAC,GAAG,KAAK,CAAC;YAChC,IAAI,CAAC,GAAG,CAAC,GAAG,UAAU,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;QAC1C,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,UAAU,CAAC,KAAa;IAC/B,eAAe;IACf,IAAI,KAAK,KAAK,IA
AI,IAAI,KAAK,KAAK,IAAI;QAAE,OAAO,EAAE,CAAC;IAEhD,gBAAgB;IAChB,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;QAC9C,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QACnD,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IAC5B,CAAC;IAED,UAAU;IACV,IAAI,KAAK,KAAK,MAAM;QAAE,OAAO,IAAI,CAAC;IAClC,IAAI,KAAK,KAAK,OAAO;QAAE,OAAO,KAAK,CAAC;IAEpC,OAAO;IACP,IAAI,KAAK,KAAK,MAAM,IAAI,KAAK,KAAK,GAAG;QAAE,OAAO,IAAI,CAAC;IAEnD,SAAS;IACT,MAAM,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;IAC1B,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,KAAK,KAAK,EAAE;QAAE,OAAO,GAAG,CAAC;IAE5C,iBAAiB;IACjB,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QACjD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QACjC,IAAI,KAAK,KAAK,EAAE;YAAE,OAAO,EAAE,CAAC;QAC5B,OAAO,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACzD,CAAC;IAED,oBAAoB;IACpB,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,OAAe;IACxC,MAAM,KAAK,GAAW,EAAE,CAAC;IACzB,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE5D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;QAC/B,MAAM,IAAI,GAAG,cAAc,CAAC,OAAO,GAAG,KAAK,CAAC,CAAC;QAC7C,IAAI,IAAI,EAAE,CAAC;YACT,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,KAAa;IACnC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAChC,MAAM,IAAI,GAAkB,EAAE,CAAC;IAC/B,IAAI,cAAc,GAAuD,IAAI,CAAC;IAC9E,IAAI,UAAU,GAAoB,EAAE,CAAC;IACrC,IAAI,gBAAgB,GAA2B,EAAE,CAAC;IAElD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,UAAU;QACV,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;QAC7C,IAAI,OAAO,EAAE,CAAC;YACZ,IAAI,CAAC,EAAE,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC5B,SAAS;QACX,CAAC;QAED,gBAAgB;QAChB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;QACtD,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,CAAC,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,UAAU,CAAC;YAClC,QAAQ,GAAG,EAAE,CAAC;gBACZ,KAAK,
MAAM;oBACT,IAAI,CAAC,IAAI,GAAG,KAAK,CAAC;oBAClB,MAAM;gBACR,KAAK,SAAS;oBACZ,IAAI,CAAC,OAAO,GAAG,KAAK,KAAK,MAAM,CAAC;oBAChC,MAAM;gBACR,KAAK,UAAU;oBACb,IAAI,CAAC,QAAQ,GAAG,KAAyB,CAAC;oBAC1C,MAAM;gBACR,KAAK,QAAQ;oBACX,IAAI,CAAC,MAAM,GAAG,KAAuB,CAAC;oBACtC,MAAM;gBACR,KAAK,OAAO;oBACV,qCAAqC;oBACrC,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;wBAC1B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;oBAChE,CAAC;oBACD,cAAc,GAAG,OAAO,CAAC;oBACzB,MAAM;gBACR,KAAK,YAAY;oBACf,cAAc,GAAG,YAAY,CAAC;oBAC9B,UAAU,GAAG,EAAE,CAAC;oBAChB,MAAM;gBACR,KAAK,kBAAkB;oBACrB,cAAc,GAAG,kBAAkB,CAAC;oBACpC,MAAM;YACV,CAAC;YACD,SAAS;QACX,CAAC;QAED,mBAAmB;QACnB,IAAI,cAAc,KAAK,YAAY,EAAE,CAAC;YACpC,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;YAC9D,IAAI,cAAc,EAAE,CAAC;gBACnB,IAAI,MAAM,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC7C,UAAU,CAAC,IAAI,CAAC,gBAAiC,CAAC,CAAC;gBACrD,CAAC;gBACD,gBAAgB,GAAG,EAAE,KAAK,EAAE,cAAc,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC;gBACvD,SAAS;YACX,CAAC;YAED,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAC;YAC9D,IAAI,aAAa,EAAE,CAAC;gBAClB,gBAAgB,CAAC,QAAQ,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,EAA+B,CAAC;gBACjF,SAAS;YACX,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC;YACxD,IAAI,UAAU,EAAE,CAAC;gBACf,gBAAgB,CAAC,KAAK,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;gBAC1D,SAAS;YACX,CAAC;QACH,CAAC;IACH,CAAC;IAED,qBAAqB;IACrB,IAAI,MAAM,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7C,UAAU,CAAC,IAAI,CAAC,gBAAiC,CAAC,CAAC;IACrD,CAAC;IAED,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,2BAA2B;IAC3B,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QAC3F,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO,IAAY,CAAC;AACtB,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB,CAAC,OAAe;IAC5C,IAAI,CAAC;QACH,oDAAoD;QACpD,MAAM,SAAS,GA
AG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC/C,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAE5B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;QAExC,IAAI,OAAO,MAAM,CAAC,WAAW,KAAK,QAAQ;YACtC,OAAO,MAAM,CAAC,YAAY,KAAK,QAAQ;YACvC,CAAC,MAAM,CAAC,QAAQ,KAAK,MAAM,IAAI,MAAM,CAAC,QAAQ,KAAK,OAAO,CAAC,EAAE,CAAC;YAChE,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,MAA0B,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,QAAgB;IACvC,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;IACxC,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC1C,OAAO,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CAAI,KAAU,EAAE,IAAa;IAChD,MAAM,MAAM,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;IAC1B,IAAI,MAAM,GAAG,IAAI,KAAK,SAAS,CAAC,CAAC,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;IAEnE,KAAK,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACzC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAClD,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CAAC,IAAY;IAChC,IAAI,KAAK,GAAG,IAAI,CAAC;IACjB,OAAO,GAAG,EAAE;QACV,KAAK,GAAG,CAAC,KAAK,GAAG,UAAU,GAAG,KAAK,CAAC,GAAG,UAAU,CAAC;QAClD,OAAO,KAAK,GAAG,UAAU,CAAC;IAC5B,CAAC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Metrics calculation for benchmark results.
3
+ *
4
+ * @module benchmark/metrics
5
+ */
6
+ import type { BenchmarkResult, ConfusionMatrix, LatencyStats, ClassificationMetrics, CategoryMetrics } from './types.js';
7
+ /**
8
+ * Calculate confusion matrix from benchmark results.
9
+ * Treats "block" as the positive class; results with a truthy `error` are not counted.
10
+ */
11
+ export declare function calculateConfusionMatrix(results: BenchmarkResult[]): ConfusionMatrix;
12
+ /**
13
+ * Calculate classification metrics (accuracy, precision, recall, F1, false positive/negative rate, MCC) from a confusion matrix; each ratio is 0 when its denominator is 0.
14
+ */
15
+ export declare function calculateClassificationMetrics(cm: ConfusionMatrix): ClassificationMetrics;
16
+ /**
17
+ * Calculate latency statistics from benchmark results; results with a truthy `error` are excluded, and every field is 0 when none remain.
18
+ */
19
+ export declare function calculateLatencyStats(results: BenchmarkResult[]): LatencyStats;
20
+ /**
21
+ * Calculate per-category metrics, grouping results by `sample.category` ("unknown" when absent) and sorting the output by category name.
22
+ */
23
+ export declare function calculateCategoryMetrics(results: BenchmarkResult[]): CategoryMetrics[];
24
+ /**
25
+ * Calculate weight calibration metrics over results without an `error`.
26
+ * Measures how well the model's confidence weights match decisions: `weightAccuracy` is the fraction of results whose larger weight agrees with the actual decision (equal weights count as "block").
27
+ */
28
+ export declare function calculateWeightCalibration(results: BenchmarkResult[]): {
29
+ meanPassWeightForPass: number;
30
+ meanPassWeightForBlock: number;
31
+ meanBlockWeightForPass: number;
32
+ meanBlockWeightForBlock: number;
33
+ weightAccuracy: number;
34
+ };
35
+ //# sourceMappingURL=metrics.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metrics.d.ts","sourceRoot":"","sources":["../../src/benchmark/metrics.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EACV,eAAe,EACf,eAAe,EACf,YAAY,EACZ,qBAAqB,EACrB,eAAe,EAChB,MAAM,YAAY,CAAC;AAEpB;;;GAGG;AACH,wBAAgB,wBAAwB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,eAAe,CA0BpF;AAED;;GAEG;AACH,wBAAgB,8BAA8B,CAAC,EAAE,EAAE,eAAe,GAAG,qBAAqB,CAsCzF;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,YAAY,CA0C9E;AAqBD;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,eAAe,EAAE,CA8BtF;AAED;;;GAGG;AACH,wBAAgB,0BAA0B,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG;IACtE,qBAAqB,EAAE,MAAM,CAAC;IAC9B,sBAAsB,EAAE,MAAM,CAAC;IAC/B,sBAAsB,EAAE,MAAM,CAAC;IAC/B,uBAAuB,EAAE,MAAM,CAAC;IAChC,cAAc,EAAE,MAAM,CAAC;CACxB,CAsCA"}
@@ -0,0 +1,195 @@
1
+ /**
2
+ * Metrics calculation for benchmark results.
3
+ *
4
+ * @module benchmark/metrics
5
+ */
6
+ /**
7
+ * Calculate confusion matrix from benchmark results.
8
+ * Treats "block" as the positive class.
9
+ */
10
+ export function calculateConfusionMatrix(results) {
11
+ let tp = 0, tn = 0, fp = 0, fn = 0;
12
+ for (const result of results) {
13
+ if (result.error)
14
+ continue;
15
+ const expected = result.sample.expectedDecision;
16
+ const actual = result.actualDecision;
17
+ if (expected === 'block' && actual === 'block') {
18
+ tp++;
19
+ }
20
+ else if (expected === 'pass' && actual === 'pass') {
21
+ tn++;
22
+ }
23
+ else if (expected === 'pass' && actual === 'block') {
24
+ fp++;
25
+ }
26
+ else if (expected === 'block' && actual === 'pass') {
27
+ fn++;
28
+ }
29
+ }
30
+ return {
31
+ truePositive: tp,
32
+ trueNegative: tn,
33
+ falsePositive: fp,
34
+ falseNegative: fn,
35
+ };
36
+ }
37
/**
 * Derive classification metrics from a confusion matrix ("block" is the
 * positive class).
 *
 * Every metric is a ratio that falls back to 0 when its denominator is not
 * greater than 0.
 *
 * @param cm - Confusion matrix counts
 * @returns accuracy, precision, recall, f1Score, falsePositiveRate,
 *   falseNegativeRate and mcc (Matthews correlation coefficient)
 */
export function calculateClassificationMetrics(cm) {
    const tp = cm.truePositive;
    const tn = cm.trueNegative;
    const fp = cm.falsePositive;
    const fn = cm.falseNegative;
    // Shared "divide, or 0 when there is nothing to divide by" rule.
    const ratio = (numerator, denominator) => (denominator > 0 ? numerator / denominator : 0);
    const accuracy = ratio(tp + tn, tp + tn + fp + fn);
    const precision = ratio(tp, tp + fp);
    const recall = ratio(tp, tp + fn);
    const f1Score = ratio(2 * (precision * recall), precision + recall);
    const falsePositiveRate = ratio(fp, fp + tn);
    const falseNegativeRate = ratio(fn, fn + tp);
    const mcc = ratio((tp * tn) - (fp * fn), Math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)));
    return {
        accuracy,
        precision,
        recall,
        f1Score,
        falsePositiveRate,
        falseNegativeRate,
        mcc,
    };
}
71
/**
 * Summarise the latencies of the results that completed without error.
 *
 * @param results - Benchmark results; entries with a truthy `error` are ignored
 * @returns min, max, mean, median, p95, p99, population standard deviation
 *   and the summed latency (`totalMs`); every field is 0 when no result
 *   qualifies
 */
export function calculateLatencyStats(results) {
    const sorted = [];
    for (const entry of results) {
        if (!entry.error) {
            sorted.push(entry.latencyMs);
        }
    }
    sorted.sort((x, y) => x - y);
    const count = sorted.length;
    if (count === 0) {
        return {
            min: 0,
            max: 0,
            mean: 0,
            median: 0,
            p95: 0,
            p99: 0,
            stdDev: 0,
            totalMs: 0,
        };
    }
    let total = 0;
    for (const latency of sorted) {
        total += latency;
    }
    const mean = total / count;
    // Population variance: mean of squared deviations from the mean.
    let squaredTotal = 0;
    for (const latency of sorted) {
        squaredTotal += Math.pow(latency - mean, 2);
    }
    return {
        min: sorted[0],
        max: sorted[count - 1],
        mean,
        median: percentile(sorted, 50),
        p95: percentile(sorted, 95),
        p99: percentile(sorted, 99),
        stdDev: Math.sqrt(squaredTotal / count),
        totalMs: total,
    };
}
113
/**
 * Read the p-th percentile from an ascending-sorted array, interpolating
 * linearly between the two nearest positions.
 *
 * @param sortedArray - Values sorted in ascending order
 * @param p - Percentile in the range 0-100
 * @returns The percentile value; 0 for an empty array
 */
function percentile(sortedArray, p) {
    const size = sortedArray.length;
    if (size === 0) {
        return 0;
    }
    if (size === 1) {
        return sortedArray[0];
    }
    // Fractional position of the percentile within the index range [0, size - 1].
    const position = (p / 100) * (size - 1);
    const below = Math.floor(position);
    const above = Math.ceil(position);
    if (below === above) {
        return sortedArray[below];
    }
    const weight = position - below;
    return sortedArray[below] * (1 - weight) + sortedArray[above] * weight;
}
130
/**
 * Compute metrics separately for each sample category.
 *
 * Results are grouped by `sample.category` (`'unknown'` when it is null or
 * undefined). `sampleCount` counts every result in the group.
 *
 * @param results - Benchmark results to group
 * @returns One entry per category (category, sampleCount, accuracy,
 *   confusionMatrix, latency), sorted by category name
 */
export function calculateCategoryMetrics(results) {
    const grouped = new Map();
    for (const entry of results) {
        const key = entry.sample.category ?? 'unknown';
        if (!grouped.has(key)) {
            grouped.set(key, []);
        }
        grouped.get(key).push(entry);
    }
    const summaries = Array.from(grouped, ([category, members]) => {
        const confusionMatrix = calculateConfusionMatrix(members);
        const { accuracy } = calculateClassificationMetrics(confusionMatrix);
        const latency = calculateLatencyStats(members);
        return {
            category,
            sampleCount: members.length,
            accuracy,
            confusionMatrix,
            latency,
        };
    });
    summaries.sort((left, right) => left.category.localeCompare(right.category));
    return summaries;
}
159
/**
 * Summarise how the reported pass/block weights relate to the decisions
 * actually made.
 *
 * Results with a truthy `error` are ignored. Mean weights are reported per
 * actual decision, and are 0 for a decision that never occurred.
 * `weightAccuracy` is the share of results whose larger weight matches the
 * actual decision; equal weights are treated as favouring "block".
 *
 * @param results - Benchmark results to analyse
 * @returns The four mean weights and `weightAccuracy`
 */
export function calculateWeightCalibration(results) {
    const valid = [];
    const passed = [];
    const blocked = [];
    for (const entry of results) {
        if (entry.error) {
            continue;
        }
        valid.push(entry);
        if (entry.actualDecision === 'pass') {
            passed.push(entry);
        }
        else if (entry.actualDecision === 'block') {
            blocked.push(entry);
        }
    }
    // Mean of one weight over a group, or 0 for an empty group.
    const average = (group, pick) => group.length > 0
        ? group.reduce((sum, entry) => sum + pick(entry), 0) / group.length
        : 0;
    const passWeightOf = entry => entry.actualPassWeight;
    const blockWeightOf = entry => entry.actualBlockWeight;
    let agreeing = 0;
    for (const entry of valid) {
        const favoured = entry.actualPassWeight > entry.actualBlockWeight ? 'pass' : 'block';
        if (favoured === entry.actualDecision) {
            agreeing += 1;
        }
    }
    return {
        meanPassWeightForPass: average(passed, passWeightOf),
        meanPassWeightForBlock: average(blocked, passWeightOf),
        meanBlockWeightForPass: average(passed, blockWeightOf),
        meanBlockWeightForBlock: average(blocked, blockWeightOf),
        weightAccuracy: valid.length > 0 ? agreeing / valid.length : 0,
    };
}
195
+ //# sourceMappingURL=metrics.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metrics.js","sourceRoot":"","sources":["../../src/benchmark/metrics.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAUH;;;GAGG;AACH,MAAM,UAAU,wBAAwB,CAAC,OAA0B;IACjE,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;IAEnC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,MAAM,CAAC,KAAK;YAAE,SAAS;QAE3B,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC;QAChD,MAAM,MAAM,GAAG,MAAM,CAAC,cAAc,CAAC;QAErC,IAAI,QAAQ,KAAK,OAAO,IAAI,MAAM,KAAK,OAAO,EAAE,CAAC;YAC/C,EAAE,EAAE,CAAC;QACP,CAAC;aAAM,IAAI,QAAQ,KAAK,MAAM,IAAI,MAAM,KAAK,MAAM,EAAE,CAAC;YACpD,EAAE,EAAE,CAAC;QACP,CAAC;aAAM,IAAI,QAAQ,KAAK,MAAM,IAAI,MAAM,KAAK,OAAO,EAAE,CAAC;YACrD,EAAE,EAAE,CAAC;QACP,CAAC;aAAM,IAAI,QAAQ,KAAK,OAAO,IAAI,MAAM,KAAK,MAAM,EAAE,CAAC;YACrD,EAAE,EAAE,CAAC;QACP,CAAC;IACH,CAAC;IAED,OAAO;QACL,YAAY,EAAE,EAAE;QAChB,YAAY,EAAE,EAAE;QAChB,aAAa,EAAE,EAAE;QACjB,aAAa,EAAE,EAAE;KAClB,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,8BAA8B,CAAC,EAAmB;IAChE,MAAM,EAAE,YAAY,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,EAAE,aAAa,EAAE,EAAE,EAAE,aAAa,EAAE,EAAE,EAAE,GAAG,EAAE,CAAC;IACxF,MAAM,KAAK,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;IAEhC,WAAW;IACX,MAAM,QAAQ,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAEnD,8BAA8B;IAC9B,MAAM,SAAS,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAErD,2BAA2B;IAC3B,MAAM,MAAM,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAElD,WAAW;IACX,MAAM,OAAO,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC;QACtC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC;QACjD,CAAC,CAAC,CAAC,CAAC;IAEN,sBAAsB;IACtB,MAAM,iBAAiB,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7D,sBAAsB;IACtB,MAAM,iBAAiB,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAE
7D,mCAAmC;IACnC,MAAM,YAAY,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;IAC3C,MAAM,cAAc,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;IAChF,MAAM,GAAG,GAAG,cAAc,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC;IAEnE,OAAO;QACL,QAAQ;QACR,SAAS;QACT,MAAM;QACN,OAAO;QACP,iBAAiB;QACjB,iBAAiB;QACjB,GAAG;KACJ,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CAAC,OAA0B;IAC9D,MAAM,SAAS,GAAG,OAAO;SACtB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;SACrB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;SACrB,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAEzB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO;YACL,GAAG,EAAE,CAAC;YACN,GAAG,EAAE,CAAC;YACN,IAAI,EAAE,CAAC;YACP,MAAM,EAAE,CAAC;YACT,GAAG,EAAE,CAAC;YACN,GAAG,EAAE,CAAC;YACN,MAAM,EAAE,CAAC;YACT,OAAO,EAAE,CAAC;SACX,CAAC;IACJ,CAAC;IAED,MAAM,GAAG,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;IACzB,MAAM,GAAG,GAAG,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC5C,MAAM,GAAG,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACjD,MAAM,IAAI,GAAG,GAAG,GAAG,SAAS,CAAC,MAAM,CAAC;IACpC,MAAM,MAAM,GAAG,UAAU,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IACzC,MAAM,GAAG,GAAG,UAAU,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IACtC,MAAM,GAAG,GAAG,UAAU,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAEtC,qBAAqB;IACrB,MAAM,YAAY,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAC/D,MAAM,cAAc,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,MAAM,CAAC;IAClF,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IAEzC,OAAO;QACL,GAAG;QACH,GAAG;QACH,IAAI;QACJ,MAAM;QACN,GAAG;QACH,GAAG;QACH,MAAM;QACN,OAAO,EAAE,GAAG;KACb,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,UAAU,CAAC,WAAqB,EAAE,CAAS;IAClD,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACvC,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,WAAW,CAAC,CAA
C,CAAC,CAAC;IAEpD,MAAM,KAAK,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACnD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC/B,MAAM,QAAQ,GAAG,KAAK,GAAG,KAAK,CAAC;IAE/B,IAAI,KAAK,KAAK,KAAK,EAAE,CAAC;QACpB,OAAO,WAAW,CAAC,KAAK,CAAC,CAAC;IAC5B,CAAC;IAED,OAAO,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,GAAG,WAAW,CAAC,KAAK,CAAC,GAAG,QAAQ,CAAC;AAC7E,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,wBAAwB,CAAC,OAA0B;IACjE,4BAA4B;IAC5B,MAAM,UAAU,GAAG,IAAI,GAAG,EAA6B,CAAC;IAExD,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,QAAQ,IAAI,SAAS,CAAC;QACrD,MAAM,QAAQ,GAAG,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QAChD,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACtB,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IACrC,CAAC;IAED,sCAAsC;IACtC,MAAM,eAAe,GAAsB,EAAE,CAAC;IAE9C,KAAK,MAAM,CAAC,QAAQ,EAAE,eAAe,CAAC,IAAI,UAAU,EAAE,CAAC;QACrD,MAAM,eAAe,GAAG,wBAAwB,CAAC,eAAe,CAAC,CAAC;QAClE,MAAM,OAAO,GAAG,8BAA8B,CAAC,eAAe,CAAC,CAAC;QAChE,MAAM,OAAO,GAAG,qBAAqB,CAAC,eAAe,CAAC,CAAC;QAEvD,eAAe,CAAC,IAAI,CAAC;YACnB,QAAQ;YACR,WAAW,EAAE,eAAe,CAAC,MAAM;YACnC,QAAQ,EAAE,OAAO,CAAC,QAAQ;YAC1B,eAAe;YACf,OAAO;SACR,CAAC,CAAC;IACL,CAAC;IAED,wBAAwB;IACxB,OAAO,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;AAC9E,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,0BAA0B,CAAC,OAA0B;IAOnE,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,cAAc,KAAK,MAAM,CAAC,CAAC;IACnF,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,cAAc,KAAK,OAAO,CAAC,CAAC;IAErF,MAAM,qBAAqB,GAAG,aAAa,CAAC,MAAM,GAAG,CAAC;QACpD,CAAC,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,gBAAgB,EAAE,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM;QACtF,CAAC,CAAC,CAAC,CAAC;IAEN,MAAM,sBAAsB,GAAG,aAAa,CAAC,MAAM,GAAG,CAAC;QACrD,CAAC,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,iBAAiB
,EAAE,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM;QACvF,CAAC,CAAC,CAAC,CAAC;IAEN,MAAM,sBAAsB,GAAG,cAAc,CAAC,MAAM,GAAG,CAAC;QACtD,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,gBAAgB,EAAE,CAAC,CAAC,GAAG,cAAc,CAAC,MAAM;QACxF,CAAC,CAAC,CAAC,CAAC;IAEN,MAAM,uBAAuB,GAAG,cAAc,CAAC,MAAM,GAAG,CAAC;QACvD,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,iBAAiB,EAAE,CAAC,CAAC,GAAG,cAAc,CAAC,MAAM;QACzF,CAAC,CAAC,CAAC,CAAC;IAEN,4DAA4D;IAC5D,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IACnD,MAAM,gBAAgB,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE;QAC/C,MAAM,SAAS,GAAG,CAAC,CAAC,gBAAgB,GAAG,CAAC,CAAC,iBAAiB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC;QAC9E,OAAO,SAAS,KAAK,CAAC,CAAC,cAAc,CAAC;IACxC,CAAC,CAAC,CAAC,MAAM,CAAC;IAEV,MAAM,cAAc,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC;QAC5C,CAAC,CAAC,gBAAgB,GAAG,YAAY,CAAC,MAAM;QACxC,CAAC,CAAC,CAAC,CAAC;IAEN,OAAO;QACL,qBAAqB;QACrB,sBAAsB;QACtB,sBAAsB;QACtB,uBAAuB;QACvB,cAAc;KACf,CAAC;AACJ,CAAC"}
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Benchmark runner for evaluating Veto kernel performance.
3
+ *
4
+ * @module benchmark/runner
5
+ */
6
+ import { type Logger } from '../utils/logger.js';
7
+ import type { BenchmarkResult, BenchmarkReport, BenchmarkConfig } from './types.js';
8
+ /**
9
+ * Progress callback for benchmark updates.
+ *
+ * A function of four arguments whose return value is ignored (`void`).
+ *
+ * @param completed - Numeric progress count; presumably the number of items finished so far (TODO confirm in runner.ts).
+ * @param total - Numeric count; presumably the overall number of items `completed` is measured against (TODO confirm).
+ * @param current - A `BenchmarkResult`, or `null` when none is supplied; presumably the most recent result (verify against the caller).
+ * @param eta - Numeric estimate of the remaining time; the unit is not visible in this declaration (TODO confirm).
10
+ */
11
+ export type ProgressCallback = (completed: number, total: number, current: BenchmarkResult | null, eta: number) => void;
12
+ /**
13
+ * Options for running a benchmark.
+ *
+ * This is the type of the single argument accepted by `runBenchmark`.
+ * Only `config` is required; `onProgress` and `logger` may be omitted.
14
+ */
15
+ export interface BenchmarkRunnerOptions {
16
+ /** Benchmark configuration (required). */
17
+ config: BenchmarkConfig;
18
+ /** Optional progress callback; see `ProgressCallback` for its argument list. */
19
+ onProgress?: ProgressCallback;
20
+ /** Optional logger instance; what is used when it is omitted is not visible in this declaration. */
21
+ logger?: Logger;
22
+ }
23
+ /**
24
+ * Run a benchmark evaluation.
+ *
+ * @param options - Runner options: the required `config` plus the optional `onProgress` and `logger`.
+ * @returns A promise that resolves to a `BenchmarkReport`.
25
+ */
26
+ export declare function runBenchmark(options: BenchmarkRunnerOptions): Promise<BenchmarkReport>;
27
+ /**
28
+ * Format a benchmark report for console output.
+ *
+ * @param report - The benchmark report to format.
+ * @returns The formatted report as a single string; the declaration does not show whether anything is printed.
29
+ */
30
+ export declare function formatReportConsole(report: BenchmarkReport): string;
31
+ /**
32
+ * Save benchmark report to JSON file.
+ *
+ * Declared to return `void`, so there is no promise for callers to await.
+ *
+ * @param report - The benchmark report to save.
+ * @param path - Destination given as a string; presumably a filesystem path (TODO confirm how relative paths are resolved).
33
+ */
34
+ export declare function saveReportJson(report: BenchmarkReport, path: string): void;
35
+ /**
36
+ * Create a default progress logger.
+ *
+ * Takes no arguments.
+ *
+ * @returns A `ProgressCallback`, suitable for the `onProgress` option; where its output goes is not visible in this declaration.
37
+ */
38
+ export declare function createProgressLogger(): ProgressCallback;
39
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/benchmark/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,OAAO,EAAgB,KAAK,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAQ/D,OAAO,KAAK,EACV,eAAe,EACf,eAAe,EACf,eAAe,EAChB,MAAM,YAAY,CAAC;AAEpB;;GAEG;AACH,MAAM,MAAM,gBAAgB,GAAG,CAC7B,SAAS,EAAE,MAAM,EACjB,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,eAAe,GAAG,IAAI,EAC/B,GAAG,EAAE,MAAM,KACR,IAAI,CAAC;AAEV;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACrC,8BAA8B;IAC9B,MAAM,EAAE,eAAe,CAAC;IACxB,wBAAwB;IACxB,UAAU,CAAC,EAAE,gBAAgB,CAAC;IAC9B,sBAAsB;IACtB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED;;GAEG;AACH,wBAAsB,YAAY,CAAC,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC,eAAe,CAAC,CA0J5F;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CA8GnE;AAaD;;GAEG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,eAAe,EAAE,IAAI,EAAE,MAAM,GAAG,IAAI,CAE1E;AAED;;GAEG;AACH,wBAAgB,oBAAoB,IAAI,gBAAgB,CAkBvD"}