@plaited/agent-eval-harness 0.5.2 → 0.6.0
This diff shows the content of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- package/README.md +4 -5
- package/bin/cli.ts +0 -2
- package/package.json +1 -1
- package/src/commands/balance.ts +0 -2
- package/src/commands/calibrate.ts +1 -2
- package/src/commands/capture.ts +1 -1
- package/src/commands/summarize.ts +1 -3
- package/src/commands/trials.ts +1 -1
- package/src/commands/validate-refs.ts +1 -2
- package/src/core/core.ts +1 -1
- package/src/core/loading.ts +77 -0
- package/src/core/output.ts +0 -1
- package/src/core.ts +4 -1
- package/src/graders/compare-statistical.ts +187 -0
- package/src/graders/compare-weighted.ts +112 -0
- package/src/graders/tests/compare-graders.spec.ts +293 -0
- package/src/graders.ts +19 -0
- package/src/headless/headless-cli.ts +0 -2
- package/src/headless/headless-session-manager.ts +4 -1
- package/src/pipeline/compare.ts +512 -70
- package/src/pipeline/extract.ts +1 -1
- package/src/pipeline/format.ts +0 -1
- package/src/pipeline/grade.ts +1 -1
- package/src/pipeline/pipeline.ts +2 -1
- package/src/pipeline/pipeline.types.ts +29 -1
- package/src/pipeline/run.ts +5 -3
- package/src/schemas/grader-loader.ts +9 -1
- package/src/schemas/schemas-cli.ts +0 -7
- package/src/schemas/schemas.ts +211 -0
- package/src/schemas.ts +23 -0
package/README.md
CHANGED

@@ -48,7 +48,7 @@ Pre-built schemas are available in `.claude/skills/headless-adapters/schemas/` f
 | `extract <raw> --schema <path>` | Parse raw output into trajectories |
 | `grade <results> --grader <path>` | Apply grader to extracted results |
 | `format <results> --style <style>` | Convert to markdown, csv, or jsonl |
-| `compare <run1> <run2
+| `compare <run1> <run2>...` | Compare runs (aggregate report) |

 ### Examples

@@ -76,9 +76,8 @@ cat prompts.jsonl | \
 bunx @plaited/agent-eval-harness grade -g ./grader.ts | \
 bunx @plaited/agent-eval-harness format -f markdown > report.md

-# Compare
-bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl
-  --grader ./compare-grader.ts -o comparison.jsonl
+# Compare runs (built-in strategies: weighted, statistical, custom)
+bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
 ```

 ## Skills for AI Agents

@@ -117,7 +116,7 @@ CLI tool for capturing agent trajectories, optimized for TypeScript/JavaScript p
 | `extract` | Parse raw output into trajectories |
 | `grade` | Apply grader to extracted results |
 | `format` | Convert to markdown, csv, or jsonl |
-| `compare` | Compare
+| `compare` | Compare runs (aggregate report) |

 **Use cases:**
 - Capturing trajectories for downstream evaluation (Braintrust, custom scorers)
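The old README example above references a user-supplied `./compare-grader.ts` for the custom strategy. For orientation, here is a minimal sketch of what such a module might look like; the per-run fields it reads (`score.score`, `duration`) and the `{ rankings, reasoning }` return shape are inferred from the built-in graders added in this release (see `src/graders/` below), not a confirmed public contract.

```ts
// compare-grader.ts — hypothetical custom comparison strategy (sketch only).
// The per-run fields read here (score.score, duration) are assumptions
// inferred from the built-in graders in this release.
type RunSummary = { score?: { score?: number }; duration?: number }

export const grade = async ({ runs }: { runs: Record<string, RunSummary> }) => {
  // Rank by grader score, breaking ties by latency (faster wins).
  const ranked = Object.entries(runs)
    .map(([label, run]) => ({
      label,
      score: run.score?.score ?? 0,
      duration: run.duration ?? Number.POSITIVE_INFINITY,
    }))
    .sort((a, b) => b.score - a.score || a.duration - b.duration)

  return {
    rankings: ranked.map((r, i) => ({ run: r.label, rank: i + 1, score: r.score })),
    reasoning: 'Ranked by grader score; ties broken by latency',
  }
}
```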
package/bin/cli.ts
CHANGED

@@ -30,7 +30,6 @@ import { schemasCli } from '../src/schemas/schemas-cli.ts'
 const [command, ...args] = Bun.argv.slice(2)

 const printHelp = () => {
-  // biome-ignore lint/suspicious/noConsole: CLI help output
   console.log(`
 agent-eval-harness - CLI tool for agent evaluation

@@ -144,7 +143,6 @@ const main = async () => {
     case '-v':
     case '--version': {
       const { version } = await import('../package.json')
-      // biome-ignore lint/suspicious/noConsole: CLI version output
       console.log(version)
       break
     }
package/package.json
CHANGED
package/src/commands/balance.ts
CHANGED

@@ -171,7 +171,6 @@ export const runBalance = async (config: BalanceConfig): Promise<BalanceAnalysis
   if (outputPath) {
     await Bun.write(resolvePath(outputPath), output)
   } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log(output)
   }

@@ -216,7 +215,6 @@ export const balance = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness balance <prompts.jsonl> [options]

package/src/commands/calibrate.ts
CHANGED

@@ -218,6 +218,7 @@ export const runCalibrate = async (config: CalibrateConfig): Promise<Calibration
       output: result.output,
       hint: result.hint,
       trajectory: result.trajectory,
+      metadata: result.metadata,
     })
   }

@@ -231,7 +232,6 @@ export const runCalibrate = async (config: CalibrateConfig): Promise<Calibration
   if (outputPath) {
     await Bun.write(resolvePath(outputPath), markdown)
   } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log(markdown)
   }

@@ -260,7 +260,6 @@ export const calibrate = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness calibrate <results.jsonl> [options]

package/src/commands/capture.ts
CHANGED

@@ -230,6 +230,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       output,
       hint: promptCase.hint,
       trajectory,
+      metadata: promptCase.metadata,
     })
   }

@@ -309,7 +310,6 @@ export const capture = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]

package/src/commands/summarize.ts
CHANGED

@@ -42,7 +42,7 @@ export const formatSummary = (result: CaptureResult): SummaryResult => {
     id: result.id,
     input: inputText,
     output: result.output,
-    toolCalls: result.trajectory.
+    toolCalls: result.trajectory.flatMap((s) => (s.type === 'tool_call' ? [s.name] : [])),
     duration: result.timing.end - result.timing.start,
   }
 }

@@ -160,7 +160,6 @@ export const runSummarize = async (config: SummarizeConfig): Promise<string> =>
   if (outputPath) {
     await Bun.write(resolvePath(outputPath), output)
   } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log(output)
   }

@@ -188,7 +187,6 @@ export const summarize = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness summarize <results.jsonl> [options]

package/src/commands/trials.ts
CHANGED

@@ -216,6 +216,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
       output,
       hint: promptCase.hint,
       trajectory,
+      metadata: promptCase.metadata,
     })
     entry.pass = graderResult.pass
     entry.score = graderResult.score

@@ -310,7 +311,6 @@ export const trials = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]

package/src/commands/validate-refs.ts
CHANGED

@@ -71,6 +71,7 @@ export const runValidateRefs = async (config: ValidateRefsConfig): Promise<Valid
       output: prompt.reference as string,
       hint: prompt.hint,
       trajectory: [], // No trajectory for reference validation
+      metadata: prompt.metadata,
     })

     results.push({

@@ -91,7 +92,6 @@ export const runValidateRefs = async (config: ValidateRefsConfig): Promise<Valid
   if (outputPath) {
     await Bun.write(resolvePath(outputPath), output)
   } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log(output)
   }

@@ -131,7 +131,6 @@ export const validateRefs = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness validate-refs <prompts.jsonl> --grader <grader.ts> [options]

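Across calibrate, capture, trials, and validate-refs the change is the same: the prompt case's `metadata` is now threaded into the object handed to the grader. A minimal sketch of a per-result grader that could take advantage of it, assuming the grader receives `{ output, hint, trajectory, metadata }` and returns `{ pass, score }` (shapes inferred from these hunks, not a confirmed signature):

```ts
// grader.ts — hypothetical per-result grader (sketch only).
// Input and result shapes are inferred from the fields these commands pass
// (output, hint, trajectory, metadata) and read back (pass, score).
type GraderInput = {
  output: string
  hint?: string
  trajectory: unknown[]
  metadata?: Record<string, unknown>
}

export const grade = async ({ output, hint, metadata }: GraderInput) => {
  // Example: let per-prompt metadata tighten the pass criterion.
  const mustInclude = metadata?.mustInclude
  const required = typeof mustInclude === 'string' ? mustInclude : hint
  const pass = required ? output.includes(required) : output.length > 0
  return { pass, score: pass ? 1 : 0 }
}
```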
package/src/core/core.ts
CHANGED

@@ -11,7 +11,7 @@
  */

 // Loading utilities
-export { loadJsonl, loadPrompts, loadResults } from './loading.ts'
+export { buildResultsIndex, countLines, loadJsonl, loadPrompts, loadResults, streamResults } from './loading.ts'
 // Output utilities
 export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
 // Trajectory utilities
package/src/core/loading.ts
CHANGED

@@ -94,3 +94,80 @@ export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
     }
   })
 }
+
+// ============================================================================
+// Streaming Loading
+// ============================================================================
+
+/**
+ * Stream capture results from a JSONL file.
+ *
+ * @remarks
+ * Memory-efficient alternative to loadResults for large files.
+ * Yields results one at a time using an async generator.
+ *
+ * @param path - Path to the results.jsonl file
+ * @yields Parsed and validated capture results
+ * @throws Error if file cannot be read or any line is invalid
+ *
+ * @public
+ */
+export async function* streamResults(path: string): AsyncGenerator<CaptureResult, void, unknown> {
+  const file = Bun.file(path)
+  const text = await file.text()
+  const lines = text.split('\n')
+
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i]?.trim()
+    if (!line) continue
+
+    try {
+      yield CaptureResultSchema.parse(JSON.parse(line))
+    } catch (error) {
+      throw new Error(`Invalid result at line ${i + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  }
+}
+
+/**
+ * Build an indexed map of results by ID using streaming.
+ *
+ * @remarks
+ * Memory-efficient for the compare command. Loads results into a Map
+ * keyed by ID for O(1) lookups without holding raw file content.
+ *
+ * For very large files (10k+ results), this is more memory-efficient than
+ * loading everything into an array and then building an index.
+ *
+ * @param path - Path to the results.jsonl file
+ * @returns Map of result ID to CaptureResult
+ *
+ * @public
+ */
+export const buildResultsIndex = async (path: string): Promise<Map<string, CaptureResult>> => {
+  const index = new Map<string, CaptureResult>()
+
+  for await (const result of streamResults(path)) {
+    index.set(result.id, result)
+  }
+
+  return index
+}
+
+/**
+ * Count lines in a JSONL file without loading content.
+ *
+ * @remarks
+ * Useful for detecting large files that should use streaming mode.
+ * Uses byte-level scanning for efficiency.
+ *
+ * @param path - Path to the JSONL file
+ * @returns Number of non-empty lines
+ *
+ * @public
+ */
+export const countLines = async (path: string): Promise<number> => {
+  const file = Bun.file(path)
+  const text = await file.text()
+  return text.split('\n').filter((line) => line.trim()).length
+}
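A short usage sketch for the new loading helpers; the relative import path is illustrative (the re-exports added to `src/core.ts` below suggest they are reachable from the core entry), and the 10k threshold mirrors the remark in `buildResultsIndex` rather than any documented cutoff.

```ts
// Sketch: streaming a large results file instead of loading it whole.
// The import specifier is illustrative; the helpers are those added above.
import { buildResultsIndex, countLines, streamResults } from './src/core.ts'

const path = 'results.jsonl'

// Decide whether to stream based on line count.
const total = await countLines(path)

if (total > 10_000) {
  for await (const result of streamResults(path)) {
    console.log(result.id) // handle one CaptureResult at a time
  }
} else {
  // Build an id -> result index for O(1) lookups (as the compare command does).
  const index = await buildResultsIndex(path)
  console.log(index.size)
}
```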
package/src/core/output.ts
CHANGED
package/src/core.ts
CHANGED

@@ -8,6 +8,9 @@
  */

 export {
+  // Loading
+  buildResultsIndex,
+  countLines,
   // Trajectory
   detectTrajectoryRichness,
   extractContent,

@@ -18,11 +21,11 @@ export {
   getInputPreview,
   hasToolErrors,
   headTailPreview,
-  // Loading
   loadJsonl,
   loadPrompts,
   loadResults,
   logProgress,
   resolvePath,
+  streamResults,
   writeOutput,
 } from './core/core.ts'
package/src/graders/compare-statistical.ts
ADDED

@@ -0,0 +1,187 @@
+/**
+ * Built-in statistical significance comparison grader.
+ *
+ * @remarks
+ * Uses bootstrap sampling to compute confidence intervals for score estimates.
+ * Flags when the winner is statistically significant (p<0.05, non-overlapping CIs).
+ *
+ * Bootstrap iterations can be customized via environment variable:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
+ *
+ * @packageDocumentation
+ */
+
+import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
+
+/** Default number of bootstrap iterations */
+const DEFAULT_ITERATIONS = 1000
+
+/**
+ * Bootstrap confidence interval result.
+ */
+type BootstrapResult = {
+  /** Estimated mean from bootstrap */
+  mean: number
+  /** 95% confidence interval [lower, upper] */
+  ci95: [number, number]
+}
+
+/**
+ * Compute bootstrap confidence interval for sample mean.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * @param samples - Array of numeric samples
+ * @param iterations - Number of bootstrap iterations
+ * @returns Bootstrap mean and 95% confidence interval
+ */
+const bootstrap = (samples: number[], iterations: number = DEFAULT_ITERATIONS): BootstrapResult => {
+  if (samples.length === 0) {
+    return { mean: 0, ci95: [0, 0] }
+  }
+
+  if (samples.length === 1) {
+    const value = samples[0] ?? 0
+    return { mean: value, ci95: [value, value] }
+  }
+
+  const means: number[] = []
+
+  for (let i = 0; i < iterations; i++) {
+    // Resample with replacement - we know samples.length > 1 at this point
+    const resampled = Array.from(
+      { length: samples.length },
+      () => samples[Math.floor(Math.random() * samples.length)] as number,
+    )
+
+    // Compute mean of resampled data
+    const sum = resampled.reduce((acc, val) => acc + val, 0)
+    means.push(sum / resampled.length)
+  }
+
+  // Sort means for percentile calculation
+  means.sort((a, b) => a - b)
+
+  // 95% CI: 2.5th and 97.5th percentiles
+  const lowerIdx = Math.floor(iterations * 0.025)
+  const upperIdx = Math.floor(iterations * 0.975)
+
+  return {
+    mean: means[Math.floor(iterations / 2)] ?? 0,
+    ci95: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
+  }
+}
+
+/**
+ * Get bootstrap iterations from environment variable.
+ *
+ * @returns Number of bootstrap iterations
+ */
+const getIterationsFromEnv = (): number => {
+  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+  if (!envValue) return DEFAULT_ITERATIONS
+
+  const parsed = Number.parseInt(envValue, 10)
+  return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+}
+
+/**
+ * Statistical significance comparison grader.
+ *
+ * @remarks
+ * Compares runs using bootstrap sampling to determine if differences
+ * are statistically significant. When confidence intervals don't overlap,
+ * the difference is flagged as significant (p<0.05).
+ *
+ * **Single-sample limitation:** When comparing individual prompts, each run
+ * provides only one score sample. Bootstrap with a single sample yields a
+ * degenerate CI of `[value, value]`. This grader is most useful when:
+ * - Aggregating results across multiple prompts
+ * - Using with the full comparison report (which combines per-prompt comparisons)
+ *
+ * For single-prompt comparisons, consider the weighted grader instead.
+ *
+ * @public
+ */
+export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
+  const iterations = getIterationsFromEnv()
+
+  // Collect scores for each run
+  const runStats = Object.entries(runs).map(([label, run]) => {
+    // Use grader score if available, otherwise 0
+    const score = run.score?.score ?? 0
+
+    // For single-prompt comparison, we only have one sample
+    // In practice, this grader is most useful when aggregating across prompts
+    const stats = bootstrap([score], iterations)
+
+    return { label, score, stats }
+  })
+
+  // Sort by bootstrap mean descending
+  const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+
+  // Check if winner is statistically significant
+  // CIs don't overlap = significant difference (approximately p<0.05)
+  let isSignificant = false
+  const first = sorted[0]
+  const second = sorted[1]
+  if (first && second) {
+    // Non-overlapping: first's lower bound > second's upper bound
+    isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+  }
+
+  const reasoning = isSignificant
+    ? `Winner "${first?.label}" is statistically significant (p<0.05, non-overlapping 95% CIs)`
+    : 'No statistically significant difference between top runs (overlapping 95% CIs)'
+
+  return {
+    rankings: sorted.map((s, i) => ({
+      run: s.label,
+      rank: i + 1,
+      score: s.stats.mean,
+    })),
+    reasoning,
+  }
+}
+
+/**
+ * Create a statistical grader with custom iteration count.
+ *
+ * @param iterations - Number of bootstrap iterations
+ * @returns Comparison grader function
+ *
+ * @public
+ */
+export const createStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): ComparisonGrader => {
+  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
+    const runStats = Object.entries(runs).map(([label, run]) => {
+      const score = run.score?.score ?? 0
+      const stats = bootstrap([score], iterations)
+      return { label, score, stats }
+    })
+
+    const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+
+    let isSignificant = false
+    const first = sorted[0]
+    const second = sorted[1]
+    if (first && second) {
+      isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+    }
+
+    return {
+      rankings: sorted.map((s, i) => ({
+        run: s.label,
+        rank: i + 1,
+        score: s.stats.mean,
+      })),
+      reasoning: isSignificant
+        ? `Winner "${first?.label}" is statistically significant (p<0.05)`
+        : 'No statistically significant difference between top runs',
+    }
+  }
+}
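A usage sketch for the statistical grader: the `runs` payload is hypothetical, since per the code above only `score.score` is consumed and the full `ComparisonGraderInput` type is not shown in this diff (hence the cast). Note that a single score per run yields a degenerate `[v, v]` confidence interval, as the doc comment warns.

```ts
// Sketch: invoking the statistical grader with a custom iteration count.
// The runs payload is hypothetical; only score.score is read by this grader.
import { grade } from './compare-statistical.ts'

process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'

const { rankings, reasoning } = await grade({
  runs: {
    baseline: { score: { score: 0.62 } },
    candidate: { score: { score: 0.81 } },
  },
} as Parameters<typeof grade>[0]) // cast: full ComparisonGraderInput type not shown in this diff

console.log(rankings)  // candidate ranks 1st, baseline 2nd (sorted by bootstrap mean)
console.log(reasoning) // reports whether the 95% CIs overlap
```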
package/src/graders/compare-weighted.ts
ADDED

@@ -0,0 +1,112 @@
+/**
+ * Built-in weighted multi-dimensional comparison grader.
+ *
+ * @remarks
+ * Configurable weights for quality, latency, and reliability.
+ * Default strategy when no `--grader` is specified for the compare command.
+ *
+ * Weights can be customized via environment variables:
+ * - `COMPARE_QUALITY` (default: 0.5)
+ * - `COMPARE_LATENCY` (default: 0.3)
+ * - `COMPARE_RELIABILITY` (default: 0.2)
+ *
+ * @packageDocumentation
+ */
+
+import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
+
+/**
+ * Weight configuration for comparison dimensions.
+ */
+export type Weights = {
+  /** Weight for quality (pass/score) - how much correctness matters */
+  quality: number
+  /** Weight for latency - how much speed matters */
+  latency: number
+  /** Weight for reliability - how much error-free execution matters */
+  reliability: number
+}
+
+/** Default weights: quality=0.5, latency=0.3, reliability=0.2 */
+export const DEFAULT_WEIGHTS: Weights = {
+  quality: 0.5,
+  latency: 0.3,
+  reliability: 0.2,
+}
+
+/**
+ * Read weights from environment variables with fallback to defaults.
+ *
+ * @returns Weights configuration
+ */
+export const getWeightsFromEnv = (): Weights => {
+  const quality = Number.parseFloat(process.env.COMPARE_QUALITY ?? String(DEFAULT_WEIGHTS.quality))
+  const latency = Number.parseFloat(process.env.COMPARE_LATENCY ?? String(DEFAULT_WEIGHTS.latency))
+  const reliability = Number.parseFloat(process.env.COMPARE_RELIABILITY ?? String(DEFAULT_WEIGHTS.reliability))
+
+  return {
+    quality: Number.isNaN(quality) ? DEFAULT_WEIGHTS.quality : quality,
+    latency: Number.isNaN(latency) ? DEFAULT_WEIGHTS.latency : latency,
+    reliability: Number.isNaN(reliability) ? DEFAULT_WEIGHTS.reliability : reliability,
+  }
+}
+
+/**
+ * Create a weighted comparison grader with custom weights.
+ *
+ * @param weights - Weight configuration for comparison dimensions
+ * @returns Comparison grader function
+ *
+ * @public
+ */
+export const createWeightedGrader = (weights: Weights = DEFAULT_WEIGHTS): ComparisonGrader => {
+  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
+    const scores = Object.entries(runs).map(([label, run]) => {
+      // Quality score: use grader score if available, otherwise 0
+      // Note: run.score is only present if the result was graded
+      const qualityScore = run.score?.score ?? 0
+
+      // Latency score: inverse relationship (faster = better)
+      // Normalize: 1 / (1 + duration/1000) gives ~0.5 at 1s, ~0.1 at 10s
+      const duration = run.duration ?? 10000
+      const latencyScore = 1 / (1 + duration / 1000)
+
+      // Reliability score: 1 if no errors, 0 if errors
+      const hasErrors = run.toolErrors ?? false
+      const reliabilityScore = hasErrors ? 0 : 1
+
+      // Weighted combination
+      const weighted =
+        qualityScore * weights.quality + latencyScore * weights.latency + reliabilityScore * weights.reliability
+
+      return { label, weighted, qualityScore, latencyScore, reliabilityScore }
+    })
+
+    // Sort by weighted score descending (highest = best)
+    const sorted = scores.sort((a, b) => b.weighted - a.weighted)
+
+    return {
+      rankings: sorted.map((s, i) => ({
+        run: s.label,
+        rank: i + 1,
+        score: s.weighted,
+      })),
+      reasoning: `Weighted: quality=${weights.quality}, latency=${weights.latency}, reliability=${weights.reliability}`,
+    }
+  }
+}
+
+/**
+ * Default weighted comparison grader using environment or default weights.
+ *
+ * @remarks
+ * This is the default grader used when `--strategy weighted` is specified
+ * or when no strategy is specified for the compare command.
+ *
+ * @public
+ */
+export const grade: ComparisonGrader = async (input: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
+  const weights = getWeightsFromEnv()
+  const grader = createWeightedGrader(weights)
+  return grader(input)
+}
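For reference, a worked sketch of the weighted formula above with the default weights; the `runs` payload is hypothetical and carries only the fields this grader reads (`score.score`, `duration`, `toolErrors`), and the cast covers the full input type not shown in this diff.

```ts
// Sketch: default weights (0.5 / 0.3 / 0.2) applied to two hypothetical runs.
import { createWeightedGrader, DEFAULT_WEIGHTS } from './compare-weighted.ts'

const grader = createWeightedGrader(DEFAULT_WEIGHTS)

// runA: quality 0.8, 2s latency, no tool errors
//   0.8*0.5 + (1 / (1 + 2))*0.3 + 1*0.2 = 0.4 + 0.1 + 0.2 = 0.70
// runB: quality 1.0, 9s latency, tool errors present
//   1.0*0.5 + (1 / (1 + 9))*0.3 + 0*0.2 = 0.5 + 0.03 + 0 = 0.53
const { rankings, reasoning } = await grader({
  runs: {
    runA: { score: { score: 0.8 }, duration: 2000, toolErrors: false },
    runB: { score: { score: 1.0 }, duration: 9000, toolErrors: true },
  },
} as Parameters<typeof grader>[0]) // cast: full ComparisonGraderInput type not shown in this diff

console.log(rankings)  // runA ranks first (~0.70), runB second (~0.53)
console.log(reasoning) // "Weighted: quality=0.5, latency=0.3, reliability=0.2"
```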