@plaited/agent-eval-harness 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -5
- package/bin/cli.ts +0 -2
- package/package.json +1 -1
- package/src/commands/balance.ts +0 -2
- package/src/commands/calibrate.ts +1 -2
- package/src/commands/capture.ts +1 -1
- package/src/commands/summarize.ts +1 -3
- package/src/commands/trials.ts +1 -1
- package/src/commands/validate-refs.ts +1 -2
- package/src/core/core.ts +1 -1
- package/src/core/loading.ts +77 -0
- package/src/core/output.ts +0 -1
- package/src/core.ts +4 -1
- package/src/graders/compare-statistical.ts +187 -0
- package/src/graders/compare-weighted.ts +112 -0
- package/src/graders/tests/compare-graders.spec.ts +293 -0
- package/src/graders.ts +19 -0
- package/src/headless/headless-cli.ts +0 -2
- package/src/headless/headless-session-manager.ts +4 -1
- package/src/pipeline/compare.ts +512 -70
- package/src/pipeline/extract.ts +1 -1
- package/src/pipeline/format.ts +0 -1
- package/src/pipeline/grade.ts +1 -1
- package/src/pipeline/pipeline.ts +2 -1
- package/src/pipeline/pipeline.types.ts +29 -1
- package/src/pipeline/run.ts +5 -3
- package/src/schemas/grader-loader.ts +9 -1
- package/src/schemas/schemas-cli.ts +0 -7
- package/src/schemas/schemas.ts +211 -0
- package/src/schemas.ts +23 -0
package/src/pipeline/extract.ts
CHANGED
|
@@ -91,6 +91,7 @@ const extractFromRaw = (rawOutput: RawOutput, parser: ReturnType<typeof createOu
|
|
|
91
91
|
output: finalOutput,
|
|
92
92
|
trajectory,
|
|
93
93
|
toolErrors: toolErrors || !!rawOutput.error,
|
|
94
|
+
metadata: rawOutput.metadata,
|
|
94
95
|
timing: rawOutput.timing,
|
|
95
96
|
...(rawOutput.error && { error: rawOutput.error }),
|
|
96
97
|
}
|
|
@@ -186,7 +187,6 @@ export const extract = async (args: string[]): Promise<void> => {
|
|
|
186
187
|
})
|
|
187
188
|
|
|
188
189
|
if (values.help) {
|
|
189
|
-
// biome-ignore lint/suspicious/noConsole: CLI help output
|
|
190
190
|
console.log(`
|
|
191
191
|
Usage: agent-eval-harness extract [raw.jsonl] --schema <schema.json> [options]
|
|
192
192
|
|
package/src/pipeline/format.ts
CHANGED
package/src/pipeline/grade.ts
CHANGED
|
@@ -51,6 +51,7 @@ export const runGrade = async (
|
|
|
51
51
|
output: extracted.output,
|
|
52
52
|
hint: extracted.hint,
|
|
53
53
|
trajectory: extracted.trajectory,
|
|
54
|
+
metadata: extracted.metadata,
|
|
54
55
|
})
|
|
55
56
|
|
|
56
57
|
const graded: GradedResult = {
|
|
@@ -110,7 +111,6 @@ export const grade = async (args: string[]): Promise<void> => {
|
|
|
110
111
|
})
|
|
111
112
|
|
|
112
113
|
if (values.help) {
|
|
113
|
-
// biome-ignore lint/suspicious/noConsole: CLI help output
|
|
114
114
|
console.log(`
|
|
115
115
|
Usage: agent-eval-harness grade [extracted.jsonl] --grader <grader> [options]
|
|
116
116
|
|
package/src/pipeline/pipeline.ts
CHANGED
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
17
|
// Commands
|
|
18
|
-
export { compare } from './compare.ts'
|
|
18
|
+
export { type CompareStrategy, compare, type ExtendedCompareConfig, runCompare } from './compare.ts'
|
|
19
19
|
export { extract } from './extract.ts'
|
|
20
20
|
export { format } from './format.ts'
|
|
21
21
|
export { grade } from './grade.ts'
|
|
@@ -27,6 +27,7 @@ export type {
|
|
|
27
27
|
ComparisonGraderResult,
|
|
28
28
|
ComparisonRanking,
|
|
29
29
|
ComparisonResult,
|
|
30
|
+
ComparisonRunData,
|
|
30
31
|
ExtractConfig,
|
|
31
32
|
ExtractedResult,
|
|
32
33
|
FormatConfig,
|
package/src/pipeline/pipeline.types.ts
CHANGED
|
@@ -26,6 +26,8 @@ export type RawOutput = {
|
|
|
26
26
|
input: string | string[]
|
|
27
27
|
/** Grader context hint */
|
|
28
28
|
hint?: string
|
|
29
|
+
/** Optional metadata from original prompt */
|
|
30
|
+
metadata?: Record<string, unknown>
|
|
29
31
|
/** Raw output lines from the agent (JSON strings) */
|
|
30
32
|
rawLines: string[]
|
|
31
33
|
/** Timing metadata */
|
|
@@ -58,6 +60,8 @@ export type ExtractedResult = {
|
|
|
58
60
|
trajectory: TrajectoryStep[]
|
|
59
61
|
/** Whether tool errors were detected */
|
|
60
62
|
toolErrors: boolean
|
|
63
|
+
/** Optional metadata from original prompt */
|
|
64
|
+
metadata?: Record<string, unknown>
|
|
61
65
|
/** Timing metadata */
|
|
62
66
|
timing: {
|
|
63
67
|
start: number
|
|
@@ -158,6 +162,28 @@ export type LabeledRun = {
|
|
|
158
162
|
path: string
|
|
159
163
|
}
|
|
160
164
|
|
|
165
|
+
/**
|
|
166
|
+
* Run data provided to comparison graders.
|
|
167
|
+
*
|
|
168
|
+
* @remarks
|
|
169
|
+
* Extended run data includes optional fields that built-in graders use:
|
|
170
|
+
* - `score`: Grader result if the run was previously graded
|
|
171
|
+
* - `duration`: Total duration from timing
|
|
172
|
+
* - `toolErrors`: Whether tool errors occurred
|
|
173
|
+
*/
|
|
174
|
+
export type ComparisonRunData = {
|
|
175
|
+
/** Final agent output */
|
|
176
|
+
output: string
|
|
177
|
+
/** Execution trajectory (optional, varies by adapter) */
|
|
178
|
+
trajectory?: TrajectoryStep[]
|
|
179
|
+
/** Grader score (if run was graded) */
|
|
180
|
+
score?: GraderResult
|
|
181
|
+
/** Total duration in milliseconds */
|
|
182
|
+
duration?: number
|
|
183
|
+
/** Whether tool errors occurred */
|
|
184
|
+
toolErrors?: boolean
|
|
185
|
+
}
|
|
186
|
+
|
|
161
187
|
/**
|
|
162
188
|
* Input to comparison grader function.
|
|
163
189
|
*
|
|
@@ -172,8 +198,10 @@ export type ComparisonGraderInput = {
|
|
|
172
198
|
input: string | string[]
|
|
173
199
|
/** Grader context hint */
|
|
174
200
|
hint?: string
|
|
201
|
+
/** Optional metadata from original prompt */
|
|
202
|
+
metadata?: Record<string, unknown>
|
|
175
203
|
/** Results keyed by run label */
|
|
176
|
-
runs: Record<string,
|
|
204
|
+
runs: Record<string, ComparisonRunData>
|
|
177
205
|
}
|
|
178
206
|
|
|
179
207
|
/**
|
package/src/pipeline/run.ts
CHANGED
|
@@ -105,7 +105,7 @@ const runShell = async (
|
|
|
105
105
|
*/
|
|
106
106
|
export const runPipeline = async (
|
|
107
107
|
config: RunConfig,
|
|
108
|
-
prompts: Array<{ id: string; input: string | string[]; hint?: string }>,
|
|
108
|
+
prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record<string, unknown> }>,
|
|
109
109
|
outputPath?: string,
|
|
110
110
|
): Promise<void> => {
|
|
111
111
|
const {
|
|
@@ -181,6 +181,7 @@ export const runPipeline = async (
|
|
|
181
181
|
id: promptCase.id,
|
|
182
182
|
input: promptCase.input,
|
|
183
183
|
hint: promptCase.hint,
|
|
184
|
+
metadata: promptCase.metadata,
|
|
184
185
|
rawLines,
|
|
185
186
|
timing: {
|
|
186
187
|
start: startTime,
|
|
@@ -224,6 +225,7 @@ export const runPipeline = async (
|
|
|
224
225
|
id: promptCase.id,
|
|
225
226
|
input: promptCase.input,
|
|
226
227
|
hint: promptCase.hint,
|
|
228
|
+
metadata: promptCase.metadata,
|
|
227
229
|
rawLines: allLines,
|
|
228
230
|
timing: {
|
|
229
231
|
start: startTime,
|
|
@@ -267,6 +269,7 @@ export const runPipeline = async (
|
|
|
267
269
|
id: promptCase.id,
|
|
268
270
|
input: promptCase.input,
|
|
269
271
|
hint: promptCase.hint,
|
|
272
|
+
metadata: promptCase.metadata,
|
|
270
273
|
rawLines: allLines,
|
|
271
274
|
timing: {
|
|
272
275
|
start: startTime,
|
|
@@ -331,7 +334,6 @@ export const run = async (args: string[]): Promise<void> => {
|
|
|
331
334
|
})
|
|
332
335
|
|
|
333
336
|
if (values.help) {
|
|
334
|
-
// biome-ignore lint/suspicious/noConsole: CLI help output
|
|
335
337
|
console.log(`
|
|
336
338
|
Usage: agent-eval-harness run [prompts.jsonl] [options]
|
|
337
339
|
|
|
@@ -383,7 +385,7 @@ Examples:
|
|
|
383
385
|
|
|
384
386
|
// Load prompts from file or stdin
|
|
385
387
|
const promptsPath = positionals[0]
|
|
386
|
-
let prompts: Array<{ id: string; input: string | string[]; hint?: string }>
|
|
388
|
+
let prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record<string, unknown> }>
|
|
387
389
|
|
|
388
390
|
if (promptsPath) {
|
|
389
391
|
prompts = await loadPrompts(promptsPath)
|
package/src/schemas/grader-loader.ts
CHANGED
|
@@ -40,12 +40,20 @@ const resolvePath = (path: string): string => {
|
|
|
40
40
|
// Executable Grader
|
|
41
41
|
// ============================================================================
|
|
42
42
|
|
|
43
|
-
/**
|
|
43
|
+
/**
|
|
44
|
+
* Input format for executable graders (stdin JSON).
|
|
45
|
+
*
|
|
46
|
+
* @remarks
|
|
47
|
+
* The metadata field contains arbitrary key-value pairs from the original
|
|
48
|
+
* prompt JSONL (e.g., category, difficulty, tags). Use this to implement
|
|
49
|
+
* category-specific grading logic or filter calibration samples.
|
|
50
|
+
*/
|
|
44
51
|
type ExecGraderInput = {
|
|
45
52
|
input: string | string[]
|
|
46
53
|
output: string
|
|
47
54
|
hint?: string
|
|
48
55
|
trajectory?: TrajectoryStep[]
|
|
56
|
+
metadata?: Record<string, unknown>
|
|
49
57
|
}
|
|
50
58
|
|
|
51
59
|
/**
|
package/src/schemas/schemas-cli.ts
CHANGED
|
@@ -99,10 +99,8 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
|
|
|
99
99
|
// List mode
|
|
100
100
|
if (list) {
|
|
101
101
|
const names = Object.keys(SCHEMA_REGISTRY)
|
|
102
|
-
// biome-ignore lint/suspicious/noConsole: CLI stdout output
|
|
103
102
|
console.log('Available schemas:')
|
|
104
103
|
for (const name of names) {
|
|
105
|
-
// biome-ignore lint/suspicious/noConsole: CLI stdout output
|
|
106
104
|
console.log(` - ${name}`)
|
|
107
105
|
}
|
|
108
106
|
return names
|
|
@@ -123,7 +121,6 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
|
|
|
123
121
|
if (outputPath) {
|
|
124
122
|
await Bun.write(resolvePath(outputPath), output)
|
|
125
123
|
} else {
|
|
126
|
-
// biome-ignore lint/suspicious/noConsole: CLI stdout output
|
|
127
124
|
console.log(output)
|
|
128
125
|
}
|
|
129
126
|
|
|
@@ -154,15 +151,12 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
|
|
|
154
151
|
if (outputPath) {
|
|
155
152
|
await Bun.write(resolvePath(outputPath), output)
|
|
156
153
|
} else {
|
|
157
|
-
// biome-ignore lint/suspicious/noConsole: CLI stdout output
|
|
158
154
|
console.log(output)
|
|
159
155
|
}
|
|
160
156
|
} else {
|
|
161
157
|
// Default: list schemas
|
|
162
|
-
// biome-ignore lint/suspicious/noConsole: CLI stdout output
|
|
163
158
|
console.log('Available schemas (use --json to export):')
|
|
164
159
|
for (const name of Object.keys(allSchemas)) {
|
|
165
|
-
// biome-ignore lint/suspicious/noConsole: CLI stdout output
|
|
166
160
|
console.log(` - ${name}`)
|
|
167
161
|
}
|
|
168
162
|
}
|
|
@@ -193,7 +187,6 @@ export const schemasCli = async (args: string[]): Promise<void> => {
|
|
|
193
187
|
})
|
|
194
188
|
|
|
195
189
|
if (values.help) {
|
|
196
|
-
// biome-ignore lint/suspicious/noConsole: CLI help output
|
|
197
190
|
console.log(`
|
|
198
191
|
Usage: agent-eval-harness schemas [schema-name] [options]
|
|
199
192
|
|
package/src/schemas/schemas.ts
CHANGED
|
@@ -229,12 +229,14 @@ export type GraderResult = z.infer<typeof GraderResultSchema>
|
|
|
229
229
|
* User-provided graders implement this interface to score agent outputs.
|
|
230
230
|
* - `input` is the original prompt (string or array for multi-turn)
|
|
231
231
|
* - `hint` provides grader context (renamed from `expected`)
|
|
232
|
+
* - `metadata` contains arbitrary key-value pairs from the original prompt JSONL
|
|
232
233
|
*/
|
|
233
234
|
export type Grader = (params: {
|
|
234
235
|
input: string | string[]
|
|
235
236
|
output: string
|
|
236
237
|
hint?: string
|
|
237
238
|
trajectory?: TrajectoryStep[]
|
|
239
|
+
metadata?: Record<string, unknown>
|
|
238
240
|
}) => Promise<GraderResult>
|
|
239
241
|
|
|
240
242
|
// ============================================================================
|
|
@@ -556,3 +558,212 @@ export const ValidationResultSchema = z.object({
|
|
|
556
558
|
|
|
557
559
|
/** Validation result type */
|
|
558
560
|
export type ValidationResult = z.infer<typeof ValidationResultSchema>
|
|
561
|
+
|
|
562
|
+
// ============================================================================
|
|
563
|
+
// Comparison Report Schemas
|
|
564
|
+
// ============================================================================
|
|
565
|
+
|
|
566
|
+
/**
|
|
567
|
+
* Score distribution histogram for quality analysis.
|
|
568
|
+
*
|
|
569
|
+
* @remarks
|
|
570
|
+
* Buckets divide the 0-1 score range into 5 equal bins.
|
|
571
|
+
*/
|
|
572
|
+
export const ScoreDistributionSchema = z.object({
|
|
573
|
+
'0.0-0.2': z.number(),
|
|
574
|
+
'0.2-0.4': z.number(),
|
|
575
|
+
'0.4-0.6': z.number(),
|
|
576
|
+
'0.6-0.8': z.number(),
|
|
577
|
+
'0.8-1.0': z.number(),
|
|
578
|
+
})
|
|
579
|
+
|
|
580
|
+
/** Score distribution type */
|
|
581
|
+
export type ScoreDistribution = z.infer<typeof ScoreDistributionSchema>
|
|
582
|
+
|
|
583
|
+
/**
|
|
584
|
+
* Quality metrics for a single run in comparison.
|
|
585
|
+
*/
|
|
586
|
+
export const QualityMetricsSchema = z.object({
|
|
587
|
+
/** Mean grader score (0-1) */
|
|
588
|
+
avgScore: z.number(),
|
|
589
|
+
/** Percentage of pass=true results */
|
|
590
|
+
passRate: z.number(),
|
|
591
|
+
/** Count of passing results */
|
|
592
|
+
passCount: z.number(),
|
|
593
|
+
/** Count of failing results */
|
|
594
|
+
failCount: z.number(),
|
|
595
|
+
/** Score distribution histogram */
|
|
596
|
+
scoreDistribution: ScoreDistributionSchema,
|
|
597
|
+
})
|
|
598
|
+
|
|
599
|
+
/** Quality metrics type */
|
|
600
|
+
export type QualityMetrics = z.infer<typeof QualityMetricsSchema>
|
|
601
|
+
|
|
602
|
+
/**
|
|
603
|
+
* Latency statistics for performance analysis.
|
|
604
|
+
*/
|
|
605
|
+
export const LatencyStatsSchema = z.object({
|
|
606
|
+
/** 50th percentile (median) in milliseconds */
|
|
607
|
+
p50: z.number(),
|
|
608
|
+
/** 90th percentile in milliseconds */
|
|
609
|
+
p90: z.number(),
|
|
610
|
+
/** 99th percentile in milliseconds */
|
|
611
|
+
p99: z.number(),
|
|
612
|
+
/** Mean latency in milliseconds */
|
|
613
|
+
mean: z.number(),
|
|
614
|
+
/** Minimum latency in milliseconds */
|
|
615
|
+
min: z.number(),
|
|
616
|
+
/** Maximum latency in milliseconds */
|
|
617
|
+
max: z.number(),
|
|
618
|
+
})
|
|
619
|
+
|
|
620
|
+
/** Latency stats type */
|
|
621
|
+
export type LatencyStats = z.infer<typeof LatencyStatsSchema>
|
|
622
|
+
|
|
623
|
+
/**
|
|
624
|
+
* Performance metrics for a single run in comparison.
|
|
625
|
+
*/
|
|
626
|
+
export const PerformanceMetricsSchema = z.object({
|
|
627
|
+
/** End-to-end latency statistics */
|
|
628
|
+
latency: LatencyStatsSchema,
|
|
629
|
+
/** Time to first response statistics (optional, not all adapters support) */
|
|
630
|
+
firstResponse: LatencyStatsSchema.optional(),
|
|
631
|
+
/** Sum of all run durations in milliseconds */
|
|
632
|
+
totalDuration: z.number(),
|
|
633
|
+
})
|
|
634
|
+
|
|
635
|
+
/** Performance metrics type */
|
|
636
|
+
export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
|
|
637
|
+
|
|
638
|
+
/**
|
|
639
|
+
* Reliability metrics for a single run in comparison.
|
|
640
|
+
*/
|
|
641
|
+
export const ReliabilityMetricsSchema = z.object({
|
|
642
|
+
/** Count of runs with toolErrors=true */
|
|
643
|
+
toolErrors: z.number(),
|
|
644
|
+
/** Percentage of runs with tool errors */
|
|
645
|
+
toolErrorRate: z.number(),
|
|
646
|
+
/** Count of runs that hit timeout */
|
|
647
|
+
timeouts: z.number(),
|
|
648
|
+
/** Percentage of runs that hit timeout */
|
|
649
|
+
timeoutRate: z.number(),
|
|
650
|
+
/** Percentage of runs that completed successfully */
|
|
651
|
+
completionRate: z.number(),
|
|
652
|
+
})
|
|
653
|
+
|
|
654
|
+
/** Reliability metrics type */
|
|
655
|
+
export type ReliabilityMetrics = z.infer<typeof ReliabilityMetricsSchema>
|
|
656
|
+
|
|
657
|
+
/**
|
|
658
|
+
* Trajectory info for a single run in comparison.
|
|
659
|
+
*/
|
|
660
|
+
export const TrajectoryInfoSchema = z.object({
|
|
661
|
+
/** Trajectory richness level */
|
|
662
|
+
richness: TrajectoryRichnessSchema,
|
|
663
|
+
/** Average trajectory steps per run */
|
|
664
|
+
avgStepCount: z.number(),
|
|
665
|
+
})
|
|
666
|
+
|
|
667
|
+
/** Trajectory info type */
|
|
668
|
+
export type TrajectoryInfo = z.infer<typeof TrajectoryInfoSchema>
|
|
669
|
+
|
|
670
|
+
/**
|
|
671
|
+
* Per-prompt comparison entry for head-to-head drill-down.
|
|
672
|
+
*/
|
|
673
|
+
export const PromptComparisonSchema = z.object({
|
|
674
|
+
/** Prompt identifier */
|
|
675
|
+
id: z.string(),
|
|
676
|
+
/** Run label of the winner, or null if tie */
|
|
677
|
+
winner: z.string().nullable(),
|
|
678
|
+
/** Scores by run label */
|
|
679
|
+
scores: z.record(z.string(), z.number()),
|
|
680
|
+
/** Latencies by run label in milliseconds */
|
|
681
|
+
latencies: z.record(z.string(), z.number()),
|
|
682
|
+
/** Whether each run had errors */
|
|
683
|
+
hadErrors: z.record(z.string(), z.boolean()),
|
|
684
|
+
})
|
|
685
|
+
|
|
686
|
+
/** Prompt comparison type */
|
|
687
|
+
export type PromptComparison = z.infer<typeof PromptComparisonSchema>
|
|
688
|
+
|
|
689
|
+
/**
|
|
690
|
+
* Pairwise win/loss/tie statistics between two runs.
|
|
691
|
+
*/
|
|
692
|
+
export const PairwiseComparisonSchema = z.object({
|
|
693
|
+
/** First run label */
|
|
694
|
+
runA: z.string(),
|
|
695
|
+
/** Second run label */
|
|
696
|
+
runB: z.string(),
|
|
697
|
+
/** Number of prompts where A won */
|
|
698
|
+
aWins: z.number(),
|
|
699
|
+
/** Number of prompts where B won */
|
|
700
|
+
bWins: z.number(),
|
|
701
|
+
/** Number of prompts where A and B tied */
|
|
702
|
+
ties: z.number(),
|
|
703
|
+
})
|
|
704
|
+
|
|
705
|
+
/** Pairwise comparison type */
|
|
706
|
+
export type PairwiseComparison = z.infer<typeof PairwiseComparisonSchema>
|
|
707
|
+
|
|
708
|
+
/**
|
|
709
|
+
* Head-to-head comparison section.
|
|
710
|
+
*/
|
|
711
|
+
export const HeadToHeadSchema = z.object({
|
|
712
|
+
/** Per-prompt breakdown for drill-down */
|
|
713
|
+
prompts: z.array(PromptComparisonSchema),
|
|
714
|
+
/** Pairwise win rates between runs */
|
|
715
|
+
pairwise: z.array(PairwiseComparisonSchema),
|
|
716
|
+
})
|
|
717
|
+
|
|
718
|
+
/** Head-to-head type */
|
|
719
|
+
export type HeadToHead = z.infer<typeof HeadToHeadSchema>
|
|
720
|
+
|
|
721
|
+
/**
|
|
722
|
+
* Metadata for the comparison report.
|
|
723
|
+
*/
|
|
724
|
+
export const ComparisonMetaSchema = z.object({
|
|
725
|
+
/** ISO timestamp when report was generated */
|
|
726
|
+
generatedAt: z.string(),
|
|
727
|
+
/** Run labels included in comparison */
|
|
728
|
+
runs: z.array(z.string()),
|
|
729
|
+
/** Total prompts compared */
|
|
730
|
+
promptCount: z.number(),
|
|
731
|
+
/** Prompts where all runs completed */
|
|
732
|
+
promptsWithAllRuns: z.number(),
|
|
733
|
+
})
|
|
734
|
+
|
|
735
|
+
/** Comparison meta type */
|
|
736
|
+
export type ComparisonMeta = z.infer<typeof ComparisonMetaSchema>
|
|
737
|
+
|
|
738
|
+
/**
|
|
739
|
+
* Holistic comparison report schema.
|
|
740
|
+
*
|
|
741
|
+
* @remarks
|
|
742
|
+
* Aggregates comparison output across all dimensions:
|
|
743
|
+
* - Quality: pass rates, scores, distributions
|
|
744
|
+
* - Performance: latency percentiles
|
|
745
|
+
* - Reliability: error rates, completion rates
|
|
746
|
+
* - Head-to-head: per-prompt winners, pairwise stats
|
|
747
|
+
*
|
|
748
|
+
* Note: Tool usage analysis is NOT included because adapter formats vary.
|
|
749
|
+
* Different adapters provide different `trajectoryRichness` levels and
|
|
750
|
+
* the `tool_call.name` field often contains tool use IDs rather than
|
|
751
|
+
* human-readable names.
|
|
752
|
+
*/
|
|
753
|
+
export const ComparisonReportSchema = z.object({
|
|
754
|
+
/** Report metadata */
|
|
755
|
+
meta: ComparisonMetaSchema,
|
|
756
|
+
/** Quality metrics by run label */
|
|
757
|
+
quality: z.record(z.string(), QualityMetricsSchema),
|
|
758
|
+
/** Performance metrics by run label */
|
|
759
|
+
performance: z.record(z.string(), PerformanceMetricsSchema),
|
|
760
|
+
/** Reliability metrics by run label */
|
|
761
|
+
reliability: z.record(z.string(), ReliabilityMetricsSchema),
|
|
762
|
+
/** Trajectory info by run label */
|
|
763
|
+
trajectoryInfo: z.record(z.string(), TrajectoryInfoSchema),
|
|
764
|
+
/** Head-to-head comparison details */
|
|
765
|
+
headToHead: HeadToHeadSchema,
|
|
766
|
+
})
|
|
767
|
+
|
|
768
|
+
/** Comparison report type */
|
|
769
|
+
export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
|
package/src/schemas.ts
CHANGED
|
@@ -35,10 +35,17 @@ export {
|
|
|
35
35
|
CaptureResultSchema,
|
|
36
36
|
type CategoryDistribution,
|
|
37
37
|
CategoryDistributionSchema,
|
|
38
|
+
// Comparison report types
|
|
39
|
+
type ComparisonMeta,
|
|
40
|
+
ComparisonMetaSchema,
|
|
41
|
+
type ComparisonReport,
|
|
42
|
+
ComparisonReportSchema,
|
|
38
43
|
EnvVariableSchema,
|
|
39
44
|
type Grader,
|
|
40
45
|
type GraderResult,
|
|
41
46
|
GraderResultSchema,
|
|
47
|
+
type HeadToHead,
|
|
48
|
+
HeadToHeadSchema,
|
|
42
49
|
HttpHeaderSchema,
|
|
43
50
|
type IndexedStep,
|
|
44
51
|
type JsonRpcError,
|
|
@@ -55,14 +62,28 @@ export {
|
|
|
55
62
|
JsonRpcResponseSchema,
|
|
56
63
|
type JsonRpcSuccessResponse,
|
|
57
64
|
JsonRpcSuccessResponseSchema,
|
|
65
|
+
type LatencyStats,
|
|
66
|
+
LatencyStatsSchema,
|
|
58
67
|
type McpServerConfig,
|
|
59
68
|
McpServerHttpSchema,
|
|
60
69
|
McpServerSchema,
|
|
61
70
|
McpServerStdioSchema,
|
|
62
71
|
MessageStepSchema,
|
|
72
|
+
type PairwiseComparison,
|
|
73
|
+
PairwiseComparisonSchema,
|
|
74
|
+
type PerformanceMetrics,
|
|
75
|
+
PerformanceMetricsSchema,
|
|
63
76
|
PlanStepSchema,
|
|
64
77
|
type PromptCase,
|
|
65
78
|
PromptCaseSchema,
|
|
79
|
+
type PromptComparison,
|
|
80
|
+
PromptComparisonSchema,
|
|
81
|
+
type QualityMetrics,
|
|
82
|
+
QualityMetricsSchema,
|
|
83
|
+
type ReliabilityMetrics,
|
|
84
|
+
ReliabilityMetricsSchema,
|
|
85
|
+
type ScoreDistribution,
|
|
86
|
+
ScoreDistributionSchema,
|
|
66
87
|
type Session,
|
|
67
88
|
SessionSchema,
|
|
68
89
|
type SummaryResult,
|
|
@@ -73,6 +94,8 @@ export {
|
|
|
73
94
|
ToolCallStepSchema,
|
|
74
95
|
type ToolInput,
|
|
75
96
|
ToolInputSchema,
|
|
97
|
+
type TrajectoryInfo,
|
|
98
|
+
TrajectoryInfoSchema,
|
|
76
99
|
type TrajectoryRichness,
|
|
77
100
|
TrajectoryRichnessSchema,
|
|
78
101
|
type TrajectoryStep,
|