@plaited/agent-eval-harness 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -91,6 +91,7 @@ const extractFromRaw = (rawOutput: RawOutput, parser: ReturnType<typeof createOu
     output: finalOutput,
     trajectory,
     toolErrors: toolErrors || !!rawOutput.error,
+    metadata: rawOutput.metadata,
     timing: rawOutput.timing,
     ...(rawOutput.error && { error: rawOutput.error }),
   }
@@ -186,7 +187,6 @@ export const extract = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness extract [raw.jsonl] --schema <schema.json> [options]

@@ -235,7 +235,6 @@ export const format = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness format [results.jsonl] [options]

@@ -51,6 +51,7 @@ export const runGrade = async (
     output: extracted.output,
     hint: extracted.hint,
     trajectory: extracted.trajectory,
+    metadata: extracted.metadata,
   })

   const graded: GradedResult = {
@@ -110,7 +111,6 @@ export const grade = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness grade [extracted.jsonl] --grader <grader> [options]

@@ -15,7 +15,7 @@
  */

 // Commands
-export { compare } from './compare.ts'
+export { type CompareStrategy, compare, type ExtendedCompareConfig, runCompare } from './compare.ts'
 export { extract } from './extract.ts'
 export { format } from './format.ts'
 export { grade } from './grade.ts'
@@ -27,6 +27,7 @@ export type {
   ComparisonGraderResult,
   ComparisonRanking,
   ComparisonResult,
+  ComparisonRunData,
   ExtractConfig,
   ExtractedResult,
   FormatConfig,
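For programmatic consumers, the newly public symbols sit alongside the existing command exports. A minimal sketch, assuming the package entry point re-exports this commands module (`runCompare`'s signature lives in `compare.ts` and is not shown in this diff):

```ts
import {
  compare,
  runCompare,
  type CompareStrategy,
  type ComparisonRunData,
  type ExtendedCompareConfig,
} from '@plaited/agent-eval-harness'
```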
@@ -26,6 +26,8 @@ export type RawOutput = {
   input: string | string[]
   /** Grader context hint */
   hint?: string
+  /** Optional metadata from original prompt */
+  metadata?: Record<string, unknown>
   /** Raw output lines from the agent (JSON strings) */
   rawLines: string[]
   /** Timing metadata */
@@ -58,6 +60,8 @@ export type ExtractedResult = {
   trajectory: TrajectoryStep[]
   /** Whether tool errors were detected */
   toolErrors: boolean
+  /** Optional metadata from original prompt */
+  metadata?: Record<string, unknown>
   /** Timing metadata */
   timing: {
     start: number
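The new field is a verbatim pass-through: whatever object a prompt line carries under `metadata` is copied onto the `RawOutput` and again onto the `ExtractedResult` handed to graders. A sketch of the slice of an extracted record this diff touches (values hypothetical, other fields omitted):

```ts
// One extracted record, shown as a TS object literal for readability.
// Only fields visible in this diff; values are illustrative.
const extracted = {
  trajectory: [],
  toolErrors: false,
  metadata: { category: 'math', difficulty: 'easy' }, // new in 0.6.0, copied from the prompt
  timing: { start: 1_700_000_000_000 }, // full timing shape defined elsewhere in the package
}
```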
@@ -158,6 +162,28 @@ export type LabeledRun = {
   path: string
 }

+/**
+ * Run data provided to comparison graders.
+ *
+ * @remarks
+ * Extended run data includes optional fields that built-in graders use:
+ * - `score`: Grader result if the run was previously graded
+ * - `duration`: Total duration from timing
+ * - `toolErrors`: Whether tool errors occurred
+ */
+export type ComparisonRunData = {
+  /** Final agent output */
+  output: string
+  /** Execution trajectory (optional, varies by adapter) */
+  trajectory?: TrajectoryStep[]
+  /** Grader score (if run was graded) */
+  score?: GraderResult
+  /** Total duration in milliseconds */
+  duration?: number
+  /** Whether tool errors occurred */
+  toolErrors?: boolean
+}
+
 /**
  * Input to comparison grader function.
  *
@@ -172,8 +198,10 @@ export type ComparisonGraderInput = {
   input: string | string[]
   /** Grader context hint */
   hint?: string
+  /** Optional metadata from original prompt */
+  metadata?: Record<string, unknown>
   /** Results keyed by run label */
-  runs: Record<string, { output: string; trajectory?: TrajectoryStep[] }>
+  runs: Record<string, ComparisonRunData>
 }

 /**
@@ -105,7 +105,7 @@ const runShell = async (
  */
 export const runPipeline = async (
   config: RunConfig,
-  prompts: Array<{ id: string; input: string | string[]; hint?: string }>,
+  prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record<string, unknown> }>,
   outputPath?: string,
 ): Promise<void> => {
   const {
@@ -181,6 +181,7 @@ export const runPipeline = async (
       id: promptCase.id,
       input: promptCase.input,
       hint: promptCase.hint,
+      metadata: promptCase.metadata,
       rawLines,
       timing: {
         start: startTime,
@@ -224,6 +225,7 @@ export const runPipeline = async (
       id: promptCase.id,
       input: promptCase.input,
       hint: promptCase.hint,
+      metadata: promptCase.metadata,
       rawLines: allLines,
       timing: {
         start: startTime,
@@ -267,6 +269,7 @@ export const runPipeline = async (
       id: promptCase.id,
       input: promptCase.input,
       hint: promptCase.hint,
+      metadata: promptCase.metadata,
       rawLines: allLines,
       timing: {
         start: startTime,
@@ -331,7 +334,6 @@ export const run = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness run [prompts.jsonl] [options]

@@ -383,7 +385,7 @@ Examples:

   // Load prompts from file or stdin
   const promptsPath = positionals[0]
-  let prompts: Array<{ id: string; input: string | string[]; hint?: string }>
+  let prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record<string, unknown> }>

   if (promptsPath) {
     prompts = await loadPrompts(promptsPath)
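Prompt files can now attach free-form metadata per case. A hypothetical `prompts.jsonl`, written here from TypeScript to match the inline prompt type above (ids and values illustrative):

```ts
// Each line is one JSON object matching { id, input, hint?, metadata? }.
const prompts = [
  { id: 'math-001', input: 'What is 6 * 7?', hint: '42', metadata: { category: 'math', difficulty: 'easy' } },
  { id: 'code-002', input: ['Write a fizzbuzz function', 'Now add tests'], metadata: { category: 'coding', tags: ['multi-turn'] } },
]
await Bun.write('prompts.jsonl', prompts.map((p) => JSON.stringify(p)).join('\n'))
```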
@@ -40,12 +40,20 @@ const resolvePath = (path: string): string => {
 // Executable Grader
 // ============================================================================

-/** Input format for executable graders (stdin JSON) */
+/**
+ * Input format for executable graders (stdin JSON).
+ *
+ * @remarks
+ * The metadata field contains arbitrary key-value pairs from the original
+ * prompt JSONL (e.g., category, difficulty, tags). Use this to implement
+ * category-specific grading logic or filter calibration samples.
+ */
 type ExecGraderInput = {
   input: string | string[]
   output: string
   hint?: string
   trajectory?: TrajectoryStep[]
+  metadata?: Record<string, unknown>
 }

 /**
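A sketch of an executable grader honoring that stdin contract, using Bun since the package itself targets it. The reply shape is an assumption: a GraderResult-like JSON object with a 0-1 `score` and a `pass` flag, which this diff does not confirm:

```ts
#!/usr/bin/env bun
// Reads ExecGraderInput as JSON from stdin; metadata drives category-specific logic.
type ExecGraderInput = {
  input: string | string[]
  output: string
  hint?: string
  metadata?: Record<string, unknown>
}

const graderInput = (await Bun.stdin.json()) as ExecGraderInput

// Stricter pass threshold for coding prompts (assumed convention, per the @remarks above).
const threshold = graderInput.metadata?.category === 'coding' ? 0.8 : 0.5
const score = graderInput.hint && graderInput.output.includes(graderInput.hint) ? 1 : 0

// Assumed output contract: GraderResult-shaped JSON on stdout.
console.log(JSON.stringify({ score, pass: score >= threshold }))
```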
@@ -99,10 +99,8 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
   // List mode
   if (list) {
     const names = Object.keys(SCHEMA_REGISTRY)
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log('Available schemas:')
     for (const name of names) {
-      // biome-ignore lint/suspicious/noConsole: CLI stdout output
       console.log(` - ${name}`)
     }
     return names
@@ -123,7 +121,6 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
     if (outputPath) {
       await Bun.write(resolvePath(outputPath), output)
     } else {
-      // biome-ignore lint/suspicious/noConsole: CLI stdout output
       console.log(output)
     }

@@ -154,15 +151,12 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
     if (outputPath) {
       await Bun.write(resolvePath(outputPath), output)
     } else {
-      // biome-ignore lint/suspicious/noConsole: CLI stdout output
       console.log(output)
     }
   } else {
     // Default: list schemas
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log('Available schemas (use --json to export):')
     for (const name of Object.keys(allSchemas)) {
-      // biome-ignore lint/suspicious/noConsole: CLI stdout output
       console.log(` - ${name}`)
     }
   }
@@ -193,7 +187,6 @@ export const schemasCli = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness schemas [schema-name] [options]

@@ -229,12 +229,14 @@ export type GraderResult = z.infer<typeof GraderResultSchema>
  * User-provided graders implement this interface to score agent outputs.
  * - `input` is the original prompt (string or array for multi-turn)
  * - `hint` provides grader context (renamed from `expected`)
+ * - `metadata` contains arbitrary key-value pairs from the original prompt JSONL
  */
 export type Grader = (params: {
   input: string | string[]
   output: string
   hint?: string
   trajectory?: TrajectoryStep[]
+  metadata?: Record<string, unknown>
 }) => Promise<GraderResult>

 // ============================================================================
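A programmatic grader receives the same field. A minimal sketch; the returned fields (`score`, `pass`) are assumptions drawn from the quality-metric comments elsewhere in this diff, so the cast stands in for the real `GraderResult` shape:

```ts
import type { Grader } from '@plaited/agent-eval-harness'

// Sketch: vary grading strictness using prompt metadata.
const grader: Grader = async ({ output, hint, metadata }) => {
  const strict = metadata?.difficulty === 'hard'
  const matched = hint ? output.includes(hint) : output.length > 0
  const score = matched ? 1 : strict ? 0 : 0.25 // partial credit outside strict mode
  return { score, pass: score >= 0.5 } as Awaited<ReturnType<Grader>> // assumed GraderResult fields
}
```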
@@ -556,3 +558,212 @@ export const ValidationResultSchema = z.object({

 /** Validation result type */
 export type ValidationResult = z.infer<typeof ValidationResultSchema>
+
+// ============================================================================
+// Comparison Report Schemas
+// ============================================================================
+
+/**
+ * Score distribution histogram for quality analysis.
+ *
+ * @remarks
+ * Buckets divide the 0-1 score range into 5 equal bins.
+ */
+export const ScoreDistributionSchema = z.object({
+  '0.0-0.2': z.number(),
+  '0.2-0.4': z.number(),
+  '0.4-0.6': z.number(),
+  '0.6-0.8': z.number(),
+  '0.8-1.0': z.number(),
+})
+
+/** Score distribution type */
+export type ScoreDistribution = z.infer<typeof ScoreDistributionSchema>
+
+/**
+ * Quality metrics for a single run in comparison.
+ */
+export const QualityMetricsSchema = z.object({
+  /** Mean grader score (0-1) */
+  avgScore: z.number(),
+  /** Percentage of pass=true results */
+  passRate: z.number(),
+  /** Count of passing results */
+  passCount: z.number(),
+  /** Count of failing results */
+  failCount: z.number(),
+  /** Score distribution histogram */
+  scoreDistribution: ScoreDistributionSchema,
+})
+
+/** Quality metrics type */
+export type QualityMetrics = z.infer<typeof QualityMetricsSchema>
+
+/**
+ * Latency statistics for performance analysis.
+ */
+export const LatencyStatsSchema = z.object({
+  /** 50th percentile (median) in milliseconds */
+  p50: z.number(),
+  /** 90th percentile in milliseconds */
+  p90: z.number(),
+  /** 99th percentile in milliseconds */
+  p99: z.number(),
+  /** Mean latency in milliseconds */
+  mean: z.number(),
+  /** Minimum latency in milliseconds */
+  min: z.number(),
+  /** Maximum latency in milliseconds */
+  max: z.number(),
+})
+
+/** Latency stats type */
+export type LatencyStats = z.infer<typeof LatencyStatsSchema>
+
+/**
+ * Performance metrics for a single run in comparison.
+ */
+export const PerformanceMetricsSchema = z.object({
+  /** End-to-end latency statistics */
+  latency: LatencyStatsSchema,
+  /** Time to first response statistics (optional, not all adapters support) */
+  firstResponse: LatencyStatsSchema.optional(),
+  /** Sum of all run durations in milliseconds */
+  totalDuration: z.number(),
+})
+
+/** Performance metrics type */
+export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
+
+/**
+ * Reliability metrics for a single run in comparison.
+ */
+export const ReliabilityMetricsSchema = z.object({
+  /** Count of runs with toolErrors=true */
+  toolErrors: z.number(),
+  /** Percentage of runs with tool errors */
+  toolErrorRate: z.number(),
+  /** Count of runs that hit timeout */
+  timeouts: z.number(),
+  /** Percentage of runs that hit timeout */
+  timeoutRate: z.number(),
+  /** Percentage of runs that completed successfully */
+  completionRate: z.number(),
+})
+
+/** Reliability metrics type */
+export type ReliabilityMetrics = z.infer<typeof ReliabilityMetricsSchema>
+
+/**
+ * Trajectory info for a single run in comparison.
+ */
+export const TrajectoryInfoSchema = z.object({
+  /** Trajectory richness level */
+  richness: TrajectoryRichnessSchema,
+  /** Average trajectory steps per run */
+  avgStepCount: z.number(),
+})
+
+/** Trajectory info type */
+export type TrajectoryInfo = z.infer<typeof TrajectoryInfoSchema>
+
+/**
+ * Per-prompt comparison entry for head-to-head drill-down.
+ */
+export const PromptComparisonSchema = z.object({
+  /** Prompt identifier */
+  id: z.string(),
+  /** Run label of the winner, or null if tie */
+  winner: z.string().nullable(),
+  /** Scores by run label */
+  scores: z.record(z.string(), z.number()),
+  /** Latencies by run label in milliseconds */
+  latencies: z.record(z.string(), z.number()),
+  /** Whether each run had errors */
+  hadErrors: z.record(z.string(), z.boolean()),
+})
+
+/** Prompt comparison type */
+export type PromptComparison = z.infer<typeof PromptComparisonSchema>
+
+/**
+ * Pairwise win/loss/tie statistics between two runs.
+ */
+export const PairwiseComparisonSchema = z.object({
+  /** First run label */
+  runA: z.string(),
+  /** Second run label */
+  runB: z.string(),
+  /** Number of prompts where A won */
+  aWins: z.number(),
+  /** Number of prompts where B won */
+  bWins: z.number(),
+  /** Number of prompts where A and B tied */
+  ties: z.number(),
+})
+
+/** Pairwise comparison type */
+export type PairwiseComparison = z.infer<typeof PairwiseComparisonSchema>
+
+/**
+ * Head-to-head comparison section.
+ */
+export const HeadToHeadSchema = z.object({
+  /** Per-prompt breakdown for drill-down */
+  prompts: z.array(PromptComparisonSchema),
+  /** Pairwise win rates between runs */
+  pairwise: z.array(PairwiseComparisonSchema),
+})
+
+/** Head-to-head type */
+export type HeadToHead = z.infer<typeof HeadToHeadSchema>
+
+/**
+ * Metadata for the comparison report.
+ */
+export const ComparisonMetaSchema = z.object({
+  /** ISO timestamp when report was generated */
+  generatedAt: z.string(),
+  /** Run labels included in comparison */
+  runs: z.array(z.string()),
+  /** Total prompts compared */
+  promptCount: z.number(),
+  /** Prompts where all runs completed */
+  promptsWithAllRuns: z.number(),
+})
+
+/** Comparison meta type */
+export type ComparisonMeta = z.infer<typeof ComparisonMetaSchema>
+
+/**
+ * Holistic comparison report schema.
+ *
+ * @remarks
+ * Aggregates comparison output across all dimensions:
+ * - Quality: pass rates, scores, distributions
+ * - Performance: latency percentiles
+ * - Reliability: error rates, completion rates
+ * - Head-to-head: per-prompt winners, pairwise stats
+ *
+ * Note: Tool usage analysis is NOT included because adapter formats vary.
+ * Different adapters provide different `trajectoryRichness` levels and
+ * the `tool_call.name` field often contains tool use IDs rather than
+ * human-readable names.
+ */
+export const ComparisonReportSchema = z.object({
+  /** Report metadata */
+  meta: ComparisonMetaSchema,
+  /** Quality metrics by run label */
+  quality: z.record(z.string(), QualityMetricsSchema),
+  /** Performance metrics by run label */
+  performance: z.record(z.string(), PerformanceMetricsSchema),
+  /** Reliability metrics by run label */
+  reliability: z.record(z.string(), ReliabilityMetricsSchema),
+  /** Trajectory info by run label */
+  trajectoryInfo: z.record(z.string(), TrajectoryInfoSchema),
+  /** Head-to-head comparison details */
+  headToHead: HeadToHeadSchema,
+})
+
+/** Comparison report type */
+export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
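Because the report is expressed as zod schemas, downstream tooling can validate a report file before consuming it. A small sketch, assuming the root entry re-exports the schemas module and a hypothetical `comparison-report.json` path:

```ts
import { ComparisonReportSchema } from '@plaited/agent-eval-harness'

// Parse and validate a generated report; throws on shape mismatch.
const report = ComparisonReportSchema.parse(await Bun.file('comparison-report.json').json())

for (const [label, q] of Object.entries(report.quality)) {
  console.log(`${label}: avgScore=${q.avgScore.toFixed(2)}, passRate=${q.passRate}%`)
}
```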
package/src/schemas.ts CHANGED
@@ -35,10 +35,17 @@ export {
   CaptureResultSchema,
   type CategoryDistribution,
   CategoryDistributionSchema,
+  // Comparison report types
+  type ComparisonMeta,
+  ComparisonMetaSchema,
+  type ComparisonReport,
+  ComparisonReportSchema,
   EnvVariableSchema,
   type Grader,
   type GraderResult,
   GraderResultSchema,
+  type HeadToHead,
+  HeadToHeadSchema,
   HttpHeaderSchema,
   type IndexedStep,
   type JsonRpcError,
@@ -55,14 +62,28 @@ export {
   JsonRpcResponseSchema,
   type JsonRpcSuccessResponse,
   JsonRpcSuccessResponseSchema,
+  type LatencyStats,
+  LatencyStatsSchema,
   type McpServerConfig,
   McpServerHttpSchema,
   McpServerSchema,
   McpServerStdioSchema,
   MessageStepSchema,
+  type PairwiseComparison,
+  PairwiseComparisonSchema,
+  type PerformanceMetrics,
+  PerformanceMetricsSchema,
   PlanStepSchema,
   type PromptCase,
   PromptCaseSchema,
+  type PromptComparison,
+  PromptComparisonSchema,
+  type QualityMetrics,
+  QualityMetricsSchema,
+  type ReliabilityMetrics,
+  ReliabilityMetricsSchema,
+  type ScoreDistribution,
+  ScoreDistributionSchema,
   type Session,
   SessionSchema,
   type SummaryResult,
@@ -73,6 +94,8 @@ export {
   ToolCallStepSchema,
   type ToolInput,
   ToolInputSchema,
+  type TrajectoryInfo,
+  TrajectoryInfoSchema,
   type TrajectoryRichness,
   TrajectoryRichnessSchema,
   type TrajectoryStep,