@plaited/agent-eval-harness 0.8.1 → 0.9.0

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.8.1",
3
+ "version": "0.9.0",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -195,13 +195,14 @@ const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMet
195
195
  const passExpKValues = results.map((r) => r.passExpK ?? 0)
196
196
 
197
197
  if (passExpKValues.length === 0) {
198
- return { avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
198
+ return { type: 'trial', avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
199
199
  }
200
200
 
201
201
  const sorted = [...passExpKValues].sort((a, b) => a - b)
202
202
  const sum = passExpKValues.reduce((a, b) => a + b, 0)
203
203
 
204
204
  return {
205
+ type: 'trial',
205
206
  avgPassExpK: sum / passExpKValues.length,
206
207
  medianPassExpK: percentile(sorted, 0.5),
207
208
  p25PassExpK: percentile(sorted, 0.25),
@@ -464,6 +464,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
464
464
  const completedCount = results.filter((r) => r.output && !r.errors?.length).length
465
465
 
466
466
  reliability[label] = {
467
+ type: 'run',
467
468
  toolErrors: toolErrorCount,
468
469
  toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
469
470
  timeouts: timeoutCount,
@@ -101,6 +101,10 @@ describe('runCompare statistical strategy', () => {
101
101
  const passRateCI = highQuality?.confidenceIntervals?.passRate
102
102
  expect(passRateCI).toHaveLength(2)
103
103
  expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0)
104
+
105
+ // Verify reliability metrics include type discriminator
106
+ expect(report.reliability.high?.type).toBe('run')
107
+ expect(report.reliability.low?.type).toBe('run')
104
108
  })
105
109
 
106
110
  test('computes confidence intervals for performance metrics', async () => {
@@ -108,6 +108,8 @@ describe('runTrialsCompare', () => {
108
108
  expect(report.meta.promptCount).toBe(2)
109
109
  expect(report.capability).toBeDefined()
110
110
  expect(report.reliability).toBeDefined()
111
+ expect(report.reliability.baseline?.type).toBe('trial')
112
+ expect(report.reliability.variant?.type).toBe('trial')
111
113
  expect(report.flakiness).toBeDefined()
112
114
  expect(report.headToHead.capability.length).toBeGreaterThan(0)
113
115
 
@@ -284,6 +286,7 @@ describe('runTrialsCompare', () => {
284
286
  // Verify confidence intervals are computed for reliability
285
287
  const reliableRel = report.reliability.reliable
286
288
  expect(reliableRel).toBeDefined()
289
+ expect(reliableRel?.type).toBe('trial')
287
290
  expect(reliableRel?.confidenceIntervals).toBeDefined()
288
291
  expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined()
289
292
 
@@ -688,6 +688,8 @@ export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
688
688
  * Reliability metrics for a single run in comparison.
689
689
  */
690
690
  export const ReliabilityMetricsSchema = z.object({
691
+ /** Discriminator for run-based reliability metrics */
692
+ type: z.literal('run'),
691
693
  /** Count of runs with toolErrors=true */
692
694
  toolErrors: z.number(),
693
695
  /** Percentage of runs with tool errors */
@@ -874,6 +876,8 @@ export type TrialsReliabilityConfidenceIntervals = z.infer<typeof TrialsReliabil
874
876
  * Higher passExpK means the agent reliably solves the task every time.
875
877
  */
876
878
  export const TrialsReliabilityMetricsSchema = z.object({
879
+ /** Discriminator for trial-based reliability metrics */
880
+ type: z.literal('trial'),
877
881
  /** Average passExpK across all prompts */
878
882
  avgPassExpK: z.number(),
879
883
  /** Median passExpK */