npm - @plaited/agent-eval-harness - Versions diffs - 0.8.1 → 0.9.0 - Mend

@plaited/agent-eval-harness 0.8.1 → 0.9.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6) [hide / show]

package/package.json +1 -1
package/src/pipeline/compare-trials.ts +2 -1
package/src/pipeline/compare.ts +1 -0
package/src/pipeline/tests/compare-statistical.spec.ts +4 -0
package/src/pipeline/tests/compare-trials.spec.ts +3 -0
package/src/schemas/schemas.ts +4 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.8.1",
+  "version": "0.9.0",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {

package/src/pipeline/compare-trials.ts CHANGED Viewed

@@ -195,13 +195,14 @@ const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMet
   const passExpKValues = results.map((r) => r.passExpK ?? 0)
   if (passExpKValues.length === 0) {
-    return { avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
+    return { type: 'trial', avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
   }
   const sorted = [...passExpKValues].sort((a, b) => a - b)
   const sum = passExpKValues.reduce((a, b) => a + b, 0)
   return {
+    type: 'trial',
     avgPassExpK: sum / passExpKValues.length,
     medianPassExpK: percentile(sorted, 0.5),
     p25PassExpK: percentile(sorted, 0.25),

package/src/pipeline/compare.ts CHANGED Viewed

@@ -464,6 +464,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
     const completedCount = results.filter((r) => r.output && !r.errors?.length).length
     reliability[label] = {
+      type: 'run',
       toolErrors: toolErrorCount,
       toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
       timeouts: timeoutCount,

package/src/pipeline/tests/compare-statistical.spec.ts CHANGED Viewed

@@ -101,6 +101,10 @@ describe('runCompare statistical strategy', () => {
     const passRateCI = highQuality?.confidenceIntervals?.passRate
     expect(passRateCI).toHaveLength(2)
     expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0)
+    // Verify reliability metrics include type discriminator
+    expect(report.reliability.high?.type).toBe('run')
+    expect(report.reliability.low?.type).toBe('run')
   })
   test('computes confidence intervals for performance metrics', async () => {

package/src/pipeline/tests/compare-trials.spec.ts CHANGED Viewed

@@ -108,6 +108,8 @@ describe('runTrialsCompare', () => {
     expect(report.meta.promptCount).toBe(2)
     expect(report.capability).toBeDefined()
     expect(report.reliability).toBeDefined()
+    expect(report.reliability.baseline?.type).toBe('trial')
+    expect(report.reliability.variant?.type).toBe('trial')
     expect(report.flakiness).toBeDefined()
     expect(report.headToHead.capability.length).toBeGreaterThan(0)
@@ -284,6 +286,7 @@ describe('runTrialsCompare', () => {
     // Verify confidence intervals are computed for reliability
     const reliableRel = report.reliability.reliable
     expect(reliableRel).toBeDefined()
+    expect(reliableRel?.type).toBe('trial')
     expect(reliableRel?.confidenceIntervals).toBeDefined()
     expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined()

package/src/schemas/schemas.ts CHANGED Viewed

@@ -688,6 +688,8 @@ export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
  * Reliability metrics for a single run in comparison.
  */
 export const ReliabilityMetricsSchema = z.object({
+  /** Discriminator for run-based reliability metrics */
+  type: z.literal('run'),
   /** Count of runs with toolErrors=true */
   toolErrors: z.number(),
   /** Percentage of runs with tool errors */
@@ -874,6 +876,8 @@ export type TrialsReliabilityConfidenceIntervals = z.infer<typeof TrialsReliabil
  * Higher passExpK means the agent reliably solves the task every time.
  */
 export const TrialsReliabilityMetricsSchema = z.object({
+  /** Discriminator for trial-based reliability metrics */
+  type: z.literal('trial'),
   /** Average passExpK across all prompts */
   avgPassExpK: z.number(),
   /** Median passExpK */