@plaited/agent-eval-harness 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -195,13 +195,14 @@ const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMet
|
|
|
195
195
|
const passExpKValues = results.map((r) => r.passExpK ?? 0)
|
|
196
196
|
|
|
197
197
|
if (passExpKValues.length === 0) {
|
|
198
|
-
return { avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
|
|
198
|
+
return { type: 'trial', avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
|
|
199
199
|
}
|
|
200
200
|
|
|
201
201
|
const sorted = [...passExpKValues].sort((a, b) => a - b)
|
|
202
202
|
const sum = passExpKValues.reduce((a, b) => a + b, 0)
|
|
203
203
|
|
|
204
204
|
return {
|
|
205
|
+
type: 'trial',
|
|
205
206
|
avgPassExpK: sum / passExpKValues.length,
|
|
206
207
|
medianPassExpK: percentile(sorted, 0.5),
|
|
207
208
|
p25PassExpK: percentile(sorted, 0.25),
|
package/src/pipeline/compare.ts
CHANGED
|
@@ -464,6 +464,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
|
|
|
464
464
|
const completedCount = results.filter((r) => r.output && !r.errors?.length).length
|
|
465
465
|
|
|
466
466
|
reliability[label] = {
|
|
467
|
+
type: 'run',
|
|
467
468
|
toolErrors: toolErrorCount,
|
|
468
469
|
toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
|
|
469
470
|
timeouts: timeoutCount,
|
|
@@ -101,6 +101,10 @@ describe('runCompare statistical strategy', () => {
|
|
|
101
101
|
const passRateCI = highQuality?.confidenceIntervals?.passRate
|
|
102
102
|
expect(passRateCI).toHaveLength(2)
|
|
103
103
|
expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0)
|
|
104
|
+
|
|
105
|
+
// Verify reliability metrics include type discriminator
|
|
106
|
+
expect(report.reliability.high?.type).toBe('run')
|
|
107
|
+
expect(report.reliability.low?.type).toBe('run')
|
|
104
108
|
})
|
|
105
109
|
|
|
106
110
|
test('computes confidence intervals for performance metrics', async () => {
|
|
@@ -108,6 +108,8 @@ describe('runTrialsCompare', () => {
|
|
|
108
108
|
expect(report.meta.promptCount).toBe(2)
|
|
109
109
|
expect(report.capability).toBeDefined()
|
|
110
110
|
expect(report.reliability).toBeDefined()
|
|
111
|
+
expect(report.reliability.baseline?.type).toBe('trial')
|
|
112
|
+
expect(report.reliability.variant?.type).toBe('trial')
|
|
111
113
|
expect(report.flakiness).toBeDefined()
|
|
112
114
|
expect(report.headToHead.capability.length).toBeGreaterThan(0)
|
|
113
115
|
|
|
@@ -284,6 +286,7 @@ describe('runTrialsCompare', () => {
|
|
|
284
286
|
// Verify confidence intervals are computed for reliability
|
|
285
287
|
const reliableRel = report.reliability.reliable
|
|
286
288
|
expect(reliableRel).toBeDefined()
|
|
289
|
+
expect(reliableRel?.type).toBe('trial')
|
|
287
290
|
expect(reliableRel?.confidenceIntervals).toBeDefined()
|
|
288
291
|
expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined()
|
|
289
292
|
|
package/src/schemas/schemas.ts
CHANGED
|
@@ -688,6 +688,8 @@ export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
|
|
|
688
688
|
* Reliability metrics for a single run in comparison.
|
|
689
689
|
*/
|
|
690
690
|
export const ReliabilityMetricsSchema = z.object({
|
|
691
|
+
/** Discriminator for run-based reliability metrics */
|
|
692
|
+
type: z.literal('run'),
|
|
691
693
|
/** Count of runs with toolErrors=true */
|
|
692
694
|
toolErrors: z.number(),
|
|
693
695
|
/** Percentage of runs with tool errors */
|
|
@@ -874,6 +876,8 @@ export type TrialsReliabilityConfidenceIntervals = z.infer<typeof TrialsReliabil
|
|
|
874
876
|
* Higher passExpK means the agent reliably solves the task every time.
|
|
875
877
|
*/
|
|
876
878
|
export const TrialsReliabilityMetricsSchema = z.object({
|
|
879
|
+
/** Discriminator for trial-based reliability metrics */
|
|
880
|
+
type: z.literal('trial'),
|
|
877
881
|
/** Average passExpK across all prompts */
|
|
878
882
|
avgPassExpK: z.number(),
|
|
879
883
|
/** Median passExpK */
|