@plaited/agent-eval-harness 0.12.1 → 0.12.2

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.12.1",
3
+ "version": "0.12.2",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -56,12 +56,12 @@
56
56
  ]
57
57
  },
58
58
  "dependencies": {
59
- "@plaited/development-skills": "0.7.0",
59
+ "@plaited/development-skills": "0.8.0",
60
60
  "zod": "^4.3.6"
61
61
  },
62
62
  "devDependencies": {
63
- "@biomejs/biome": "2.3.12",
64
- "@types/bun": "1.3.6",
63
+ "@biomejs/biome": "2.3.14",
64
+ "@types/bun": "1.3.9",
65
65
  "format-package": "7.0.0",
66
66
  "lint-staged": "16.2.7",
67
67
  "typescript": "5.9.3"
@@ -262,6 +262,7 @@ const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResu
262
262
 
263
263
  return {
264
264
  metrics: {
265
+ type: 'trial',
265
266
  avgScore: sum / rawScores.length,
266
267
  medianScore: percentile(sorted, 0.5),
267
268
  p25Score: percentile(sorted, 0.25),
@@ -365,6 +365,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
365
365
  const fails = results.length - passes
366
366
 
367
367
  quality[label] = {
368
+ type: 'run',
368
369
  avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
369
370
  passRate: results.length > 0 ? passes / results.length : 0,
370
371
  passCount: passes,
@@ -105,6 +105,10 @@ describe('runCompare statistical strategy', () => {
105
105
  // Verify reliability metrics include type discriminator
106
106
  expect(report.reliability.high?.type).toBe('run')
107
107
  expect(report.reliability.low?.type).toBe('run')
108
+
109
+ // Verify quality metrics include type discriminator
110
+ expect(report.quality.high?.type).toBe('run')
111
+ expect(report.quality.low?.type).toBe('run')
108
112
  })
109
113
 
110
114
  test('computes confidence intervals for performance metrics', async () => {
@@ -477,6 +477,7 @@ describe('runTrialsCompare', () => {
477
477
  expect(report.quality?.run1).toBeDefined()
478
478
 
479
479
  const qual = report.quality?.run1
480
+ expect(qual?.type).toBe('trial')
480
481
  expect(qual?.avgScore).toBeGreaterThan(0)
481
482
  expect(qual?.medianScore).toBeGreaterThan(0)
482
483
  expect(qual?.p25Score).toBeDefined()
@@ -620,6 +620,8 @@ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceInterva
620
620
  * Quality metrics for a single run in comparison.
621
621
  */
622
622
  export const QualityMetricsSchema = z.object({
623
+ /** Discriminator for run-level quality metrics */
624
+ type: z.literal('run'),
623
625
  /** Mean grader score (0-1) */
624
626
  avgScore: z.number(),
625
627
  /** Percentage of pass=true results */
@@ -942,6 +944,8 @@ export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfi
942
944
  * Only present when a grader was used during trials capture.
943
945
  */
944
946
  export const TrialsQualityMetricsSchema = z.object({
947
+ /** Discriminator for trial-level quality metrics */
948
+ type: z.literal('trial'),
945
949
  /** Average score across all trials */
946
950
  avgScore: z.number(),
947
951
  /** Median score */