@plaited/agent-eval-harness 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@plaited/agent-eval-harness",
|
|
3
|
-
"version": "0.12.1",
|
|
3
|
+
"version": "0.12.2",
|
|
4
4
|
"description": "CLI tool for capturing agent trajectories from headless CLI agents",
|
|
5
5
|
"license": "ISC",
|
|
6
6
|
"engines": {
|
|
@@ -56,12 +56,12 @@
|
|
|
56
56
|
]
|
|
57
57
|
},
|
|
58
58
|
"dependencies": {
|
|
59
|
-
"@plaited/development-skills": "0.
|
|
59
|
+
"@plaited/development-skills": "0.8.0",
|
|
60
60
|
"zod": "^4.3.6"
|
|
61
61
|
},
|
|
62
62
|
"devDependencies": {
|
|
63
|
-
"@biomejs/biome": "2.3.
|
|
64
|
-
"@types/bun": "1.3.
|
|
63
|
+
"@biomejs/biome": "2.3.14",
|
|
64
|
+
"@types/bun": "1.3.9",
|
|
65
65
|
"format-package": "7.0.0",
|
|
66
66
|
"lint-staged": "16.2.7",
|
|
67
67
|
"typescript": "5.9.3"
|
package/src/pipeline/compare.ts
CHANGED
|
@@ -365,6 +365,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
|
|
|
365
365
|
const fails = results.length - passes
|
|
366
366
|
|
|
367
367
|
quality[label] = {
|
|
368
|
+
type: 'run',
|
|
368
369
|
avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
|
|
369
370
|
passRate: results.length > 0 ? passes / results.length : 0,
|
|
370
371
|
passCount: passes,
|
|
@@ -105,6 +105,10 @@ describe('runCompare statistical strategy', () => {
|
|
|
105
105
|
// Verify reliability metrics include type discriminator
|
|
106
106
|
expect(report.reliability.high?.type).toBe('run')
|
|
107
107
|
expect(report.reliability.low?.type).toBe('run')
|
|
108
|
+
|
|
109
|
+
// Verify quality metrics include type discriminator
|
|
110
|
+
expect(report.quality.high?.type).toBe('run')
|
|
111
|
+
expect(report.quality.low?.type).toBe('run')
|
|
108
112
|
})
|
|
109
113
|
|
|
110
114
|
test('computes confidence intervals for performance metrics', async () => {
|
|
@@ -477,6 +477,7 @@ describe('runTrialsCompare', () => {
|
|
|
477
477
|
expect(report.quality?.run1).toBeDefined()
|
|
478
478
|
|
|
479
479
|
const qual = report.quality?.run1
|
|
480
|
+
expect(qual?.type).toBe('trial')
|
|
480
481
|
expect(qual?.avgScore).toBeGreaterThan(0)
|
|
481
482
|
expect(qual?.medianScore).toBeGreaterThan(0)
|
|
482
483
|
expect(qual?.p25Score).toBeDefined()
|
package/src/schemas/schemas.ts
CHANGED
|
@@ -620,6 +620,8 @@ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceInterva
|
|
|
620
620
|
* Quality metrics for a single run in comparison.
|
|
621
621
|
*/
|
|
622
622
|
export const QualityMetricsSchema = z.object({
|
|
623
|
+
/** Discriminator for run-level quality metrics */
|
|
624
|
+
type: z.literal('run'),
|
|
623
625
|
/** Mean grader score (0-1) */
|
|
624
626
|
avgScore: z.number(),
|
|
625
627
|
/** Percentage of pass=true results */
|
|
@@ -942,6 +944,8 @@ export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfi
|
|
|
942
944
|
* Only present when a grader was used during trials capture.
|
|
943
945
|
*/
|
|
944
946
|
export const TrialsQualityMetricsSchema = z.object({
|
|
947
|
+
/** Discriminator for trial-level quality metrics */
|
|
948
|
+
type: z.literal('trial'),
|
|
945
949
|
/** Average score across all trials */
|
|
946
950
|
avgScore: z.number(),
|
|
947
951
|
/** Median score */
|