@plaited/agent-eval-harness 0.6.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +146 -6
- package/package.json +1 -1
- package/src/commands/capture.ts +9 -1
- package/src/commands/trials.ts +6 -0
- package/src/graders/tests/trials-compare-graders.spec.ts +358 -0
- package/src/graders/trials-compare-statistical.ts +188 -0
- package/src/graders/trials-compare-weighted.ts +128 -0
- package/src/graders.ts +21 -1
- package/src/pipeline/compare-format-detection.ts +100 -0
- package/src/pipeline/compare-trials.ts +596 -0
- package/src/pipeline/compare.ts +75 -19
- package/src/pipeline/grade.ts +6 -0
- package/src/pipeline/pipeline.types.ts +57 -1
- package/src/pipeline/tests/compare-format-detection.spec.ts +142 -0
- package/src/pipeline/tests/compare-trials.spec.ts +277 -0
- package/src/schemas/grader-loader.ts +4 -0
- package/src/schemas/schemas.ts +161 -0
- package/src/schemas/tests/fixtures/grader-git.ts +116 -0
- package/src/schemas/tests/grader-git.spec.ts +222 -0
- package/src/schemas.ts +13 -0
package/README.md
CHANGED
@@ -78,6 +78,9 @@ cat prompts.jsonl | \
 
 # Compare runs (built-in strategies: weighted, statistical, custom)
 bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+# Compare trials for pass@k reliability analysis (auto-detects format)
+bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
 ```
 
 ## Skills for AI Agents
@@ -184,11 +187,68 @@ Key fields:
 
 ## Graders
 
-Graders score agent outputs. The harness supports two types:
+Graders score agent outputs. The harness supports two types and two grading approaches:
+
+### Git-Based Outcome Grading (Recommended for Coding Agents)
+
+**Grade outcomes, not paths.** Use git to detect actual environmental changes:
+
+```typescript
+import type { Grader } from '@plaited/agent-eval-harness/schemas'
+import { resolve } from 'node:path'
+
+export const grade: Grader = async ({ output, hint, cwd }) => {
+  // Validate cwd to prevent command injection
+  const isValidPath = (path: string): boolean => {
+    const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
+    if (dangerousChars.test(path)) return false
+    if (path.includes('..') || path.startsWith('-')) return false
+    return true
+  }
+
+  if (!cwd || !isValidPath(cwd)) {
+    return {
+      pass: false,
+      score: 0,
+      reasoning: 'Invalid working directory path'
+    }
+  }
+
+  const safeCwd = resolve(cwd)
+
+  // Detect file changes using git
+  const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
+  const filesCreated = status
+    .split('\n')
+    .filter(line => line.startsWith('??'))
+    .map(line => line.slice(3).trim())
+
+  // Run tests to verify outcome
+  const testResult = await Bun.$`cd ${safeCwd} && bun test`.nothrow()
+  const testsPassed = testResult.exitCode === 0
+
+  return {
+    pass: filesCreated.length > 0 && testsPassed,
+    score: testsPassed ? 1.0 : 0.0,
+    reasoning: `Files created: ${filesCreated.join(', ')}. Tests: ${testsPassed ? 'pass' : 'fail'}`,
+    outcome: { // Optional: structured data for analysis
+      filesCreated,
+      testsPassed,
+      type: 'file_creation_with_tests'
+    }
+  }
+}
+```
+
+**Benefits:**
+- Detects actual file changes, test results, build success
+- Works universally in any git repo, any language
+- Returns structured `outcome` data for downstream analysis
+- Zero configuration required
 
-###
+### Output-Based Grading (General Purpose)
 
-
+For non-coding tasks or when git is unavailable:
 
 ```typescript
 import type { Grader } from '@plaited/agent-eval-harness/schemas'
@@ -215,11 +275,62 @@ Any executable script using stdin/stdout JSON protocol:
 #!/usr/bin/env python3
 import json
 import sys
+import subprocess
+import re
+import os
 
 data = json.load(sys.stdin)
 output = data["output"].lower()
 hint = (data.get("hint") or "").lower()
-
+cwd = data.get("cwd")
+
+# Validate cwd to prevent command injection
+def is_valid_path(path):
+    if not path:
+        return False
+    # Reject shell metacharacters
+    if re.search(r'[;&|`$(){}\[\]<>\'"\\]', path):
+        return False
+    # Reject directory traversal and option injection
+    if '..' in path or path.startswith('-'):
+        return False
+    return True
+
+# Git-based grading if cwd is provided
+if cwd:
+    if not is_valid_path(cwd):
+        print(json.dumps({
+            "pass": False,
+            "score": 0.0,
+            "reasoning": "Invalid working directory path"
+        }))
+        sys.exit(0)
+
+    safe_cwd = os.path.abspath(cwd)
+
+    try:
+        result = subprocess.run(
+            ["git", "-C", safe_cwd, "status", "--porcelain"],
+            capture_output=True, text=True, check=True
+        )
+        files_created = [
+            line[3:].strip()
+            for line in result.stdout.split('\n')
+            if line.startswith('??')
+        ]
+        has_changes = len(files_created) > 0
+        print(json.dumps({
+            "pass": has_changes,
+            "score": 1.0 if has_changes else 0.0,
+            "reasoning": f"Files created: {', '.join(files_created)}",
+            "outcome": {"filesCreated": files_created, "type": "git_check"}
+        }))
+        sys.exit(0)
+    except subprocess.CalledProcessError:
+        # Fall back to output-based grading
+        pass
+
+# Output-based grading fallback
 pass_result = hint in output if hint else True
 print(json.dumps({
     "pass": pass_result,
@@ -234,11 +345,14 @@ agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grade
 ```
 
 **Protocol:**
-- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
-- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
+- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...], "cwd": "/path/to/dir"}`
+- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "...", "outcome": {...}}`
+- `cwd` and `outcome` are optional fields
 
 ## Downstream Integration
 
+The harness outputs standard JSONL. When graders return the optional `outcome` field, it's merged onto results for powerful downstream analysis:
+
 ```bash
 # Filter failures
 cat results.jsonl | jq 'select(.score.pass == false)'
@@ -246,10 +360,36 @@ cat results.jsonl | jq 'select(.score.pass == false)'
 # Extract tool usage patterns
 cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
 
+# Analyze outcomes from git-based graders
+cat results.jsonl | jq 'select(.outcome.type == "test_execution")'
+cat results.jsonl | jq -s 'map(select(.outcome.testsPassed)) | length'
+cat results.jsonl | jq 'select(.outcome.touchedCriticalFiles == true)'
+
 # Use with your scoring pipeline
 cat results.jsonl | your-scoring-script.ts
 ```
 
+### Outcome Field
+
+Git-based graders can return structured `outcome` data:
+
+```jsonl
+{
+  "id": "fix-tests",
+  "input": "Fix the failing authentication tests",
+  "output": "I fixed the auth tests by...",
+  "score": {"pass": true, "score": 1.0, "reasoning": "Tests pass"},
+  "outcome": {
+    "testsPassed": true,
+    "filesModified": ["src/auth.ts", "src/auth.spec.ts"],
+    "exitCode": 0,
+    "type": "test_execution"
+  }
+}
+```
+
+This enables rich analysis across evaluations without re-parsing trajectories.
+
 ## Development
 
 ```bash
package/package.json
CHANGED
package/src/commands/capture.ts
CHANGED
@@ -225,13 +225,21 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
 
     // Apply grader if provided
     if (grader) {
-      result.score = await grader({
+      const graderResult = await grader({
         input: promptCase.input,
         output,
         hint: promptCase.hint,
         trajectory,
         metadata: promptCase.metadata,
+        cwd: session.cwd,
       })
+
+      result.score = graderResult
+
+      // Merge outcome from grader if present
+      if (graderResult.outcome) {
+        result.outcome = graderResult.outcome
+      }
     }
 
     // Clean up session
package/src/commands/trials.ts
CHANGED
@@ -217,10 +217,16 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
         hint: promptCase.hint,
         trajectory,
         metadata: promptCase.metadata,
+        cwd: session.cwd,
       })
       entry.pass = graderResult.pass
       entry.score = graderResult.score
       entry.reasoning = graderResult.reasoning
+
+      // Merge outcome from grader if present
+      if (graderResult.outcome) {
+        entry.outcome = graderResult.outcome
+      }
     }
 
     trialEntries.push(entry)
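The per-trial `pass` flags recorded here feed the `passAtK` and `passExpK` metrics consumed by the new comparison graders (tested below). For orientation, a sketch of the standard estimators for `n` trials with `c` passes; this mirrors the widely used unbiased pass@k estimator, and the package's own implementation may differ in detail:

```typescript
// pass@k: chance that at least one of k sampled trials passes,
// i.e. 1 - C(n-c, k)/C(n, k), computed multiplicatively for stability.
const passAtK = (n: number, c: number, k: number): number => {
  if (n - c < k) return 1 // too few failures to fill a k-sample without a pass
  let allFail = 1
  for (let i = n - c + 1; i <= n; i++) allFail *= 1 - k / i
  return 1 - allFail
}

// pass^k ("passExpK"): chance that k independent samples all pass.
const passExpK = (n: number, c: number, k: number): number => (c / n) ** k

console.log(passAtK(5, 2, 3)) // ≈ 0.9
console.log(passExpK(5, 2, 3)) // ≈ 0.064
```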
package/src/graders/tests/trials-compare-graders.spec.ts
ADDED
@@ -0,0 +1,358 @@
+/**
+ * Unit tests for built-in trials comparison graders.
+ *
+ * @remarks
+ * Tests for:
+ * - trials-compare-weighted: Configurable weight grader for trials
+ * - trials-compare-statistical: Bootstrap confidence interval grader for trials
+ *
+ * @packageDocumentation
+ */
+
+import { describe, expect, test } from 'bun:test'
+import type { TrialsComparisonGraderInput, TrialsComparisonRunData } from '../../pipeline/pipeline.types.ts'
+import { createTrialsStatisticalGrader, grade as statisticalGrade } from '../trials-compare-statistical.ts'
+import { createTrialsWeightedGrader, DEFAULT_TRIALS_WEIGHTS, type TrialsWeights } from '../trials-compare-weighted.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createMockTrialRuns = (
+  overrides: Partial<Record<string, Partial<TrialsComparisonRunData>>> = {},
+): Record<string, TrialsComparisonRunData> => ({
+  baseline: {
+    passRate: 0.67,
+    passAtK: 0.9,
+    passExpK: 0.3,
+    k: 3,
+    trials: [
+      { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true, score: 1.0 },
+      { trialNum: 2, output: 'B', trajectory: [], duration: 110, pass: true, score: 0.9 },
+      { trialNum: 3, output: 'C', trajectory: [], duration: 120, pass: false, score: 0.2 },
+    ],
+    ...overrides.baseline,
+  },
+  variant: {
+    passRate: 1.0,
+    passAtK: 1.0,
+    passExpK: 1.0,
+    k: 3,
+    trials: [
+      { trialNum: 1, output: 'X', trajectory: [], duration: 150, pass: true, score: 1.0 },
+      { trialNum: 2, output: 'Y', trajectory: [], duration: 160, pass: true, score: 1.0 },
+      { trialNum: 3, output: 'Z', trajectory: [], duration: 170, pass: true, score: 1.0 },
+    ],
+    ...overrides.variant,
+  },
+})
+
+const createMockTrialInput = (runs: Record<string, TrialsComparisonRunData>): TrialsComparisonGraderInput => ({
+  id: 'test-001',
+  input: 'Test prompt',
+  hint: 'Expected output',
+  runs,
+})
+
+// ============================================================================
+// Weighted Grader Tests
+// ============================================================================
+
+describe('trials-compare-weighted grader', () => {
+  describe('DEFAULT_TRIALS_WEIGHTS', () => {
+    test('has expected default values', () => {
+      expect(DEFAULT_TRIALS_WEIGHTS.capability).toBe(0.4)
+      expect(DEFAULT_TRIALS_WEIGHTS.reliability).toBe(0.4)
+      expect(DEFAULT_TRIALS_WEIGHTS.consistency).toBe(0.2)
+    })
+
+    test('weights sum to 1.0', () => {
+      const sum =
+        DEFAULT_TRIALS_WEIGHTS.capability + DEFAULT_TRIALS_WEIGHTS.reliability + DEFAULT_TRIALS_WEIGHTS.consistency
+      expect(sum).toBe(1.0)
+    })
+  })
+
+  describe('createTrialsWeightedGrader', () => {
+    test('returns higher rank for better passAtK when capability weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 1.0, reliability: 0.0, consistency: 0.0 })
+      const runs = createMockTrialRuns({
+        baseline: { passAtK: 0.7 },
+        variant: { passAtK: 0.95 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+      expect(result.rankings[0]?.rank).toBe(1)
+    })
+
+    test('returns higher rank for better passExpK when reliability weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 1.0, consistency: 0.0 })
+      const runs = createMockTrialRuns({
+        baseline: { passExpK: 0.9 },
+        variant: { passExpK: 0.3 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings[0]?.run).toBe('baseline')
+    })
+
+    test('penalizes flaky runs when consistency weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+      const runs = createMockTrialRuns({
+        // baseline: passAtK=0.9, passExpK=0.3, flakiness=0.6
+        baseline: { passAtK: 0.9, passExpK: 0.3 },
+        // variant: passAtK=0.8, passExpK=0.8, flakiness=0.0
+        variant: { passAtK: 0.8, passExpK: 0.8 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Variant should win due to lower flakiness (higher consistency)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('includes weights in reasoning', async () => {
+      const weights: TrialsWeights = { capability: 0.5, reliability: 0.3, consistency: 0.2 }
+      const grader = createTrialsWeightedGrader(weights)
+      const input = createMockTrialInput(createMockTrialRuns())
+
+      const result = await grader(input)
+
+      expect(result.reasoning).toContain('capability=0.5')
+      expect(result.reasoning).toContain('reliability=0.3')
+      expect(result.reasoning).toContain('consistency=0.2')
+    })
+
+    test('handles missing passAtK gracefully (treats as 0)', async () => {
+      const grader = createTrialsWeightedGrader()
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          k: 3,
+          trials: [],
+        },
+        variant: {
+          passAtK: 0.8,
+          passExpK: 0.5,
+          k: 3,
+          trials: [],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Should not throw, variant should rank higher
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('handles three or more runs', async () => {
+      const grader = createTrialsWeightedGrader()
+      const runs: Record<string, TrialsComparisonRunData> = {
+        a: { passAtK: 0.9, passExpK: 0.8, k: 3, trials: [] },
+        b: { passAtK: 0.7, passExpK: 0.7, k: 3, trials: [] },
+        c: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(3)
+      // Ranks should be 1, 2, 3
+      expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3])
+    })
+  })
+})
+
+// ============================================================================
+// Statistical Grader Tests
+// ============================================================================
+
+describe('trials-compare-statistical grader', () => {
+  describe('createTrialsStatisticalGrader', () => {
+    test('returns rankings based on bootstrapped passAtK', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      const runs = createMockTrialRuns({
+        baseline: { passAtK: 0.6 },
+        variant: { passAtK: 0.95 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('uses trial outcomes for bootstrap variance estimation', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      // All trials pass for variant, mixed for baseline
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          passAtK: 0.9,
+          passExpK: 0.3,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+          ],
+        },
+        variant: {
+          passAtK: 1.0,
+          passExpK: 1.0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+          ],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Variant with 100% pass rate should rank higher
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('indicates significance when passAtK differs substantially', async () => {
+      const grader = createTrialsStatisticalGrader(500)
+      // Strong difference: all pass vs all fail
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          passAtK: 0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: false },
+            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false },
+            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: false },
+            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+          ],
+        },
+        variant: {
+          passAtK: 1.0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+          ],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.reasoning).toContain('clear separation')
+    })
+
+    test('handles empty trials array', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: { k: 3, trials: [] },
+        variant: {
+          k: 3,
+          trials: [{ trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Should not throw
+      expect(result.rankings.length).toBe(2)
+    })
+  })
+
+  describe('grade function', () => {
+    test('works with default iterations', async () => {
+      const runs = createMockTrialRuns()
+      const input = createMockTrialInput(runs)
+
+      const result = await statisticalGrade(input)
+
+      expect(result.rankings).toBeDefined()
+      expect(result.rankings.length).toBe(2)
+    })
+  })
+})
+
+// ============================================================================
+// Edge Case Tests
+// ============================================================================
+
+describe('trials comparison grader edge cases', () => {
+  test('handles single run gracefully', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs: Record<string, TrialsComparisonRunData> = {
+      only: { passAtK: 1.0, passExpK: 0.8, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    expect(result.rankings.length).toBe(1)
+    expect(result.rankings[0]?.rank).toBe(1)
+  })
+
+  test('handles zero passAtK and passExpK', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs: Record<string, TrialsComparisonRunData> = {
+      baseline: { passAtK: 0, passExpK: 0, k: 3, trials: [] },
+      variant: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    expect(result.rankings[0]?.run).toBe('variant')
+  })
+
+  test('deterministic ordering for equal scores', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs = createMockTrialRuns({
+      baseline: { passAtK: 0.8, passExpK: 0.6 },
+      variant: { passAtK: 0.8, passExpK: 0.6 },
+    })
+    const input = createMockTrialInput(runs)
+
+    // Run multiple times to check stability
+    const results = await Promise.all([grader(input), grader(input), grader(input)])
+
+    // All should have same ordering
+    const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(','))
+    expect(new Set(orders).size).toBe(1)
+  })
+
+  test('flakiness is clamped to non-negative', async () => {
+    // Edge case: passExpK > passAtK shouldn't happen but handle gracefully
+    const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+    const runs: Record<string, TrialsComparisonRunData> = {
+      baseline: { passAtK: 0.5, passExpK: 0.7, k: 3, trials: [] }, // Invalid but should work
+      variant: { passAtK: 0.8, passExpK: 0.8, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    // Both should have flakiness 0, so consistency score should be 1.0 for both
+    // Variant has higher capability/reliability so it wins on tiebreaker
+    expect(result.rankings).toBeDefined()
+  })
+})