npm - @plaited/agent-eval-harness - Versions diffs - 0.6.2 → 0.7.0 - Mend

@plaited/agent-eval-harness 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/README.md +143 -6
package/package.json +1 -1
package/src/commands/capture.ts +9 -1
package/src/commands/trials.ts +6 -0
package/src/pipeline/grade.ts +6 -0
package/src/pipeline/pipeline.types.ts +5 -0
package/src/schemas/grader-loader.ts +4 -0
package/src/schemas/schemas.ts +10 -0
package/src/schemas/tests/fixtures/grader-git.ts +116 -0
package/src/schemas/tests/grader-git.spec.ts +222 -0

package/README.md CHANGED Viewed

@@ -184,11 +184,68 @@ Key fields:
 ## Graders
-Graders score agent outputs. The harness supports two types:
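+Graders score agent outputs. The harness supports two grader types (TypeScript/JavaScript modules and executable scripts) and two grading approaches (git-based outcome grading and output-based grading):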
-### TypeScript/JavaScript Graders
+### Git-Based Outcome Grading (Recommended for Coding Agents)
-Export a `grade` function:
+**Grade outcomes, not paths.** Use git to detect actual environmental changes:
+```typescript
+import type { Grader } from '@plaited/agent-eval-harness/schemas'
+import { resolve } from 'node:path'
+export const grade: Grader = async ({ output, hint, cwd }) => { // NOTE: output and hint are destructured but unused in this example
+  // Guard: refuse cwd values containing shell metacharacters, '..', or a leading '-' before any command runs
+  const isValidPath = (path: string): boolean => {
+    const dangerousChars = /[;&|`$(){}[\]<>'"\\]/ // shell metacharacters, quotes, brackets, backslash
+    if (dangerousChars.test(path)) return false
+    if (path.includes('..') || path.startsWith('-')) return false // traversal segments / option-like paths
+    return true
+  }
+  if (!cwd || !isValidPath(cwd)) { // a missing cwd is reported the same way as an invalid one
+    return {
+      pass: false,
+      score: 0,
+      reasoning: 'Invalid working directory path'
+    }
+  }
+  const safeCwd = resolve(cwd) // absolute, normalized form of cwd
+  // List untracked files ('??' entries of `git status --porcelain`); modified or staged files are not counted
+  const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text() // NOTE(review): no .nothrow() here, so this presumably throws when cwd is not a git repo; confirm
+  const filesCreated = status
+    .split('\n')
+    .filter(line => line.startsWith('??'))
+    .map(line => line.slice(3).trim()) // drop the two status columns and the separating space
+  // Run the test suite from cwd; nothrow() lets a failing run be inspected via its exit code instead of throwing
+  const testResult = await Bun.$`cd ${safeCwd} && bun test`.nothrow()
+  const testsPassed = testResult.exitCode === 0
+  return {
+    pass: filesCreated.length > 0 && testsPassed, // needs at least one new file AND a green test run
+    score: testsPassed ? 1.0 : 0.0, // score depends on the test run only, not on filesCreated
+    reasoning: `Files created: ${filesCreated.join(', ')}. Tests: ${testsPassed ? 'pass' : 'fail'}`,
+    outcome: {  // Optional: structured data for analysis
+      filesCreated,
+      testsPassed,
+      type: 'file_creation_with_tests'
+    }
+  }
+}
+```
+**Benefits:**
+- Detects actual file changes, test results, build success
+- Works universally in any git repo, any language
+- Returns structured `outcome` data for downstream analysis
+- Zero configuration required
+### Output-Based Grading (General Purpose)
+For non-coding tasks or when git is unavailable:
 ```typescript
 import type { Grader } from '@plaited/agent-eval-harness/schemas'
@@ -215,11 +272,62 @@ Any executable script using stdin/stdout JSON protocol:
 #!/usr/bin/env python3
 import json
 import sys
+import subprocess
+import re
+import os
 data = json.load(sys.stdin)
 output = data["output"].lower()
 hint = (data.get("hint") or "").lower()
+cwd = data.get("cwd")
+# Return True only when cwd looks safe to hand to git: non-empty, no shell metacharacters, no '..', no leading '-'
+def is_valid_path(path):
+    if not path:  # None or empty string
+        return False
+    # Reject shell metacharacters, quotes, brackets and backslashes anywhere in the path
+    if re.search(r'[;&|`$(){}\[\]<>\'"\\]', path):
+        return False
+    # Reject any '..' substring (directory traversal) and a leading '-' (could be read as a command-line option)
+    if '..' in path or path.startswith('-'):
+        return False
+    return True
+# Git-based grading if cwd is provided
+if cwd:
+    if not is_valid_path(cwd):
+        print(json.dumps({
+            "pass": False,
+            "score": 0.0,
+            "reasoning": "Invalid working directory path"
+        }))
+        sys.exit(0)
+    safe_cwd = os.path.abspath(cwd)
+    try:
+        result = subprocess.run(
+            ["git", "-C", safe_cwd, "status", "--porcelain"],
+            capture_output=True, text=True, check=True
+        )
+        files_created = [
+            line[3:].strip()
+            for line in result.stdout.split('\n')
+            if line.startswith('??')
+        ]
+        has_changes = len(files_created) > 0
+        print(json.dumps({
+            "pass": has_changes,
+            "score": 1.0 if has_changes else 0.0,
+            "reasoning": f"Files created: {', '.join(files_created)}",
+            "outcome": {"filesCreated": files_created, "type": "git_check"}
+        }))
+        sys.exit(0)
+    except subprocess.CalledProcessError:
+        # Fall back to output-based grading
+        pass
+# Output-based grading fallback
 pass_result = hint in output if hint else True
 print(json.dumps({
     "pass": pass_result,
@@ -234,11 +342,14 @@ agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grade
 ```
 **Protocol:**
-- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
-- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
+- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...], "cwd": "/path/to/dir"}`
+- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "...", "outcome": {...}}`
+- `cwd` (input) and `outcome` (output) are optional fields
 ## Downstream Integration
+The harness outputs standard JSONL. When a grader returns the optional `outcome` field, the harness copies it onto the result record as a top-level `outcome` field, so downstream tools can query it directly:
 ```bash
 # Filter failures
 cat results.jsonl | jq 'select(.score.pass == false)'
@@ -246,10 +357,36 @@ cat results.jsonl | jq 'select(.score.pass == false)'
 # Extract tool usage patterns
 cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
+# Analyze outcomes from git-based graders
+cat results.jsonl | jq 'select(.outcome.type == "test_execution")'
+cat results.jsonl | jq -s 'map(select(.outcome.testsPassed)) | length'
+cat results.jsonl | jq 'select(.outcome.touchedCriticalFiles == true)'
 # Use with your scoring pipeline
 cat results.jsonl | your-scoring-script.ts
 ```
+### Outcome Field
+Git-based graders can return structured `outcome` data (shown pretty-printed here; each result occupies a single line in the actual JSONL output):
+```jsonl
+{
+  "id": "fix-tests",
+  "input": "Fix the failing authentication tests",
+  "output": "I fixed the auth tests by...",
+  "score": {"pass": true, "score": 1.0, "reasoning": "Tests pass"},
+  "outcome": {
+    "testsPassed": true,
+    "filesModified": ["src/auth.ts", "src/auth.spec.ts"],
+    "exitCode": 0,
+    "type": "test_execution"
+  }
+}
+```
+This enables rich analysis across evaluations without re-parsing trajectories.
 ## Development
 ```bash

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.6.2",
+  "version": "0.7.0",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {

package/src/commands/capture.ts CHANGED Viewed

@@ -225,13 +225,21 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       // Apply grader if provided
       if (grader) {
-        result.score = await grader({
+        const graderResult = await grader({
           input: promptCase.input,
           output,
           hint: promptCase.hint,
           trajectory,
           metadata: promptCase.metadata,
+          cwd: session.cwd,
         })
+        result.score = graderResult
+        // Merge outcome from grader if present
+        if (graderResult.outcome) {
+          result.outcome = graderResult.outcome
+        }
       }
       // Clean up session

package/src/commands/trials.ts CHANGED Viewed

@@ -217,10 +217,16 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
             hint: promptCase.hint,
             trajectory,
             metadata: promptCase.metadata,
+            cwd: session.cwd,
           })
           entry.pass = graderResult.pass
           entry.score = graderResult.score
           entry.reasoning = graderResult.reasoning
+          // Merge outcome from grader if present
+          if (graderResult.outcome) {
+            entry.outcome = graderResult.outcome
+          }
         }
         trialEntries.push(entry)

package/src/pipeline/grade.ts CHANGED Viewed

@@ -52,6 +52,7 @@ export const runGrade = async (
       hint: extracted.hint,
       trajectory: extracted.trajectory,
       metadata: extracted.metadata,
+      cwd: extracted.cwd,
     })
     const graded: GradedResult = {
@@ -59,6 +60,11 @@ export const runGrade = async (
       score,
     }
+    // Merge outcome from grader if present
+    if (score.outcome) {
+      graded.outcome = score.outcome
+    }
     const icon = score.pass ? '✓' : '✗'
     logProgress(`  ${icon} score=${score.score.toFixed(2)}`, progress)

package/src/pipeline/pipeline.types.ts CHANGED Viewed

@@ -62,6 +62,8 @@ export type ExtractedResult = {
   toolErrors: boolean
   /** Optional metadata from original prompt */
   metadata?: Record<string, unknown>
+  /** Working directory path (optional, for git-based grading) */
+  cwd?: string
   /** Timing metadata */
   timing: {
     start: number
@@ -77,10 +79,13 @@ export type ExtractedResult = {
  *
  * @remarks
  * Adds grader score to extracted result.
+ * Outcome field is merged from grader result if present.
  */
 export type GradedResult = ExtractedResult & {
   /** Grader score */
   score: GraderResult
+  /** Outcome data from grader (if grader returned outcome) */
+  outcome?: Record<string, unknown>
 }
 /**

package/src/schemas/grader-loader.ts CHANGED Viewed

@@ -47,6 +47,7 @@ const resolvePath = (path: string): string => {
  * The metadata field contains arbitrary key-value pairs from the original
  * prompt JSONL (e.g., category, difficulty, tags). Use this to implement
  * category-specific grading logic or filter calibration samples.
+ * The cwd field provides the working directory path for git-based outcome detection.
  */
 type ExecGraderInput = {
   input: string | string[]
@@ -54,6 +55,7 @@ type ExecGraderInput = {
   hint?: string
   trajectory?: TrajectoryStep[]
   metadata?: Record<string, unknown>
+  cwd?: string
 }
 /**
@@ -73,6 +75,8 @@ const createExecGrader = (execPath: string): Grader => {
       output: params.output,
       hint: params.hint,
       trajectory: params.trajectory,
+      metadata: params.metadata,
+      cwd: params.cwd,
     }
     const inputJson = JSON.stringify(input)

package/src/schemas/schemas.ts CHANGED Viewed

@@ -209,6 +209,7 @@ export type PromptCase = z.infer<typeof PromptCaseSchema>
  *
  * @remarks
  * Result returned by user-provided grader functions.
+ * - `outcome`: Optional structured outcome data detected by the grader
  */
 export const GraderResultSchema = z.object({
   /** Whether the output passes the evaluation criteria */
@@ -217,6 +218,8 @@ export const GraderResultSchema = z.object({
   score: z.number().min(0).max(1),
   /** Optional explanation for the score */
   reasoning: z.string().optional(),
+  /** Optional outcome data (e.g., files created, tests passed) */
+  outcome: z.record(z.string(), z.unknown()).optional(),
 })
 /** Grader result type */
@@ -230,6 +233,7 @@ export type GraderResult = z.infer<typeof GraderResultSchema>
  * - `input` is the original prompt (string or array for multi-turn)
  * - `hint` provides grader context (renamed from `expected`)
  * - `metadata` contains arbitrary key-value pairs from the original prompt JSONL
+ * - `cwd` is the working directory path (optional, enables git-based outcome detection)
  */
 export type Grader = (params: {
   input: string | string[]
@@ -237,6 +241,7 @@ export type Grader = (params: {
   hint?: string
   trajectory?: TrajectoryStep[]
   metadata?: Record<string, unknown>
+  cwd?: string
 }) => Promise<GraderResult>
 // ============================================================================
@@ -375,6 +380,7 @@ export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
  * - `input` can be string (single turn) or string[] (multi-turn)
  * - `hint` provides grader context (renamed from `expected`)
  * - `toolErrors` replaces misleading `status: 'passed'|'failed'`
+ * - `outcome` is merged from grader result if grader returns outcome data
  * Real pass/fail determination comes from your grader.
  */
 export const CaptureResultSchema = z.object({
@@ -398,6 +404,8 @@ export const CaptureResultSchema = z.object({
   errors: z.array(z.string()).optional(),
   /** Grader score (if grader was provided) */
   score: GraderResultSchema.optional(),
+  /** Outcome data from grader (if grader provided and returned outcome) */
+  outcome: z.record(z.string(), z.unknown()).optional(),
 })
 /** Capture result type */
@@ -449,6 +457,8 @@ export const TrialEntrySchema = z.object({
   score: z.number().optional(),
   /** Grader reasoning (if grader provided) */
   reasoning: z.string().optional(),
+  /** Outcome data from grader (if grader provided and returned outcome) */
+  outcome: z.record(z.string(), z.unknown()).optional(),
 })
 /** Trial entry type */

package/src/schemas/tests/fixtures/grader-git.ts ADDED Viewed

@@ -0,0 +1,116 @@
+/**
+ * Test fixture: Git-based grader that detects file changes.
+ *
+ * @remarks
+ * This grader uses git to detect environmental outcomes instead of just
+ * checking output text. It demonstrates the "grade outcomes, not paths" principle.
+ *
+ * SECURITY NOTE: This fixture validates the cwd parameter to prevent command injection.
+ * When implementing your own git-based graders, always validate paths from untrusted sources.
+ * The cwd parameter should only come from trusted sources (process.cwd(), CLI flags, etc.).
+ */
+import { resolve } from 'node:path'
+import type { Grader } from '../../schemas.ts'
+/**
+ * Validates that a path is safe to use in shell commands.
+ *
+ * @remarks
+ * Rejects paths containing shell metacharacters or suspicious patterns
+ * that could be used for command injection. Concretely, a path is rejected when:
+ * - it contains any of `; & | $ ( ) { } [ ] < > ' " \` or a backtick
+ * - it contains the substring `..` anywhere (so a name such as `a..b` is rejected too)
+ * - it starts with `-` (it could otherwise be read as a command-line option)
+ *
+ * Because backslash is rejected, backslash-separated paths never pass.
+ * An empty string passes these checks; callers must handle a missing path themselves.
+ *
+ * @param path - The path to validate
+ * @returns True if path appears safe, false otherwise
+ */
+const isValidPath = (path: string): boolean => {
+  // Reject shell metacharacters, quotes, brackets and backslashes that could enable command injection
+  const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
+  if (dangerousChars.test(path)) {
+    return false
+  }
+  // Reject '..' anywhere (directory traversal) and a leading '-' (option injection)
+  if (path.includes('..') || path.startsWith('-')) {
+    return false
+  }
+  return true
+}
+export const grade: Grader = async ({ output: _output, hint, cwd }) => { // output is accepted but unused (bound to _output)
+  // Without a cwd there is nothing to inspect with git, so fail immediately (there is no fallback to hint-based grading)
+  if (!cwd) {
+    return {
+      pass: false,
+      score: 0,
+      reasoning: 'No working directory provided',
+    }
+  }
+  // SECURITY: refuse cwd values that fail isValidPath before they are interpolated into any git command
+  if (!isValidPath(cwd)) {
+    return {
+      pass: false,
+      score: 0,
+      reasoning: 'Invalid working directory path (contains suspicious characters)',
+    }
+  }
+  // Resolve to an absolute, normalized path (any '..' was already rejected by isValidPath above)
+  const safeCwd = resolve(cwd)
+  // Probe for a git repo: a non-zero exit from rev-parse is reported as 'Not a git repository'
+  const isGit = await Bun.$`git -C ${safeCwd} rev-parse --git-dir 2>/dev/null`.nothrow()
+  if (isGit.exitCode !== 0) {
+    return {
+      pass: false,
+      score: 0,
+      reasoning: 'Not a git repository',
+    }
+  }
+  // Classify changes from the `git status --porcelain` output.
+  // Note: only untracked (??) entries and entries whose status columns are exactly ' M' or 'M ' are detected.
+  // Other codes (e.g. 'MM', added A, renamed R, deleted D) are not included in this example.
+  const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
+  const filesCreated = status
+    .split('\n')
+    .filter((line) => line.startsWith('??')) // ?? = untracked files
+    .map((line) => line.slice(3).trim())
+    .filter(Boolean)
+  const filesModified = status
+    .split('\n')
+    .filter((line) => line.startsWith(' M') || line.startsWith('M ')) // M = modified
+    .map((line) => line.slice(3).trim())
+    .filter(Boolean)
+  const hasChanges = filesCreated.length > 0 || filesModified.length > 0
+  // With a hint, at least one changed path must contain it (case-insensitive substring); without one, any change passes
+  let matchesHint = true
+  if (hint) {
+    const allChangedFiles = [...filesCreated, ...filesModified]
+    matchesHint = allChangedFiles.some((file) => file.toLowerCase().includes(hint.toLowerCase()))
+  }
+  const pass = hasChanges && matchesHint
+  return {
+    pass,
+    score: pass ? 1.0 : hasChanges ? 0.5 : 0.0, // 1.0 = pass, 0.5 = changes that miss the hint, 0.0 = no changes
+    reasoning: pass
+      ? `Files changed: ${[...filesCreated, ...filesModified].join(', ')}`
+      : hasChanges
+        ? 'File changes do not match hint'
+        : 'No file changes detected',
+    outcome: {
+      filesCreated,
+      filesModified,
+      type: 'git_status_check',
+    },
+  }
+}

package/src/schemas/tests/grader-git.spec.ts ADDED Viewed

@@ -0,0 +1,222 @@
+/**
+ * Tests for git-based grader fixture.
+ *
+ * @remarks
+ * Verifies that graders can use git to detect environmental outcomes
+ * and return structured outcome data.
+ */
+import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
+import { mkdtemp, rm } from 'node:fs/promises'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import type { Grader } from '../schemas.ts'
+describe('Git-based grader', () => {
+  let tempDir: string
+  let grader: Grader
+  beforeEach(async () => {
+    // Fresh temporary directory per test so git state never leaks between cases
+    tempDir = await mkdtemp(join(tmpdir(), 'git-grader-test-'))
+    // Initialize a git repo with a repo-local identity (used by the commit in the modified-file test)
+    await Bun.$`git -C ${tempDir} init`.quiet()
+    await Bun.$`git -C ${tempDir} config user.email "test@test.com"`.quiet()
+    await Bun.$`git -C ${tempDir} config user.name "Test User"`.quiet()
+    // Load the fixture grader under test via dynamic import
+    const module = await import('./fixtures/grader-git.ts')
+    grader = module.grade
+  })
+  afterEach(async () => {
+    // Remove the temporary repo; force avoids failing if it is already gone
+    await rm(tempDir, { recursive: true, force: true })
+  })
+  test('detects newly created files', async () => {
+    // A file written into the fresh repo is untracked, so it should surface in outcome.filesCreated
+    await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => <button>Click</button>')
+    const result = await grader({
+      input: 'Create a button component',
+      output: 'I created Button.tsx',
+      hint: 'button',
+      cwd: tempDir,
+    })
+    expect(result.pass).toBe(true)
+    expect(result.score).toBe(1.0)
+    expect(result.reasoning).toContain('button.tsx')
+    expect(result.outcome).toBeDefined()
+    expect(result.outcome?.filesCreated).toEqual(['button.tsx'])
+    expect(result.outcome?.type).toBe('git_status_check')
+  })
+  test('detects modified files', async () => {
+    // Commit a baseline version so the later write shows up as a modification
+    await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 1 }')
+    await Bun.$`git -C ${tempDir} add config.ts`.quiet()
+    await Bun.$`git -C ${tempDir} commit -m "Initial commit"`.quiet()
+    // Overwrite the tracked file without staging the change
+    await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 2 }')
+    const result = await grader({
+      input: 'Update config value',
+      output: 'I updated the config',
+      hint: 'config',
+      cwd: tempDir,
+    })
+    expect(result.pass).toBe(true)
+    expect(result.score).toBe(1.0)
+    expect(result.reasoning).toContain('config.ts')
+    expect(result.outcome).toBeDefined()
+    expect(result.outcome?.filesModified).toEqual(['config.ts'])
+    expect(result.outcome?.type).toBe('git_status_check')
+  })
+  test('fails when no changes detected', async () => {
+    // Repo is left untouched: nothing untracked, nothing modified
+    const result = await grader({
+      input: 'Create a button component',
+      output: 'I created a button component',
+      cwd: tempDir,
+    })
+    expect(result.pass).toBe(false)
+    expect(result.score).toBe(0)
+    expect(result.reasoning).toContain('No file changes detected')
+    expect(result.outcome).toBeDefined()
+    expect(result.outcome?.filesCreated).toEqual([])
+    expect(result.outcome?.filesModified).toEqual([])
+  })
+  test('partial score when changes do not match hint', async () => {
+    // Create a file whose name does not contain the hint
+    await Bun.write(join(tempDir, 'unrelated.ts'), 'export const foo = 1')
+    const result = await grader({
+      input: 'Create a button component',
+      output: 'I created something',
+      hint: 'button',
+      cwd: tempDir,
+    })
+    expect(result.pass).toBe(false)
+    expect(result.score).toBe(0.5) // Has changes but doesn't match hint
+    expect(result.reasoning).toContain('do not match hint')
+    expect(result.outcome?.filesCreated).toEqual(['unrelated.ts'])
+  })
+  test('handles missing cwd parameter', async () => {
+    const result = await grader({
+      input: 'Create a button component',
+      output: 'I created a button',
+      hint: 'button',
+      // cwd intentionally omitted
+    })
+    expect(result.pass).toBe(false)
+    expect(result.score).toBe(0)
+    expect(result.reasoning).toBe('No working directory provided')
+  })
+  test('handles non-git directory', async () => {
+    // Separate temp directory that never has `git init` run in it
+    const nonGitDir = await mkdtemp(join(tmpdir(), 'non-git-test-'))
+    try {
+      const result = await grader({
+        input: 'Create a button component',
+        output: 'I created a button',
+        cwd: nonGitDir,
+      })
+      expect(result.pass).toBe(false)
+      expect(result.score).toBe(0)
+      expect(result.reasoning).toBe('Not a git repository')
+    } finally {
+      await rm(nonGitDir, { recursive: true, force: true })
+    }
+  })
+  test('works without hint parameter', async () => {
+    // Any untracked file counts as a change
+    await Bun.write(join(tempDir, 'any-file.ts'), 'export const x = 1')
+    const result = await grader({
+      input: 'Create a file',
+      output: 'I created a file',
+      cwd: tempDir,
+      // hint intentionally omitted: any change should pass
+    })
+    expect(result.pass).toBe(true)
+    expect(result.score).toBe(1.0)
+    expect(result.reasoning).toContain('any-file.ts')
+    expect(result.outcome?.filesCreated).toEqual(['any-file.ts'])
+  })
+  test('returns structured outcome for downstream analysis', async () => {
+    // Two untracked files, so filesCreated should list both and filesModified stay empty
+    await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => <button />')
+    await Bun.write(join(tempDir, 'input.tsx'), 'export const Input = () => <input />')
+    const result = await grader({
+      input: 'Create UI components',
+      output: 'I created Button and Input components',
+      cwd: tempDir,
+    })
+    expect(result.outcome).toBeDefined()
+    expect(result.outcome?.type).toBe('git_status_check')
+    expect(result.outcome?.filesCreated).toBeInstanceOf(Array)
+    expect(result.outcome?.filesCreated).toHaveLength(2)
+    expect(result.outcome?.filesCreated).toContain('button.tsx')
+    expect(result.outcome?.filesCreated).toContain('input.tsx')
+    expect(result.outcome?.filesModified).toEqual([])
+  })
+  test('rejects path with command injection attempt', async () => {
+    const result = await grader({
+      input: 'Create a file',
+      output: 'Created file',
+      cwd: '/tmp/test; rm -rf /', // ';' is one of the rejected shell metacharacters
+    })
+    expect(result.pass).toBe(false)
+    expect(result.score).toBe(0)
+    expect(result.reasoning).toContain('Invalid working directory path')
+  })
+  test('rejects path with directory traversal', async () => {
+    const result = await grader({
+      input: 'Create a file',
+      output: 'Created file',
+      cwd: '/tmp/../../../etc', // any '..' in the path is rejected outright
+    })
+    expect(result.pass).toBe(false)
+    expect(result.score).toBe(0)
+    expect(result.reasoning).toContain('Invalid working directory path')
+  })
+  test('rejects path with shell metacharacters', async () => {
+    const dangerousPaths = ['/tmp/test$(whoami)', '/tmp/test`id`', '/tmp/test|cat', '/tmp/test&echo', '/tmp/test>out']
+    for (const path of dangerousPaths) {
+      const result = await grader({
+        input: 'Create a file',
+        output: 'Created file',
+        cwd: path,
+      })
+      expect(result.pass).toBe(false)
+      expect(result.score).toBe(0)
+      expect(result.reasoning).toContain('Invalid working directory path')
+    }
+  })
+})