@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
package/LICENSE ADDED
@@ -0,0 +1,15 @@
1
+ ISC License
2
+
3
+ Copyright (c) 2026 Plaited Labs
4
+
5
+ Permission to use, copy, modify, and/or distribute this software for any
6
+ purpose with or without fee is hereby granted, provided that the above
7
+ copyright notice and this permission notice appear in all copies.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10
+ REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11
+ AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
12
+ INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13
+ LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
14
+ OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15
+ PERFORMANCE OF THIS SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,273 @@
1
+ # @plaited/agent-eval-harness
2
+
3
+ [![npm version](https://img.shields.io/npm/v/@plaited/agent-eval-harness.svg)](https://www.npmjs.com/package/@plaited/agent-eval-harness)
4
+ [![CI](https://github.com/plaited/agent-eval-harness/actions/workflows/ci.yml/badge.svg)](https://github.com/plaited/agent-eval-harness/actions/workflows/ci.yml)
5
+ [![License: ISC](https://img.shields.io/badge/License-ISC-blue.svg)](https://opensource.org/licenses/ISC)
6
+
7
+ CLI tool for capturing agent trajectories from headless CLI agents. Execute prompts, capture full trajectories (tools, thoughts, plans), and output structured JSONL for downstream scoring. Available as both a CLI tool and as installable skills for AI coding agents.
8
+
9
+ ## CLI Tool
10
+
11
+ Run the harness directly with `bunx`; no prior installation is required:
12
+
13
+ ```bash
14
+ # Using built-in headless adapter (recommended - no extra install needed)
15
+ export ANTHROPIC_API_KEY=sk-...
16
+ bunx @plaited/agent-eval-harness capture prompts.jsonl \
17
+ --schema ./schemas/claude-headless.json \
18
+ -o results.jsonl
19
+ ```
20
+
21
+ **Prerequisite:** The harness works with any CLI agent that supports JSON output - just provide a schema describing how to interact with it. Set the API key for the agent you are running:
22
+
23
+ ```bash
24
+ export ANTHROPIC_API_KEY=sk-... # For Claude
25
+ export GEMINI_API_KEY=... # For Gemini
26
+ ```
27
+
28
+ Pre-built schemas are available in `.claude/skills/headless-adapters/schemas/` for Claude and Gemini.
29
+
30
+ ### Core Commands
31
+
32
+ | Command | Description |
33
+ |---------|-------------|
34
+ | `capture <prompts> --schema <path>` | Trajectory capture (full JSONL) |
35
+ | `trials <prompts> --schema <path>` | Multi-run trials with pass@k/pass^k metrics |
36
+ | `summarize <results>` | Derive compact views from results |
37
+ | `calibrate <results>` | Sample failures for review |
38
+ | `validate-refs <prompts>` | Check reference solutions |
39
+ | `balance <prompts>` | Analyze test set coverage |
40
+ | `schemas [name]` | Export JSON schemas |
41
+ | `headless --schema <path>` | Schema-driven adapter for any CLI agent |
42
+
43
+ ### Pipeline Commands (Unix-style)
44
+
45
+ | Command | Description |
46
+ |---------|-------------|
47
+ | `run <prompts> --schema <path>` | Execute prompts, output raw results |
48
+ | `extract <raw> --schema <path>` | Parse raw output into trajectories |
49
+ | `grade <results> --grader <path>` | Apply grader to extracted results |
50
+ | `format <results> --style <style>` | Convert to markdown, csv, or jsonl |
51
+ | `compare <run1> <run2>... --grader <path>` | Compare multiple runs |
52
+
53
+ ### Examples
54
+
55
+ ```bash
56
+ # Capture trajectories using headless adapter (recommended)
57
+ bunx @plaited/agent-eval-harness capture prompts.jsonl \
58
+ --schema ./schemas/claude-headless.json \
59
+ -o results.jsonl
60
+
61
+ # Run trials for pass@k analysis with debug mode
62
+ bunx @plaited/agent-eval-harness trials prompts.jsonl \
63
+ --schema ./schemas/claude-headless.json \
64
+ -k 5 --grader ./grader.ts --debug
65
+
66
+ # Summarize results
67
+ bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl
68
+
69
+ # Export schemas
70
+ bunx @plaited/agent-eval-harness schemas CaptureResult --json
71
+
72
+ # Pipeline workflow (Unix-style composition)
73
+ cat prompts.jsonl | \
74
+ bunx @plaited/agent-eval-harness run -s ./schemas/claude-headless.json | \
75
+ bunx @plaited/agent-eval-harness extract -s ./schemas/claude-headless.json | \
76
+ bunx @plaited/agent-eval-harness grade -g ./grader.ts | \
77
+ bunx @plaited/agent-eval-harness format -f markdown > report.md
78
+
79
+ # Compare multiple runs
80
+ bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl \
81
+ --grader ./compare-grader.ts -o comparison.jsonl
82
+ ```
83
+
84
+ ## Skills for AI Agents
85
+
86
+ **Install skills** for use with AI coding agents:
87
+
88
+ ```bash
89
+ curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --agent <agent-name> --project agent-eval-harness
90
+ ```
91
+
92
+ Replace `<agent-name>` with your agent: `claude`, `cursor`, `copilot`, `opencode`, `amp`, `goose`, `factory`
93
+
94
+ ### Available Skills
95
+
96
+ #### Agent Eval Harness
97
+
98
+ CLI tool for capturing agent trajectories, optimized for TypeScript/JavaScript projects using Bun.
99
+
100
+ **Core Commands:**
101
+
102
+ | Command | Description |
103
+ |---------|-------------|
104
+ | `capture` | Execute prompts and capture full trajectories |
105
+ | `trials` | Multi-run trials with pass@k/pass^k metrics |
106
+ | `summarize` | Derive compact views from trajectory results |
107
+ | `calibrate` | Sample failures for grader calibration |
108
+ | `validate-refs` | Validate reference solutions against graders |
109
+ | `balance` | Analyze test set coverage distribution |
110
+ | `schemas` | Export Zod schemas as JSON Schema |
111
+
112
+ **Pipeline Commands (Unix-style):**
113
+
114
+ | Command | Description |
115
+ |---------|-------------|
116
+ | `run` | Execute prompts, output raw results |
117
+ | `extract` | Parse raw output into trajectories |
118
+ | `grade` | Apply grader to extracted results |
119
+ | `format` | Convert to markdown, csv, or jsonl |
120
+ | `compare` | Compare multiple runs |
121
+
122
+ **Use cases:**
123
+ - Capturing trajectories for downstream evaluation (Braintrust, custom scorers)
124
+ - Generating training data (SFT/DPO) with full context
125
+ - Building regression test fixtures for agent behavior
126
+ - Comparing agent responses across configurations
127
+
128
+ #### Headless Adapters
129
+
130
+ Schema-driven adapters for headless CLI agent integration.
131
+
132
+ **Commands:**
133
+
134
+ | Command | Description |
135
+ |---------|-------------|
136
+ | `headless` | Schema-driven adapter for any CLI agent |
137
+
138
+ **Use cases:**
139
+ - Wrapping headless CLI agents with schema-driven adapter
140
+ - Finding existing adapters for your agent
141
+ - Creating new schemas for CLI agents
142
+
143
+ ## Input Format
144
+
145
+ ```jsonl
146
+ {"id":"test-001","input":"Create a primary button","hint":"should contain <button>","metadata":{"category":"ui"}}
147
+ {"id":"test-002","input":["Create a component","Now add tests"],"metadata":{"category":"multi-turn"}}
148
+ ```
149
+
150
+ | Field | Required | Description |
151
+ |-------|----------|-------------|
152
+ | `id` | Yes | Unique identifier |
153
+ | `input` | Yes | Single prompt (string) or conversation turns (string[]) |
154
+ | `hint` | No | Grader context - what to look for |
155
+ | `reference` | No | Reference solution (for validate-refs) |
156
+ | `metadata` | No | Tags, category, difficulty for filtering |
157
+ | `timeout` | No | Override default timeout for this prompt (ms) |
158
+
159
+ ## Output Format
160
+
161
+ The harness outputs full trajectory JSONL (`CaptureResult` schema), one JSON object per line; the record below is pretty-printed for readability:
162
+
163
+ ```jsonl
164
+ {
165
+ "id": "test-001",
166
+ "input": "Create a primary button",
167
+ "output": "Here's a button component...",
168
+ "hint": "should contain <button>",
169
+ "trajectory": [...],
170
+ "metadata": {"category": "ui", "trajectoryRichness": "full", "turnCount": 1},
171
+ "timing": {"start": 1234567890, "end": 1234567900, "total": 10},
172
+ "toolErrors": false,
173
+ "exitInfo": {"exitCode": 0},
174
+ "score": {"pass": true, "score": 1.0, "reasoning": "Contains hint"}
175
+ }
176
+ ```
177
+
178
+ Key fields:
179
+ - `toolErrors`: Boolean indicating if any tool calls failed
180
+ - `score`: Grader result (only if `--grader` provided)
181
+ - `trajectory`: Full execution trace (thoughts, messages, tool calls, plans)
182
+ - `metadata.trajectoryRichness`: `"full"` | `"messages-only"` | `"minimal"`
183
+ - `exitInfo`: Process exit information (`exitCode`, `signal`, `timedOut`)
184
+ - `timing.total`: End-to-end duration (ms)
185
+
186
+ ## Graders
187
+
188
+ Graders score agent outputs. The harness supports two types:
189
+
190
+ ### TypeScript/JavaScript Graders
191
+
192
+ Export a `grade` function:
193
+
194
+ ```typescript
195
+ import type { Grader } from '@plaited/agent-eval-harness/schemas'
196
+
197
+ export const grade: Grader = async ({ input, output, hint, trajectory }) => {
198
+ const pass = output.toLowerCase().includes(hint?.toLowerCase() ?? '')
199
+ return {
200
+ pass,
201
+ score: pass ? 1.0 : 0.0,
202
+ reasoning: pass ? 'Contains hint content' : 'Missing hint content'
203
+ }
204
+ }
205
+ ```
206
+
207
+ ```bash
208
+ agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grader.ts
209
+ ```
210
+
211
+ ### Polyglot Graders (Python, etc.)
212
+
213
+ Any executable script that reads a JSON request from stdin and writes a JSON result to stdout:
214
+
215
+ ```python
216
+ #!/usr/bin/env python3
217
+ import json
218
+ import sys
219
+
220
+ data = json.load(sys.stdin)
221
+ output = data["output"].lower()
222
+ hint = (data.get("hint") or "").lower()
223
+
224
+ pass_result = hint in output if hint else True
225
+ print(json.dumps({
226
+ "pass": pass_result,
227
+ "score": 1.0 if pass_result else 0.0,
228
+ "reasoning": "Contains hint" if pass_result else "Missing hint"
229
+ }))
230
+ ```
231
+
232
+ ```bash
233
+ chmod +x grader.py
234
+ agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grader.py
235
+ ```
236
+
237
+ **Protocol:**
238
+ - Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
239
+ - Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
240
+
241
+ ## Downstream Integration
242
+
243
+ ```bash
244
+ # Filter failures
245
+ cat results.jsonl | jq 'select(.score.pass == false)'
246
+
247
+ # Extract tool usage patterns
248
+ cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
249
+
250
+ # Use with your scoring pipeline
251
+ cat results.jsonl | your-scoring-script.ts
252
+ ```
253
+
254
+ ## Development
255
+
256
+ ```bash
257
+ bun install # Install dependencies
258
+ bun run check # Type check + lint + format
259
+ bun test # Run unit tests
260
+
261
+ # Run integration tests in Docker (requires API keys)
262
+ ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.test.yml run --rm test
263
+ ```
264
+
265
+ ## Requirements
266
+
267
+ - **Runtime:** Bun >= 1.2.9
268
+ - **Schema:** JSON schema describing CLI agent interaction (see `.claude/skills/headless-adapters/schemas/`)
269
+ - **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
270
+
271
+ ## License
272
+
273
+ ISC © [Plaited Labs](https://github.com/plaited)
package/bin/cli.ts ADDED
@@ -0,0 +1,162 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * Agent Eval Harness CLI - Agent evaluation toolkit.
5
+ *
6
+ * @remarks
7
+ * Router for harness commands. Thin wrapper that delegates to command modules.
8
+ *
9
+ * Commands (the pipeline commands run, extract, grade, format and compare are routed here as well):
10
+ * - capture: Core trajectory capture
11
+ * - trials: Multi-run pass@k/pass^k analysis
12
+ * - summarize: Derive compact views from results
13
+ * - calibrate: Sample failures for grader review
14
+ * - validate-refs: Check reference solutions
15
+ * - balance: Analyze test set coverage
16
+ * - schemas: Export JSON schemas for non-TS users
17
+ * - headless: Schema-driven adapter for any headless CLI agent
18
+ */
19
+
20
+ import { balance } from '../src/commands/balance.ts'
21
+ import { calibrate } from '../src/commands/calibrate.ts'
22
+ import { capture } from '../src/commands/capture.ts'
23
+ import { summarize } from '../src/commands/summarize.ts'
24
+ import { trials } from '../src/commands/trials.ts'
25
+ import { validateRefs } from '../src/commands/validate-refs.ts'
26
+ import { headless } from '../src/headless.ts'
27
+ import { compare, extract, format, grade, run } from '../src/pipeline.ts'
28
+ import { schemasCli } from '../src/schemas/schemas-cli.ts'
29
+
30
+ const [command, ...args] = Bun.argv.slice(2) // first remaining argument selects the command, the rest are forwarded to its handler; slice(2) presumably drops the runtime and script-path entries — confirm against Bun.argv docs
31
+
32
+ const printHelp = () => { // Prints the top-level usage text (commands, pipeline commands, examples, docs link) with a single console.log call
33
+ // biome-ignore lint/suspicious/noConsole: CLI help output
34
+ console.log(`
35
+ agent-eval-harness - CLI tool for agent evaluation
36
+
37
+ Commands:
38
+ capture Capture trajectories from CLI agents
39
+ trials Run prompts multiple times for pass@k/pass^k metrics
40
+ summarize Derive compact views from results
41
+ calibrate Sample failures for grader review
42
+ validate-refs Check reference solutions against grader
43
+ balance Analyze test set coverage
44
+ schemas Export JSON schemas for non-TypeScript users
45
+ headless Schema-driven adapter for any headless CLI agent
46
+
47
+ Pipeline Commands (Unix-style composable):
48
+ run Execute prompts and output raw results
49
+ extract Parse raw output into trajectories
50
+ grade Apply grader to extracted results
51
+ format Convert results to different output formats
52
+ compare Compare multiple runs of the same prompts
53
+
54
+ Run 'agent-eval-harness <command> --help' for command-specific help.
55
+
56
+ Examples:
57
+ # Basic capture with schema
58
+ agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
59
+
60
+ # With grader
61
+ agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
62
+
63
+ # Multi-run trials
64
+ agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
65
+
66
+ # Derive summary view
67
+ agent-eval-harness summarize results.jsonl -o summary.jsonl
68
+
69
+ # Pipeline workflow
70
+ cat prompts.jsonl | \\
71
+ agent-eval-harness run -s claude.json | \\
72
+ agent-eval-harness extract -s claude.json | \\
73
+ agent-eval-harness grade -g ./grader.ts | \\
74
+ agent-eval-harness format -f markdown > report.md
75
+
76
+ # Compare multiple runs
77
+ agent-eval-harness compare run1.jsonl run2.jsonl -g ./compare-grader.ts
78
+
79
+ Documentation: https://github.com/plaited/agent-eval-harness
80
+ `)
81
+ }
82
+
83
+ const main = async () => { // Routes the command to its handler, awaiting it with the remaining args
84
+ switch (command) {
85
+ case 'capture':
86
+ await capture(args)
87
+ break
88
+
89
+ case 'trials':
90
+ await trials(args)
91
+ break
92
+
93
+ case 'summarize':
94
+ await summarize(args)
95
+ break
96
+
97
+ case 'calibrate':
98
+ await calibrate(args)
99
+ break
100
+
101
+ case 'validate-refs':
102
+ await validateRefs(args)
103
+ break
104
+
105
+ case 'balance':
106
+ await balance(args)
107
+ break
108
+
109
+ case 'schemas':
110
+ await schemasCli(args)
111
+ break
112
+
113
+ case 'headless':
114
+ await headless(args)
115
+ break
116
+
117
+ // Pipeline commands
118
+ case 'run':
119
+ await run(args)
120
+ break
121
+
122
+ case 'extract':
123
+ await extract(args)
124
+ break
125
+
126
+ case 'grade':
127
+ await grade(args)
128
+ break
129
+
130
+ case 'format':
131
+ await format(args)
132
+ break
133
+
134
+ case 'compare':
135
+ await compare(args)
136
+ break
137
+
138
+ case '-h':
139
+ case '--help':
140
+ case undefined: // no command was supplied on the command line
141
+ printHelp()
142
+ break
143
+
144
+ case '-v':
145
+ case '--version': {
146
+ const { version } = await import('../package.json') // manifest is loaded only when the version is requested
147
+ // biome-ignore lint/suspicious/noConsole: CLI version output
148
+ console.log(version)
149
+ break
150
+ }
151
+
152
+ default: // any unrecognised command
153
+ console.error(`Unknown command: ${command}`)
154
+ console.error("Run 'agent-eval-harness --help' for usage")
155
+ process.exit(1) // terminate with exit status 1
156
+ }
157
+ }
158
+
159
+ main().catch((error) => { // Entry point: handles any rejection from main()
160
+ console.error('Error:', error instanceof Error ? error.message : error) // Error instances are reduced to their message; other values are printed as-is
161
+ process.exit(1) // terminate with exit status 1
162
+ })