@plaited/agent-eval-harness 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -9
- package/bin/tests/cli.spec.ts +6 -6
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -25,7 +25,7 @@ export ANTHROPIC_API_KEY=sk-... # For Claude
|
|
|
25
25
|
export GEMINI_API_KEY=... # For Gemini
|
|
26
26
|
```
|
|
27
27
|
|
|
28
|
-
Pre-built schemas are available in `.
|
|
28
|
+
Pre-built schemas are available in `.plaited/skills/headless-adapters/schemas/` for Claude and Gemini.
|
|
29
29
|
|
|
30
30
|
### Core Commands
|
|
31
31
|
|
|
@@ -85,7 +85,7 @@ bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.jso
|
|
|
85
85
|
**Install skills** for use with AI coding agents:
|
|
86
86
|
|
|
87
87
|
```bash
|
|
88
|
-
curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --
|
|
88
|
+
curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --agents <agent-name> --project agent-eval-harness
|
|
89
89
|
```
|
|
90
90
|
|
|
91
91
|
Replace `<agent-name>` with your agent: `claude`, `cursor`, `copilot`, `opencode`, `amp`, `goose`, `factory`
|
|
@@ -253,18 +253,20 @@ cat results.jsonl | your-scoring-script.ts
|
|
|
253
253
|
## Development
|
|
254
254
|
|
|
255
255
|
```bash
|
|
256
|
-
bun install
|
|
257
|
-
bun run check
|
|
258
|
-
bun test
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
256
|
+
bun install # Install dependencies
|
|
257
|
+
bun run check # Type check + lint + format
|
|
258
|
+
bun test # Run unit tests
|
|
259
|
+
bun run test:integration # Run integration tests (requires API keys)
|
|
260
|
+
|
|
261
|
+
# Alternative: Run integration tests in Docker
|
|
262
|
+
ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... \
|
|
263
|
+
docker compose -f docker-compose.test.yml run --rm test
|
|
262
264
|
```
|
|
263
265
|
|
|
264
266
|
## Requirements
|
|
265
267
|
|
|
266
268
|
- **Runtime:** Bun >= 1.2.9
|
|
267
|
-
- **Schema:** JSON schema describing CLI agent interaction (see `.
|
|
269
|
+
- **Schema:** JSON schema describing CLI agent interaction (see `.plaited/skills/headless-adapters/schemas/`)
|
|
268
270
|
- **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
|
|
269
271
|
|
|
270
272
|
## License
|
package/bin/tests/cli.spec.ts
CHANGED
|
@@ -219,8 +219,8 @@ const SAMPLE_SUMMARY_JSONL = `{"id":"test-001","input":"Create a button","output
|
|
|
219
219
|
{"id":"test-002","input":"Fix the bug","output":"I fixed the bug","toolCalls":["Read","Edit"],"duration":2567}
|
|
220
220
|
{"id":"test-003","input":"Broken test","output":"","toolCalls":[],"duration":500}`
|
|
221
221
|
|
|
222
|
-
const SAMPLE_CAPTURE_JSONL = `{"id":"test-001","input":"Create a button","output":"I created the button","trajectory":[{"type":"thought","content":"I'll create a button template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"export const Button = () => <button>Click</button>"},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude-
|
|
223
|
-
{"id":"test-002","input":"Fix the bug","output":"I fixed the bug","trajectory":[{"type":"tool_call","name":"Read","status":"completed","input":{"file_path":"src/app.ts"},"output":"file contents...","duration":100,"timestamp":50,"stepId":"test-002-step-1"},{"type":"tool_call","name":"Edit","status":"completed","input":{"file_path":"src/app.ts","old_string":"bug","new_string":"fix"},"duration":150,"timestamp":200,"stepId":"test-002-step-2"},{"type":"message","content":"I fixed the bug","timestamp":400,"stepId":"test-002-step-3"}],"metadata":{"category":"bugfix","agent":"claude-
|
|
222
|
+
const SAMPLE_CAPTURE_JSONL = `{"id":"test-001","input":"Create a button","output":"I created the button","trajectory":[{"type":"thought","content":"I'll create a button template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"export const Button = () => <button>Click</button>"},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude-headless"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false}
|
|
223
|
+
{"id":"test-002","input":"Fix the bug","output":"I fixed the bug","trajectory":[{"type":"tool_call","name":"Read","status":"completed","input":{"file_path":"src/app.ts"},"output":"file contents...","duration":100,"timestamp":50,"stepId":"test-002-step-1"},{"type":"tool_call","name":"Edit","status":"completed","input":{"file_path":"src/app.ts","old_string":"bug","new_string":"fix"},"duration":150,"timestamp":200,"stepId":"test-002-step-2"},{"type":"message","content":"I fixed the bug","timestamp":400,"stepId":"test-002-step-3"}],"metadata":{"category":"bugfix","agent":"claude-headless"},"timing":{"start":1704067300000,"end":1704067302567},"toolErrors":false}`
|
|
224
224
|
|
|
225
225
|
// ============================================================================
|
|
226
226
|
// Downstream Pattern Tests
|
|
@@ -429,7 +429,7 @@ describe('MCP server config parsing', () => {
|
|
|
429
429
|
test('parses stdio MCP server config', () => {
|
|
430
430
|
const json = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":["/data"],"env":[]}'
|
|
431
431
|
const proc = Bun.spawn(
|
|
432
|
-
['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '
|
|
432
|
+
['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '--schema', './test-schema.json', '--mcp-server', json, '--help'],
|
|
433
433
|
{
|
|
434
434
|
stdout: 'pipe',
|
|
435
435
|
stderr: 'pipe',
|
|
@@ -444,7 +444,7 @@ describe('MCP server config parsing', () => {
|
|
|
444
444
|
const json =
|
|
445
445
|
'{"type":"http","name":"api","url":"https://example.com/mcp","headers":[{"name":"Authorization","value":"Bearer token"}]}'
|
|
446
446
|
const proc = Bun.spawn(
|
|
447
|
-
['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '
|
|
447
|
+
['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '--schema', './test-schema.json', '--mcp-server', json, '--help'],
|
|
448
448
|
{
|
|
449
449
|
stdout: 'pipe',
|
|
450
450
|
stderr: 'pipe',
|
|
@@ -464,8 +464,8 @@ describe('MCP server config parsing', () => {
|
|
|
464
464
|
CLI_PATH,
|
|
465
465
|
'capture',
|
|
466
466
|
'/tmp/test.jsonl',
|
|
467
|
-
'
|
|
468
|
-
'
|
|
467
|
+
'--schema',
|
|
468
|
+
'./test-schema.json',
|
|
469
469
|
'--mcp-server',
|
|
470
470
|
json1,
|
|
471
471
|
'--mcp-server',
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@plaited/agent-eval-harness",
|
|
3
|
-
"version": "0.6.
|
|
3
|
+
"version": "0.6.2",
|
|
4
4
|
"description": "CLI tool for capturing agent trajectories from headless CLI agents",
|
|
5
5
|
"license": "ISC",
|
|
6
6
|
"engines": {
|
|
@@ -54,11 +54,11 @@
|
|
|
54
54
|
]
|
|
55
55
|
},
|
|
56
56
|
"dependencies": {
|
|
57
|
-
"
|
|
58
|
-
"
|
|
57
|
+
"@plaited/development-skills": "0.6.5",
|
|
58
|
+
"zod": "^4.3.6"
|
|
59
59
|
},
|
|
60
60
|
"devDependencies": {
|
|
61
|
-
"@biomejs/biome": "2.3.
|
|
61
|
+
"@biomejs/biome": "2.3.12",
|
|
62
62
|
"@types/bun": "1.3.6",
|
|
63
63
|
"format-package": "7.0.0",
|
|
64
64
|
"lint-staged": "16.2.7",
|