@plaited/agent-eval-harness 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -25,7 +25,7 @@ export ANTHROPIC_API_KEY=sk-... # For Claude
25
25
  export GEMINI_API_KEY=... # For Gemini
26
26
  ```
27
27
 
28
- Pre-built schemas are available in `.claude/skills/headless-adapters/schemas/` for Claude and Gemini.
28
+ Pre-built schemas are available in `.plaited/skills/headless-adapters/schemas/` for Claude and Gemini.
29
29
 
30
30
  ### Core Commands
31
31
 
@@ -85,7 +85,7 @@ bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.jso
85
85
  **Install skills** for use with AI coding agents:
86
86
 
87
87
  ```bash
88
- curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --agent <agent-name> --project agent-eval-harness
88
+ curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --agents <agent-name> --project agent-eval-harness
89
89
  ```
90
90
 
91
91
  Replace `<agent-name>` with your agent: `claude`, `cursor`, `copilot`, `opencode`, `amp`, `goose`, `factory`
@@ -253,18 +253,20 @@ cat results.jsonl | your-scoring-script.ts
253
253
  ## Development
254
254
 
255
255
  ```bash
256
- bun install # Install dependencies
257
- bun run check # Type check + lint + format
258
- bun test # Run unit tests
259
-
260
- # Run integration tests in Docker (requires API keys)
261
- ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.test.yml run --rm test
256
+ bun install # Install dependencies
257
+ bun run check # Type check + lint + format
258
+ bun test # Run unit tests
259
+ bun run test:integration # Run integration tests (requires API keys)
260
+
261
+ # Alternative: Run integration tests in Docker
262
+ ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... \
263
+ docker compose -f docker-compose.test.yml run --rm test
262
264
  ```
263
265
 
264
266
  ## Requirements
265
267
 
266
268
  - **Runtime:** Bun >= 1.2.9
267
- - **Schema:** JSON schema describing CLI agent interaction (see `.claude/skills/headless-adapters/schemas/`)
269
+ - **Schema:** JSON schema describing CLI agent interaction (see `.plaited/skills/headless-adapters/schemas/`)
268
270
  - **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
269
271
 
270
272
  ## License
@@ -219,8 +219,8 @@ const SAMPLE_SUMMARY_JSONL = `{"id":"test-001","input":"Create a button","output
219
219
  {"id":"test-002","input":"Fix the bug","output":"I fixed the bug","toolCalls":["Read","Edit"],"duration":2567}
220
220
  {"id":"test-003","input":"Broken test","output":"","toolCalls":[],"duration":500}`
221
221
 
222
- const SAMPLE_CAPTURE_JSONL = `{"id":"test-001","input":"Create a button","output":"I created the button","trajectory":[{"type":"thought","content":"I'll create a button template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"export const Button = () => <button>Click</button>"},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude-code-acp"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false}
223
- {"id":"test-002","input":"Fix the bug","output":"I fixed the bug","trajectory":[{"type":"tool_call","name":"Read","status":"completed","input":{"file_path":"src/app.ts"},"output":"file contents...","duration":100,"timestamp":50,"stepId":"test-002-step-1"},{"type":"tool_call","name":"Edit","status":"completed","input":{"file_path":"src/app.ts","old_string":"bug","new_string":"fix"},"duration":150,"timestamp":200,"stepId":"test-002-step-2"},{"type":"message","content":"I fixed the bug","timestamp":400,"stepId":"test-002-step-3"}],"metadata":{"category":"bugfix","agent":"claude-code-acp"},"timing":{"start":1704067300000,"end":1704067302567},"toolErrors":false}`
222
+ const SAMPLE_CAPTURE_JSONL = `{"id":"test-001","input":"Create a button","output":"I created the button","trajectory":[{"type":"thought","content":"I'll create a button template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"export const Button = () => <button>Click</button>"},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude-headless"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false}
223
+ {"id":"test-002","input":"Fix the bug","output":"I fixed the bug","trajectory":[{"type":"tool_call","name":"Read","status":"completed","input":{"file_path":"src/app.ts"},"output":"file contents...","duration":100,"timestamp":50,"stepId":"test-002-step-1"},{"type":"tool_call","name":"Edit","status":"completed","input":{"file_path":"src/app.ts","old_string":"bug","new_string":"fix"},"duration":150,"timestamp":200,"stepId":"test-002-step-2"},{"type":"message","content":"I fixed the bug","timestamp":400,"stepId":"test-002-step-3"}],"metadata":{"category":"bugfix","agent":"claude-headless"},"timing":{"start":1704067300000,"end":1704067302567},"toolErrors":false}`
224
224
 
225
225
  // ============================================================================
226
226
  // Downstream Pattern Tests
@@ -429,7 +429,7 @@ describe('MCP server config parsing', () => {
429
429
  test('parses stdio MCP server config', () => {
430
430
  const json = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":["/data"],"env":[]}'
431
431
  const proc = Bun.spawn(
432
- ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', 'bunx', 'claude-code-acp', '--mcp-server', json, '--help'],
432
+ ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '--schema', './test-schema.json', '--mcp-server', json, '--help'],
433
433
  {
434
434
  stdout: 'pipe',
435
435
  stderr: 'pipe',
@@ -444,7 +444,7 @@ describe('MCP server config parsing', () => {
444
444
  const json =
445
445
  '{"type":"http","name":"api","url":"https://example.com/mcp","headers":[{"name":"Authorization","value":"Bearer token"}]}'
446
446
  const proc = Bun.spawn(
447
- ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', 'bunx', 'claude-code-acp', '--mcp-server', json, '--help'],
447
+ ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '--schema', './test-schema.json', '--mcp-server', json, '--help'],
448
448
  {
449
449
  stdout: 'pipe',
450
450
  stderr: 'pipe',
@@ -464,8 +464,8 @@ describe('MCP server config parsing', () => {
464
464
  CLI_PATH,
465
465
  'capture',
466
466
  '/tmp/test.jsonl',
467
- 'bunx',
468
- 'claude-code-acp',
467
+ '--schema',
468
+ './test-schema.json',
469
469
  '--mcp-server',
470
470
  json1,
471
471
  '--mcp-server',
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.6.0",
3
+ "version": "0.6.2",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -54,11 +54,11 @@
54
54
  ]
55
55
  },
56
56
  "dependencies": {
57
- "zod": "^4.3.5",
58
- "@plaited/development-skills": "0.6.3"
57
+ "@plaited/development-skills": "0.6.5",
58
+ "zod": "^4.3.6"
59
59
  },
60
60
  "devDependencies": {
61
- "@biomejs/biome": "2.3.11",
61
+ "@biomejs/biome": "2.3.12",
62
62
  "@types/bun": "1.3.6",
63
63
  "format-package": "7.0.0",
64
64
  "lint-staged": "16.2.7",