@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
|
@@ -0,0 +1,529 @@
|
|
|
1
|
+
import { describe, expect, test } from 'bun:test'
|
|
2
|
+
import { join } from 'node:path'
|
|
3
|
+
import { z } from 'zod'
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Tests for the agent-eval-harness CLI.
|
|
7
|
+
*
|
|
8
|
+
* @remarks
|
|
9
|
+
* Tests CLI argument parsing, help output, and output format schemas.
|
|
10
|
+
* Integration tests requiring an actual CLI agent live in src/integration_tests/*.spec.ts.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/** Path of the CLI entry point under test: `cli.ts` in the directory above this spec file. */
const CLI_PATH = join(import.meta.dir, '../cli.ts')
|
|
14
|
+
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// CLI Invocation Tests
|
|
17
|
+
// ============================================================================
|
|
18
|
+
|
|
19
|
+
describe('CLI invocation', () => {
  /**
   * Spawns `bun <CLI_PATH> ...args` and waits for the process to finish.
   *
   * Both piped streams are drained concurrently with the exit promise, so no
   * test returns while the child is still running and neither pipe is left
   * unread.
   *
   * @param args - Arguments passed to the CLI after the script path.
   * @returns The captured stdout text, stderr text, and exit code.
   */
  const runCli = async (...args: string[]) => {
    const proc = Bun.spawn(['bun', CLI_PATH, ...args], {
      stdout: 'pipe',
      stderr: 'pipe',
    })
    const [stdout, stderr, exitCode] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ])
    return { stdout, stderr, exitCode }
  }

  test('shows help with --help flag', async () => {
    const { stdout, exitCode } = await runCli('--help')

    expect(exitCode).toBe(0)
    expect(stdout).toContain('agent-eval-harness')
    expect(stdout).toContain('Commands:')
    expect(stdout).toContain('capture')
    expect(stdout).toContain('trials')
    expect(stdout).toContain('summarize')
  })

  test('shows help with -h flag', async () => {
    const { stdout, exitCode } = await runCli('-h')

    expect(exitCode).toBe(0)
    expect(stdout).toContain('agent-eval-harness')
  })

  test('shows help when no arguments provided', async () => {
    const { stdout, exitCode } = await runCli()

    expect(exitCode).toBe(0) // Exits cleanly when showing help
    expect(stdout).toContain('agent-eval-harness')
  })

  test('help shows example commands', async () => {
    const { stdout } = await runCli('--help')

    expect(stdout).toContain('--schema')
    expect(stdout).toContain('prompts.jsonl')
    expect(stdout).toContain('results.jsonl')
  })

  test('help shows available commands', async () => {
    const { stdout } = await runCli('--help')

    expect(stdout).toContain('capture')
    expect(stdout).toContain('trials')
    expect(stdout).toContain('summarize')
    expect(stdout).toContain('calibrate')
    expect(stdout).toContain('balance')
    expect(stdout).toContain('schemas')
  })

  test('fails with non-existent schema file', async () => {
    const { stderr, exitCode } = await runCli('capture', 'prompts.jsonl', '--schema', 'nonexistent.json')

    expect(exitCode).not.toBe(0)
    expect(stderr).toContain('Schema file not found')
  })

  test('fails when no schema provided', async () => {
    const tmpFile = `/tmp/test-prompts-${Date.now()}.jsonl`
    await Bun.write(tmpFile, '{"id":"test-001","input":"test"}\n')

    try {
      const { stderr, exitCode } = await runCli('capture', tmpFile)

      expect(exitCode).toBe(1)
      expect(stderr).toContain('--schema is required')
    } finally {
      // Remove the fixture even when an assertion fails so runs do not pile up files in /tmp.
      await Bun.file(tmpFile).delete()
    }
  })

  test('fails with unknown command', async () => {
    const { stderr, exitCode } = await runCli('unknown-command')

    expect(exitCode).toBe(1)
    expect(stderr).toContain('Unknown command')
  })

  test('capture command shows help with --help', async () => {
    const { stdout, exitCode } = await runCli('capture', '--help')

    expect(exitCode).toBe(0)
    expect(stdout).toContain('capture')
    expect(stdout).toContain('prompts.jsonl')
    expect(stdout).toContain('--output')
  })

  test('trials command shows help with --help', async () => {
    const { stdout, exitCode } = await runCli('trials', '--help')

    expect(exitCode).toBe(0)
    expect(stdout).toContain('trials')
    expect(stdout).toContain('-k')
    expect(stdout).toContain('pass@k')
  })
})
|
|
154
|
+
|
|
155
|
+
// ============================================================================
|
|
156
|
+
// Output Format Schemas (for downstream validation)
|
|
157
|
+
// ============================================================================
|
|
158
|
+
|
|
159
|
+
/**
 * Shape of one summary JSONL record, used by the tests below to validate
 * the sample summary data.
 */
const SummaryResultSchema = z.object({
  id: z.string(),
  input: z.string(),
  output: z.string(),
  // Tool names only, e.g. ["Read", "Edit"] in the sample data.
  toolCalls: z.array(z.string()),
  // NOTE(review): presumably milliseconds (sample timing end - start equals this value) — confirm.
  duration: z.number(),
})
|
|
166
|
+
|
|
167
|
+
/** Fields carried by every trajectory step variant, spread last to keep the key order. */
const trajectoryStepCommonFields = {
  timestamp: z.number(),
  stepId: z.string().optional(),
}

/**
 * One step of a captured trajectory, discriminated on `type`:
 * `thought`, `message`, `tool_call`, or `plan`.
 */
const TrajectoryStepSchema = z.discriminatedUnion('type', [
  z.object({
    type: z.literal('thought'),
    content: z.string(),
    ...trajectoryStepCommonFields,
  }),
  z.object({
    type: z.literal('message'),
    content: z.string(),
    ...trajectoryStepCommonFields,
  }),
  z.object({
    type: z.literal('tool_call'),
    name: z.string(),
    status: z.string(),
    input: z.unknown().optional(),
    output: z.unknown().optional(),
    duration: z.number().optional(),
    ...trajectoryStepCommonFields,
  }),
  z.object({
    type: z.literal('plan'),
    entries: z.array(z.unknown()),
    ...trajectoryStepCommonFields,
  }),
])
|
|
197
|
+
|
|
198
|
+
/**
 * Shape of one capture JSONL record: the prompt/response pair plus the full
 * step trajectory, free-form metadata, timing, and error flags.
 */
const CaptureResultSchema = z.object({
  id: z.string(),
  input: z.string(),
  output: z.string(),
  expected: z.string().optional(),
  trajectory: z.array(TrajectoryStepSchema),
  // Arbitrary string-keyed metadata; the sample data uses `category` and `agent`.
  metadata: z.record(z.string(), z.unknown()),
  timing: z.object({
    start: z.number(),
    end: z.number(),
    firstResponse: z.number().optional(),
  }),
  toolErrors: z.boolean(),
  errors: z.array(z.string()).optional(),
})
|
|
213
|
+
|
|
214
|
+
// ============================================================================
|
|
215
|
+
// Sample Output Data (matches harness output format)
|
|
216
|
+
// ============================================================================
|
|
217
|
+
|
|
218
|
+
/**
 * Three summary records, one JSON document per line.
 * `test-003` has an empty output and no tool calls, which the filtering tests rely on.
 */
const SAMPLE_SUMMARY_JSONL = [
  '{"id":"test-001","input":"Create a button","output":"I created the button","toolCalls":["Write"],"duration":1234}',
  '{"id":"test-002","input":"Fix the bug","output":"I fixed the bug","toolCalls":["Read","Edit"],"duration":2567}',
  '{"id":"test-003","input":"Broken test","output":"","toolCalls":[],"duration":500}',
].join('\n')
|
|
221
|
+
|
|
222
|
+
/**
 * Two capture records, one JSON document per line, each with a three-step
 * trajectory whose step IDs have the form `<record id>-step-<n>`.
 */
const SAMPLE_CAPTURE_JSONL = [
  `{"id":"test-001","input":"Create a button","output":"I created the button","trajectory":[{"type":"thought","content":"I'll create a button template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"export const Button = () => <button>Click</button>"},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude-code-acp"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false}`,
  `{"id":"test-002","input":"Fix the bug","output":"I fixed the bug","trajectory":[{"type":"tool_call","name":"Read","status":"completed","input":{"file_path":"src/app.ts"},"output":"file contents...","duration":100,"timestamp":50,"stepId":"test-002-step-1"},{"type":"tool_call","name":"Edit","status":"completed","input":{"file_path":"src/app.ts","old_string":"bug","new_string":"fix"},"duration":150,"timestamp":200,"stepId":"test-002-step-2"},{"type":"message","content":"I fixed the bug","timestamp":400,"stepId":"test-002-step-3"}],"metadata":{"category":"bugfix","agent":"claude-code-acp"},"timing":{"start":1704067300000,"end":1704067302567},"toolErrors":false}`,
].join('\n')
|
|
224
|
+
|
|
225
|
+
// ============================================================================
|
|
226
|
+
// Downstream Pattern Tests
|
|
227
|
+
// ============================================================================
|
|
228
|
+
|
|
229
|
+
describe('downstream patterns: summary JSONL', () => {
  /** Trims the document, splits it on newlines, and JSON-parses every line. */
  const parseResults = (jsonl: string) => {
    const lines = jsonl.trim().split('\n')
    return lines.map((line) => JSON.parse(line))
  }

  test('parses summary JSONL correctly', () => {
    const records = parseResults(SAMPLE_SUMMARY_JSONL)

    expect(records).toHaveLength(3)
    records.forEach((record) => {
      expect(() => SummaryResultSchema.parse(record)).not.toThrow()
    })
  })

  test('filters by output presence (jq pattern)', () => {
    const nonEmpty = parseResults(SAMPLE_SUMMARY_JSONL).filter(({ output }) => output.length > 0)

    expect(nonEmpty).toHaveLength(2)
  })

  test('calculates average duration (jq pattern)', () => {
    const records = parseResults(SAMPLE_SUMMARY_JSONL)

    let totalDuration = 0
    for (const { duration } of records) {
      totalDuration += duration
    }

    expect(totalDuration / records.length).toBeCloseTo(1433.67, 0)
  })

  test('counts tool usage (jq pattern)', () => {
    const records = parseResults(SAMPLE_SUMMARY_JSONL)

    // Tally every tool name across all records.
    const tally: Record<string, number> = {}
    for (const record of records) {
      for (const tool of record.toolCalls) {
        tally[tool] = (tally[tool] ?? 0) + 1
      }
    }

    expect(tally).toEqual({ Write: 1, Read: 1, Edit: 1 })
  })

  test('calculates success rate by output presence', () => {
    const records = parseResults(SAMPLE_SUMMARY_JSONL)
    const total = records.length
    const succeeded = records.filter(({ output }) => output.length > 0).length

    expect(succeeded).toBe(2)
    expect(total).toBe(3)
    expect(succeeded / total).toBeCloseTo(0.667, 2)
  })
})
|
|
280
|
+
|
|
281
|
+
describe('downstream patterns: capture JSONL', () => {
  /** Trims the document, splits it on newlines, and JSON-parses every line. */
  const parseResults = (jsonl: string) => {
    const lines = jsonl.trim().split('\n')
    return lines.map((line) => JSON.parse(line))
  }

  test('parses capture JSONL with trajectories', () => {
    const records = parseResults(SAMPLE_CAPTURE_JSONL)

    expect(records).toHaveLength(2)
    records.forEach((record) => {
      expect(() => CaptureResultSchema.parse(record)).not.toThrow()
    })
  })

  test('step IDs follow expected format', () => {
    for (const record of parseResults(SAMPLE_CAPTURE_JSONL)) {
      // Every step ID must be "<record id>-step-<number>".
      const stepIdPattern = new RegExp(`^${record.id}-step-\\d+$`)
      for (const step of record.trajectory) {
        expect(step.stepId).toMatch(stepIdPattern)
      }
    }
  })

  test('step-level retrieval pattern works', () => {
    const records = parseResults(SAMPLE_CAPTURE_JSONL)

    // Build step index (pattern from downstream.md)
    const stepIndex = new Map<string, unknown>(
      records.flatMap((record) => record.trajectory.map((step: { stepId: string }) => [step.stepId, step])),
    )

    // Retrieve specific step by ID
    const writeStep = stepIndex.get('test-001-step-2') as { name: string; input: { file_path: string } }
    expect(writeStep).toBeDefined()
    expect(writeStep.name).toBe('Write')
    expect(writeStep.input.file_path).toBe('src/button.tsx')
  })

  test('extracts tool calls from trajectory', () => {
    const [, second] = parseResults(SAMPLE_CAPTURE_JSONL) // second record is test-002

    const toolCalls = second.trajectory.filter((step: { type: string }) => step.type === 'tool_call')
    const toolNames = toolCalls.map((call: { name: string }) => call.name)

    expect(toolCalls).toHaveLength(2)
    expect(toolNames).toEqual(['Read', 'Edit'])
  })

  test('filters by metadata category', () => {
    const uiRecords = parseResults(SAMPLE_CAPTURE_JSONL).filter(({ metadata }) => metadata.category === 'ui')

    expect(uiRecords).toHaveLength(1)
    expect(uiRecords[0]?.id).toBe('test-001')
  })

  test('identifies results with tool errors', () => {
    const failing = parseResults(SAMPLE_CAPTURE_JSONL).filter(({ toolErrors }) => toolErrors)

    expect(failing).toHaveLength(0) // Sample data has no errors
  })
})
|
|
349
|
+
|
|
350
|
+
describe('downstream patterns: advanced filtering', () => {
  /** Trims the document, splits it on newlines, and JSON-parses every line. */
  const parseResults = (jsonl: string) => {
    const lines = jsonl.trim().split('\n')
    return lines.map((line) => JSON.parse(line))
  }

  test('filters by tool usage (jq contains pattern)', () => {
    const usingWrite = parseResults(SAMPLE_SUMMARY_JSONL).filter(({ toolCalls }) => toolCalls.includes('Write'))

    expect(usingWrite).toHaveLength(1)
    expect(usingWrite[0]?.id).toBe('test-001')
  })

  test('filters by duration threshold (slow evaluations)', () => {
    const slowRecords = parseResults(SAMPLE_SUMMARY_JSONL).filter(({ duration }) => duration > 2000)

    expect(slowRecords).toHaveLength(1)
    expect(slowRecords[0]?.id).toBe('test-002')
  })

  test('finds slowest evaluations (sorted)', () => {
    const records = parseResults(SAMPLE_SUMMARY_JSONL)

    // Sort a copy in descending duration order, then take the first two.
    const byDurationDesc = records.slice().sort((a, b) => b.duration - a.duration)
    const [slowest, runnerUp] = byDurationDesc.slice(0, 2)

    expect(slowest?.id).toBe('test-002')
    expect(runnerUp?.id).toBe('test-001')
  })

  test('deduplicates by ID keeping latest (merge pattern)', () => {
    const combinedJsonl = [
      SAMPLE_SUMMARY_JSONL,
      '{"id":"test-001","input":"Create a button v2","output":"I created the button v2","toolCalls":["Write","Edit"],"duration":1500}',
    ].join('\n')

    const records = parseResults(combinedJsonl)

    // Group by ID and keep last occurrence (simulates jq group_by + last):
    // later entries overwrite earlier ones that share a key.
    const latestById = new Map<string, unknown>(records.map((record) => [record.id, record]))
    const deduped = [...latestById.values()]

    expect(deduped).toHaveLength(3) // test-001, test-002, test-003
    const test001 = deduped.find((record) => (record as { id: string }).id === 'test-001') as { input: string }
    expect(test001?.input).toBe('Create a button v2')
  })

  test('groups by category and counts', () => {
    const records = parseResults(SAMPLE_CAPTURE_JSONL)

    // Group by category (simulates jq group_by pattern)
    const countsByCategory: Record<string, number> = {}
    for (const { metadata } of records) {
      const category = metadata.category as string
      countsByCategory[category] = (countsByCategory[category] ?? 0) + 1
    }

    expect(countsByCategory).toEqual({ ui: 1, bugfix: 1 })
  })

  test('extracts timing information', () => {
    const { timing } = parseResults(SAMPLE_CAPTURE_JSONL)[0]

    expect(timing.start).toBe(1704067200000)
    expect(timing.end).toBe(1704067201234)
    expect(timing.firstResponse).toBe(100)
    expect(timing.end - timing.start).toBe(1234) // matches duration
  })
})
|
|
423
|
+
|
|
424
|
+
// ============================================================================
|
|
425
|
+
// MCP Server Config Parsing Tests
|
|
426
|
+
// ============================================================================
|
|
427
|
+
|
|
428
|
+
describe('MCP server config parsing', () => {
  /**
   * Spawns `capture /tmp/test.jsonl bunx claude-code-acp` with one
   * `--mcp-server <json>` pair per config, followed by `--help`.
   *
   * @param configs - JSON-encoded MCP server configs, passed through verbatim.
   * @returns The spawned subprocess.
   */
  const spawnCaptureWithMcpServers = (...configs: string[]) =>
    Bun.spawn(
      [
        'bun',
        CLI_PATH,
        'capture',
        '/tmp/test.jsonl',
        'bunx',
        'claude-code-acp',
        ...configs.flatMap((config) => ['--mcp-server', config]),
        '--help',
      ],
      {
        stdout: 'pipe',
        stderr: 'pipe',
      },
    )

  // Each test awaits the `.resolves` assertion. Left un-awaited in a synchronous
  // callback, the assertion floats and the test finishes before the CLI exits.
  // NOTE(review): only "the process exited" is asserted, not a specific exit code —
  // tighten once the expected code for this argument combination is confirmed.

  test('parses stdio MCP server config', async () => {
    const json = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":["/data"],"env":[]}'
    const proc = spawnCaptureWithMcpServers(json)

    // If it doesn't crash, the parsing worked
    await expect(proc.exited).resolves.toBeDefined()
  })

  test('parses http MCP server config', async () => {
    const json =
      '{"type":"http","name":"api","url":"https://example.com/mcp","headers":[{"name":"Authorization","value":"Bearer token"}]}'
    const proc = spawnCaptureWithMcpServers(json)

    // If it doesn't crash, the parsing worked
    await expect(proc.exited).resolves.toBeDefined()
  })

  test('accepts multiple MCP servers', async () => {
    const json1 = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":[],"env":[]}'
    const json2 = '{"type":"http","name":"api","url":"https://example.com","headers":[]}'
    const proc = spawnCaptureWithMcpServers(json1, json2)

    // If it doesn't crash, the parsing worked
    await expect(proc.exited).resolves.toBeDefined()
  })
})
|
|
485
|
+
|
|
486
|
+
// ============================================================================
|
|
487
|
+
// Error Handling Tests
|
|
488
|
+
// ============================================================================
|
|
489
|
+
|
|
490
|
+
describe('error handling', () => {
  test('fails when schema file does not exist', async () => {
    const tmpFile = `/tmp/invalid-${Date.now()}.jsonl`
    await Bun.write(tmpFile, '{"id": "t1", "input": "test"}\n')

    try {
      const proc = Bun.spawn(['bun', CLI_PATH, 'capture', tmpFile, '--schema', 'nonexistent-schema.json'], {
        stdout: 'pipe',
        stderr: 'pipe',
      })
      const stderr = await new Response(proc.stderr).text()
      const exitCode = await proc.exited

      expect(exitCode).not.toBe(0)
      expect(stderr).toContain('Schema file not found')
    } finally {
      // Remove the prompts fixture even when an assertion fails so runs do not pile up files in /tmp.
      await Bun.file(tmpFile).delete()
    }
  })

  test('capture command requires prompts path', async () => {
    const proc = Bun.spawn(['bun', CLI_PATH, 'capture'], {
      stdout: 'pipe',
      stderr: 'pipe',
    })
    const stderr = await new Response(proc.stderr).text()
    const exitCode = await proc.exited

    expect(exitCode).toBe(1)
    expect(stderr).toContain('prompts.jsonl path is required')
  })

  test('summarize command requires input path', async () => {
    const proc = Bun.spawn(['bun', CLI_PATH, 'summarize'], {
      stdout: 'pipe',
      stderr: 'pipe',
    })
    const stderr = await new Response(proc.stderr).text()
    const exitCode = await proc.exited

    expect(exitCode).toBe(1)
    expect(stderr).toContain('results.jsonl path is required')
  })
})
|
package/package.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@plaited/agent-eval-harness",
|
|
3
|
+
"version": "0.5.0",
|
|
4
|
+
"description": "CLI tool for capturing agent trajectories from headless CLI agents",
|
|
5
|
+
"license": "ISC",
|
|
6
|
+
"engines": {
|
|
7
|
+
"bun": ">= v1.2.9"
|
|
8
|
+
},
|
|
9
|
+
"repository": {
|
|
10
|
+
"type": "git",
|
|
11
|
+
"url": "git+https://github.com/plaited/agent-eval-harness.git"
|
|
12
|
+
},
|
|
13
|
+
"bugs": {
|
|
14
|
+
"url": "https://github.com/plaited/agent-eval-harness/issues"
|
|
15
|
+
},
|
|
16
|
+
"homepage": "https://github.com/plaited/agent-eval-harness/tree/main#readme",
|
|
17
|
+
"bin": {
|
|
18
|
+
"agent-eval-harness": "./bin/cli.ts"
|
|
19
|
+
},
|
|
20
|
+
"type": "module",
|
|
21
|
+
"exports": {
|
|
22
|
+
".": "./src/harness.ts",
|
|
23
|
+
"./schemas": "./src/schemas.ts",
|
|
24
|
+
"./headless": "./src/headless.ts",
|
|
25
|
+
"./pipeline": "./src/pipeline.ts"
|
|
26
|
+
},
|
|
27
|
+
"files": [
|
|
28
|
+
"./src/**",
|
|
29
|
+
"./bin/**",
|
|
30
|
+
"!./src/**/tests/*",
|
|
31
|
+
"!./src/**/*.spec.ts",
|
|
32
|
+
"!./bin/**/tests/*",
|
|
33
|
+
"!./bin/**/*.spec.ts"
|
|
34
|
+
],
|
|
35
|
+
"publishConfig": {
|
|
36
|
+
"access": "public"
|
|
37
|
+
},
|
|
38
|
+
"scripts": {
|
|
39
|
+
"check": "bun run check:biome && bun run check:types && bun run check:package",
|
|
40
|
+
"check:biome": "biome check",
|
|
41
|
+
"check:package": "format-package --check",
|
|
42
|
+
"check:types": "tsc --noEmit",
|
|
43
|
+
"check:write": "biome check --write && format-package --write",
|
|
44
|
+
"prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
|
|
45
|
+
"test": "bun test ./**/tests/*.spec.ts",
|
|
46
|
+
"test:integration": "bun test ./**/integration_tests/*.spec.ts"
|
|
47
|
+
},
|
|
48
|
+
"lint-staged": {
|
|
49
|
+
"*.{js,cjs,jsx,tsx,ts}": [
|
|
50
|
+
"bunx biome check --write --files-ignore-unknown"
|
|
51
|
+
],
|
|
52
|
+
"package.json": [
|
|
53
|
+
"format-package -w"
|
|
54
|
+
]
|
|
55
|
+
},
|
|
56
|
+
"dependencies": {
|
|
57
|
+
"zod": "^4.3.5",
|
|
58
|
+
"@plaited/development-skills": "0.6.3"
|
|
59
|
+
},
|
|
60
|
+
"devDependencies": {
|
|
61
|
+
"@biomejs/biome": "2.3.11",
|
|
62
|
+
"@types/bun": "1.3.6",
|
|
63
|
+
"format-package": "7.0.0",
|
|
64
|
+
"lint-staged": "16.2.7",
|
|
65
|
+
"typescript": "5.9.3"
|
|
66
|
+
}
|
|
67
|
+
}
|