@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,393 @@
1
+ /**
2
+ * Core trajectory capture command.
3
+ *
4
+ * @remarks
5
+ * Executes prompts against a CLI agent and captures full trajectories.
6
+ * This is the foundational command - all other views derive from its output.
7
+ *
8
+ * Output format is always full trajectory JSONL (`CaptureResultSchema`).
9
+ * Use `summarize` command to derive compact views.
10
+ *
11
+ * @packageDocumentation
12
+ */
13
+
14
+ import { parseArgs } from 'node:util'
15
+ import {
16
+ detectTrajectoryRichness,
17
+ extractOutput,
18
+ extractTrajectory,
19
+ getInputPreview,
20
+ hasToolErrors,
21
+ loadPrompts,
22
+ logProgress,
23
+ resolvePath,
24
+ writeOutput,
25
+ } from '../core.ts'
26
+ import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
27
+ import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
28
+ import { createSessionManager, type ProcessExitInfo, type PromptResult } from '../headless/headless-session-manager.ts'
29
+ import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
30
+ import { loadGrader } from '../schemas/grader-loader.ts'
31
+ import type { CaptureResult, Grader, TrajectoryRichness } from '../schemas.ts'
32
+
33
+ // ============================================================================
34
+ // Re-exports for backward compatibility
35
+ // ============================================================================
36
+
37
+ // These functions are now in core/ but re-exported here for existing consumers
38
+ export {
39
+ detectTrajectoryRichness,
40
+ extractContent,
41
+ extractFilePath,
42
+ extractOutput,
43
+ extractTrajectory,
44
+ hasToolErrors,
45
+ headTailPreview,
46
+ loadPrompts,
47
+ } from '../core.ts'
48
+
49
+ // ============================================================================
50
+ // Types
51
+ // ============================================================================
52
+
53
/**
 * Options accepted by `runCapture`.
 *
 * @remarks
 * Only `promptsPath` and `schemaPath` are required; every other field is
 * optional and falls back to the default applied inside `runCapture`.
 */
export type CaptureConfig = {
  /** Location of the prompts.jsonl input file */
  promptsPath: string
  /** Location of the agent schema JSON file */
  schemaPath: string
  /** Where results are written; when omitted, results go to stdout */
  outputPath?: string
  /** Directory the agent session is created in */
  cwd?: string
  /** Per-prompt timeout in milliseconds; takes precedence over the schema's timeout */
  timeout?: number
  /** Emit progress messages to stderr */
  progress?: boolean
  /** Keep existing output file content and add to it rather than overwriting */
  append?: boolean
  /** Grader function applied to each captured result, if supplied */
  grader?: Grader
  /** Turn on debug mode for more detailed output */
  debug?: boolean
}
74
+
75
+ // ============================================================================
76
+ // Capture Implementation
77
+ // ============================================================================
78
+
79
/**
 * Execute capture with configuration object.
 *
 * @remarks
 * Loads and validates the agent schema, loads the prompts, then processes the
 * prompt entries one at a time. A fresh session is created for each JSONL
 * entry to ensure isolation; multi-turn entries (`input: string[]`) send every
 * turn through that same session. Each result is written out as soon as it is
 * produced.
 *
 * A failure while processing a single entry does not abort the run: the entry
 * is recorded with `toolErrors: true`, an empty trajectory and the error
 * message in `errors`. The entry's session is destroyed on both the success
 * and the failure path.
 *
 * @param config - Capture configuration
 * @returns Array of capture results, one per processed prompt entry
 * @throws Error when the schema file does not exist or fails validation
 */
export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
  const {
    promptsPath,
    schemaPath,
    outputPath,
    cwd,
    timeout,
    progress = false,
    append = false,
    grader,
    debug = false,
  } = config

  // Load and validate schema
  const schemaFile = Bun.file(schemaPath)
  if (!(await schemaFile.exists())) {
    throw new Error(`Schema file not found: ${schemaPath}`)
  }

  let schema: HeadlessAdapterConfig
  try {
    const rawSchema = await schemaFile.json()
    schema = parseHeadlessConfig(rawSchema)
  } catch (error) {
    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
  }

  // Load prompts
  const prompts = await loadPrompts(promptsPath)

  // Resolve output path
  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined

  // Determine effective timeout (CLI flag > schema default > harness default)
  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT

  // Log progress info
  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
  logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
  logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
  if (resolvedOutputPath) {
    logProgress(`Output: ${resolvedOutputPath}`, progress)
  }
  if (debug) {
    logProgress(`Debug mode: enabled`, progress)
  }

  // Create session manager with schema
  const sessions = createSessionManager({
    schema,
    timeout: effectiveTimeout,
    verbose: progress,
    debug,
  })

  // Clear output file if not appending
  if (resolvedOutputPath && !append) {
    await Bun.write(resolvedOutputPath, '')
  }

  const workingDir = cwd ?? process.cwd()
  const results: CaptureResult[] = []
  let isFirstOutput = true

  // Run evaluations sequentially - fresh session per entry
  for (let i = 0; i < prompts.length; i++) {
    const promptCase = prompts[i]
    if (!promptCase) continue

    logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)

    const startTime = Date.now()

    // Handle string or array input (needed by both the success and error result)
    const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
    const turnCount = inputs.length

    let result: CaptureResult
    // Declared outside the try so cleanup and the error result can see them
    let session: Awaited<ReturnType<typeof sessions.create>> | undefined
    let sessionCreation = 0

    try {
      // Create fresh session for each entry (ensures isolation)
      const sessionStart = Date.now()
      session = await sessions.create(workingDir)
      sessionCreation = Date.now() - sessionStart
      logProgress(` Session: ${session.id}`, progress)

      // Collect all updates from all turns
      const allUpdates: ParsedUpdate[] = []
      let lastExitInfo: ProcessExitInfo | undefined
      let lastOutput = ''

      // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
      // The session manager would need to accept timeout per-call to support this

      // Execute each turn sequentially in the same session
      for (const turnInput of inputs) {
        const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
        allUpdates.push(...turnResult.updates)
        lastExitInfo = turnResult.exitInfo
        lastOutput = turnResult.output
      }

      const endTime = Date.now()
      const trajectory = extractTrajectory(allUpdates, startTime)

      // Use last turn's output or extract from trajectory
      const output = lastOutput || extractOutput(trajectory)
      // A timed-out final turn is reported as a tool error as well
      const toolErrors = hasToolErrors(trajectory) || (lastExitInfo?.timedOut ?? false)
      const trajectoryRichness = detectTrajectoryRichness(trajectory)

      result = {
        id: promptCase.id,
        input: promptCase.input, // Preserve original (string or array)
        output,
        ...(promptCase.hint && { hint: promptCase.hint }),
        trajectory,
        metadata: {
          ...promptCase.metadata,
          agent: schema.name,
          trajectoryRichness,
          turnCount,
          ...(lastExitInfo && {
            exitCode: lastExitInfo.exitCode,
            signal: lastExitInfo.signal,
            timedOut: lastExitInfo.timedOut,
          }),
        },
        timing: {
          start: startTime,
          end: endTime,
          firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
          sessionCreation,
          total: endTime - startTime,
        },
        toolErrors,
      }

      // Apply grader if provided
      if (grader) {
        result.score = await grader({
          input: promptCase.input,
          output,
          hint: promptCase.hint,
          trajectory,
        })
      }
    } catch (error) {
      const endTime = Date.now()
      const message = error instanceof Error ? error.message : String(error)

      result = {
        id: promptCase.id,
        input: promptCase.input,
        output: '',
        trajectory: [],
        metadata: {
          ...promptCase.metadata,
          agent: schema.name,
          trajectoryRichness: 'minimal' as TrajectoryRichness,
          turnCount,
        },
        timing: {
          start: startTime,
          end: endTime,
          // Stays 0 when the failure happened before the session existed
          sessionCreation,
          total: endTime - startTime,
        },
        toolErrors: true,
        errors: [message],
      }
    } finally {
      // Release the session on every path, including when a prompt or the
      // grader throws; otherwise failed entries would leave sessions behind.
      if (session) {
        try {
          sessions.destroy(session.id)
        } catch (cleanupError) {
          // Cleanup is best-effort: keep the captured result and report the problem
          const reason = cleanupError instanceof Error ? cleanupError.message : String(cleanupError)
          logProgress(` Failed to destroy session ${session.id}: ${reason}`, progress)
        }
      }
    }

    results.push(result)

    // Write result immediately.
    // The first write must also append when the caller asked for `append`,
    // otherwise existing file content would be replaced by the first result.
    // NOTE(review): assumes writeOutput's third argument is an append flag - confirm in core/output.ts
    const formatted = JSON.stringify(result)
    await writeOutput(formatted, resolvedOutputPath, append || !isFirstOutput)
    isFirstOutput = false

    const statusIcon = result.toolErrors ? '!' : '✓'
    const exitInfo = result.metadata?.timedOut
      ? ' - TIMEOUT'
      : result.metadata?.exitCode && result.metadata.exitCode !== 0
        ? ` - exit ${result.metadata.exitCode}`
        : ''
    logProgress(` ${statusIcon} (${result.timing.total}ms)${exitInfo}`, progress)
  }

  logProgress('Done!', progress)
  return results
}
284
+
285
+ // ============================================================================
286
+ // CLI Entry Point
287
+ // ============================================================================
288
+
289
/**
 * Capture command CLI handler.
 *
 * @remarks
 * Parses the command line, prints usage for `--help`, validates the required
 * prompts path and `--schema` option plus the optional `--timeout` value,
 * loads the grader when `--grader` is given, and then delegates to
 * `runCapture`. Invalid arguments and grader load failures are reported on
 * stderr and terminate the process with exit code 1.
 *
 * @param args - Command line arguments (after 'capture')
 */
export const capture = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      schema: { type: 'string', short: 's' },
      output: { type: 'string', short: 'o' },
      cwd: { type: 'string', short: 'c' },
      timeout: { type: 'string', short: 't' },
      progress: { type: 'boolean', default: false },
      append: { type: 'boolean', default: false },
      grader: { type: 'string', short: 'g' },
      debug: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // NOTE(review): the per-prompt timeout example at the end of this text is
    // documented here but runCapture still carries a TODO for implementing it.
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]

Arguments:
prompts.jsonl Input file with evaluation prompts

Options:
-s, --schema Path to agent schema JSON file (required)
-o, --output Output file (default: stdout)
-c, --cwd Working directory for agent
-t, --timeout Request timeout in ms (overrides schema default)
--progress Show progress to stderr
--append Append to output file instead of overwriting
-g, --grader Path to grader (.ts/.js module or executable script)
--debug Enable debug mode (shows raw output, JSONPath matching)
-h, --help Show this help message

Output Format:
Full trajectory JSONL with toolErrors indicator.
Use 'agent-eval-harness summarize' to derive compact views.

Exit Info (in metadata):
exitCode Process exit code (null if killed/timed out)
signal Signal that killed process (if any)
timedOut true if process was killed due to timeout

Graders:
TS/JS modules must export a 'grade' function.
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.

Examples:
# Basic capture with schema
agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl

# With TypeScript grader
agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl

# With debug mode
agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl

# With per-prompt timeout override (in prompts.jsonl):
{"id": "slow-task", "input": "...", "timeout": 180000}
`)
    return
  }

  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  if (!values.schema) {
    console.error('Error: --schema is required')
    console.error('Example: agent-eval-harness capture prompts.jsonl --schema ./claude.json')
    process.exit(1)
  }

  // Validate --timeout up front. An unparsable value would become NaN, and
  // because NaN is not nullish it would silently override the schema and
  // harness defaults inside runCapture.
  let timeout: number | undefined
  if (values.timeout) {
    timeout = Number.parseInt(values.timeout, 10)
    if (Number.isNaN(timeout) || timeout < 0) {
      console.error(`Error: --timeout must be a non-negative number of milliseconds (got "${values.timeout}")`)
      process.exit(1)
    }
  }

  // Load grader if specified
  let grader: Grader | undefined
  if (values.grader) {
    try {
      grader = await loadGrader(values.grader)
    } catch (error) {
      console.error(`Error: ${error instanceof Error ? error.message : error}`)
      process.exit(1)
    }
  }

  await runCapture({
    promptsPath,
    schemaPath: values.schema,
    outputPath: values.output,
    cwd: values.cwd,
    timeout,
    progress: values.progress ?? false,
    append: values.append ?? false,
    grader,
    debug: values.debug ?? false,
  })
}
@@ -0,0 +1,228 @@
1
+ /**
2
+ * Summarize command - derive compact views from full trajectory results.
3
+ *
4
+ * @remarks
5
+ * Transforms full trajectory JSONL into:
6
+ * - Summary JSONL: Compact format for jq analysis
7
+ * - Markdown: Human-readable format for LLM-as-judge workflows
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+
12
+ import { parseArgs } from 'node:util'
13
+ import { extractContent, extractFilePath, headTailPreview, loadResults, resolvePath } from '../core.ts'
14
+ import { HEAD_LINES, MAX_CONTENT_LENGTH, TAIL_LINES } from '../schemas/constants.ts'
15
+ import type { CaptureResult, SummaryResult } from '../schemas.ts'
16
+
17
+ // ============================================================================
18
+ // Types
19
+ // ============================================================================
20
+
21
/**
 * Options accepted by `runSummarize`.
 */
export type SummarizeConfig = {
  /** Location of the results.jsonl file to summarize */
  resultsPath: string
  /** Destination file; when omitted the formatted output is printed instead */
  outputPath?: string
  /** Produce markdown rather than summary JSONL */
  markdown?: boolean
}
30
+
31
/**
 * Reduce a full capture result to its compact summary form.
 *
 * @remarks
 * Multi-turn input is collapsed into a single newline-separated string, the
 * trajectory is reduced to the names of its `tool_call` steps (in order), and
 * the duration is derived from the recorded start and end timestamps.
 *
 * @param result - Full capture result
 * @returns Compact summary result
 *
 * @public
 */
export const formatSummary = (result: CaptureResult): SummaryResult => {
  const { id, input, output, trajectory, timing } = result

  // Gather tool names in trajectory order
  const toolCalls: string[] = []
  for (const step of trajectory) {
    if (step.type === 'tool_call') {
      toolCalls.push((step as { name: string }).name)
    }
  }

  return {
    id,
    input: Array.isArray(input) ? input.join('\n') : input,
    output,
    toolCalls,
    duration: timing.end - timing.start,
  }
}
49
+
50
/**
 * Pick the fence language tag for a tool call's content preview.
 *
 * @remarks
 * Only the last path segment is inspected, so a dot in a directory name is
 * never mistaken for an extension. A file name without any dot yields an
 * empty tag instead of the whole path.
 *
 * @param filePath - File path extracted from the tool input, if any
 * @returns Text after the last dot of the file name, `''` when the name has
 * no dot, or `'typescript'` when there is no file path at all
 */
const fenceLanguage = (filePath?: string | null): string => {
  if (filePath == null) return 'typescript'
  const fileName = filePath.split(/[\\/]/).pop() ?? ''
  const dotIndex = fileName.lastIndexOf('.')
  return dotIndex === -1 ? '' : fileName.slice(dotIndex + 1)
}

/**
 * Format capture result as markdown with step IDs.
 *
 * @remarks
 * Emits a header with the record id and input, then one numbered line per
 * `thought`, `tool_call`, `plan` and `message` step, each tagged with a
 * `<id>-step-<n>` reference. Steps of any other type are skipped and do not
 * consume a step number. Tool calls additionally list the file path and a
 * fenced content preview when the tool input carries them. Long text is
 * truncated: thoughts and messages at 100 characters, plans at 80, the final
 * output at 200. The record ends with metadata, tool-error flag, duration,
 * optional score, and a `---` separator.
 *
 * @param result - Full capture result
 * @returns Markdown formatted string
 *
 * @public
 */
export const formatMarkdown = (result: CaptureResult): string => {
  const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input
  const lines: string[] = [`## Evaluation Record: ${result.id}`, '', `**Input:** ${inputText}`, '', '**Trajectory:**']

  let stepNum = 1
  for (const step of result.trajectory) {
    const stepId = `${result.id}-step-${stepNum}`

    if (step.type === 'thought') {
      const preview = step.content.slice(0, 100)
      const truncated = step.content.length > 100 ? '...' : ''
      lines.push(`${stepNum}. [THOUGHT] ${preview}${truncated} [→${stepId}]`)
      stepNum++
    } else if (step.type === 'tool_call') {
      const duration = step.duration ? ` (${step.duration}ms)` : ''
      const filePath = extractFilePath(step.input)
      const content = extractContent(step.input)

      lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)

      // Add file path if present
      if (filePath) {
        const charCount = content?.length ?? 0
        lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
      }

      // Add head/tail preview for content-producing tools
      if (content && content.length > 0) {
        const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content, HEAD_LINES, TAIL_LINES) : content
        // Derive the syntax-highlighting tag from the file name's extension
        const ext = fenceLanguage(filePath)
        lines.push(` \`\`\`${ext}`)
        // Keep every preview line at the same indent as the fence
        lines.push(` ${preview.split('\n').join('\n ')}`)
        lines.push(' ```')
      }
      stepNum++
    } else if (step.type === 'plan') {
      const entries = step.entries as Array<{ content: string; status: string }>
      const planSummary = entries.map((e) => `${e.content}: ${e.status}`).join(', ')
      const truncated = planSummary.length > 80 ? '...' : ''
      lines.push(`${stepNum}. [PLAN] ${planSummary.slice(0, 80)}${truncated} [→${stepId}]`)
      stepNum++
    } else if (step.type === 'message') {
      const preview = step.content.slice(0, 100)
      const truncated = step.content.length > 100 ? '...' : ''
      lines.push(`${stepNum}. [MESSAGE] ${preview}${truncated} [→${stepId}]`)
      stepNum++
    }
  }

  lines.push('')
  const outputPreview = result.output.slice(0, 200)
  const outputTruncated = result.output.length > 200 ? '...' : ''
  lines.push(`**Output:** ${outputPreview}${outputTruncated}`)
  lines.push('')

  const metadataStr = Object.entries(result.metadata)
    .map(([k, v]) => `${k}=${v}`)
    .join(', ')
  lines.push(`**Metadata:** ${metadataStr}`)
  lines.push(`**Tool Errors:** ${result.toolErrors}`)
  lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)

  if (result.score) {
    lines.push(`**Score:** ${result.score.pass ? 'PASS' : 'FAIL'} (${result.score.score})`)
    if (result.score.reasoning) {
      lines.push(`**Reasoning:** ${result.score.reasoning}`)
    }
  }

  lines.push('')
  lines.push('---')
  lines.push('')

  return lines.join('\n')
}
134
+
135
+ // ============================================================================
136
+ // Summarize Implementation
137
+ // ============================================================================
138
+
139
/**
 * Load capture results and render them in the requested compact view.
 *
 * @remarks
 * Renders one markdown record per result when `markdown` is set, otherwise
 * one summary JSON line per result; records are joined with newlines. The
 * rendered text is written to `outputPath` when given and printed with
 * `console.log` otherwise.
 *
 * @param config - Summarize configuration
 * @returns Formatted output string
 */
export const runSummarize = async (config: SummarizeConfig): Promise<string> => {
  const { resultsPath, outputPath, markdown = false } = config

  const results = await loadResults(resultsPath)

  // Render every record with the selected formatter
  const rendered = markdown
    ? results.map((entry) => formatMarkdown(entry))
    : results.map((entry) => JSON.stringify(formatSummary(entry)))
  const output = rendered.join('\n')

  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(output)
    return output
  }

  await Bun.write(resolvePath(outputPath), output)
  return output
}
169
+
170
+ // ============================================================================
171
+ // CLI Entry Point
172
+ // ============================================================================
173
+
174
/**
 * CLI entry point for the summarize command.
 *
 * @remarks
 * Prints usage for `--help`, reports a missing results path on stderr and
 * exits with code 1, and otherwise hands the parsed options to `runSummarize`.
 *
 * @param args - Command line arguments (after 'summarize')
 */
export const summarize = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      markdown: { type: 'boolean', short: 'm', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })
  const { help, output: outputPath, markdown } = parsed.values

  if (help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness summarize <results.jsonl> [options]

Arguments:
results.jsonl Input file with capture results

Options:
-o, --output Output file (default: stdout)
-m, --markdown Output as markdown instead of JSONL
-h, --help Show this help message

Output Formats:
JSONL (default): Compact summary with id, input, output, toolCalls, duration
Markdown (-m): Human-readable format with step IDs for LLM-as-judge

Examples:
# Summary JSONL for jq analysis
agent-eval-harness summarize results.jsonl -o summary.jsonl

# Markdown for LLM evaluation
agent-eval-harness summarize results.jsonl --markdown -o results.md
`)
    return
  }

  // The first positional argument is the results file
  const [resultsPath] = parsed.positionals
  if (!resultsPath) {
    console.error('Error: results.jsonl path is required')
    process.exit(1)
  }

  await runSummarize({
    resultsPath,
    outputPath,
    markdown: markdown ?? false,
  })
}