@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Headless adapter factory - schema-driven adapter for any CLI agent.
3
+ *
4
+ * @remarks
5
+ * Re-exports public API from the headless module. The headless adapter enables
6
+ * capturing trajectories from ANY headless CLI agent by defining a schema
7
+ * that describes how to interact with the CLI.
8
+ *
9
+ * **CLI Usage:**
10
+ * ```bash
11
+ * agent-eval-harness headless --schema ./my-agent.json
12
+ * ```
13
+ *
14
+ * **Programmatic Usage:**
15
+ * ```typescript
16
+ * import { parseHeadlessConfig, createSessionManager } from '@plaited/agent-eval-harness/headless'
17
+ *
18
+ * const schema = parseHeadlessConfig(jsonConfig)
19
+ * const sessions = createSessionManager({ schema })
20
+ * ```
21
+ *
22
+ * @packageDocumentation
23
+ */
24
+
25
+ // Schema definitions and parsing
26
+ export {
27
+ HeadlessAdapterSchema,
28
+ OutputConfigSchema,
29
+ OutputEventExtractSchema,
30
+ OutputEventMappingSchema,
31
+ OutputEventMatchSchema,
32
+ PromptConfigSchema,
33
+ parseHeadlessConfig,
34
+ ResultConfigSchema,
35
+ ResumeConfigSchema,
36
+ safeParseHeadlessConfig,
37
+ } from './headless/headless.schemas.ts'
38
+ // Types
39
+ export type {
40
+ HeadlessAdapterConfig,
41
+ OutputConfig,
42
+ OutputEventExtract,
43
+ OutputEventMapping,
44
+ OutputEventMatch,
45
+ PromptConfig,
46
+ ResultConfig,
47
+ ResumeConfig,
48
+ } from './headless/headless.types.ts'
49
+ // CLI entry point
50
+ export { headless } from './headless/headless-cli.ts'
51
+ export type { HistoryBuilder, HistoryBuilderConfig, HistoryTurn } from './headless/headless-history-builder.ts'
52
+ // History builder
53
+ export { createHistoryBuilder } from './headless/headless-history-builder.ts'
54
+ export type {
55
+ OutputParser,
56
+ ParsedResult,
57
+ ParsedUpdate,
58
+ ResultParseResult,
59
+ SessionUpdateType,
60
+ } from './headless/headless-output-parser.ts'
61
+ // Output parser
62
+ export { createOutputParser, jsonPath, jsonPathString } from './headless/headless-output-parser.ts'
63
+ export type {
64
+ ProcessExitInfo,
65
+ PromptResult,
66
+ Session,
67
+ SessionManager,
68
+ SessionManagerConfig,
69
+ UpdateCallback,
70
+ } from './headless/headless-session-manager.ts'
71
+ // Session manager
72
+ export { createSessionManager } from './headless/headless-session-manager.ts'
@@ -0,0 +1,157 @@
1
+ /**
2
+ * Integration tests for Claude Code headless adapter.
3
+ *
4
+ * @remarks
5
+ * Tests verify the headless session manager works correctly with Claude Code CLI
6
+ * using the schema-driven approach from `.claude/skills/headless-adapters/schemas/`.
7
+ *
8
+ * Run locally with API key:
9
+ * ```bash
10
+ * ANTHROPIC_API_KEY=sk-... bun test ./src/integration_tests/claude.spec.ts
11
+ * ```
12
+ *
13
+ * Prerequisites:
14
+ * 1. Claude CLI installed (`curl -fsSL https://claude.ai/install.sh | bash`)
15
+ * 2. API key: `ANTHROPIC_API_KEY` environment variable
16
+ *
17
+ * These tests make real API calls and consume credits.
18
+ */
19
+
20
+ import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
21
+ import { join } from 'node:path'
22
+ import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
23
+ import { createSessionManager } from '../headless/headless-session-manager.ts'
24
+
25
// Timeout (2 minutes) shared by the test runner and the session manager,
// long enough for real agent interactions
const AGENT_TIMEOUT_MS = 120000
setDefaultTimeout(AGENT_TIMEOUT_MS)

// Use project root as cwd - agents discover MCP servers from config files
const PROJECT_ROOT = process.cwd()

// Schema path for Claude headless adapter
const SCHEMA_PATH = join(PROJECT_ROOT, '.claude/skills/headless-adapters/schemas/claude-headless.json')

// Get API key from environment
const API_KEY = process.env.ANTHROPIC_API_KEY ?? ''

// Skip all tests if no API key is available
const describeWithApiKey = API_KEY ? describe : describe.skip

/**
 * Integration suite for the Claude Code headless adapter.
 *
 * Every test creates its own session in PROJECT_ROOT and sends real prompts,
 * so assertions are intentionally loose (substring / regex matches).
 */
describeWithApiKey('Claude Code Integration', () => {
  let sessionManager: ReturnType<typeof createSessionManager>
  let schemaConfig: ReturnType<typeof parseHeadlessConfig>

  beforeAll(async () => {
    // Load JSON from file, then parse with Zod schema
    const schemaJson = await Bun.file(SCHEMA_PATH).json()
    schemaConfig = parseHeadlessConfig(schemaJson)

    // Create session manager with the schema
    sessionManager = createSessionManager({
      schema: schemaConfig,
      timeout: AGENT_TIMEOUT_MS,
      debug: false,
    })
  })

  afterAll(async () => {
    // Cleanup handled automatically by session manager
  })

  test('creates session successfully', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    expect(session).toBeDefined()
    expect(session.id).toBeDefined()
    expect(typeof session.id).toBe('string')
    expect(session.active).toBe(true)
    expect(session.cwd).toBe(PROJECT_ROOT)
  })

  test('sends prompt and receives response', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    // Simple prompt that doesn't require tools
    const result = await sessionManager.prompt(session.id, 'What is 2 + 2? Reply with just the number.')

    expect(result).toBeDefined()
    expect(result.output).toBeDefined()
    expect(result.output.length).toBeGreaterThan(0)
    expect(result.updates).toBeInstanceOf(Array)

    // Should contain "4" somewhere in the response
    expect(result.output).toMatch(/4/)
  })

  test('collects trajectory updates during execution', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)
    const collectedUpdates: unknown[] = []

    const result = await sessionManager.prompt(session.id, 'Say "hello" and nothing else.', (update) => {
      collectedUpdates.push(update)
    })

    expect(result.updates.length).toBeGreaterThan(0)

    // The update callback must have fired at least once; without this check
    // the callback path is exercised but never verified
    expect(collectedUpdates.length).toBeGreaterThan(0)

    // Should have at least one message update
    const messageUpdates = result.updates.filter((u) => u.type === 'message')
    expect(messageUpdates.length).toBeGreaterThan(0)
  })

  test('uses MCP server from project config', async () => {
    // This test verifies that Claude discovers MCP servers from .mcp.json
    // The bun-docs MCP server is configured at project root
    const session = await sessionManager.create(PROJECT_ROOT)

    // Query the bun-docs MCP server (configured in .mcp.json)
    const result = await sessionManager.prompt(
      session.id,
      'Use the bun-docs MCP server to search for information about Bun.serve(). ' +
        'What are the key options for creating an HTTP server with Bun?',
    )

    // Response should contain Bun server-related information
    expect(result.output.length).toBeGreaterThan(0)
    // Should mention server/HTTP-related concepts from Bun docs
    expect(result.output.toLowerCase()).toMatch(/serve|server|http|port|fetch|handler/)
  })

  test('multi-turn conversation maintains context (stream mode)', async () => {
    // Multi-turn: multiple prompts to same session
    const session = await sessionManager.create(PROJECT_ROOT)

    // Turn 1: Establish context
    const turn1Result = await sessionManager.prompt(session.id, 'Remember this number: 42. Just confirm you have it.')
    expect(turn1Result.output).toMatch(/42|forty.?two|remember/i)

    // Turn 2: Reference previous context
    const turn2Result = await sessionManager.prompt(
      session.id,
      'What number did I ask you to remember? Reply with just the number.',
    )
    expect(turn2Result.output).toMatch(/42/)
  })

  test('receives valid trajectory updates', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    // Prompt that generates a response with trajectory updates
    const result = await sessionManager.prompt(
      session.id,
      'What programming language is this project written in? Look at the file extensions.',
    )

    // Result should have output
    expect(result.output).toBeDefined()
    expect(result.output.length).toBeGreaterThan(0)

    // Should have collected updates during execution
    expect(result.updates).toBeInstanceOf(Array)
    expect(result.updates.length).toBeGreaterThan(0)

    // All updates should have valid types
    const validTypes = ['thought', 'tool_call', 'message', 'plan']
    const allValidTypes = result.updates.every((u) => validTypes.includes(u.type))
    expect(allValidTypes).toBe(true)
  })
})
@@ -0,0 +1,139 @@
1
+ /**
2
+ * Integration tests for Gemini CLI headless adapter.
3
+ *
4
+ * @remarks
5
+ * Tests verify the headless session manager works correctly with Gemini CLI
6
+ * using the schema-driven approach from `.claude/skills/headless-adapters/schemas/`.
7
+ *
8
+ * Run locally with API key:
9
+ * ```bash
10
+ * GEMINI_API_KEY=... bun test ./src/integration_tests/gemini.spec.ts
11
+ * ```
12
+ *
13
+ * Prerequisites:
14
+ * 1. Gemini CLI installed (`npm install -g @google/gemini-cli`)
15
+ * 2. API key: `GEMINI_API_KEY` environment variable
16
+ *
17
+ * These tests make real API calls and consume credits.
18
+ */
19
+
20
+ import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
21
+ import { join } from 'node:path'
22
+ import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
23
+ import { createSessionManager } from '../headless/headless-session-manager.ts'
24
+
25
// Timeout (2 minutes) shared by the test runner and the session manager,
// long enough for real agent interactions
const AGENT_TIMEOUT_MS = 120000
setDefaultTimeout(AGENT_TIMEOUT_MS)

// Use project root as cwd - agents discover MCP servers from config files
const PROJECT_ROOT = process.cwd()

// Schema path for Gemini headless adapter
const SCHEMA_PATH = join(PROJECT_ROOT, '.claude/skills/headless-adapters/schemas/gemini-headless.json')

// Get API key from environment
const GEMINI_API_KEY = process.env.GEMINI_API_KEY ?? ''

// Skip all tests if no API key is available
const describeWithApiKey = GEMINI_API_KEY ? describe : describe.skip

/**
 * Integration suite for the Gemini CLI headless adapter.
 *
 * Every test creates its own session in PROJECT_ROOT and sends real prompts,
 * so assertions are intentionally loose (substring / regex matches).
 */
describeWithApiKey('Gemini CLI Integration', () => {
  let sessionManager: ReturnType<typeof createSessionManager>
  let schemaConfig: ReturnType<typeof parseHeadlessConfig>

  beforeAll(async () => {
    // Load JSON from file, then parse with Zod schema
    const schemaJson = await Bun.file(SCHEMA_PATH).json()
    schemaConfig = parseHeadlessConfig(schemaJson)

    // Create session manager with the schema
    sessionManager = createSessionManager({
      schema: schemaConfig,
      timeout: AGENT_TIMEOUT_MS,
      debug: false,
    })
  })

  afterAll(async () => {
    // Cleanup handled automatically by session manager
  })

  test('creates session successfully', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    expect(session).toBeDefined()
    expect(session.id).toBeDefined()
    expect(typeof session.id).toBe('string')
    expect(session.active).toBe(true)
    expect(session.cwd).toBe(PROJECT_ROOT)
  })

  test('sends prompt and receives response', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    // Simple prompt that doesn't require tools
    const result = await sessionManager.prompt(session.id, 'What is 2 + 2? Reply with just the number.')

    expect(result).toBeDefined()
    expect(result.output).toBeDefined()
    expect(result.output.length).toBeGreaterThan(0)
    expect(result.updates).toBeInstanceOf(Array)

    // Should contain "4" somewhere in the response
    expect(result.output).toMatch(/4/)
  })

  test('collects trajectory updates during execution', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)
    const collectedUpdates: unknown[] = []

    const result = await sessionManager.prompt(session.id, 'Say "hello" and nothing else.', (update) => {
      collectedUpdates.push(update)
    })

    expect(result.updates.length).toBeGreaterThan(0)

    // The update callback must have fired at least once; without this check
    // the callback path is exercised but never verified
    expect(collectedUpdates.length).toBeGreaterThan(0)

    // Should have at least one message update
    const messageUpdates = result.updates.filter((u) => u.type === 'message')
    expect(messageUpdates.length).toBeGreaterThan(0)
  })

  test('multi-turn conversation maintains context (iterative mode)', async () => {
    // Multi-turn via headless adapter in iterative mode (history accumulation)
    const session = await sessionManager.create(PROJECT_ROOT)

    // Turn 1: Establish context
    const turn1Result = await sessionManager.prompt(session.id, 'Remember this number: 42. Just confirm you have it.')
    expect(turn1Result.output).toMatch(/42|forty.?two|remember/i)

    // Turn 2: Reference previous context
    const turn2Result = await sessionManager.prompt(
      session.id,
      'What number did I ask you to remember? Reply with just the number.',
    )
    expect(turn2Result.output).toMatch(/42/)
  })

  test('handles simple math question correctly', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    const result = await sessionManager.prompt(session.id, 'Calculate 15 * 7. Reply with just the number.')

    // Gemini CLI may include formatting variations (newlines, spaces)
    // Strip whitespace to verify the correct answer is present
    expect(result.output.replace(/\s/g, '')).toContain('105')
  })

  test('processes longer response without timeout', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    const result = await sessionManager.prompt(
      session.id,
      'List 5 programming languages and one key feature of each. Be brief.',
    )

    expect(result.output.length).toBeGreaterThan(50)
    // Should mention at least some programming languages
    expect(result.output.toLowerCase()).toMatch(/python|javascript|java|rust|go|typescript|c\+\+|ruby/)
  })
})
@@ -0,0 +1,325 @@
1
+ /**
2
+ * Pipeline compare command - compare multiple runs of the same prompts.
3
+ *
4
+ * @remarks
5
+ * Compares results from different configurations (agents, MCP servers, models)
6
+ * using a user-provided comparison grader that ranks the runs.
7
+ *
8
+ * Terminology: "runs" (not "agents") because comparisons can be:
9
+ * - Same agent, different MCP servers
10
+ * - Same agent, different skills enabled
11
+ * - Same agent, different system prompts
12
+ * - Same agent, different model versions
13
+ * - Different agents entirely
14
+ *
15
+ * @packageDocumentation
16
+ */
17
+
18
import { basename, extname, resolve } from 'node:path'
import { parseArgs } from 'node:util'
import { loadResults, logProgress, writeOutput } from '../core.ts'
import type { CaptureResult } from '../schemas.ts'
import type {
  CompareConfig,
  ComparisonGrader,
  ComparisonGraderInput,
  ComparisonResult,
  LabeledRun,
} from './pipeline.types.ts'
29
+
30
/**
 * Load comparison grader from file.
 *
 * @remarks
 * The path is resolved against the current working directory before it is
 * imported. A relative specifier handed straight to `import()` is resolved
 * relative to this module's own location, so `./compare-grader.ts` would not
 * be looked up in the directory the command was run from.
 *
 * Exports are checked in the order `grade`, `default`, `compare`; the first
 * one that is a function is returned.
 *
 * @param path - Path to grader module (absolute, or relative to the cwd)
 * @returns Loaded comparison grader function
 * @throws Error if none of `grade`, `compare`, or `default` is an exported function
 */
const loadComparisonGrader = async (path: string): Promise<ComparisonGrader> => {
  // resolve() leaves absolute paths untouched and anchors relative ones at process.cwd()
  const module = await import(resolve(path))

  if (typeof module.grade === 'function') {
    return module.grade as ComparisonGrader
  }
  if (typeof module.default === 'function') {
    return module.default as ComparisonGrader
  }
  if (typeof module.compare === 'function') {
    return module.compare as ComparisonGrader
  }

  throw new Error(`Comparison grader must export 'grade', 'compare', or 'default' function`)
}
54
+
55
/**
 * Derive label from file path.
 *
 * @remarks
 * Only the final extension is removed (`archive.tar.gz` becomes
 * `archive.tar`). A name without an extension is returned unchanged.
 *
 * @param path - File path
 * @returns Label derived from filename without extension
 */
const labelFromPath = (path: string): string => {
  const base = basename(path)
  const ext = extname(base)
  // With an empty extension, slice(0, -0) is slice(0, 0) and would yield ''
  return ext ? base.slice(0, -ext.length) : base
}
66
+
67
/**
 * Turn a run argument into a `{ label, path }` pair.
 *
 * @remarks
 * Accepted forms:
 * - "path.jsonl" - the label comes from the filename
 * - "label:path.jsonl" - the text before the first colon is the label
 *
 * A colon at index 1 is taken to be a Windows drive letter (`C:\...`), so
 * the whole argument is treated as a path in that case.
 *
 * @param arg - Run argument string
 * @returns Labeled run object
 */
const parseLabeledRun = (arg: string): LabeledRun => {
  const separatorAt = arg.indexOf(':')

  // Index -1 (no colon), 0 (leading colon) and 1 (drive letter) all mean
  // "no explicit label"
  const hasExplicitLabel = separatorAt >= 2
  if (!hasExplicitLabel) {
    return { label: labelFromPath(arg), path: arg }
  }

  const label = arg.substring(0, separatorAt)
  const path = arg.substring(separatorAt + 1)
  return { label, path }
}
94
+
95
/**
 * Execute pipeline compare with configuration.
 *
 * @remarks
 * Loads every run, collects the prompt IDs seen across them, and calls the
 * comparison grader once for each prompt that is present in at least two
 * runs. Each comparison is emitted as one JSON line through `writeOutput`;
 * progress messages and the per-run win summary go through `logProgress`.
 *
 * @param config - Compare configuration
 * @throws Error if fewer than 2 runs are given, or if two runs share a label
 */
export const runCompare = async (config: CompareConfig): Promise<void> => {
  const { runs, graderPath, outputPath, progress = false } = config

  if (runs.length < 2) {
    throw new Error('At least 2 runs required for comparison')
  }

  // Results are keyed by label, so a repeated label (for example two files
  // with the same basename in different directories) would silently replace
  // an earlier run and leave nothing to compare.
  const seenLabels = new Set<string>()
  for (const run of runs) {
    if (seenLabels.has(run.label)) {
      throw new Error(`Duplicate run label "${run.label}": each run needs a unique label`)
    }
    seenLabels.add(run.label)
  }

  // Load comparison grader
  const grader = await loadComparisonGrader(graderPath)

  logProgress(`Comparing ${runs.length} runs with: ${graderPath}`, progress)
  for (const run of runs) {
    logProgress(` - ${run.label}: ${run.path}`, progress)
  }

  // Load all runs, indexing each by prompt ID so the per-prompt lookup below
  // does not rescan every result list
  const runIndex: Record<string, Map<string, CaptureResult>> = {}
  for (const run of runs) {
    logProgress(`Loading ${run.label}...`, progress)
    const byId = new Map<string, CaptureResult>()
    for (const loaded of await loadResults(run.path)) {
      // Keep the first entry for an ID, matching a first-match linear search
      if (!byId.has(loaded.id)) {
        byId.set(loaded.id, loaded)
      }
    }
    runIndex[run.label] = byId
  }

  // Collect every prompt ID that appears in any run
  const promptIds = new Set<string>()
  for (const byId of Object.values(runIndex)) {
    for (const id of byId.keys()) {
      promptIds.add(id)
    }
  }

  logProgress(`Comparing ${promptIds.size} prompts...`, progress)

  let isFirstOutput = true

  // Clear output file if specified
  if (outputPath) {
    await Bun.write(outputPath, '')
  }

  const results: ComparisonResult[] = []

  for (const promptId of promptIds) {
    logProgress(` ${promptId}`, progress)

    // Build comparison input
    const runsData: ComparisonGraderInput['runs'] = {}
    let input: string | string[] = ''
    let hint: string | undefined

    for (const [label, byId] of Object.entries(runIndex)) {
      const result = byId.get(promptId)
      if (!result) {
        continue
      }
      runsData[label] = {
        output: result.output,
        trajectory: result.trajectory,
      }
      // Use first found input/hint as the reference
      if (!input) {
        input = result.input
        hint = result.hint
      }
    }

    // Skip if not present in at least 2 runs
    const presentIn = Object.keys(runsData).length
    if (presentIn < 2) {
      logProgress(` Skipped (only in ${presentIn} run)`, progress)
      continue
    }

    // Apply comparison grader
    const graderInput: ComparisonGraderInput = {
      id: promptId,
      input,
      hint,
      runs: runsData,
    }

    const graderResult = await grader(graderInput)

    const comparisonResult: ComparisonResult = {
      id: promptId,
      input,
      hint,
      rankings: graderResult.rankings,
      reasoning: graderResult.reasoning,
    }

    results.push(comparisonResult)

    // Log winner
    const winner = graderResult.rankings.find((r) => r.rank === 1)
    if (winner) {
      logProgress(` Winner: ${winner.run} (${winner.score.toFixed(2)})`, progress)
    }

    // The first line overwrites; every later line is appended
    await writeOutput(JSON.stringify(comparisonResult), outputPath, !isFirstOutput)
    isFirstOutput = false
  }

  // Summary statistics
  logProgress('', progress)
  logProgress('=== Summary ===', progress)

  const winCounts: Record<string, number> = {}
  for (const run of runs) {
    winCounts[run.label] = 0
  }

  for (const result of results) {
    const winner = result.rankings.find((r) => r.rank === 1)
    // Ignore a winner name the grader made up that matches no known run
    if (winner && winner.run in winCounts) {
      const currentCount = winCounts[winner.run] ?? 0
      winCounts[winner.run] = currentCount + 1
    }
  }

  for (const [label, wins] of Object.entries(winCounts)) {
    // When no prompt was compared, 0 / 0 would print "NaN%"
    const pct = results.length > 0 ? ((wins / results.length) * 100).toFixed(1) : '0.0'
    logProgress(` ${label}: ${wins} wins (${pct}%)`, progress)
  }

  logProgress('Done!', progress)
}
224
+
225
/**
 * Pipeline compare command CLI handler.
 *
 * @remarks
 * Parses the arguments, prints usage for `--help`, and exits with status 1
 * when `--grader` is missing or fewer than two runs were supplied. Runs are
 * gathered from positional arguments first and `--run` flags second, then
 * passed to `runCompare`.
 *
 * @param args - Command line arguments (after 'compare')
 */
export const compare = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      run: { type: 'string', multiple: true },
      grader: { type: 'string', short: 'g' },
      output: { type: 'string', short: 'o' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness compare [files...] --grader <grader> [options]

Compare multiple runs of the same prompts.

Arguments:
files... Result files to compare (positional, unlimited)

Options:
--run Labeled run format: "label:path.jsonl" (alternative to positional)
-g, --grader Path to comparison grader (.ts/.js module) (required)
-o, --output Output file (default: stdout)
--progress Show progress to stderr
-h, --help Show this help message

Comparison Grader:
Must export a 'grade', 'compare', or default function with signature:
(params: ComparisonGraderInput) => Promise<ComparisonGraderResult>

Input includes all runs' results for a single prompt.
Output should rank runs from best to worst.

Examples:
# Compare multiple result files (positional)
agent-eval-harness compare run1.jsonl run2.jsonl run3.jsonl -g ./compare-grader.ts

# With explicit labels
agent-eval-harness compare \\
--run "with-bun-mcp:results-bun.jsonl" \\
--run "vanilla:results-vanilla.jsonl" \\
-g ./compare-grader.ts

# Mix positional and labeled
agent-eval-harness compare results-*.jsonl \\
--run "baseline:baseline.jsonl" \\
-g ./compare-grader.ts -o comparison.jsonl

# Typical workflow
# 1. Capture with different configs
agent-eval-harness capture prompts.jsonl -s claude.json -o vanilla.jsonl
agent-eval-harness capture prompts.jsonl -s claude-with-mcp.json -o with-mcp.jsonl

# 2. Compare results
agent-eval-harness compare vanilla.jsonl with-mcp.jsonl -g ./compare-grader.ts
`)
    return
  }

  if (!values.grader) {
    console.error('Error: --grader is required')
    process.exit(1)
  }

  // Collect runs: positional file paths first, then --run flags, so labels
  // keep the order the user typed within each group
  const runArgs = [...positionals, ...(values.run ?? [])]
  const runs: LabeledRun[] = runArgs.map((arg) => parseLabeledRun(arg))

  if (runs.length < 2) {
    console.error('Error: At least 2 result files required for comparison')
    console.error('Example: agent-eval-harness compare run1.jsonl run2.jsonl -g ./grader.ts')
    process.exit(1)
  }

  await runCompare({
    runs,
    graderPath: values.grader,
    outputPath: values.output,
    progress: values.progress,
  })
}