@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
package/src/headless.ts
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Headless adapter factory - schema-driven adapter for any CLI agent.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Re-exports public API from the headless module. The headless adapter enables
|
|
6
|
+
* capturing trajectories from ANY headless CLI agent by defining a schema
|
|
7
|
+
* that describes how to interact with the CLI.
|
|
8
|
+
*
|
|
9
|
+
* **CLI Usage:**
|
|
10
|
+
* ```bash
|
|
11
|
+
* agent-eval-harness headless --schema ./my-agent.json
|
|
12
|
+
* ```
|
|
13
|
+
*
|
|
14
|
+
* **Programmatic Usage:**
|
|
15
|
+
* ```typescript
|
|
16
|
+
* import { parseHeadlessConfig, createSessionManager } from '@plaited/agent-eval-harness/headless'
|
|
17
|
+
*
|
|
18
|
+
* const schema = parseHeadlessConfig(jsonConfig)
|
|
19
|
+
* const sessions = createSessionManager({ schema })
|
|
20
|
+
* ```
|
|
21
|
+
*
|
|
22
|
+
* @packageDocumentation
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
// Schema definitions and parsing
|
|
26
|
+
export {
|
|
27
|
+
HeadlessAdapterSchema,
|
|
28
|
+
OutputConfigSchema,
|
|
29
|
+
OutputEventExtractSchema,
|
|
30
|
+
OutputEventMappingSchema,
|
|
31
|
+
OutputEventMatchSchema,
|
|
32
|
+
PromptConfigSchema,
|
|
33
|
+
parseHeadlessConfig,
|
|
34
|
+
ResultConfigSchema,
|
|
35
|
+
ResumeConfigSchema,
|
|
36
|
+
safeParseHeadlessConfig,
|
|
37
|
+
} from './headless/headless.schemas.ts'
|
|
38
|
+
// Types
|
|
39
|
+
export type {
|
|
40
|
+
HeadlessAdapterConfig,
|
|
41
|
+
OutputConfig,
|
|
42
|
+
OutputEventExtract,
|
|
43
|
+
OutputEventMapping,
|
|
44
|
+
OutputEventMatch,
|
|
45
|
+
PromptConfig,
|
|
46
|
+
ResultConfig,
|
|
47
|
+
ResumeConfig,
|
|
48
|
+
} from './headless/headless.types.ts'
|
|
49
|
+
// CLI entry point
|
|
50
|
+
export { headless } from './headless/headless-cli.ts'
|
|
51
|
+
export type { HistoryBuilder, HistoryBuilderConfig, HistoryTurn } from './headless/headless-history-builder.ts'
|
|
52
|
+
// History builder
|
|
53
|
+
export { createHistoryBuilder } from './headless/headless-history-builder.ts'
|
|
54
|
+
export type {
|
|
55
|
+
OutputParser,
|
|
56
|
+
ParsedResult,
|
|
57
|
+
ParsedUpdate,
|
|
58
|
+
ResultParseResult,
|
|
59
|
+
SessionUpdateType,
|
|
60
|
+
} from './headless/headless-output-parser.ts'
|
|
61
|
+
// Output parser
|
|
62
|
+
export { createOutputParser, jsonPath, jsonPathString } from './headless/headless-output-parser.ts'
|
|
63
|
+
export type {
|
|
64
|
+
ProcessExitInfo,
|
|
65
|
+
PromptResult,
|
|
66
|
+
Session,
|
|
67
|
+
SessionManager,
|
|
68
|
+
SessionManagerConfig,
|
|
69
|
+
UpdateCallback,
|
|
70
|
+
} from './headless/headless-session-manager.ts'
|
|
71
|
+
// Session manager
|
|
72
|
+
export { createSessionManager } from './headless/headless-session-manager.ts'
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Integration tests for Claude Code headless adapter.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Tests verify the headless session manager works correctly with Claude Code CLI
|
|
6
|
+
* using the schema-driven approach from `.claude/skills/headless-adapters/schemas/`.
|
|
7
|
+
*
|
|
8
|
+
* Run locally with API key:
|
|
9
|
+
* ```bash
|
|
10
|
+
* ANTHROPIC_API_KEY=sk-... bun test ./src/integration_tests/claude.spec.ts
|
|
11
|
+
* ```
|
|
12
|
+
*
|
|
13
|
+
* Prerequisites:
|
|
14
|
+
* 1. Claude CLI installed (`curl -fsSL https://claude.ai/install.sh | bash`)
|
|
15
|
+
* 2. API key: `ANTHROPIC_API_KEY` environment variable
|
|
16
|
+
*
|
|
17
|
+
* These tests make real API calls and consume credits.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
|
|
21
|
+
import { join } from 'node:path'
|
|
22
|
+
import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
23
|
+
import { createSessionManager } from '../headless/headless-session-manager.ts'
|
|
24
|
+
|
|
25
|
+
// Real agent round-trips are slow; give every test two minutes
setDefaultTimeout(120000)

// Sessions run from the project root (the directory the tests are launched from)
const PROJECT_ROOT = process.cwd()

// Headless adapter schema describing how to drive the Claude CLI
const SCHEMA_PATH = join(PROJECT_ROOT, '.claude/skills/headless-adapters/schemas/claude-headless.json')

// API key read from the environment; empty string when unset
const API_KEY = process.env.ANTHROPIC_API_KEY ?? ''

// Without a key the whole suite is registered as skipped instead of failing
const describeWithApiKey = API_KEY ? describe : describe.skip

describeWithApiKey('Claude Code Integration', () => {
  let sessionManager: ReturnType<typeof createSessionManager>
  let schemaConfig: ReturnType<typeof parseHeadlessConfig>

  beforeAll(async () => {
    // Read the raw JSON schema, then validate it through the Zod-backed parser
    const schemaJson = await Bun.file(SCHEMA_PATH).json()
    schemaConfig = parseHeadlessConfig(schemaJson)

    // One session manager is shared by every test in this suite
    sessionManager = createSessionManager({
      schema: schemaConfig,
      timeout: 120000,
      debug: false,
    })
  })

  afterAll(async () => {
    // Cleanup handled automatically by session manager
  })

  test('creates session successfully', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    expect(session).toBeDefined()
    expect(session.id).toBeDefined()
    expect(typeof session.id).toBe('string')
    expect(session.active).toBe(true)
    expect(session.cwd).toBe(PROJECT_ROOT)
  })

  test('sends prompt and receives response', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    // Simple prompt that doesn't require tools
    const result = await sessionManager.prompt(session.id, 'What is 2 + 2? Reply with just the number.')

    expect(result).toBeDefined()
    expect(result.output).toBeDefined()
    expect(result.output.length).toBeGreaterThan(0)
    expect(result.updates).toBeInstanceOf(Array)

    // Should contain "4" somewhere in the response
    expect(result.output).toMatch(/4/)
  })

  test('collects trajectory updates during execution', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)
    const collectedUpdates: unknown[] = []

    const result = await sessionManager.prompt(session.id, 'Say "hello" and nothing else.', (update) => {
      collectedUpdates.push(update)
    })

    expect(result.updates.length).toBeGreaterThan(0)

    // The callback is the thing under test here: it must have been invoked,
    // otherwise this test only re-checks the final result object
    expect(collectedUpdates.length).toBeGreaterThan(0)

    // Should have at least one message update
    const messageUpdates = result.updates.filter((u) => u.type === 'message')
    expect(messageUpdates.length).toBeGreaterThan(0)
  })

  test('uses MCP server from project config', async () => {
    // This test verifies that Claude discovers MCP servers from .mcp.json
    // The bun-docs MCP server is configured at project root
    const session = await sessionManager.create(PROJECT_ROOT)

    // Query the bun-docs MCP server (configured in .mcp.json)
    const result = await sessionManager.prompt(
      session.id,
      'Use the bun-docs MCP server to search for information about Bun.serve(). ' +
        'What are the key options for creating an HTTP server with Bun?',
    )

    // Response should contain Bun server-related information
    expect(result.output.length).toBeGreaterThan(0)
    // Should mention server/HTTP-related concepts from Bun docs
    expect(result.output.toLowerCase()).toMatch(/serve|server|http|port|fetch|handler/)
  })

  test('multi-turn conversation maintains context (stream mode)', async () => {
    // Multi-turn: multiple prompts to same session
    const session = await sessionManager.create(PROJECT_ROOT)

    // Turn 1: Establish context
    const turn1Result = await sessionManager.prompt(session.id, 'Remember this number: 42. Just confirm you have it.')
    expect(turn1Result.output).toMatch(/42|forty.?two|remember/i)

    // Turn 2: Reference previous context
    const turn2Result = await sessionManager.prompt(
      session.id,
      'What number did I ask you to remember? Reply with just the number.',
    )
    expect(turn2Result.output).toMatch(/42/)
  })

  test('receives valid trajectory updates', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    // Prompt that generates a response with trajectory updates
    const result = await sessionManager.prompt(
      session.id,
      'What programming language is this project written in? Look at the file extensions.',
    )

    // Result should have output
    expect(result.output).toBeDefined()
    expect(result.output.length).toBeGreaterThan(0)

    // Should have collected updates during execution
    expect(result.updates).toBeInstanceOf(Array)
    expect(result.updates.length).toBeGreaterThan(0)

    // All updates should have valid types
    const validTypes = ['thought', 'tool_call', 'message', 'plan']
    const allValidTypes = result.updates.every((u) => validTypes.includes(u.type))
    expect(allValidTypes).toBe(true)
  })
})
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Integration tests for Gemini CLI headless adapter.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Tests verify the headless session manager works correctly with Gemini CLI
|
|
6
|
+
* using the schema-driven approach from `.claude/skills/headless-adapters/schemas/`.
|
|
7
|
+
*
|
|
8
|
+
* Run locally with API key:
|
|
9
|
+
* ```bash
|
|
10
|
+
* GEMINI_API_KEY=... bun test ./src/integration_tests/gemini.spec.ts
|
|
11
|
+
* ```
|
|
12
|
+
*
|
|
13
|
+
* Prerequisites:
|
|
14
|
+
* 1. Gemini CLI installed (`npm install -g @google/gemini-cli`)
|
|
15
|
+
* 2. API key: `GEMINI_API_KEY` environment variable
|
|
16
|
+
*
|
|
17
|
+
* These tests make real API calls and consume credits.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
|
|
21
|
+
import { join } from 'node:path'
|
|
22
|
+
import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
23
|
+
import { createSessionManager } from '../headless/headless-session-manager.ts'
|
|
24
|
+
|
|
25
|
+
// Real agent round-trips are slow; give every test two minutes
setDefaultTimeout(120000)

// Sessions run from the project root (the directory the tests are launched from)
const PROJECT_ROOT = process.cwd()

// Headless adapter schema describing how to drive the Gemini CLI
const SCHEMA_PATH = join(PROJECT_ROOT, '.claude/skills/headless-adapters/schemas/gemini-headless.json')

// API key read from the environment; empty string when unset
const GEMINI_API_KEY = process.env.GEMINI_API_KEY ?? ''

// Without a key the whole suite is registered as skipped instead of failing
const describeWithApiKey = GEMINI_API_KEY ? describe : describe.skip

describeWithApiKey('Gemini CLI Integration', () => {
  let sessionManager: ReturnType<typeof createSessionManager>
  let schemaConfig: ReturnType<typeof parseHeadlessConfig>

  beforeAll(async () => {
    // Read the raw JSON schema, then validate it through the Zod-backed parser
    const schemaJson = await Bun.file(SCHEMA_PATH).json()
    schemaConfig = parseHeadlessConfig(schemaJson)

    // One session manager is shared by every test in this suite
    sessionManager = createSessionManager({
      schema: schemaConfig,
      timeout: 120000,
      debug: false,
    })
  })

  afterAll(async () => {
    // Cleanup handled automatically by session manager
  })

  test('creates session successfully', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    expect(session).toBeDefined()
    expect(session.id).toBeDefined()
    expect(typeof session.id).toBe('string')
    expect(session.active).toBe(true)
    expect(session.cwd).toBe(PROJECT_ROOT)
  })

  test('sends prompt and receives response', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    // Simple prompt that doesn't require tools
    const result = await sessionManager.prompt(session.id, 'What is 2 + 2? Reply with just the number.')

    expect(result).toBeDefined()
    expect(result.output).toBeDefined()
    expect(result.output.length).toBeGreaterThan(0)
    expect(result.updates).toBeInstanceOf(Array)

    // Should contain "4" somewhere in the response
    expect(result.output).toMatch(/4/)
  })

  test('collects trajectory updates during execution', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)
    const collectedUpdates: unknown[] = []

    const result = await sessionManager.prompt(session.id, 'Say "hello" and nothing else.', (update) => {
      collectedUpdates.push(update)
    })

    expect(result.updates.length).toBeGreaterThan(0)

    // The callback is the thing under test here: it must have been invoked,
    // otherwise this test only re-checks the final result object
    expect(collectedUpdates.length).toBeGreaterThan(0)

    // Should have at least one message update
    const messageUpdates = result.updates.filter((u) => u.type === 'message')
    expect(messageUpdates.length).toBeGreaterThan(0)
  })

  test('multi-turn conversation maintains context (iterative mode)', async () => {
    // Multi-turn via headless adapter in iterative mode (history accumulation)
    const session = await sessionManager.create(PROJECT_ROOT)

    // Turn 1: Establish context
    const turn1Result = await sessionManager.prompt(session.id, 'Remember this number: 42. Just confirm you have it.')
    expect(turn1Result.output).toMatch(/42|forty.?two|remember/i)

    // Turn 2: Reference previous context
    const turn2Result = await sessionManager.prompt(
      session.id,
      'What number did I ask you to remember? Reply with just the number.',
    )
    expect(turn2Result.output).toMatch(/42/)
  })

  test('handles simple math question correctly', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    const result = await sessionManager.prompt(session.id, 'Calculate 15 * 7. Reply with just the number.')

    // Gemini CLI may include formatting variations (newlines, spaces)
    // Strip whitespace to verify the correct answer is present
    expect(result.output.replace(/\s/g, '')).toContain('105')
  })

  test('processes longer response without timeout', async () => {
    const session = await sessionManager.create(PROJECT_ROOT)

    const result = await sessionManager.prompt(
      session.id,
      'List 5 programming languages and one key feature of each. Be brief.',
    )

    expect(result.output.length).toBeGreaterThan(50)
    // Should mention at least some programming languages
    expect(result.output.toLowerCase()).toMatch(/python|javascript|java|rust|go|typescript|c\+\+|ruby/)
  })
})
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline compare command - compare multiple runs of the same prompts.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Compares results from different configurations (agents, MCP servers, models)
|
|
6
|
+
* using a user-provided comparison grader that ranks the runs.
|
|
7
|
+
*
|
|
8
|
+
* Terminology: "runs" (not "agents") because comparisons can be:
|
|
9
|
+
* - Same agent, different MCP servers
|
|
10
|
+
* - Same agent, different skills enabled
|
|
11
|
+
* - Same agent, different system prompts
|
|
12
|
+
* - Same agent, different model versions
|
|
13
|
+
* - Different agents entirely
|
|
14
|
+
*
|
|
15
|
+
* @packageDocumentation
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { basename, extname, resolve } from 'node:path'
|
|
19
|
+
import { parseArgs } from 'node:util'
|
|
20
|
+
import { loadResults, logProgress, writeOutput } from '../core.ts'
|
|
21
|
+
import type { CaptureResult } from '../schemas.ts'
|
|
22
|
+
import type {
|
|
23
|
+
CompareConfig,
|
|
24
|
+
ComparisonGrader,
|
|
25
|
+
ComparisonGraderInput,
|
|
26
|
+
ComparisonResult,
|
|
27
|
+
LabeledRun,
|
|
28
|
+
} from './pipeline.types.ts'
|
|
29
|
+
|
|
30
|
+
/**
 * Load comparison grader from file.
 *
 * @remarks
 * The module is loaded with a dynamic `import()`. A relative `path` is first
 * resolved against the current working directory, because `import()` would
 * otherwise resolve it relative to this source file rather than to where the
 * command was run. Absolute paths are used unchanged.
 *
 * The first export that is a function is used, checked in this order:
 * `grade`, `default`, `compare`.
 *
 * @param path - Path to grader module (absolute, or relative to the working directory)
 * @returns Loaded comparison grader function
 * @throws Error when the module exports none of `grade`, `compare`, or `default` as a function
 */
const loadComparisonGrader = async (path: string): Promise<ComparisonGrader> => {
  // resolve() anchors relative paths to process.cwd() and leaves absolute paths untouched
  const module = await import(resolve(path))

  if (typeof module.grade === 'function') {
    return module.grade as ComparisonGrader
  }
  if (typeof module.default === 'function') {
    return module.default as ComparisonGrader
  }
  if (typeof module.compare === 'function') {
    return module.compare as ComparisonGrader
  }

  throw new Error(`Comparison grader must export 'grade', 'compare', or 'default' function`)
}
|
|
54
|
+
|
|
55
|
+
/**
 * Derive label from file path.
 *
 * @remarks
 * Takes the final path segment and removes its extension, if it has one.
 * Names without an extension (for example `results` or `.hidden`) are
 * returned whole.
 *
 * @param path - File path
 * @returns Label derived from filename without extension
 */
const labelFromPath = (path: string): string => {
  const base = basename(path)
  const ext = extname(base)
  // Guard the empty extension: slice(0, -0) is slice(0, 0) and would yield ''
  return ext.length > 0 ? base.slice(0, -ext.length) : base
}
|
|
66
|
+
|
|
67
|
+
/**
 * Parse labeled run argument.
 *
 * @remarks
 * Supports formats:
 * - "path.jsonl" - label derived from filename
 * - "label:path.jsonl" - explicit label (split at the first colon)
 *
 * An argument that starts with a drive prefix (`C:\...` or `C:/...`) is
 * treated as a plain path, so its colon is not read as a label separator.
 * Any other text before the first colon is the label, including a
 * single-character label such as `a:run.jsonl`.
 *
 * @param arg - Run argument string
 * @returns Labeled run object
 */
const parseLabeledRun = (arg: string): LabeledRun => {
  const colonIndex = arg.indexOf(':')

  // One letter, a colon, then a path separator is a Windows drive prefix, not a label
  const startsWithDrive = /^[A-Za-z]:[\\/]/.test(arg)

  if (colonIndex > 0 && !startsWithDrive) {
    return {
      label: arg.slice(0, colonIndex),
      path: arg.slice(colonIndex + 1),
    }
  }

  return {
    label: labelFromPath(arg),
    path: arg,
  }
}
|
|
94
|
+
|
|
95
|
+
/**
 * Execute pipeline compare with configuration.
 *
 * @remarks
 * Loads every run, groups results by prompt ID, and calls the comparison
 * grader once for each prompt that appears in at least two runs. Each
 * comparison is emitted as one JSON line through `writeOutput`; when
 * `outputPath` is set the file is emptied first. Progress and a per-label win
 * summary go through `logProgress`.
 *
 * The `input` and `hint` passed to the grader come from the first run (in the
 * order given) that contains the prompt. If a run file repeats a prompt ID,
 * its first occurrence is used.
 *
 * @param config - Compare configuration
 * @throws Error when fewer than two runs are given, or when two runs share a label
 */
export const runCompare = async (config: CompareConfig): Promise<void> => {
  const { runs, graderPath, outputPath, progress = false } = config

  if (runs.length < 2) {
    throw new Error('At least 2 runs required for comparison')
  }

  // Results are keyed by label, so a repeated label would silently replace an
  // earlier run. Reject it up front; the same map doubles as the win tally.
  const winCounts = new Map<string, number>()
  for (const run of runs) {
    if (winCounts.has(run.label)) {
      throw new Error(`Duplicate run label "${run.label}": give each run a unique label with "label:path"`)
    }
    winCounts.set(run.label, 0)
  }

  // Load comparison grader
  const grader = await loadComparisonGrader(graderPath)

  logProgress(`Comparing ${runs.length} runs with: ${graderPath}`, progress)
  for (const run of runs) {
    logProgress(` - ${run.label}: ${run.path}`, progress)
  }

  // Load all runs, indexing each by prompt ID so the per-prompt lookup below is constant time
  const runResults = new Map<string, Map<string, CaptureResult>>()
  const promptIds = new Set<string>()
  for (const run of runs) {
    logProgress(`Loading ${run.label}...`, progress)
    const byId = new Map<string, CaptureResult>()
    for (const result of await loadResults(run.path)) {
      // Keep the first occurrence of a repeated ID
      if (!byId.has(result.id)) {
        byId.set(result.id, result)
      }
      promptIds.add(result.id)
    }
    runResults.set(run.label, byId)
  }

  logProgress(`Comparing ${promptIds.size} prompts...`, progress)

  // Clear output file if specified
  if (outputPath) {
    await Bun.write(outputPath, '')
  }

  let comparedCount = 0

  for (const promptId of promptIds) {
    logProgress(` ${promptId}`, progress)

    // Build comparison input
    const runsData: ComparisonGraderInput['runs'] = {}
    let reference: CaptureResult | undefined
    let presentIn = 0

    for (const [label, byId] of runResults) {
      const result = byId.get(promptId)
      if (!result) continue

      runsData[label] = {
        output: result.output,
        trajectory: result.trajectory,
      }
      presentIn += 1

      // The first run containing this prompt supplies input/hint, even when that input is empty
      if (!reference) {
        reference = result
      }
    }

    // Skip if not present in at least 2 runs
    if (!reference || presentIn < 2) {
      logProgress(` Skipped (only in ${presentIn} run)`, progress)
      continue
    }

    const { input, hint } = reference

    // Apply comparison grader
    const graderInput: ComparisonGraderInput = {
      id: promptId,
      input,
      hint,
      runs: runsData,
    }

    const graderResult = await grader(graderInput)

    const comparisonResult: ComparisonResult = {
      id: promptId,
      input,
      hint,
      rankings: graderResult.rankings,
      reasoning: graderResult.reasoning,
    }

    // Log and tally the winner; winners naming an unknown run are logged but not counted
    const winner = graderResult.rankings.find((r) => r.rank === 1)
    if (winner) {
      logProgress(` Winner: ${winner.run} (${winner.score.toFixed(2)})`, progress)
      const current = winCounts.get(winner.run)
      if (current !== undefined) {
        winCounts.set(winner.run, current + 1)
      }
    }

    // Append after the first record so earlier lines are preserved
    await writeOutput(JSON.stringify(comparisonResult), outputPath, comparedCount > 0)
    comparedCount += 1
  }

  // Summary statistics
  logProgress('', progress)
  logProgress('=== Summary ===', progress)

  for (const [label, wins] of winCounts) {
    // Avoid 0/0 -> "NaN%" when no prompt was shared by two runs
    const pct = comparedCount > 0 ? ((wins / comparedCount) * 100).toFixed(1) : '0.0'
    logProgress(` ${label}: ${wins} wins (${pct}%)`, progress)
  }

  logProgress('Done!', progress)
}
|
|
224
|
+
|
|
225
|
+
/**
 * Pipeline compare command CLI handler.
 *
 * @remarks
 * Parses the flags, prints usage for `--help`, and exits with status 1 when
 * `--grader` is missing or fewer than two runs were supplied. Runs are taken
 * from positional arguments first and `--run` values second; both go through
 * `parseLabeledRun` before being handed to `runCompare`.
 *
 * @param args - Command line arguments (after 'compare')
 */
export const compare = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    allowPositionals: true,
    options: {
      run: { type: 'string', multiple: true },
      grader: { type: 'string', short: 'g' },
      output: { type: 'string', short: 'o' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
  })
  const { help, grader: graderPath, output: outputPath, progress, run: labeledArgs } = parsed.values

  if (help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness compare [files...] --grader <grader> [options]

Compare multiple runs of the same prompts.

Arguments:
files... Result files to compare (positional, unlimited)

Options:
--run Labeled run format: "label:path.jsonl" (alternative to positional)
-g, --grader Path to comparison grader (.ts/.js module) (required)
-o, --output Output file (default: stdout)
--progress Show progress to stderr
-h, --help Show this help message

Comparison Grader:
Must export 'grade' or 'compare' function with signature:
(params: ComparisonGraderInput) => Promise<ComparisonGraderResult>

Input includes all runs' results for a single prompt.
Output should rank runs from best to worst.

Examples:
# Compare multiple result files (positional)
agent-eval-harness compare run1.jsonl run2.jsonl run3.jsonl -g ./compare-grader.ts

# With explicit labels
agent-eval-harness compare \\
--run "with-bun-mcp:results-bun.jsonl" \\
--run "vanilla:results-vanilla.jsonl" \\
-g ./compare-grader.ts

# Mix positional and labeled
agent-eval-harness compare results-*.jsonl \\
--run "baseline:baseline.jsonl" \\
-g ./compare-grader.ts -o comparison.jsonl

# Typical workflow
# 1. Capture with different configs
agent-eval-harness capture prompts.jsonl -s claude.json -o vanilla.jsonl
agent-eval-harness capture prompts.jsonl -s claude-with-mcp.json -o with-mcp.jsonl

# 2. Compare results
agent-eval-harness compare vanilla.jsonl with-mcp.jsonl -g ./compare-grader.ts
`)
    return
  }

  if (!graderPath) {
    console.error('Error: --grader is required')
    process.exit(1)
  }

  // Positional file paths come first, then any --run values, in the order given
  const runArgs = [...parsed.positionals, ...(labeledArgs ?? [])]
  const runs: LabeledRun[] = runArgs.map((arg) => parseLabeledRun(arg))

  if (runs.length < 2) {
    console.error('Error: At least 2 result files required for comparison')
    console.error('Example: agent-eval-harness compare run1.jsonl run2.jsonl -g ./grader.ts')
    process.exit(1)
  }

  await runCompare({
    runs,
    graderPath,
    outputPath,
    progress,
  })
}
|