@plaited/agent-eval-harness 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,245 @@
1
+ /**
2
+ * Shared execution utilities for capture and trials commands.
3
+ *
4
+ * @remarks
5
+ * Extracts common setup logic: schema loading, prompt loading, path resolution,
6
+ * session manager creation, output initialization, and worker pool execution.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { mkdir } from 'node:fs/promises'
12
+ import { createWriteMutex, loadPrompts, logProgress, resolvePath, runWorkerPool, writeOutput } from '../core.ts'
13
+ import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
14
+ import { createSessionManager, type SessionManager } from '../headless/headless-session-manager.ts'
15
+ import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
16
+ import type { Grader, PromptCase } from '../schemas.ts'
17
+
18
+ // ============================================================================
19
+ // Types
20
+ // ============================================================================
21
+
22
/**
 * Options common to the capture and trials commands.
 *
 * @remarks
 * Prompts are supplied either as a file (`promptsPath`) or as an in-memory
 * list (`prompts`). `prepareExecution` throws when neither is present and
 * prefers `prompts` when both are.
 */
export type BaseExecutionConfig = {
  /** Location of the prompts.jsonl file; may be omitted when `prompts` is set */
  promptsPath?: string
  /** Location of the agent schema JSON file */
  schemaPath: string
  /** Prompt cases already loaded (e.g. from stdin); when set, no file is read */
  prompts?: PromptCase[]
  /** File to write results to; leave undefined to write to stdout */
  outputPath?: string
  /** Working directory handed to the agent */
  cwd?: string
  /** Per-prompt timeout in milliseconds; takes precedence over the schema's value */
  timeout?: number
  /** Emit progress messages to stderr */
  progress?: boolean
  /** Keep existing output file content and add to it rather than overwriting */
  append?: boolean
  /** Grader function, if results should be graded */
  grader?: Grader
  /** Turn on debug mode */
  debug?: boolean
  /** How many workers run concurrently (1, the default, means sequential) */
  concurrency?: number
  /** Root directory under which per-prompt isolated workspaces live */
  workspaceDir?: string
}
49
+
50
/**
 * Fully resolved run state produced by `prepareExecution`.
 *
 * @remarks
 * Optional inputs from `BaseExecutionConfig` that have defaults
 * (`progress`, `debug`, `concurrency`) are always populated here.
 */
export type ExecutionContext = {
  /** Headless adapter schema after parsing and validation */
  schema: HeadlessAdapterConfig
  /** Prompt cases to execute */
  prompts: PromptCase[]
  /** Manager used to create and destroy agent sessions */
  sessions: SessionManager
  /** Resolved output file path; undefined means results go to stdout */
  resolvedOutputPath?: string
  /** Resolved workspace base directory, when one was configured */
  resolvedWorkspaceDir?: string
  /** Timeout actually in force, in milliseconds */
  effectiveTimeout: number
  /** Working directory agent sessions use by default */
  defaultWorkingDir: string
  /** Worker count for concurrent execution */
  concurrency: number
  /** True when progress output should be shown */
  progress: boolean
  /** Grader function, if one was supplied */
  grader?: Grader
  /** True when debug mode is on */
  debug: boolean
  /** Serializes a result to one JSON line and writes it under a write mutex */
  writeResult: (result: unknown) => Promise<void>
}
77
+
78
+ // ============================================================================
79
+ // Execution Setup
80
+ // ============================================================================
81
+
82
/**
 * Prepare execution context from base configuration.
 *
 * @remarks
 * Performs the shared setup in this order: prompt-source validation, schema
 * loading and parsing, prompt loading, path resolution, timeout selection,
 * session manager creation, output file truncation (skipped when appending),
 * workspace directory creation, and construction of a mutex-guarded result
 * writer.
 *
 * @param config - Base execution configuration
 * @returns Prepared execution context
 * @throws Error if neither `prompts` nor `promptsPath` is provided, if the
 * schema file does not exist, or if it cannot be read and parsed as a schema
 *
 * @public
 */
export const prepareExecution = async (config: BaseExecutionConfig): Promise<ExecutionContext> => {
  const {
    promptsPath,
    schemaPath,
    outputPath,
    cwd,
    timeout,
    progress = false,
    append = false,
    grader,
    debug = false,
    concurrency = 1,
    workspaceDir,
  } = config

  // Validate prompt source up front; pre-loaded prompts take precedence over the file path
  const promptSource = config.prompts ?? promptsPath
  if (!promptSource) {
    throw new Error('Either promptsPath or prompts must be provided')
  }

  // Load and validate schema
  const schemaFile = Bun.file(schemaPath)
  if (!(await schemaFile.exists())) {
    throw new Error(`Schema file not found: ${schemaPath}`)
  }

  let schema: HeadlessAdapterConfig
  try {
    const rawSchema = await schemaFile.json()
    schema = parseHeadlessConfig(rawSchema)
  } catch (error) {
    // Both JSON read failures and schema validation failures surface as "Invalid schema"
    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
  }

  // Load prompts: a string source is a file path, anything else is the pre-loaded list
  const prompts = typeof promptSource === 'string' ? await loadPrompts(promptSource) : promptSource

  // Resolve paths
  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
  const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined

  // Determine effective timeout (CLI flag > schema default > harness default)
  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT

  // Create session manager
  const sessions = createSessionManager({
    schema,
    timeout: effectiveTimeout,
    verbose: progress,
    debug,
  })

  // Initialize output file (clear if not appending)
  if (resolvedOutputPath && !append) {
    await Bun.write(resolvedOutputPath, '')
  }

  // Create workspace base directory if specified
  if (resolvedWorkspaceDir) {
    await mkdir(resolvedWorkspaceDir, { recursive: true })
  }

  const defaultWorkingDir = cwd ?? process.cwd()

  // Create write mutex with closure for coordinated result writing
  const writeMutex = createWriteMutex()
  let isFirstOutput = true

  const writeResult = async (result: unknown): Promise<void> => {
    await writeMutex.write(async () => {
      const formatted = JSON.stringify(result)
      // In append mode every write, including the first, must append; otherwise
      // the first result would replace the content the caller asked to keep.
      // NOTE(review): assumes writeOutput's third argument is an append flag,
      // as the `!isFirstOutput` usage implies; confirm against core.ts.
      await writeOutput(formatted, resolvedOutputPath, append || !isFirstOutput)
      isFirstOutput = false
    })
  }

  return {
    schema,
    prompts,
    sessions,
    resolvedOutputPath,
    resolvedWorkspaceDir,
    effectiveTimeout,
    defaultWorkingDir,
    concurrency,
    progress,
    grader,
    debug,
    writeResult,
  }
}
188
+
189
+ // ============================================================================
190
+ // Worker Pool Execution
191
+ // ============================================================================
192
+
193
/**
 * Run every prompt in the context through the worker pool, logging progress.
 *
 * @remarks
 * Shared by capture and trials. Reports a progress line via the pool's
 * `onProgress` callback, a summary line when the pool reported any errors,
 * and a final completion line. All logging goes through `logProgress` with
 * the context's `progress` flag.
 *
 * @param ctx - Execution context from prepareExecution
 * @param processFn - Function invoked for each prompt case with its index
 * @returns The `results` array returned by the worker pool
 *
 * @public
 */
export const executePrompts = async <T>(
  ctx: ExecutionContext,
  processFn: (promptCase: PromptCase, index: number) => Promise<T>,
): Promise<T[]> => {
  const poolOutcome = await runWorkerPool(ctx.prompts, processFn, {
    concurrency: ctx.concurrency,
    onProgress: (completed, total) => {
      const message = `Progress: ${completed}/${total} prompts completed`
      logProgress(message, ctx.progress)
    },
  })

  // Only the number of failures is reported here, not their details
  const failureCount = poolOutcome.errors.length
  if (failureCount > 0) {
    logProgress(`Completed with ${failureCount} error(s)`, ctx.progress)
  }

  logProgress('Done!', ctx.progress)
  return poolOutcome.results
}
224
+
225
+ // ============================================================================
226
+ // CLI Helpers
227
+ // ============================================================================
228
+
229
/**
 * Parse and validate concurrency CLI argument.
 *
 * @remarks
 * The whole string must be a base-10 integer (an optional leading `+` and
 * surrounding whitespace are tolerated). Values such as `4abc`, `2.5` or
 * `1e3` are rejected rather than silently truncated, and the result must be
 * a safe integer of at least 1. On invalid input an error is printed to
 * stderr and the process exits with code 1.
 *
 * @param value - Raw string value from parseArgs
 * @returns Validated positive integer (default: 1 when the value is missing or empty)
 *
 * @public
 */
export const parseConcurrency = (value: string | undefined): number => {
  if (!value) return 1

  // Number.parseInt stops at the first non-digit, so validate the full string first
  const trimmed = value.trim()
  const parsed = /^\+?\d+$/.test(trimmed) ? Number(trimmed) : Number.NaN

  if (!Number.isSafeInteger(parsed) || parsed < 1) {
    console.error('Error: --concurrency must be a positive integer')
    process.exit(1)
  }
  return parsed
}
@@ -117,10 +117,14 @@ describe('runCapture configuration', () => {
117
117
  progress: true,
118
118
  append: false,
119
119
  debug: false,
120
+ concurrency: 4,
121
+ workspaceDir: '/tmp/workspaces',
120
122
  }
121
123
 
122
124
  expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
123
125
  expect(config.schemaPath).toBe('./schemas/claude-headless.json')
126
+ expect(config.concurrency).toBe(4)
127
+ expect(config.workspaceDir).toBe('/tmp/workspaces')
124
128
  })
125
129
 
126
130
  test('CaptureConfig allows minimal configuration', () => {
@@ -135,6 +139,18 @@ describe('runCapture configuration', () => {
135
139
  expect(config.progress).toBeUndefined()
136
140
  expect(config.append).toBeUndefined()
137
141
  expect(config.grader).toBeUndefined()
142
+ expect(config.concurrency).toBeUndefined()
143
+ expect(config.workspaceDir).toBeUndefined()
144
+ })
145
+
146
+ test('CaptureConfig accepts prompts without promptsPath', () => {
147
+ const config: CaptureConfig = {
148
+ schemaPath: './test-schema.json',
149
+ prompts: [{ id: 't1', input: 'hello' }],
150
+ }
151
+
152
+ expect(config.promptsPath).toBeUndefined()
153
+ expect(config.prompts).toHaveLength(1)
138
154
  })
139
155
  })
140
156
 
@@ -160,6 +176,25 @@ describe('capture CLI', () => {
160
176
  expect(stdout).toContain('--progress')
161
177
  expect(stdout).toContain('-g, --grader')
162
178
  expect(stdout).toContain('-s, --schema')
179
+ expect(stdout).toContain('-j, --concurrency')
180
+ expect(stdout).toContain('--workspace-dir')
181
+ expect(stdout).toContain('--stdin')
182
+ })
183
+
184
+ test('shows error for --stdin with positional file', async () => {
185
+ const proc = Bun.spawn(
186
+ ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
187
+ {
188
+ stdout: 'pipe',
189
+ stderr: 'pipe',
190
+ },
191
+ )
192
+
193
+ const stderr = await new Response(proc.stderr).text()
194
+ const exitCode = await proc.exited
195
+
196
+ expect(exitCode).not.toBe(0)
197
+ expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
163
198
  })
164
199
 
165
200
  test('shows error for missing prompts file argument', async () => {
@@ -187,4 +222,53 @@ describe('capture CLI', () => {
187
222
  expect(exitCode).not.toBe(0)
188
223
  expect(stderr).toContain('--schema is required')
189
224
  })
225
+
226
+ test('shows error for invalid concurrency value', async () => {
227
+ const proc = Bun.spawn(
228
+ ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
229
+ {
230
+ stdout: 'pipe',
231
+ stderr: 'pipe',
232
+ },
233
+ )
234
+
235
+ const stderr = await new Response(proc.stderr).text()
236
+ const exitCode = await proc.exited
237
+
238
+ expect(exitCode).not.toBe(0)
239
+ expect(stderr).toContain('--concurrency must be a positive integer')
240
+ })
241
+
242
+ test('shows error for zero concurrency', async () => {
243
+ const proc = Bun.spawn(
244
+ ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
245
+ {
246
+ stdout: 'pipe',
247
+ stderr: 'pipe',
248
+ },
249
+ )
250
+
251
+ const stderr = await new Response(proc.stderr).text()
252
+ const exitCode = await proc.exited
253
+
254
+ expect(exitCode).not.toBe(0)
255
+ expect(stderr).toContain('--concurrency must be a positive integer')
256
+ })
257
+
258
+ test('shows error for negative concurrency', async () => {
259
+ // Note: Using --concurrency=-1 format because -j -1 is ambiguous to parseArgs
260
+ const proc = Bun.spawn(
261
+ ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '--concurrency=-1'],
262
+ {
263
+ stdout: 'pipe',
264
+ stderr: 'pipe',
265
+ },
266
+ )
267
+
268
+ const stderr = await new Response(proc.stderr).text()
269
+ const exitCode = await proc.exited
270
+
271
+ expect(exitCode).not.toBe(0)
272
+ expect(stderr).toContain('--concurrency must be a positive integer')
273
+ })
190
274
  })
@@ -17,11 +17,15 @@ describe('TrialsConfig configuration', () => {
17
17
  progress: true,
18
18
  append: false,
19
19
  debug: false,
20
+ concurrency: 4,
21
+ workspaceDir: '/tmp/workspaces',
20
22
  }
21
23
 
22
24
  expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
23
25
  expect(config.schemaPath).toBe('./schemas/claude-headless.json')
24
26
  expect(config.k).toBe(5)
27
+ expect(config.concurrency).toBe(4)
28
+ expect(config.workspaceDir).toBe('/tmp/workspaces')
25
29
  })
26
30
 
27
31
  test('TrialsConfig allows minimal configuration', () => {
@@ -37,6 +41,19 @@ describe('TrialsConfig configuration', () => {
37
41
  expect(config.progress).toBeUndefined()
38
42
  expect(config.append).toBeUndefined()
39
43
  expect(config.grader).toBeUndefined()
44
+ expect(config.concurrency).toBeUndefined()
45
+ expect(config.workspaceDir).toBeUndefined()
46
+ })
47
+
48
+ test('TrialsConfig accepts prompts without promptsPath', () => {
49
+ const config: TrialsConfig = {
50
+ schemaPath: './test-schema.json',
51
+ k: 3,
52
+ prompts: [{ id: 't1', input: 'hello' }],
53
+ }
54
+
55
+ expect(config.promptsPath).toBeUndefined()
56
+ expect(config.prompts).toHaveLength(1)
40
57
  })
41
58
  })
42
59
 
@@ -64,6 +81,25 @@ describe('trials CLI', () => {
64
81
  expect(stdout).toContain('-g, --grader')
65
82
  expect(stdout).toContain('-s, --schema')
66
83
  expect(stdout).toContain('pass@k')
84
+ expect(stdout).toContain('-j, --concurrency')
85
+ expect(stdout).toContain('--workspace-dir')
86
+ expect(stdout).toContain('--stdin')
87
+ })
88
+
89
+ test('shows error for --stdin with positional file', async () => {
90
+ const proc = Bun.spawn(
91
+ ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
92
+ {
93
+ stdout: 'pipe',
94
+ stderr: 'pipe',
95
+ },
96
+ )
97
+
98
+ const stderr = await new Response(proc.stderr).text()
99
+ const exitCode = await proc.exited
100
+
101
+ expect(exitCode).not.toBe(0)
102
+ expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
67
103
  })
68
104
 
69
105
  test('shows error for missing prompts file argument', async () => {
@@ -91,6 +127,38 @@ describe('trials CLI', () => {
91
127
  expect(exitCode).not.toBe(0)
92
128
  expect(stderr).toContain('--schema is required')
93
129
  })
130
+
131
+ test('shows error for invalid concurrency value', async () => {
132
+ const proc = Bun.spawn(
133
+ ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
134
+ {
135
+ stdout: 'pipe',
136
+ stderr: 'pipe',
137
+ },
138
+ )
139
+
140
+ const stderr = await new Response(proc.stderr).text()
141
+ const exitCode = await proc.exited
142
+
143
+ expect(exitCode).not.toBe(0)
144
+ expect(stderr).toContain('--concurrency must be a positive integer')
145
+ })
146
+
147
+ test('shows error for zero concurrency', async () => {
148
+ const proc = Bun.spawn(
149
+ ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
150
+ {
151
+ stdout: 'pipe',
152
+ stderr: 'pipe',
153
+ },
154
+ )
155
+
156
+ const stderr = await new Response(proc.stderr).text()
157
+ const exitCode = await proc.exited
158
+
159
+ expect(exitCode).not.toBe(0)
160
+ expect(stderr).toContain('--concurrency must be a positive integer')
161
+ })
94
162
  })
95
163
 
96
164
  // ============================================================================