@plaited/agent-eval-harness 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -0
- package/package.json +1 -1
- package/src/commands/capture.ts +101 -26
- package/src/commands/tests/capture-cli.spec.ts +57 -0
- package/src/commands/tests/trials-cli.spec.ts +40 -0
- package/src/commands/trials.ts +111 -28
- package/src/core/core.ts +18 -0
- package/src/core/loading.ts +15 -19
- package/src/core/streaming.ts +172 -0
- package/src/core/tests/streaming.spec.ts +399 -0
- package/src/core/tests/worker-pool.spec.ts +377 -0
- package/src/core/worker-pool.ts +220 -0
- package/src/core.ts +14 -0
- package/src/schemas/schemas.ts +2 -0
package/README.md
CHANGED
|
@@ -58,11 +58,21 @@ bunx @plaited/agent-eval-harness capture prompts.jsonl \
|
|
|
58
58
|
--schema ./schemas/claude-headless.json \
|
|
59
59
|
-o results.jsonl
|
|
60
60
|
|
|
61
|
+
# Parallel capture (4x faster with 4 workers)
|
|
62
|
+
bunx @plaited/agent-eval-harness capture prompts.jsonl \
|
|
63
|
+
--schema ./schemas/claude-headless.json \
|
|
64
|
+
-j 4 -o results.jsonl
|
|
65
|
+
|
|
61
66
|
# Run trials for pass@k analysis with debug mode
|
|
62
67
|
bunx @plaited/agent-eval-harness trials prompts.jsonl \
|
|
63
68
|
--schema ./schemas/claude-headless.json \
|
|
64
69
|
-k 5 --grader ./grader.ts --debug
|
|
65
70
|
|
|
71
|
+
# Parallel trials (4 prompts running trials concurrently)
|
|
72
|
+
bunx @plaited/agent-eval-harness trials prompts.jsonl \
|
|
73
|
+
--schema ./schemas/claude-headless.json \
|
|
74
|
+
-k 5 -j 4 --workspace-dir ./workspaces -o trials.jsonl
|
|
75
|
+
|
|
66
76
|
# Summarize results
|
|
67
77
|
bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl
|
|
68
78
|
|
package/package.json
CHANGED
package/src/commands/capture.ts
CHANGED
|
@@ -11,8 +11,11 @@
|
|
|
11
11
|
* @packageDocumentation
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
+
import { mkdir } from 'node:fs/promises'
|
|
14
15
|
import { parseArgs } from 'node:util'
|
|
15
16
|
import {
|
|
17
|
+
createWorkspaceDir,
|
|
18
|
+
createWriteMutex,
|
|
16
19
|
detectTrajectoryRichness,
|
|
17
20
|
extractOutput,
|
|
18
21
|
extractTrajectory,
|
|
@@ -21,6 +24,7 @@ import {
|
|
|
21
24
|
loadPrompts,
|
|
22
25
|
logProgress,
|
|
23
26
|
resolvePath,
|
|
27
|
+
runWorkerPool,
|
|
24
28
|
writeOutput,
|
|
25
29
|
} from '../core.ts'
|
|
26
30
|
import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
@@ -70,6 +74,10 @@ export type CaptureConfig = {
|
|
|
70
74
|
grader?: Grader
|
|
71
75
|
/** Enable debug mode for detailed output */
|
|
72
76
|
debug?: boolean
|
|
77
|
+
/** Number of concurrent workers (default: 1 for sequential) */
|
|
78
|
+
concurrency?: number
|
|
79
|
+
/** Base directory for per-prompt workspace isolation */
|
|
80
|
+
workspaceDir?: string
|
|
73
81
|
}
|
|
74
82
|
|
|
75
83
|
// ============================================================================
|
|
@@ -97,6 +105,8 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
97
105
|
append = false,
|
|
98
106
|
grader,
|
|
99
107
|
debug = false,
|
|
108
|
+
concurrency = 1,
|
|
109
|
+
workspaceDir,
|
|
100
110
|
} = config
|
|
101
111
|
|
|
102
112
|
// Load and validate schema
|
|
@@ -116,8 +126,9 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
116
126
|
// Load prompts
|
|
117
127
|
const prompts = await loadPrompts(promptsPath)
|
|
118
128
|
|
|
119
|
-
// Resolve
|
|
129
|
+
// Resolve paths
|
|
120
130
|
const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
|
|
131
|
+
const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
|
|
121
132
|
|
|
122
133
|
// Determine effective timeout (CLI flag > schema default > harness default)
|
|
123
134
|
const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
|
|
@@ -127,6 +138,12 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
127
138
|
logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
|
|
128
139
|
logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
|
|
129
140
|
logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
|
|
141
|
+
if (concurrency > 1) {
|
|
142
|
+
logProgress(`Concurrency: ${concurrency} workers`, progress)
|
|
143
|
+
}
|
|
144
|
+
if (resolvedWorkspaceDir) {
|
|
145
|
+
logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
|
|
146
|
+
}
|
|
130
147
|
if (resolvedOutputPath) {
|
|
131
148
|
logProgress(`Output: ${resolvedOutputPath}`, progress)
|
|
132
149
|
}
|
|
@@ -147,24 +164,36 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
147
164
|
await Bun.write(resolvedOutputPath, '')
|
|
148
165
|
}
|
|
149
166
|
|
|
150
|
-
|
|
151
|
-
|
|
167
|
+
// Create workspace base directory if specified
|
|
168
|
+
// Uses fs.mkdir instead of shell to prevent command injection
|
|
169
|
+
if (resolvedWorkspaceDir) {
|
|
170
|
+
await mkdir(resolvedWorkspaceDir, { recursive: true })
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
const defaultWorkingDir = cwd ?? process.cwd()
|
|
174
|
+
|
|
175
|
+
// Create write mutex for coordinating JSONL output
|
|
176
|
+
const writeMutex = createWriteMutex()
|
|
152
177
|
let isFirstOutput = true
|
|
153
178
|
|
|
154
|
-
//
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
179
|
+
// Process a single prompt (used by worker pool)
|
|
180
|
+
const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
|
|
181
|
+
// Determine working directory (per-prompt workspace or default)
|
|
182
|
+
const workingDir = resolvedWorkspaceDir
|
|
183
|
+
? await createWorkspaceDir(resolvedWorkspaceDir, promptCase.id)
|
|
184
|
+
: defaultWorkingDir
|
|
158
185
|
|
|
159
|
-
logProgress(`[${
|
|
186
|
+
logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
|
|
160
187
|
|
|
161
188
|
const startTime = Date.now()
|
|
162
189
|
let result: CaptureResult
|
|
190
|
+
let sessionId: string | undefined
|
|
163
191
|
|
|
164
192
|
try {
|
|
165
193
|
// Create fresh session for each entry (ensures isolation)
|
|
166
194
|
const sessionStart = Date.now()
|
|
167
195
|
const session = await sessions.create(workingDir)
|
|
196
|
+
sessionId = session.id
|
|
168
197
|
const sessionCreation = Date.now() - sessionStart
|
|
169
198
|
logProgress(` Session: ${session.id}`, progress)
|
|
170
199
|
|
|
@@ -177,9 +206,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
177
206
|
let lastExitInfo: ProcessExitInfo | undefined
|
|
178
207
|
let lastOutput = ''
|
|
179
208
|
|
|
180
|
-
// TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
|
|
181
|
-
// The session manager would need to accept timeout per-call to support this
|
|
182
|
-
|
|
183
209
|
// Execute each turn sequentially in the same session
|
|
184
210
|
for (const turnInput of inputs) {
|
|
185
211
|
const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
|
|
@@ -198,7 +224,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
198
224
|
|
|
199
225
|
result = {
|
|
200
226
|
id: promptCase.id,
|
|
201
|
-
input: promptCase.input,
|
|
227
|
+
input: promptCase.input,
|
|
202
228
|
output,
|
|
203
229
|
...(promptCase.hint && { hint: promptCase.hint }),
|
|
204
230
|
trajectory,
|
|
@@ -207,6 +233,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
207
233
|
agent: schema.name,
|
|
208
234
|
trajectoryRichness,
|
|
209
235
|
turnCount,
|
|
236
|
+
...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
|
|
210
237
|
...(lastExitInfo && {
|
|
211
238
|
exitCode: lastExitInfo.exitCode,
|
|
212
239
|
signal: lastExitInfo.signal,
|
|
@@ -236,14 +263,10 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
236
263
|
|
|
237
264
|
result.score = graderResult
|
|
238
265
|
|
|
239
|
-
// Merge outcome from grader if present
|
|
240
266
|
if (graderResult.outcome) {
|
|
241
267
|
result.outcome = graderResult.outcome
|
|
242
268
|
}
|
|
243
269
|
}
|
|
244
|
-
|
|
245
|
-
// Clean up session
|
|
246
|
-
sessions.destroy(session.id)
|
|
247
270
|
} catch (error) {
|
|
248
271
|
const endTime = Date.now()
|
|
249
272
|
const message = error instanceof Error ? error.message : String(error)
|
|
@@ -259,6 +282,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
259
282
|
agent: schema.name,
|
|
260
283
|
trajectoryRichness: 'minimal' as TrajectoryRichness,
|
|
261
284
|
turnCount: inputs.length,
|
|
285
|
+
...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
|
|
262
286
|
},
|
|
263
287
|
timing: {
|
|
264
288
|
start: startTime,
|
|
@@ -269,14 +293,19 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
269
293
|
toolErrors: true,
|
|
270
294
|
errors: [message],
|
|
271
295
|
}
|
|
296
|
+
} finally {
|
|
297
|
+
// Always clean up session if it was created
|
|
298
|
+
if (sessionId) {
|
|
299
|
+
sessions.destroy(sessionId)
|
|
300
|
+
}
|
|
272
301
|
}
|
|
273
302
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
303
|
+
// Write result immediately (coordinated via mutex for concurrent writes)
|
|
304
|
+
await writeMutex.write(async () => {
|
|
305
|
+
const formatted = JSON.stringify(result)
|
|
306
|
+
await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
|
|
307
|
+
isFirstOutput = false
|
|
308
|
+
})
|
|
280
309
|
|
|
281
310
|
const statusIcon = result.toolErrors ? '!' : '✓'
|
|
282
311
|
const exitInfo = result.metadata?.timedOut
|
|
@@ -284,7 +313,22 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
284
313
|
: result.metadata?.exitCode && result.metadata.exitCode !== 0
|
|
285
314
|
? ` - exit ${result.metadata.exitCode}`
|
|
286
315
|
: ''
|
|
287
|
-
logProgress(` ${statusIcon} (${result.timing.total}ms)${exitInfo}`, progress)
|
|
316
|
+
logProgress(` ${statusIcon} ${promptCase.id} (${result.timing.total}ms)${exitInfo}`, progress)
|
|
317
|
+
|
|
318
|
+
return result
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
// Run with worker pool
|
|
322
|
+
const { results, errors } = await runWorkerPool(prompts, processPrompt, {
|
|
323
|
+
concurrency,
|
|
324
|
+
onProgress: (completed, total) => {
|
|
325
|
+
logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
|
|
326
|
+
},
|
|
327
|
+
})
|
|
328
|
+
|
|
329
|
+
// Log any errors that occurred
|
|
330
|
+
if (errors.length > 0) {
|
|
331
|
+
logProgress(`Completed with ${errors.length} error(s)`, progress)
|
|
288
332
|
}
|
|
289
333
|
|
|
290
334
|
logProgress('Done!', progress)
|
|
@@ -312,6 +356,8 @@ export const capture = async (args: string[]): Promise<void> => {
|
|
|
312
356
|
append: { type: 'boolean', default: false },
|
|
313
357
|
grader: { type: 'string', short: 'g' },
|
|
314
358
|
debug: { type: 'boolean', default: false },
|
|
359
|
+
concurrency: { type: 'string', short: 'j' },
|
|
360
|
+
'workspace-dir': { type: 'string' },
|
|
315
361
|
help: { type: 'boolean', short: 'h' },
|
|
316
362
|
},
|
|
317
363
|
allowPositionals: true,
|
|
@@ -329,6 +375,8 @@ Options:
|
|
|
329
375
|
-o, --output Output file (default: stdout)
|
|
330
376
|
-c, --cwd Working directory for agent
|
|
331
377
|
-t, --timeout Request timeout in ms (overrides schema default)
|
|
378
|
+
-j, --concurrency Number of concurrent workers (default: 1)
|
|
379
|
+
--workspace-dir Base directory for per-prompt workspace isolation
|
|
332
380
|
--progress Show progress to stderr
|
|
333
381
|
--append Append to output file instead of overwriting
|
|
334
382
|
-g, --grader Path to grader (.ts/.js module or executable script)
|
|
@@ -348,18 +396,32 @@ Graders:
|
|
|
348
396
|
TS/JS modules must export a 'grade' function.
|
|
349
397
|
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
|
|
350
398
|
|
|
399
|
+
Parallelization:
|
|
400
|
+
Use -j/--concurrency to run multiple prompts in parallel.
|
|
401
|
+
Each prompt gets its own agent session for isolation.
|
|
402
|
+
Results are written as they complete (order may differ from input).
|
|
403
|
+
|
|
404
|
+
Workspace Isolation:
|
|
405
|
+
Use --workspace-dir to create per-prompt directories.
|
|
406
|
+
Each prompt runs in {workspace-dir}/prompt-{id}/.
|
|
407
|
+
Useful for code generation tasks requiring filesystem isolation.
|
|
408
|
+
|
|
351
409
|
Examples:
|
|
352
410
|
# Basic capture with schema
|
|
353
411
|
agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
|
|
354
412
|
|
|
413
|
+
# Run 4 prompts in parallel
|
|
414
|
+
agent-eval-harness capture prompts.jsonl -s claude.json -j 4 -o results.jsonl
|
|
415
|
+
|
|
416
|
+
# With workspace isolation for code generation
|
|
417
|
+
agent-eval-harness capture prompts.jsonl -s claude.json -j 4 \\
|
|
418
|
+
--workspace-dir ./workspaces -o results.jsonl
|
|
419
|
+
|
|
355
420
|
# With TypeScript grader
|
|
356
421
|
agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
|
|
357
422
|
|
|
358
423
|
# With debug mode
|
|
359
424
|
agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
|
|
360
|
-
|
|
361
|
-
# With per-prompt timeout override (in prompts.jsonl):
|
|
362
|
-
{"id": "slow-task", "input": "...", "timeout": 180000}
|
|
363
425
|
`)
|
|
364
426
|
return
|
|
365
427
|
}
|
|
@@ -387,6 +449,17 @@ Examples:
|
|
|
387
449
|
}
|
|
388
450
|
}
|
|
389
451
|
|
|
452
|
+
// Validate and parse concurrency
|
|
453
|
+
let concurrency = 1
|
|
454
|
+
if (values.concurrency) {
|
|
455
|
+
const parsed = Number.parseInt(values.concurrency, 10)
|
|
456
|
+
if (Number.isNaN(parsed) || parsed < 1) {
|
|
457
|
+
console.error('Error: --concurrency must be a positive integer')
|
|
458
|
+
process.exit(1)
|
|
459
|
+
}
|
|
460
|
+
concurrency = parsed
|
|
461
|
+
}
|
|
462
|
+
|
|
390
463
|
await runCapture({
|
|
391
464
|
promptsPath,
|
|
392
465
|
schemaPath: values.schema,
|
|
@@ -397,5 +470,7 @@ Examples:
|
|
|
397
470
|
append: values.append ?? false,
|
|
398
471
|
grader,
|
|
399
472
|
debug: values.debug ?? false,
|
|
473
|
+
concurrency,
|
|
474
|
+
workspaceDir: values['workspace-dir'],
|
|
400
475
|
})
|
|
401
476
|
}
|
|
@@ -117,10 +117,14 @@ describe('runCapture configuration', () => {
|
|
|
117
117
|
progress: true,
|
|
118
118
|
append: false,
|
|
119
119
|
debug: false,
|
|
120
|
+
concurrency: 4,
|
|
121
|
+
workspaceDir: '/tmp/workspaces',
|
|
120
122
|
}
|
|
121
123
|
|
|
122
124
|
expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
|
|
123
125
|
expect(config.schemaPath).toBe('./schemas/claude-headless.json')
|
|
126
|
+
expect(config.concurrency).toBe(4)
|
|
127
|
+
expect(config.workspaceDir).toBe('/tmp/workspaces')
|
|
124
128
|
})
|
|
125
129
|
|
|
126
130
|
test('CaptureConfig allows minimal configuration', () => {
|
|
@@ -135,6 +139,8 @@ describe('runCapture configuration', () => {
|
|
|
135
139
|
expect(config.progress).toBeUndefined()
|
|
136
140
|
expect(config.append).toBeUndefined()
|
|
137
141
|
expect(config.grader).toBeUndefined()
|
|
142
|
+
expect(config.concurrency).toBeUndefined()
|
|
143
|
+
expect(config.workspaceDir).toBeUndefined()
|
|
138
144
|
})
|
|
139
145
|
})
|
|
140
146
|
|
|
@@ -160,6 +166,8 @@ describe('capture CLI', () => {
|
|
|
160
166
|
expect(stdout).toContain('--progress')
|
|
161
167
|
expect(stdout).toContain('-g, --grader')
|
|
162
168
|
expect(stdout).toContain('-s, --schema')
|
|
169
|
+
expect(stdout).toContain('-j, --concurrency')
|
|
170
|
+
expect(stdout).toContain('--workspace-dir')
|
|
163
171
|
})
|
|
164
172
|
|
|
165
173
|
test('shows error for missing prompts file argument', async () => {
|
|
@@ -187,4 +195,53 @@ describe('capture CLI', () => {
|
|
|
187
195
|
expect(exitCode).not.toBe(0)
|
|
188
196
|
expect(stderr).toContain('--schema is required')
|
|
189
197
|
})
|
|
198
|
+
|
|
199
|
+
test('shows error for invalid concurrency value', async () => {
|
|
200
|
+
const proc = Bun.spawn(
|
|
201
|
+
['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
|
|
202
|
+
{
|
|
203
|
+
stdout: 'pipe',
|
|
204
|
+
stderr: 'pipe',
|
|
205
|
+
},
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
const stderr = await new Response(proc.stderr).text()
|
|
209
|
+
const exitCode = await proc.exited
|
|
210
|
+
|
|
211
|
+
expect(exitCode).not.toBe(0)
|
|
212
|
+
expect(stderr).toContain('--concurrency must be a positive integer')
|
|
213
|
+
})
|
|
214
|
+
|
|
215
|
+
test('shows error for zero concurrency', async () => {
|
|
216
|
+
const proc = Bun.spawn(
|
|
217
|
+
['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
|
|
218
|
+
{
|
|
219
|
+
stdout: 'pipe',
|
|
220
|
+
stderr: 'pipe',
|
|
221
|
+
},
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
const stderr = await new Response(proc.stderr).text()
|
|
225
|
+
const exitCode = await proc.exited
|
|
226
|
+
|
|
227
|
+
expect(exitCode).not.toBe(0)
|
|
228
|
+
expect(stderr).toContain('--concurrency must be a positive integer')
|
|
229
|
+
})
|
|
230
|
+
|
|
231
|
+
test('shows error for negative concurrency', async () => {
|
|
232
|
+
// Note: Using --concurrency=-1 format because -j -1 is ambiguous to parseArgs
|
|
233
|
+
const proc = Bun.spawn(
|
|
234
|
+
['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '--concurrency=-1'],
|
|
235
|
+
{
|
|
236
|
+
stdout: 'pipe',
|
|
237
|
+
stderr: 'pipe',
|
|
238
|
+
},
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
const stderr = await new Response(proc.stderr).text()
|
|
242
|
+
const exitCode = await proc.exited
|
|
243
|
+
|
|
244
|
+
expect(exitCode).not.toBe(0)
|
|
245
|
+
expect(stderr).toContain('--concurrency must be a positive integer')
|
|
246
|
+
})
|
|
190
247
|
})
|
|
@@ -17,11 +17,15 @@ describe('TrialsConfig configuration', () => {
|
|
|
17
17
|
progress: true,
|
|
18
18
|
append: false,
|
|
19
19
|
debug: false,
|
|
20
|
+
concurrency: 4,
|
|
21
|
+
workspaceDir: '/tmp/workspaces',
|
|
20
22
|
}
|
|
21
23
|
|
|
22
24
|
expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
|
|
23
25
|
expect(config.schemaPath).toBe('./schemas/claude-headless.json')
|
|
24
26
|
expect(config.k).toBe(5)
|
|
27
|
+
expect(config.concurrency).toBe(4)
|
|
28
|
+
expect(config.workspaceDir).toBe('/tmp/workspaces')
|
|
25
29
|
})
|
|
26
30
|
|
|
27
31
|
test('TrialsConfig allows minimal configuration', () => {
|
|
@@ -37,6 +41,8 @@ describe('TrialsConfig configuration', () => {
|
|
|
37
41
|
expect(config.progress).toBeUndefined()
|
|
38
42
|
expect(config.append).toBeUndefined()
|
|
39
43
|
expect(config.grader).toBeUndefined()
|
|
44
|
+
expect(config.concurrency).toBeUndefined()
|
|
45
|
+
expect(config.workspaceDir).toBeUndefined()
|
|
40
46
|
})
|
|
41
47
|
})
|
|
42
48
|
|
|
@@ -64,6 +70,8 @@ describe('trials CLI', () => {
|
|
|
64
70
|
expect(stdout).toContain('-g, --grader')
|
|
65
71
|
expect(stdout).toContain('-s, --schema')
|
|
66
72
|
expect(stdout).toContain('pass@k')
|
|
73
|
+
expect(stdout).toContain('-j, --concurrency')
|
|
74
|
+
expect(stdout).toContain('--workspace-dir')
|
|
67
75
|
})
|
|
68
76
|
|
|
69
77
|
test('shows error for missing prompts file argument', async () => {
|
|
@@ -91,6 +99,38 @@ describe('trials CLI', () => {
|
|
|
91
99
|
expect(exitCode).not.toBe(0)
|
|
92
100
|
expect(stderr).toContain('--schema is required')
|
|
93
101
|
})
|
|
102
|
+
|
|
103
|
+
test('shows error for invalid concurrency value', async () => {
|
|
104
|
+
const proc = Bun.spawn(
|
|
105
|
+
['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
|
|
106
|
+
{
|
|
107
|
+
stdout: 'pipe',
|
|
108
|
+
stderr: 'pipe',
|
|
109
|
+
},
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
const stderr = await new Response(proc.stderr).text()
|
|
113
|
+
const exitCode = await proc.exited
|
|
114
|
+
|
|
115
|
+
expect(exitCode).not.toBe(0)
|
|
116
|
+
expect(stderr).toContain('--concurrency must be a positive integer')
|
|
117
|
+
})
|
|
118
|
+
|
|
119
|
+
test('shows error for zero concurrency', async () => {
|
|
120
|
+
const proc = Bun.spawn(
|
|
121
|
+
['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
|
|
122
|
+
{
|
|
123
|
+
stdout: 'pipe',
|
|
124
|
+
stderr: 'pipe',
|
|
125
|
+
},
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
const stderr = await new Response(proc.stderr).text()
|
|
129
|
+
const exitCode = await proc.exited
|
|
130
|
+
|
|
131
|
+
expect(exitCode).not.toBe(0)
|
|
132
|
+
expect(stderr).toContain('--concurrency must be a positive integer')
|
|
133
|
+
})
|
|
94
134
|
})
|
|
95
135
|
|
|
96
136
|
// ============================================================================
|