@plaited/agent-eval-harness 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -0
- package/package.json +1 -1
- package/src/commands/balance.ts +1 -11
- package/src/commands/calibrate.ts +2 -10
- package/src/commands/capture.ts +104 -114
- package/src/commands/execution.ts +245 -0
- package/src/commands/tests/capture-cli.spec.ts +84 -0
- package/src/commands/tests/trials-cli.spec.ts +68 -0
- package/src/commands/trials.ts +98 -115
- package/src/commands/validate-refs.ts +3 -19
- package/src/core/core.ts +27 -1
- package/src/core/loading.ts +53 -19
- package/src/core/streaming.ts +172 -0
- package/src/core/tests/streaming.spec.ts +399 -0
- package/src/core/tests/worker-pool.spec.ts +377 -0
- package/src/core/worker-pool.ts +220 -0
- package/src/core.ts +15 -0
- package/src/schemas/grader-loader.ts +23 -6
- package/src/schemas/schemas-cli.ts +1 -6
- package/src/schemas/schemas.ts +2 -0
- package/src/schemas.ts +1 -1
package/README.md
CHANGED
|
@@ -58,11 +58,21 @@ bunx @plaited/agent-eval-harness capture prompts.jsonl \
|
|
|
58
58
|
--schema ./schemas/claude-headless.json \
|
|
59
59
|
-o results.jsonl
|
|
60
60
|
|
|
61
|
+
# Parallel capture (4x faster with 4 workers)
|
|
62
|
+
bunx @plaited/agent-eval-harness capture prompts.jsonl \
|
|
63
|
+
--schema ./schemas/claude-headless.json \
|
|
64
|
+
-j 4 -o results.jsonl
|
|
65
|
+
|
|
61
66
|
# Run trials for pass@k analysis with debug mode
|
|
62
67
|
bunx @plaited/agent-eval-harness trials prompts.jsonl \
|
|
63
68
|
--schema ./schemas/claude-headless.json \
|
|
64
69
|
-k 5 --grader ./grader.ts --debug
|
|
65
70
|
|
|
71
|
+
# Parallel trials (4 prompts running trials concurrently)
|
|
72
|
+
bunx @plaited/agent-eval-harness trials prompts.jsonl \
|
|
73
|
+
--schema ./schemas/claude-headless.json \
|
|
74
|
+
-k 5 -j 4 --workspace-dir ./workspaces -o trials.jsonl
|
|
75
|
+
|
|
66
76
|
# Summarize results
|
|
67
77
|
bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl
|
|
68
78
|
|
package/package.json
CHANGED
package/src/commands/balance.ts
CHANGED
|
@@ -9,8 +9,8 @@
|
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
11
|
import { parseArgs } from 'node:util'
|
|
12
|
+
import { loadPrompts, resolvePath } from '../core.ts'
|
|
12
13
|
import type { BalanceAnalysis, CategoryDistribution, PromptCase } from '../schemas.ts'
|
|
13
|
-
import { loadPrompts } from './capture.ts'
|
|
14
14
|
|
|
15
15
|
// ============================================================================
|
|
16
16
|
// Types
|
|
@@ -28,16 +28,6 @@ export type BalanceConfig = {
|
|
|
28
28
|
threshold?: number
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
-
// ============================================================================
|
|
32
|
-
// Helpers
|
|
33
|
-
// ============================================================================
|
|
34
|
-
|
|
35
|
-
/** Resolve path relative to process.cwd() */
|
|
36
|
-
const resolvePath = (path: string): string => {
|
|
37
|
-
if (path.startsWith('/')) return path
|
|
38
|
-
return `${process.cwd()}/${path}`
|
|
39
|
-
}
|
|
40
|
-
|
|
41
31
|
/**
|
|
42
32
|
* Analyze category distribution across prompts.
|
|
43
33
|
*
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
import { parseArgs } from 'node:util'
|
|
12
12
|
import { loadResults, resolvePath } from '../core.ts'
|
|
13
13
|
import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from '../schemas/constants.ts'
|
|
14
|
-
import {
|
|
14
|
+
import { loadGraderOrExit } from '../schemas/grader-loader.ts'
|
|
15
15
|
import type { CalibrationSample, Grader, GraderResult, TrajectoryStep } from '../schemas.ts'
|
|
16
16
|
|
|
17
17
|
// ============================================================================
|
|
@@ -293,15 +293,7 @@ Examples:
|
|
|
293
293
|
}
|
|
294
294
|
|
|
295
295
|
// Load grader if specified
|
|
296
|
-
|
|
297
|
-
if (values.grader) {
|
|
298
|
-
try {
|
|
299
|
-
grader = await loadGrader(values.grader)
|
|
300
|
-
} catch (error) {
|
|
301
|
-
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
302
|
-
process.exit(1)
|
|
303
|
-
}
|
|
304
|
-
}
|
|
296
|
+
const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
|
|
305
297
|
|
|
306
298
|
await runCalibrate({
|
|
307
299
|
resultsPath,
|
package/src/commands/capture.ts
CHANGED
|
@@ -13,22 +13,20 @@
|
|
|
13
13
|
|
|
14
14
|
import { parseArgs } from 'node:util'
|
|
15
15
|
import {
|
|
16
|
+
createWorkspaceDir,
|
|
16
17
|
detectTrajectoryRichness,
|
|
17
18
|
extractOutput,
|
|
18
19
|
extractTrajectory,
|
|
19
20
|
getInputPreview,
|
|
20
21
|
hasToolErrors,
|
|
21
|
-
loadPrompts,
|
|
22
22
|
logProgress,
|
|
23
|
-
|
|
24
|
-
writeOutput,
|
|
23
|
+
readStdinPrompts,
|
|
25
24
|
} from '../core.ts'
|
|
26
|
-
import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
27
25
|
import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
|
|
28
|
-
import {
|
|
29
|
-
import {
|
|
30
|
-
import {
|
|
31
|
-
import type
|
|
26
|
+
import type { ProcessExitInfo, PromptResult } from '../headless/headless-session-manager.ts'
|
|
27
|
+
import { loadGraderOrExit } from '../schemas/grader-loader.ts'
|
|
28
|
+
import type { CaptureResult, PromptCase, TrajectoryRichness } from '../schemas.ts'
|
|
29
|
+
import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
|
|
32
30
|
|
|
33
31
|
// ============================================================================
|
|
34
32
|
// Re-exports for backward compatibility
|
|
@@ -51,26 +49,7 @@ export {
|
|
|
51
49
|
// ============================================================================
|
|
52
50
|
|
|
53
51
|
/** Configuration for capture command */
|
|
54
|
-
export type CaptureConfig = {
|
|
55
|
-
/** Path to prompts.jsonl file */
|
|
56
|
-
promptsPath: string
|
|
57
|
-
/** Path to agent schema JSON file */
|
|
58
|
-
schemaPath: string
|
|
59
|
-
/** Output file path (undefined for stdout) */
|
|
60
|
-
outputPath?: string
|
|
61
|
-
/** Working directory for agent */
|
|
62
|
-
cwd?: string
|
|
63
|
-
/** Timeout per prompt in milliseconds (overrides schema default) */
|
|
64
|
-
timeout?: number
|
|
65
|
-
/** Show progress to stderr */
|
|
66
|
-
progress?: boolean
|
|
67
|
-
/** Append to output file instead of overwriting */
|
|
68
|
-
append?: boolean
|
|
69
|
-
/** Optional grader function */
|
|
70
|
-
grader?: Grader
|
|
71
|
-
/** Enable debug mode for detailed output */
|
|
72
|
-
debug?: boolean
|
|
73
|
-
}
|
|
52
|
+
export type CaptureConfig = BaseExecutionConfig
|
|
74
53
|
|
|
75
54
|
// ============================================================================
|
|
76
55
|
// Capture Implementation
|
|
@@ -87,46 +66,29 @@ export type CaptureConfig = {
|
|
|
87
66
|
* @returns Array of capture results
|
|
88
67
|
*/
|
|
89
68
|
export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
|
|
69
|
+
const ctx = await prepareExecution(config)
|
|
90
70
|
const {
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
71
|
+
schema,
|
|
72
|
+
prompts,
|
|
73
|
+
sessions,
|
|
74
|
+
resolvedOutputPath,
|
|
75
|
+
resolvedWorkspaceDir,
|
|
76
|
+
defaultWorkingDir,
|
|
77
|
+
progress,
|
|
98
78
|
grader,
|
|
99
|
-
debug
|
|
100
|
-
} =
|
|
79
|
+
debug,
|
|
80
|
+
} = ctx
|
|
101
81
|
|
|
102
|
-
//
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
82
|
+
// Log progress info
|
|
83
|
+
logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
|
|
84
|
+
logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
|
|
85
|
+
logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
|
|
86
|
+
if (ctx.concurrency > 1) {
|
|
87
|
+
logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
|
|
106
88
|
}
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
try {
|
|
110
|
-
const rawSchema = await schemaFile.json()
|
|
111
|
-
schema = parseHeadlessConfig(rawSchema)
|
|
112
|
-
} catch (error) {
|
|
113
|
-
throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
|
|
89
|
+
if (resolvedWorkspaceDir) {
|
|
90
|
+
logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
|
|
114
91
|
}
|
|
115
|
-
|
|
116
|
-
// Load prompts
|
|
117
|
-
const prompts = await loadPrompts(promptsPath)
|
|
118
|
-
|
|
119
|
-
// Resolve output path
|
|
120
|
-
const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
|
|
121
|
-
|
|
122
|
-
// Determine effective timeout (CLI flag > schema default > harness default)
|
|
123
|
-
const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
|
|
124
|
-
const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
|
|
125
|
-
|
|
126
|
-
// Log progress info
|
|
127
|
-
logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
|
|
128
|
-
logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
|
|
129
|
-
logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
|
|
130
92
|
if (resolvedOutputPath) {
|
|
131
93
|
logProgress(`Output: ${resolvedOutputPath}`, progress)
|
|
132
94
|
}
|
|
@@ -134,37 +96,24 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
134
96
|
logProgress(`Debug mode: enabled`, progress)
|
|
135
97
|
}
|
|
136
98
|
|
|
137
|
-
//
|
|
138
|
-
const
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
})
|
|
99
|
+
// Process a single prompt (used by worker pool)
|
|
100
|
+
const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
|
|
101
|
+
// Determine working directory (per-prompt workspace or default)
|
|
102
|
+
const workingDir = resolvedWorkspaceDir
|
|
103
|
+
? await createWorkspaceDir(resolvedWorkspaceDir, promptCase.id)
|
|
104
|
+
: defaultWorkingDir
|
|
144
105
|
|
|
145
|
-
|
|
146
|
-
if (resolvedOutputPath && !append) {
|
|
147
|
-
await Bun.write(resolvedOutputPath, '')
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
const workingDir = cwd ?? process.cwd()
|
|
151
|
-
const results: CaptureResult[] = []
|
|
152
|
-
let isFirstOutput = true
|
|
153
|
-
|
|
154
|
-
// Run evaluations sequentially - fresh session per entry
|
|
155
|
-
for (let i = 0; i < prompts.length; i++) {
|
|
156
|
-
const promptCase = prompts[i]
|
|
157
|
-
if (!promptCase) continue
|
|
158
|
-
|
|
159
|
-
logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
|
|
106
|
+
logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
|
|
160
107
|
|
|
161
108
|
const startTime = Date.now()
|
|
162
109
|
let result: CaptureResult
|
|
110
|
+
let sessionId: string | undefined
|
|
163
111
|
|
|
164
112
|
try {
|
|
165
113
|
// Create fresh session for each entry (ensures isolation)
|
|
166
114
|
const sessionStart = Date.now()
|
|
167
115
|
const session = await sessions.create(workingDir)
|
|
116
|
+
sessionId = session.id
|
|
168
117
|
const sessionCreation = Date.now() - sessionStart
|
|
169
118
|
logProgress(` Session: ${session.id}`, progress)
|
|
170
119
|
|
|
@@ -177,9 +126,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
177
126
|
let lastExitInfo: ProcessExitInfo | undefined
|
|
178
127
|
let lastOutput = ''
|
|
179
128
|
|
|
180
|
-
// TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
|
|
181
|
-
// The session manager would need to accept timeout per-call to support this
|
|
182
|
-
|
|
183
129
|
// Execute each turn sequentially in the same session
|
|
184
130
|
for (const turnInput of inputs) {
|
|
185
131
|
const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
|
|
@@ -198,7 +144,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
198
144
|
|
|
199
145
|
result = {
|
|
200
146
|
id: promptCase.id,
|
|
201
|
-
input: promptCase.input,
|
|
147
|
+
input: promptCase.input,
|
|
202
148
|
output,
|
|
203
149
|
...(promptCase.hint && { hint: promptCase.hint }),
|
|
204
150
|
trajectory,
|
|
@@ -207,6 +153,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
207
153
|
agent: schema.name,
|
|
208
154
|
trajectoryRichness,
|
|
209
155
|
turnCount,
|
|
156
|
+
...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
|
|
210
157
|
...(lastExitInfo && {
|
|
211
158
|
exitCode: lastExitInfo.exitCode,
|
|
212
159
|
signal: lastExitInfo.signal,
|
|
@@ -236,14 +183,10 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
236
183
|
|
|
237
184
|
result.score = graderResult
|
|
238
185
|
|
|
239
|
-
// Merge outcome from grader if present
|
|
240
186
|
if (graderResult.outcome) {
|
|
241
187
|
result.outcome = graderResult.outcome
|
|
242
188
|
}
|
|
243
189
|
}
|
|
244
|
-
|
|
245
|
-
// Clean up session
|
|
246
|
-
sessions.destroy(session.id)
|
|
247
190
|
} catch (error) {
|
|
248
191
|
const endTime = Date.now()
|
|
249
192
|
const message = error instanceof Error ? error.message : String(error)
|
|
@@ -259,6 +202,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
259
202
|
agent: schema.name,
|
|
260
203
|
trajectoryRichness: 'minimal' as TrajectoryRichness,
|
|
261
204
|
turnCount: inputs.length,
|
|
205
|
+
...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
|
|
262
206
|
},
|
|
263
207
|
timing: {
|
|
264
208
|
start: startTime,
|
|
@@ -269,14 +213,15 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
269
213
|
toolErrors: true,
|
|
270
214
|
errors: [message],
|
|
271
215
|
}
|
|
216
|
+
} finally {
|
|
217
|
+
// Always clean up session if it was created
|
|
218
|
+
if (sessionId) {
|
|
219
|
+
sessions.destroy(sessionId)
|
|
220
|
+
}
|
|
272
221
|
}
|
|
273
222
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
// Write result immediately
|
|
277
|
-
const formatted = JSON.stringify(result)
|
|
278
|
-
await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
|
|
279
|
-
isFirstOutput = false
|
|
223
|
+
// Write result immediately (coordinated via mutex for concurrent writes)
|
|
224
|
+
await ctx.writeResult(result)
|
|
280
225
|
|
|
281
226
|
const statusIcon = result.toolErrors ? '!' : '✓'
|
|
282
227
|
const exitInfo = result.metadata?.timedOut
|
|
@@ -284,11 +229,13 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
284
229
|
: result.metadata?.exitCode && result.metadata.exitCode !== 0
|
|
285
230
|
? ` - exit ${result.metadata.exitCode}`
|
|
286
231
|
: ''
|
|
287
|
-
logProgress(` ${statusIcon} (${result.timing.total}ms)${exitInfo}`, progress)
|
|
232
|
+
logProgress(` ${statusIcon} ${promptCase.id} (${result.timing.total}ms)${exitInfo}`, progress)
|
|
233
|
+
|
|
234
|
+
return result
|
|
288
235
|
}
|
|
289
236
|
|
|
290
|
-
|
|
291
|
-
return
|
|
237
|
+
// Run with worker pool
|
|
238
|
+
return executePrompts(ctx, processPrompt)
|
|
292
239
|
}
|
|
293
240
|
|
|
294
241
|
// ============================================================================
|
|
@@ -312,6 +259,9 @@ export const capture = async (args: string[]): Promise<void> => {
|
|
|
312
259
|
append: { type: 'boolean', default: false },
|
|
313
260
|
grader: { type: 'string', short: 'g' },
|
|
314
261
|
debug: { type: 'boolean', default: false },
|
|
262
|
+
stdin: { type: 'boolean', default: false },
|
|
263
|
+
concurrency: { type: 'string', short: 'j' },
|
|
264
|
+
'workspace-dir': { type: 'string' },
|
|
315
265
|
help: { type: 'boolean', short: 'h' },
|
|
316
266
|
},
|
|
317
267
|
allowPositionals: true,
|
|
@@ -320,6 +270,7 @@ export const capture = async (args: string[]): Promise<void> => {
|
|
|
320
270
|
if (values.help) {
|
|
321
271
|
console.log(`
|
|
322
272
|
Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
|
|
273
|
+
cat prompts.jsonl | agent-eval-harness capture --stdin --schema <schema.json> [options]
|
|
323
274
|
|
|
324
275
|
Arguments:
|
|
325
276
|
prompts.jsonl Input file with evaluation prompts
|
|
@@ -329,6 +280,9 @@ Options:
|
|
|
329
280
|
-o, --output Output file (default: stdout)
|
|
330
281
|
-c, --cwd Working directory for agent
|
|
331
282
|
-t, --timeout Request timeout in ms (overrides schema default)
|
|
283
|
+
-j, --concurrency Number of concurrent workers (default: 1)
|
|
284
|
+
--stdin Read prompts from stdin (mutually exclusive with file arg)
|
|
285
|
+
--workspace-dir Base directory for per-prompt workspace isolation
|
|
332
286
|
--progress Show progress to stderr
|
|
333
287
|
--append Append to output file instead of overwriting
|
|
334
288
|
-g, --grader Path to grader (.ts/.js module or executable script)
|
|
@@ -348,25 +302,55 @@ Graders:
|
|
|
348
302
|
TS/JS modules must export a 'grade' function.
|
|
349
303
|
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
|
|
350
304
|
|
|
305
|
+
Parallelization:
|
|
306
|
+
Use -j/--concurrency to run multiple prompts in parallel.
|
|
307
|
+
Each prompt gets its own agent session for isolation.
|
|
308
|
+
Results are written as they complete (order may differ from input).
|
|
309
|
+
|
|
310
|
+
Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
|
|
311
|
+
at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
|
|
312
|
+
In memory-constrained environments (Docker, CI) this can cause OOM kills.
|
|
313
|
+
Use --stdin to pipe prompts for container-level orchestration.
|
|
314
|
+
|
|
315
|
+
Workspace Isolation:
|
|
316
|
+
Use --workspace-dir to create per-prompt directories.
|
|
317
|
+
Each prompt runs in {workspace-dir}/prompt-{id}/.
|
|
318
|
+
Useful for code generation tasks requiring filesystem isolation.
|
|
319
|
+
|
|
351
320
|
Examples:
|
|
352
321
|
# Basic capture with schema
|
|
353
322
|
agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
|
|
354
323
|
|
|
324
|
+
# Run 4 prompts in parallel
|
|
325
|
+
agent-eval-harness capture prompts.jsonl -s claude.json -j 4 -o results.jsonl
|
|
326
|
+
|
|
327
|
+
# With workspace isolation for code generation
|
|
328
|
+
agent-eval-harness capture prompts.jsonl -s claude.json -j 4 \\
|
|
329
|
+
--workspace-dir ./workspaces -o results.jsonl
|
|
330
|
+
|
|
355
331
|
# With TypeScript grader
|
|
356
332
|
agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
|
|
357
333
|
|
|
358
334
|
# With debug mode
|
|
359
335
|
agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
|
|
360
336
|
|
|
361
|
-
#
|
|
362
|
-
|
|
337
|
+
# Read prompts from stdin (container orchestration)
|
|
338
|
+
cat prompts.jsonl | agent-eval-harness capture --stdin -s claude.json -o results.jsonl
|
|
363
339
|
`)
|
|
364
340
|
return
|
|
365
341
|
}
|
|
366
342
|
|
|
367
343
|
const promptsPath = positionals[0]
|
|
368
|
-
|
|
369
|
-
|
|
344
|
+
const useStdin = values.stdin ?? false
|
|
345
|
+
|
|
346
|
+
// Mutual exclusivity: --stdin and positional file
|
|
347
|
+
if (useStdin && promptsPath) {
|
|
348
|
+
console.error('Error: --stdin and prompts file argument are mutually exclusive')
|
|
349
|
+
process.exit(1)
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
if (!useStdin && !promptsPath) {
|
|
353
|
+
console.error('Error: prompts.jsonl path is required (or use --stdin)')
|
|
370
354
|
process.exit(1)
|
|
371
355
|
}
|
|
372
356
|
|
|
@@ -376,19 +360,23 @@ Examples:
|
|
|
376
360
|
process.exit(1)
|
|
377
361
|
}
|
|
378
362
|
|
|
379
|
-
//
|
|
380
|
-
let
|
|
381
|
-
if (
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
363
|
+
// Read prompts from stdin if requested
|
|
364
|
+
let prompts: PromptCase[] | undefined
|
|
365
|
+
if (useStdin) {
|
|
366
|
+
const stdinPrompts = await readStdinPrompts()
|
|
367
|
+
if (!stdinPrompts || stdinPrompts.length === 0) {
|
|
368
|
+
console.error('Error: no prompts received on stdin')
|
|
386
369
|
process.exit(1)
|
|
387
370
|
}
|
|
371
|
+
prompts = stdinPrompts
|
|
388
372
|
}
|
|
389
373
|
|
|
374
|
+
// Load grader if specified
|
|
375
|
+
const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
|
|
376
|
+
|
|
390
377
|
await runCapture({
|
|
391
|
-
promptsPath,
|
|
378
|
+
promptsPath: promptsPath ?? undefined,
|
|
379
|
+
prompts,
|
|
392
380
|
schemaPath: values.schema,
|
|
393
381
|
outputPath: values.output,
|
|
394
382
|
cwd: values.cwd,
|
|
@@ -397,5 +385,7 @@ Examples:
|
|
|
397
385
|
append: values.append ?? false,
|
|
398
386
|
grader,
|
|
399
387
|
debug: values.debug ?? false,
|
|
388
|
+
concurrency: parseConcurrency(values.concurrency),
|
|
389
|
+
workspaceDir: values['workspace-dir'],
|
|
400
390
|
})
|
|
401
391
|
}
|