@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Pipeline commands for Unix-style composable evaluation.
3
+ *
4
+ * @remarks
5
+ * Re-exports pipeline commands and types.
6
+ *
7
+ * Commands:
8
+ * - run: Execute prompts and output raw results
9
+ * - extract: Parse raw output into trajectories
10
+ * - grade: Apply grader to extracted results
11
+ * - format: Convert results to different output formats
12
+ * - compare: Compare multiple runs of the same prompts
13
+ *
14
+ * @packageDocumentation
15
+ */
16
+
17
+ // Commands
18
+ export { compare } from './compare.ts'
19
+ export { extract } from './extract.ts'
20
+ export { format } from './format.ts'
21
+ export { grade } from './grade.ts'
22
+ // Types
23
+ export type {
24
+ CompareConfig,
25
+ ComparisonGrader,
26
+ ComparisonGraderInput,
27
+ ComparisonGraderResult,
28
+ ComparisonRanking,
29
+ ComparisonResult,
30
+ ExtractConfig,
31
+ ExtractedResult,
32
+ FormatConfig,
33
+ FormatStyle,
34
+ GradeConfig,
35
+ GradedResult,
36
+ LabeledRun,
37
+ RawOutput,
38
+ RunConfig,
39
+ RunMode,
40
+ } from './pipeline.types.ts'
41
+ export { run } from './run.ts'
@@ -0,0 +1,241 @@
1
+ /**
2
+ * Type definitions for pipeline commands.
3
+ *
4
+ * @remarks
5
+ * These types define the data flow between pipeline stages:
6
+ * run → extract → grade → format
7
+ *
8
+ * Each stage transforms the data, enabling Unix-style piping.
9
+ *
10
+ * @packageDocumentation
11
+ */
12
+
13
+ import type { GraderResult, TrajectoryStep } from '../schemas.ts'
14
+
/**
 * Record produced by the `run` command for one test case.
 *
 * @remarks
 * Carries the agent's output exactly as captured, before any trajectory
 * extraction. This is the shape `extract` consumes when `run` is piped into it.
 */
export type RawOutput = {
  /** Identifier of the test case */
  id: string
  /** Prompt as supplied: a string for a single turn, an array for multi-turn */
  input: string | Array<string>
  /** Optional context hint for the grader */
  hint?: string
  /** Unparsed output lines from the agent (JSON strings) */
  rawLines: Array<string>
  /** Timing metadata */
  timing: { start: number; end: number; total: number }
  /** Set when execution failed; holds the error message */
  error?: string
}
40
+
/**
 * Record produced by the `extract` command.
 *
 * @remarks
 * The raw output lines have been turned into a structured trajectory plus a
 * final output string, so the record can be graded or formatted next.
 */
export type ExtractedResult = {
  /** Identifier of the test case */
  id: string
  /** Prompt as originally supplied */
  input: string | Array<string>
  /** Optional context hint for the grader */
  hint?: string
  /** Final agent output (taken from the trajectory) */
  output: string
  /** Trajectory steps parsed from the raw output */
  trajectory: Array<TrajectoryStep>
  /** True when tool errors were detected */
  toolErrors: boolean
  /** Timing metadata */
  timing: { start: number; end: number; total: number }
  /** Set when extraction failed; holds the error message */
  error?: string
}
70
+
/**
 * Record produced by the `grade` command.
 *
 * @remarks
 * An extracted result with the grader's score attached.
 */
export type GradedResult = ExtractedResult & {
  /** Score returned by the grader */
  score: GraderResult
}
81
+
/**
 * Execution strategy used by the pipeline `run` command.
 *
 * @remarks
 * - `schema`: headless adapter driven by a schema file
 * - `simple`: Bun shell with placeholder substitution
 * - `shell`: Bun shell with the prompt exposed through the PROMPT env variable
 */
export type RunMode =
  | 'schema'
  | 'simple'
  | 'shell'
91
+
/**
 * Options accepted by the pipeline `run` command.
 */
export type RunConfig = {
  /** Which run mode to use */
  mode: RunMode
  /** Schema file path (used by 'schema' mode) */
  schemaPath?: string
  /** Command template (used by 'simple' mode); `{}` is replaced with the prompt */
  simpleCommand?: string
  /** Shell template (used by 'shell' mode); the $PROMPT env var is available */
  shellTemplate?: string
  /** Working directory */
  cwd?: string
  /** Per-prompt timeout, in milliseconds */
  timeout?: number
  /** Emit progress messages on stderr */
  progress?: boolean
}
111
+
/**
 * Options accepted by the pipeline `extract` command.
 */
export type ExtractConfig = {
  /** Schema file path used to parse the raw output */
  schemaPath: string
  /** Emit progress messages on stderr */
  progress?: boolean
}
121
+
/**
 * Options accepted by the pipeline `grade` command.
 */
export type GradeConfig = {
  /** Location of the grader (module or executable) */
  graderPath: string
  /** Emit progress messages on stderr */
  progress?: boolean
}
131
+
/**
 * Output formats supported by the pipeline `format` command.
 */
export type FormatStyle =
  | 'jsonl'
  | 'markdown'
  | 'csv'
136
+
/**
 * Options accepted by the pipeline `format` command.
 */
export type FormatConfig = {
  /** Which output format to produce */
  style: FormatStyle
  /** Emit progress messages on stderr */
  progress?: boolean
}
146
+
/**
 * A results file paired with a label, as consumed by `compare`.
 *
 * @remarks
 * The label is the human-readable name used for this run in the
 * compare command output.
 */
export type LabeledRun = {
  /** Human-readable label (explicit, or derived from the filename) */
  label: string
  /** Location of the results JSONL file */
  path: string
}
160
+
/**
 * Argument passed to a comparison grader.
 *
 * @remarks
 * Bundles the results of every run for one prompt ID, so the grader
 * can compare them against each other and rank them.
 */
export type ComparisonGraderInput = {
  /** Identifier of the test case */
  id: string
  /** Prompt as originally supplied */
  input: string | Array<string>
  /** Optional context hint for the grader */
  hint?: string
  /** Per-run results, keyed by run label */
  runs: Record<
    string,
    {
      output: string
      trajectory?: Array<TrajectoryStep>
    }
  >
}
178
+
/**
 * One entry of a comparison ranking.
 */
export type ComparisonRanking = {
  /** Label of the run being ranked */
  run: string
  /** Position in the ranking (1 = best) */
  rank: number
  /** Numeric score */
  score: number
}
190
+
/**
 * Value returned by a comparison grader.
 *
 * @remarks
 * The rankings are expected in best-to-worst order.
 */
export type ComparisonGraderResult = {
  /** Ranking entries, best first */
  rankings: Array<ComparisonRanking>
  /** Optional explanation of the rankings */
  reasoning?: string
}
203
+
/**
 * Signature of a comparison grader.
 *
 * @remarks
 * Graders supplied by users implement this function type in order to
 * compare several runs of the same prompt.
 */
export type ComparisonGrader = (
  params: ComparisonGraderInput,
) => Promise<ComparisonGraderResult>
212
+
/**
 * Options accepted by the pipeline `compare` command.
 */
export type CompareConfig = {
  /** The labeled runs being compared */
  runs: Array<LabeledRun>
  /** Location of the comparison grader */
  graderPath: string
  /** Where to write the output file */
  outputPath?: string
  /** Emit progress messages on stderr */
  progress?: boolean
}
226
+
/**
 * Outcome of comparing the runs for one prompt.
 */
export type ComparisonResult = {
  /** Identifier of the test case */
  id: string
  /** Prompt as originally supplied */
  input: string | Array<string>
  /** Optional context hint for the grader */
  hint?: string
  /** Rankings returned by the comparison grader */
  rankings: Array<ComparisonRanking>
  /** Optional explanation */
  reasoning?: string
}
@@ -0,0 +1,412 @@
1
+ /**
2
+ * Pipeline run command - execute prompts and output raw results.
3
+ *
4
+ * @remarks
5
+ * Supports three modes:
6
+ * - `schema`: Use headless adapter with schema file (full trajectory capture)
7
+ * - `simple`: Use Bun shell with `{}` placeholder for prompt
8
+ * - `shell`: Use Bun shell with `$PROMPT` environment variable
9
+ *
10
+ * Output is RawOutput JSONL suitable for piping to `extract`.
11
+ *
12
+ * @packageDocumentation
13
+ */
14
+
15
+ import { parseArgs } from 'node:util'
16
+ import { loadPrompts, logProgress, writeOutput } from '../core.ts'
17
+ import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
18
+ import { createSessionManager } from '../headless/headless-session-manager.ts'
19
+ import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
20
+ import type { RawOutput, RunConfig } from './pipeline.types.ts'
21
+
/**
 * Execute a single prompt in simple mode.
 *
 * @remarks
 * Replaces the first `{}` placeholder in `command` with the prompt wrapped in
 * single quotes (embedded single quotes are escaped as `'\''`), then runs the
 * result with `sh -c` through `Bun.spawn`.
 *
 * The substitution goes through a replacer function so that `$` sequences in
 * the prompt (such as `$&` or `$$`) are inserted literally instead of being
 * interpreted as `String.prototype.replace` replacement patterns.
 *
 * If the process is still running after `timeout` milliseconds it is killed,
 * and a timeout notice is added to `error` next to whatever output was captured.
 *
 * @param prompt - Prompt text to execute
 * @param command - Command template with `{}` placeholder
 * @param timeout - Execution timeout in milliseconds
 * @returns Output lines (blank lines dropped) and, when stderr was non-empty,
 * the process timed out, or reading the output failed, an `error` message
 */
const runSimple = async (
  prompt: string,
  command: string,
  timeout: number,
): Promise<{ lines: string[]; error?: string }> => {
  const quotedPrompt = `'${prompt.replace(/'/g, "'\\''")}'`
  // Replacer function: a plain replacement string would expand `$&`, `$$`, etc.
  const finalCmd = command.replace('{}', () => quotedPrompt)

  const proc = Bun.spawn(['sh', '-c', finalCmd], {
    stdout: 'pipe',
    stderr: 'pipe',
  })

  let timedOut = false
  const timeoutId = setTimeout(() => {
    timedOut = true
    proc.kill()
  }, timeout)

  try {
    const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()])
    const lines = stdout.trim().split('\n').filter(Boolean)
    // Surface a timeout explicitly; otherwise truncated output would look like success.
    const problems = [stderr.trim(), timedOut ? `Timed out after ${timeout}ms` : ''].filter(Boolean)
    return problems.length > 0 ? { lines, error: problems.join('\n') } : { lines }
  } catch (err) {
    return { lines: [], error: err instanceof Error ? err.message : String(err) }
  } finally {
    clearTimeout(timeoutId)
  }
}
59
+
/**
 * Execute a single prompt in shell mode.
 *
 * @remarks
 * Runs `template` with `sh -c` through `Bun.spawn`, with the current process
 * environment plus a `PROMPT` variable holding the prompt text. The template
 * itself is never modified.
 *
 * If the process is still running after `timeout` milliseconds it is killed,
 * and a timeout notice is added to `error` next to whatever output was captured.
 *
 * @param prompt - Prompt text to execute
 * @param template - Shell command template
 * @param timeout - Execution timeout in milliseconds
 * @returns Output lines (blank lines dropped) and, when stderr was non-empty,
 * the process timed out, or reading the output failed, an `error` message
 */
const runShell = async (
  prompt: string,
  template: string,
  timeout: number,
): Promise<{ lines: string[]; error?: string }> => {
  const proc = Bun.spawn(['sh', '-c', template], {
    stdout: 'pipe',
    stderr: 'pipe',
    env: { ...process.env, PROMPT: prompt },
  })

  let timedOut = false
  const timeoutId = setTimeout(() => {
    timedOut = true
    proc.kill()
  }, timeout)

  try {
    const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()])
    const lines = stdout.trim().split('\n').filter(Boolean)
    // Surface a timeout explicitly; otherwise truncated output would look like success.
    const problems = [stderr.trim(), timedOut ? `Timed out after ${timeout}ms` : ''].filter(Boolean)
    return problems.length > 0 ? { lines, error: problems.join('\n') } : { lines }
  } catch (err) {
    return { lines: [], error: err instanceof Error ? err.message : String(err) }
  } finally {
    clearTimeout(timeoutId)
  }
}
94
+
/**
 * Execute pipeline run with configuration object.
 *
 * @remarks
 * Runs every entry of `prompts` sequentially in the configured mode and emits
 * one RawOutput JSON line per prompt through `writeOutput`. Array inputs are
 * executed turn by turn and their output lines are merged into one record.
 *
 * - `schema`: loads and parses the schema file, then creates one session per
 *   prompt. The session is destroyed even when a turn throws; lines collected
 *   before the failure are kept and the failure message is stored in `error`.
 * - `simple` / `shell`: each turn is delegated to `runSimple` / `runShell`;
 *   per-turn errors are joined with newlines.
 *
 * Mode-specific options are validated before the output file is touched, so an
 * invalid configuration no longer truncates an existing results file.
 *
 * NOTE(review): `cwd` is only passed to `sessions.create` in schema mode;
 * simple and shell commands are spawned without a working-directory option.
 *
 * @param config - Run configuration
 * @param prompts - Array of prompts to execute
 * @param outputPath - Optional output file path; emptied before the first record is written
 * @throws Error when the option required by the selected mode is missing, or
 * when the schema file does not exist
 */
export const runPipeline = async (
  config: RunConfig,
  prompts: Array<{ id: string; input: string | string[]; hint?: string }>,
  outputPath?: string,
): Promise<void> => {
  const {
    mode,
    schemaPath,
    simpleCommand,
    shellTemplate,
    cwd,
    timeout = DEFAULT_HARNESS_TIMEOUT,
    progress = false,
  } = config

  const workingDir = cwd ?? process.cwd()

  type TurnResult = { lines: string[]; error?: string }
  type PromptExecutor = (turns: string[]) => Promise<TurnResult>

  // Builds an executor that feeds each turn to a one-shot command runner and merges the results.
  const commandExecutor =
    (runTurn: (turn: string) => Promise<TurnResult>): PromptExecutor =>
    async (turns) => {
      const lines: string[] = []
      const errors: string[] = []
      for (const turn of turns) {
        const result = await runTurn(turn)
        lines.push(...result.lines)
        if (result.error) errors.push(result.error)
      }
      return errors.length > 0 ? { lines, error: errors.join('\n') } : { lines }
    }

  let execute: PromptExecutor | undefined

  if (mode === 'schema') {
    // Schema mode: use headless adapter
    if (!schemaPath) {
      throw new Error('Schema path required for schema mode')
    }

    const schemaFile = Bun.file(schemaPath)
    if (!(await schemaFile.exists())) {
      throw new Error(`Schema file not found: ${schemaPath}`)
    }

    const schema = parseHeadlessConfig(await schemaFile.json())
    const sessions = createSessionManager({
      schema,
      timeout,
      verbose: progress,
    })

    logProgress(`Schema mode: ${schema.name}`, progress)

    execute = async (turns) => {
      const lines: string[] = []
      let error: string | undefined

      try {
        const session = await sessions.create(workingDir)
        try {
          for (const turnInput of turns) {
            const result = await sessions.prompt(session.id, turnInput)
            // Collect raw JSON lines from updates
            for (const update of result.updates) {
              lines.push(JSON.stringify(update.raw))
            }
          }
        } finally {
          // Always release the session, including when a turn throws.
          sessions.destroy(session.id)
        }
      } catch (err) {
        error = err instanceof Error ? err.message : String(err)
      }

      return error ? { lines, error } : { lines }
    }
  } else if (mode === 'simple') {
    // Simple mode: placeholder substitution
    if (!simpleCommand) {
      throw new Error('Command required for simple mode')
    }

    logProgress(`Simple mode: ${simpleCommand}`, progress)
    execute = commandExecutor((turn) => runSimple(turn, simpleCommand, timeout))
  } else if (mode === 'shell') {
    // Shell mode: PROMPT env variable
    if (!shellTemplate) {
      throw new Error('Shell template required for shell mode')
    }

    logProgress(`Shell mode: ${shellTemplate}`, progress)
    execute = commandExecutor((turn) => runShell(turn, shellTemplate, timeout))
  }

  // Clear output file if specified (only once the configuration is known to be usable)
  if (outputPath) {
    await Bun.write(outputPath, '')
  }

  if (execute) {
    let isFirstOutput = true

    for (let i = 0; i < prompts.length; i++) {
      const promptCase = prompts[i]
      if (!promptCase) continue

      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress)

      const startTime = Date.now()
      const turns = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
      const { lines, error } = await execute(turns)
      const endTime = Date.now()

      const output: RawOutput = {
        id: promptCase.id,
        input: promptCase.input,
        hint: promptCase.hint,
        rawLines: lines,
        timing: {
          start: startTime,
          end: endTime,
          total: endTime - startTime,
        },
        ...(error ? { error } : {}),
      }

      // Third argument is false for the first record and true afterwards (presumably an append flag).
      await writeOutput(JSON.stringify(output), outputPath, !isFirstOutput)
      isFirstOutput = false
    }
  }

  logProgress('Done!', progress)
}
286
+
/**
 * Check that a parsed JSONL value has the prompt shape used by the pipeline:
 * a string `id`, a string or string-array `input`, and an optional string `hint`.
 */
const isStdinPromptCase = (value: unknown): value is { id: string; input: string | string[]; hint?: string } => {
  if (typeof value !== 'object' || value === null) return false
  const { id, input, hint } = value as { id?: unknown; input?: unknown; hint?: unknown }
  const inputOk = typeof input === 'string' || (Array.isArray(input) && input.every((turn) => typeof turn === 'string'))
  return typeof id === 'string' && inputOk && (hint === undefined || typeof hint === 'string')
}

/**
 * Parse JSONL text into prompt cases, skipping blank lines and reporting the
 * 1-based line number of the first malformed or mis-shaped entry.
 */
const parseStdinPromptLines = (content: string): Array<{ id: string; input: string | string[]; hint?: string }> => {
  const prompts: Array<{ id: string; input: string | string[]; hint?: string }> = []
  const rows = content.split('\n')

  for (let i = 0; i < rows.length; i++) {
    // Trimming also removes a trailing `\r` and lets whitespace-only lines be skipped.
    const line = rows[i]?.trim()
    if (!line) continue

    let parsed: unknown
    try {
      parsed = JSON.parse(line)
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err)
      throw new Error(`Invalid JSON on stdin line ${i + 1}: ${reason}`)
    }

    if (!isStdinPromptCase(parsed)) {
      throw new Error(
        `Invalid prompt on stdin line ${i + 1}: expected an object with a string "id" and a string or string[] "input"`,
      )
    }
    prompts.push(parsed)
  }

  return prompts
}

/**
 * Read prompts from stdin if available.
 *
 * @remarks
 * Returns `null` without reading when stdin is a TTY. Otherwise stdin is read
 * to the end and parsed as JSONL, one prompt object per non-blank line.
 * Parsed objects are returned as-is, so extra fields are preserved.
 *
 * @returns Array of parsed prompts or null if stdin is empty
 * @throws Error naming the offending line when a line is not valid JSON or
 * does not have the expected prompt shape
 */
const readStdinPrompts = async (): Promise<Array<{ id: string; input: string | string[]; hint?: string }> | null> => {
  // Check if stdin has data (not a TTY)
  if (process.stdin.isTTY) {
    return null
  }

  const chunks: Buffer[] = []
  for await (const chunk of process.stdin) {
    // A stream with an encoding set yields strings, which Buffer.concat cannot take.
    chunks.push(typeof chunk === 'string' ? Buffer.from(chunk) : chunk)
  }

  const content = Buffer.concat(chunks).toString('utf-8')
  if (!content.trim()) return null

  return parseStdinPromptLines(content)
}
311
+
/**
 * Pipeline run command CLI handler.
 *
 * @remarks
 * Parses the arguments, picks the run mode, loads prompts and delegates to
 * `runPipeline`.
 *
 * - With `--help` the usage text is printed and nothing else happens.
 * - The mode is taken from the first flag present, checked in the order
 *   `--schema`, `--simple`, `--shell`; with none of them the process exits with code 1.
 * - `--timeout` is parsed as a base-10 integer and must be a positive number;
 *   otherwise the process exits with code 1 instead of passing `NaN`, zero or
 *   a negative delay on to the runners.
 * - Prompts come from the first positional argument (a file passed to
 *   `loadPrompts`) or, when it is absent, from stdin; with no prompts the
 *   process exits with code 1.
 *
 * @param args - Command line arguments (after 'run')
 */
export const run = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      schema: { type: 'string', short: 's' },
      simple: { type: 'string' },
      shell: { type: 'string' },
      output: { type: 'string', short: 'o' },
      cwd: { type: 'string', short: 'c' },
      timeout: { type: 'string', short: 't' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness run [prompts.jsonl] [options]

Execute prompts and output raw results for pipeline processing.

Arguments:
prompts.jsonl Input file (or pipe from stdin)

Modes (choose one):
-s, --schema Path to headless adapter schema (recommended)
--simple Command template with {} placeholder
--shell Shell template with $PROMPT env variable

Options:
-o, --output Output file (default: stdout)
-c, --cwd Working directory for agent
-t, --timeout Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
--progress Show progress to stderr
-h, --help Show this help message

Examples:
# Schema mode (recommended)
agent-eval-harness run prompts.jsonl --schema claude.json | agent-eval-harness extract

# Simple mode with placeholder
agent-eval-harness run prompts.jsonl --simple "claude -p {} --output-format stream-json"

# Shell mode with env variable
agent-eval-harness run prompts.jsonl --shell 'claude -p "$PROMPT" --output-format stream-json'

# Pipe from stdin
cat prompts.jsonl | agent-eval-harness run --schema claude.json
`)
    return
  }

  // Determine mode
  let mode: RunConfig['mode']
  if (values.schema) {
    mode = 'schema'
  } else if (values.simple) {
    mode = 'simple'
  } else if (values.shell) {
    mode = 'shell'
  } else {
    console.error('Error: Must specify --schema, --simple, or --shell mode')
    process.exit(1)
  }

  // Validate the timeout up front: a NaN or non-positive delay would make the
  // kill timers in the runners fire almost immediately.
  let timeout: number | undefined
  if (values.timeout) {
    timeout = Number.parseInt(values.timeout, 10)
    if (!Number.isFinite(timeout) || timeout <= 0) {
      console.error(`Error: Invalid --timeout value "${values.timeout}" (expected a positive number of milliseconds)`)
      process.exit(1)
    }
  }

  // Load prompts from file or stdin
  const promptsPath = positionals[0]
  let prompts: Array<{ id: string; input: string | string[]; hint?: string }>

  if (promptsPath) {
    prompts = await loadPrompts(promptsPath)
  } else {
    const stdinPrompts = await readStdinPrompts()
    if (!stdinPrompts || stdinPrompts.length === 0) {
      console.error('Error: No prompts provided (use file argument or pipe to stdin)')
      process.exit(1)
    }
    prompts = stdinPrompts
  }

  await runPipeline(
    {
      mode,
      schemaPath: values.schema,
      simpleCommand: values.simple,
      shellTemplate: values.shell,
      cwd: values.cwd,
      timeout,
      progress: values.progress,
    },
    prompts,
    values.output,
  )
}