@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,587 @@
1
+ /**
2
+ * Session manager for headless CLI agents.
3
+ *
4
+ * @remarks
5
+ * Manages the lifecycle of CLI agent sessions including:
6
+ * - Process spawning and tracking
7
+ * - Stream mode (new process per turn, resumed via the CLI session ID) vs iterative mode (new process per turn with accumulated history)
8
+ * - Output parsing and update emission
9
+ * - Session state management
10
+ *
11
+ * @packageDocumentation
12
+ */
13
+
14
+ import type { Subprocess } from 'bun'
15
+ import type { HeadlessAdapterConfig } from './headless.schemas.ts'
16
+ import { createHistoryBuilder, type HistoryBuilder } from './headless-history-builder.ts'
17
+ import { createOutputParser, type OutputParser, type ParsedUpdate } from './headless-output-parser.ts'
18
+
19
+ // ============================================================================
20
+ // Types
21
+ // ============================================================================
22
+
23
/**
 * Mutable state tracked by the session manager for a single agent session.
 */
export type Session = {
  /** Identifier that keys this session inside the manager */
  id: string
  /** Directory the CLI process is spawned in */
  cwd: string
  /** Most recently spawned subprocess for this session, if any */
  process?: Subprocess
  /** Accumulates earlier turns so they can be replayed (iterative mode only) */
  history?: HistoryBuilder
  /** Session ID reported by the CLI, passed back via the resume flag in stream mode */
  cliSessionId?: string
  /** `true` until the session is cancelled */
  active: boolean
  /** Number of prompts sent through this session so far */
  turnCount: number
}
40
+
41
/**
 * Describes how a CLI process finished; attached to prompt results for debugging.
 */
export type ProcessExitInfo = {
  /** Exit code of the process, or `null` when no exit code could be obtained */
  exitCode: number | null
  /** Name of the signal sent to the process, when one was sent */
  signal?: string
  /** `true` when the process was killed because the prompt timed out */
  timedOut: boolean
}
50
+
51
/** Listener invoked once for every parsed update while a prompt is running */
export type UpdateCallback = (update: ParsedUpdate) => void
53
+
54
/**
 * Outcome of a single prompt turn.
 */
export type PromptResult = {
  /** Final output text for the turn */
  output: string
  /** Every update parsed while the turn was running, in arrival order */
  updates: ParsedUpdate[]
  /** Session ID reported by the CLI, when one was seen */
  cliSessionId?: string
  /** How the CLI process finished, when a process was available */
  exitInfo?: ProcessExitInfo
}
65
+
66
/**
 * Options accepted by the session manager factory.
 */
export type SessionManagerConfig = {
  /** Headless adapter configuration describing how to drive the CLI */
  schema: HeadlessAdapterConfig
  /**
   * Per-prompt timeout in milliseconds. Takes precedence over the schema's
   * own timeout; when neither is set the manager falls back to 60000ms.
   */
  timeout?: number
  /** When true, the constructed command line is printed to stderr */
  verbose?: boolean
  /**
   * Debug mode for troubleshooting. When enabled, timestamped diagnostics
   * are written to stderr (output collection, parsed lines and matched
   * events, timeouts, process exit info) and the constructed command line
   * is printed as with `verbose`.
   */
  debug?: boolean
}
84
+
85
+ // ============================================================================
86
+ // Session Manager Factory
87
+ // ============================================================================
88
+
89
/**
 * Creates a session manager for headless CLI agents.
 *
 * @remarks
 * The session manager orchestrates CLI agent interaction. In both modes a
 * new CLI process is spawned for every turn; the modes differ in how
 * conversational context is carried between turns:
 *
 * **Stream mode:**
 * - Captures the session ID reported by the CLI
 * - Passes it back through the schema's resume flag on later turns
 *
 * **Iterative mode:**
 * - Accumulates previous turns in a history builder
 * - Sends the rendered history together with each new prompt
 *
 * The per-prompt timeout is `config.timeout`, else the schema's `timeout`,
 * else 60000ms.
 *
 * @param config - Session manager configuration
 * @returns Session manager with create, prompt, cancel, get, and destroy methods
 */
export const createSessionManager = (config: SessionManagerConfig) => {
  const { schema, verbose = false, debug = false } = config
  // Use schema timeout if available, otherwise default to 60000ms
  const schemaTimeout = 'timeout' in schema ? (schema.timeout ?? 60000) : 60000
  const timeout = config.timeout ?? schemaTimeout
  const sessions = new Map<string, Session>()
  const outputParser = createOutputParser(schema)

  /**
   * Debug logging helper - only logs when debug mode is enabled.
   * Output goes to stderr so it never mixes with data written to stdout.
   */
  const debugLog = (category: string, message: string, data?: unknown): void => {
    if (!debug) return
    const timestamp = new Date().toISOString()
    console.error(`[${timestamp}] [${category}] ${message}`)
    if (data !== undefined) {
      console.error(JSON.stringify(data, null, 2))
    }
  }

  /**
   * Creates a new session and registers it with the manager.
   *
   * @param cwd - Working directory for the session
   * @returns Created session
   */
  const create = async (cwd: string): Promise<Session> => {
    const id = generateSessionId()

    const session: Session = {
      id,
      cwd,
      active: true,
      turnCount: 0,
    }

    // Initialize mode-specific state
    if (schema.sessionMode === 'iterative') {
      // Normalize historyTemplate: v2 schemas can have object format, convert to string
      let templateString: string | undefined
      if (typeof schema.historyTemplate === 'object' && schema.historyTemplate !== null) {
        // Use turnFormat from object-style template
        templateString = schema.historyTemplate.turnFormat
      } else {
        templateString = schema.historyTemplate
      }
      session.history = createHistoryBuilder({
        template: templateString,
      })
    }

    sessions.set(id, session)
    return session
  }

  /**
   * Builds the command array for spawning the CLI.
   *
   * @param session - Session supplying the cwd and any CLI session ID
   * @param promptText - Prompt appended to the command unless it is sent via stdin
   * @returns Argument vector, starting with the schema's base command
   */
  const buildCommand = (session: Session, promptText: string): string[] => {
    const args = [...schema.command]

    // Add output format flags (only if non-empty)
    if (schema.output.flag) {
      args.push(schema.output.flag, schema.output.value)
    }

    // Add auto-approve flags
    if (schema.autoApprove) {
      args.push(...schema.autoApprove)
    }

    // Add cwd flag if specified
    if (schema.cwdFlag) {
      args.push(schema.cwdFlag, session.cwd)
    }

    // Add resume flag if available (stream mode, after first turn)
    if (schema.sessionMode === 'stream' && schema.resume && session.cliSessionId) {
      args.push(schema.resume.flag, session.cliSessionId)
    }

    // Add prompt flag and text (skip if using stdin)
    if (!schema.prompt.stdin) {
      if (schema.prompt.flag) {
        args.push(schema.prompt.flag, promptText)
      } else {
        // Positional argument (no flag)
        args.push(promptText)
      }
    }

    // Debug output: show constructed command
    if (verbose || debug) {
      const stdinNote = schema.prompt.stdin ? ' (+ stdin)' : ''
      console.error(`[headless] Command: ${args.join(' ')}${stdinNote}`)
    }

    return args
  }

  /**
   * Spawns the CLI for one turn and stores the subprocess on the session.
   *
   * When the schema delivers prompts over stdin, the prompt is written and
   * stdin is closed right away: every turn gets its own process, so nothing
   * further will be written to it.
   */
  const spawnTurn = (session: Session, promptText: string): void => {
    const args = buildCommand(session, promptText)

    session.process = Bun.spawn(args, {
      cwd: session.cwd,
      stdin: schema.prompt.stdin ? 'pipe' : 'ignore',
      stdout: 'pipe',
      stderr: 'inherit',
    })

    if (schema.prompt.stdin && session.process) {
      writePromptToStdin(session.process, promptText, true)
    }
  }

  /**
   * Stream mode: spawn a process for this turn. After the first turn the
   * command carries the resume flag with the captured CLI session ID.
   */
  const promptStream = async (
    session: Session,
    promptText: string,
    onUpdate?: UpdateCallback,
  ): Promise<PromptResult> => {
    spawnTurn(session, promptText)
    return collectOutput(session, outputParser, onUpdate, timeout, debugLog)
  }

  /**
   * Iterative mode: spawn new process per turn with history context.
   */
  const promptIterative = async (
    session: Session,
    promptText: string,
    onUpdate?: UpdateCallback,
  ): Promise<PromptResult> => {
    // Build full prompt with history
    const fullPrompt = session.history?.buildPrompt(promptText) ?? promptText

    spawnTurn(session, fullPrompt)

    const result = await collectOutput(session, outputParser, onUpdate, timeout, debugLog)

    // Store the bare prompt (not the history-expanded one) for the next turn
    session.history?.addTurn(promptText, result.output)

    // Clean up process
    session.process = undefined

    return result
  }

  /**
   * Sends a prompt to a session and collects the response.
   *
   * @param sessionId - Session ID
   * @param promptText - Prompt text to send
   * @param onUpdate - Callback for streaming updates
   * @returns Prompt result with output and updates
   * @throws Error when the session does not exist or is no longer active
   */
  const prompt = async (sessionId: string, promptText: string, onUpdate?: UpdateCallback): Promise<PromptResult> => {
    const session = sessions.get(sessionId)
    if (!session) {
      throw new Error(`Session not found: ${sessionId}`)
    }

    if (!session.active) {
      throw new Error(`Session is not active: ${sessionId}`)
    }

    session.turnCount++

    if (schema.sessionMode === 'stream') {
      return promptStream(session, promptText, onUpdate)
    }

    return promptIterative(session, promptText, onUpdate)
  }

  /**
   * Cancels an active session: marks it inactive and kills its process if
   * one is still tracked. Unknown session IDs are ignored.
   *
   * @param sessionId - Session ID to cancel
   */
  const cancel = (sessionId: string): void => {
    const session = sessions.get(sessionId)
    if (!session) return

    session.active = false

    if (session.process && !session.process.killed) {
      session.process.kill()
    }
  }

  /**
   * Gets a session by ID.
   *
   * @param sessionId - Session ID
   * @returns Session or undefined
   */
  const get = (sessionId: string): Session | undefined => {
    return sessions.get(sessionId)
  }

  /**
   * Cancels a session and removes it from the manager.
   *
   * @param sessionId - Session ID
   */
  const destroy = (sessionId: string): void => {
    cancel(sessionId)
    sessions.delete(sessionId)
  }

  return {
    create,
    prompt,
    cancel,
    get,
    destroy,
  }
}
369
+
370
+ // ============================================================================
371
+ // Helper Functions
372
+ // ============================================================================
373
+
374
/**
 * Produces a session identifier of the form `sess_<epoch-ms>_<suffix>`,
 * where the suffix is up to six base-36 characters drawn from `Math.random()`.
 */
const generateSessionId = (): string => {
  const createdAt = Date.now()
  const suffix = Math.random().toString(36).slice(2, 8)
  return ['sess', createdAt, suffix].join('_')
}
380
+
381
/**
 * Sends a prompt, followed by a newline, to a subprocess over stdin.
 *
 * @remarks
 * Nothing is written unless stdin is a writable sink. A subprocess stdin
 * may be absent (e.g. spawned with `'ignore'`) or a numeric file
 * descriptor; both cases are skipped silently.
 *
 * After the write the sink is flushed. When `closeAfterWrite` is true the
 * sink is then ended so the child sees EOF, which CLIs that read their
 * whole prompt from stdin need before they start processing. Pass false
 * when more input will be written to the same process later.
 *
 * @param process - Subprocess whose stdin receives the prompt
 * @param prompt - Prompt text to write
 * @param closeAfterWrite - Whether to close stdin after writing (default: false)
 *
 * @internal
 */
const writePromptToStdin = (process: Subprocess, prompt: string, closeAfterWrite = false): void => {
  const sink = process.stdin
  if (!sink || typeof sink === 'number') return

  sink.write(`${prompt}\n`)
  sink.flush()

  if (closeAfterWrite) {
    sink.end()
  }
}
415
+
416
/**
 * Collects output from a running process.
 *
 * @remarks
 * Reads the process stdout as newline-delimited text. Every non-blank line
 * is first offered to `parser.parseLine` (matched updates are recorded and
 * forwarded to `onUpdate`) and then to `parser.parseResult`; the first line
 * recognised as a result ends collection. A final line that is not
 * newline-terminated is still parsed once stdout closes.
 *
 * If no result line produced output, the contents of all `message` updates
 * joined with newlines are used instead.
 *
 * When `timeoutMs` elapses first, the process is sent SIGTERM and whatever
 * was gathered so far is returned with `exitInfo.timedOut` set.
 *
 * Side effect: the first `session_id` string found on an update's raw
 * payload is stored on `session.cliSessionId`.
 *
 * @param session - Active session
 * @param parser - Output parser
 * @param onUpdate - Update callback
 * @param timeoutMs - Timeout in ms
 * @param logDebug - Debug logging function
 * @returns Collected output and updates
 * @throws Error when the session has no readable stdout stream
 */
const collectOutput = async (
  session: Session,
  parser: OutputParser,
  onUpdate: UpdateCallback | undefined,
  timeoutMs: number,
  logDebug: (category: string, message: string, data?: unknown) => void,
): Promise<PromptResult> => {
  const updates: ParsedUpdate[] = []
  let output = ''
  let cliSessionId: string | undefined
  const accumulatedMessages: string[] = []
  let timedOut = false

  const stdout = session.process?.stdout
  if (!stdout || typeof stdout === 'number') {
    throw new Error('No stdout available')
  }

  const reader = stdout.getReader()
  const decoder = new TextDecoder()
  let buffer = ''

  /**
   * Handles one line of CLI output.
   * Returns true when the line carried the final result.
   */
  const processLine = (line: string): boolean => {
    if (!line.trim()) return false

    logDebug('line', `Processing line: ${line.slice(0, 100)}${line.length > 100 ? '...' : ''}`)

    // Parse as update first (so updates are emitted even for result lines)
    const update = parser.parseLine(line)
    if (update !== null) {
      // Handle both single updates and arrays of updates (from wildcard matches)
      const updatesToProcess = Array.isArray(update) ? update : [update]

      for (const singleUpdate of updatesToProcess) {
        logDebug('parse', `Matched event: ${singleUpdate.type}`, {
          title: singleUpdate.title,
          status: singleUpdate.status,
          content: singleUpdate.content?.slice(0, 50),
        })

        updates.push(singleUpdate)
        onUpdate?.(singleUpdate)

        // Accumulate message content for fallback
        if (singleUpdate.type === 'message' && singleUpdate.content) {
          accumulatedMessages.push(singleUpdate.content)
        }

        // Extract CLI session ID if available
        if (!cliSessionId && singleUpdate.raw && typeof singleUpdate.raw === 'object') {
          const raw = singleUpdate.raw as Record<string, unknown>
          if (typeof raw.session_id === 'string') {
            cliSessionId = raw.session_id
            session.cliSessionId = cliSessionId
            logDebug('session', `Extracted CLI session ID: ${cliSessionId}`)
          }
        }
      }
    } else {
      logDebug('parse', 'No matching event mapping for line')
    }

    // Check for final result (after emitting update)
    const resultCheck = parser.parseResult(line)
    if (resultCheck.isResult) {
      output = resultCheck.content
      logDebug('result', `Found result: ${output.slice(0, 100)}${output.length > 100 ? '...' : ''}`)
      return true
    }

    return false
  }

  // Track timeout with a timer ID so we can clear it
  let timeoutId: ReturnType<typeof setTimeout> | undefined

  const timeoutPromise = new Promise<'timeout'>((resolve) => {
    timeoutId = setTimeout(() => resolve('timeout'), timeoutMs)
  })

  logDebug('process', `Starting output collection with ${timeoutMs}ms timeout`)

  try {
    const readLoop = async (): Promise<'complete'> => {
      while (true) {
        const { done, value } = await reader.read()

        if (done) {
          logDebug('process', 'Process stdout closed')
          // After a timeout the stream is closed by our own cancel();
          // nothing more should be recorded in that case.
          if (!timedOut) {
            // Flush the decoder and parse a last line that had no trailing newline
            const trailing = buffer + decoder.decode()
            buffer = ''
            processLine(trailing)
          }
          return 'complete'
        }

        logDebug('raw', `Received ${value.byteLength} bytes`)

        buffer += decoder.decode(value, { stream: true })

        // Process complete lines; keep the unterminated remainder buffered
        const lines = buffer.split('\n')
        buffer = lines.pop() ?? ''

        for (const line of lines) {
          // Stop reading immediately once the result has been found
          if (processLine(line)) return 'complete'
        }
      }
    }

    const raceResult = await Promise.race([readLoop(), timeoutPromise])

    if (raceResult === 'timeout') {
      timedOut = true
      logDebug('timeout', `Process timed out after ${timeoutMs}ms`)

      // Kill the process on timeout
      if (session.process && !session.process.killed) {
        session.process.kill('SIGTERM')
        logDebug('process', 'Sent SIGTERM to process')
      }

      // A read is still pending at this point. Cancelling settles it, so the
      // lock can be released below without throwing or rejecting.
      void reader.cancel().catch(() => undefined)
    }
  } finally {
    if (timeoutId !== undefined) {
      clearTimeout(timeoutId)
    }
    reader.releaseLock()
  }

  // Fallback: if result contentPath didn't yield output, use accumulated messages
  if (!output && accumulatedMessages.length > 0) {
    output = accumulatedMessages.join('\n')
    logDebug('fallback', `Using accumulated messages as output (${accumulatedMessages.length} messages)`)
  }

  // Get exit info from process
  let exitInfo: ProcessExitInfo | undefined
  if (session.process) {
    let exitWaitId: ReturnType<typeof setTimeout> | undefined
    try {
      // Wait for process to exit (with a short timeout to not block)
      const exitCode = await Promise.race([
        session.process.exited,
        new Promise<null>((resolve) => {
          exitWaitId = setTimeout(() => resolve(null), 1000)
        }),
      ])

      exitInfo = {
        exitCode: exitCode,
        timedOut,
        signal: timedOut ? 'SIGTERM' : undefined,
      }

      logDebug('exit', `Process exit info`, exitInfo)
    } catch {
      exitInfo = {
        exitCode: null,
        timedOut,
      }
    } finally {
      // Do not leave the wait timer pending once the race has settled
      if (exitWaitId !== undefined) {
        clearTimeout(exitWaitId)
      }
    }
  }

  return {
    output,
    updates,
    cliSessionId,
    exitInfo,
  }
}
585
+
586
/** Shape of the object returned by `createSessionManager` */
export type SessionManager = ReturnType<typeof createSessionManager>