@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,388 @@
1
+ /**
2
+ * Multi-run trials command for pass@k/pass^k analysis.
3
+ *
4
+ * @remarks
5
+ * Runs each prompt k times to measure non-determinism.
6
+ * Without a grader, captures raw trials. With a grader, computes:
7
+ * - passRate: Simple pass rate (passes / k)
8
+ * - passAtK: Probability of at least one pass in k samples
9
+ * - passExpK: Probability of all k samples passing
10
+ *
11
+ * @packageDocumentation
12
+ */
13
+
14
+ import { parseArgs } from 'node:util'
15
+ import { extractOutput, extractTrajectory, loadPrompts, logProgress, resolvePath, writeOutput } from '../core.ts'
16
+ import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
17
+ import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
18
+ import { createSessionManager } from '../headless/headless-session-manager.ts'
19
+ import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
20
+ import { loadGrader } from '../schemas/grader-loader.ts'
21
+ import type { Grader, TrialEntry, TrialResult } from '../schemas.ts'
22
+
23
+ // ============================================================================
24
+ // Pass@k/Pass^k Calculation
25
+ // ============================================================================
26
+
27
+ /**
28
+ * Calculate pass@k: probability of at least one pass in k samples.
29
+ *
30
+ * @remarks
31
+ * Uses the unbiased estimator: 1 - C(n-c, k) / C(n, k)
32
+ * where n = total samples, c = correct samples, k = samples per trial
33
+ *
34
+ * For our case where n = k (we run exactly k trials per prompt):
35
+ * pass@k = 1 - (1 - passRate)^k (simplified)
36
+ *
37
+ * @param passes - Number of passing trials
38
+ * @param k - Total number of trials
39
+ * @returns Probability of at least one pass
40
+ *
41
+ * @public
42
+ */
43
+ export const calculatePassAtK = (passes: number, k: number): number => {
44
+ if (passes >= k) return 1
45
+ if (passes === 0) return 0
46
+
47
+ // Simplified formula when n = k
48
+ const passRate = passes / k
49
+ return 1 - (1 - passRate) ** k
50
+ }
51
+
52
+ /**
53
+ * Calculate pass^k: probability of all k samples passing.
54
+ *
55
+ * @remarks
56
+ * This is simply passRate^k
57
+ *
58
+ * @param passes - Number of passing trials
59
+ * @param k - Total number of trials
60
+ * @returns Probability of all k samples passing
61
+ *
62
+ * @public
63
+ */
64
+ export const calculatePassExpK = (passes: number, k: number): number => {
65
+ if (passes === k) return 1
66
+ if (passes === 0) return 0
67
+
68
+ const passRate = passes / k
69
+ return passRate ** k
70
+ }
71
+
72
+ // ============================================================================
73
+ // Types
74
+ // ============================================================================
75
+
76
+ /** Configuration for trials command */
77
+ export type TrialsConfig = {
78
+ /** Path to prompts.jsonl file */
79
+ promptsPath: string
80
+ /** Path to agent schema JSON file */
81
+ schemaPath: string
82
+ /** Number of trials per prompt */
83
+ k: number
84
+ /** Output file path */
85
+ outputPath?: string
86
+ /** Working directory for agent */
87
+ cwd?: string
88
+ /** Timeout per prompt in milliseconds (overrides schema default) */
89
+ timeout?: number
90
+ /** Show progress to stderr */
91
+ progress?: boolean
92
+ /** Append to output file */
93
+ append?: boolean
94
+ /** Optional grader function */
95
+ grader?: Grader
96
+ /** Enable debug mode */
97
+ debug?: boolean
98
+ }
99
+
100
+ // ============================================================================
101
+ // Trials Implementation
102
+ // ============================================================================
103
+
104
+ /**
105
+ * Execute trials with configuration object.
106
+ *
107
+ * @param config - Trials configuration
108
+ * @returns Array of trial results
109
+ */
110
+ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
111
+ const {
112
+ promptsPath,
113
+ schemaPath,
114
+ k,
115
+ outputPath,
116
+ cwd,
117
+ timeout,
118
+ progress = false,
119
+ append = false,
120
+ grader,
121
+ debug = false,
122
+ } = config
123
+
124
+ // Load and validate schema
125
+ const schemaFile = Bun.file(schemaPath)
126
+ if (!(await schemaFile.exists())) {
127
+ throw new Error(`Schema file not found: ${schemaPath}`)
128
+ }
129
+
130
+ let schema: HeadlessAdapterConfig
131
+ try {
132
+ const rawSchema = await schemaFile.json()
133
+ schema = parseHeadlessConfig(rawSchema)
134
+ } catch (error) {
135
+ throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
136
+ }
137
+
138
+ // Load prompts
139
+ const prompts = await loadPrompts(promptsPath)
140
+
141
+ // Resolve output path
142
+ const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
143
+
144
+ // Determine effective timeout (CLI flag > schema default > harness default)
145
+ const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
146
+ const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
147
+
148
+ // Log progress info
149
+ logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
150
+ logProgress(`Running ${k} trials per prompt`, progress)
151
+ logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
152
+ logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
153
+ if (grader) {
154
+ logProgress('Grader: enabled (will compute pass@k metrics)', progress)
155
+ }
156
+
157
+ // Create session manager with schema
158
+ const sessions = createSessionManager({
159
+ schema,
160
+ timeout: effectiveTimeout,
161
+ verbose: progress,
162
+ debug,
163
+ })
164
+
165
+ // Clear output file if not appending
166
+ if (resolvedOutputPath && !append) {
167
+ await Bun.write(resolvedOutputPath, '')
168
+ }
169
+
170
+ const workingDir = cwd ?? process.cwd()
171
+ const results: TrialResult[] = []
172
+ let isFirstOutput = true
173
+
174
+ // Run evaluations
175
+ for (let i = 0; i < prompts.length; i++) {
176
+ const promptCase = prompts[i]
177
+ if (!promptCase) continue
178
+
179
+ logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
180
+
181
+ const trialEntries: TrialEntry[] = []
182
+
183
+ for (let trialNum = 1; trialNum <= k; trialNum++) {
184
+ // Create fresh session for each trial
185
+ const session = await sessions.create(workingDir)
186
+ const startTime = Date.now()
187
+
188
+ try {
189
+ // Handle string or array input
190
+ const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
191
+ const allUpdates: ParsedUpdate[] = []
192
+
193
+ // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
194
+
195
+ // Execute each turn sequentially
196
+ for (const turnInput of inputs) {
197
+ const turnResult = await sessions.prompt(session.id, turnInput)
198
+ allUpdates.push(...turnResult.updates)
199
+ }
200
+
201
+ const endTime = Date.now()
202
+ const trajectory = extractTrajectory(allUpdates, startTime)
203
+ const output = extractOutput(trajectory)
204
+
205
+ const entry: TrialEntry = {
206
+ trialNum,
207
+ output,
208
+ trajectory,
209
+ duration: endTime - startTime,
210
+ }
211
+
212
+ // Apply grader if provided
213
+ if (grader) {
214
+ const graderResult = await grader({
215
+ input: promptCase.input,
216
+ output,
217
+ hint: promptCase.hint,
218
+ trajectory,
219
+ })
220
+ entry.pass = graderResult.pass
221
+ entry.score = graderResult.score
222
+ entry.reasoning = graderResult.reasoning
223
+ }
224
+
225
+ trialEntries.push(entry)
226
+ logProgress(
227
+ ` Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`,
228
+ progress,
229
+ )
230
+
231
+ // Clean up session
232
+ sessions.destroy(session.id)
233
+ } catch (error) {
234
+ const endTime = Date.now()
235
+ const message = error instanceof Error ? error.message : String(error)
236
+
237
+ trialEntries.push({
238
+ trialNum,
239
+ output: '',
240
+ trajectory: [],
241
+ duration: endTime - startTime,
242
+ pass: false,
243
+ reasoning: `Error: ${message}`,
244
+ })
245
+ logProgress(` Trial ${trialNum}/${k}: ! (error)`, progress)
246
+ }
247
+ }
248
+
249
+ // Build result
250
+ const result: TrialResult = {
251
+ id: promptCase.id,
252
+ input: promptCase.input,
253
+ ...(promptCase.hint && { hint: promptCase.hint }),
254
+ k,
255
+ trials: trialEntries,
256
+ }
257
+
258
+ // Calculate metrics if grader was used
259
+ if (grader) {
260
+ const passes = trialEntries.filter((t) => t.pass).length
261
+ result.passRate = passes / k
262
+ result.passAtK = calculatePassAtK(passes, k)
263
+ result.passExpK = calculatePassExpK(passes, k)
264
+ }
265
+
266
+ results.push(result)
267
+
268
+ // Write result immediately
269
+ const formatted = JSON.stringify(result)
270
+ await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
271
+ isFirstOutput = false
272
+
273
+ if (grader) {
274
+ logProgress(
275
+ ` → passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
276
+ progress,
277
+ )
278
+ }
279
+ }
280
+
281
+ logProgress('Done!', progress)
282
+ return results
283
+ }
284
+
285
+ // ============================================================================
286
+ // CLI Entry Point
287
+ // ============================================================================
288
+
289
+ /**
290
+ * Trials command CLI handler.
291
+ *
292
+ * @param args - Command line arguments (after 'trials')
293
+ */
294
+ export const trials = async (args: string[]): Promise<void> => {
295
+ const { values, positionals } = parseArgs({
296
+ args,
297
+ options: {
298
+ schema: { type: 'string', short: 's' },
299
+ output: { type: 'string', short: 'o' },
300
+ k: { type: 'string', short: 'k', default: String(DEFAULT_TRIAL_COUNT) },
301
+ cwd: { type: 'string', short: 'c' },
302
+ timeout: { type: 'string', short: 't' },
303
+ progress: { type: 'boolean', default: false },
304
+ append: { type: 'boolean', default: false },
305
+ grader: { type: 'string', short: 'g' },
306
+ debug: { type: 'boolean', default: false },
307
+ help: { type: 'boolean', short: 'h' },
308
+ },
309
+ allowPositionals: true,
310
+ })
311
+
312
+ if (values.help) {
313
+ // biome-ignore lint/suspicious/noConsole: CLI help output
314
+ console.log(`
315
+ Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]
316
+
317
+ Arguments:
318
+ prompts.jsonl Input file with evaluation prompts
319
+
320
+ Options:
321
+ -s, --schema Path to agent schema JSON file (required)
322
+ -o, --output Output file (default: stdout)
323
+ -k Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
324
+ -c, --cwd Working directory for agent
325
+ -t, --timeout Request timeout in ms (overrides schema default)
326
+ --progress Show progress to stderr
327
+ --append Append to output file
328
+ -g, --grader Path to grader (.ts/.js module or executable script)
329
+ --debug Enable debug mode
330
+ -h, --help Show this help message
331
+
332
+ Output Format:
333
+ Without grader: Raw trials with trajectories
334
+ With grader: Trials plus pass@k metrics (passRate, passAtK, passExpK)
335
+
336
+ Graders:
337
+ TS/JS modules must export a 'grade' function.
338
+ Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
339
+
340
+ Examples:
341
+ # Capture only
342
+ agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -o trials.jsonl
343
+
344
+ # With TypeScript grader
345
+ agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
346
+
347
+ # With Python grader
348
+ agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.py -o trials.jsonl
349
+ `)
350
+ return
351
+ }
352
+
353
+ const promptsPath = positionals[0]
354
+ if (!promptsPath) {
355
+ console.error('Error: prompts.jsonl path is required')
356
+ process.exit(1)
357
+ }
358
+
359
+ if (!values.schema) {
360
+ console.error('Error: --schema is required')
361
+ console.error('Example: agent-eval-harness trials prompts.jsonl --schema ./claude.json')
362
+ process.exit(1)
363
+ }
364
+
365
+ // Load grader if specified
366
+ let grader: Grader | undefined
367
+ if (values.grader) {
368
+ try {
369
+ grader = await loadGrader(values.grader)
370
+ } catch (error) {
371
+ console.error(`Error: ${error instanceof Error ? error.message : error}`)
372
+ process.exit(1)
373
+ }
374
+ }
375
+
376
+ await runTrials({
377
+ promptsPath,
378
+ schemaPath: values.schema,
379
+ k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10),
380
+ outputPath: values.output,
381
+ cwd: values.cwd,
382
+ timeout: values.timeout ? Number.parseInt(values.timeout, 10) : undefined,
383
+ progress: values.progress ?? false,
384
+ append: values.append ?? false,
385
+ grader,
386
+ debug: values.debug ?? false,
387
+ })
388
+ }
@@ -0,0 +1,188 @@
1
+ /**
2
+ * Validate-refs command - check reference solutions against grader.
3
+ *
4
+ * @remarks
5
+ * Validates that reference solutions in prompts.jsonl pass the grader.
6
+ * Helps identify prompts with broken or incorrect reference solutions.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { parseArgs } from 'node:util'
12
+ import { loadGrader } from '../schemas/grader-loader.ts'
13
+ import type { Grader, ValidationResult } from '../schemas.ts'
14
+ import { loadPrompts } from './capture.ts'
15
+
16
+ // ============================================================================
17
+ // Types
18
+ // ============================================================================
19
+
20
+ /** Configuration for validate-refs command */
21
+ export type ValidateRefsConfig = {
22
+ /** Path to prompts.jsonl file */
23
+ promptsPath: string
24
+ /** Output file path */
25
+ outputPath?: string
26
+ /** Grader function */
27
+ grader: Grader
28
+ }
29
+
30
+ // ============================================================================
31
+ // Helpers
32
+ // ============================================================================
33
+
34
+ /** Resolve path relative to process.cwd() */
35
+ const resolvePath = (path: string): string => {
36
+ if (path.startsWith('/')) return path
37
+ return `${process.cwd()}/${path}`
38
+ }
39
+
40
+ // ============================================================================
41
+ // Validate-Refs Implementation
42
+ // ============================================================================
43
+
44
+ /**
45
+ * Execute validate-refs with configuration object.
46
+ *
47
+ * @param config - Validate-refs configuration
48
+ * @returns Array of validation results
49
+ */
50
+ export const runValidateRefs = async (config: ValidateRefsConfig): Promise<ValidationResult[]> => {
51
+ const { promptsPath, outputPath, grader } = config
52
+
53
+ // Load prompts
54
+ const prompts = await loadPrompts(promptsPath)
55
+
56
+ // Filter to prompts with reference solutions
57
+ const promptsWithRefs = prompts.filter((p) => p.reference !== undefined)
58
+
59
+ if (promptsWithRefs.length === 0) {
60
+ console.error('No prompts with reference solutions found')
61
+ return []
62
+ }
63
+
64
+ console.error(`Validating ${promptsWithRefs.length} reference solutions...`)
65
+
66
+ const results: ValidationResult[] = []
67
+
68
+ for (const prompt of promptsWithRefs) {
69
+ const graderResult = await grader({
70
+ input: prompt.input,
71
+ output: prompt.reference as string,
72
+ hint: prompt.hint,
73
+ trajectory: [], // No trajectory for reference validation
74
+ })
75
+
76
+ results.push({
77
+ id: prompt.id,
78
+ reference: prompt.reference as string,
79
+ passes: graderResult.pass,
80
+ graderResult,
81
+ })
82
+
83
+ const icon = graderResult.pass ? '✓' : '✗'
84
+ console.error(` ${icon} ${prompt.id}`)
85
+ }
86
+
87
+ // Format output
88
+ const output = results.map((r) => JSON.stringify(r)).join('\n')
89
+
90
+ // Write output
91
+ if (outputPath) {
92
+ await Bun.write(resolvePath(outputPath), output)
93
+ } else {
94
+ // biome-ignore lint/suspicious/noConsole: CLI stdout output
95
+ console.log(output)
96
+ }
97
+
98
+ // Summary
99
+ const passed = results.filter((r) => r.passes).length
100
+ const failed = results.length - passed
101
+ console.error(`\nResults: ${passed} passed, ${failed} failed`)
102
+
103
+ if (failed > 0) {
104
+ console.error('\nFailing references:')
105
+ for (const result of results.filter((r) => !r.passes)) {
106
+ console.error(` - ${result.id}: ${result.graderResult.reasoning ?? 'No reasoning'}`)
107
+ }
108
+ }
109
+
110
+ return results
111
+ }
112
+
113
+ // ============================================================================
114
+ // CLI Entry Point
115
+ // ============================================================================
116
+
117
+ /**
118
+ * Validate-refs command CLI handler.
119
+ *
120
+ * @param args - Command line arguments (after 'validate-refs')
121
+ */
122
+ export const validateRefs = async (args: string[]): Promise<void> => {
123
+ const { values, positionals } = parseArgs({
124
+ args,
125
+ options: {
126
+ output: { type: 'string', short: 'o' },
127
+ grader: { type: 'string', short: 'g' },
128
+ help: { type: 'boolean', short: 'h' },
129
+ },
130
+ allowPositionals: true,
131
+ })
132
+
133
+ if (values.help) {
134
+ // biome-ignore lint/suspicious/noConsole: CLI help output
135
+ console.log(`
136
+ Usage: agent-eval-harness validate-refs <prompts.jsonl> --grader <grader.ts> [options]
137
+
138
+ Arguments:
139
+ prompts.jsonl Input file with prompts (must have 'reference' field)
140
+
141
+ Options:
142
+ -o, --output Output file (default: stdout)
143
+ -g, --grader Path to grader (.ts/.js module or executable script, required)
144
+ -h, --help Show this help message
145
+
146
+ Output:
147
+ JSONL with validation results for each reference solution.
148
+
149
+ Prompt Format:
150
+ {
151
+ "id": "test-001",
152
+ "input": "What is 2+2?",
153
+ "expected": "4",
154
+ "reference": "The answer is 4."
155
+ }
156
+
157
+ Examples:
158
+ agent-eval-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl
159
+ `)
160
+ return
161
+ }
162
+
163
+ const promptsPath = positionals[0]
164
+ if (!promptsPath) {
165
+ console.error('Error: prompts.jsonl path is required')
166
+ process.exit(1)
167
+ }
168
+
169
+ if (!values.grader) {
170
+ console.error('Error: --grader is required for validate-refs')
171
+ process.exit(1)
172
+ }
173
+
174
+ // Load grader
175
+ let grader: Grader
176
+ try {
177
+ grader = await loadGrader(values.grader)
178
+ } catch (error) {
179
+ console.error(`Error: ${error instanceof Error ? error.message : error}`)
180
+ process.exit(1)
181
+ }
182
+
183
+ await runValidateRefs({
184
+ promptsPath,
185
+ outputPath: values.output,
186
+ grader,
187
+ })
188
+ }
@@ -0,0 +1,33 @@
1
+ /**
2
+ * CLI command implementations for agent evaluation harness.
3
+ *
4
+ * @remarks
5
+ * Re-exports all CLI commands for programmatic use.
6
+ * For CLI usage, run `agent-eval-harness <command> --help`.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ // Balance command
12
+ export type { BalanceConfig } from './commands/balance.ts'
13
+ export { balance, runBalance } from './commands/balance.ts'
14
+
15
+ // Calibrate command
16
+ export type { CalibrateConfig } from './commands/calibrate.ts'
17
+ export { calibrate, runCalibrate } from './commands/calibrate.ts'
18
+
19
+ // Capture command
20
+ export type { CaptureConfig } from './commands/capture.ts'
21
+ export { capture, runCapture } from './commands/capture.ts'
22
+
23
+ // Summarize command
24
+ export type { SummarizeConfig } from './commands/summarize.ts'
25
+ export { runSummarize, summarize } from './commands/summarize.ts'
26
+
27
+ // Trials command
28
+ export type { TrialsConfig } from './commands/trials.ts'
29
+ export { runTrials, trials } from './commands/trials.ts'
30
+
31
+ // Validate-refs command
32
+ export type { ValidateRefsConfig } from './commands/validate-refs.ts'
33
+ export { runValidateRefs, validateRefs } from './commands/validate-refs.ts'
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Core utilities for agent-eval-harness.
3
+ *
4
+ * @remarks
5
+ * Re-exports shared utilities used across all commands:
6
+ * - Loading: JSONL file parsing for prompts and results
7
+ * - Trajectory: Extraction and analysis of agent trajectories
8
+ * - Output: Writing results, progress logging, path resolution
9
+ *
10
+ * @packageDocumentation
11
+ */
12
+
13
+ // Loading utilities
14
+ export { loadJsonl, loadPrompts, loadResults } from './loading.ts'
15
+ // Output utilities
16
+ export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
17
+ // Trajectory utilities
18
+ export {
19
+ detectTrajectoryRichness,
20
+ extractContent,
21
+ extractFilePath,
22
+ extractOutput,
23
+ extractTrajectory,
24
+ hasToolErrors,
25
+ } from './trajectory.ts'