@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,241 @@
1
+ /**
2
+ * Pipeline extract command - parse raw output into trajectories.
3
+ *
4
+ * @remarks
5
+ * Converts RawOutput from `run` command into ExtractedResult with
6
+ * parsed trajectory and final output. Uses the same schema-driven
7
+ * parsing as the capture command.
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+
12
+ import { parseArgs } from 'node:util'
13
+ import { loadJsonl, logProgress, writeOutput } from '../core.ts'
14
+ import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
15
+ import { createOutputParser } from '../headless/headless-output-parser.ts'
16
+ import type { TrajectoryStep } from '../schemas.ts'
17
+ import type { ExtractedResult, RawOutput } from './pipeline.types.ts'
18
+
19
/**
 * Build an ExtractedResult from a single RawOutput record.
 *
 * @remarks
 * Every raw line is handed to the parser twice, in this order: `parseLine`
 * may yield one update or an array of updates that become trajectory steps,
 * and `parseResult` may flag the line as the final result. The last line
 * flagged as a result wins; when no non-empty result content was seen, the
 * output falls back to all message steps joined with newlines.
 *
 * NOTE(review): step timestamps are `Date.now()` at extraction time minus
 * `timing.start`, so they look like "time since the run started, measured
 * when extract runs" rather than when the agent emitted the line — confirm
 * this is intended. Plan steps are always recorded with an empty `entries`.
 *
 * @param rawOutput - Raw output record from the run command
 * @param parser - Output parser created from the adapter schema
 * @returns Extracted result with trajectory, final output and error flags
 */
const extractFromRaw = (rawOutput: RawOutput, parser: ReturnType<typeof createOutputParser>): ExtractedResult => {
  const steps: TrajectoryStep[] = []
  let resultText = ''
  let sawFailedTool = false

  for (const rawLine of rawOutput.rawLines) {
    // A line may produce nothing, a single update, or a batch of updates.
    const parsedLine = parser.parseLine(rawLine)
    if (parsedLine) {
      const batch = Array.isArray(parsedLine) ? parsedLine : [parsedLine]
      for (const update of batch) {
        const timestamp = Date.now() - rawOutput.timing.start

        switch (update.type) {
          case 'thought':
            steps.push({ type: 'thought', content: update.content ?? '', timestamp })
            break
          case 'message':
            steps.push({ type: 'message', content: update.content ?? '', timestamp })
            break
          case 'tool_call':
            steps.push({
              type: 'tool_call',
              name: update.title ?? 'unknown',
              status: update.status ?? 'pending',
              timestamp,
            })
            if (update.status === 'failed') sawFailedTool = true
            break
          case 'plan':
            steps.push({ type: 'plan', entries: [], timestamp })
            break
        }
      }
    }

    // The same line is also checked for being the final result.
    const lineResult = parser.parseResult(rawLine)
    if (lineResult.isResult) {
      resultText = lineResult.content
    }
  }

  // Without an explicit (non-empty) result, stitch the output together from message steps.
  const output = resultText || steps.flatMap((step) => (step.type === 'message' ? [step.content] : [])).join('\n')

  return {
    id: rawOutput.id,
    input: rawOutput.input,
    hint: rawOutput.hint,
    output,
    trajectory: steps,
    // A run-level error counts as a tool error as well.
    toolErrors: sawFailedTool || Boolean(rawOutput.error),
    timing: rawOutput.timing,
    ...(rawOutput.error ? { error: rawOutput.error } : {}),
  }
}
98
+
99
/**
 * Run the extract stage: parse every raw output and emit one JSON line each.
 *
 * @remarks
 * The schema file is read with `Bun.file`, validated by
 * `parseHeadlessConfig`, and turned into a parser that is shared by all
 * records. When `outputPath` is given the file is first truncated; the first
 * record is then written without the append flag and every later record
 * with it. Falsy array slots are skipped.
 *
 * @param schemaPath - Path to headless adapter schema
 * @param rawOutputs - Raw outputs from run command
 * @param outputPath - Optional output file path
 * @param progress - Show progress to stderr
 * @throws Error when the schema file does not exist
 */
export const runExtract = async (
  schemaPath: string,
  rawOutputs: RawOutput[],
  outputPath?: string,
  progress = false,
): Promise<void> => {
  const schemaFile = Bun.file(schemaPath)
  const schemaExists = await schemaFile.exists()
  if (!schemaExists) {
    throw new Error(`Schema file not found: ${schemaPath}`)
  }

  const schema = parseHeadlessConfig(await schemaFile.json())
  const parser = createOutputParser(schema)

  logProgress(`Extracting with schema: ${schema.name}`, progress)

  // Start from an empty file so results of an earlier run never leak through.
  if (outputPath) {
    await Bun.write(outputPath, '')
  }

  let written = 0
  for (const [index, rawOutput] of rawOutputs.entries()) {
    if (!rawOutput) continue

    logProgress(`[${index + 1}/${rawOutputs.length}] ${rawOutput.id}`, progress)

    const serialized = JSON.stringify(extractFromRaw(rawOutput, parser))
    // Only records after the first are written in append mode.
    await writeOutput(serialized, outputPath, written > 0)
    written += 1
  }

  logProgress('Done!', progress)
}
146
+
147
/**
 * Read raw outputs from stdin as JSONL.
 *
 * @remarks
 * Returns early when stdin is a TTY (nothing was piped). Otherwise stdin is
 * drained completely, decoded as UTF-8 and split on newlines. Each line is
 * trimmed and whitespace-only lines are dropped before parsing, so blank
 * lines in CRLF input (which split into a lone `"\r"`) or indented blank
 * lines no longer reach `JSON.parse`.
 *
 * @returns Array of parsed raw outputs, or null if stdin is a TTY or holds only whitespace
 * @throws SyntaxError from `JSON.parse` when a non-blank line is not valid JSON
 */
const readStdinRawOutputs = async (): Promise<RawOutput[] | null> => {
  if (process.stdin.isTTY) {
    return null
  }

  const chunks: Buffer[] = []
  for await (const chunk of process.stdin) {
    chunks.push(chunk)
  }

  const content = Buffer.concat(chunks).toString('utf-8').trim()
  if (!content) return null

  return content
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .map((line) => JSON.parse(line) as RawOutput)
}
170
+
171
/**
 * CLI entry point for the pipeline `extract` command.
 *
 * @remarks
 * Parses the arguments, prints usage for `--help`, and exits with status 1
 * when `--schema` is missing or when neither a file argument nor piped stdin
 * supplies any raw output. Otherwise delegates to `runExtract`.
 *
 * @param args - Command line arguments (after 'extract')
 */
export const extract = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    allowPositionals: true,
    options: {
      schema: { type: 'string', short: 's' },
      output: { type: 'string', short: 'o' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
  })

  if (values.help) {
    const usage = `
Usage: agent-eval-harness extract [raw.jsonl] --schema <schema.json> [options]

Parse raw output into trajectories and final output.

Arguments:
raw.jsonl Input file from 'run' command (or pipe from stdin)

Options:
-s, --schema Path to headless adapter schema (required)
-o, --output Output file (default: stdout)
--progress Show progress to stderr
-h, --help Show this help message

Examples:
# From file
agent-eval-harness extract raw.jsonl --schema claude.json -o extracted.jsonl

# Piped from run
agent-eval-harness run prompts.jsonl -s claude.json | agent-eval-harness extract -s claude.json

# Full pipeline
cat prompts.jsonl | \\
agent-eval-harness run -s claude.json | \\
agent-eval-harness extract -s claude.json | \\
agent-eval-harness grade --grader ./grader.ts
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  const schemaPath = values.schema
  if (!schemaPath) {
    console.error('Error: --schema is required')
    process.exit(1)
  }

  // Stdin is only consulted when no file argument was given.
  const fromStdin = async (): Promise<RawOutput[]> => {
    const piped = await readStdinRawOutputs()
    if (!piped || piped.length === 0) {
      console.error('Error: No raw output provided (use file argument or pipe to stdin)')
      process.exit(1)
    }
    return piped
  }

  const [inputPath] = positionals
  const rawOutputs = inputPath ? await loadJsonl<RawOutput>(inputPath) : await fromStdin()

  await runExtract(schemaPath, rawOutputs, values.output, values.progress)
}
@@ -0,0 +1,292 @@
1
+ /**
2
+ * Pipeline format command - convert results to different output formats.
3
+ *
4
+ * @remarks
5
+ * Transforms graded or extracted results into various formats:
6
+ * - jsonl: Pass-through JSONL (default)
7
+ * - markdown: Human-readable report
8
+ * - csv: Comma-separated values for spreadsheets
9
+ *
10
+ * @packageDocumentation
11
+ */
12
+
13
+ import { parseArgs } from 'node:util'
14
+ import { loadJsonl, logProgress, writeOutput } from '../core.ts'
15
+ import type { CaptureResult } from '../schemas.ts'
16
+ import type { ExtractedResult, FormatStyle, GradedResult } from './pipeline.types.ts'
17
+
18
/** Union of all formattable result types */
type FormattableResult = ExtractedResult | GradedResult | CaptureResult

/**
 * Type guard: does this result carry a usable grader score?
 *
 * @remarks
 * Results are typically parsed from JSONL, where a missing score can show up
 * either as an absent key or as an explicit `null`. Both are rejected so the
 * runtime check matches the `NonNullable` promise in the predicate and
 * callers can dereference `result.score` safely.
 *
 * @param result - Result to inspect
 * @returns true when `score` is present and neither `undefined` nor `null`
 */
const isGraded = (
  result: FormattableResult,
): result is GradedResult | (CaptureResult & { score: NonNullable<CaptureResult['score']> }) => {
  return 'score' in result && result.score !== undefined && result.score !== null
}
29
+
30
/**
 * Format results as a markdown report.
 *
 * @remarks
 * The report has a header with generation time and case count, an optional
 * summary (pass rate and average score over graded results only), and one
 * section per result. Inputs are previewed at 100 characters and outputs at
 * 500. The output preview is placed in a fenced code block whose fence is
 * always longer than any backtick run inside the preview, so output that
 * itself contains code fences cannot terminate the block early.
 *
 * @param results - Results to format
 * @returns Markdown string
 */
const formatMarkdown = (results: FormattableResult[]): string => {
  const lines: string[] = [
    '# Evaluation Results',
    '',
    `Generated: ${new Date().toISOString()}`,
    `Total: ${results.length} test cases`,
    '',
  ]

  // Summary statistics if graded
  const gradedResults = results.filter(isGraded)
  if (gradedResults.length > 0) {
    const passed = gradedResults.filter((r) => r.score.pass).length
    const avgScore = gradedResults.reduce((sum, r) => sum + r.score.score, 0) / gradedResults.length

    lines.push('## Summary')
    lines.push('')
    lines.push(
      `- **Pass rate**: ${passed}/${gradedResults.length} (${((passed / gradedResults.length) * 100).toFixed(1)}%)`,
    )
    lines.push(`- **Average score**: ${avgScore.toFixed(3)}`)
    lines.push('')
  }

  lines.push('## Results')
  lines.push('')

  for (const result of results) {
    // Multi-turn inputs are shown as a single arrow-separated line.
    const input = Array.isArray(result.input) ? result.input.join(' → ') : result.input
    const inputPreview = input.length > 100 ? `${input.slice(0, 100)}...` : input

    lines.push(`### ${result.id}`)
    lines.push('')
    lines.push(`**Input**: ${inputPreview}`)
    lines.push('')

    if (result.hint) {
      lines.push(`**Hint**: ${result.hint}`)
      lines.push('')
    }

    const outputPreview = result.output.length > 500 ? `${result.output.slice(0, 500)}...` : result.output
    // A fenced block is only closed by a fence at least as long as the opener,
    // so use one backtick more than the longest run in the preview (minimum 3).
    const longestBacktickRun = (outputPreview.match(/`+/g) ?? []).reduce((max, run) => Math.max(max, run.length), 0)
    const fence = '`'.repeat(Math.max(3, longestBacktickRun + 1))
    lines.push(`**Output**:`)
    lines.push(fence)
    lines.push(outputPreview)
    lines.push(fence)
    lines.push('')

    if (isGraded(result)) {
      const icon = result.score.pass ? '✅' : '❌'
      lines.push(`**Score**: ${icon} ${result.score.score.toFixed(3)} (${result.score.pass ? 'PASS' : 'FAIL'})`)
      if (result.score.reasoning) {
        lines.push(`**Reasoning**: ${result.score.reasoning}`)
      }
      lines.push('')
    }

    if (result.toolErrors) {
      lines.push('⚠️ **Tool errors detected**')
      lines.push('')
    }

    if ('error' in result && result.error) {
      lines.push(`❌ **Error**: ${result.error}`)
      lines.push('')
    }

    lines.push('---')
    lines.push('')
  }

  return lines.join('\n')
}
109
+
110
/**
 * Format results as CSV.
 *
 * @remarks
 * Emits a header row followed by one row per result. The `pass`, `score` and
 * `reasoning` columns are added only when at least one result is graded;
 * ungraded rows then leave those cells empty. Text cells are quoted, embedded
 * quotes are doubled, and every line break (`\r\n`, `\r` or `\n`) is replaced
 * by the two characters `\n` so each record stays on one physical line.
 *
 * @param results - Results to format
 * @returns CSV string
 */
const formatCsv = (results: FormattableResult[]): string => {
  // Defined once rather than per row; also flattens CR and CRLF, which the
  // plain "\n" replacement would leave behind as stray carriage returns.
  const escapeCsv = (str: string): string => `"${str.replace(/"/g, '""').replace(/\r\n|\r|\n/g, '\\n')}"`

  const lines: string[] = []

  // Header
  const hasScores = results.some(isGraded)
  const headers = ['id', 'input', 'hint', 'output', 'tool_errors', 'duration_ms']
  if (hasScores) {
    headers.push('pass', 'score', 'reasoning')
  }
  lines.push(headers.join(','))

  // Data rows
  for (const result of results) {
    // Multi-turn inputs are collapsed into a single pipe-separated cell.
    const input = Array.isArray(result.input) ? result.input.join(' | ') : result.input

    const row = [
      escapeCsv(result.id),
      escapeCsv(input),
      escapeCsv(result.hint ?? ''),
      escapeCsv(result.output),
      result.toolErrors ? 'true' : 'false',
      String(result.timing.total),
    ]

    if (hasScores) {
      if (isGraded(result)) {
        row.push(
          result.score.pass ? 'true' : 'false',
          result.score.score.toFixed(3),
          escapeCsv(result.score.reasoning ?? ''),
        )
      } else {
        // Keep the column count aligned with the header.
        row.push('', '', '')
      }
    }

    lines.push(row.join(','))
  }

  return lines.join('\n')
}
158
+
159
/**
 * Render results in the requested style and write them out in one piece.
 *
 * @remarks
 * `jsonl` re-serializes each result on its own line, `markdown` and `csv`
 * delegate to their formatters. The rendered text is handed to
 * `writeOutput` with the append flag off.
 *
 * @param style - Output format style
 * @param results - Results to format
 * @param outputPath - Optional output file path
 * @param progress - Show progress to stderr
 */
export const runFormat = async (
  style: FormatStyle,
  results: FormattableResult[],
  outputPath?: string,
  progress = false,
): Promise<void> => {
  logProgress(`Formatting ${results.length} results as ${style}`, progress)

  const render = (): string => {
    switch (style) {
      case 'jsonl':
        // Pass-through: one JSON document per line
        return results.map((entry) => JSON.stringify(entry)).join('\n')
      case 'markdown':
        return formatMarkdown(results)
      case 'csv':
        return formatCsv(results)
    }
  }

  await writeOutput(render(), outputPath, false)
  logProgress('Done!', progress)
}
195
+
196
/**
 * Read results from stdin as JSONL.
 *
 * @remarks
 * Returns early when stdin is a TTY (nothing was piped). Otherwise stdin is
 * drained completely, decoded as UTF-8 and split on newlines. Each line is
 * trimmed and whitespace-only lines are dropped before parsing, so blank
 * lines in CRLF input (which split into a lone `"\r"`) or indented blank
 * lines no longer reach `JSON.parse`.
 *
 * @returns Array of parsed results, or null if stdin is a TTY or holds only whitespace
 * @throws SyntaxError from `JSON.parse` when a non-blank line is not valid JSON
 */
const readStdinResults = async (): Promise<FormattableResult[] | null> => {
  if (process.stdin.isTTY) {
    return null
  }

  const chunks: Buffer[] = []
  for await (const chunk of process.stdin) {
    chunks.push(chunk)
  }

  const content = Buffer.concat(chunks).toString('utf-8').trim()
  if (!content) return null

  return content
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .map((line) => JSON.parse(line) as FormattableResult)
}
219
+
220
/**
 * CLI entry point for the pipeline `format` command.
 *
 * @remarks
 * Parses the arguments, prints usage for `--help`, and exits with status 1
 * when the style is not one of jsonl, markdown or csv, or when neither a
 * file argument nor piped stdin supplies any results. Otherwise delegates to
 * `runFormat`.
 *
 * @param args - Command line arguments (after 'format')
 */
export const format = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    allowPositionals: true,
    options: {
      style: { type: 'string', short: 'f', default: 'jsonl' },
      output: { type: 'string', short: 'o' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
  })

  if (values.help) {
    const usage = `
Usage: agent-eval-harness format [results.jsonl] [options]

Convert results to different output formats.

Arguments:
results.jsonl Input file (or pipe from stdin)

Options:
-f, --style Output format: jsonl, markdown, csv (default: jsonl)
-o, --output Output file (default: stdout)
--progress Show progress to stderr
-h, --help Show this help message

Examples:
# Convert to markdown report
agent-eval-harness format graded.jsonl --style markdown -o report.md

# Piped from grade
agent-eval-harness grade extracted.jsonl -g ./grader.ts | agent-eval-harness format -f csv

# Full pipeline to markdown
cat prompts.jsonl | \\
agent-eval-harness run -s claude.json | \\
agent-eval-harness extract -s claude.json | \\
agent-eval-harness grade -g ./grader.ts | \\
agent-eval-harness format -f markdown > report.md
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  // The option arrives as a plain string; reject anything outside the known styles.
  const style = values.style as FormatStyle
  const knownStyles = ['jsonl', 'markdown', 'csv']
  if (!knownStyles.includes(style)) {
    console.error(`Error: Invalid format style '${style}'. Must be: jsonl, markdown, csv`)
    process.exit(1)
  }

  // Stdin is only consulted when no file argument was given.
  const fromStdin = async (): Promise<FormattableResult[]> => {
    const piped = await readStdinResults()
    if (!piped || piped.length === 0) {
      console.error('Error: No results provided (use file argument or pipe to stdin)')
      process.exit(1)
    }
    return piped
  }

  const [inputPath] = positionals
  const results = inputPath ? await loadJsonl<FormattableResult>(inputPath) : await fromStdin()

  await runFormat(style, results, values.output, values.progress)
}
@@ -0,0 +1,169 @@
1
+ /**
2
+ * Pipeline grade command - apply grader to extracted results.
3
+ *
4
+ * @remarks
5
+ * Takes ExtractedResult from `extract` command and adds grader scores.
6
+ * Uses the same grader loading mechanism as the capture command.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { parseArgs } from 'node:util'
12
+ import { loadJsonl, logProgress, writeOutput } from '../core.ts'
13
+ import { loadGrader } from '../schemas/grader-loader.ts'
14
+ import type { ExtractedResult, GradedResult } from './pipeline.types.ts'
15
+
16
/**
 * Run the grade stage: score every extracted result and emit one JSON line each.
 *
 * @remarks
 * The grader is loaded once via `loadGrader` and awaited for each record in
 * order, receiving the record's input, output, hint and trajectory. When
 * `outputPath` is given the file is first truncated; the first record is
 * then written without the append flag and every later record with it.
 * Falsy array slots are skipped.
 *
 * @param graderPath - Path to grader module or executable
 * @param extractedResults - Extracted results from extract command
 * @param outputPath - Optional output file path
 * @param progress - Show progress to stderr
 */
export const runGrade = async (
  graderPath: string,
  extractedResults: ExtractedResult[],
  outputPath?: string,
  progress = false,
): Promise<void> => {
  const grader = await loadGrader(graderPath)

  logProgress(`Grading with: ${graderPath}`, progress)

  // Start from an empty file so results of an earlier run never leak through.
  if (outputPath) {
    await Bun.write(outputPath, '')
  }

  let written = 0
  for (const [index, extracted] of extractedResults.entries()) {
    if (!extracted) continue

    logProgress(`[${index + 1}/${extractedResults.length}] ${extracted.id}`, progress)

    const { input, output, hint, trajectory } = extracted
    const score = await grader({ input, output, hint, trajectory })
    const graded: GradedResult = { ...extracted, score }

    const mark = score.pass ? '✓' : '✗'
    logProgress(` ${mark} score=${score.score.toFixed(2)}`, progress)

    // Only records after the first are written in append mode.
    await writeOutput(JSON.stringify(graded), outputPath, written > 0)
    written += 1
  }

  logProgress('Done!', progress)
}
70
+
71
/**
 * Read extracted results from stdin as JSONL.
 *
 * @remarks
 * Returns early when stdin is a TTY (nothing was piped). Otherwise stdin is
 * drained completely, decoded as UTF-8 and split on newlines. Each line is
 * trimmed and whitespace-only lines are dropped before parsing, so blank
 * lines in CRLF input (which split into a lone `"\r"`) or indented blank
 * lines no longer reach `JSON.parse`.
 *
 * @returns Array of parsed extracted results, or null if stdin is a TTY or holds only whitespace
 * @throws SyntaxError from `JSON.parse` when a non-blank line is not valid JSON
 */
const readStdinExtracted = async (): Promise<ExtractedResult[] | null> => {
  if (process.stdin.isTTY) {
    return null
  }

  const chunks: Buffer[] = []
  for await (const chunk of process.stdin) {
    chunks.push(chunk)
  }

  const content = Buffer.concat(chunks).toString('utf-8').trim()
  if (!content) return null

  return content
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .map((line) => JSON.parse(line) as ExtractedResult)
}
94
+
95
/**
 * CLI entry point for the pipeline `grade` command.
 *
 * @remarks
 * Parses the arguments, prints usage for `--help`, and exits with status 1
 * when `--grader` is missing or when neither a file argument nor piped stdin
 * supplies any extracted results. Otherwise delegates to `runGrade`.
 *
 * @param args - Command line arguments (after 'grade')
 */
export const grade = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    allowPositionals: true,
    options: {
      grader: { type: 'string', short: 'g' },
      output: { type: 'string', short: 'o' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
  })

  if (values.help) {
    const usage = `
Usage: agent-eval-harness grade [extracted.jsonl] --grader <grader> [options]

Apply grader to extracted results.

Arguments:
extracted.jsonl Input file from 'extract' command (or pipe from stdin)

Options:
-g, --grader Path to grader (.ts/.js module or executable script) (required)
-o, --output Output file (default: stdout)
--progress Show progress to stderr
-h, --help Show this help message

Graders:
TS/JS modules must export a 'grade' function.
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.

Examples:
# From file
agent-eval-harness grade extracted.jsonl --grader ./grader.ts -o graded.jsonl

# Piped from extract
agent-eval-harness extract raw.jsonl -s claude.json | agent-eval-harness grade -g ./grader.ts

# Full pipeline
cat prompts.jsonl | \\
agent-eval-harness run -s claude.json | \\
agent-eval-harness extract -s claude.json | \\
agent-eval-harness grade -g ./grader.ts > results.jsonl
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  const graderPath = values.grader
  if (!graderPath) {
    console.error('Error: --grader is required')
    process.exit(1)
  }

  // Stdin is only consulted when no file argument was given.
  const fromStdin = async (): Promise<ExtractedResult[]> => {
    const piped = await readStdinExtracted()
    if (!piped || piped.length === 0) {
      console.error('Error: No extracted results provided (use file argument or pipe to stdin)')
      process.exit(1)
    }
    return piped
  }

  const [inputPath] = positionals
  const extractedResults = inputPath ? await loadJsonl<ExtractedResult>(inputPath) : await fromStdin()

  await runGrade(graderPath, extractedResults, values.output, values.progress)
}