@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline extract command - parse raw output into trajectories.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Converts RawOutput from `run` command into ExtractedResult with
|
|
6
|
+
* parsed trajectory and final output. Uses the same schema-driven
|
|
7
|
+
* parsing as the capture command.
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { parseArgs } from 'node:util'
|
|
13
|
+
import { loadJsonl, logProgress, writeOutput } from '../core.ts'
|
|
14
|
+
import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
15
|
+
import { createOutputParser } from '../headless/headless-output-parser.ts'
|
|
16
|
+
import type { TrajectoryStep } from '../schemas.ts'
|
|
17
|
+
import type { ExtractedResult, RawOutput } from './pipeline.types.ts'
|
|
18
|
+
|
|
19
|
+
/**
 * Convert one raw run record into an extracted result.
 *
 * @remarks
 * Every entry of `rawOutput.rawLines` is handed to `parser.parseLine` and then
 * to `parser.parseResult`. Updates of type `thought`, `message`, `tool_call`
 * and `plan` become trajectory steps; any other update type is ignored. Plan
 * steps are recorded with an empty `entries` array. A step's timestamp is
 * `Date.now() - rawOutput.timing.start`, evaluated while extracting.
 * The last line reported as a result supplies the output; when that leaves the
 * output empty, the contents of all message steps are joined with newlines.
 *
 * @param rawOutput - Raw output from run command
 * @param parser - Output parser created from schema
 * @returns Extracted result; `toolErrors` is true when a tool call reported
 * status `failed` or when `rawOutput.error` is truthy
 */
const extractFromRaw = (rawOutput: RawOutput, parser: ReturnType<typeof createOutputParser>): ExtractedResult => {
  const steps: TrajectoryStep[] = []
  let sawFailedTool = false
  let resultText = ''

  for (const rawLine of rawOutput.rawLines) {
    // A single line may yield nothing, one update, or several updates.
    const parsedLine = parser.parseLine(rawLine)
    const updates = !parsedLine ? [] : Array.isArray(parsedLine) ? parsedLine : [parsedLine]

    for (const update of updates) {
      const timestamp = Date.now() - rawOutput.timing.start

      switch (update.type) {
        case 'thought':
          steps.push({ type: 'thought', content: update.content ?? '', timestamp })
          break
        case 'message':
          steps.push({ type: 'message', content: update.content ?? '', timestamp })
          break
        case 'tool_call':
          steps.push({
            type: 'tool_call',
            name: update.title ?? 'unknown',
            status: update.status ?? 'pending',
            timestamp,
          })
          if (update.status === 'failed') {
            sawFailedTool = true
          }
          break
        case 'plan':
          steps.push({ type: 'plan', entries: [], timestamp })
          break
      }
    }

    // The same line is also checked for a final result; a later result line overrides an earlier one.
    const maybeResult = parser.parseResult(rawLine)
    if (maybeResult.isResult) {
      resultText = maybeResult.content
    }
  }

  // No explicit (non-empty) result: fall back to the concatenated message contents.
  if (!resultText) {
    const messageTexts: string[] = []
    for (const step of steps) {
      if (step.type === 'message') {
        messageTexts.push(step.content)
      }
    }
    resultText = messageTexts.join('\n')
  }

  return {
    id: rawOutput.id,
    input: rawOutput.input,
    hint: rawOutput.hint,
    output: resultText,
    trajectory: steps,
    toolErrors: sawFailedTool || Boolean(rawOutput.error),
    timing: rawOutput.timing,
    // The error key is only present when the raw record carried a truthy error.
    ...(rawOutput.error ? { error: rawOutput.error } : {}),
  }
}
|
|
98
|
+
|
|
99
|
+
/**
 * Run the extract stage: load the adapter schema, build a parser from it and
 * emit one JSON-serialized extracted result per raw output.
 *
 * @remarks
 * When `outputPath` is given the file is first overwritten with an empty
 * string. The first result is written with the append flag off and every
 * later result with it on. Falsy entries of `rawOutputs` are skipped.
 *
 * @param schemaPath - Path to headless adapter schema
 * @param rawOutputs - Raw outputs from run command
 * @param outputPath - Optional output file path
 * @param progress - Show progress to stderr
 * @throws Error when no file exists at `schemaPath`
 */
export const runExtract = async (
  schemaPath: string,
  rawOutputs: RawOutput[],
  outputPath?: string,
  progress = false,
): Promise<void> => {
  const schemaFile = Bun.file(schemaPath)
  const schemaExists = await schemaFile.exists()
  if (!schemaExists) {
    throw new Error(`Schema file not found: ${schemaPath}`)
  }

  // Validate the schema JSON before building the line parser from it.
  const schema = parseHeadlessConfig(await schemaFile.json())
  const parser = createOutputParser(schema)

  logProgress(`Extracting with schema: ${schema.name}`, progress)

  // Start from an empty file so appended results do not mix with old content.
  if (outputPath) {
    await Bun.write(outputPath, '')
  }

  let writtenCount = 0
  for (const [index, rawOutput] of rawOutputs.entries()) {
    if (!rawOutput) continue

    logProgress(`[${index + 1}/${rawOutputs.length}] ${rawOutput.id}`, progress)

    const serialized = JSON.stringify(extractFromRaw(rawOutput, parser))
    // Only results after the first one are appended.
    await writeOutput(serialized, outputPath, writtenCount > 0)
    writtenCount += 1
  }

  logProgress('Done!', progress)
}
|
|
146
|
+
|
|
147
|
+
/**
 * Read raw outputs from stdin.
 *
 * @remarks
 * Stdin is read to the end, decoded as UTF-8 and parsed as JSONL. Each line is
 * trimmed before parsing and lines that are empty after trimming are dropped,
 * so CRLF line endings and whitespace-only separator lines do not reach
 * `JSON.parse`. Parsed values are cast to `RawOutput` without validation.
 *
 * @returns Array of parsed raw outputs, or null if stdin is a TTY or holds only whitespace
 * @throws SyntaxError when a non-blank line is not valid JSON
 */
const readStdinRawOutputs = async (): Promise<RawOutput[] | null> => {
  // An interactive terminal means nothing was piped in.
  if (process.stdin.isTTY) {
    return null
  }

  const chunks: Buffer[] = []
  for await (const chunk of process.stdin) {
    chunks.push(chunk)
  }

  const content = Buffer.concat(chunks).toString('utf-8').trim()
  if (!content) return null

  return content
    .split('\n')
    .map((line) => line.trim()) // strips the '\r' left behind by CRLF input
    .filter(Boolean)
    .map((line) => JSON.parse(line) as RawOutput)
}
|
|
170
|
+
|
|
171
|
+
/**
 * CLI entry point for the pipeline `extract` command.
 *
 * @remarks
 * With `--help` the usage text is printed and nothing else happens. Otherwise
 * `--schema` is mandatory; raw outputs come from the first positional argument
 * (a JSONL file) or, when no path is given, from stdin. A missing schema or
 * missing/empty stdin input prints an error and exits with status 1.
 *
 * @param args - Command line arguments (after 'extract')
 */
export const extract = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      schema: { type: 'string', short: 's' },
      output: { type: 'string', short: 'o' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })
  const { schema, output, progress, help } = parsed.values

  if (help) {
    const usage = `
Usage: agent-eval-harness extract [raw.jsonl] --schema <schema.json> [options]

Parse raw output into trajectories and final output.

Arguments:
raw.jsonl Input file from 'run' command (or pipe from stdin)

Options:
-s, --schema Path to headless adapter schema (required)
-o, --output Output file (default: stdout)
--progress Show progress to stderr
-h, --help Show this help message

Examples:
# From file
agent-eval-harness extract raw.jsonl --schema claude.json -o extracted.jsonl

# Piped from run
agent-eval-harness run prompts.jsonl -s claude.json | agent-eval-harness extract -s claude.json

# Full pipeline
cat prompts.jsonl | \\
agent-eval-harness run -s claude.json | \\
agent-eval-harness extract -s claude.json | \\
agent-eval-harness grade --grader ./grader.ts
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  if (!schema) {
    console.error('Error: --schema is required')
    process.exit(1)
  }

  // A positional path takes precedence; stdin is only consulted without one.
  const inputPath = parsed.positionals[0]
  let rawOutputs: RawOutput[]

  if (!inputPath) {
    const piped = await readStdinRawOutputs()
    if (!piped || piped.length === 0) {
      console.error('Error: No raw output provided (use file argument or pipe to stdin)')
      process.exit(1)
    }
    rawOutputs = piped
  } else {
    rawOutputs = await loadJsonl<RawOutput>(inputPath)
  }

  await runExtract(schema, rawOutputs, output, progress)
}
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline format command - convert results to different output formats.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Transforms graded or extracted results into various formats:
|
|
6
|
+
* - jsonl: Pass-through JSONL (default)
|
|
7
|
+
* - markdown: Human-readable report
|
|
8
|
+
* - csv: Comma-separated values for spreadsheets
|
|
9
|
+
*
|
|
10
|
+
* @packageDocumentation
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { parseArgs } from 'node:util'
|
|
14
|
+
import { loadJsonl, logProgress, writeOutput } from '../core.ts'
|
|
15
|
+
import type { CaptureResult } from '../schemas.ts'
|
|
16
|
+
import type { ExtractedResult, FormatStyle, GradedResult } from './pipeline.types.ts'
|
|
17
|
+
|
|
18
|
+
/** Union of all formattable result types */
|
|
19
|
+
type FormattableResult = ExtractedResult | GradedResult | CaptureResult
|
|
20
|
+
|
|
21
|
+
/**
 * Check if result has a score (graded).
 *
 * @remarks
 * A result counts as graded only when it has a `score` property that is
 * neither `undefined` nor `null`. Rejecting `null` keeps the `NonNullable`
 * claim of the type predicate true for records parsed from JSON, where a
 * missing score may be serialized as `null`.
 *
 * @param result - Result to inspect
 * @returns True when `result.score` is present and non-nullish
 */
const isGraded = (
  result: FormattableResult,
): result is GradedResult | (CaptureResult & { score: NonNullable<CaptureResult['score']> }) => {
  // `!= null` deliberately matches both null and undefined.
  return 'score' in result && result.score != null
}
|
|
29
|
+
|
|
30
|
+
/**
 * Format results as markdown report.
 *
 * @remarks
 * The report starts with a title, generation time and case count. When at
 * least one result is graded, a summary with pass rate and average score
 * (both computed over graded results only) follows. Each result then gets its
 * own section: input (truncated to 100 characters, multi-turn inputs joined
 * with ' → '), optional hint, output (truncated to 500 characters) in a fenced
 * code block, and score, tool-error and error lines when present.
 *
 * The code fence is always longer than the longest run of backticks inside
 * the output preview, so output that itself contains fenced code cannot
 * terminate the block early.
 *
 * @param results - Results to format
 * @returns Markdown string
 */
const formatMarkdown = (results: FormattableResult[]): string => {
  const lines: string[] = [
    '# Evaluation Results',
    '',
    `Generated: ${new Date().toISOString()}`,
    `Total: ${results.length} test cases`,
    '',
  ]

  // Summary statistics if graded
  const gradedResults = results.filter(isGraded)
  if (gradedResults.length > 0) {
    const passed = gradedResults.filter((r) => r.score.pass).length
    const avgScore = gradedResults.reduce((sum, r) => sum + r.score.score, 0) / gradedResults.length

    lines.push('## Summary')
    lines.push('')
    lines.push(
      `- **Pass rate**: ${passed}/${gradedResults.length} (${((passed / gradedResults.length) * 100).toFixed(1)}%)`,
    )
    lines.push(`- **Average score**: ${avgScore.toFixed(3)}`)
    lines.push('')
  }

  lines.push('## Results')
  lines.push('')

  for (const result of results) {
    const input = Array.isArray(result.input) ? result.input.join(' → ') : result.input
    const inputPreview = input.length > 100 ? `${input.slice(0, 100)}...` : input

    lines.push(`### ${result.id}`)
    lines.push('')
    lines.push(`**Input**: ${inputPreview}`)
    lines.push('')

    if (result.hint) {
      lines.push(`**Hint**: ${result.hint}`)
      lines.push('')
    }

    const outputPreview = result.output.length > 500 ? `${result.output.slice(0, 500)}...` : result.output

    // A fence is only closed by a backtick run at least as long as the opener,
    // so use one backtick more than the longest run found in the preview (minimum three).
    const backtickRuns = outputPreview.match(/`+/g) ?? []
    const longestRun = backtickRuns.reduce((longest, run) => Math.max(longest, run.length), 0)
    const fence = '`'.repeat(Math.max(3, longestRun + 1))

    lines.push(`**Output**:`)
    lines.push(fence)
    lines.push(outputPreview)
    lines.push(fence)
    lines.push('')

    if (isGraded(result)) {
      const icon = result.score.pass ? '✅' : '❌'
      lines.push(`**Score**: ${icon} ${result.score.score.toFixed(3)} (${result.score.pass ? 'PASS' : 'FAIL'})`)
      if (result.score.reasoning) {
        lines.push(`**Reasoning**: ${result.score.reasoning}`)
      }
      lines.push('')
    }

    if (result.toolErrors) {
      lines.push('⚠️ **Tool errors detected**')
      lines.push('')
    }

    if ('error' in result && result.error) {
      lines.push(`❌ **Error**: ${result.error}`)
      lines.push('')
    }

    lines.push('---')
    lines.push('')
  }

  return lines.join('\n')
}
|
|
109
|
+
|
|
110
|
+
/**
 * Format results as CSV.
 *
 * @remarks
 * Columns are `id, input, hint, output, tool_errors, duration_ms`, plus
 * `pass, score, reasoning` when at least one result is graded (ungraded rows
 * then get three empty cells). Text cells are wrapped in double quotes with
 * embedded quotes doubled. Line breaks inside a cell (`\r\n`, `\r` or `\n`)
 * are replaced by the two-character sequence `\n` so every record stays on a
 * single physical line. Multi-turn inputs are joined with ' | '.
 *
 * @param results - Results to format
 * @returns CSV string with a header row, rows separated by newlines
 */
const formatCsv = (results: FormattableResult[]): string => {
  // Quote a text cell: double embedded quotes and flatten every kind of line break.
  const escapeCsv = (str: string) => `"${str.replace(/"/g, '""').replace(/\r\n|\r|\n/g, '\\n')}"`

  const lines: string[] = []

  // Header
  const hasScores = results.some(isGraded)
  const headers = ['id', 'input', 'hint', 'output', 'tool_errors', 'duration_ms']
  if (hasScores) {
    headers.push('pass', 'score', 'reasoning')
  }
  lines.push(headers.join(','))

  // Data rows
  for (const result of results) {
    const input = Array.isArray(result.input) ? result.input.join(' | ') : result.input

    const row = [
      escapeCsv(result.id),
      escapeCsv(input),
      escapeCsv(result.hint ?? ''),
      escapeCsv(result.output),
      result.toolErrors ? 'true' : 'false',
      String(result.timing.total),
    ]

    if (hasScores) {
      if (isGraded(result)) {
        row.push(
          result.score.pass ? 'true' : 'false',
          result.score.score.toFixed(3),
          escapeCsv(result.score.reasoning ?? ''),
        )
      } else {
        // Keep the column count stable for ungraded rows.
        row.push('', '', '')
      }
    }

    lines.push(row.join(','))
  }

  return lines.join('\n')
}
|
|
158
|
+
|
|
159
|
+
/**
 * Execute pipeline format with configuration.
 *
 * @remarks
 * Renders all results into a single string (`jsonl`: one JSON document per
 * line, `markdown`: report, `csv`: table) and writes it in one non-appending
 * `writeOutput` call.
 *
 * @param style - Output format style
 * @param results - Results to format
 * @param outputPath - Optional output file path
 * @param progress - Show progress to stderr
 * @throws Error when `style` is not one of the supported styles, instead of
 * passing an undefined payload on to `writeOutput`
 */
export const runFormat = async (
  style: FormatStyle,
  results: FormattableResult[],
  outputPath?: string,
  progress = false,
): Promise<void> => {
  logProgress(`Formatting ${results.length} results as ${style}`, progress)

  let output: string

  switch (style) {
    case 'jsonl':
      // Pass-through as JSONL
      output = results.map((r) => JSON.stringify(r)).join('\n')
      break

    case 'markdown':
      output = formatMarkdown(results)
      break

    case 'csv':
      output = formatCsv(results)
      break

    default:
      // Reachable only when a caller bypasses the FormatStyle type at runtime.
      throw new Error(`Unsupported format style: ${String(style)}`)
  }

  await writeOutput(output, outputPath, false)
  logProgress('Done!', progress)
}
|
|
195
|
+
|
|
196
|
+
/**
 * Read results from stdin.
 *
 * @remarks
 * Stdin is read to the end, decoded as UTF-8 and parsed as JSONL. Each line is
 * trimmed before parsing and lines that are empty after trimming are dropped,
 * so CRLF line endings and whitespace-only separator lines do not reach
 * `JSON.parse`. Parsed values are cast to `FormattableResult` without validation.
 *
 * @returns Array of parsed results, or null if stdin is a TTY or holds only whitespace
 * @throws SyntaxError when a non-blank line is not valid JSON
 */
const readStdinResults = async (): Promise<FormattableResult[] | null> => {
  // An interactive terminal means nothing was piped in.
  if (process.stdin.isTTY) {
    return null
  }

  const chunks: Buffer[] = []
  for await (const chunk of process.stdin) {
    chunks.push(chunk)
  }

  const content = Buffer.concat(chunks).toString('utf-8').trim()
  if (!content) return null

  return content
    .split('\n')
    .map((line) => line.trim()) // strips the '\r' left behind by CRLF input
    .filter(Boolean)
    .map((line) => JSON.parse(line) as FormattableResult)
}
|
|
219
|
+
|
|
220
|
+
/**
 * CLI entry point for the pipeline `format` command.
 *
 * @remarks
 * With `--help` the usage text is printed and nothing else happens. The style
 * defaults to `jsonl` and must be one of `jsonl`, `markdown` or `csv`. Results
 * come from the first positional argument (a JSONL file) or, when no path is
 * given, from stdin. An invalid style or missing/empty stdin input prints an
 * error and exits with status 1.
 *
 * @param args - Command line arguments (after 'format')
 */
export const format = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      style: { type: 'string', short: 'f', default: 'jsonl' },
      output: { type: 'string', short: 'o' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })
  const options = parsed.values

  if (options.help) {
    const usage = `
Usage: agent-eval-harness format [results.jsonl] [options]

Convert results to different output formats.

Arguments:
results.jsonl Input file (or pipe from stdin)

Options:
-f, --style Output format: jsonl, markdown, csv (default: jsonl)
-o, --output Output file (default: stdout)
--progress Show progress to stderr
-h, --help Show this help message

Examples:
# Convert to markdown report
agent-eval-harness format graded.jsonl --style markdown -o report.md

# Piped from grade
agent-eval-harness grade extracted.jsonl -g ./grader.ts | agent-eval-harness format -f csv

# Full pipeline to markdown
cat prompts.jsonl | \\
agent-eval-harness run -s claude.json | \\
agent-eval-harness extract -s claude.json | \\
agent-eval-harness grade -g ./grader.ts | \\
agent-eval-harness format -f markdown > report.md
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  // The cast is only trusted after the membership check below has passed.
  const supportedStyles = ['jsonl', 'markdown', 'csv']
  const style = options.style as FormatStyle
  if (supportedStyles.indexOf(style) === -1) {
    console.error(`Error: Invalid format style '${style}'. Must be: jsonl, markdown, csv`)
    process.exit(1)
  }

  // A positional path takes precedence; stdin is only consulted without one.
  const inputPath = parsed.positionals[0]
  let results: FormattableResult[]

  if (!inputPath) {
    const piped = await readStdinResults()
    if (!piped || piped.length === 0) {
      console.error('Error: No results provided (use file argument or pipe to stdin)')
      process.exit(1)
    }
    results = piped
  } else {
    results = await loadJsonl<FormattableResult>(inputPath)
  }

  await runFormat(style, results, options.output, options.progress)
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline grade command - apply grader to extracted results.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Takes ExtractedResult from `extract` command and adds grader scores.
|
|
6
|
+
* Uses the same grader loading mechanism as the capture command.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { parseArgs } from 'node:util'
|
|
12
|
+
import { loadJsonl, logProgress, writeOutput } from '../core.ts'
|
|
13
|
+
import { loadGrader } from '../schemas/grader-loader.ts'
|
|
14
|
+
import type { ExtractedResult, GradedResult } from './pipeline.types.ts'
|
|
15
|
+
|
|
16
|
+
/**
 * Run the grade stage: load the grader and emit one JSON-serialized graded
 * result per extracted result.
 *
 * @remarks
 * The grader receives `input`, `output`, `hint` and `trajectory` of each
 * extracted result; its return value is stored under `score` on a copy of that
 * result. When `outputPath` is given the file is first overwritten with an
 * empty string. The first result is written with the append flag off and every
 * later result with it on. Falsy entries of `extractedResults` are skipped.
 *
 * @param graderPath - Path to grader module or executable
 * @param extractedResults - Extracted results from extract command
 * @param outputPath - Optional output file path
 * @param progress - Show progress to stderr
 */
export const runGrade = async (
  graderPath: string,
  extractedResults: ExtractedResult[],
  outputPath?: string,
  progress = false,
): Promise<void> => {
  const grader = await loadGrader(graderPath)

  logProgress(`Grading with: ${graderPath}`, progress)

  // Start from an empty file so appended results do not mix with old content.
  if (outputPath) {
    await Bun.write(outputPath, '')
  }

  let writtenCount = 0
  for (const [index, extracted] of extractedResults.entries()) {
    if (!extracted) continue

    logProgress(`[${index + 1}/${extractedResults.length}] ${extracted.id}`, progress)

    // Only the fields the grader contract needs are forwarded.
    const { input, output, hint, trajectory } = extracted
    const score = await grader({ input, output, hint, trajectory })
    const graded: GradedResult = { ...extracted, score }

    const icon = score.pass ? '✓' : '✗'
    logProgress(` ${icon} score=${score.score.toFixed(2)}`, progress)

    // Only results after the first one are appended.
    await writeOutput(JSON.stringify(graded), outputPath, writtenCount > 0)
    writtenCount += 1
  }

  logProgress('Done!', progress)
}
|
|
70
|
+
|
|
71
|
+
/**
 * Read extracted results from stdin.
 *
 * @remarks
 * Stdin is read to the end, decoded as UTF-8 and parsed as JSONL. Each line is
 * trimmed before parsing and lines that are empty after trimming are dropped,
 * so CRLF line endings and whitespace-only separator lines do not reach
 * `JSON.parse`. Parsed values are cast to `ExtractedResult` without validation.
 *
 * @returns Array of parsed extracted results, or null if stdin is a TTY or holds only whitespace
 * @throws SyntaxError when a non-blank line is not valid JSON
 */
const readStdinExtracted = async (): Promise<ExtractedResult[] | null> => {
  // An interactive terminal means nothing was piped in.
  if (process.stdin.isTTY) {
    return null
  }

  const chunks: Buffer[] = []
  for await (const chunk of process.stdin) {
    chunks.push(chunk)
  }

  const content = Buffer.concat(chunks).toString('utf-8').trim()
  if (!content) return null

  return content
    .split('\n')
    .map((line) => line.trim()) // strips the '\r' left behind by CRLF input
    .filter(Boolean)
    .map((line) => JSON.parse(line) as ExtractedResult)
}
|
|
94
|
+
|
|
95
|
+
/**
 * CLI entry point for the pipeline `grade` command.
 *
 * @remarks
 * With `--help` the usage text is printed and nothing else happens. Otherwise
 * `--grader` is mandatory; extracted results come from the first positional
 * argument (a JSONL file) or, when no path is given, from stdin. A missing
 * grader or missing/empty stdin input prints an error and exits with status 1.
 *
 * @param args - Command line arguments (after 'grade')
 */
export const grade = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      grader: { type: 'string', short: 'g' },
      output: { type: 'string', short: 'o' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })
  const { grader, output, progress, help } = parsed.values

  if (help) {
    const usage = `
Usage: agent-eval-harness grade [extracted.jsonl] --grader <grader> [options]

Apply grader to extracted results.

Arguments:
extracted.jsonl Input file from 'extract' command (or pipe from stdin)

Options:
-g, --grader Path to grader (.ts/.js module or executable script) (required)
-o, --output Output file (default: stdout)
--progress Show progress to stderr
-h, --help Show this help message

Graders:
TS/JS modules must export a 'grade' function.
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.

Examples:
# From file
agent-eval-harness grade extracted.jsonl --grader ./grader.ts -o graded.jsonl

# Piped from extract
agent-eval-harness extract raw.jsonl -s claude.json | agent-eval-harness grade -g ./grader.ts

# Full pipeline
cat prompts.jsonl | \\
agent-eval-harness run -s claude.json | \\
agent-eval-harness extract -s claude.json | \\
agent-eval-harness grade -g ./grader.ts > results.jsonl
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  if (!grader) {
    console.error('Error: --grader is required')
    process.exit(1)
  }

  // A positional path takes precedence; stdin is only consulted without one.
  const inputPath = parsed.positionals[0]
  let extractedResults: ExtractedResult[]

  if (!inputPath) {
    const piped = await readStdinExtracted()
    if (!piped || piped.length === 0) {
      console.error('Error: No extracted results provided (use file argument or pipe to stdin)')
      process.exit(1)
    }
    extractedResults = piped
  } else {
    extractedResults = await loadJsonl<ExtractedResult>(inputPath)
  }

  await runGrade(grader, extractedResults, output, progress)
}
|