@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schemas command - export JSON schemas for non-TypeScript users.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Uses Zod 4's native `z.toJSONSchema()` to generate JSON Schema from
|
|
6
|
+
* the harness schemas. Useful for validation in other languages/tools.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { isAbsolute, resolve } from 'node:path'
import { parseArgs } from 'node:util'
import { z } from 'zod'
import * as schemas from './schemas.ts'
|
|
14
|
+
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// Schema Registry
|
|
17
|
+
// ============================================================================
|
|
18
|
+
|
|
19
|
+
/**
 * Registry of exportable schemas, keyed by the public name used on the CLI.
 *
 * @remarks
 * Entry order is significant: it is the order in which names are listed and
 * in which schemas appear in the combined export.
 */
const SCHEMA_REGISTRY: Record<string, z.ZodSchema> = Object.fromEntries<z.ZodSchema>([
  ['PromptCase', schemas.PromptCaseSchema],
  ['GraderResult', schemas.GraderResultSchema],
  ['TrajectoryStep', schemas.TrajectoryStepSchema],
  ['CaptureResult', schemas.CaptureResultSchema],
  ['SummaryResult', schemas.SummaryResultSchema],
  ['TrialEntry', schemas.TrialEntrySchema],
  ['TrialResult', schemas.TrialResultSchema],
  ['CalibrationSample', schemas.CalibrationSampleSchema],
  ['BalanceAnalysis', schemas.BalanceAnalysisSchema],
  ['ValidationResult', schemas.ValidationResultSchema],
  ['McpServerConfig', schemas.McpServerSchema],
  ['Session', schemas.SessionSchema],
  ['JsonRpcRequest', schemas.JsonRpcRequestSchema],
  ['JsonRpcResponse', schemas.JsonRpcResponseSchema],
  ['JsonRpcError', schemas.JsonRpcErrorSchema],
])
|
|
37
|
+
|
|
38
|
+
// ============================================================================
|
|
39
|
+
// Types
|
|
40
|
+
// ============================================================================
|
|
41
|
+
|
|
42
|
+
/**
 * Options accepted by the schemas command.
 *
 * @remarks
 * Every field is optional; omitted boolean flags are treated as `false`.
 */
export type SchemasConfig = {
  /** Name of a single schema to export; when omitted, all schemas are processed */
  schemaName?: string
  /** Destination file, or destination directory when `split` is set */
  outputPath?: string
  /** Emit JSON instead of a plain list of schema names */
  json?: boolean
  /** Write one file per schema rather than a single combined document */
  split?: boolean
  /** Print the names of the available schemas */
  list?: boolean
}
|
|
55
|
+
|
|
56
|
+
// ============================================================================
|
|
57
|
+
// Helpers
|
|
58
|
+
// ============================================================================
|
|
59
|
+
|
|
60
|
+
/**
 * Resolve a user-supplied path against the current working directory.
 *
 * @remarks
 * Absolute paths (as judged by `node:path` for the host platform) are
 * returned unchanged. Relative paths are resolved with `path.resolve`, which
 * collapses `.`/`..` segments and drops any trailing separator, so callers
 * can safely append `/<file>` to the result without producing `//`.
 *
 * @param path - Absolute or cwd-relative path
 * @returns Absolute path
 */
const resolvePath = (path: string): string => {
  if (isAbsolute(path)) return path
  return resolve(process.cwd(), path)
}
|
|
65
|
+
|
|
66
|
+
/**
 * Convert a Zod schema into a JSON Schema document titled with `name`.
 *
 * @remarks
 * The `$schema` and `title` defaults come first and the generated schema is
 * spread over them, so any `$schema`/`title` produced by the generator wins.
 * If generation throws, a stub document is returned whose `description`
 * carries the failure reason instead of propagating the error.
 *
 * @param schema - Zod schema to convert
 * @param name - Title to give the resulting document
 * @returns JSON Schema object, or a stub describing the conversion failure
 */
const toJsonSchema = (schema: z.ZodSchema, name: string): object => {
  const header = {
    $schema: 'https://json-schema.org/draft/2020-12/schema',
    title: name,
  }

  try {
    // Zod 4's native JSON Schema generation
    return { ...header, ...z.toJSONSchema(schema) }
  } catch (error) {
    // Best-effort fallback for schemas that can't be converted
    const reason = error instanceof Error ? error.message : 'unknown error'
    return {
      ...header,
      description: `Schema for ${name} (auto-generation failed: ${reason})`,
    }
  }
}
|
|
85
|
+
|
|
86
|
+
// ============================================================================
|
|
87
|
+
// Schemas Implementation
|
|
88
|
+
// ============================================================================
|
|
89
|
+
|
|
90
|
+
/**
 * Run the schemas command from a configuration object.
 *
 * @remarks
 * Modes are checked in this order:
 * 1. `list` - print the registered names to stdout and return them.
 * 2. `schemaName` - convert that one schema and write it to `outputPath`, or
 *    print it to stdout when no path is given. An unknown name is reported on
 *    stderr and the process exits with code 1.
 * 3. Otherwise every registered schema is converted, then:
 *    `split` with `outputPath` writes one `<name>.json` per schema into that
 *    directory; `json` writes or prints the combined document; with neither,
 *    only the names are printed.
 *
 * @param config - Schemas configuration
 * @returns The list of names in list mode, otherwise a map of schema name to
 * generated JSON Schema
 */
export const runSchemas = async (config: SchemasConfig): Promise<Record<string, object> | string[]> => {
  const target = config.outputPath

  // Print each name as a bullet on stdout.
  const printNames = (names: string[]): void => {
    for (const entry of names) {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(` - ${entry}`)
    }
  }

  // Send serialized JSON to the output file when one was given, otherwise to stdout.
  const emit = async (text: string): Promise<void> => {
    if (target) {
      await Bun.write(resolvePath(target), text)
      return
    }
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(text)
  }

  // Mode 1: explicit listing
  if (config.list) {
    const registered = Object.keys(SCHEMA_REGISTRY)
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log('Available schemas:')
    printNames(registered)
    return registered
  }

  // Mode 2: a single named schema
  const requested = config.schemaName
  if (requested) {
    const zodSchema = SCHEMA_REGISTRY[requested]
    if (!zodSchema) {
      console.error(`Error: Unknown schema '${requested}'`)
      console.error(`Available: ${Object.keys(SCHEMA_REGISTRY).join(', ')}`)
      process.exit(1)
    }

    const converted = toJsonSchema(zodSchema, requested)
    await emit(JSON.stringify(converted, null, 2))
    return { [requested]: converted }
  }

  // Mode 3: every registered schema
  const everything: Record<string, object> = {}
  Object.entries(SCHEMA_REGISTRY).forEach(([key, zodSchema]) => {
    everything[key] = toJsonSchema(zodSchema, key)
  })

  if (config.split && target) {
    // Create the directory, then write the files one after another
    const dir = resolvePath(target)
    await Bun.$`mkdir -p ${dir}`

    for (const [key, document] of Object.entries(everything)) {
      await Bun.write(`${dir}/${key}.json`, JSON.stringify(document, null, 2))
    }

    console.error(`Wrote ${Object.keys(everything).length} schema files to ${dir}/`)
    return everything
  }

  if (config.json) {
    await emit(JSON.stringify(everything, null, 2))
    return everything
  }

  // Default: only list the names
  // biome-ignore lint/suspicious/noConsole: CLI stdout output
  console.log('Available schemas (use --json to export):')
  printNames(Object.keys(everything))
  return everything
}
|
|
172
|
+
|
|
173
|
+
// ============================================================================
|
|
174
|
+
// CLI Entry Point
|
|
175
|
+
// ============================================================================
|
|
176
|
+
|
|
177
|
+
/**
 * CLI entry point for the schemas command.
 *
 * @remarks
 * Parses the flags with `node:util` `parseArgs`. With `--help` the usage text
 * is printed and nothing else happens; otherwise the first positional
 * argument (if any) is taken as the schema name and the parsed flags are
 * handed to `runSchemas`.
 *
 * @param args - Command line arguments (after 'schemas')
 */
export const schemasCli = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    allowPositionals: true,
    options: {
      output: { type: 'string', short: 'o' },
      json: { type: 'boolean', short: 'j', default: false },
      split: { type: 'boolean', short: 's', default: false },
      list: { type: 'boolean', short: 'l', default: false },
      help: { type: 'boolean', short: 'h' },
    },
  })
  const flags = parsed.values

  if (flags.help) {
    const usage = `
Usage: agent-eval-harness schemas [schema-name] [options]

Arguments:
schema-name Specific schema to export (optional)

Options:
-o, --output Output file or directory (with --split)
-j, --json Export as JSON (default: list names)
-s, --split Split into separate files (requires --output dir)
-l, --list List available schemas
-h, --help Show this help message

Available Schemas:
PromptCase, GraderResult, TrajectoryStep, CaptureResult, SummaryResult,
TrialEntry, TrialResult, CalibrationSample, BalanceAnalysis, ValidationResult,
McpServerConfig, Session, JsonRpcRequest, JsonRpcResponse, JsonRpcError

Examples:
# List available schemas
agent-eval-harness schemas --list

# Export all schemas as single JSON file
agent-eval-harness schemas --json -o schemas.json

# Export specific schema
agent-eval-harness schemas CaptureResult --json
agent-eval-harness schemas TrialResult --json -o trial-schema.json

# Export all schemas as separate files
agent-eval-harness schemas --json --split -o schemas/
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  // Only the first positional is used as the schema name
  const [schemaName] = parsed.positionals

  await runSchemas({
    schemaName,
    outputPath: flags.output,
    json: Boolean(flags.json),
    split: Boolean(flags.split),
    list: Boolean(flags.list),
  })
}
|