@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59):
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,239 @@
1
+ /**
2
+ * Schemas command - export JSON schemas for non-TypeScript users.
3
+ *
4
+ * @remarks
5
+ * Uses Zod 4's native `z.toJSONSchema()` to generate JSON Schema from
6
+ * the harness schemas. Useful for validation in other languages/tools.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
import { resolve } from 'node:path'
import { parseArgs } from 'node:util'
import { z } from 'zod'
import * as schemas from './schemas.ts'
14
+
15
+ // ============================================================================
16
+ // Schema Registry
17
+ // ============================================================================
18
+
/**
 * Schemas that can be exported, keyed by their public name.
 *
 * @remarks
 * Entry order matters: the name listings and the combined export iterate this
 * object, so they appear in the order written here.
 */
const SCHEMA_REGISTRY: Record<string, z.ZodSchema> = Object.fromEntries<z.ZodSchema>([
  ['PromptCase', schemas.PromptCaseSchema],
  ['GraderResult', schemas.GraderResultSchema],
  ['TrajectoryStep', schemas.TrajectoryStepSchema],
  ['CaptureResult', schemas.CaptureResultSchema],
  ['SummaryResult', schemas.SummaryResultSchema],
  ['TrialEntry', schemas.TrialEntrySchema],
  ['TrialResult', schemas.TrialResultSchema],
  ['CalibrationSample', schemas.CalibrationSampleSchema],
  ['BalanceAnalysis', schemas.BalanceAnalysisSchema],
  ['ValidationResult', schemas.ValidationResultSchema],
  ['McpServerConfig', schemas.McpServerSchema],
  ['Session', schemas.SessionSchema],
  ['JsonRpcRequest', schemas.JsonRpcRequestSchema],
  ['JsonRpcResponse', schemas.JsonRpcResponseSchema],
  ['JsonRpcError', schemas.JsonRpcErrorSchema],
])
37
+
38
+ // ============================================================================
39
+ // Types
40
+ // ============================================================================
41
+
/**
 * Options accepted by {@link runSchemas}.
 *
 * @remarks
 * All fields are optional; the boolean flags are treated as `false` when omitted.
 */
export type SchemasConfig = {
  /** Name of a single schema to export; when omitted, every schema is processed */
  schemaName?: string
  /** Destination file, or destination directory when `split` is set */
  outputPath?: string
  /** In all-schemas mode, emit one combined JSON document instead of printing names */
  json?: boolean
  /** In all-schemas mode, write one `<name>.json` file per schema into `outputPath` */
  split?: boolean
  /** Print the available schema names and return them */
  list?: boolean
}
55
+
56
+ // ============================================================================
57
+ // Helpers
58
+ // ============================================================================
59
+
/**
 * Resolve a user-supplied path against `process.cwd()`.
 *
 * @remarks
 * Delegates to `resolve` from `node:path` rather than concatenating strings,
 * so absolute paths are recognised using the platform's own rules and the
 * result is normalized (no trailing separator, no doubled separators, `..`
 * segments collapsed).
 *
 * @param path - Absolute path, or path relative to the current working directory
 * @returns Absolute, normalized path
 */
const resolvePath = (path: string): string => resolve(process.cwd(), path)
65
+
/**
 * Convert a Zod schema into a JSON Schema document.
 *
 * @remarks
 * `$schema` and `title` are defaults: they are written first, so any key of
 * the same name in the generated schema replaces them. A conversion failure is
 * not propagated; the returned document then carries only the defaults plus a
 * `description` recording the failure message.
 *
 * @param schema - Zod schema to convert
 * @param name - Schema name, used as the default `title`
 * @returns JSON Schema object
 */
const toJsonSchema = (schema: z.ZodSchema, name: string): object => {
  const defaults = {
    $schema: 'https://json-schema.org/draft/2020-12/schema',
    title: name,
  }

  try {
    // Zod 4's native JSON Schema generation
    return { ...defaults, ...z.toJSONSchema(schema) }
  } catch (error) {
    // Fallback for schemas that can't be converted
    const reason = error instanceof Error ? error.message : 'unknown error'
    return {
      ...defaults,
      description: `Schema for ${name} (auto-generation failed: ${reason})`,
    }
  }
}
85
+
86
+ // ============================================================================
87
+ // Schemas Implementation
88
+ // ============================================================================
89
+
/**
 * Execute schemas command with configuration object.
 *
 * @remarks
 * Modes are checked in this order:
 * 1. `list` - print the registered schema names and return them.
 * 2. `schemaName` - export that one schema to `outputPath`, or to stdout when
 *    no path is given. An unknown name prints an error to stderr and exits the
 *    process with code 1.
 * 3. otherwise - convert every registered schema, then either write one file
 *    per schema (`split` together with `outputPath`), emit a single combined
 *    JSON document (`json`), or print the schema names.
 *
 * @param config - Schemas configuration
 * @returns The schema names in list mode; otherwise the generated JSON schemas keyed by name
 */
export const runSchemas = async (config: SchemasConfig): Promise<Record<string, object> | string[]> => {
  const { schemaName, outputPath, json = false, split = false, list = false } = config

  // List mode
  if (list) {
    const names = Object.keys(SCHEMA_REGISTRY)
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log('Available schemas:')
    for (const name of names) {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(` - ${name}`)
    }
    return names
  }

  // Single schema mode
  if (schemaName) {
    // Own-property check: a plain index lookup also resolves inherited
    // Object.prototype members (e.g. "toString", "constructor"), which would
    // slip past the unknown-schema error below.
    const schema = Object.prototype.hasOwnProperty.call(SCHEMA_REGISTRY, schemaName)
      ? SCHEMA_REGISTRY[schemaName]
      : undefined
    if (!schema) {
      console.error(`Error: Unknown schema '${schemaName}'`)
      console.error(`Available: ${Object.keys(SCHEMA_REGISTRY).join(', ')}`)
      process.exit(1)
    }

    const jsonSchema = toJsonSchema(schema, schemaName)
    const output = JSON.stringify(jsonSchema, null, 2)

    if (outputPath) {
      await Bun.write(resolvePath(outputPath), output)
    } else {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(output)
    }

    return { [schemaName]: jsonSchema }
  }

  // All schemas mode
  const allSchemas: Record<string, object> = {}

  for (const [name, schema] of Object.entries(SCHEMA_REGISTRY)) {
    allSchemas[name] = toJsonSchema(schema, name)
  }

  if (split && !outputPath) {
    // Splitting needs a target directory; say so instead of silently falling
    // through to the combined/list output. Goes to stderr so stdout stays clean.
    console.error('Warning: --split requires --output <dir>; ignoring --split')
  }

  if (split && outputPath) {
    // Create directory and write separate files
    const dir = resolvePath(outputPath)
    await Bun.$`mkdir -p ${dir}`

    for (const [name, jsonSchema] of Object.entries(allSchemas)) {
      const filePath = `${dir}/${name}.json`
      await Bun.write(filePath, JSON.stringify(jsonSchema, null, 2))
    }

    console.error(`Wrote ${Object.keys(allSchemas).length} schema files to ${dir}/`)
  } else if (json) {
    const output = JSON.stringify(allSchemas, null, 2)

    if (outputPath) {
      await Bun.write(resolvePath(outputPath), output)
    } else {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(output)
    }
  } else {
    // Default: list schemas
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log('Available schemas (use --json to export):')
    for (const name of Object.keys(allSchemas)) {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(` - ${name}`)
    }
  }

  return allSchemas
}
172
+
173
+ // ============================================================================
174
+ // CLI Entry Point
175
+ // ============================================================================
176
+
/**
 * CLI entry point for the `schemas` command.
 *
 * @remarks
 * Flags are parsed with `parseArgs`. `--help` prints the usage text and
 * returns without doing anything else. Otherwise the first positional
 * argument, if present, is taken as the schema name and all options are
 * forwarded to {@link runSchemas}.
 *
 * @param args - Command line arguments (after 'schemas')
 */
export const schemasCli = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      json: { type: 'boolean', short: 'j', default: false },
      split: { type: 'boolean', short: 's', default: false },
      list: { type: 'boolean', short: 'l', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  const { output, json = false, split = false, list = false, help } = parsed.values
  const [schemaName] = parsed.positionals

  if (help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness schemas [schema-name] [options]

Arguments:
schema-name Specific schema to export (optional)

Options:
-o, --output Output file or directory (with --split)
-j, --json Export as JSON (default: list names)
-s, --split Split into separate files (requires --output dir)
-l, --list List available schemas
-h, --help Show this help message

Available Schemas:
PromptCase, GraderResult, TrajectoryStep, CaptureResult, SummaryResult,
TrialEntry, TrialResult, CalibrationSample, BalanceAnalysis, ValidationResult,
McpServerConfig, Session, JsonRpcRequest, JsonRpcResponse, JsonRpcError

Examples:
# List available schemas
agent-eval-harness schemas --list

# Export all schemas as single JSON file
agent-eval-harness schemas --json -o schemas.json

# Export specific schema
agent-eval-harness schemas CaptureResult --json
agent-eval-harness schemas TrialResult --json -o trial-schema.json

# Export all schemas as separate files
agent-eval-harness schemas --json --split -o schemas/
`)
    return
  }

  await runSchemas({
    schemaName,
    outputPath: output,
    json,
    split,
    list,
  })
}