@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,310 @@
1
+ /**
2
+ * Zod schemas for headless adapter configuration.
3
+ *
4
+ * @remarks
5
+ * These schemas define how to interact with ANY headless CLI agent via a
6
+ * schema-driven approach. No hardcoded agent-specific logic - the schema
7
+ * defines everything: command, flags, output parsing rules.
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+
12
+ import { z } from 'zod'
13
+
14
+ // ============================================================================
15
+ // Output Event Mapping Schema
16
+ // ============================================================================
17
+
18
/**
 * Matcher describing which CLI output events a mapping applies to.
 *
 * @remarks
 * A JSONPath-like `path` is paired with the string `value` expected at that
 * location in a CLI JSON output event. Both fields are required strings.
 */
const outputEventMatchShape = {
  /** JSONPath locating the event type in CLI output (e.g., "$.type") */
  path: z.string(),
  /** Value expected at that path (e.g., "tool_use") */
  value: z.string(),
}

export const OutputEventMatchSchema = z.object(outputEventMatchShape)

/** Type inferred from {@link OutputEventMatchSchema} */
export type OutputEventMatch = z.infer<typeof OutputEventMatchSchema>
34
+
35
/**
 * Describes which pieces of a matched event should be pulled out.
 *
 * @remarks
 * Every field is an optional string. Each string may be either:
 * - a JSONPath expression (e.g., "$.message.text"), or
 * - a literal wrapped in single quotes (e.g., "'pending'")
 */
const outputEventExtractShape = {
  /** JSONPath for the main content */
  content: z.string().optional(),
  /** JSONPath for the title (used for tool calls) */
  title: z.string().optional(),
  /** JSONPath for the status, or a quoted literal such as "'pending'" */
  status: z.string().optional(),
}

export const OutputEventExtractSchema = z.object(outputEventExtractShape)

/** Type inferred from {@link OutputEventExtractSchema} */
export type OutputEventExtract = z.infer<typeof OutputEventExtractSchema>
54
+
55
/** Session update types a mapping is allowed to emit */
const emittedUpdateType = z.enum(['thought', 'tool_call', 'message', 'plan'])

/**
 * Maps one kind of CLI output event onto a session update type.
 *
 * @remarks
 * A mapping is made of three parts:
 * 1. `match` - how to recognise the event (path + value)
 * 2. `emitAs` - which session update type to produce
 * 3. `extract` - optional rules for the content to pull out of the event
 */
export const OutputEventMappingSchema = z.object({
  /** Criteria an output event must meet for this mapping to apply */
  match: OutputEventMatchSchema,
  /** Session update type to emit for matching events */
  emitAs: emittedUpdateType,
  /** Optional content extraction rules */
  extract: OutputEventExtractSchema.optional(),
})

/** Type inferred from {@link OutputEventMappingSchema} */
export type OutputEventMapping = z.infer<typeof OutputEventMappingSchema>
75
+
76
+ // ============================================================================
77
+ // Prompt Configuration Schema
78
+ // ============================================================================
79
+
80
/**
 * Describes how the prompt is handed to the CLI.
 *
 * @remarks
 * Three modes are supported:
 * 1. **Flag-based**: `flag: "-p"` - prompt follows a command-line flag
 * 2. **Positional**: `flag: ""` - prompt is a positional argument
 * 3. **Stdin**: `stdin: true` - prompt is written to stdin (the command should include `-` or equivalent)
 *
 * The refinement rejects configs where `flag` and `stdin` are both truthy.
 * Because an empty string is falsy, `{ flag: "", stdin: true }` passes this check.
 */
const promptConfigShape = {
  /** Flag used to pass the prompt (e.g., "-p", "--prompt"). Empty string means positional. */
  flag: z.string().optional(),
  /** Pass the prompt over stdin instead of command args */
  stdin: z.boolean().optional(),
  /** Format for stdin input in stream mode */
  stdinFormat: z.enum(['text', 'json']).optional(),
}

export const PromptConfigSchema = z.object(promptConfigShape).refine(
  // Valid as long as at least one of the two modes is left unset/falsy.
  ({ flag, stdin }) => !flag || !stdin,
  {
    message: "Cannot specify both 'flag' and 'stdin' modes - use either flag-based or stdin mode, not both",
  },
)

/** Type inferred from {@link PromptConfigSchema} */
export type PromptConfig = z.infer<typeof PromptConfigSchema>
104
+
105
+ // ============================================================================
106
+ // Output Configuration Schema
107
+ // ============================================================================
108
+
109
/**
 * Describes the flag/value pair that selects the CLI's output format.
 *
 * @remarks
 * Both fields are required strings.
 */
const outputConfigShape = {
  /** Flag selecting the output format (e.g., "--output-format") */
  flag: z.string(),
  /** Value passed for the output format (e.g., "stream-json") */
  value: z.string(),
}

export const OutputConfigSchema = z.object(outputConfigShape)

/** Type inferred from {@link OutputConfigSchema} */
export type OutputConfig = z.infer<typeof OutputConfigSchema>
121
+
122
+ // ============================================================================
123
+ // Resume Configuration Schema
124
+ // ============================================================================
125
+
126
/**
 * Describes session resume support (stream mode).
 *
 * @remarks
 * Both fields are required strings.
 */
const resumeConfigShape = {
  /** Flag used to resume a session (e.g., "--resume") */
  flag: z.string(),
  /** JSONPath used to pull the session ID out of the output */
  sessionIdPath: z.string(),
}

export const ResumeConfigSchema = z.object(resumeConfigShape)

/** Type inferred from {@link ResumeConfigSchema} */
export type ResumeConfig = z.infer<typeof ResumeConfigSchema>
138
+
139
+ // ============================================================================
140
+ // Result Configuration Schema
141
+ // ============================================================================
142
+
143
/**
 * Describes how the final result is recognised and extracted.
 *
 * @remarks
 * All three fields are required strings.
 */
const resultConfigShape = {
  /** JSONPath locating the result type (e.g., "$.type") */
  matchPath: z.string(),
  /** Value that marks an event as the final result (e.g., "result") */
  matchValue: z.string(),
  /** JSONPath used to extract the result content */
  contentPath: z.string(),
}

export const ResultConfigSchema = z.object(resultConfigShape)

/** Type inferred from {@link ResultConfigSchema} */
export type ResultConfig = z.infer<typeof ResultConfigSchema>
157
+
158
+ // ============================================================================
159
+ // Passthrough Type Mapping Schema
160
+ // ============================================================================
161
+
162
/** Session update types that an agent-native type value may map to */
const passthroughTargetType = z.enum(['thought', 'tool_call', 'message', 'plan'])

/** Record from agent-native type values to session update types */
const passthroughTypeValues = z.record(z.string(), passthroughTargetType)

/**
 * Type mapping used in passthrough output mode.
 *
 * @remarks
 * Applies when outputMode is 'passthrough': the agent's own type names are
 * mapped to the standard session update types. Intended for agents whose
 * output is already well structured and needs no complex JSONPath parsing.
 */
export const PassthroughTypeMapSchema = z.object({
  /** JSON field holding the event type; falls back to "type" when omitted */
  typeField: z.string().default('type'),
  /** Optional mapping from agent type values to session update types */
  typeValues: passthroughTypeValues.optional(),
})

/** Type inferred from {@link PassthroughTypeMapSchema} */
export type PassthroughTypeMap = z.infer<typeof PassthroughTypeMapSchema>
179
+
180
+ // ============================================================================
181
+ // Main Adapter Schema
182
+ // ============================================================================
183
+
184
/**
 * Schema for headless adapter configuration.
 *
 * @remarks
 * This schema defines everything needed to interact with a headless CLI agent:
 * - Command and flags to spawn
 * - How to pass prompts
 * - How to parse output (jsonpath or passthrough mode)
 * - Session handling mode
 *
 * Supports two output parsing modes:
 * - 'jsonpath': Use outputEvents for complex JSONPath-based parsing (default)
 * - 'passthrough': Direct type mapping for well-structured output
 *
 * Validation rejects configurations that cannot be run meaningfully:
 * - `command` must contain at least one element (there must be something to spawn)
 * - `timeout`, when provided, must be greater than zero
 *
 * Example (Claude):
 * ```json
 * {
 *   "version": 1,
 *   "name": "claude-headless",
 *   "command": ["claude"],
 *   "sessionMode": "stream",
 *   "timeout": 90000,
 *   "prompt": { "flag": "-p" },
 *   "output": { "flag": "--output-format", "value": "stream-json" },
 *   "outputEvents": [...]
 * }
 * ```
 */
export const HeadlessAdapterSchema = z.object({
  /** Schema version; only the literal `1` is accepted */
  version: z.literal(1),

  /** Human-readable adapter name */
  name: z.string(),

  /**
   * Base command to spawn (e.g., ["claude"], ["gemini"]).
   * An empty array is rejected because it names no executable.
   */
  command: z.array(z.string()).min(1, 'command must contain at least one element (the executable to spawn)'),

  /**
   * Session mode determines how multi-turn conversations work:
   * - 'stream': Keep process alive, multi-turn via stdin
   * - 'iterative': New process per turn, accumulate context in prompt
   */
  sessionMode: z.enum(['stream', 'iterative']),

  /**
   * Default timeout for this agent in milliseconds (can be overridden per-prompt).
   * Must be greater than zero when provided.
   */
  timeout: z.number().positive().optional(),

  /** How to pass the prompt */
  prompt: PromptConfigSchema,

  /** Output format configuration */
  output: OutputConfigSchema,

  /** Flags for auto-approval in headless mode (e.g., ["--allowedTools", "*"]) */
  autoApprove: z.array(z.string()).optional(),

  /** Session resume support (stream mode only) */
  resume: ResumeConfigSchema.optional(),

  /** Working directory flag (if CLI needs explicit --cwd) */
  cwdFlag: z.string().optional(),

  /**
   * Output parsing mode:
   * - 'jsonpath': Use outputEvents for complex JSONPath-based parsing (default)
   * - 'passthrough': Direct type mapping for well-structured output
   */
  outputMode: z.enum(['jsonpath', 'passthrough']).default('jsonpath'),

  /** Output event mappings - how to parse CLI output into updates (jsonpath mode) */
  outputEvents: z.array(OutputEventMappingSchema).optional(),

  /** Type mapping for passthrough mode */
  passthroughTypeMap: PassthroughTypeMapSchema.optional(),

  /** Final result extraction configuration */
  result: ResultConfigSchema,

  /**
   * Template for formatting conversation history (iterative mode only).
   *
   * @remarks
   * Supports both string format (simple) and object format (advanced):
   * - String: "User: {{input}}\nAssistant: {{output}}"
   * - Object: { system: "...", turnFormat: "..." }
   */
  historyTemplate: z
    .union([
      z.string(),
      z.object({
        /** System prefix for accumulated history */
        system: z.string().optional(),
        /** Format for each turn: {{input}} and {{output}} placeholders */
        turnFormat: z.string(),
      }),
    ])
    .optional(),
})

/** Headless adapter configuration type (output of {@link HeadlessAdapterSchema}, defaults applied) */
export type HeadlessAdapterConfig = z.infer<typeof HeadlessAdapterSchema>
286
+
287
+ // ============================================================================
288
+ // Validation Helpers
289
+ // ============================================================================
290
+
291
/**
 * Parses raw input into a validated headless adapter configuration.
 *
 * @param config - Untrusted value to validate (e.g., loaded from a JSON file)
 * @returns The value produced by `HeadlessAdapterSchema.parse`
 * @throws ZodError when the input does not satisfy `HeadlessAdapterSchema`
 */
export const parseHeadlessConfig = (config: unknown): HeadlessAdapterConfig => HeadlessAdapterSchema.parse(config)
301
+
302
/**
 * Non-throwing variant of headless adapter configuration validation.
 *
 * @param config - Untrusted value to validate
 * @returns The result of `HeadlessAdapterSchema.safeParse`: a success/failure
 * outcome carrying either the parsed data or the validation error
 */
export const safeParseHeadlessConfig = (config: unknown) => HeadlessAdapterSchema.safeParse(config)
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Type exports for headless adapter.
3
+ *
4
+ * @remarks
5
+ * Re-exports all types from the schemas module for external consumers.
6
+ *
7
+ * @packageDocumentation
8
+ */
9
+
10
// Type-only re-exports of every configuration type declared in headless.schemas.ts.
// Keep this list in sync with that module (alphabetical order).
export type {
  HeadlessAdapterConfig,
  OutputConfig,
  OutputEventExtract,
  OutputEventMapping,
  OutputEventMatch,
  PassthroughTypeMap,
  PromptConfig,
  ResultConfig,
  ResumeConfig,
} from './headless.schemas.ts'