@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod schemas for headless adapter configuration.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* These schemas define how to interact with ANY headless CLI agent via a
|
|
6
|
+
* schema-driven approach. No hardcoded agent-specific logic - the schema
|
|
7
|
+
* defines everything: command, flags, output parsing rules.
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { z } from 'zod'
|
|
13
|
+
|
|
14
|
+
// ============================================================================
|
|
15
|
+
// Output Event Mapping Schema
|
|
16
|
+
// ============================================================================
|
|
17
|
+
|
|
18
|
+
/**
 * Matching criteria that tie a CLI output event to a session update type.
 *
 * @remarks
 * The matcher is a (path, value) pair: `path` is a JSONPath-like pattern
 * pointing into the CLI's JSON output and `value` is the string to match
 * at that path.
 */
export const OutputEventMatchSchema = z.object({
  /** JSONPath-like locator of the event type field (e.g., "$.type") */
  path: z.string(),
  /** String to match at `path` (e.g., "tool_use") */
  value: z.string(),
})

/** Static type inferred from {@link OutputEventMatchSchema} */
export type OutputEventMatch = z.infer<typeof OutputEventMatchSchema>
|
|
34
|
+
|
|
35
|
+
/**
 * Describes which pieces of a matched event should be pulled out.
 *
 * @remarks
 * Every field is optional. A field value is either:
 * - a JSONPath expression (e.g., "$.message.text"), or
 * - a literal string wrapped in single quotes (e.g., "'pending'")
 */
export const OutputEventExtractSchema = z.object({
  /** Where the main content lives (JSONPath) */
  content: z.optional(z.string()),
  /** Where the title lives (JSONPath; used for tool calls) */
  title: z.optional(z.string()),
  /** Where the status lives (JSONPath, or a literal such as "'pending'") */
  status: z.optional(z.string()),
})

/** Static type inferred from {@link OutputEventExtractSchema} */
export type OutputEventExtract = z.infer<typeof OutputEventExtractSchema>
|
|
54
|
+
|
|
55
|
+
/**
 * One rule that turns a CLI output event into a session update.
 *
 * @remarks
 * A rule is made of three parts:
 * 1. `match` - the path/value pair that identifies the event
 * 2. `emitAs` - the session update type produced for a matching event
 * 3. `extract` - optional description of the content to pull from the event
 */
export const OutputEventMappingSchema = z.object({
  /** How to recognise the event in CLI output */
  match: OutputEventMatchSchema,
  /** Session update type emitted for a matching event */
  emitAs: z.enum(['thought', 'tool_call', 'message', 'plan']),
  /** Optional content extraction rules */
  extract: z.optional(OutputEventExtractSchema),
})

/** Static type inferred from {@link OutputEventMappingSchema} */
export type OutputEventMapping = z.infer<typeof OutputEventMappingSchema>
|
|
75
|
+
|
|
76
|
+
// ============================================================================
|
|
77
|
+
// Prompt Configuration Schema
|
|
78
|
+
// ============================================================================
|
|
79
|
+
|
|
80
|
+
/**
 * Describes how the prompt is handed to the CLI.
 *
 * @remarks
 * Supported modes:
 * 1. **Flag-based**: `flag: "-p"` - prompt is passed through a command-line flag
 * 2. **Positional**: `flag: ""` - prompt is passed as a positional argument
 * 3. **Stdin**: `stdin: true` - prompt is written to stdin (the command should
 *    include `-` or an equivalent)
 *
 * A non-empty `flag` together with `stdin: true` is rejected. Because the
 * check relies on truthiness, an empty-string `flag` combined with
 * `stdin: true` is accepted.
 */
export const PromptConfigSchema = z
  .object({
    /** Prompt flag (e.g., "-p", "--prompt"); use an empty string for positional */
    flag: z.optional(z.string()),
    /** Pass the prompt on stdin rather than in the command args */
    stdin: z.optional(z.boolean()),
    /** Format for stdin input in stream mode */
    stdinFormat: z.optional(z.enum(['text', 'json'])),
  })
  // Valid as long as at least one of the two modes is unset/falsy.
  .refine(({ flag, stdin }) => !flag || !stdin, {
    message: "Cannot specify both 'flag' and 'stdin' modes - use either flag-based or stdin mode, not both",
  })

/** Static type inferred from {@link PromptConfigSchema} */
export type PromptConfig = z.infer<typeof PromptConfigSchema>
|
|
104
|
+
|
|
105
|
+
// ============================================================================
|
|
106
|
+
// Output Configuration Schema
|
|
107
|
+
// ============================================================================
|
|
108
|
+
|
|
109
|
+
/**
 * Flag/value pair that selects the CLI's output format.
 */
export const OutputConfigSchema = z.object({
  /** Name of the output-format flag (e.g., "--output-format") */
  flag: z.string(),
  /** Value supplied for that flag (e.g., "stream-json") */
  value: z.string(),
})

/** Static type inferred from {@link OutputConfigSchema} */
export type OutputConfig = z.infer<typeof OutputConfigSchema>
|
|
121
|
+
|
|
122
|
+
// ============================================================================
|
|
123
|
+
// Resume Configuration Schema
|
|
124
|
+
// ============================================================================
|
|
125
|
+
|
|
126
|
+
/**
 * Settings for resuming an existing session (stream mode).
 */
export const ResumeConfigSchema = z.object({
  /** Flag used to resume a session (e.g., "--resume") */
  flag: z.string(),
  /** JSONPath locating the session ID in the CLI output */
  sessionIdPath: z.string(),
})

/** Static type inferred from {@link ResumeConfigSchema} */
export type ResumeConfig = z.infer<typeof ResumeConfigSchema>
|
|
138
|
+
|
|
139
|
+
// ============================================================================
|
|
140
|
+
// Result Configuration Schema
|
|
141
|
+
// ============================================================================
|
|
142
|
+
|
|
143
|
+
/**
 * Settings for recognising the final result event and reading its content.
 */
export const ResultConfigSchema = z.object({
  /** JSONPath of the field that identifies the result type (e.g., "$.type") */
  matchPath: z.string(),
  /** Value at `matchPath` that marks the final result (e.g., "result") */
  matchValue: z.string(),
  /** JSONPath of the result content */
  contentPath: z.string(),
})

/** Static type inferred from {@link ResultConfigSchema} */
export type ResultConfig = z.infer<typeof ResultConfigSchema>
|
|
157
|
+
|
|
158
|
+
// ============================================================================
|
|
159
|
+
// Passthrough Type Mapping Schema
|
|
160
|
+
// ============================================================================
|
|
161
|
+
|
|
162
|
+
/**
 * Type-name translation table for passthrough output mode.
 *
 * @remarks
 * Applies when `outputMode` is 'passthrough': the agent's own type names are
 * mapped straight onto the standard session update types, which suits agents
 * whose output is already well structured and needs no JSONPath parsing.
 */
export const PassthroughTypeMapSchema = z.object({
  /** Name of the JSON field holding the event type; defaults to "type" */
  typeField: z.string().default('type'),
  /** Optional lookup from agent type values to session update types */
  typeValues: z.optional(z.record(z.string(), z.enum(['thought', 'tool_call', 'message', 'plan']))),
})

/** Static type inferred from {@link PassthroughTypeMapSchema} */
export type PassthroughTypeMap = z.infer<typeof PassthroughTypeMapSchema>
|
|
179
|
+
|
|
180
|
+
// ============================================================================
|
|
181
|
+
// Main Adapter Schema
|
|
182
|
+
// ============================================================================
|
|
183
|
+
|
|
184
|
+
/**
 * Schema for headless adapter configuration.
 *
 * @remarks
 * This schema defines everything needed to interact with a headless CLI agent:
 * - Command and flags to spawn
 * - How to pass prompts
 * - How to parse output (jsonpath or passthrough mode)
 * - Session handling mode
 *
 * Supports two output parsing modes:
 * - 'jsonpath': Use outputEvents for complex JSONPath-based parsing (default)
 * - 'passthrough': Direct type mapping for well-structured output
 *
 * Required fields are `version`, `name`, `command`, `sessionMode`, `prompt`,
 * `output` and `result`; everything else is optional.
 *
 * Example (Claude):
 * ```json
 * {
 *   "version": 1,
 *   "name": "claude-headless",
 *   "command": ["claude"],
 *   "sessionMode": "stream",
 *   "timeout": 90000,
 *   "prompt": { "flag": "-p" },
 *   "output": { "flag": "--output-format", "value": "stream-json" },
 *   "outputEvents": [...],
 *   "result": { "matchPath": "$.type", "matchValue": "result", "contentPath": "$.result" }
 * }
 * ```
 */
export const HeadlessAdapterSchema = z.object({
  /** Schema version (only the literal `1` is accepted) */
  version: z.literal(1),

  /** Human-readable adapter name */
  name: z.string(),

  /**
   * Base command to spawn (e.g., ["claude"], ["gemini"]).
   * Must contain at least one element: an empty argv has no executable to run.
   */
  command: z.array(z.string()).min(1, "'command' must contain at least the executable to spawn"),

  /**
   * Session mode determines how multi-turn conversations work:
   * - 'stream': Keep process alive, multi-turn via stdin
   * - 'iterative': New process per turn, accumulate context in prompt
   */
  sessionMode: z.enum(['stream', 'iterative']),

  /** Default timeout for this agent in milliseconds (can be overridden per-prompt) */
  timeout: z.number().optional(),

  /** How to pass the prompt */
  prompt: PromptConfigSchema,

  /** Output format configuration */
  output: OutputConfigSchema,

  /** Flags for auto-approval in headless mode (e.g., ["--allowedTools", "*"]) */
  autoApprove: z.array(z.string()).optional(),

  /** Session resume support (stream mode only) */
  resume: ResumeConfigSchema.optional(),

  /** Working directory flag (if CLI needs explicit --cwd) */
  cwdFlag: z.string().optional(),

  /**
   * Output parsing mode:
   * - 'jsonpath': Use outputEvents for complex JSONPath-based parsing (default)
   * - 'passthrough': Direct type mapping for well-structured output
   */
  outputMode: z.enum(['jsonpath', 'passthrough']).default('jsonpath'),

  /** Output event mappings - how to parse CLI output into updates (jsonpath mode) */
  outputEvents: z.array(OutputEventMappingSchema).optional(),

  /** Type mapping for passthrough mode */
  passthroughTypeMap: PassthroughTypeMapSchema.optional(),

  /** Final result extraction configuration (required) */
  result: ResultConfigSchema,

  /**
   * Template for formatting conversation history (iterative mode only).
   *
   * @remarks
   * Supports both string format (simple) and object format (advanced):
   * - String: "User: {{input}}\nAssistant: {{output}}"
   * - Object: { system: "...", turnFormat: "..." }
   */
  historyTemplate: z
    .union([
      z.string(),
      z.object({
        /** System prefix for accumulated history */
        system: z.string().optional(),
        /** Format for each turn: {{input}} and {{output}} placeholders */
        turnFormat: z.string(),
      }),
    ])
    .optional(),
})

/** Headless adapter configuration type */
export type HeadlessAdapterConfig = z.infer<typeof HeadlessAdapterSchema>
|
|
286
|
+
|
|
287
|
+
// ============================================================================
|
|
288
|
+
// Validation Helpers
|
|
289
|
+
// ============================================================================
|
|
290
|
+
|
|
291
|
+
/**
 * Parses a raw value against {@link HeadlessAdapterSchema}, throwing on failure.
 *
 * @param config - Untyped configuration value (e.g., parsed from a JSON file)
 * @returns The configuration as produced by the schema's `parse`
 * @throws ZodError when the value does not satisfy the schema
 */
export const parseHeadlessConfig = (config: unknown): HeadlessAdapterConfig =>
  HeadlessAdapterSchema.parse(config)
|
|
301
|
+
|
|
302
|
+
/**
 * Non-throwing counterpart of `parseHeadlessConfig`.
 *
 * @param config - Untyped configuration value
 * @returns The `safeParse` result of {@link HeadlessAdapterSchema}: a
 * success/failure object carrying either the parsed data or the error
 */
export const safeParseHeadlessConfig = (config: unknown) =>
  HeadlessAdapterSchema.safeParse(config)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type exports for headless adapter.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Re-exports all types from the schemas module for external consumers.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// Type-only re-export of every configuration type declared in the schemas
// module. Keep this list in sync (alphabetical) when a new schema type is added.
export type {
  HeadlessAdapterConfig,
  OutputConfig,
  OutputEventExtract,
  OutputEventMapping,
  OutputEventMatch,
  PassthroughTypeMap,
  PromptConfig,
  ResultConfig,
  ResumeConfig,
} from './headless.schemas.ts'
|