@plaited/acp-harness 0.2.5 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +120 -16
- package/bin/cli.ts +105 -636
- package/bin/tests/cli.spec.ts +218 -51
- package/package.json +20 -4
- package/src/acp-client.ts +5 -4
- package/src/acp-transport.ts +14 -7
- package/src/adapter-check.ts +542 -0
- package/src/adapter-scaffold.ts +934 -0
- package/src/balance.ts +232 -0
- package/src/calibrate.ts +300 -0
- package/src/capture.ts +457 -0
- package/src/constants.ts +94 -0
- package/src/grader-loader.ts +174 -0
- package/src/harness.ts +35 -0
- package/src/schemas-cli.ts +239 -0
- package/src/schemas.ts +567 -0
- package/src/summarize.ts +245 -0
- package/src/tests/adapter-check.spec.ts +70 -0
- package/src/tests/adapter-scaffold.spec.ts +112 -0
- package/src/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/tests/fixtures/grader-exec.py +29 -0
- package/src/tests/fixtures/grader-module.ts +14 -0
- package/src/tests/grader-loader.spec.ts +153 -0
- package/src/trials.ts +395 -0
- package/src/validate-refs.ts +188 -0
- package/.claude/rules/accuracy.md +0 -43
- package/.claude/rules/bun-apis.md +0 -80
- package/.claude/rules/code-review.md +0 -254
- package/.claude/rules/git-workflow.md +0 -37
- package/.claude/rules/github.md +0 -154
- package/.claude/rules/testing.md +0 -172
- package/.claude/skills/acp-harness/SKILL.md +0 -310
- package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
- package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
- package/.claude/skills/acp-harness/references/downstream.md +0 -288
- package/.claude/skills/acp-harness/references/output-formats.md +0 -221
- package/.claude-plugin/marketplace.json +0 -15
- package/.claude-plugin/plugin.json +0 -16
- package/.github/CODEOWNERS +0 -6
- package/.github/workflows/ci.yml +0 -63
- package/.github/workflows/publish.yml +0 -146
- package/.mcp.json +0 -20
- package/CLAUDE.md +0 -92
- package/Dockerfile.test +0 -23
- package/biome.json +0 -96
- package/bun.lock +0 -513
- package/docker-compose.test.yml +0 -21
- package/scripts/bun-test-wrapper.sh +0 -46
- package/src/acp.constants.ts +0 -56
- package/src/acp.schemas.ts +0 -161
- package/src/acp.types.ts +0 -28
- package/src/tests/fixtures/.claude/settings.local.json +0 -8
- package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
- package/tsconfig.json +0 -32
package/src/schemas.ts
ADDED
|
@@ -0,0 +1,567 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unified Zod schemas and types for the ACP harness.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* This module follows a schema-first approach where Zod schemas are the
|
|
6
|
+
* single source of truth. TypeScript types are derived using `z.infer<>`.
|
|
7
|
+
*
|
|
8
|
+
* **Exports:**
|
|
9
|
+
* - Harness schemas: PromptCaseSchema, GraderResultSchema, CaptureResultSchema, etc.
|
|
10
|
+
* - JSON-RPC schemas: JsonRpcRequestSchema, JsonRpcResponseSchema, etc.
|
|
11
|
+
* - ACP SDK type schemas: SessionNotificationSchema, RequestPermissionRequestSchema
|
|
12
|
+
* - All inferred types via `z.infer<>`
|
|
13
|
+
*
|
|
14
|
+
* **JSON Schema generation (Zod 4):**
|
|
15
|
+
* ```typescript
|
|
16
|
+
* import { z } from 'zod'
|
|
17
|
+
* import { CaptureResultSchema } from '@plaited/acp-harness/schemas'
|
|
18
|
+
* const jsonSchema = z.toJSONSchema(CaptureResultSchema)
|
|
19
|
+
* ```
|
|
20
|
+
*
|
|
21
|
+
* @packageDocumentation
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import type { RequestPermissionRequest, SessionId, SessionNotification } from '@agentclientprotocol/sdk'
|
|
25
|
+
import { z } from 'zod'
|
|
26
|
+
|
|
27
|
+
// ============================================================================
|
|
28
|
+
// Internal Type Utilities
|
|
29
|
+
// ============================================================================
|
|
30
|
+
|
|
31
|
+
/** Precise type detection beyond typeof operator */
|
|
32
|
+
const trueTypeOf = (obj?: unknown): string => {
  // Object.prototype.toString yields "[object Tag]"; keep only "Tag", lowercased.
  const tagged: string = Object.prototype.toString.call(obj)
  return tagged.slice(8, -1).toLowerCase()
}
|
|
33
|
+
|
|
34
|
+
/** Type guard for precise type checking with TypeScript narrowing */
|
|
35
|
+
const isTypeOf = <T>(obj: unknown, type: string): obj is T => {
  // `type` is compared against the lowercased tag produced by trueTypeOf.
  const detected = trueTypeOf(obj)
  return detected === type
}
|
|
36
|
+
|
|
37
|
+
/** Type guard for object shape validation */
|
|
38
|
+
const isRecord = (val: unknown): val is Record<string, unknown> => {
  // Only values tagged 'object' qualify; arrays and null carry different tags.
  return isTypeOf<Record<string, unknown>>(val, 'object')
}
|
|
39
|
+
|
|
40
|
+
// ============================================================================
|
|
41
|
+
// Session Types
|
|
42
|
+
// ============================================================================
|
|
43
|
+
|
|
44
|
+
/**
 * Schema for the session object returned from session creation.
 *
 * @remarks
 * - `id` is validated as a plain string at runtime.
 * - `_meta` is an optional, nullable record of string keys to arbitrary values.
 */
export const SessionSchema = z.object({
  // The cast only narrows the static type to the SDK's `SessionId`; the runtime check stays `z.string()`.
  id: z.string() as z.ZodType<SessionId>,
  _meta: z.record(z.string(), z.unknown()).nullish(),
})

/** Static type inferred from `SessionSchema`. */
export type Session = z.infer<typeof SessionSchema>
|
|
54
|
+
|
|
55
|
+
// ============================================================================
|
|
56
|
+
// JSON-RPC 2.0 Schemas
|
|
57
|
+
// ============================================================================
|
|
58
|
+
|
|
59
|
+
/** Literal schema that accepts only the protocol version string `'2.0'`. */
const JsonRpcVersionSchema = z.literal('2.0')
|
|
61
|
+
|
|
62
|
+
/** Identifier shared by a request and its response: either a string or a number. */
const RequestIdSchema = z.union([
  z.string(),
  z.number(),
])
|
|
64
|
+
|
|
65
|
+
/**
 * Schema for the `error` member of a JSON-RPC 2.0 error response.
 *
 * @remarks
 * Any number is accepted for `code`. Commonly seen values:
 *
 * | Code     | Meaning                             |
 * | -------- | ----------------------------------- |
 * | `-32700` | Parse error                         |
 * | `-32600` | Invalid request                     |
 * | `-32601` | Method not found                    |
 * | `-32602` | Invalid params                      |
 * | `-32603` | Internal error                      |
 * | `-32800` | Request cancelled (ACP extension)   |
 */
export const JsonRpcErrorSchema = z.object({
  /** Numeric error code */
  code: z.number(),
  /** Human-readable description of the error */
  message: z.string(),
  /** Optional additional payload of any shape */
  data: z.unknown().optional(),
})

/** Static type inferred from `JsonRpcErrorSchema`. */
export type JsonRpcError = z.infer<typeof JsonRpcErrorSchema>
|
|
85
|
+
|
|
86
|
+
/**
 * Schema for a JSON-RPC 2.0 request: version tag, correlation `id`,
 * `method` name, and optional `params` of any shape.
 */
export const JsonRpcRequestSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: RequestIdSchema,
  method: z.string(),
  params: z.unknown().optional(),
})

/**
 * JSON-RPC 2.0 request structure.
 *
 * @typeParam TParams - Static type given to `params`. The schema validates
 * `params` as `unknown`, so this parameter is a compile-time refinement only.
 */
export type JsonRpcRequest<TParams = unknown> = Omit<z.infer<typeof JsonRpcRequestSchema>, 'params'> & {
  params?: TParams
}
|
|
98
|
+
|
|
99
|
+
/**
 * Schema for a JSON-RPC 2.0 notification.
 *
 * @remarks
 * Unlike a request, the shape declares no `id` field; a notification expects no response.
 */
export const JsonRpcNotificationSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  method: z.string(),
  params: z.unknown().optional(),
})

/**
 * JSON-RPC 2.0 notification structure (no id, no response expected).
 *
 * @typeParam TParams - Static type given to `params`; not validated by the schema.
 */
export type JsonRpcNotification<TParams = unknown> = Omit<z.infer<typeof JsonRpcNotificationSchema>, 'params'> & {
  params?: TParams
}
|
|
110
|
+
|
|
111
|
+
/**
 * Schema for a successful JSON-RPC 2.0 response: version tag, the `id` of the
 * request being answered, and a `result` of any shape.
 */
export const JsonRpcSuccessResponseSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: RequestIdSchema,
  result: z.unknown(),
})

/**
 * JSON-RPC 2.0 success response.
 *
 * @typeParam TResult - Static type given to `result`; the schema validates it as `unknown`.
 */
export type JsonRpcSuccessResponse<TResult = unknown> = Omit<
  z.infer<typeof JsonRpcSuccessResponseSchema>,
  'result'
> & {
  result: TResult
}
|
|
122
|
+
|
|
123
|
+
/**
 * Schema for a failed JSON-RPC 2.0 response.
 *
 * @remarks
 * `id` additionally accepts `null`, unlike the success response schema.
 */
export const JsonRpcErrorResponseSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: z.union([
    RequestIdSchema,
    z.null(),
  ]),
  error: JsonRpcErrorSchema,
})

/** Static type inferred from `JsonRpcErrorResponseSchema`. */
export type JsonRpcErrorResponse = z.infer<typeof JsonRpcErrorResponseSchema>
|
|
132
|
+
|
|
133
|
+
/**
 * Schema accepting either kind of JSON-RPC 2.0 response (error or success).
 *
 * @remarks
 * The error variant is deliberately tried first. The success variant declares
 * `result` as `z.unknown()`, which accepts `undefined`, and plain `z.object`
 * schemas drop unrecognized keys. With the success variant first, a payload
 * such as `{ jsonrpc: '2.0', id: 1, error: {...} }` could therefore validate
 * as a "success" and lose its `error` member.
 * NOTE(review): whether a missing `result` key is accepted depends on the
 * installed Zod version — confirm; this ordering is safe either way.
 *
 * The set of accepted payloads and the inferred type are unchanged; only the
 * order in which the union members are attempted differs.
 */
export const JsonRpcResponseSchema = z.union([JsonRpcErrorResponseSchema, JsonRpcSuccessResponseSchema])

/**
 * Union of all JSON-RPC response types.
 *
 * @typeParam TResult - Static type of `result` on the success variant.
 */
export type JsonRpcResponse<TResult = unknown> = JsonRpcSuccessResponse<TResult> | JsonRpcErrorResponse
|
|
138
|
+
|
|
139
|
+
/**
 * Schema accepting any JSON-RPC message: request, notification, or response.
 *
 * @remarks
 * Intended for runtime validation at transport boundaries; prefer `safeParse`
 * there so malformed input yields a result object rather than an exception.
 */
export const JsonRpcMessageSchema = z.union([
  JsonRpcRequestSchema,
  JsonRpcNotificationSchema,
  JsonRpcResponseSchema,
])

/**
 * Union of all JSON-RPC message types.
 *
 * @typeParam TPayload - Static type applied to `params` (requests and
 * notifications) and to `result` (success responses).
 */
export type JsonRpcMessage<TPayload = unknown> =
  | JsonRpcRequest<TPayload>
  | JsonRpcNotification<TPayload>
  | JsonRpcResponse<TPayload>
|
|
149
|
+
|
|
150
|
+
// ============================================================================
|
|
151
|
+
// ACP SDK Type Schemas (Custom Validators)
|
|
152
|
+
// ============================================================================
|
|
153
|
+
|
|
154
|
+
/**
 * Runtime validator for session update notifications.
 *
 * @remarks
 * Built with `z.custom()` so the SDK's `SessionNotification` remains the
 * static source of truth. Only two things are checked at runtime:
 * `sessionId` is a string and `update` is a plain object. The contents of
 * `update` are not inspected.
 */
export const SessionNotificationSchema = z.custom<SessionNotification>((val): val is SessionNotification => {
  if (!isRecord(val)) return false
  if (!('sessionId' in val) || typeof val.sessionId !== 'string') return false
  return 'update' in val && isRecord(val.update)
})
|
|
166
|
+
|
|
167
|
+
/**
 * Runtime validator for permission requests coming from the agent.
 *
 * @remarks
 * Checks only that the value is a plain object whose `options` member is an
 * array; the individual options are not validated.
 */
export const RequestPermissionRequestSchema = z.custom<RequestPermissionRequest>(
  (val): val is RequestPermissionRequest => {
    if (!isRecord(val)) return false
    return 'options' in val && Array.isArray(val.options)
  },
)
|
|
176
|
+
|
|
177
|
+
// ============================================================================
|
|
178
|
+
// MCP Server Configuration Schemas
|
|
179
|
+
// ============================================================================
|
|
180
|
+
|
|
181
|
+
/** A single environment variable expressed as a `name`/`value` pair of strings. */
export const EnvVariableSchema = z.object({
  /** Variable name */
  name: z.string(),
  /** Variable value */
  value: z.string(),
})
|
|
186
|
+
|
|
187
|
+
/** A single HTTP header expressed as a `name`/`value` pair of strings. */
export const HttpHeaderSchema = z.object({
  /** Header name */
  name: z.string(),
  /** Header value */
  value: z.string(),
})
|
|
192
|
+
|
|
193
|
+
/**
 * Configuration for an MCP server reached over the stdio transport.
 *
 * @remarks
 * `type` may be omitted; when present it must be `'stdio'`.
 */
export const McpServerStdioSchema = z.object({
  type: z.literal('stdio').optional(),
  /** Server name */
  name: z.string(),
  /** Command to run */
  command: z.string(),
  /** Arguments for `command` */
  args: z.array(z.string()),
  /** Environment variables as name/value pairs */
  env: z.array(EnvVariableSchema),
})
|
|
201
|
+
|
|
202
|
+
/**
 * Configuration for an MCP server reached over the HTTP transport.
 *
 * @remarks
 * `type` is required here and must be `'http'`.
 */
export const McpServerHttpSchema = z.object({
  type: z.literal('http'),
  /** Server name */
  name: z.string(),
  /** Server URL; validated only as a string */
  url: z.string(),
  /** HTTP headers as name/value pairs */
  headers: z.array(HttpHeaderSchema),
})
|
|
209
|
+
|
|
210
|
+
/** MCP server configuration: either the stdio variant or the HTTP variant. */
export const McpServerSchema = z.union([
  McpServerStdioSchema,
  McpServerHttpSchema,
])

/** Static type inferred from `McpServerSchema`. */
export type McpServerConfig = z.infer<typeof McpServerSchema>
|
|
215
|
+
|
|
216
|
+
// ============================================================================
|
|
217
|
+
// Harness Input Schemas
|
|
218
|
+
// ============================================================================
|
|
219
|
+
|
|
220
|
+
/**
 * Schema for one evaluation input (a "prompt case").
 *
 * @remarks
 * Every line of a `prompts.jsonl` file is expected to match this shape.
 * Only `id` and `input` are required.
 */
export const PromptCaseSchema = z.object({
  /** Unique identifier of the test case */
  id: z.string(),
  /** Prompt text sent to the agent */
  input: z.string(),
  /** Expected output, used for grading when present */
  expected: z.string().optional(),
  /** Reference solution, used for validation when present */
  reference: z.string().optional(),
  /** Free-form metadata for categorization and analysis */
  metadata: z.record(z.string(), z.unknown()).optional(),
  /** Per-case timeout override, in milliseconds */
  timeout: z.number().optional(),
})

/** Static type inferred from `PromptCaseSchema`. */
export type PromptCase = z.infer<typeof PromptCaseSchema>
|
|
243
|
+
|
|
244
|
+
// ============================================================================
|
|
245
|
+
// Grader Schemas
|
|
246
|
+
// ============================================================================
|
|
247
|
+
|
|
248
|
+
/**
 * Schema for the value a user-provided grader function returns.
 *
 * @remarks
 * `score` is range-checked: values below 0 or above 1 are rejected.
 */
export const GraderResultSchema = z.object({
  /** True when the output meets the evaluation criteria */
  pass: z.boolean(),
  /** Numeric score within the inclusive range 0.0 to 1.0 */
  score: z.number().min(0).max(1),
  /** Explanation for the score, when the grader supplies one */
  reasoning: z.string().optional(),
})

/** Static type inferred from `GraderResultSchema`. */
export type GraderResult = z.infer<typeof GraderResultSchema>
|
|
265
|
+
|
|
266
|
+
/**
 * Signature of a grader function.
 *
 * @remarks
 * Users implement this to score agent outputs. A grader receives a single
 * object argument and resolves to a `GraderResult`.
 *
 * @example
 * ```typescript
 * import type { Grader } from '@plaited/acp-harness/schemas'
 *
 * export const grade: Grader = async ({ input, output, expected, trajectory }) => {
 *   const pass = output.toLowerCase().includes(expected?.toLowerCase() ?? '')
 *   return {
 *     pass,
 *     score: pass ? 1 : 0,
 *     reasoning: pass ? 'Contains expected answer' : 'Missing expected answer'
 *   }
 * }
 * ```
 */
export type Grader = (params: {
  /** Prompt text */
  input: string
  /** Agent output to be scored */
  output: string
  /** Expected output, when available */
  expected?: string
  /** Trajectory steps, when available */
  trajectory?: TrajectoryStep[]
}) => Promise<GraderResult>
|
|
292
|
+
|
|
293
|
+
// ============================================================================
|
|
294
|
+
// Trajectory Schemas
|
|
295
|
+
// ============================================================================
|
|
296
|
+
|
|
297
|
+
/**
 * Schema for tool inputs, used to extract file paths and content.
 *
 * @remarks
 * All four named fields are optional strings. `.passthrough()` keeps any
 * other keys on the parsed value instead of stripping them.
 */
export const ToolInputSchema = z
  .object({
    file_path: z.string().optional(),
    path: z.string().optional(),
    content: z.string().optional(),
    new_string: z.string().optional(),
  })
  .passthrough()

/** Static type inferred from `ToolInputSchema`. */
export type ToolInput = z.infer<typeof ToolInputSchema>
|
|
309
|
+
|
|
310
|
+
/** Trajectory step tagged `type: 'thought'`. */
export const ThoughtStepSchema = z.object({
  /** Discriminator for this step kind */
  type: z.literal('thought'),
  /** Text of the thought */
  content: z.string(),
  /** Numeric timestamp of the step */
  timestamp: z.number(),
  /** Optional step identifier */
  stepId: z.string().optional(),
})
|
|
317
|
+
|
|
318
|
+
/** Trajectory step tagged `type: 'message'`. */
export const MessageStepSchema = z.object({
  /** Discriminator for this step kind */
  type: z.literal('message'),
  /** Text of the message */
  content: z.string(),
  /** Numeric timestamp of the step */
  timestamp: z.number(),
  /** Optional step identifier */
  stepId: z.string().optional(),
})
|
|
325
|
+
|
|
326
|
+
/**
 * Trajectory step tagged `type: 'tool_call'`.
 *
 * @remarks
 * `status` is an unconstrained string, and `input`/`output` accept any value.
 */
export const ToolCallStepSchema = z.object({
  /** Discriminator for this step kind */
  type: z.literal('tool_call'),
  /** Name of the tool that was called */
  name: z.string(),
  /** Status of the call */
  status: z.string(),
  /** Tool input, if recorded */
  input: z.unknown().optional(),
  /** Tool output, if recorded */
  output: z.unknown().optional(),
  /** Duration of the call, if recorded */
  duration: z.number().optional(),
  /** Numeric timestamp of the step */
  timestamp: z.number(),
  /** Optional step identifier */
  stepId: z.string().optional(),
})
|
|
337
|
+
|
|
338
|
+
/**
 * Trajectory step tagged `type: 'plan'`.
 *
 * @remarks
 * `entries` must be an array, but its elements are not validated.
 */
export const PlanStepSchema = z.object({
  /** Discriminator for this step kind */
  type: z.literal('plan'),
  /** Plan entries */
  entries: z.array(z.unknown()),
  /** Numeric timestamp of the step */
  timestamp: z.number(),
  /** Optional step identifier */
  stepId: z.string().optional(),
})
|
|
345
|
+
|
|
346
|
+
/**
 * Schema for a single step in the agent's execution trajectory.
 *
 * @remarks
 * A discriminated union keyed on `type`, covering the thought, message,
 * tool call, and plan step schemas.
 */
export const TrajectoryStepSchema = z.discriminatedUnion('type', [
  ThoughtStepSchema,
  MessageStepSchema,
  ToolCallStepSchema,
  PlanStepSchema,
])

/** Static type inferred from `TrajectoryStepSchema`. */
export type TrajectoryStep = z.infer<typeof TrajectoryStepSchema>
|
|
361
|
+
|
|
362
|
+
/**
 * A trajectory step whose `stepId` is guaranteed to be present, so steps can
 * be correlated by a unique ID.
 */
export type IndexedStep = TrajectoryStep & {
  stepId: string
}
|
|
364
|
+
|
|
365
|
+
// ============================================================================
|
|
366
|
+
// Capture Result Schemas
|
|
367
|
+
// ============================================================================
|
|
368
|
+
|
|
369
|
+
/**
 * Timing information attached to a capture result.
 *
 * @remarks
 * All fields are plain numbers; the schema does not enforce a unit or any
 * ordering between them.
 * NOTE(review): presumably timestamps — confirm the unit against the producer.
 */
export const TimingSchema = z.object({
  start: z.number(),
  end: z.number(),
  firstResponse: z.number().optional(),
})

/** Static type inferred from `TimingSchema`. */
export type Timing = z.infer<typeof TimingSchema>
|
|
378
|
+
|
|
379
|
+
/**
 * Schema for the full trajectory output of the `capture` command.
 *
 * @remarks
 * `toolErrors` replaces the misleading `status: 'passed'|'failed'` field.
 * It only reports whether any tool call failed; the real pass/fail verdict
 * comes from your grader (see `score`).
 */
export const CaptureResultSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** Prompt input as originally given */
  input: z.string(),
  /** Final output produced by the agent */
  output: z.string(),
  /** Expected output, when one was provided */
  expected: z.string().optional(),
  /** Complete execution trajectory */
  trajectory: z.array(TrajectoryStepSchema),
  /** Metadata such as category and agent info */
  metadata: z.record(z.string(), z.unknown()),
  /** Timing information */
  timing: TimingSchema,
  /** True when at least one tool call failed */
  toolErrors: z.boolean(),
  /** Error messages, if any */
  errors: z.array(z.string()).optional(),
  /** Grader result, present when a grader was provided */
  score: GraderResultSchema.optional(),
})

/** Static type inferred from `CaptureResultSchema`. */
export type CaptureResult = z.infer<typeof CaptureResultSchema>
|
|
412
|
+
|
|
413
|
+
// ============================================================================
|
|
414
|
+
// Summary Result Schemas
|
|
415
|
+
// ============================================================================
|
|
416
|
+
|
|
417
|
+
/**
 * Schema for the compact view of a capture result.
 *
 * @remarks
 * Produced from full capture results by the `summarize` command.
 */
export const SummaryResultSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** Prompt input as originally given */
  input: z.string(),
  /** Final output produced by the agent */
  output: z.string(),
  /** Names of the tools that were called */
  toolCalls: z.array(z.string()),
  /** Duration, in milliseconds */
  duration: z.number(),
})

/** Static type inferred from `SummaryResultSchema`. */
export type SummaryResult = z.infer<typeof SummaryResultSchema>
|
|
438
|
+
|
|
439
|
+
// ============================================================================
|
|
440
|
+
// Trial Result Schemas
|
|
441
|
+
// ============================================================================
|
|
442
|
+
|
|
443
|
+
/**
 * Schema for one trial inside a trial run.
 *
 * @remarks
 * `pass`, `score`, and `reasoning` are populated only when a grader was provided.
 */
export const TrialEntrySchema = z.object({
  /** Trial number, starting at 1 */
  trialNum: z.number(),
  /** Output the agent produced in this trial */
  output: z.string(),
  /** Complete trajectory of this trial */
  trajectory: z.array(TrajectoryStepSchema),
  /** Duration, in milliseconds */
  duration: z.number(),
  /** Pass/fail verdict (grader only) */
  pass: z.boolean().optional(),
  /** Numeric score (grader only) */
  score: z.number().optional(),
  /** Reasoning given by the grader (grader only) */
  reasoning: z.string().optional(),
})

/** Static type inferred from `TrialEntrySchema`. */
export type TrialEntry = z.infer<typeof TrialEntrySchema>
|
|
463
|
+
|
|
464
|
+
/**
 * Schema for the output of the `trials` command, used for pass@k / pass^k analysis.
 *
 * @remarks
 * The metric fields `passRate`, `passAtK`, and `passExpK` appear only when a
 * grader is provided.
 */
export const TrialResultSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** Prompt input as originally given */
  input: z.string(),
  /** Expected output, when one was provided */
  expected: z.string().optional(),
  /** Number of trials (k) */
  k: z.number(),
  /** Simple pass rate, passes / k (grader only) */
  passRate: z.number().optional(),
  /** pass@k: probability that at least one of k samples passes (grader only) */
  passAtK: z.number().optional(),
  /** pass^k: probability that all k samples pass (grader only) */
  passExpK: z.number().optional(),
  /** Results of the individual trials */
  trials: z.array(TrialEntrySchema),
})

/** Static type inferred from `TrialResultSchema`. */
export type TrialResult = z.infer<typeof TrialResultSchema>
|
|
492
|
+
|
|
493
|
+
// ============================================================================
|
|
494
|
+
// Calibration Schemas
|
|
495
|
+
// ============================================================================
|
|
496
|
+
|
|
497
|
+
/**
 * Schema for a calibration sample presented for grader review.
 *
 * @remarks
 * `originalScore` is always required; `rescoredResult` is present only when
 * a different grader was provided for re-scoring.
 */
export const CalibrationSampleSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** Prompt input as originally given */
  input: z.string(),
  /** Output produced by the agent */
  output: z.string(),
  /** Expected output, when one was provided */
  expected: z.string().optional(),
  /** Score assigned by the original grader */
  originalScore: GraderResultSchema,
  /** Result of re-scoring with a different grader, if any */
  rescoredResult: GraderResultSchema.optional(),
  /** Key snippets of the trajectory */
  trajectorySnippet: z.array(TrajectoryStepSchema),
})

/** Static type inferred from `CalibrationSampleSchema`. */
export type CalibrationSample = z.infer<typeof CalibrationSampleSchema>
|
|
517
|
+
|
|
518
|
+
// ============================================================================
|
|
519
|
+
// Balance Analysis Schemas
|
|
520
|
+
// ============================================================================
|
|
521
|
+
|
|
522
|
+
/** Schema describing how one category is represented in the test set. */
export const CategoryDistributionSchema = z.object({
  /** Name of the category */
  name: z.string(),
  /** How many test cases fall in the category */
  count: z.number(),
  /** Share of the total, as a percentage */
  percentage: z.number(),
})

/** Static type inferred from `CategoryDistributionSchema`. */
export type CategoryDistribution = z.infer<typeof CategoryDistributionSchema>
|
|
534
|
+
|
|
535
|
+
/** Schema for the result of a test-set balance analysis. */
export const BalanceAnalysisSchema = z.object({
  /** Total count of test cases */
  totalCases: z.number(),
  /** Per-category distribution */
  categories: z.array(CategoryDistributionSchema),
  /** Categories that may need more test cases */
  underrepresented: z.array(z.string()),
  /** Suggested improvements */
  suggestions: z.array(z.string()),
})

/** Static type inferred from `BalanceAnalysisSchema`. */
export type BalanceAnalysis = z.infer<typeof BalanceAnalysisSchema>
|
|
549
|
+
|
|
550
|
+
// ============================================================================
|
|
551
|
+
// Validation Reference Schemas
|
|
552
|
+
// ============================================================================
|
|
553
|
+
|
|
554
|
+
/** Schema for the outcome of validating a reference solution against a grader. */
export const ValidationResultSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** The reference solution that was provided */
  reference: z.string(),
  /** True when the reference passes the grader */
  passes: z.boolean(),
  /** Full result returned by the grader */
  graderResult: GraderResultSchema,
})

/** Static type inferred from `ValidationResultSchema`. */
export type ValidationResult = z.infer<typeof ValidationResultSchema>
|