@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
|
@@ -0,0 +1,558 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unified Zod schemas and types for the agent eval harness.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* This module follows a schema-first approach where Zod schemas are the
|
|
6
|
+
* single source of truth. TypeScript types are derived using `z.infer<>`.
|
|
7
|
+
*
|
|
8
|
+
* **Exports:**
|
|
9
|
+
* - Harness schemas: PromptCaseSchema, GraderResultSchema, CaptureResultSchema, etc.
|
|
10
|
+
* - JSON-RPC schemas: JsonRpcRequestSchema, JsonRpcResponseSchema, etc. (for headless adapter)
|
|
11
|
+
* - All inferred types via `z.infer<>`
|
|
12
|
+
*
|
|
13
|
+
* **JSON Schema generation (Zod 4):**
|
|
14
|
+
* ```typescript
|
|
15
|
+
* import { z } from 'zod'
|
|
16
|
+
* import { CaptureResultSchema } from '@plaited/agent-eval-harness/schemas'
|
|
17
|
+
* const jsonSchema = z.toJSONSchema(CaptureResultSchema)
|
|
18
|
+
* ```
|
|
19
|
+
*
|
|
20
|
+
* @packageDocumentation
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { z } from 'zod'
|
|
24
|
+
|
|
25
|
+
// ============================================================================
|
|
26
|
+
// Session Types
|
|
27
|
+
// ============================================================================
|
|
28
|
+
|
|
29
|
+
/**
 * Schema for the session object found in session-creation responses.
 *
 * @remarks
 * - `id`: session identifier string
 * - `_meta`: free-form string-keyed bag of unknown values; may be omitted or `null`
 */
export const SessionSchema = z.object({
  id: z.string(),
  _meta: z.record(z.string(), z.unknown()).nullish(),
})

/** Session value, inferred from `SessionSchema` */
export type Session = z.infer<typeof SessionSchema>
|
|
39
|
+
|
|
40
|
+
// ============================================================================
|
|
41
|
+
// JSON-RPC 2.0 Schemas (for headless adapter)
|
|
42
|
+
// ============================================================================
|
|
43
|
+
|
|
44
|
+
/** The only accepted value of the `jsonrpc` member: the literal string `'2.0'` */
const JsonRpcVersionSchema = z.literal('2.0')

/** Identifier shared by a request and its response: either a string or a number */
const RequestIdSchema = z.string().or(z.number())
|
|
49
|
+
|
|
50
|
+
/**
 * Schema for the `error` member of a JSON-RPC 2.0 error response.
 *
 * @remarks
 * `code` accepts any number. The standard codes are:
 * - `-32700`: Parse error
 * - `-32600`: Invalid request
 * - `-32601`: Method not found
 * - `-32602`: Invalid params
 * - `-32603`: Internal error
 *
 * `data` is optional and unconstrained.
 */
export const JsonRpcErrorSchema = z.object({
  code: z.number(),
  message: z.string(),
  data: z.unknown().optional(),
})

/** JSON-RPC 2.0 error object, inferred from `JsonRpcErrorSchema` */
export type JsonRpcError = z.infer<typeof JsonRpcErrorSchema>
|
|
69
|
+
|
|
70
|
+
/**
 * Schema for a JSON-RPC 2.0 request: version tag, request id, method name,
 * and optional unconstrained `params`.
 */
export const JsonRpcRequestSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: RequestIdSchema,
  method: z.string(),
  params: z.unknown().optional(),
})

/**
 * JSON-RPC 2.0 request structure.
 *
 * @remarks
 * Same shape as the schema's inferred type, except that `params` is retyped
 * to the caller-supplied `T` (defaults to `unknown`).
 */
export type JsonRpcRequest<T = unknown> = Omit<z.infer<typeof JsonRpcRequestSchema>, 'params'> & {
  params?: T
}
|
|
82
|
+
|
|
83
|
+
/**
 * Schema for a JSON-RPC 2.0 notification: like a request but without an `id`,
 * so no response is expected.
 */
export const JsonRpcNotificationSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  method: z.string(),
  params: z.unknown().optional(),
})

/**
 * JSON-RPC 2.0 notification structure (no id, no response expected).
 *
 * @remarks
 * `params` is retyped to the caller-supplied `T` (defaults to `unknown`).
 */
export type JsonRpcNotification<T = unknown> = Omit<z.infer<typeof JsonRpcNotificationSchema>, 'params'> & {
  params?: T
}
|
|
94
|
+
|
|
95
|
+
/**
 * Schema for a JSON-RPC 2.0 success response: version tag, the id of the
 * request being answered, and an unconstrained `result`.
 */
export const JsonRpcSuccessResponseSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: RequestIdSchema,
  result: z.unknown(),
})

/**
 * JSON-RPC 2.0 success response.
 *
 * @remarks
 * `result` is retyped to the caller-supplied `T` (defaults to `unknown`).
 */
export type JsonRpcSuccessResponse<T = unknown> = Omit<z.infer<typeof JsonRpcSuccessResponseSchema>, 'result'> & {
  result: T
}
|
|
106
|
+
|
|
107
|
+
/**
 * Schema for a JSON-RPC 2.0 error response.
 *
 * @remarks
 * Unlike the success response, `id` may also be `null`.
 */
export const JsonRpcErrorResponseSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: RequestIdSchema.nullable(),
  error: JsonRpcErrorSchema,
})

/** JSON-RPC 2.0 error response, inferred from `JsonRpcErrorResponseSchema` */
export type JsonRpcErrorResponse = z.infer<typeof JsonRpcErrorResponseSchema>
|
|
116
|
+
|
|
117
|
+
/**
 * Union of all JSON-RPC response schemas (error or success).
 *
 * @remarks
 * The error schema is deliberately listed first. A Zod union yields the first
 * member that parses, `z.object` drops keys it does not declare, and the
 * success schema's `result: z.unknown()` also accepts a missing key. With the
 * success schema first, an error response that has a string/number `id`
 * would validate as a success response and silently lose its `error`
 * payload. The error schema requires an `error` object, so genuine success
 * responses still fall through to `JsonRpcSuccessResponseSchema`.
 */
export const JsonRpcResponseSchema = z.union([JsonRpcErrorResponseSchema, JsonRpcSuccessResponseSchema])

/** Union of all JSON-RPC response types; `T` types the success `result` */
export type JsonRpcResponse<T = unknown> = JsonRpcSuccessResponse<T> | JsonRpcErrorResponse
|
|
122
|
+
|
|
123
|
+
/**
 * Schema accepting any JSON-RPC message: request, notification, or response.
 *
 * @remarks
 * Use `safeParse` at transport boundaries for runtime validation.
 */
export const JsonRpcMessageSchema = z.union([
  JsonRpcRequestSchema,
  JsonRpcNotificationSchema,
  JsonRpcResponseSchema,
])

/** Union of all JSON-RPC message types; `T` types `params` and the success `result` */
export type JsonRpcMessage<T = unknown> = JsonRpcRequest<T> | JsonRpcNotification<T> | JsonRpcResponse<T>
|
|
133
|
+
|
|
134
|
+
// ============================================================================
|
|
135
|
+
// MCP Server Configuration Schemas
|
|
136
|
+
// ============================================================================
|
|
137
|
+
|
|
138
|
+
/** A single environment variable expressed as a `name`/`value` string pair */
export const EnvVariableSchema = z.object({
  name: z.string(),
  value: z.string(),
})
|
|
143
|
+
|
|
144
|
+
/** A single HTTP header expressed as a `name`/`value` string pair */
export const HttpHeaderSchema = z.object({
  name: z.string(),
  value: z.string(),
})
|
|
149
|
+
|
|
150
|
+
/**
 * MCP server configuration for the stdio transport.
 *
 * @remarks
 * `type` may be omitted; when present it must be the literal `'stdio'`.
 * `args` and `env` are both required arrays.
 */
export const McpServerStdioSchema = z.object({
  type: z.literal('stdio').optional(),
  name: z.string(),
  command: z.string(),
  args: z.string().array(),
  env: EnvVariableSchema.array(),
})
|
|
158
|
+
|
|
159
|
+
/**
 * MCP server configuration for the HTTP transport.
 *
 * @remarks
 * `type` is required and must be the literal `'http'`. `headers` is a
 * required array of name/value pairs.
 */
export const McpServerHttpSchema = z.object({
  type: z.literal('http'),
  name: z.string(),
  url: z.string(),
  headers: HttpHeaderSchema.array(),
})
|
|
166
|
+
|
|
167
|
+
/** MCP server configuration: either the stdio or the HTTP transport shape */
export const McpServerSchema = McpServerStdioSchema.or(McpServerHttpSchema)

/** MCP server configuration type, inferred from `McpServerSchema` */
export type McpServerConfig = z.infer<typeof McpServerSchema>
|
|
172
|
+
|
|
173
|
+
// ============================================================================
|
|
174
|
+
// Harness Input Schemas
|
|
175
|
+
// ============================================================================
|
|
176
|
+
|
|
177
|
+
/**
 * Schema for one evaluation input (a "prompt case").
 *
 * @remarks
 * Every line of a prompts.jsonl file is expected to match this schema.
 * - Single turn: `input: "Hello"` - one prompt, one session
 * - Multi-turn: `input: ["Hello", "How are you?", "Goodbye"]` - sequential turns in one session
 */
export const PromptCaseSchema = z.object({
  /** Unique identifier for the test case */
  id: z.string(),
  /** Prompt text: a string for a single turn, an array of strings for a multi-turn conversation */
  input: z.string().or(z.string().array()),
  /** Optional context hint for the grader (not a strict expected match) */
  hint: z.string().optional(),
  /** Optional reference solution used for validation */
  reference: z.string().optional(),
  /** Optional free-form metadata for categorization and analysis */
  metadata: z.record(z.string(), z.unknown()).optional(),
  /** Optional per-case timeout override, in milliseconds */
  timeout: z.number().optional(),
})

/** Prompt case type, inferred from `PromptCaseSchema` */
export type PromptCase = z.infer<typeof PromptCaseSchema>
|
|
202
|
+
|
|
203
|
+
// ============================================================================
|
|
204
|
+
// Grader Schemas
|
|
205
|
+
// ============================================================================
|
|
206
|
+
|
|
207
|
+
/**
 * Schema for the verdict a grader produces.
 *
 * @remarks
 * This is the result returned by user-provided grader functions.
 */
export const GraderResultSchema = z.object({
  /** Whether the output passes the evaluation criteria */
  pass: z.boolean(),
  /** Numeric score, constrained to the inclusive range 0.0 to 1.0 */
  score: z.number().min(0).max(1),
  /** Optional explanation for the score */
  reasoning: z.string().optional(),
})

/** Grader result type, inferred from `GraderResultSchema` */
export type GraderResult = z.infer<typeof GraderResultSchema>
|
|
224
|
+
|
|
225
|
+
/**
 * Signature of a grader function.
 *
 * @remarks
 * User-provided graders implement this signature to score agent outputs.
 * They receive a single params object and resolve to a `GraderResult`:
 * - `input`: the original prompt (string, or array for multi-turn)
 * - `output`: the agent output to score
 * - `hint`: optional grader context (renamed from `expected`)
 * - `trajectory`: optional list of trajectory steps
 */
export type Grader = (params: {
  input: string | string[]
  output: string
  hint?: string
  trajectory?: TrajectoryStep[]
}) => Promise<GraderResult>
|
|
239
|
+
|
|
240
|
+
// ============================================================================
|
|
241
|
+
// Trajectory Schemas
|
|
242
|
+
// ============================================================================
|
|
243
|
+
|
|
244
|
+
/**
 * Tool input schema used for extracting file paths and content.
 *
 * @remarks
 * All four named keys are optional strings. `.passthrough()` keeps any other
 * keys on the parsed value instead of stripping them.
 *
 * NOTE(review): Zod 4 marks `.passthrough()` as deprecated in favour of
 * `.loose()` / `z.looseObject()` - consider migrating once the supported Zod
 * range is confirmed.
 */
export const ToolInputSchema = z
  .object({
    file_path: z.string().optional(),
    path: z.string().optional(),
    content: z.string().optional(),
    new_string: z.string().optional(),
  })
  .passthrough()

/** Tool input type, inferred from `ToolInputSchema` */
export type ToolInput = z.infer<typeof ToolInputSchema>
|
|
256
|
+
|
|
257
|
+
/**
 * Trajectory step tagged `type: 'thought'`.
 *
 * @remarks
 * Carries string `content`, a numeric `timestamp`, and an optional `stepId`.
 */
export const ThoughtStepSchema = z.object({
  type: z.literal('thought'),
  content: z.string(),
  timestamp: z.number(),
  stepId: z.string().optional(),
})
|
|
264
|
+
|
|
265
|
+
/**
 * Trajectory step tagged `type: 'message'`.
 *
 * @remarks
 * Carries string `content`, a numeric `timestamp`, and an optional `stepId`.
 */
export const MessageStepSchema = z.object({
  type: z.literal('message'),
  content: z.string(),
  timestamp: z.number(),
  stepId: z.string().optional(),
})
|
|
272
|
+
|
|
273
|
+
/**
 * Trajectory step tagged `type: 'tool_call'`.
 *
 * @remarks
 * - `name` and `status` are plain strings (`status` is not restricted to an enum here)
 * - `input` and `output` are optional and unconstrained
 * - `duration` is an optional number (presumably milliseconds - TODO confirm against the producer)
 * - `timestamp` is required; `stepId` is optional
 */
export const ToolCallStepSchema = z.object({
  type: z.literal('tool_call'),
  name: z.string(),
  status: z.string(),
  input: z.unknown().optional(),
  output: z.unknown().optional(),
  duration: z.number().optional(),
  timestamp: z.number(),
  stepId: z.string().optional(),
})
|
|
284
|
+
|
|
285
|
+
/**
 * Trajectory step tagged `type: 'plan'`.
 *
 * @remarks
 * `entries` is an array whose element shape is not validated. `timestamp` is
 * required; `stepId` is optional.
 */
export const PlanStepSchema = z.object({
  type: z.literal('plan'),
  entries: z.unknown().array(),
  timestamp: z.number(),
  stepId: z.string().optional(),
})
|
|
292
|
+
|
|
293
|
+
/**
 * Trajectory step schema: a union discriminated on the `type` field.
 *
 * @remarks
 * Represents a single step in the agent's execution trajectory. The members
 * are the thought, message, tool call, and plan step schemas.
 */
export const TrajectoryStepSchema = z.discriminatedUnion('type', [
  ThoughtStepSchema,
  MessageStepSchema,
  ToolCallStepSchema,
  PlanStepSchema,
])

/** Trajectory step type, inferred from `TrajectoryStepSchema` */
export type TrajectoryStep = z.infer<typeof TrajectoryStepSchema>

/** Trajectory step whose `stepId` is required, giving it a unique ID for correlation */
export type IndexedStep = TrajectoryStep & { stepId: string }
|
|
311
|
+
|
|
312
|
+
// ============================================================================
|
|
313
|
+
// Capture Result Schemas
|
|
314
|
+
// ============================================================================
|
|
315
|
+
|
|
316
|
+
/**
 * Timing information attached to a capture result.
 *
 * @remarks
 * Holds absolute timestamps alongside derived durations for analysis:
 * - `sessionCreation`: time to initialize the session (agent startup overhead)
 * - `total`: end-to-end duration including all turns
 * - `firstResponse`: latency to first agent output (optional)
 *
 * Token counts are adapter-dependent and only present if the adapter
 * exposes usage information (e.g., Claude Code includes them, others may not).
 *
 * @public
 */
export const TimingSchema = z.object({
  /** Epoch timestamp when capture started */
  start: z.number(),
  /** Epoch timestamp when capture ended */
  end: z.number(),
  /** Time to first response (ms from start) */
  firstResponse: z.number().optional(),
  /** Time to create the session (ms) - measures agent initialization overhead */
  sessionCreation: z.number(),
  /** Total duration (end - start) in milliseconds */
  total: z.number(),
  /** Input tokens consumed (if available from the headless adapter) */
  inputTokens: z.number().optional(),
  /** Output tokens generated (if available from the headless adapter) */
  outputTokens: z.number().optional(),
})

/**
 * Timing information type, inferred from `TimingSchema`.
 *
 * @public
 */
export type Timing = z.infer<typeof TimingSchema>
|
|
353
|
+
|
|
354
|
+
/**
 * Trajectory richness level: how much agent activity a capture contains.
 *
 * @remarks
 * Adapters differ in the level of detail they provide:
 * - `full`: thoughts, tool calls, plans (e.g., Claude Code adapter)
 * - `minimal`: basic output only (e.g., Droid adapter)
 * - `messages-only`: messages without internal reasoning
 */
export const TrajectoryRichnessSchema = z.enum(['full', 'minimal', 'messages-only'])

/** Trajectory richness type, inferred from `TrajectoryRichnessSchema` */
export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
|
|
367
|
+
|
|
368
|
+
/**
 * Schema for one capture result.
 *
 * @remarks
 * This is the full trajectory output of the `capture` command.
 * - `input` can be string (single turn) or string[] (multi-turn)
 * - `hint` provides grader context (renamed from `expected`)
 * - `toolErrors` replaces the misleading `status: 'passed'|'failed'`
 *
 * Real pass/fail determination comes from your grader.
 */
export const CaptureResultSchema = z.object({
  /** Test case identifier */
  id: z.string(),
  /** Original prompt input (string for single turn, array for multi-turn) */
  input: z.string().or(z.string().array()),
  /** Final agent output */
  output: z.string(),
  /** Grader context hint (renamed from expected) */
  hint: z.string().optional(),
  /** Full execution trajectory */
  trajectory: TrajectoryStepSchema.array(),
  /** Metadata including category, agent info, trajectoryRichness, turnCount */
  metadata: z.record(z.string(), z.unknown()),
  /** Timing information */
  timing: TimingSchema,
  /** Whether any tool calls failed */
  toolErrors: z.boolean(),
  /** Error messages (if any) */
  errors: z.string().array().optional(),
  /** Grader score (if a grader was provided) */
  score: GraderResultSchema.optional(),
})

/** Capture result type, inferred from `CaptureResultSchema` */
export type CaptureResult = z.infer<typeof CaptureResultSchema>
|
|
403
|
+
|
|
404
|
+
// ============================================================================
|
|
405
|
+
// Summary Result Schemas
|
|
406
|
+
// ============================================================================
|
|
407
|
+
|
|
408
|
+
/**
 * Schema for one summary result.
 *
 * @remarks
 * Compact view derived from full capture results via the `summarize` command.
 *
 * NOTE(review): `input` is string-only here, whereas `CaptureResultSchema`
 * accepts a string or a string array - confirm how multi-turn inputs are
 * represented in summaries.
 */
export const SummaryResultSchema = z.object({
  /** Test case identifier */
  id: z.string(),
  /** Original prompt input */
  input: z.string(),
  /** Final agent output */
  output: z.string(),
  /** Names of the tools that were called */
  toolCalls: z.string().array(),
  /** Duration in milliseconds */
  duration: z.number(),
})

/** Summary result type, inferred from `SummaryResultSchema` */
export type SummaryResult = z.infer<typeof SummaryResultSchema>
|
|
429
|
+
|
|
430
|
+
// ============================================================================
|
|
431
|
+
// Trial Result Schemas
|
|
432
|
+
// ============================================================================
|
|
433
|
+
|
|
434
|
+
/**
 * Schema for a single trial inside a trial run.
 *
 * @remarks
 * `pass`, `score`, and `reasoning` are only populated when a grader is provided.
 */
export const TrialEntrySchema = z.object({
  /** Trial number (1-indexed) */
  trialNum: z.number(),
  /** Agent output for this trial */
  output: z.string(),
  /** Full trajectory for this trial */
  trajectory: TrajectoryStepSchema.array(),
  /** Duration in milliseconds */
  duration: z.number(),
  /** Pass/fail (if grader provided) */
  pass: z.boolean().optional(),
  /** Numeric score (if grader provided) */
  score: z.number().optional(),
  /** Grader reasoning (if grader provided) */
  reasoning: z.string().optional(),
})

/** Trial entry type, inferred from `TrialEntrySchema` */
export type TrialEntry = z.infer<typeof TrialEntrySchema>
|
|
454
|
+
|
|
455
|
+
/**
 * Schema for the result of a trial run.
 *
 * @remarks
 * Output from the `trials` command for pass@k/pass^k analysis.
 * Metrics (passRate, passAtK, passExpK) are only present when a grader is provided.
 */
export const TrialResultSchema = z.object({
  /** Test case identifier */
  id: z.string(),
  /** Original prompt input (string for single turn, array for multi-turn) */
  input: z.string().or(z.string().array()),
  /** Grader context hint (renamed from expected) */
  hint: z.string().optional(),
  /** Number of trials (k) */
  k: z.number(),
  /** Simple pass rate: passes / k (with grader only) */
  passRate: z.number().optional(),
  /** pass@k: probability of at least one pass in k samples (with grader only) */
  passAtK: z.number().optional(),
  /** pass^k: probability of all k samples passing (with grader only) */
  passExpK: z.number().optional(),
  /** Individual trial results */
  trials: TrialEntrySchema.array(),
})

/** Trial result type, inferred from `TrialResultSchema` */
export type TrialResult = z.infer<typeof TrialResultSchema>
|
|
483
|
+
|
|
484
|
+
// ============================================================================
|
|
485
|
+
// Calibration Schemas
|
|
486
|
+
// ============================================================================
|
|
487
|
+
|
|
488
|
+
/** Schema for one calibration sample presented for grader review */
export const CalibrationSampleSchema = z.object({
  /** Test case identifier */
  id: z.string(),
  /** Original prompt input (string for single turn, array for multi-turn) */
  input: z.string().or(z.string().array()),
  /** Agent output */
  output: z.string(),
  /** Grader context hint (renamed from expected) */
  hint: z.string().optional(),
  /** Original grader score */
  originalScore: GraderResultSchema,
  /** Re-scored result (if a different grader was provided) */
  rescoredResult: GraderResultSchema.optional(),
  /** Key trajectory snippets */
  trajectorySnippet: TrajectoryStepSchema.array(),
})

/** Calibration sample type, inferred from `CalibrationSampleSchema` */
export type CalibrationSample = z.infer<typeof CalibrationSampleSchema>
|
|
508
|
+
|
|
509
|
+
// ============================================================================
|
|
510
|
+
// Balance Analysis Schemas
|
|
511
|
+
// ============================================================================
|
|
512
|
+
|
|
513
|
+
/** Schema describing how one category is represented in the test set */
export const CategoryDistributionSchema = z.object({
  /** Category name */
  name: z.string(),
  /** Number of test cases */
  count: z.number(),
  /** Percentage of total */
  percentage: z.number(),
})

/** Category distribution type, inferred from `CategoryDistributionSchema` */
export type CategoryDistribution = z.infer<typeof CategoryDistributionSchema>
|
|
525
|
+
|
|
526
|
+
/** Schema for the result of a balance analysis over the test set */
export const BalanceAnalysisSchema = z.object({
  /** Total number of test cases */
  totalCases: z.number(),
  /** Distribution by category */
  categories: CategoryDistributionSchema.array(),
  /** Categories that may need more test cases */
  underrepresented: z.string().array(),
  /** Suggested improvements */
  suggestions: z.string().array(),
})

/** Balance analysis type, inferred from `BalanceAnalysisSchema` */
export type BalanceAnalysis = z.infer<typeof BalanceAnalysisSchema>
|
|
540
|
+
|
|
541
|
+
// ============================================================================
|
|
542
|
+
// Validation Reference Schemas
|
|
543
|
+
// ============================================================================
|
|
544
|
+
|
|
545
|
+
/** Schema for the outcome of validating one reference solution */
export const ValidationResultSchema = z.object({
  /** Test case identifier */
  id: z.string(),
  /** Reference solution provided */
  reference: z.string(),
  /** Whether the reference passes the grader */
  passes: z.boolean(),
  /** Grader result */
  graderResult: GraderResultSchema,
})

/** Validation result type, inferred from `ValidationResultSchema` */
export type ValidationResult = z.infer<typeof ValidationResultSchema>
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { describe, expect, test } from 'bun:test'
|
|
2
|
+
import {
|
|
3
|
+
DEFAULT_CALIBRATION_SAMPLE_SIZE,
|
|
4
|
+
DEFAULT_CLIENT_NAME,
|
|
5
|
+
DEFAULT_HARNESS_TIMEOUT,
|
|
6
|
+
DEFAULT_POLLING_INTERVAL,
|
|
7
|
+
DEFAULT_PROTOCOL_TIMEOUT,
|
|
8
|
+
DEFAULT_TRIAL_COUNT,
|
|
9
|
+
HEAD_LINES,
|
|
10
|
+
JSON_RPC_ERRORS,
|
|
11
|
+
MAX_CONTENT_LENGTH,
|
|
12
|
+
PROTOCOL_METHODS,
|
|
13
|
+
PROTOCOL_VERSION,
|
|
14
|
+
TAIL_LINES,
|
|
15
|
+
} from '../constants.ts'
|
|
16
|
+
|
|
17
|
+
// ============================================================================
|
|
18
|
+
// JSON-RPC Protocol Constants
|
|
19
|
+
// ============================================================================
|
|
20
|
+
|
|
21
|
+
// Pins the wire-level method names exposed through PROTOCOL_METHODS.
describe('PROTOCOL_METHODS', () => {
  test('contains all required lifecycle methods', () => {
    expect(PROTOCOL_METHODS).toMatchObject({
      INITIALIZE: 'initialize',
      SHUTDOWN: 'shutdown',
    })
  })

  test('contains all required session methods', () => {
    expect(PROTOCOL_METHODS).toMatchObject({
      CREATE_SESSION: 'session/new',
      LOAD_SESSION: 'session/load',
      PROMPT: 'session/prompt',
      CANCEL: 'session/cancel',
      UPDATE: 'session/update',
      REQUEST_PERMISSION: 'session/request_permission',
      SET_MODEL: 'session/set_model',
    })
  })

  test('contains protocol-level methods', () => {
    expect(PROTOCOL_METHODS).toMatchObject({ CANCEL_REQUEST: '$/cancel_request' })
  })
})
|
|
41
|
+
|
|
42
|
+
// Pins the protocol version constant.
describe('PROTOCOL_VERSION', () => {
  test('is version 1', () => {
    const expectedVersion = 1
    expect(PROTOCOL_VERSION).toBe(expectedVersion)
  })
})
|
|
47
|
+
|
|
48
|
+
// ============================================================================
|
|
49
|
+
// JSON-RPC Error Codes
|
|
50
|
+
// ============================================================================
|
|
51
|
+
|
|
52
|
+
// Pins the numeric error codes exposed through JSON_RPC_ERRORS.
describe('JSON_RPC_ERRORS', () => {
  test('contains standard JSON-RPC error codes', () => {
    expect(JSON_RPC_ERRORS).toMatchObject({
      PARSE_ERROR: -32700,
      INVALID_REQUEST: -32600,
      METHOD_NOT_FOUND: -32601,
      INVALID_PARAMS: -32602,
      INTERNAL_ERROR: -32603,
    })
  })

  test('contains extension error codes', () => {
    expect(JSON_RPC_ERRORS).toMatchObject({ REQUEST_CANCELLED: -32800 })
  })
})
|
|
65
|
+
|
|
66
|
+
// ============================================================================
|
|
67
|
+
// Client Defaults
|
|
68
|
+
// ============================================================================
|
|
69
|
+
|
|
70
|
+
// Pins the default client name, protocol timeout, and polling interval.
describe('Client defaults', () => {
  test('DEFAULT_CLIENT_NAME is set', () => {
    const expectedName = 'plaited-eval-harness'
    expect(DEFAULT_CLIENT_NAME).toBe(expectedName)
  })

  test('DEFAULT_PROTOCOL_TIMEOUT is 30 seconds', () => {
    const thirtySecondsMs = 30000
    expect(DEFAULT_PROTOCOL_TIMEOUT).toBe(thirtySecondsMs)
  })

  test('DEFAULT_POLLING_INTERVAL is 50ms', () => {
    const fiftyMs = 50
    expect(DEFAULT_POLLING_INTERVAL).toBe(fiftyMs)
  })
})
|
|
83
|
+
|
|
84
|
+
// ============================================================================
|
|
85
|
+
// Harness Preview Configuration
|
|
86
|
+
// ============================================================================
|
|
87
|
+
|
|
88
|
+
// Each case checks the constant is positive and then pins its exact value.
describe('Preview configuration', () => {
  test('HEAD_LINES is positive', () => {
    const expectedHeadLines = 8
    expect(HEAD_LINES).toBeGreaterThan(0)
    expect(HEAD_LINES).toBe(expectedHeadLines)
  })

  test('TAIL_LINES is positive', () => {
    const expectedTailLines = 4
    expect(TAIL_LINES).toBeGreaterThan(0)
    expect(TAIL_LINES).toBe(expectedTailLines)
  })

  test('MAX_CONTENT_LENGTH is reasonable', () => {
    const expectedMaxLength = 500
    expect(MAX_CONTENT_LENGTH).toBeGreaterThan(0)
    expect(MAX_CONTENT_LENGTH).toBe(expectedMaxLength)
  })
})
|
|
104
|
+
|
|
105
|
+
// ============================================================================
|
|
106
|
+
// Harness Defaults
|
|
107
|
+
// ============================================================================
|
|
108
|
+
|
|
109
|
+
// Pins the default harness timeout, trial count, and calibration sample size.
describe('Harness defaults', () => {
  test('DEFAULT_HARNESS_TIMEOUT is 60 seconds', () => {
    const sixtySecondsMs = 60000
    expect(DEFAULT_HARNESS_TIMEOUT).toBe(sixtySecondsMs)
  })

  test('DEFAULT_TRIAL_COUNT is 5', () => {
    const expectedTrials = 5
    expect(DEFAULT_TRIAL_COUNT).toBe(expectedTrials)
  })

  test('DEFAULT_CALIBRATION_SAMPLE_SIZE is 10', () => {
    const expectedSampleSize = 10
    expect(DEFAULT_CALIBRATION_SAMPLE_SIZE).toBe(expectedSampleSize)
  })
})
|