@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,558 @@
1
+ /**
2
+ * Unified Zod schemas and types for the agent eval harness.
3
+ *
4
+ * @remarks
5
+ * This module follows a schema-first approach where Zod schemas are the
6
+ * single source of truth. TypeScript types are derived using `z.infer<>`.
7
+ *
8
+ * **Exports:**
9
+ * - Harness schemas: PromptCaseSchema, GraderResultSchema, CaptureResultSchema, etc.
10
+ * - JSON-RPC schemas: JsonRpcRequestSchema, JsonRpcResponseSchema, etc. (for headless adapter)
11
+ * - All inferred types via `z.infer<>`
12
+ *
13
+ * **JSON Schema generation (Zod 4):**
14
+ * ```typescript
15
+ * import { z } from 'zod'
16
+ * import { CaptureResultSchema } from '@plaited/agent-eval-harness/schemas'
17
+ * const jsonSchema = z.toJSONSchema(CaptureResultSchema)
18
+ * ```
19
+ *
20
+ * @packageDocumentation
21
+ */
22
+
23
+ import { z } from 'zod'
24
+
25
+ // ============================================================================
26
+ // Session Types
27
+ // ============================================================================
28
+
29
/**
 * Zod schema for the session object found in session creation responses.
 *
 * @remarks
 * - `id`: session identifier.
 * - `_meta`: free-form record of extra data; may be omitted or `null`.
 */
export const SessionSchema = z.object({
  id: z.string(),
  _meta: z.optional(z.nullable(z.record(z.string(), z.unknown()))),
})

/** Static session type inferred from `SessionSchema`. */
export type Session = z.infer<typeof SessionSchema>
39
+
40
+ // ============================================================================
41
+ // JSON-RPC 2.0 Schemas (for headless adapter)
42
+ // ============================================================================
43
+
44
/** Accepts only the literal `'2.0'`, the value used for every `jsonrpc` field below. */
const JsonRpcVersionSchema = z.literal('2.0')

/** Identifier shared by a request and its response: either a string or a number. */
const RequestIdSchema = z.string().or(z.number())
49
+
50
/**
 * Schema for the `error` member of a JSON-RPC 2.0 error response.
 *
 * @remarks
 * `code` is validated only as a number. The standard codes are:
 * - `-32700`: Parse error
 * - `-32600`: Invalid request
 * - `-32601`: Method not found
 * - `-32602`: Invalid params
 * - `-32603`: Internal error
 *
 * `data` is an optional, unvalidated payload.
 */
export const JsonRpcErrorSchema = z.object({
  code: z.number(),
  message: z.string(),
  data: z.optional(z.unknown()),
})

/** Static type of a JSON-RPC 2.0 error object, inferred from `JsonRpcErrorSchema`. */
export type JsonRpcError = z.infer<typeof JsonRpcErrorSchema>
69
+
70
/**
 * Schema for a JSON-RPC 2.0 request, i.e. a method call that carries an `id`.
 *
 * @remarks
 * `params` is optional and not validated beyond being present or absent.
 */
export const JsonRpcRequestSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: RequestIdSchema,
  method: z.string(),
  params: z.optional(z.unknown()),
})

/** Request fields other than `params`, as inferred from the schema. */
type JsonRpcRequestEnvelope = Omit<z.infer<typeof JsonRpcRequestSchema>, 'params'>

/**
 * JSON-RPC 2.0 request whose `params` payload is typed by the caller.
 *
 * @typeParam T - Type of the optional `params` member (defaults to `unknown`).
 */
export type JsonRpcRequest<T = unknown> = JsonRpcRequestEnvelope & { params?: T }
82
+
83
/**
 * Schema for a JSON-RPC 2.0 notification: same as a request but with no `id`,
 * so no response is expected.
 */
export const JsonRpcNotificationSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  method: z.string(),
  params: z.optional(z.unknown()),
})

/** Notification fields other than `params`, as inferred from the schema. */
type JsonRpcNotificationEnvelope = Omit<z.infer<typeof JsonRpcNotificationSchema>, 'params'>

/**
 * JSON-RPC 2.0 notification whose `params` payload is typed by the caller.
 *
 * @typeParam T - Type of the optional `params` member (defaults to `unknown`).
 */
export type JsonRpcNotification<T = unknown> = JsonRpcNotificationEnvelope & { params?: T }
94
+
95
/**
 * Schema for a successful JSON-RPC 2.0 response.
 *
 * @remarks
 * `result` is declared as `unknown`; its contents are not validated here.
 */
export const JsonRpcSuccessResponseSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: RequestIdSchema,
  result: z.unknown(),
})

/** Success-response fields other than `result`, as inferred from the schema. */
type JsonRpcSuccessEnvelope = Omit<z.infer<typeof JsonRpcSuccessResponseSchema>, 'result'>

/**
 * Successful JSON-RPC 2.0 response whose `result` is typed by the caller.
 *
 * @typeParam T - Type of the required `result` member (defaults to `unknown`).
 */
export type JsonRpcSuccessResponse<T = unknown> = JsonRpcSuccessEnvelope & { result: T }
106
+
107
/**
 * Schema for a JSON-RPC 2.0 error response.
 *
 * @remarks
 * Unlike success responses, `id` may also be `null`.
 */
export const JsonRpcErrorResponseSchema = z.object({
  jsonrpc: JsonRpcVersionSchema,
  id: RequestIdSchema.or(z.null()),
  error: JsonRpcErrorSchema,
})

/** Static type of a JSON-RPC 2.0 error response, inferred from the schema. */
export type JsonRpcErrorResponse = z.infer<typeof JsonRpcErrorResponseSchema>
116
+
117
/**
 * Union of all JSON-RPC response shapes (error or success).
 *
 * @remarks
 * Member order is significant. Zod unions return the first member that
 * validates, and plain `z.object` schemas drop unrecognised keys.
 * `JsonRpcSuccessResponseSchema` declares `result` as `z.unknown()`, which
 * also accepts a missing value, so an error response with a string/number
 * `id` would satisfy it and be returned with its `error` member stripped.
 * The error schema is therefore tried first: it requires a well-formed
 * `error` object, so genuine success responses still fall through to the
 * success schema.
 */
export const JsonRpcResponseSchema = z.union([JsonRpcErrorResponseSchema, JsonRpcSuccessResponseSchema])

/**
 * Union of all JSON-RPC response types.
 *
 * @typeParam T - Type of `result` on the success branch (defaults to `unknown`).
 */
export type JsonRpcResponse<T = unknown> = JsonRpcSuccessResponse<T> | JsonRpcErrorResponse
122
+
123
/**
 * Schema accepting any JSON-RPC message: request, notification, or response.
 *
 * @remarks
 * Intended for `safeParse` at transport boundaries, where incoming data needs
 * runtime validation. Members are tried in the order listed.
 */
export const JsonRpcMessageSchema = z.union([
  JsonRpcRequestSchema,
  JsonRpcNotificationSchema,
  JsonRpcResponseSchema,
])

/**
 * Any JSON-RPC message.
 *
 * @typeParam T - Payload type applied to `params` (requests, notifications)
 * and to `result` (success responses); defaults to `unknown`.
 */
export type JsonRpcMessage<T = unknown> =
  | JsonRpcRequest<T>
  | JsonRpcNotification<T>
  | JsonRpcResponse<T>
133
+
134
+ // ============================================================================
135
+ // MCP Server Configuration Schemas
136
+ // ============================================================================
137
+
138
/** One environment variable, expressed as a `name`/`value` pair of strings. */
export const EnvVariableSchema = z.object({ name: z.string(), value: z.string() })

/** One HTTP header, expressed as a `name`/`value` pair of strings. */
export const HttpHeaderSchema = z.object({ name: z.string(), value: z.string() })
149
+
150
/**
 * MCP server reached over stdio.
 *
 * @remarks
 * `type` may be omitted; when present it must be `'stdio'`.
 * `args` and `env` are required arrays (they may be empty).
 */
export const McpServerStdioSchema = z.object({
  type: z.optional(z.literal('stdio')),
  name: z.string(),
  command: z.string(),
  args: z.string().array(),
  env: EnvVariableSchema.array(),
})
158
+
159
/**
 * MCP server reached over HTTP.
 *
 * @remarks
 * `type` is required and must be `'http'`. `headers` is a required array
 * (it may be empty). `url` is validated only as a string.
 */
export const McpServerHttpSchema = z.object({
  type: z.literal('http'),
  name: z.string(),
  url: z.string(),
  headers: HttpHeaderSchema.array(),
})
166
+
167
/** MCP server configuration: the stdio variant is tried first, then the HTTP variant. */
export const McpServerSchema = McpServerStdioSchema.or(McpServerHttpSchema)

/** Static MCP server configuration type, inferred from `McpServerSchema`. */
export type McpServerConfig = z.infer<typeof McpServerSchema>
172
+
173
+ // ============================================================================
174
+ // Harness Input Schemas
175
+ // ============================================================================
176
+
177
/**
 * Schema for one evaluation input (a "prompt case").
 *
 * @remarks
 * Every line of a prompts.jsonl file is expected to match this shape.
 * - Single turn: `input: "Hello"` - one prompt, one session
 * - Multi-turn: `input: ["Hello", "How are you?", "Goodbye"]` - the turns run
 *   sequentially in one session
 */
export const PromptCaseSchema = z.object({
  /** Identifier that distinguishes this test case from the others */
  id: z.string(),
  /** Prompt text: a single string, or an array of strings for a multi-turn conversation */
  input: z.string().or(z.string().array()),
  /** Context hint handed to the grader; not a strict expected value (optional) */
  hint: z.optional(z.string()),
  /** Reference solution used for validation (optional) */
  reference: z.optional(z.string()),
  /** Free-form metadata for categorization and analysis (optional) */
  metadata: z.optional(z.record(z.string(), z.unknown())),
  /** Timeout override for this case, in milliseconds (optional) */
  timeout: z.optional(z.number()),
})

/** Static prompt-case type, inferred from `PromptCaseSchema`. */
export type PromptCase = z.infer<typeof PromptCaseSchema>
202
+
203
+ // ============================================================================
204
+ // Grader Schemas
205
+ // ============================================================================
206
+
207
/**
 * Schema for the value a user-provided grader function returns.
 *
 * @remarks
 * `score` is constrained to the inclusive range 0 to 1.
 */
export const GraderResultSchema = z.object({
  /** True when the output meets the evaluation criteria */
  pass: z.boolean(),
  /** Score between 0.0 and 1.0, both inclusive */
  score: z.number().gte(0).lte(1),
  /** Explanation of the score (optional) */
  reasoning: z.optional(z.string()),
})

/** Static grader-result type, inferred from `GraderResultSchema`. */
export type GraderResult = z.infer<typeof GraderResultSchema>
224
+
225
/**
 * Argument object passed to a grader.
 *
 * - `input`: the original prompt (string, or array for multi-turn)
 * - `output`: the agent output to score
 * - `hint`: optional grader context (renamed from `expected`)
 * - `trajectory`: optional list of trajectory steps
 */
type GraderParams = {
  input: string | string[]
  output: string
  hint?: string
  trajectory?: TrajectoryStep[]
}

/**
 * Signature that user-provided graders implement to score agent outputs.
 *
 * @remarks
 * A grader receives a single parameter object and resolves to a `GraderResult`.
 */
export type Grader = (params: GraderParams) => Promise<GraderResult>
239
+
240
+ // ============================================================================
241
+ // Trajectory Schemas
242
+ // ============================================================================
243
+
244
/**
 * Schema used to pull file paths and content out of a tool's input.
 *
 * @remarks
 * All four named fields are optional strings. `.passthrough()` keeps any other
 * keys on the parsed value instead of stripping them.
 */
export const ToolInputSchema = z
  .object({
    file_path: z.optional(z.string()),
    path: z.optional(z.string()),
    content: z.optional(z.string()),
    new_string: z.optional(z.string()),
  })
  .passthrough()

/** Static tool-input type, inferred from `ToolInputSchema`. */
export type ToolInput = z.infer<typeof ToolInputSchema>
256
+
257
/**
 * Trajectory step of type `'thought'`.
 * Carries text `content`, a numeric `timestamp`, and an optional `stepId`.
 */
export const ThoughtStepSchema = z.object({
  type: z.literal('thought'),
  content: z.string(),
  timestamp: z.number(),
  stepId: z.optional(z.string()),
})
264
+
265
/**
 * Trajectory step of type `'message'`.
 * Carries text `content`, a numeric `timestamp`, and an optional `stepId`.
 */
export const MessageStepSchema = z.object({
  type: z.literal('message'),
  content: z.string(),
  timestamp: z.number(),
  stepId: z.optional(z.string()),
})
272
+
273
/**
 * Trajectory step of type `'tool_call'`.
 *
 * @remarks
 * `name`, `status` and `timestamp` are required. `input` and `output` are
 * optional and unvalidated; `duration` and `stepId` are optional.
 */
export const ToolCallStepSchema = z.object({
  type: z.literal('tool_call'),
  name: z.string(),
  status: z.string(),
  input: z.optional(z.unknown()),
  output: z.optional(z.unknown()),
  duration: z.optional(z.number()),
  timestamp: z.number(),
  stepId: z.optional(z.string()),
})
284
+
285
/**
 * Trajectory step of type `'plan'`.
 * `entries` is an array whose items are not validated.
 */
export const PlanStepSchema = z.object({
  type: z.literal('plan'),
  entries: z.unknown().array(),
  timestamp: z.number(),
  stepId: z.optional(z.string()),
})
292
+
293
/**
 * Schema for a single step in the agent's execution trajectory.
 *
 * @remarks
 * A discriminated union keyed on `type`, covering the thought, message,
 * tool-call and plan step schemas.
 */
export const TrajectoryStepSchema = z.discriminatedUnion('type', [ThoughtStepSchema, MessageStepSchema, ToolCallStepSchema, PlanStepSchema])

/** Static trajectory-step type, inferred from `TrajectoryStepSchema`. */
export type TrajectoryStep = z.infer<typeof TrajectoryStepSchema>

/** A trajectory step whose `stepId` is required, so the step can be correlated by ID. */
export type IndexedStep = TrajectoryStep & { stepId: string }
311
+
312
+ // ============================================================================
313
+ // Capture Result Schemas
314
+ // ============================================================================
315
+
316
/**
 * Schema for the timing block of a capture result.
 *
 * @remarks
 * Holds absolute timestamps alongside derived durations:
 * - `sessionCreation`: time to initialize the session (agent startup overhead)
 * - `total`: end-to-end duration including all turns
 * - `firstResponse`: latency to the first agent output (optional)
 *
 * Token counts depend on the adapter and are present only when the adapter
 * exposes usage information (e.g., Claude Code includes them, others may not).
 *
 * @public
 */
export const TimingSchema = z.object({
  /** Epoch timestamp at which the capture started */
  start: z.number(),
  /** Epoch timestamp at which the capture ended */
  end: z.number(),
  /** Milliseconds from `start` to the first response (optional) */
  firstResponse: z.optional(z.number()),
  /** Milliseconds spent creating the session, i.e. agent initialization overhead */
  sessionCreation: z.number(),
  /** Total duration in milliseconds (end - start) */
  total: z.number(),
  /** Input tokens consumed, when the headless adapter reports them */
  inputTokens: z.optional(z.number()),
  /** Output tokens generated, when the headless adapter reports them */
  outputTokens: z.optional(z.number()),
})

/**
 * Static timing type, inferred from `TimingSchema`.
 *
 * @public
 */
export type Timing = z.infer<typeof TimingSchema>
353
+
354
/** Allowed trajectory richness levels, in declaration order. */
const TRAJECTORY_RICHNESS_LEVELS = ['full', 'minimal', 'messages-only'] as const

/**
 * Schema for the trajectory richness level, i.e. how much agent activity was captured.
 *
 * @remarks
 * Adapters differ in the detail they provide:
 * - `full`: Thoughts, tool calls, plans (e.g., Claude Code adapter)
 * - `minimal`: Basic output only (e.g., Droid adapter)
 * - `messages-only`: Messages without internal reasoning
 */
export const TrajectoryRichnessSchema = z.enum(TRAJECTORY_RICHNESS_LEVELS)

/** Static trajectory-richness type, inferred from `TrajectoryRichnessSchema`. */
export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
367
+
368
/**
 * Schema for one record emitted by the `capture` command (full trajectory output).
 *
 * @remarks
 * - `input` is a string (single turn) or string[] (multi-turn)
 * - `hint` provides grader context (renamed from `expected`)
 * - `toolErrors` replaces the misleading `status: 'passed'|'failed'`;
 *   the real pass/fail determination comes from your grader.
 */
export const CaptureResultSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** Original prompt input: string for a single turn, array for multi-turn */
  input: z.string().or(z.string().array()),
  /** Final output produced by the agent */
  output: z.string(),
  /** Grader context hint (renamed from expected); optional */
  hint: z.optional(z.string()),
  /** Complete execution trajectory */
  trajectory: TrajectoryStepSchema.array(),
  /** Metadata such as category, agent info, trajectoryRichness, turnCount */
  metadata: z.record(z.string(), z.unknown()),
  /** Timing block for this capture */
  timing: TimingSchema,
  /** True when any tool call failed */
  toolErrors: z.boolean(),
  /** Error messages, when there are any */
  errors: z.optional(z.string().array()),
  /** Grader result, present only when a grader was provided */
  score: z.optional(GraderResultSchema),
})

/** Static capture-result type, inferred from `CaptureResultSchema`. */
export type CaptureResult = z.infer<typeof CaptureResultSchema>
403
+
404
+ // ============================================================================
405
+ // Summary Result Schemas
406
+ // ============================================================================
407
+
408
/**
 * Schema for the compact record produced by the `summarize` command from a
 * full capture result.
 *
 * @remarks
 * NOTE(review): `input` accepts only a string here, whereas
 * `CaptureResultSchema.input` also accepts `string[]` for multi-turn cases.
 * Presumably `summarize` flattens multi-turn input before validating; confirm.
 */
export const SummaryResultSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** Original prompt input */
  input: z.string(),
  /** Final output produced by the agent */
  output: z.string(),
  /** Names of the tools that were called */
  toolCalls: z.string().array(),
  /** Duration in milliseconds */
  duration: z.number(),
})

/** Static summary-result type, inferred from `SummaryResultSchema`. */
export type SummaryResult = z.infer<typeof SummaryResultSchema>
429
+
430
+ // ============================================================================
431
+ // Trial Result Schemas
432
+ // ============================================================================
433
+
434
/**
 * Schema for one trial inside a trial run.
 *
 * @remarks
 * `pass`, `score` and `reasoning` are present only when a grader was provided.
 * NOTE(review): `score` is an unconstrained number here, while
 * `GraderResultSchema.score` is limited to 0..1; confirm that is intended.
 */
export const TrialEntrySchema = z.object({
  /** Trial number, starting at 1 */
  trialNum: z.number(),
  /** Agent output for this trial */
  output: z.string(),
  /** Complete trajectory for this trial */
  trajectory: TrajectoryStepSchema.array(),
  /** Duration in milliseconds */
  duration: z.number(),
  /** Pass/fail verdict (grader only) */
  pass: z.optional(z.boolean()),
  /** Numeric score (grader only) */
  score: z.optional(z.number()),
  /** Grader reasoning (grader only) */
  reasoning: z.optional(z.string()),
})

/** Static trial-entry type, inferred from `TrialEntrySchema`. */
export type TrialEntry = z.infer<typeof TrialEntrySchema>
454
+
455
/**
 * Schema for one record emitted by the `trials` command, used for
 * pass@k / pass^k analysis.
 *
 * @remarks
 * The metrics (`passRate`, `passAtK`, `passExpK`) are present only when a
 * grader is provided.
 */
export const TrialResultSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** Original prompt input: string for a single turn, array for multi-turn */
  input: z.string().or(z.string().array()),
  /** Grader context hint (renamed from expected); optional */
  hint: z.optional(z.string()),
  /** Number of trials (k) */
  k: z.number(),
  /** Simple pass rate, passes / k (grader only) */
  passRate: z.optional(z.number()),
  /** pass@k: probability of at least one pass in k samples (grader only) */
  passAtK: z.optional(z.number()),
  /** pass^k: probability of all k samples passing (grader only) */
  passExpK: z.optional(z.number()),
  /** Results of the individual trials */
  trials: TrialEntrySchema.array(),
})

/** Static trial-result type, inferred from `TrialResultSchema`. */
export type TrialResult = z.infer<typeof TrialResultSchema>
483
+
484
+ // ============================================================================
485
+ // Calibration Schemas
486
+ // ============================================================================
487
+
488
/**
 * Schema for a calibration sample: a graded result presented for grader review.
 *
 * @remarks
 * `originalScore` is required; `rescoredResult` is present only when a
 * different grader was provided for re-scoring.
 */
export const CalibrationSampleSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** Original prompt input: string for a single turn, array for multi-turn */
  input: z.string().or(z.string().array()),
  /** Output produced by the agent */
  output: z.string(),
  /** Grader context hint (renamed from expected); optional */
  hint: z.optional(z.string()),
  /** Score given by the original grader */
  originalScore: GraderResultSchema,
  /** Result of re-scoring, if a different grader was provided */
  rescoredResult: z.optional(GraderResultSchema),
  /** Key snippets of the trajectory */
  trajectorySnippet: TrajectoryStepSchema.array(),
})

/** Static calibration-sample type, inferred from `CalibrationSampleSchema`. */
export type CalibrationSample = z.infer<typeof CalibrationSampleSchema>
508
+
509
+ // ============================================================================
510
+ // Balance Analysis Schemas
511
+ // ============================================================================
512
+
513
/** Schema describing how one category is represented in the test set. */
export const CategoryDistributionSchema = z.object({
  /** Name of the category */
  name: z.string(),
  /** How many test cases fall in this category */
  count: z.number(),
  /** Share of the total, as a percentage */
  percentage: z.number(),
})

/** Static category-distribution type, inferred from `CategoryDistributionSchema`. */
export type CategoryDistribution = z.infer<typeof CategoryDistributionSchema>
525
+
526
/** Schema for the result of a balance analysis over the test set. */
export const BalanceAnalysisSchema = z.object({
  /** Total number of test cases */
  totalCases: z.number(),
  /** Per-category distribution */
  categories: CategoryDistributionSchema.array(),
  /** Names of categories that may need more test cases */
  underrepresented: z.string().array(),
  /** Suggested improvements */
  suggestions: z.string().array(),
})

/** Static balance-analysis type, inferred from `BalanceAnalysisSchema`. */
export type BalanceAnalysis = z.infer<typeof BalanceAnalysisSchema>
540
+
541
+ // ============================================================================
542
+ // Validation Reference Schemas
543
+ // ============================================================================
544
+
545
/** Schema for the outcome of validating one reference solution against a grader. */
export const ValidationResultSchema = z.object({
  /** Identifier of the test case */
  id: z.string(),
  /** The reference solution that was provided */
  reference: z.string(),
  /** True when the reference passes the grader */
  passes: z.boolean(),
  /** Full result returned by the grader */
  graderResult: GraderResultSchema,
})

/** Static validation-result type, inferred from `ValidationResultSchema`. */
export type ValidationResult = z.infer<typeof ValidationResultSchema>
@@ -0,0 +1,121 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import {
3
+ DEFAULT_CALIBRATION_SAMPLE_SIZE,
4
+ DEFAULT_CLIENT_NAME,
5
+ DEFAULT_HARNESS_TIMEOUT,
6
+ DEFAULT_POLLING_INTERVAL,
7
+ DEFAULT_PROTOCOL_TIMEOUT,
8
+ DEFAULT_TRIAL_COUNT,
9
+ HEAD_LINES,
10
+ JSON_RPC_ERRORS,
11
+ MAX_CONTENT_LENGTH,
12
+ PROTOCOL_METHODS,
13
+ PROTOCOL_VERSION,
14
+ TAIL_LINES,
15
+ } from '../constants.ts'
16
+
17
+ // ============================================================================
18
+ // JSON-RPC Protocol Constants
19
+ // ============================================================================
20
+
21
// Pins the wire-level method names exposed through PROTOCOL_METHODS.
describe('PROTOCOL_METHODS', () => {
  test('contains all required lifecycle methods', () => {
    const lifecycle = [
      [PROTOCOL_METHODS.INITIALIZE, 'initialize'],
      [PROTOCOL_METHODS.SHUTDOWN, 'shutdown'],
    ] as const

    for (const [actual, expected] of lifecycle) {
      expect(actual).toBe(expected)
    }
  })

  test('contains all required session methods', () => {
    const session = [
      [PROTOCOL_METHODS.CREATE_SESSION, 'session/new'],
      [PROTOCOL_METHODS.LOAD_SESSION, 'session/load'],
      [PROTOCOL_METHODS.PROMPT, 'session/prompt'],
      [PROTOCOL_METHODS.CANCEL, 'session/cancel'],
      [PROTOCOL_METHODS.UPDATE, 'session/update'],
      [PROTOCOL_METHODS.REQUEST_PERMISSION, 'session/request_permission'],
      [PROTOCOL_METHODS.SET_MODEL, 'session/set_model'],
    ] as const

    for (const [actual, expected] of session) {
      expect(actual).toBe(expected)
    }
  })

  test('contains protocol-level methods', () => {
    const { CANCEL_REQUEST } = PROTOCOL_METHODS
    expect(CANCEL_REQUEST).toBe('$/cancel_request')
  })
})
41
+
42
// Pins the protocol version constant.
describe('PROTOCOL_VERSION', () => {
  test('is version 1', () => {
    const expectedVersion = 1
    expect(PROTOCOL_VERSION).toBe(expectedVersion)
  })
})
47
+
48
+ // ============================================================================
49
+ // JSON-RPC Error Codes
50
+ // ============================================================================
51
+
52
// Pins the numeric values of the JSON-RPC error code table.
describe('JSON_RPC_ERRORS', () => {
  test('contains standard JSON-RPC error codes', () => {
    const standardCodes = [
      [JSON_RPC_ERRORS.PARSE_ERROR, -32700],
      [JSON_RPC_ERRORS.INVALID_REQUEST, -32600],
      [JSON_RPC_ERRORS.METHOD_NOT_FOUND, -32601],
      [JSON_RPC_ERRORS.INVALID_PARAMS, -32602],
      [JSON_RPC_ERRORS.INTERNAL_ERROR, -32603],
    ] as const

    for (const [actual, expected] of standardCodes) {
      expect(actual).toBe(expected)
    }
  })

  test('contains extension error codes', () => {
    const { REQUEST_CANCELLED } = JSON_RPC_ERRORS
    expect(REQUEST_CANCELLED).toBe(-32800)
  })
})
65
+
66
+ // ============================================================================
67
+ // Client Defaults
68
+ // ============================================================================
69
+
70
// Pins the client-side default constants.
describe('Client defaults', () => {
  test('DEFAULT_CLIENT_NAME is set', () => {
    const expectedName = 'plaited-eval-harness'
    expect(DEFAULT_CLIENT_NAME).toBe(expectedName)
  })

  test('DEFAULT_PROTOCOL_TIMEOUT is 30 seconds', () => {
    const thirtySecondsMs = 30000
    expect(DEFAULT_PROTOCOL_TIMEOUT).toBe(thirtySecondsMs)
  })

  test('DEFAULT_POLLING_INTERVAL is 50ms', () => {
    const fiftyMs = 50
    expect(DEFAULT_POLLING_INTERVAL).toBe(fiftyMs)
  })
})
83
+
84
+ // ============================================================================
85
+ // Harness Preview Configuration
86
+ // ============================================================================
87
+
88
// Pins the preview constants: each must be positive and equal its expected value.
describe('Preview configuration', () => {
  /** Asserts positivity first, then the exact value, as two separate expectations. */
  const expectPositiveAndExactly = (actual: number, expected: number) => {
    expect(actual).toBeGreaterThan(0)
    expect(actual).toBe(expected)
  }

  test('HEAD_LINES is positive', () => {
    expectPositiveAndExactly(HEAD_LINES, 8)
  })

  test('TAIL_LINES is positive', () => {
    expectPositiveAndExactly(TAIL_LINES, 4)
  })

  test('MAX_CONTENT_LENGTH is reasonable', () => {
    expectPositiveAndExactly(MAX_CONTENT_LENGTH, 500)
  })
})
104
+
105
+ // ============================================================================
106
+ // Harness Defaults
107
+ // ============================================================================
108
+
109
// Pins the harness-level default constants.
describe('Harness defaults', () => {
  test('DEFAULT_HARNESS_TIMEOUT is 60 seconds', () => {
    const sixtySecondsMs = 60000
    expect(DEFAULT_HARNESS_TIMEOUT).toBe(sixtySecondsMs)
  })

  test('DEFAULT_TRIAL_COUNT is 5', () => {
    const expectedTrials = 5
    expect(DEFAULT_TRIAL_COUNT).toBe(expectedTrials)
  })

  test('DEFAULT_CALIBRATION_SAMPLE_SIZE is 10', () => {
    const expectedSampleSize = 10
    expect(DEFAULT_CALIBRATION_SAMPLE_SIZE).toBe(expectedSampleSize)
  })
})
@@ -0,0 +1,5 @@
1
/**
 * Test fixture: an invalid TypeScript grader module.
 *
 * The module is invalid because it has no 'grade' export; it exports
 * `evaluate` instead, which returns a fixed passing result.
 */

export const evaluate = () => {
  return { pass: true, score: 1.0 }
}
@@ -0,0 +1,9 @@
1
#!/usr/bin/env python3
"""
Test fixture: Python grader that always fails.

Writes a fixed message to stderr (no trailing newline) and exits with status 1.
"""

import sys

print("Intentional failure", file=sys.stderr, end="")
raise SystemExit(1)
@@ -0,0 +1,6 @@
1
#!/usr/bin/env python3
"""
Test fixture: Python grader whose stdout is not valid JSON.
"""

# Plain text on purpose: this line is the grader's entire output.
INVALID_OUTPUT = "not valid json"

print(INVALID_OUTPUT)