@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,356 @@
1
+ /**
2
+ * Unit tests for pipeline commands.
3
+ *
4
+ * @remarks
5
+ * Tests for the Unix-style pipeline commands:
6
+ * - format: formatMarkdown, formatCsv helpers
7
+ * - compare: parseLabeledRun helper
8
+ * - type validation
9
+ *
10
+ * @packageDocumentation
11
+ */
12
+
13
+ import { describe, expect, test } from 'bun:test'
14
+ import type {
15
+ ComparisonGraderInput,
16
+ ComparisonGraderResult,
17
+ ExtractedResult,
18
+ FormatStyle,
19
+ GradedResult,
20
+ LabeledRun,
21
+ RawOutput,
22
+ } from '../pipeline.types.ts'
23
+
24
+ // ============================================================================
25
+ // Type Validation Tests
26
+ // ============================================================================
27
+
28
describe('RawOutput type', () => {
  // Smallest useful record: id, single prompt, captured lines, timing.
  test('accepts valid raw output', () => {
    const timing = { start: 1000, end: 2000, total: 1000 }
    const record: RawOutput = {
      id: 'test-001',
      input: 'What is 2+2?',
      rawLines: ['{"type":"message","content":"4"}'],
      timing,
    }

    expect(record.id).toBe('test-001')
    expect(record.timing.total).toBe(1000)
  })

  // `input` is allowed to be a list of prompts instead of a single string.
  test('accepts array input for multi-turn', () => {
    const record: RawOutput = {
      id: 'multi-001',
      input: ['Hello', 'How are you?'],
      rawLines: [],
      timing: { start: 0, end: 100, total: 100 },
    }

    const { input } = record
    expect(Array.isArray(input)).toBe(true)
    expect((input as string[]).length).toBe(2)
  })

  // `hint` is an optional field on the record.
  test('accepts optional hint', () => {
    const record: RawOutput = {
      id: 'hint-001',
      input: 'Calculate something',
      hint: 'Expected: numeric answer',
      rawLines: [],
      timing: { start: 0, end: 0, total: 0 },
    }

    expect(record.hint).toBe('Expected: numeric answer')
  })

  // `error` is an optional field on the record.
  test('accepts optional error', () => {
    const record: RawOutput = {
      id: 'error-001',
      input: 'fail test',
      rawLines: [],
      timing: { start: 0, end: 100, total: 100 },
      error: 'Timeout exceeded',
    }

    expect(record.error).toBe('Timeout exceeded')
  })
})
77
+
78
describe('ExtractedResult type', () => {
  // A single message step is enough to form a valid extracted result.
  test('accepts valid extracted result', () => {
    const steps: ExtractedResult['trajectory'] = [{ type: 'message', content: '4', timestamp: 100 }]
    const result: ExtractedResult = {
      id: 'test-001',
      input: 'What is 2+2?',
      output: '4',
      trajectory: steps,
      toolErrors: false,
      timing: { start: 0, end: 100, total: 100 },
    }

    expect(result.output).toBe('4')
    expect(result.trajectory.length).toBe(1)
    expect(result.toolErrors).toBe(false)
  })

  // Trajectories may mix thought, tool_call and message steps.
  test('accepts thought and tool_call steps', () => {
    const steps: ExtractedResult['trajectory'] = [
      { type: 'thought', content: 'I need to create a file', timestamp: 50 },
      {
        type: 'tool_call',
        name: 'Write',
        input: { path: '/tmp/test.txt', content: 'hello' },
        status: 'completed',
        timestamp: 200,
      },
      { type: 'message', content: 'Done', timestamp: 250 },
    ]
    const result: ExtractedResult = {
      id: 'complex-001',
      input: 'Create a file',
      output: 'Done',
      trajectory: steps,
      toolErrors: false,
      timing: { start: 0, end: 300, total: 300 },
    }

    expect(result.trajectory.length).toBe(3)
    expect(result.trajectory[1]?.type).toBe('tool_call')
  })
})
122
+
123
describe('GradedResult type', () => {
  // Both cases use the same timing values; nothing mutates this object.
  const timing = { start: 0, end: 100, total: 100 }

  // A graded result is an extracted result plus a `score` object.
  test('extends ExtractedResult with score', () => {
    const passing: GradedResult = {
      id: 'graded-001',
      input: 'What is 2+2?',
      output: '4',
      trajectory: [],
      toolErrors: false,
      timing,
      score: { pass: true, score: 1.0, reasoning: 'Correct answer' },
    }

    const { score } = passing
    expect(score.pass).toBe(true)
    expect(score.score).toBe(1.0)
    expect(score.reasoning).toBe('Correct answer')
  })

  // A failing grade uses the same shape with `pass: false`.
  test('accepts failing score', () => {
    const failing: GradedResult = {
      id: 'fail-001',
      input: 'What is 2+2?',
      output: '5',
      trajectory: [],
      toolErrors: false,
      timing,
      score: { pass: false, score: 0.0, reasoning: 'Incorrect answer' },
    }

    const { score } = failing
    expect(score.pass).toBe(false)
    expect(score.score).toBe(0.0)
  })
})
161
+
162
describe('FormatStyle type', () => {
  // Each of the three literals must be assignable to FormatStyle.
  test('accepts valid format styles', () => {
    const accepted: FormatStyle[] = ['jsonl', 'markdown', 'csv']

    for (const style of ['jsonl', 'markdown', 'csv']) {
      expect(accepted).toContain(style)
    }
  })
})
170
+
171
describe('LabeledRun type', () => {
  // A labeled run pairs a display label with the path of a results file.
  test('accepts label and path', () => {
    const labeled: LabeledRun = { label: 'baseline', path: './results/baseline.jsonl' }

    const { label, path } = labeled
    expect(label).toBe('baseline')
    expect(path).toBe('./results/baseline.jsonl')
  })
})
181
+
182
describe('ComparisonGraderInput type', () => {
  // `runs` is keyed by run label; `trajectory` is optional per run.
  test('accepts multiple runs', () => {
    const graderInput: ComparisonGraderInput = {
      id: 'compare-001',
      input: 'What is 2+2?',
      runs: {
        baseline: { output: '4' },
        experiment: { output: 'Four', trajectory: [] },
      },
    }

    const { runs } = graderInput
    expect(Object.keys(runs).length).toBe(2)
    expect(runs.baseline?.output).toBe('4')
    expect(runs.experiment?.trajectory).toEqual([])
  })
})
197
+
198
describe('ComparisonGraderResult type', () => {
  // A result carries one ranking entry per run plus free-text reasoning.
  test('accepts rankings with reasoning', () => {
    const rankings: ComparisonGraderResult['rankings'] = [
      { run: 'baseline', rank: 1, score: 0.95 },
      { run: 'experiment', rank: 2, score: 0.8 },
    ]
    const verdict: ComparisonGraderResult = {
      rankings,
      reasoning: 'Baseline was more concise',
    }

    expect(verdict.rankings.length).toBe(2)
    expect(verdict.rankings[0]?.rank).toBe(1)
    expect(verdict.reasoning).toBeDefined()
  })
})
212
+
213
+ // ============================================================================
214
+ // Helper Contract Tests (simulated; helpers are not imported)
215
+ // ============================================================================
216
+
217
+ // Note: Some helper functions are not exported from the modules.
218
+ // These tests verify the type contracts that the helpers must satisfy.
219
+
220
describe('pipeline data flow', () => {
  // Fields shared by RawOutput and ExtractedResult must be assignable across.
  test('RawOutput can flow to ExtractedResult', () => {
    const source: RawOutput = {
      id: 'flow-001',
      input: 'test',
      hint: 'expected: something',
      rawLines: ['{"type":"message","content":"result"}'],
      timing: { start: 0, end: 100, total: 100 },
    }

    // Hand-built stand-in for the extract stage: copy the shared fields over.
    const { id, input, hint, timing } = source
    const derived: ExtractedResult = {
      id,
      input,
      hint,
      output: 'result',
      trajectory: [{ type: 'message', content: 'result', timestamp: 100 }],
      toolErrors: false,
      timing,
    }

    expect(derived.id).toBe(source.id)
    expect(derived.input).toBe(source.input)
    expect(derived.hint).toBe(source.hint)
  })

  // Spreading an ExtractedResult and adding `score` must yield a GradedResult.
  test('ExtractedResult can flow to GradedResult', () => {
    const source: ExtractedResult = {
      id: 'grade-flow-001',
      input: 'test',
      output: 'result',
      trajectory: [],
      toolErrors: false,
      timing: { start: 0, end: 100, total: 100 },
    }

    // Hand-built stand-in for the grade stage.
    const score = { pass: true, score: 1.0 }
    const derived: GradedResult = { ...source, score }

    expect(derived.id).toBe(source.id)
    expect(derived.score.pass).toBe(true)
  })
})
266
+
267
describe('comparison data structures', () => {
  // Inline re-creation of deriving a label from the file name.
  test('LabeledRun derived from filename', () => {
    const path = '/path/to/results-baseline.jsonl'
    const segments = path.split('/')
    const fileName = segments.pop() ?? ''
    const labeled: LabeledRun = { label: fileName.replace('.jsonl', ''), path }

    expect(labeled.label).toBe('results-baseline')
  })

  // Inline re-creation of the explicit `label:path` argument form.
  test('LabeledRun with explicit label', () => {
    const arg = 'my-baseline:/path/to/results.jsonl'
    const separatorAt = arg.indexOf(':')
    const labeled: LabeledRun = {
      label: arg.slice(0, separatorAt),
      path: arg.slice(separatorAt + 1),
    }

    expect(labeled.label).toBe('my-baseline')
    expect(labeled.path).toBe('/path/to/results.jsonl')
  })

  // Two runs over the same prompts collapse to one entry per prompt id.
  test('comparison aggregates results by prompt ID', () => {
    const firstRun = [
      { id: 'p1', output: 'a' },
      { id: 'p2', output: 'b' },
    ]
    const secondRun = [
      { id: 'p1', output: 'x' },
      { id: 'p2', output: 'y' },
    ]

    // Collect the distinct prompt ids seen in either run.
    const seenIds = new Set<string>()
    for (const { id } of [...firstRun, ...secondRun]) {
      seenIds.add(id)
    }
    expect(seenIds.size).toBe(2)

    const outputFor = (rows: { id: string; output: string }[], id: string) =>
      rows.find((row) => row.id === id)?.output ?? ''

    const graderInput: ComparisonGraderInput = {
      id: 'p1',
      input: 'test prompt',
      runs: {
        run1: { output: outputFor(firstRun, 'p1') },
        run2: { output: outputFor(secondRun, 'p1') },
      },
    }

    expect(graderInput.runs.run1?.output).toBe('a')
    expect(graderInput.runs.run2?.output).toBe('x')
  })
})
316
+
317
describe('format style contracts', () => {
  // Checks that graded rows expose what a pass-rate summary needs.
  test('markdown format includes summary when graded', () => {
    const timing = { start: 0, end: 100, total: 100 }
    const rows: GradedResult[] = [
      {
        id: 't1',
        input: 'a',
        output: 'x',
        trajectory: [],
        toolErrors: false,
        timing,
        score: { pass: true, score: 1.0 },
      },
      {
        id: 't2',
        input: 'b',
        output: 'y',
        trajectory: [],
        toolErrors: false,
        timing,
        score: { pass: false, score: 0.5 },
      },
    ]

    let passCount = 0
    for (const row of rows) {
      if (row.score.pass) passCount += 1
    }

    expect(passCount / rows.length).toBe(0.5)
  })

  // Local stand-in for CSV escaping: double the quotes, flatten newlines to
  // a literal backslash-n, and wrap the field in quotes.
  test('csv format escapes special characters', () => {
    const escapeCsv = (str: string) => {
      const quotesDoubled = str.replace(/"/g, '""')
      const singleLine = quotesDoubled.replace(/\n/g, '\\n')
      return `"${singleLine}"`
    }

    expect(escapeCsv('hello')).toBe('"hello"')
    expect(escapeCsv('say "hello"')).toBe('"say ""hello"""')
    expect(escapeCsv('line1\nline2')).toBe('"line1\\nline2"')
  })
})
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Pipeline commands re-export.
3
+ *
4
+ * @remarks
5
+ * Public API for pipeline commands. Import from here for external use.
6
+ *
7
+ * @packageDocumentation
8
+ */
9
+
10
+ export {
11
+ // Types
12
+ type CompareConfig,
13
+ type ComparisonGrader,
14
+ type ComparisonGraderInput,
15
+ type ComparisonGraderResult,
16
+ type ComparisonRanking,
17
+ type ComparisonResult,
18
+ // Commands
19
+ compare,
20
+ type ExtractConfig,
21
+ type ExtractedResult,
22
+ extract,
23
+ type FormatConfig,
24
+ type FormatStyle,
25
+ format,
26
+ type GradeConfig,
27
+ type GradedResult,
28
+ grade,
29
+ type LabeledRun,
30
+ type RawOutput,
31
+ type RunConfig,
32
+ type RunMode,
33
+ run,
34
+ } from './pipeline/pipeline.ts'
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Constants for harness and JSON-RPC protocol operations.
3
+ *
4
+ * @remarks
5
+ * Contains all constant values used across the implementation:
6
+ * - JSON-RPC method names and protocol version
7
+ * - JSON-RPC error codes
8
+ * - Harness defaults (timeouts, preview limits)
9
+ *
10
+ * @packageDocumentation
11
+ */
12
+
13
+ // ============================================================================
14
+ // JSON-RPC Protocol Methods
15
+ // ============================================================================
16
+
17
/**
 * Method-name strings used by the headless adapter's JSON-RPC protocol.
 *
 * @remarks
 * Keys are grouped as lifecycle, `session/*`, and protocol-level (`$/`)
 * methods. Declared `as const` so each value keeps its literal type.
 */
export const PROTOCOL_METHODS = {
  // Lifecycle methods
  INITIALIZE: 'initialize',
  SHUTDOWN: 'shutdown',

  // Session-scoped methods (all under the `session/` prefix)
  CREATE_SESSION: 'session/new',
  LOAD_SESSION: 'session/load',
  PROMPT: 'session/prompt',
  CANCEL: 'session/cancel',
  UPDATE: 'session/update',
  REQUEST_PERMISSION: 'session/request_permission',
  SET_MODEL: 'session/set_model',

  // Protocol-level methods (`$/` prefix)
  CANCEL_REQUEST: '$/cancel_request',
} as const
35
+
36
+ // ============================================================================
37
+ // Protocol Version
38
+ // ============================================================================
39
+
40
/** Protocol version in use; typed as the literal `1` */
export const PROTOCOL_VERSION: 1 = 1
42
+
43
+ // ============================================================================
44
+ // JSON-RPC Error Codes
45
+ // ============================================================================
46
+
47
/**
 * Numeric error codes used in JSON-RPC error responses.
 *
 * @remarks
 * The first five entries are the pre-defined JSON-RPC 2.0 codes.
 * `REQUEST_CANCELLED` (-32800) is not part of that pre-defined set.
 */
export const JSON_RPC_ERRORS = {
  // Pre-defined JSON-RPC 2.0 codes
  PARSE_ERROR: -32700,
  INVALID_REQUEST: -32600,
  METHOD_NOT_FOUND: -32601,
  INVALID_PARAMS: -32602,
  INTERNAL_ERROR: -32603,

  // Cancellation
  REQUEST_CANCELLED: -32800,
} as const
56
+
57
+ // ============================================================================
58
+ // Client Defaults
59
+ // ============================================================================
60
+
61
/** Client name sent by default during the protocol handshake */
export const DEFAULT_CLIENT_NAME = 'plaited-eval-harness'

/** Default protocol-operation timeout, in milliseconds (30 seconds) */
export const DEFAULT_PROTOCOL_TIMEOUT = 30_000

/** Default interval between polls for streaming updates, in milliseconds */
export const DEFAULT_POLLING_INTERVAL = 50

// ============================================================================
// Harness Preview Configuration
// ============================================================================

/** Lines kept from the start of content when building a preview */
export const HEAD_LINES = 8

/** Lines kept from the end of content when building a preview */
export const TAIL_LINES = 4

/** Content length above which the head/tail preview is applied */
export const MAX_CONTENT_LENGTH = 500

// ============================================================================
// Harness Defaults
// ============================================================================

/** Default prompt-evaluation timeout, in milliseconds (60 seconds) */
export const DEFAULT_HARNESS_TIMEOUT = 60_000

/** Default trial count for pass@k analysis */
export const DEFAULT_TRIAL_COUNT = 5

/** Default calibration sample size */
export const DEFAULT_CALIBRATION_SAMPLE_SIZE = 10
@@ -0,0 +1,174 @@
1
+ /**
2
+ * Polyglot grader loader module.
3
+ *
4
+ * @remarks
5
+ * Supports loading graders from:
6
+ * - TypeScript/JavaScript modules (import as ES module)
7
+ * - Executable scripts (Python, Ruby, shell, etc. via subprocess)
8
+ *
9
+ * Executable graders use stdin/stdout JSON protocol:
10
+ * - Input: `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
11
+ * - Output: `{"pass": true, "score": 1.0, "reasoning": "..."}`
12
+ *
13
+ * @packageDocumentation
14
+ */
15
+
16
import { isAbsolute, resolve } from 'node:path'
import type { Grader, TrajectoryStep } from './schemas.ts'
import { GraderResultSchema } from './schemas.ts'
18
+
19
+ // ============================================================================
20
+ // Constants
21
+ // ============================================================================
22
+
23
/** Path suffixes that mark a grader as an importable ES module */
const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs']

// ============================================================================
// Helpers
// ============================================================================

/**
 * Report whether `path` ends with one of the `JS_EXTENSIONS` suffixes.
 *
 * @remarks
 * The comparison is a plain, case-sensitive suffix match.
 */
const isJsModule = (path: string): boolean => {
  for (const suffix of JS_EXTENSIONS) {
    if (path.endsWith(suffix)) return true
  }
  return false
}
32
+
33
/**
 * Resolve a grader path against the current working directory.
 *
 * @remarks
 * Absolute paths are returned unchanged. Relative paths are resolved with
 * `node:path`, so absoluteness is judged by the platform's own rules
 * (a leading `/` on POSIX, drive-letter and UNC forms on Windows) and
 * `./` and `..` segments are normalized instead of being concatenated as-is.
 *
 * @param path - Absolute path, or path relative to `process.cwd()`
 * @returns Absolute path
 */
const resolvePath = (path: string): string => {
  if (isAbsolute(path)) return path
  return resolve(process.cwd(), path)
}
38
+
39
+ // ============================================================================
40
+ // Executable Grader
41
+ // ============================================================================
42
+
43
/**
 * JSON payload written to an executable grader's stdin.
 *
 * @remarks
 * Optional fields that are `undefined` are dropped by `JSON.stringify`.
 */
interface ExecGraderInput {
  /** Prompt string, or an array of prompt strings */
  input: string | string[]
  /** Output text to be graded */
  output: string
  /** Optional hint passed through from the grader params */
  hint?: string
  /** Optional trajectory steps passed through from the grader params */
  trajectory?: TrajectoryStep[]
}
50
+
51
/**
 * Build a grader backed by an external executable.
 *
 * @remarks
 * Each call spawns `execPath` with no arguments, writes the grading request
 * as JSON to its stdin, and reads stdout and stderr to completion. The
 * returned promise rejects when the process exits non-zero, prints nothing,
 * prints something that is not JSON, or prints JSON that fails
 * `GraderResultSchema`.
 *
 * @param execPath - Absolute path to the executable script
 * @returns Grader function
 */
const createExecGrader =
  (execPath: string): Grader =>
  async ({ input, output, hint, trajectory }) => {
    const request: ExecGraderInput = { input, output, hint, trajectory }

    const child = Bun.spawn([execPath], {
      stdin: new TextEncoder().encode(JSON.stringify(request)),
      stdout: 'pipe',
      stderr: 'pipe',
    })

    // Read both pipes and wait for exit concurrently.
    const [out, err, code] = await Promise.all([
      new Response(child.stdout).text(),
      new Response(child.stderr).text(),
      child.exited,
    ])

    if (code !== 0) {
      const detail = err.trim() || 'No error output'
      throw new Error(`Grader exited with code ${code}: ${detail}`)
    }

    const body = out.trim()
    if (body === '') {
      throw new Error('Grader produced no output')
    }

    let decoded: unknown
    try {
      decoded = JSON.parse(body)
    } catch {
      // Only the first 100 characters are echoed to keep the message short.
      throw new Error(`Grader output is not valid JSON: ${body.slice(0, 100)}`)
    }

    const validation = GraderResultSchema.safeParse(decoded)
    if (validation.success) return validation.data

    throw new Error(`Invalid grader result: ${validation.error.message}`)
  }
108
+
109
+ // ============================================================================
110
+ // Module Grader
111
+ // ============================================================================
112
+
113
/**
 * Import a JavaScript/TypeScript module and return its `grade` export.
 *
 * @remarks
 * Only checks that `grade` is a function; its signature is not validated
 * at runtime.
 *
 * @param modulePath - Absolute path to the module
 * @returns Grader function
 * @throws Error if the module has no function export named `grade`
 */
const loadModuleGrader = async (modulePath: string): Promise<Grader> => {
  const { grade } = await import(modulePath)

  if (typeof grade === 'function') {
    return grade as Grader
  }

  throw new Error(`Grader module must export a 'grade' function`)
}
131
+
132
+ // ============================================================================
133
+ // Public API
134
+ // ============================================================================
135
+
136
/**
 * Load a grader from a file path.
 *
 * @remarks
 * The path is resolved against the working directory and must exist.
 * Dispatch is by file extension:
 * - `.ts`, `.js`, `.mjs`, `.cjs` → imported as an ES module
 * - anything else → run as a subprocess on every grading call
 *
 * @param graderPath - Path to the grader (relative or absolute)
 * @returns Grader function
 * @throws Error if grader not found or invalid
 *
 * @example
 * ```typescript
 * // TypeScript grader
 * const tsGrader = await loadGrader('./grader.ts')
 *
 * // Python grader
 * const pyGrader = await loadGrader('./grader.py')
 *
 * // Any executable
 * const execGrader = await loadGrader('./my-grader')
 * ```
 */
export const loadGrader = async (graderPath: string): Promise<Grader> => {
  const target = resolvePath(graderPath)

  // Fail early with the resolved path when the file is missing.
  const found = await Bun.file(target).exists()
  if (!found) {
    throw new Error(`Grader not found: ${target}`)
  }

  return isJsModule(target) ? loadModuleGrader(target) : createExecGrader(target)
}