@plaited/acp-harness 0.2.5 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/LICENSE +1 -1
  2. package/README.md +120 -16
  3. package/bin/cli.ts +105 -636
  4. package/bin/tests/cli.spec.ts +218 -51
  5. package/package.json +20 -4
  6. package/src/acp-client.ts +5 -4
  7. package/src/acp-transport.ts +14 -7
  8. package/src/adapter-check.ts +542 -0
  9. package/src/adapter-scaffold.ts +934 -0
  10. package/src/balance.ts +232 -0
  11. package/src/calibrate.ts +300 -0
  12. package/src/capture.ts +457 -0
  13. package/src/constants.ts +94 -0
  14. package/src/grader-loader.ts +174 -0
  15. package/src/harness.ts +35 -0
  16. package/src/schemas-cli.ts +239 -0
  17. package/src/schemas.ts +567 -0
  18. package/src/summarize.ts +245 -0
  19. package/src/tests/adapter-check.spec.ts +70 -0
  20. package/src/tests/adapter-scaffold.spec.ts +112 -0
  21. package/src/tests/fixtures/grader-bad-module.ts +5 -0
  22. package/src/tests/fixtures/grader-exec-fail.py +9 -0
  23. package/src/tests/fixtures/grader-exec-invalid.py +6 -0
  24. package/src/tests/fixtures/grader-exec.py +29 -0
  25. package/src/tests/fixtures/grader-module.ts +14 -0
  26. package/src/tests/grader-loader.spec.ts +153 -0
  27. package/src/trials.ts +395 -0
  28. package/src/validate-refs.ts +188 -0
  29. package/.claude/rules/accuracy.md +0 -43
  30. package/.claude/rules/bun-apis.md +0 -80
  31. package/.claude/rules/code-review.md +0 -254
  32. package/.claude/rules/git-workflow.md +0 -37
  33. package/.claude/rules/github.md +0 -154
  34. package/.claude/rules/testing.md +0 -172
  35. package/.claude/skills/acp-harness/SKILL.md +0 -310
  36. package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
  37. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
  38. package/.claude/skills/acp-harness/references/downstream.md +0 -288
  39. package/.claude/skills/acp-harness/references/output-formats.md +0 -221
  40. package/.claude-plugin/marketplace.json +0 -15
  41. package/.claude-plugin/plugin.json +0 -16
  42. package/.github/CODEOWNERS +0 -6
  43. package/.github/workflows/ci.yml +0 -63
  44. package/.github/workflows/publish.yml +0 -146
  45. package/.mcp.json +0 -20
  46. package/CLAUDE.md +0 -92
  47. package/Dockerfile.test +0 -23
  48. package/biome.json +0 -96
  49. package/bun.lock +0 -513
  50. package/docker-compose.test.yml +0 -21
  51. package/scripts/bun-test-wrapper.sh +0 -46
  52. package/src/acp.constants.ts +0 -56
  53. package/src/acp.schemas.ts +0 -161
  54. package/src/acp.types.ts +0 -28
  55. package/src/tests/fixtures/.claude/settings.local.json +0 -8
  56. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
  57. package/tsconfig.json +0 -32
package/src/summarize.ts
@@ -0,0 +1,245 @@
+ /**
+  * Summarize command - derive compact views from full trajectory results.
+  *
+  * @remarks
+  * Transforms full trajectory JSONL into:
+  * - Summary JSONL: Compact format for jq analysis
+  * - Markdown: Human-readable format for LLM-as-judge workflows
+  *
+  * @packageDocumentation
+  */
+
+ import { parseArgs } from 'node:util'
+ import { extractContent, extractFilePath, headTailPreview } from './capture.ts'
+ import { HEAD_LINES, MAX_CONTENT_LENGTH, TAIL_LINES } from './constants.ts'
+ import type { CaptureResult, SummaryResult } from './schemas.ts'
+ import { CaptureResultSchema } from './schemas.ts'
+
+ // ============================================================================
+ // Types
+ // ============================================================================
+
+ /** Configuration for summarize command */
+ export type SummarizeConfig = {
+   /** Path to results.jsonl file */
+   resultsPath: string
+   /** Output file path */
+   outputPath?: string
+   /** Output as markdown instead of JSONL */
+   markdown?: boolean
+ }
+
+ // ============================================================================
+ // Helpers
+ // ============================================================================
+
+ /** Resolve path relative to process.cwd() */
+ const resolvePath = (path: string): string => {
+   if (path.startsWith('/')) return path
+   return `${process.cwd()}/${path}`
+ }
+
+ /** Load capture results from JSONL file */
+ const loadResults = async (path: string): Promise<CaptureResult[]> => {
+   const content = await Bun.file(path).text()
+   return content
+     .trim()
+     .split('\n')
+     .filter(Boolean)
+     .map((line, index) => {
+       try {
+         return CaptureResultSchema.parse(JSON.parse(line))
+       } catch (error) {
+         throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+       }
+     })
+ }
+
+ /** Format result as summary JSONL */
+ const formatSummary = (result: CaptureResult): SummaryResult => {
+   return {
+     id: result.id,
+     input: result.input,
+     output: result.output,
+     toolCalls: result.trajectory.filter((s) => s.type === 'tool_call').map((s) => (s as { name: string }).name),
+     duration: result.timing.end - result.timing.start,
+   }
+ }
+
+ /** Format result as markdown with step IDs */
+ const formatMarkdown = (result: CaptureResult): string => {
+   const lines: string[] = [
+     `## Evaluation Record: ${result.id}`,
+     '',
+     `**Input:** ${result.input}`,
+     '',
+     '**Trajectory:**',
+   ]
+
+   let stepNum = 1
+   for (const step of result.trajectory) {
+     const stepId = `${result.id}-step-${stepNum}`
+
+     if (step.type === 'thought') {
+       const preview = step.content.slice(0, 100)
+       const truncated = step.content.length > 100 ? '...' : ''
+       lines.push(`${stepNum}. [THOUGHT] ${preview}${truncated} [→${stepId}]`)
+       stepNum++
+     } else if (step.type === 'tool_call') {
+       const duration = step.duration ? ` (${step.duration}ms)` : ''
+       const filePath = extractFilePath(step.input)
+       const content = extractContent(step.input)
+
+       lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)
+
+       // Add file path if present
+       if (filePath) {
+         const charCount = content?.length ?? 0
+         lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
+       }
+
+       // Add head/tail preview for content-producing tools
+       if (content && content.length > 0) {
+         const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content, HEAD_LINES, TAIL_LINES) : content
+         // Detect file extension for syntax highlighting
+         const ext = filePath?.split('.').pop() ?? 'typescript'
+         lines.push(` \`\`\`${ext}`)
+         lines.push(` ${preview.split('\n').join('\n ')}`)
+         lines.push(' ```')
+       }
+       stepNum++
+     } else if (step.type === 'plan') {
+       const entries = step.entries as Array<{ content: string; status: string }>
+       const planSummary = entries.map((e) => `${e.content}: ${e.status}`).join(', ')
+       const truncated = planSummary.length > 80 ? '...' : ''
+       lines.push(`${stepNum}. [PLAN] ${planSummary.slice(0, 80)}${truncated} [→${stepId}]`)
+       stepNum++
+     } else if (step.type === 'message') {
+       const preview = step.content.slice(0, 100)
+       const truncated = step.content.length > 100 ? '...' : ''
+       lines.push(`${stepNum}. [MESSAGE] ${preview}${truncated} [→${stepId}]`)
+       stepNum++
+     }
+   }
+
+   lines.push('')
+   const outputPreview = result.output.slice(0, 200)
+   const outputTruncated = result.output.length > 200 ? '...' : ''
+   lines.push(`**Output:** ${outputPreview}${outputTruncated}`)
+   lines.push('')
+
+   const metadataStr = Object.entries(result.metadata)
+     .map(([k, v]) => `${k}=${v}`)
+     .join(', ')
+   lines.push(`**Metadata:** ${metadataStr}`)
+   lines.push(`**Tool Errors:** ${result.toolErrors}`)
+   lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)
+
+   if (result.score) {
+     lines.push(`**Score:** ${result.score.pass ? 'PASS' : 'FAIL'} (${result.score.score})`)
+     if (result.score.reasoning) {
+       lines.push(`**Reasoning:** ${result.score.reasoning}`)
+     }
+   }
+
+   lines.push('')
+   lines.push('---')
+   lines.push('')
+
+   return lines.join('\n')
+ }
+
+ // ============================================================================
+ // Summarize Implementation
+ // ============================================================================
+
+ /**
+  * Execute summarize with configuration object.
+  *
+  * @param config - Summarize configuration
+  * @returns Formatted output string
+  */
+ export const runSummarize = async (config: SummarizeConfig): Promise<string> => {
+   const { resultsPath, outputPath, markdown = false } = config
+
+   // Load results
+   const results = await loadResults(resultsPath)
+
+   // Format output
+   let output: string
+   if (markdown) {
+     output = results.map(formatMarkdown).join('\n')
+   } else {
+     output = results.map((r) => JSON.stringify(formatSummary(r))).join('\n')
+   }
+
+   // Write output
+   if (outputPath) {
+     await Bun.write(resolvePath(outputPath), output)
+   } else {
+     // biome-ignore lint/suspicious/noConsole: CLI stdout output
+     console.log(output)
+   }
+
+   return output
+ }
+
+ // ============================================================================
+ // CLI Entry Point
+ // ============================================================================
+
+ /**
+  * Summarize command CLI handler.
+  *
+  * @param args - Command line arguments (after 'summarize')
+  */
+ export const summarize = async (args: string[]): Promise<void> => {
+   const { values, positionals } = parseArgs({
+     args,
+     options: {
+       output: { type: 'string', short: 'o' },
+       markdown: { type: 'boolean', short: 'm', default: false },
+       help: { type: 'boolean', short: 'h' },
+     },
+     allowPositionals: true,
+   })
+
+   if (values.help) {
+     // biome-ignore lint/suspicious/noConsole: CLI help output
+     console.log(`
+ Usage: acp-harness summarize <results.jsonl> [options]
+
+ Arguments:
+   results.jsonl    Input file with capture results
+
+ Options:
+   -o, --output     Output file (default: stdout)
+   -m, --markdown   Output as markdown instead of JSONL
+   -h, --help       Show this help message
+
+ Output Formats:
+   JSONL (default): Compact summary with id, input, output, toolCalls, duration
+   Markdown (-m):   Human-readable format with step IDs for LLM-as-judge
+
+ Examples:
+   # Summary JSONL for jq analysis
+   acp-harness summarize results.jsonl -o summary.jsonl
+
+   # Markdown for LLM evaluation
+   acp-harness summarize results.jsonl --markdown -o results.md
+ `)
+     return
+   }
+
+   const resultsPath = positionals[0]
+   if (!resultsPath) {
+     console.error('Error: results.jsonl path is required')
+     process.exit(1)
+   }
+
+   await runSummarize({
+     resultsPath,
+     outputPath: values.output,
+     markdown: values.markdown ?? false,
+   })
+ }
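
The new module exposes runSummarize for programmatic use alongside the CLI shown in its help text. A minimal sketch of driving it directly, assuming the package exposes a 'summarize' entry point (the import specifier below is an assumption; the record shape follows formatSummary above):

```ts
// Programmatic use of the summarize step; import path is an assumption.
import { runSummarize } from '@plaited/acp-harness/summarize'

// Equivalent to: acp-harness summarize results.jsonl -o summary.jsonl
const summary = await runSummarize({
  resultsPath: 'results.jsonl',
  outputPath: 'summary.jsonl',
  markdown: false,
})

// Each line of the returned string is a compact record, e.g.
// {"id":"case-1","input":"...","output":"...","toolCalls":["read","write"],"duration":1234}
for (const line of summary.split('\n')) {
  const record = JSON.parse(line) as { id: string; toolCalls: string[]; duration: number }
  if (record.duration > 30_000) console.warn(`slow case: ${record.id} (${record.toolCalls.length} tool calls)`)
}
```
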
package/src/tests/adapter-check.spec.ts
@@ -0,0 +1,70 @@
+ /**
+  * Tests for adapter compliance checking functionality.
+  */
+
+ import { describe, expect, test } from 'bun:test'
+ import { type CheckConfig, runCheck } from '../adapter-check.ts'
+
+ describe('runCheck', () => {
+   test('fails spawn check for non-existent command', async () => {
+     const config: CheckConfig = {
+       command: ['nonexistent-command-xyz'],
+       timeout: 1000,
+       verbose: false,
+     }
+
+     const result = await runCheck(config)
+
+     expect(result.passed).toBe(false)
+     expect(result.checks.length).toBeGreaterThanOrEqual(1)
+     expect(result.checks[0]?.name).toBe('spawn')
+     expect(result.checks[0]?.passed).toBe(false)
+   })
+
+   test('fails spawn check for command that exits immediately', async () => {
+     const config: CheckConfig = {
+       command: ['false'], // Unix command that exits with code 1
+       timeout: 1000,
+       verbose: false,
+     }
+
+     const result = await runCheck(config)
+
+     expect(result.passed).toBe(false)
+     expect(result.summary.failed).toBeGreaterThanOrEqual(1)
+   })
+
+   test('returns structured result with summary', async () => {
+     const config: CheckConfig = {
+       command: ['echo', 'test'],
+       timeout: 1000,
+       verbose: false,
+     }
+
+     const result = await runCheck(config)
+
+     expect(result).toHaveProperty('passed')
+     expect(result).toHaveProperty('checks')
+     expect(result).toHaveProperty('summary')
+     expect(result.summary).toHaveProperty('total')
+     expect(result.summary).toHaveProperty('passed')
+     expect(result.summary).toHaveProperty('failed')
+     expect(typeof result.passed).toBe('boolean')
+     expect(Array.isArray(result.checks)).toBe(true)
+   })
+
+   test('includes verbose details when enabled', async () => {
+     const config: CheckConfig = {
+       command: ['echo', 'test'],
+       timeout: 1000,
+       verbose: true,
+     }
+
+     const result = await runCheck(config)
+
+     // At least the spawn check should have details in verbose mode
+     const spawnCheck = result.checks.find((c) => c.name === 'spawn')
+     expect(spawnCheck).toBeDefined()
+     // Note: details may or may not be present depending on check outcome
+   })
+ })
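
The spec above pins down the shape of runCheck's input and result. A minimal sketch of calling it directly, assuming an '@plaited/acp-harness/adapter-check' entry point (import path and adapter command are assumptions; only the 'spawn' check name is confirmed by the spec):

```ts
// Direct use of the compliance check, mirroring the CheckConfig and result
// shape asserted in the tests above. Import path is an assumption.
import { runCheck, type CheckConfig } from '@plaited/acp-harness/adapter-check'

const config: CheckConfig = {
  command: ['bun', 'run', './my-adapter.ts'], // hypothetical ACP adapter entry point
  timeout: 10_000,
  verbose: true,
}

const result = await runCheck(config)

console.log(`${result.summary.passed}/${result.summary.total} checks passed`)
for (const check of result.checks) {
  console.log(`  ${check.passed ? 'PASS' : 'FAIL'} ${check.name}`)
}
if (!result.passed) process.exit(1)
```
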
package/src/tests/adapter-scaffold.spec.ts
@@ -0,0 +1,112 @@
+ /**
+  * Tests for adapter scaffolding functionality.
+  */
+
+ import { afterEach, describe, expect, test } from 'bun:test'
+ import { rm } from 'node:fs/promises'
+ import { join } from 'node:path'
+ import { runScaffold, type ScaffoldConfig } from '../adapter-scaffold.ts'
+
+ const testDir = join(import.meta.dir, 'fixtures', 'scaffold-output')
+
+ describe('runScaffold', () => {
+   afterEach(async () => {
+     // Clean up test output
+     await rm(testDir, { recursive: true, force: true })
+   })
+
+   test('generates TypeScript adapter structure', async () => {
+     const config: ScaffoldConfig = {
+       name: 'test-agent',
+       outputDir: testDir,
+       lang: 'ts',
+       minimal: false,
+     }
+
+     const result = await runScaffold(config)
+
+     expect(result.outputDir).toBe(testDir)
+     expect(result.lang).toBe('ts')
+     expect(result.files).toContain('package.json')
+     expect(result.files).toContain('tsconfig.json')
+     expect(result.files).toContain('src/index.ts')
+     expect(result.files).toContain('src/types.ts')
+     expect(result.files).toContain('src/session-manager.ts')
+     expect(result.files).toContain('src/handlers/initialize.ts')
+     expect(result.files).toContain('src/handlers/session-new.ts')
+     expect(result.files).toContain('src/handlers/session-prompt.ts')
+     expect(result.files).toContain('src/handlers/session-cancel.ts')
+     expect(result.files).toContain('README.md')
+
+     // Verify files actually exist
+     const packageJson = await Bun.file(join(testDir, 'package.json')).text()
+     expect(packageJson).toContain('"test-agent-acp"')
+
+     const indexTs = await Bun.file(join(testDir, 'src', 'index.ts')).text()
+     expect(indexTs).toContain('#!/usr/bin/env bun')
+     expect(indexTs).toContain('handleInitialize')
+   })
+
+   test('generates minimal TypeScript structure without README', async () => {
+     const config: ScaffoldConfig = {
+       name: 'minimal-agent',
+       outputDir: testDir,
+       lang: 'ts',
+       minimal: true,
+     }
+
+     const result = await runScaffold(config)
+
+     expect(result.files).not.toContain('README.md')
+     expect(result.files).toContain('package.json')
+     expect(result.files).toContain('src/index.ts')
+   })
+
+   test('generates Python adapter structure', async () => {
+     const config: ScaffoldConfig = {
+       name: 'python-agent',
+       outputDir: testDir,
+       lang: 'python',
+       minimal: false,
+     }
+
+     const result = await runScaffold(config)
+
+     expect(result.lang).toBe('python')
+     expect(result.files).toContain('adapter.py')
+     expect(result.files).toContain('README.md')
+
+     const adapterPy = await Bun.file(join(testDir, 'adapter.py')).text()
+     expect(adapterPy).toContain('#!/usr/bin/env python3')
+     expect(adapterPy).toContain('python-agent')
+     expect(adapterPy).toContain('def handle_initialize')
+   })
+
+   test('generates minimal Python structure without README', async () => {
+     const config: ScaffoldConfig = {
+       name: 'minimal-python',
+       outputDir: testDir,
+       lang: 'python',
+       minimal: true,
+     }
+
+     const result = await runScaffold(config)
+
+     expect(result.files).toContain('adapter.py')
+     expect(result.files).not.toContain('README.md')
+   })
+
+   test('package.json contains correct name', async () => {
+     const config: ScaffoldConfig = {
+       name: 'my-special-agent',
+       outputDir: testDir,
+       lang: 'ts',
+       minimal: true,
+     }
+
+     await runScaffold(config)
+
+     const packageJson = JSON.parse(await Bun.file(join(testDir, 'package.json')).text())
+     expect(packageJson.name).toBe('my-special-agent-acp')
+   })
+ })
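
The scaffold spec shows the ScaffoldConfig fields and the returned file list. A minimal sketch of driving runScaffold programmatically, assuming an '@plaited/acp-harness/adapter-scaffold' entry point (the import path is an assumption):

```ts
// Generating an adapter skeleton, mirroring the config fields and result
// shape asserted in the spec above. Import path is an assumption.
import { runScaffold, type ScaffoldConfig } from '@plaited/acp-harness/adapter-scaffold'

const config: ScaffoldConfig = {
  name: 'my-agent',            // scaffolded package is named '<name>-acp'
  outputDir: './my-agent-acp',
  lang: 'ts',                  // 'ts' or 'python'
  minimal: false,              // false also emits README.md
}

const { outputDir, files } = await runScaffold(config)
console.log(`wrote ${files.length} files to ${outputDir}`)
```
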
package/src/tests/fixtures/grader-bad-module.ts
@@ -0,0 +1,5 @@
+ /**
+  * Test fixture: Invalid TypeScript grader (no 'grade' export).
+  */
+
+ export const evaluate = () => ({ pass: true, score: 1.0 })
package/src/tests/fixtures/grader-exec-fail.py
@@ -0,0 +1,9 @@
+ #!/usr/bin/env python3
+ """
+ Test fixture: Python grader that exits with non-zero code.
+ """
+
+ import sys
+
+ sys.stderr.write("Intentional failure")
+ sys.exit(1)
package/src/tests/fixtures/grader-exec-invalid.py
@@ -0,0 +1,6 @@
+ #!/usr/bin/env python3
+ """
+ Test fixture: Python grader that outputs invalid JSON.
+ """
+
+ print("not valid json")
package/src/tests/fixtures/grader-exec.py
@@ -0,0 +1,29 @@
+ #!/usr/bin/env python3
+ """
+ Test fixture: Python grader script using stdin/stdout JSON protocol.
+ """
+
+ import json
+ import sys
+
+ def main():
+     data = json.load(sys.stdin)
+
+     output = data.get("output", "").lower()
+     expected = (data.get("expected") or "").lower()
+
+     if expected:
+         pass_result = expected in output
+     else:
+         pass_result = True
+
+     result = {
+         "pass": pass_result,
+         "score": 1.0 if pass_result else 0.0,
+         "reasoning": "Contains expected" if pass_result else "Missing expected"
+     }
+
+     print(json.dumps(result))
+
+ if __name__ == "__main__":
+     main()
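
The fixture above documents the executable-grader contract: one JSON payload on stdin, one JSON result on stdout. A minimal sketch of that exchange using Bun.spawn; the field names follow the fixture and grader-loader.spec.ts, while the spawn wiring itself is illustrative rather than the harness's actual loader code:

```ts
// Spawn the fixture, write the grading payload to stdin, read the JSON verdict.
const proc = Bun.spawn(['python3', 'grader-exec.py'], { stdin: 'pipe', stdout: 'pipe' })

proc.stdin.write(
  JSON.stringify({
    input: 'What is 2+2?',
    output: 'The answer is 4',
    expected: '4', // optional; a trajectory array may also be included
  }),
)
proc.stdin.end()

const result = JSON.parse(await new Response(proc.stdout).text()) as {
  pass: boolean
  score: number
  reasoning: string
}
console.log(result) // { pass: true, score: 1, reasoning: "Contains expected" }
```
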
package/src/tests/fixtures/grader-module.ts
@@ -0,0 +1,14 @@
+ /**
+  * Test fixture: TypeScript grader module.
+  */
+
+ import type { Grader } from '../../schemas.ts'
+
+ export const grade: Grader = async ({ input: _input, output, expected }) => {
+   const pass = expected ? output.toLowerCase().includes(expected.toLowerCase()) : true
+   return {
+     pass,
+     score: pass ? 1.0 : 0.0,
+     reasoning: pass ? 'Contains expected text' : 'Missing expected text',
+   }
+ }
package/src/tests/grader-loader.spec.ts
@@ -0,0 +1,153 @@
+ import { describe, expect, test } from 'bun:test'
+ import { join } from 'node:path'
+ import { loadGrader } from '../grader-loader.ts'
+
+ const fixturesDir = join(import.meta.dir, 'fixtures')
+
+ // ============================================================================
+ // Module Graders (TypeScript/JavaScript)
+ // ============================================================================
+
+ describe('loadGrader - module graders', () => {
+   test('loads TypeScript grader module', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-module.ts'))
+
+     const result = await grader({
+       input: 'What is 2+2?',
+       output: 'The answer is 4',
+       expected: '4',
+     })
+
+     expect(result.pass).toBe(true)
+     expect(result.score).toBe(1.0)
+     expect(result.reasoning).toBe('Contains expected text')
+   })
+
+   test('fails when module does not export grade function', async () => {
+     await expect(loadGrader(join(fixturesDir, 'grader-bad-module.ts'))).rejects.toThrow(
+       "Grader module must export a 'grade' function",
+     )
+   })
+
+   test('fails when module does not exist', async () => {
+     await expect(loadGrader(join(fixturesDir, 'nonexistent.ts'))).rejects.toThrow('Grader not found')
+   })
+ })
+
+ // ============================================================================
+ // Executable Graders (Python, etc.)
+ // ============================================================================
+
+ describe('loadGrader - executable graders', () => {
+   test('loads and executes Python grader', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-exec.py'))
+
+     const result = await grader({
+       input: 'What is 2+2?',
+       output: 'The answer is 4',
+       expected: '4',
+     })
+
+     expect(result.pass).toBe(true)
+     expect(result.score).toBe(1.0)
+     expect(result.reasoning).toBe('Contains expected')
+   })
+
+   test('Python grader returns pass=false when expected not in output', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-exec.py'))
+
+     const result = await grader({
+       input: 'What is 2+2?',
+       output: 'I do not know',
+       expected: '4',
+     })
+
+     expect(result.pass).toBe(false)
+     expect(result.score).toBe(0.0)
+   })
+
+   test('throws when executable exits with non-zero code', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-exec-fail.py'))
+
+     await expect(
+       grader({
+         input: 'test',
+         output: 'test',
+       }),
+     ).rejects.toThrow('Grader exited with code 1')
+   })
+
+   test('throws when executable outputs invalid JSON', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-exec-invalid.py'))
+
+     await expect(
+       grader({
+         input: 'test',
+         output: 'test',
+       }),
+     ).rejects.toThrow('Grader output is not valid JSON')
+   })
+
+   test('fails when executable does not exist', async () => {
+     await expect(loadGrader(join(fixturesDir, 'nonexistent.py'))).rejects.toThrow('Grader not found')
+   })
+ })
+
+ // ============================================================================
+ // Extension Detection
+ // ============================================================================
+
+ describe('loadGrader - extension detection', () => {
+   test('detects .ts as module', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-module.ts'))
+     // If this doesn't throw, it was loaded as a module (not executed)
+     expect(grader).toBeInstanceOf(Function)
+   })
+
+   test('detects .py as executable', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-exec.py'))
+     expect(grader).toBeInstanceOf(Function)
+   })
+ })
+
+ // ============================================================================
+ // Trajectory Support
+ // ============================================================================
+
+ describe('loadGrader - trajectory support', () => {
+   test('passes trajectory to module grader', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-module.ts'))
+
+     const trajectory = [
+       { type: 'message' as const, content: 'Hello', timestamp: 0 },
+       { type: 'tool_call' as const, name: 'read', status: 'completed', timestamp: 100 },
+     ]
+
+     const result = await grader({
+       input: 'test',
+       output: 'The answer is 4',
+       expected: '4',
+       trajectory,
+     })
+
+     expect(result.pass).toBe(true)
+   })
+
+   test('passes trajectory to executable grader', async () => {
+     const grader = await loadGrader(join(fixturesDir, 'grader-exec.py'))
+
+     const trajectory = [
+       { type: 'message' as const, content: 'Hello', timestamp: 0 },
+       { type: 'tool_call' as const, name: 'read', status: 'completed', timestamp: 100 },
+     ]
+
+     const result = await grader({
+       input: 'test',
+       output: 'The answer is 4',
+       expected: '4',
+       trajectory,
+     })
+
+     expect(result.pass).toBe(true)
+   })
+ })