@plaited/acp-harness 0.2.5 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +120 -16
- package/bin/cli.ts +105 -636
- package/bin/tests/cli.spec.ts +218 -51
- package/package.json +20 -4
- package/src/acp-client.ts +5 -4
- package/src/acp-transport.ts +14 -7
- package/src/adapter-check.ts +542 -0
- package/src/adapter-scaffold.ts +934 -0
- package/src/balance.ts +232 -0
- package/src/calibrate.ts +300 -0
- package/src/capture.ts +457 -0
- package/src/constants.ts +94 -0
- package/src/grader-loader.ts +174 -0
- package/src/harness.ts +35 -0
- package/src/schemas-cli.ts +239 -0
- package/src/schemas.ts +567 -0
- package/src/summarize.ts +245 -0
- package/src/tests/adapter-check.spec.ts +70 -0
- package/src/tests/adapter-scaffold.spec.ts +112 -0
- package/src/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/tests/fixtures/grader-exec.py +29 -0
- package/src/tests/fixtures/grader-module.ts +14 -0
- package/src/tests/grader-loader.spec.ts +153 -0
- package/src/trials.ts +395 -0
- package/src/validate-refs.ts +188 -0
- package/.claude/rules/accuracy.md +0 -43
- package/.claude/rules/bun-apis.md +0 -80
- package/.claude/rules/code-review.md +0 -254
- package/.claude/rules/git-workflow.md +0 -37
- package/.claude/rules/github.md +0 -154
- package/.claude/rules/testing.md +0 -172
- package/.claude/skills/acp-harness/SKILL.md +0 -310
- package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
- package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
- package/.claude/skills/acp-harness/references/downstream.md +0 -288
- package/.claude/skills/acp-harness/references/output-formats.md +0 -221
- package/.claude-plugin/marketplace.json +0 -15
- package/.claude-plugin/plugin.json +0 -16
- package/.github/CODEOWNERS +0 -6
- package/.github/workflows/ci.yml +0 -63
- package/.github/workflows/publish.yml +0 -146
- package/.mcp.json +0 -20
- package/CLAUDE.md +0 -92
- package/Dockerfile.test +0 -23
- package/biome.json +0 -96
- package/bun.lock +0 -513
- package/docker-compose.test.yml +0 -21
- package/scripts/bun-test-wrapper.sh +0 -46
- package/src/acp.constants.ts +0 -56
- package/src/acp.schemas.ts +0 -161
- package/src/acp.types.ts +0 -28
- package/src/tests/fixtures/.claude/settings.local.json +0 -8
- package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
- package/tsconfig.json +0 -32
package/src/summarize.ts
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Summarize command - derive compact views from full trajectory results.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Transforms full trajectory JSONL into:
|
|
6
|
+
* - Summary JSONL: Compact format for jq analysis
|
|
7
|
+
* - Markdown: Human-readable format for LLM-as-judge workflows
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { isAbsolute, resolve } from 'node:path'
import { parseArgs } from 'node:util'
import { extractContent, extractFilePath, headTailPreview } from './capture.ts'
import { HEAD_LINES, MAX_CONTENT_LENGTH, TAIL_LINES } from './constants.ts'
import type { CaptureResult, SummaryResult } from './schemas.ts'
import { CaptureResultSchema } from './schemas.ts'
|
|
17
|
+
|
|
18
|
+
// ============================================================================
|
|
19
|
+
// Types
|
|
20
|
+
// ============================================================================
|
|
21
|
+
|
|
22
|
+
/** Options accepted by the summarize command */
export type SummarizeConfig = {
  /** Location of the capture results JSONL file to read */
  resultsPath: string
  /** Destination file for the formatted output; printed to stdout when omitted */
  outputPath?: string
  /** Emit markdown rather than summary JSONL (treated as false when omitted) */
  markdown?: boolean
}
|
|
31
|
+
|
|
32
|
+
// ============================================================================
|
|
33
|
+
// Helpers
|
|
34
|
+
// ============================================================================
|
|
35
|
+
|
|
36
|
+
/**
 * Resolve a user-supplied path against process.cwd().
 *
 * @remarks
 * Absolute paths are returned untouched. Absoluteness is decided by
 * `node:path` rather than a leading-slash check, so Windows drive-letter
 * and UNC paths are recognised instead of being appended to the working
 * directory.
 *
 * @param path - Absolute or cwd-relative path
 * @returns Absolute path
 */
const resolvePath = (path: string): string => {
  if (isAbsolute(path)) return path
  return resolve(process.cwd(), path)
}
|
|
41
|
+
|
|
42
|
+
/**
 * Load capture results from a JSONL file.
 *
 * @remarks
 * Each non-blank line is parsed as JSON and validated with
 * `CaptureResultSchema`. Blank and whitespace-only lines are skipped, and
 * the line number in an error message is the physical line in the file, so
 * it still points at the right place when the file contains blank lines.
 *
 * @param path - Path to the JSONL file
 * @returns Validated capture results in file order
 * @throws Error naming the 1-based line that failed to parse or validate
 */
const loadResults = async (path: string): Promise<CaptureResult[]> => {
  const content = await Bun.file(path).text()
  const results: CaptureResult[] = []

  for (const [index, rawLine] of content.split('\n').entries()) {
    // trim() also drops the trailing '\r' of CRLF files
    const line = rawLine.trim()
    if (!line) continue

    try {
      results.push(CaptureResultSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return results
}
|
|
57
|
+
|
|
58
|
+
/**
 * Reduce a capture result to its compact summary form.
 *
 * @param result - Full capture result
 * @returns Summary holding id, input, output, the names of all tool calls in
 * trajectory order, and the duration computed as `timing.end - timing.start`
 */
const formatSummary = (result: CaptureResult): SummaryResult => {
  // flatMap lets the `type` check narrow the step, so no type assertion is needed
  const toolCalls = result.trajectory.flatMap((step): string[] => (step.type === 'tool_call' ? [step.name] : []))

  return {
    id: result.id,
    input: result.input,
    output: result.output,
    toolCalls,
    duration: result.timing.end - result.timing.start,
  }
}
|
|
68
|
+
|
|
69
|
+
/** Cut text to `max` characters, appending `...` only when something was removed */
const truncateText = (text: string, max: number): string => (text.length > max ? `${text.slice(0, max)}...` : text)

/**
 * Choose the info string for a fenced code block from a tool call's file path.
 *
 * @remarks
 * Only the final path segment is inspected, so dots in directory names and
 * extension-less files (`Makefile`, `.gitignore`) do not leak path fragments
 * into the fence. Falls back to `typescript` when no path is known.
 */
const fenceLanguage = (filePath: string | null | undefined): string => {
  if (filePath == null) return 'typescript'
  const fileName = filePath.split(/[\\/]/).pop() ?? ''
  const dotIndex = fileName.lastIndexOf('.')
  return dotIndex > 0 ? fileName.slice(dotIndex + 1) : ''
}

/** Build a backtick fence longer than any backtick run in `content`, so the content cannot close it early */
const codeFence = (content: string): string => {
  let longestRun = 0
  for (const run of content.match(/`+/g) ?? []) {
    longestRun = Math.max(longestRun, run.length)
  }
  return '`'.repeat(Math.max(3, longestRun + 1))
}

/** Render a metadata value; objects and arrays become JSON instead of `[object Object]` */
const formatMetadataValue = (value: unknown): string =>
  typeof value === 'object' && value !== null ? JSON.stringify(value) : String(value)

/**
 * Format a capture result as a markdown evaluation record with step IDs.
 *
 * @remarks
 * Thought, tool call, plan and message steps are numbered in trajectory
 * order and tagged `<id>-step-<n>`; any other step type is skipped without
 * consuming a number. Tool calls also list the file path and a fenced
 * preview of their content when those can be extracted from the input.
 *
 * @param result - Full capture result
 * @returns Markdown text ending with a `---` separator and a newline
 */
const formatMarkdown = (result: CaptureResult): string => {
  const lines: string[] = [
    `## Evaluation Record: ${result.id}`,
    '',
    `**Input:** ${result.input}`,
    '',
    '**Trajectory:**',
  ]

  let stepNum = 1
  for (const step of result.trajectory) {
    const stepId = `${result.id}-step-${stepNum}`

    switch (step.type) {
      case 'thought':
        lines.push(`${stepNum}. [THOUGHT] ${truncateText(step.content, 100)} [→${stepId}]`)
        break

      case 'tool_call': {
        // `!= null` rather than truthiness so a 0ms duration is still reported
        const duration = step.duration != null ? ` (${step.duration}ms)` : ''
        const filePath = extractFilePath(step.input)
        const content = extractContent(step.input)

        lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)

        // Add file path if present
        if (filePath) {
          const charCount = content?.length ?? 0
          lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
        }

        // Add head/tail preview for content-producing tools
        if (content && content.length > 0) {
          const preview =
            content.length > MAX_CONTENT_LENGTH ? headTailPreview(content, HEAD_LINES, TAIL_LINES) : content
          const fence = codeFence(preview)
          lines.push(` ${fence}${fenceLanguage(filePath)}`)
          lines.push(` ${preview.split('\n').join('\n ')}`)
          lines.push(` ${fence}`)
        }
        break
      }

      case 'plan': {
        const entries = step.entries as Array<{ content: string; status: string }>
        const planSummary = entries.map((entry) => `${entry.content}: ${entry.status}`).join(', ')
        lines.push(`${stepNum}. [PLAN] ${truncateText(planSummary, 80)} [→${stepId}]`)
        break
      }

      case 'message':
        lines.push(`${stepNum}. [MESSAGE] ${truncateText(step.content, 100)} [→${stepId}]`)
        break

      default:
        // Unrecognised step types are not rendered and do not consume a step number
        continue
    }

    stepNum++
  }

  lines.push('')
  lines.push(`**Output:** ${truncateText(result.output, 200)}`)
  lines.push('')

  const metadataStr = Object.entries(result.metadata)
    .map(([key, value]) => `${key}=${formatMetadataValue(value)}`)
    .join(', ')
  lines.push(`**Metadata:** ${metadataStr}`)
  lines.push(`**Tool Errors:** ${result.toolErrors}`)
  lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)

  if (result.score) {
    lines.push(`**Score:** ${result.score.pass ? 'PASS' : 'FAIL'} (${result.score.score})`)
    if (result.score.reasoning) {
      lines.push(`**Reasoning:** ${result.score.reasoning}`)
    }
  }

  lines.push('')
  lines.push('---')
  lines.push('')

  return lines.join('\n')
}
|
|
151
|
+
|
|
152
|
+
// ============================================================================
|
|
153
|
+
// Summarize Implementation
|
|
154
|
+
// ============================================================================
|
|
155
|
+
|
|
156
|
+
/**
 * Run the summarize command from a configuration object.
 *
 * @remarks
 * Loads every capture result, renders each one as markdown or as a summary
 * JSON line, and joins the pieces with newlines. The text is written to
 * `outputPath` when one is given and printed to stdout otherwise.
 *
 * @param config - Summarize configuration
 * @returns The formatted output that was written or printed
 */
export const runSummarize = async (config: SummarizeConfig): Promise<string> => {
  const { resultsPath, outputPath: destination, markdown: asMarkdown = false } = config

  const records = await loadResults(resultsPath)

  // One rendered chunk per record, in file order
  const rendered = asMarkdown
    ? records.map((record) => formatMarkdown(record))
    : records.map((record) => JSON.stringify(formatSummary(record)))
  const output = rendered.join('\n')

  if (!destination) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(output)
    return output
  }

  await Bun.write(resolvePath(destination), output)
  return output
}
|
|
186
|
+
|
|
187
|
+
// ============================================================================
|
|
188
|
+
// CLI Entry Point
|
|
189
|
+
// ============================================================================
|
|
190
|
+
|
|
191
|
+
/**
 * CLI handler for the summarize command.
 *
 * @remarks
 * Prints usage and returns when `--help` is set. Otherwise the first
 * positional argument is the results file; when it is missing an error is
 * printed and the process exits with code 1.
 *
 * @param args - Command line arguments (after 'summarize')
 */
export const summarize = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      markdown: { type: 'boolean', short: 'm', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })
  const flags = parsed.values

  if (flags.help) {
    const usage = `
Usage: acp-harness summarize <results.jsonl> [options]

Arguments:
results.jsonl Input file with capture results

Options:
-o, --output Output file (default: stdout)
-m, --markdown Output as markdown instead of JSONL
-h, --help Show this help message

Output Formats:
JSONL (default): Compact summary with id, input, output, toolCalls, duration
Markdown (-m): Human-readable format with step IDs for LLM-as-judge

Examples:
# Summary JSONL for jq analysis
acp-harness summarize results.jsonl -o summary.jsonl

# Markdown for LLM evaluation
acp-harness summarize results.jsonl --markdown -o results.md
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  const [resultsPath] = parsed.positionals
  if (!resultsPath) {
    console.error('Error: results.jsonl path is required')
    process.exit(1)
  }

  await runSummarize({
    resultsPath,
    outputPath: flags.output,
    markdown: flags.markdown ?? false,
  })
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for adapter compliance checking functionality.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, expect, test } from 'bun:test'
|
|
6
|
+
import { type CheckConfig, runCheck } from '../adapter-check.ts'
|
|
7
|
+
|
|
8
|
+
describe('runCheck', () => {
  /** Build a CheckConfig using the 1000ms timeout shared by every case */
  const configFor = (command: string[], verbose = false): CheckConfig => ({
    command,
    timeout: 1000,
    verbose,
  })

  test('fails spawn check for non-existent command', async () => {
    const result = await runCheck(configFor(['nonexistent-command-xyz']))
    const [firstCheck] = result.checks

    expect(result.passed).toBe(false)
    expect(result.checks.length).toBeGreaterThanOrEqual(1)
    expect(firstCheck?.name).toBe('spawn')
    expect(firstCheck?.passed).toBe(false)
  })

  test('fails spawn check for command that exits immediately', async () => {
    // `false` is a Unix command that exits with code 1
    const result = await runCheck(configFor(['false']))

    expect(result.passed).toBe(false)
    expect(result.summary.failed).toBeGreaterThanOrEqual(1)
  })

  test('returns structured result with summary', async () => {
    const result = await runCheck(configFor(['echo', 'test']))

    for (const key of ['passed', 'checks', 'summary']) {
      expect(result).toHaveProperty(key)
    }
    for (const key of ['total', 'passed', 'failed']) {
      expect(result.summary).toHaveProperty(key)
    }
    expect(typeof result.passed).toBe('boolean')
    expect(Array.isArray(result.checks)).toBe(true)
  })

  test('includes verbose details when enabled', async () => {
    const result = await runCheck(configFor(['echo', 'test'], true))

    // Only the presence of the spawn check is asserted: its details may or
    // may not be present depending on the check outcome
    const spawnCheck = result.checks.find((check) => check.name === 'spawn')
    expect(spawnCheck).toBeDefined()
  })
})
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for adapter scaffolding functionality.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { afterEach, describe, expect, test } from 'bun:test'
|
|
6
|
+
import { rm } from 'node:fs/promises'
|
|
7
|
+
import { join } from 'node:path'
|
|
8
|
+
import { runScaffold, type ScaffoldConfig } from '../adapter-scaffold.ts'
|
|
9
|
+
|
|
10
|
+
const testDir = join(import.meta.dir, 'fixtures', 'scaffold-output')
|
|
11
|
+
|
|
12
|
+
describe('runScaffold', () => {
  /** Scaffold an adapter into the shared test output directory */
  const scaffold = (name: string, lang: ScaffoldConfig['lang'], minimal: boolean) =>
    runScaffold({ name, outputDir: testDir, lang, minimal })

  /** Read a generated file, addressed relative to the test output directory */
  const readGenerated = (...segments: string[]) => Bun.file(join(testDir, ...segments)).text()

  afterEach(async () => {
    // Remove the generated output so each test starts from a clean directory
    await rm(testDir, { recursive: true, force: true })
  })

  test('generates TypeScript adapter structure', async () => {
    const result = await scaffold('test-agent', 'ts', false)

    expect(result.outputDir).toBe(testDir)
    expect(result.lang).toBe('ts')

    const expectedFiles = [
      'package.json',
      'tsconfig.json',
      'src/index.ts',
      'src/types.ts',
      'src/session-manager.ts',
      'src/handlers/initialize.ts',
      'src/handlers/session-new.ts',
      'src/handlers/session-prompt.ts',
      'src/handlers/session-cancel.ts',
      'README.md',
    ]
    for (const file of expectedFiles) {
      expect(result.files).toContain(file)
    }

    // The reported files must also exist on disk with the expected content
    const packageJson = await readGenerated('package.json')
    expect(packageJson).toContain('"test-agent-acp"')

    const indexTs = await readGenerated('src', 'index.ts')
    expect(indexTs).toContain('#!/usr/bin/env bun')
    expect(indexTs).toContain('handleInitialize')
  })

  test('generates minimal TypeScript structure without README', async () => {
    const { files } = await scaffold('minimal-agent', 'ts', true)

    expect(files).not.toContain('README.md')
    expect(files).toContain('package.json')
    expect(files).toContain('src/index.ts')
  })

  test('generates Python adapter structure', async () => {
    const result = await scaffold('python-agent', 'python', false)

    expect(result.lang).toBe('python')
    expect(result.files).toContain('adapter.py')
    expect(result.files).toContain('README.md')

    const adapterPy = await readGenerated('adapter.py')
    for (const snippet of ['#!/usr/bin/env python3', 'python-agent', 'def handle_initialize']) {
      expect(adapterPy).toContain(snippet)
    }
  })

  test('generates minimal Python structure without README', async () => {
    const { files } = await scaffold('minimal-python', 'python', true)

    expect(files).toContain('adapter.py')
    expect(files).not.toContain('README.md')
  })

  test('package.json contains correct name', async () => {
    await scaffold('my-special-agent', 'ts', true)

    const packageJson = JSON.parse(await readGenerated('package.json'))
    expect(packageJson.name).toBe('my-special-agent-acp')
  })
})
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Test fixture: Python grader script using stdin/stdout JSON protocol.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
def main():
    """Grade one case read as a JSON object from stdin and print the verdict as JSON.

    The case passes when the lower-cased ``expected`` text occurs in the
    lower-cased ``output`` text, or when no ``expected`` text is given.
    The printed object has the keys ``pass``, ``score`` and ``reasoning``.
    """
    data = json.load(sys.stdin)

    # `or ""` also covers an explicit JSON null, which .get()'s default does not.
    output = (data.get("output") or "").lower()
    expected = (data.get("expected") or "").lower()

    pass_result = expected in output if expected else True

    result = {
        "pass": pass_result,
        "score": 1.0 if pass_result else 0.0,
        "reasoning": "Contains expected" if pass_result else "Missing expected",
    }

    print(json.dumps(result))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test fixture: TypeScript grader module.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { Grader } from '../../schemas.ts'
|
|
6
|
+
|
|
7
|
+
/**
 * Pass when no expected text is given, or when the output contains the
 * expected text (case-insensitive).
 */
export const grade: Grader = async ({ input: _input, output, expected }) => {
  const pass = !expected || output.toLowerCase().includes(expected.toLowerCase())

  if (pass) {
    return { pass, score: 1.0, reasoning: 'Contains expected text' }
  }
  return { pass, score: 0.0, reasoning: 'Missing expected text' }
}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import { describe, expect, test } from 'bun:test'
|
|
2
|
+
import { join } from 'node:path'
|
|
3
|
+
import { loadGrader } from '../grader-loader.ts'
|
|
4
|
+
|
|
5
|
+
const fixturesDir = join(import.meta.dir, 'fixtures')
|
|
6
|
+
|
|
7
|
+
// ============================================================================
|
|
8
|
+
// Module Graders (TypeScript/JavaScript)
|
|
9
|
+
// ============================================================================
|
|
10
|
+
|
|
11
|
+
describe('loadGrader - module graders', () => {
  /** Load a grader from the fixtures directory */
  const loadFixture = (fileName: string) => loadGrader(join(fixturesDir, fileName))

  test('loads TypeScript grader module', async () => {
    const grader = await loadFixture('grader-module.ts')

    const { pass, score, reasoning } = await grader({
      input: 'What is 2+2?',
      output: 'The answer is 4',
      expected: '4',
    })

    expect(pass).toBe(true)
    expect(score).toBe(1.0)
    expect(reasoning).toBe('Contains expected text')
  })

  test('fails when module does not export grade function', async () => {
    const loading = loadFixture('grader-bad-module.ts')

    await expect(loading).rejects.toThrow("Grader module must export a 'grade' function")
  })

  test('fails when module does not exist', async () => {
    const loading = loadFixture('nonexistent.ts')

    await expect(loading).rejects.toThrow('Grader not found')
  })
})
|
|
36
|
+
|
|
37
|
+
// ============================================================================
|
|
38
|
+
// Executable Graders (Python, etc.)
|
|
39
|
+
// ============================================================================
|
|
40
|
+
|
|
41
|
+
describe('loadGrader - executable graders', () => {
  /** Load a grader from the fixtures directory */
  const loadFixture = (fileName: string) => loadGrader(join(fixturesDir, fileName))

  test('loads and executes Python grader', async () => {
    const grader = await loadFixture('grader-exec.py')

    const { pass, score, reasoning } = await grader({
      input: 'What is 2+2?',
      output: 'The answer is 4',
      expected: '4',
    })

    expect(pass).toBe(true)
    expect(score).toBe(1.0)
    expect(reasoning).toBe('Contains expected')
  })

  test('Python grader returns pass=false when expected not in output', async () => {
    const grader = await loadFixture('grader-exec.py')

    const { pass, score } = await grader({
      input: 'What is 2+2?',
      output: 'I do not know',
      expected: '4',
    })

    expect(pass).toBe(false)
    expect(score).toBe(0.0)
  })

  test('throws when executable exits with non-zero code', async () => {
    const grader = await loadFixture('grader-exec-fail.py')
    const grading = grader({ input: 'test', output: 'test' })

    await expect(grading).rejects.toThrow('Grader exited with code 1')
  })

  test('throws when executable outputs invalid JSON', async () => {
    const grader = await loadFixture('grader-exec-invalid.py')
    const grading = grader({ input: 'test', output: 'test' })

    await expect(grading).rejects.toThrow('Grader output is not valid JSON')
  })

  test('fails when executable does not exist', async () => {
    const loading = loadFixture('nonexistent.py')

    await expect(loading).rejects.toThrow('Grader not found')
  })
})
|
|
95
|
+
|
|
96
|
+
// ============================================================================
|
|
97
|
+
// Extension Detection
|
|
98
|
+
// ============================================================================
|
|
99
|
+
|
|
100
|
+
describe('loadGrader - extension detection', () => {
  // Loading without throwing and getting a function back is the signal that
  // the extension was recognised (a .ts file is loaded as a module, not executed)
  const cases = [
    { extension: '.ts', kind: 'module', fixture: 'grader-module.ts' },
    { extension: '.py', kind: 'executable', fixture: 'grader-exec.py' },
  ]

  for (const { extension, kind, fixture } of cases) {
    test(`detects ${extension} as ${kind}`, async () => {
      const grader = await loadGrader(join(fixturesDir, fixture))
      expect(grader).toBeInstanceOf(Function)
    })
  }
})
|
|
112
|
+
|
|
113
|
+
// ============================================================================
|
|
114
|
+
// Trajectory Support
|
|
115
|
+
// ============================================================================
|
|
116
|
+
|
|
117
|
+
describe('loadGrader - trajectory support', () => {
  /** Fresh two-step trajectory (one message, one tool call) for each test */
  const buildTrajectory = () => [
    { type: 'message' as const, content: 'Hello', timestamp: 0 },
    { type: 'tool_call' as const, name: 'read', status: 'completed', timestamp: 100 },
  ]

  const cases = [
    { kind: 'module', fixture: 'grader-module.ts' },
    { kind: 'executable', fixture: 'grader-exec.py' },
  ]

  for (const { kind, fixture } of cases) {
    test(`passes trajectory to ${kind} grader`, async () => {
      const grader = await loadGrader(join(fixturesDir, fixture))

      const result = await grader({
        input: 'test',
        output: 'The answer is 4',
        expected: '4',
        trajectory: buildTrajectory(),
      })

      expect(result.pass).toBe(true)
    })
  }
})
|