@plaited/acp-harness 0.2.6 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/LICENSE +1 -1
  2. package/README.md +175 -34
  3. package/bin/cli.ts +105 -636
  4. package/bin/tests/cli.spec.ts +218 -51
  5. package/package.json +21 -5
  6. package/src/acp-client.ts +5 -4
  7. package/src/acp-transport.ts +14 -7
  8. package/src/adapter-check.ts +542 -0
  9. package/src/adapter-scaffold.ts +934 -0
  10. package/src/balance.ts +257 -0
  11. package/src/calibrate.ts +319 -0
  12. package/src/capture.ts +457 -0
  13. package/src/constants.ts +94 -0
  14. package/src/grader-loader.ts +174 -0
  15. package/src/harness.ts +35 -0
  16. package/src/schemas-cli.ts +239 -0
  17. package/src/schemas.ts +567 -0
  18. package/src/summarize.ts +259 -0
  19. package/src/tests/adapter-check.spec.ts +70 -0
  20. package/src/tests/adapter-scaffold.spec.ts +112 -0
  21. package/src/tests/balance-helpers.spec.ts +279 -0
  22. package/src/tests/calibrate-helpers.spec.ts +226 -0
  23. package/src/tests/capture-helpers.spec.ts +553 -0
  24. package/src/tests/fixtures/grader-bad-module.ts +5 -0
  25. package/src/tests/fixtures/grader-exec-fail.py +9 -0
  26. package/src/tests/fixtures/grader-exec-invalid.py +6 -0
  27. package/src/tests/fixtures/grader-exec.py +29 -0
  28. package/src/tests/fixtures/grader-module.ts +14 -0
  29. package/src/tests/grader-loader.spec.ts +153 -0
  30. package/src/tests/summarize-helpers.spec.ts +339 -0
  31. package/src/tests/trials-calculations.spec.ts +209 -0
  32. package/src/trials.ts +407 -0
  33. package/src/validate-refs.ts +188 -0
  34. package/.claude/rules/accuracy.md +0 -43
  35. package/.claude/rules/bun-apis.md +0 -80
  36. package/.claude/rules/code-review.md +0 -254
  37. package/.claude/rules/git-workflow.md +0 -37
  38. package/.claude/rules/github.md +0 -154
  39. package/.claude/rules/testing.md +0 -172
  40. package/.claude/skills/acp-harness/SKILL.md +0 -310
  41. package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
  42. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
  43. package/.claude/skills/acp-harness/references/downstream.md +0 -288
  44. package/.claude/skills/acp-harness/references/output-formats.md +0 -221
  45. package/.claude-plugin/marketplace.json +0 -15
  46. package/.claude-plugin/plugin.json +0 -16
  47. package/.github/CODEOWNERS +0 -6
  48. package/.github/workflows/ci.yml +0 -63
  49. package/.github/workflows/publish.yml +0 -146
  50. package/.mcp.json +0 -20
  51. package/CLAUDE.md +0 -92
  52. package/Dockerfile.test +0 -23
  53. package/biome.json +0 -96
  54. package/bun.lock +0 -513
  55. package/docker-compose.test.yml +0 -21
  56. package/scripts/bun-test-wrapper.sh +0 -46
  57. package/src/acp.constants.ts +0 -56
  58. package/src/acp.schemas.ts +0 -161
  59. package/src/acp.types.ts +0 -28
  60. package/src/tests/fixtures/.claude/settings.local.json +0 -8
  61. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
  62. package/tsconfig.json +0 -32
@@ -0,0 +1,259 @@
1
+ /**
2
+ * Summarize command - derive compact views from full trajectory results.
3
+ *
4
+ * @remarks
5
+ * Transforms full trajectory JSONL into:
6
+ * - Summary JSONL: Compact format for jq analysis
7
+ * - Markdown: Human-readable format for LLM-as-judge workflows
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+
12
import { isAbsolute, resolve } from 'node:path'
import { parseArgs } from 'node:util'
import { extractContent, extractFilePath, headTailPreview } from './capture.ts'
import { HEAD_LINES, MAX_CONTENT_LENGTH, TAIL_LINES } from './constants.ts'
import type { CaptureResult, SummaryResult } from './schemas.ts'
import { CaptureResultSchema } from './schemas.ts'
17
+
18
+ // ============================================================================
19
+ // Types
20
+ // ============================================================================
21
+
22
/**
 * Options accepted by the summarize command.
 */
export type SummarizeConfig = {
  /** Location of the results JSONL file to read */
  resultsPath: string
  /** Destination file for the rendered output */
  outputPath?: string
  /** Render markdown rather than summary JSONL */
  markdown?: boolean
}
31
+
32
+ // ============================================================================
33
+ // Helpers
34
+ // ============================================================================
35
+
36
/**
 * Resolve a path against the current working directory.
 *
 * @remarks
 * Absolute paths are returned unchanged. Absoluteness is decided by
 * `node:path` rather than a leading-slash check, so platform-specific
 * absolute forms (for example Windows drive paths) are not prefixed with
 * the working directory.
 *
 * @param path - Absolute or cwd-relative path
 * @returns An absolute path
 */
const resolvePath = (path: string): string => {
  if (isAbsolute(path)) return path
  return resolve(process.cwd(), path)
}
41
+
42
/**
 * Load capture results from a JSONL file.
 *
 * @remarks
 * Blank and whitespace-only lines are skipped. Line numbers in error
 * messages are 1-based positions in the file itself, so they stay correct
 * when the file contains blank lines or CRLF line endings.
 *
 * @param path - Path of the JSONL file to read
 * @returns One validated capture result per non-blank line, in file order
 * @throws Error naming the file line whose JSON parsing or schema validation failed
 */
const loadResults = async (path: string): Promise<CaptureResult[]> => {
  const content = await Bun.file(path).text()
  const results: CaptureResult[] = []

  // Split the untrimmed text so each index maps directly to a file line
  const rawLines = content.split(/\r?\n/)
  for (const [index, rawLine] of rawLines.entries()) {
    if (rawLine.trim() === '') continue
    try {
      results.push(CaptureResultSchema.parse(JSON.parse(rawLine)))
    } catch (error) {
      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return results
}
57
+
58
/**
 * Reduce a full capture result to its compact summary form.
 *
 * @param result - Full capture result
 * @returns Summary carrying the id, input, output, the names of all
 * `tool_call` steps in trajectory order, and the elapsed time
 * (`timing.end - timing.start`)
 *
 * @public
 */
export const formatSummary = (result: CaptureResult): SummaryResult => {
  // The `type` check narrows each step, so `name` is read without a type assertion
  const toolCalls = result.trajectory.flatMap((step) => (step.type === 'tool_call' ? [step.name] : []))

  return {
    id: result.id,
    input: result.input,
    output: result.output,
    toolCalls,
    duration: result.timing.end - result.timing.start,
  }
}
75
+
76
/** Clip `text` to `limit` characters, appending `...` only when something was cut off. */
const clipText = (text: string, limit: number): string => (text.length > limit ? `${text.slice(0, limit)}...` : text)

/**
 * Choose the code-fence language tag for a tool call's content.
 *
 * @remarks
 * Falls back to `typescript` when there is no file path. Otherwise the tag is
 * the text after the last dot of the file's base name, so dots in directory
 * names are ignored and extension-less files produce an untagged fence.
 */
const fenceLanguage = (filePath: string | null | undefined): string => {
  if (filePath == null) return 'typescript'
  const fileName = filePath.split(/[\\/]/).pop() ?? ''
  const dotIndex = fileName.lastIndexOf('.')
  return dotIndex === -1 ? '' : fileName.slice(dotIndex + 1)
}

/**
 * Format capture result as markdown with step IDs.
 *
 * @remarks
 * Each recognized trajectory step (`thought`, `tool_call`, `plan`, `message`)
 * becomes a numbered line tagged with `[→<result.id>-step-<n>]`. Thoughts and
 * messages are clipped to 100 characters, plan summaries to 80, and the final
 * output to 200. Tool calls additionally list the file path and a fenced
 * preview of their content when those can be extracted from the step input.
 * Steps of any other type are skipped and do not consume a step number.
 *
 * @param result - Full capture result
 * @returns Markdown formatted string, terminated by a `---` separator
 *
 * @public
 */
export const formatMarkdown = (result: CaptureResult): string => {
  const lines: string[] = [
    `## Evaluation Record: ${result.id}`,
    '',
    `**Input:** ${result.input}`,
    '',
    '**Trajectory:**',
  ]

  let stepNum = 1
  for (const step of result.trajectory) {
    const stepId = `${result.id}-step-${stepNum}`

    switch (step.type) {
      case 'thought':
        lines.push(`${stepNum}. [THOUGHT] ${clipText(step.content, 100)} [→${stepId}]`)
        break
      case 'tool_call': {
        const duration = step.duration ? ` (${step.duration}ms)` : ''
        const filePath = extractFilePath(step.input)
        const content = extractContent(step.input)

        lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)

        // Add file path if present
        if (filePath) {
          const charCount = content?.length ?? 0
          lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
        }

        // Add head/tail preview for content-producing tools
        if (content && content.length > 0) {
          const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content, HEAD_LINES, TAIL_LINES) : content
          lines.push(` \`\`\`${fenceLanguage(filePath)}`)
          lines.push(` ${preview.split('\n').join('\n ')}`)
          lines.push(' ```')
        }
        break
      }
      case 'plan': {
        const entries = step.entries as Array<{ content: string; status: string }>
        const planSummary = entries.map((e) => `${e.content}: ${e.status}`).join(', ')
        lines.push(`${stepNum}. [PLAN] ${clipText(planSummary, 80)} [→${stepId}]`)
        break
      }
      case 'message':
        lines.push(`${stepNum}. [MESSAGE] ${clipText(step.content, 100)} [→${stepId}]`)
        break
      default:
        // Unrecognized step: emit nothing and keep the current step number
        continue
    }
    stepNum++
  }

  lines.push('')
  lines.push(`**Output:** ${clipText(result.output, 200)}`)
  lines.push('')

  const metadataStr = Object.entries(result.metadata)
    .map(([k, v]) => `${k}=${v}`)
    .join(', ')
  lines.push(`**Metadata:** ${metadataStr}`)
  lines.push(`**Tool Errors:** ${result.toolErrors}`)
  lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)

  if (result.score) {
    lines.push(`**Score:** ${result.score.pass ? 'PASS' : 'FAIL'} (${result.score.score})`)
    if (result.score.reasoning) {
      lines.push(`**Reasoning:** ${result.score.reasoning}`)
    }
  }

  lines.push('')
  lines.push('---')
  lines.push('')

  return lines.join('\n')
}
165
+
166
+ // ============================================================================
167
+ // Summarize Implementation
168
+ // ============================================================================
169
+
170
/**
 * Run the summarize pipeline for a configuration object.
 *
 * @remarks
 * Loads the results file, renders every record either as markdown or as one
 * JSON summary per line, then writes the text to `outputPath` when one is
 * given and prints it to stdout otherwise.
 *
 * @param config - Summarize configuration
 * @returns The rendered output string
 */
export const runSummarize = async (config: SummarizeConfig): Promise<string> => {
  const records = await loadResults(config.resultsPath)

  const rendered = config.markdown
    ? records.map(formatMarkdown).join('\n')
    : records.map((record) => JSON.stringify(formatSummary(record))).join('\n')

  if (config.outputPath) {
    await Bun.write(resolvePath(config.outputPath), rendered)
    return rendered
  }

  // biome-ignore lint/suspicious/noConsole: CLI stdout output
  console.log(rendered)
  return rendered
}
200
+
201
+ // ============================================================================
202
+ // CLI Entry Point
203
+ // ============================================================================
204
+
205
/**
 * CLI handler for the `summarize` command.
 *
 * @remarks
 * Prints the usage text and returns when `--help` is set. Reports an error
 * and exits the process with code 1 when no results path was supplied.
 * Otherwise forwards the parsed options to `runSummarize`.
 *
 * @param args - Command line arguments (after 'summarize')
 */
export const summarize = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      markdown: { type: 'boolean', short: 'm', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })
  const flags = parsed.values

  if (flags.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness summarize <results.jsonl> [options]

Arguments:
results.jsonl Input file with capture results

Options:
-o, --output Output file (default: stdout)
-m, --markdown Output as markdown instead of JSONL
-h, --help Show this help message

Output Formats:
JSONL (default): Compact summary with id, input, output, toolCalls, duration
Markdown (-m): Human-readable format with step IDs for LLM-as-judge

Examples:
# Summary JSONL for jq analysis
acp-harness summarize results.jsonl -o summary.jsonl

# Markdown for LLM evaluation
acp-harness summarize results.jsonl --markdown -o results.md
`)
    return
  }

  // The first positional argument is the results file
  const resultsPath = parsed.positionals[0]
  if (!resultsPath) {
    console.error('Error: results.jsonl path is required')
    process.exit(1)
  }

  await runSummarize({
    resultsPath,
    outputPath: flags.output,
    markdown: flags.markdown ?? false,
  })
}
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Tests for adapter compliance checking functionality.
3
+ */
4
+
5
+ import { describe, expect, test } from 'bun:test'
6
+ import { type CheckConfig, runCheck } from '../adapter-check.ts'
7
+
8
describe('runCheck', () => {
  /** Build a check configuration for `command` using the 1000ms timeout shared by every case. */
  const configFor = (command: CheckConfig['command'], verbose = false): CheckConfig => ({
    command,
    timeout: 1000,
    verbose,
  })

  test('fails spawn check for non-existent command', async () => {
    const { passed, checks } = await runCheck(configFor(['nonexistent-command-xyz']))
    const firstCheck = checks[0]

    expect(passed).toBe(false)
    expect(checks.length).toBeGreaterThanOrEqual(1)
    expect(firstCheck?.name).toBe('spawn')
    expect(firstCheck?.passed).toBe(false)
  })

  test('fails spawn check for command that exits immediately', async () => {
    // `false` is a Unix command that exits with code 1
    const { passed, summary } = await runCheck(configFor(['false']))

    expect(passed).toBe(false)
    expect(summary.failed).toBeGreaterThanOrEqual(1)
  })

  test('returns structured result with summary', async () => {
    const outcome = await runCheck(configFor(['echo', 'test']))

    for (const key of ['passed', 'checks', 'summary']) {
      expect(outcome).toHaveProperty(key)
    }
    for (const key of ['total', 'passed', 'failed']) {
      expect(outcome.summary).toHaveProperty(key)
    }
    expect(typeof outcome.passed).toBe('boolean')
    expect(Array.isArray(outcome.checks)).toBe(true)
  })

  test('includes verbose details when enabled', async () => {
    const { checks } = await runCheck(configFor(['echo', 'test'], true))

    // Only the presence of the spawn check is asserted here;
    // its details may or may not be present depending on check outcome
    const spawnCheck = checks.find((check) => check.name === 'spawn')
    expect(spawnCheck).toBeDefined()
  })
})
@@ -0,0 +1,112 @@
1
+ /**
2
+ * Tests for adapter scaffolding functionality.
3
+ */
4
+
5
+ import { afterEach, describe, expect, test } from 'bun:test'
6
+ import { rm } from 'node:fs/promises'
7
+ import { join } from 'node:path'
8
+ import { runScaffold, type ScaffoldConfig } from '../adapter-scaffold.ts'
9
+
10
// Scratch directory that every case scaffolds into; removed again after each test
const testDir = join(import.meta.dir, 'fixtures', 'scaffold-output')

describe('runScaffold', () => {
  /** Build a scaffold configuration that targets the shared scratch directory. */
  const configFor = (name: string, lang: ScaffoldConfig['lang'], minimal: boolean): ScaffoldConfig => ({
    name,
    outputDir: testDir,
    lang,
    minimal,
  })

  /** Read a generated file addressed by path segments relative to the scratch directory. */
  const readGenerated = (...segments: string[]): Promise<string> => Bun.file(join(testDir, ...segments)).text()

  afterEach(async () => {
    // Clean up test output
    await rm(testDir, { recursive: true, force: true })
  })

  test('generates TypeScript adapter structure', async () => {
    const scaffolded = await runScaffold(configFor('test-agent', 'ts', false))

    expect(scaffolded.outputDir).toBe(testDir)
    expect(scaffolded.lang).toBe('ts')

    const expectedFiles = [
      'package.json',
      'tsconfig.json',
      'src/main.ts',
      'src/types.ts',
      'src/session-manager.ts',
      'src/handlers/initialize.ts',
      'src/handlers/session-new.ts',
      'src/handlers/session-prompt.ts',
      'src/handlers/session-cancel.ts',
      'README.md',
    ]
    for (const file of expectedFiles) {
      expect(scaffolded.files).toContain(file)
    }

    // Verify files actually exist
    const packageJson = await readGenerated('package.json')
    expect(packageJson).toContain('"test-agent-acp"')

    const mainTs = await readGenerated('src', 'main.ts')
    expect(mainTs).toContain('#!/usr/bin/env bun')
    expect(mainTs).toContain('handleInitialize')
  })

  test('generates minimal TypeScript structure without README', async () => {
    const { files } = await runScaffold(configFor('minimal-agent', 'ts', true))

    expect(files).not.toContain('README.md')
    expect(files).toContain('package.json')
    expect(files).toContain('src/main.ts')
  })

  test('generates Python adapter structure', async () => {
    const scaffolded = await runScaffold(configFor('python-agent', 'python', false))

    expect(scaffolded.lang).toBe('python')
    expect(scaffolded.files).toContain('adapter.py')
    expect(scaffolded.files).toContain('README.md')

    const adapterPy = await readGenerated('adapter.py')
    for (const fragment of ['#!/usr/bin/env python3', 'python-agent', 'def handle_initialize']) {
      expect(adapterPy).toContain(fragment)
    }
  })

  test('generates minimal Python structure without README', async () => {
    const { files } = await runScaffold(configFor('minimal-python', 'python', true))

    expect(files).toContain('adapter.py')
    expect(files).not.toContain('README.md')
  })

  test('package.json contains correct name', async () => {
    await runScaffold(configFor('my-special-agent', 'ts', true))

    const manifest = JSON.parse(await readGenerated('package.json'))
    expect(manifest.name).toBe('my-special-agent-acp')
  })
})