@plaited/acp-harness 0.2.5 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +120 -16
  3. package/bin/cli.ts +105 -636
  4. package/bin/tests/cli.spec.ts +218 -51
  5. package/package.json +20 -4
  6. package/src/acp-client.ts +5 -4
  7. package/src/acp-transport.ts +14 -7
  8. package/src/adapter-check.ts +542 -0
  9. package/src/adapter-scaffold.ts +934 -0
  10. package/src/balance.ts +232 -0
  11. package/src/calibrate.ts +300 -0
  12. package/src/capture.ts +457 -0
  13. package/src/constants.ts +94 -0
  14. package/src/grader-loader.ts +174 -0
  15. package/src/harness.ts +35 -0
  16. package/src/schemas-cli.ts +239 -0
  17. package/src/schemas.ts +567 -0
  18. package/src/summarize.ts +245 -0
  19. package/src/tests/adapter-check.spec.ts +70 -0
  20. package/src/tests/adapter-scaffold.spec.ts +112 -0
  21. package/src/tests/fixtures/grader-bad-module.ts +5 -0
  22. package/src/tests/fixtures/grader-exec-fail.py +9 -0
  23. package/src/tests/fixtures/grader-exec-invalid.py +6 -0
  24. package/src/tests/fixtures/grader-exec.py +29 -0
  25. package/src/tests/fixtures/grader-module.ts +14 -0
  26. package/src/tests/grader-loader.spec.ts +153 -0
  27. package/src/trials.ts +395 -0
  28. package/src/validate-refs.ts +188 -0
  29. package/.claude/rules/accuracy.md +0 -43
  30. package/.claude/rules/bun-apis.md +0 -80
  31. package/.claude/rules/code-review.md +0 -254
  32. package/.claude/rules/git-workflow.md +0 -37
  33. package/.claude/rules/github.md +0 -154
  34. package/.claude/rules/testing.md +0 -172
  35. package/.claude/skills/acp-harness/SKILL.md +0 -310
  36. package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
  37. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
  38. package/.claude/skills/acp-harness/references/downstream.md +0 -288
  39. package/.claude/skills/acp-harness/references/output-formats.md +0 -221
  40. package/.claude-plugin/marketplace.json +0 -15
  41. package/.claude-plugin/plugin.json +0 -16
  42. package/.github/CODEOWNERS +0 -6
  43. package/.github/workflows/ci.yml +0 -63
  44. package/.github/workflows/publish.yml +0 -146
  45. package/.mcp.json +0 -20
  46. package/CLAUDE.md +0 -92
  47. package/Dockerfile.test +0 -23
  48. package/biome.json +0 -96
  49. package/bun.lock +0 -513
  50. package/docker-compose.test.yml +0 -21
  51. package/scripts/bun-test-wrapper.sh +0 -46
  52. package/src/acp.constants.ts +0 -56
  53. package/src/acp.schemas.ts +0 -161
  54. package/src/acp.types.ts +0 -28
  55. package/src/tests/fixtures/.claude/settings.local.json +0 -8
  56. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
  57. package/tsconfig.json +0 -32
@@ -0,0 +1,174 @@
1
+ /**
2
+ * Polyglot grader loader module.
3
+ *
4
+ * @remarks
5
+ * Supports loading graders from:
6
+ * - TypeScript/JavaScript modules (import as ES module)
7
+ * - Executable scripts (Python, Ruby, shell, etc. via subprocess)
8
+ *
9
+ * Executable graders use stdin/stdout JSON protocol:
10
+ * - Input: `{"input": "...", "output": "...", "expected": "...", "trajectory": [...]}`
11
+ * - Output: `{"pass": true, "score": 1.0, "reasoning": "..."}`
12
+ *
13
+ * @packageDocumentation
14
+ */
15
+
16
+ import type { Grader, TrajectoryStep } from './schemas.ts'
17
+ import { GraderResultSchema } from './schemas.ts'
18
+
19
+ // ============================================================================
20
+ // Constants
21
+ // ============================================================================
22
+
23
/** Path suffixes whose files are loaded with `import()` instead of being spawned. */
const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs']

// ============================================================================
// Helpers
// ============================================================================

/**
 * Report whether `path` ends with one of the `JS_EXTENSIONS` suffixes.
 *
 * @remarks
 * The comparison is a plain, case-sensitive suffix match.
 */
const isJsModule = (path: string): boolean => {
  for (const suffix of JS_EXTENSIONS) {
    if (path.endsWith(suffix)) return true
  }
  return false
}
32
+
33
/**
 * Turn a grader path into an absolute one.
 *
 * @param path - Path as supplied by the caller
 * @returns `path` untouched when it starts with `/`; otherwise `path` appended
 * to `process.cwd()` with a `/` separator
 */
const resolvePath = (path: string): string => (path.startsWith('/') ? path : `${process.cwd()}/${path}`)
38
+
39
+ // ============================================================================
40
+ // Executable Grader
41
+ // ============================================================================
42
+
43
/**
 * Payload handed to an executable grader.
 *
 * @remarks
 * The object is JSON-serialized and supplied to the grader process on stdin.
 */
type ExecGraderInput = {
  /** Forwarded from the grader call's `input` parameter */
  input: string
  /** Forwarded from the grader call's `output` parameter */
  output: string
  /** Forwarded from the grader call's `expected` parameter, when present */
  expected?: string
  /** Forwarded from the grader call's `trajectory` parameter, when present */
  trajectory?: TrajectoryStep[]
}
50
+
51
/**
 * Build a grader that delegates scoring to an external executable.
 *
 * @remarks
 * Every call spawns `execPath`, supplies the JSON-encoded grading input on
 * stdin, and expects a JSON result on stdout. The returned promise rejects
 * when the process exits with a non-zero code, prints nothing, prints text
 * that is not JSON, or prints JSON that fails `GraderResultSchema`.
 *
 * @param execPath - Absolute path to the executable script
 * @returns Grader function
 */
const createExecGrader =
  (execPath: string): Grader =>
  async (params) => {
    const { input, output, expected, trajectory } = params
    const payload: ExecGraderInput = { input, output, expected, trajectory }

    const child = Bun.spawn([execPath], {
      stdin: new TextEncoder().encode(JSON.stringify(payload)),
      stdout: 'pipe',
      stderr: 'pipe',
    })

    // Collect both output streams and the exit code concurrently.
    const [rawOut, rawErr, code] = await Promise.all([
      new Response(child.stdout).text(),
      new Response(child.stderr).text(),
      child.exited,
    ])

    if (code !== 0) {
      const detail = rawErr.trim() || 'No error output'
      throw new Error(`Grader exited with code ${code}: ${detail}`)
    }

    const body = rawOut.trim()
    if (body === '') {
      throw new Error('Grader produced no output')
    }

    let decoded: unknown
    try {
      decoded = JSON.parse(body)
    } catch {
      // Only the first 100 characters are echoed back in the message.
      throw new Error(`Grader output is not valid JSON: ${body.slice(0, 100)}`)
    }

    const checked = GraderResultSchema.safeParse(decoded)
    if (checked.success) return checked.data

    throw new Error(`Invalid grader result: ${checked.error.message}`)
  }
108
+
109
+ // ============================================================================
110
+ // Module Grader
111
+ // ============================================================================
112
+
113
/**
 * Import a JavaScript/TypeScript module and return its `grade` export.
 *
 * @remarks
 * Only checks that `grade` is a function; its signature is not validated
 * beyond the cast to `Grader`.
 *
 * @param modulePath - Absolute path to the module
 * @returns Grader function
 * @throws Error when the module has no function export named `grade`
 */
const loadModuleGrader = async (modulePath: string): Promise<Grader> => {
  const { grade } = await import(modulePath)

  if (typeof grade === 'function') {
    return grade as Grader
  }

  throw new Error(`Grader module must export a 'grade' function`)
}
131
+
132
+ // ============================================================================
133
+ // Public API
134
+ // ============================================================================
135
+
136
/**
 * Load a grader from a file path.
 *
 * @remarks
 * The path is made absolute against `process.cwd()` and must exist.
 * Dispatch is by file suffix:
 * - `.ts`, `.js`, `.mjs`, `.cjs` → imported as a module that exports `grade`
 * - anything else → wrapped as a subprocess grader (stdin/stdout JSON)
 *
 * @param graderPath - Path to the grader (relative or absolute)
 * @returns Grader function
 * @throws Error if the file does not exist or the module lacks a `grade` export
 *
 * @example
 * ```typescript
 * // TypeScript grader
 * const grader = await loadGrader('./grader.ts')
 *
 * // Python grader
 * const grader = await loadGrader('./grader.py')
 *
 * // Any executable
 * const grader = await loadGrader('./my-grader')
 * ```
 */
export const loadGrader = async (graderPath: string): Promise<Grader> => {
  const absolutePath = resolvePath(graderPath)

  const found = await Bun.file(absolutePath).exists()
  if (!found) {
    throw new Error(`Grader not found: ${absolutePath}`)
  }

  return isJsModule(absolutePath) ? loadModuleGrader(absolutePath) : createExecGrader(absolutePath)
}
package/src/harness.ts ADDED
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Harness commands for agent evaluation.
3
+ *
4
+ * @remarks
5
+ * Re-exports all harness command modules for programmatic use.
6
+ * For CLI usage, run `acp-harness <command> --help`.
7
+ *
8
+ * **Commands:**
9
+ * - `capture` - Core trajectory capture
10
+ * - `trials` - Multi-run pass@k/pass^k analysis
11
+ * - `summarize` - Derive compact views from results
12
+ * - `calibrate` - Sample failures for grader review
13
+ * - `validateRefs` - Check reference solutions
14
+ * - `balance` - Analyze test set coverage
15
+ * - `schemasCli` - Export JSON schemas
16
+ *
17
+ * @packageDocumentation
18
+ */
19
+
20
// Config types
export type { BalanceConfig } from './balance.ts'
export type { CalibrateConfig } from './calibrate.ts'
export type { CaptureConfig } from './capture.ts'
export type { SchemasConfig } from './schemas-cli.ts'
export type { SummarizeConfig } from './summarize.ts'
export type { TrialsConfig } from './trials.ts'
export type { ValidateRefsConfig } from './validate-refs.ts'

// Command implementations (for programmatic use)
export { balance, runBalance } from './balance.ts'
export { calibrate, runCalibrate } from './calibrate.ts'
export { capture, extractOutput, extractTrajectory, hasToolErrors, loadPrompts, runCapture } from './capture.ts'
export { runSchemas, schemasCli } from './schemas-cli.ts'
export { runSummarize, summarize } from './summarize.ts'
export { runTrials, trials } from './trials.ts'
export { runValidateRefs, validateRefs } from './validate-refs.ts'
@@ -0,0 +1,239 @@
1
+ /**
2
+ * Schemas command - export JSON schemas for non-TypeScript users.
3
+ *
4
+ * @remarks
5
+ * Uses Zod 4's native `z.toJSONSchema()` to generate JSON Schema from
6
+ * the harness schemas. Useful for validation in other languages/tools.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { parseArgs } from 'node:util'
12
+ import { z } from 'zod'
13
+ import * as schemas from './schemas.ts'
14
+
15
+ // ============================================================================
16
+ // Schema Registry
17
+ // ============================================================================
18
+
19
/**
 * Exportable schemas, keyed by the public name used on the command line.
 *
 * @remarks
 * Insertion order is the order in which names are listed and exported.
 */
const SCHEMA_REGISTRY: Record<string, z.ZodSchema> = {
  PromptCase: schemas.PromptCaseSchema,
  GraderResult: schemas.GraderResultSchema,
  TrajectoryStep: schemas.TrajectoryStepSchema,
  CaptureResult: schemas.CaptureResultSchema,
  SummaryResult: schemas.SummaryResultSchema,
  TrialEntry: schemas.TrialEntrySchema,
  TrialResult: schemas.TrialResultSchema,
  CalibrationSample: schemas.CalibrationSampleSchema,
  BalanceAnalysis: schemas.BalanceAnalysisSchema,
  ValidationResult: schemas.ValidationResultSchema,
  McpServerConfig: schemas.McpServerSchema,
  Session: schemas.SessionSchema,
  JsonRpcRequest: schemas.JsonRpcRequestSchema,
  JsonRpcResponse: schemas.JsonRpcResponseSchema,
  JsonRpcError: schemas.JsonRpcErrorSchema,
}
37
+
38
+ // ============================================================================
39
+ // Types
40
+ // ============================================================================
41
+
42
/** Options accepted by `runSchemas`. */
export type SchemasConfig = {
  /** Name of a single schema to export; when omitted, all schemas are processed */
  schemaName?: string
  /** Destination file, or destination directory when `split` is set; relative paths resolve against `process.cwd()` */
  outputPath?: string
  /** When exporting all schemas, emit one combined JSON document instead of a list of names */
  json?: boolean
  /** When exporting all schemas with `outputPath`, write one `<name>.json` file per schema */
  split?: boolean
  /** Print the available schema names and return them; checked before every other option */
  list?: boolean
}
55
+
56
+ // ============================================================================
57
+ // Helpers
58
+ // ============================================================================
59
+
60
/**
 * Make an output path absolute.
 *
 * @param path - Path as supplied by the caller
 * @returns `path` as-is when it begins with `/`, otherwise `process.cwd()` + `/` + `path`
 */
const resolvePath = (path: string): string => {
  const alreadyAbsolute = path.startsWith('/')
  return alreadyAbsolute ? path : `${process.cwd()}/${path}`
}
65
+
66
/**
 * Convert a Zod schema into a JSON Schema document titled `name`.
 *
 * @remarks
 * Conversion is best-effort: if `z.toJSONSchema()` throws, a stub document is
 * returned whose `description` carries the failure reason, so one
 * unconvertible schema does not abort a bulk export.
 *
 * @param schema - Zod schema to convert
 * @param name - Title to place on the generated document
 * @returns JSON Schema object; keys produced by Zod override the default `$schema`/`title`
 */
const toJsonSchema = (schema: z.ZodSchema, name: string): object => {
  const header = {
    $schema: 'https://json-schema.org/draft/2020-12/schema',
    title: name,
  }

  try {
    // Zod 4's native JSON Schema generation
    return { ...header, ...z.toJSONSchema(schema) }
  } catch (error) {
    const reason = error instanceof Error ? error.message : 'unknown error'
    return {
      ...header,
      description: `Schema for ${name} (auto-generation failed: ${reason})`,
    }
  }
}
85
+
86
+ // ============================================================================
87
+ // Schemas Implementation
88
+ // ============================================================================
89
+
90
/**
 * Execute schemas command with configuration object.
 *
 * @remarks
 * Modes are checked in this order:
 * 1. `list` — print the registered names and return them.
 * 2. `schemaName` — export that one schema to `outputPath`, or to stdout when
 *    no path is given.
 * 3. All schemas — written as one file per schema (`split` together with
 *    `outputPath`), as a single JSON document (`json`), or otherwise printed
 *    as a list of names.
 *
 * An unknown `schemaName` prints an error to stderr and terminates the
 * process with exit code 1.
 *
 * @param config - Schemas configuration
 * @returns Schema names in list mode; otherwise a map of schema name to generated JSON Schema
 */
export const runSchemas = async (config: SchemasConfig): Promise<Record<string, object> | string[]> => {
  const { schemaName, outputPath, json = false, split = false, list = false } = config

  // List mode
  if (list) {
    const names = Object.keys(SCHEMA_REGISTRY)
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log('Available schemas:')
    for (const name of names) {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(`  - ${name}`)
    }
    return names
  }

  // Single schema mode
  if (schemaName) {
    // Own-property check: a bare lookup on a plain object also resolves
    // inherited keys such as `toString`, `constructor` or `__proto__`, which
    // are truthy and would slip past the unknown-schema guard below.
    const schema = Object.prototype.hasOwnProperty.call(SCHEMA_REGISTRY, schemaName)
      ? SCHEMA_REGISTRY[schemaName]
      : undefined
    if (!schema) {
      console.error(`Error: Unknown schema '${schemaName}'`)
      console.error(`Available: ${Object.keys(SCHEMA_REGISTRY).join(', ')}`)
      process.exit(1)
    }

    const jsonSchema = toJsonSchema(schema, schemaName)
    const output = JSON.stringify(jsonSchema, null, 2)

    if (outputPath) {
      await Bun.write(resolvePath(outputPath), output)
    } else {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(output)
    }

    return { [schemaName]: jsonSchema }
  }

  // All schemas mode
  const allSchemas: Record<string, object> = {}

  for (const [name, schema] of Object.entries(SCHEMA_REGISTRY)) {
    allSchemas[name] = toJsonSchema(schema, name)
  }

  if (split && outputPath) {
    // Create directory and write separate files
    const dir = resolvePath(outputPath)
    await Bun.$`mkdir -p ${dir}`

    for (const [name, jsonSchema] of Object.entries(allSchemas)) {
      const filePath = `${dir}/${name}.json`
      await Bun.write(filePath, JSON.stringify(jsonSchema, null, 2))
    }

    // Status goes to stderr so stdout stays free for schema output.
    console.error(`Wrote ${Object.keys(allSchemas).length} schema files to ${dir}/`)
  } else if (json) {
    const output = JSON.stringify(allSchemas, null, 2)

    if (outputPath) {
      await Bun.write(resolvePath(outputPath), output)
    } else {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(output)
    }
  } else {
    // Default: list schemas
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log('Available schemas (use --json to export):')
    for (const name of Object.keys(allSchemas)) {
      // biome-ignore lint/suspicious/noConsole: CLI stdout output
      console.log(`  - ${name}`)
    }
  }

  return allSchemas
}
172
+
173
+ // ============================================================================
174
+ // CLI Entry Point
175
+ // ============================================================================
176
+
177
/**
 * CLI entry point for the `schemas` command.
 *
 * @remarks
 * Parses the flags, prints the usage text for `--help`, and otherwise hands
 * the parsed options to `runSchemas`. The first positional argument, if any,
 * is taken as the schema name.
 *
 * @param args - Command line arguments (after 'schemas')
 */
export const schemasCli = async (args: string[]): Promise<void> => {
  const helpText = `
Usage: acp-harness schemas [schema-name] [options]

Arguments:
  schema-name      Specific schema to export (optional)

Options:
  -o, --output     Output file or directory (with --split)
  -j, --json       Export as JSON (default: list names)
  -s, --split      Split into separate files (requires --output dir)
  -l, --list       List available schemas
  -h, --help       Show this help message

Available Schemas:
  PromptCase, GraderResult, TrajectoryStep, CaptureResult, SummaryResult,
  TrialEntry, TrialResult, CalibrationSample, BalanceAnalysis, ValidationResult,
  McpServerConfig, Session, JsonRpcRequest, JsonRpcResponse, JsonRpcError

Examples:
  # List available schemas
  acp-harness schemas --list

  # Export all schemas as single JSON file
  acp-harness schemas --json -o schemas.json

  # Export specific schema
  acp-harness schemas CaptureResult --json
  acp-harness schemas TrialResult --json -o trial-schema.json

  # Export all schemas as separate files
  acp-harness schemas --json --split -o schemas/
`

  const parsed = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      json: { type: 'boolean', short: 'j', default: false },
      split: { type: 'boolean', short: 's', default: false },
      list: { type: 'boolean', short: 'l', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })
  const flags = parsed.values

  if (flags.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(helpText)
    return
  }

  const [schemaName] = parsed.positionals

  await runSchemas({
    schemaName,
    outputPath: flags.output,
    json: flags.json ?? false,
    split: flags.split ?? false,
    list: flags.list ?? false,
  })
}