@plaited/acp-harness 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -31
- package/bin/cli.ts +15 -0
- package/package.json +5 -7
- package/src/acp-client.ts +7 -4
- package/src/adapter-check.ts +0 -1
- package/src/adapter-scaffold.ts +16 -15
- package/src/calibrate.ts +28 -8
- package/src/capture.ts +114 -33
- package/src/grader-loader.ts +3 -3
- package/src/harness.ts +4 -0
- package/src/headless-cli.ts +433 -0
- package/src/headless-history-builder.ts +141 -0
- package/src/headless-output-parser.ts +251 -0
- package/src/headless-session-manager.ts +389 -0
- package/src/headless.schemas.ts +241 -0
- package/src/headless.ts +71 -0
- package/src/headless.types.ts +19 -0
- package/src/integration_tests/acp-claude.spec.ts +170 -0
- package/src/integration_tests/acp-gemini.spec.ts +174 -0
- package/src/schemas.ts +88 -36
- package/src/summarize.ts +4 -8
- package/src/tests/acp-client.spec.ts +1 -1
- package/src/tests/capture-cli.spec.ts +188 -0
- package/src/tests/capture-helpers.spec.ts +229 -67
- package/src/tests/constants.spec.ts +121 -0
- package/src/tests/fixtures/grader-exec.py +3 -3
- package/src/tests/fixtures/grader-module.ts +2 -2
- package/src/tests/grader-loader.spec.ts +5 -5
- package/src/tests/headless.spec.ts +460 -0
- package/src/tests/schemas-cli.spec.ts +142 -0
- package/src/tests/schemas.spec.ts +657 -0
- package/src/tests/summarize-helpers.spec.ts +3 -3
- package/src/tests/trials-cli.spec.ts +145 -0
- package/src/trials.ts +6 -19
- package/src/validate-refs.ts +1 -1
- package/src/tests/acp-integration.docker.ts +0 -214
package/README.md
CHANGED
|
@@ -11,21 +11,25 @@ CLI tool for capturing agent trajectories from ACP-compatible agents. Execute pr
|
|
|
11
11
|
Use these tools directly via the CLI without installation:
|
|
12
12
|
|
|
13
13
|
```bash
|
|
14
|
-
#
|
|
15
|
-
|
|
14
|
+
# Using built-in headless adapter (recommended - no extra install needed)
|
|
15
|
+
export ANTHROPIC_API_KEY=sk-...
|
|
16
|
+
bunx @plaited/acp-harness capture prompts.jsonl \
|
|
17
|
+
bunx @plaited/acp-harness headless --schema ./schemas/claude-headless.json \
|
|
18
|
+
-o results.jsonl
|
|
16
19
|
|
|
17
|
-
# Or
|
|
18
|
-
|
|
19
|
-
acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
|
|
20
|
+
# Or with an external ACP adapter
|
|
21
|
+
bunx @plaited/acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
|
|
20
22
|
```
|
|
21
23
|
|
|
22
|
-
**Prerequisite:**
|
|
24
|
+
**Prerequisite:** Set your API key. The `headless` command works with any CLI agent that supports JSON output - no adapter installation required:
|
|
23
25
|
|
|
24
26
|
```bash
|
|
25
|
-
|
|
26
|
-
export
|
|
27
|
+
export ANTHROPIC_API_KEY=sk-... # For Claude
|
|
28
|
+
export GEMINI_API_KEY=... # For Gemini
|
|
27
29
|
```
|
|
28
30
|
|
|
31
|
+
Pre-built schemas are available in `.claude/skills/acp-adapters/schemas/` for Claude and Gemini.
|
|
32
|
+
|
|
29
33
|
### Commands
|
|
30
34
|
|
|
31
35
|
| Command | Description |
|
|
@@ -37,17 +41,21 @@ export ANTHROPIC_API_KEY=sk-...
|
|
|
37
41
|
| `validate-refs <prompts>` | Check reference solutions |
|
|
38
42
|
| `balance <prompts>` | Analyze test set coverage |
|
|
39
43
|
| `schemas [name]` | Export JSON schemas |
|
|
40
|
-
| `
|
|
44
|
+
| `headless --schema <path>` | Schema-driven adapter for any CLI agent |
|
|
41
45
|
| `adapter:check <cmd>` | Validate adapter ACP compliance |
|
|
42
46
|
|
|
43
47
|
### Examples
|
|
44
48
|
|
|
45
49
|
```bash
|
|
46
|
-
# Capture trajectories
|
|
47
|
-
bunx @plaited/acp-harness capture prompts.jsonl
|
|
50
|
+
# Capture trajectories using headless adapter (recommended)
|
|
51
|
+
bunx @plaited/acp-harness capture prompts.jsonl \
|
|
52
|
+
bunx @plaited/acp-harness headless --schema ./schemas/claude-headless.json \
|
|
53
|
+
-o results.jsonl
|
|
48
54
|
|
|
49
55
|
# Run trials for pass@k analysis
|
|
50
|
-
bunx @plaited/acp-harness trials prompts.jsonl
|
|
56
|
+
bunx @plaited/acp-harness trials prompts.jsonl \
|
|
57
|
+
bunx @plaited/acp-harness headless --schema ./schemas/claude-headless.json \
|
|
58
|
+
-k 5 --grader ./grader.ts
|
|
51
59
|
|
|
52
60
|
# Summarize results
|
|
53
61
|
bunx @plaited/acp-harness summarize results.jsonl -o summary.jsonl
|
|
@@ -55,11 +63,9 @@ bunx @plaited/acp-harness summarize results.jsonl -o summary.jsonl
|
|
|
55
63
|
# Export schemas
|
|
56
64
|
bunx @plaited/acp-harness schemas CaptureResult --json
|
|
57
65
|
|
|
58
|
-
# Scaffold a new adapter
|
|
59
|
-
bunx @plaited/acp-harness adapter:scaffold my-agent -o ./my-agent-acp
|
|
60
|
-
|
|
61
66
|
# Validate adapter compliance
|
|
62
|
-
bunx @plaited/acp-harness adapter:check
|
|
67
|
+
bunx @plaited/acp-harness adapter:check \
|
|
68
|
+
bunx @plaited/acp-harness headless --schema ./schemas/claude-headless.json
|
|
63
69
|
```
|
|
64
70
|
|
|
65
71
|
## Skills for AI Agents
|
|
@@ -110,10 +116,12 @@ Discover, create, and validate ACP adapters for agent integration.
|
|
|
110
116
|
|
|
111
117
|
| Command | Description |
|
|
112
118
|
|---------|-------------|
|
|
119
|
+
| `headless` | Schema-driven adapter for any CLI agent |
|
|
113
120
|
| `adapter:scaffold` | Generate new adapter project with handlers |
|
|
114
121
|
| `adapter:check` | Validate ACP protocol compliance |
|
|
115
122
|
|
|
116
123
|
**Use cases:**
|
|
124
|
+
- Wrapping headless CLI agents with schema-driven adapter
|
|
117
125
|
- Finding existing adapters for your agent
|
|
118
126
|
- Building custom ACP adapters from scratch
|
|
119
127
|
- Validating adapter implementations
|
|
@@ -121,10 +129,18 @@ Discover, create, and validate ACP adapters for agent integration.
|
|
|
121
129
|
## Input Format
|
|
122
130
|
|
|
123
131
|
```jsonl
|
|
124
|
-
{"id":"test-001","input":"Create a primary button","
|
|
125
|
-
{"id":"test-002","input":"
|
|
132
|
+
{"id":"test-001","input":"Create a primary button","hint":"should contain <button>","metadata":{"category":"ui"}}
|
|
133
|
+
{"id":"test-002","input":["Create a component","Now add tests"],"metadata":{"category":"multi-turn"}}
|
|
126
134
|
```
|
|
127
135
|
|
|
136
|
+
| Field | Required | Description |
|
|
137
|
+
|-------|----------|-------------|
|
|
138
|
+
| `id` | Yes | Unique identifier |
|
|
139
|
+
| `input` | Yes | Single prompt (string) or conversation turns (string[]) |
|
|
140
|
+
| `hint` | No | Grader context - what to look for |
|
|
141
|
+
| `reference` | No | Reference solution (for validate-refs) |
|
|
142
|
+
| `metadata` | No | Tags, category, difficulty for filtering |
|
|
143
|
+
|
|
128
144
|
## Output Format
|
|
129
145
|
|
|
130
146
|
The harness outputs full trajectory JSONL (`CaptureResult` schema):
|
|
@@ -134,12 +150,12 @@ The harness outputs full trajectory JSONL (`CaptureResult` schema):
|
|
|
134
150
|
"id": "test-001",
|
|
135
151
|
"input": "Create a primary button",
|
|
136
152
|
"output": "Here's a button component...",
|
|
137
|
-
"
|
|
153
|
+
"hint": "should contain <button>",
|
|
138
154
|
"trajectory": [...],
|
|
139
|
-
"metadata": {"category": "ui", "agent": "bunx claude-code-acp"},
|
|
140
|
-
"timing": {"start": 1234567890, "end": 1234567900},
|
|
155
|
+
"metadata": {"category": "ui", "agent": "bunx claude-code-acp", "trajectoryRichness": "full", "turnCount": 1},
|
|
156
|
+
"timing": {"start": 1234567890, "end": 1234567900, "sessionCreation": 234, "total": 10},
|
|
141
157
|
"toolErrors": false,
|
|
142
|
-
"score": {"pass": true, "score": 1.0, "reasoning": "Contains
|
|
158
|
+
"score": {"pass": true, "score": 1.0, "reasoning": "Contains hint"}
|
|
143
159
|
}
|
|
144
160
|
```
|
|
145
161
|
|
|
@@ -147,6 +163,9 @@ Key fields:
|
|
|
147
163
|
- `toolErrors`: Boolean indicating if any tool calls failed
|
|
148
164
|
- `score`: Grader result (only if `--grader` provided)
|
|
149
165
|
- `trajectory`: Full execution trace (thoughts, messages, tool calls, plans)
|
|
166
|
+
- `metadata.trajectoryRichness`: `"full"` | `"messages-only"` | `"minimal"`
|
|
167
|
+
- `timing.sessionCreation`: Time to initialize session (ms)
|
|
168
|
+
- `timing.total`: End-to-end duration (ms)
|
|
150
169
|
|
|
151
170
|
## Graders
|
|
152
171
|
|
|
@@ -159,12 +178,12 @@ Export a `grade` function:
|
|
|
159
178
|
```typescript
|
|
160
179
|
import type { Grader } from '@plaited/acp-harness/schemas'
|
|
161
180
|
|
|
162
|
-
export const grade: Grader = async ({ input, output,
|
|
163
|
-
const pass = output.toLowerCase().includes(
|
|
181
|
+
export const grade: Grader = async ({ input, output, hint, trajectory }) => {
|
|
182
|
+
const pass = output.toLowerCase().includes(hint?.toLowerCase() ?? '')
|
|
164
183
|
return {
|
|
165
184
|
pass,
|
|
166
185
|
score: pass ? 1.0 : 0.0,
|
|
167
|
-
reasoning: pass ? 'Contains
|
|
186
|
+
reasoning: pass ? 'Contains hint content' : 'Missing hint content'
|
|
168
187
|
}
|
|
169
188
|
}
|
|
170
189
|
```
|
|
@@ -184,13 +203,13 @@ import sys
|
|
|
184
203
|
|
|
185
204
|
data = json.load(sys.stdin)
|
|
186
205
|
output = data["output"].lower()
|
|
187
|
-
|
|
206
|
+
hint = (data.get("hint") or "").lower()
|
|
188
207
|
|
|
189
|
-
pass_result =
|
|
208
|
+
pass_result = hint in output if hint else True
|
|
190
209
|
print(json.dumps({
|
|
191
210
|
"pass": pass_result,
|
|
192
211
|
"score": 1.0 if pass_result else 0.0,
|
|
193
|
-
"reasoning": "Contains
|
|
212
|
+
"reasoning": "Contains hint" if pass_result else "Missing hint"
|
|
194
213
|
}))
|
|
195
214
|
```
|
|
196
215
|
|
|
@@ -200,7 +219,7 @@ acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.py
|
|
|
200
219
|
```
|
|
201
220
|
|
|
202
221
|
**Protocol:**
|
|
203
|
-
- Input (stdin): `{"input": "...", "output": "...", "
|
|
222
|
+
- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
|
|
204
223
|
- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
|
|
205
224
|
|
|
206
225
|
## Downstream Integration
|
|
@@ -222,13 +241,16 @@ cat results.jsonl | your-scoring-script.ts
|
|
|
222
241
|
bun install # Install dependencies
|
|
223
242
|
bun run check # Type check + lint + format
|
|
224
243
|
bun test # Run unit tests
|
|
244
|
+
|
|
245
|
+
# Run integration tests in Docker (requires API keys)
|
|
246
|
+
ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.test.yml run --rm acp-test
|
|
225
247
|
```
|
|
226
248
|
|
|
227
249
|
## Requirements
|
|
228
250
|
|
|
229
251
|
- **Runtime:** Bun >= 1.2.9
|
|
230
|
-
- **ACP Adapter:**
|
|
231
|
-
- **API Key:** `ANTHROPIC_API_KEY`
|
|
252
|
+
- **ACP Adapter:** Built-in `headless` command (recommended) or external adapter
|
|
253
|
+
- **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
|
|
232
254
|
|
|
233
255
|
## License
|
|
234
256
|
|
package/bin/cli.ts
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
* - validate-refs: Check reference solutions
|
|
15
15
|
* - balance: Analyze test set coverage
|
|
16
16
|
* - schemas: Export JSON schemas for non-TS users
|
|
17
|
+
* - headless: Schema-driven adapter for any headless CLI agent
|
|
17
18
|
* - adapter:scaffold: Scaffold new ACP adapter project
|
|
18
19
|
* - adapter:check: Validate adapter ACP compliance
|
|
19
20
|
*/
|
|
@@ -23,6 +24,7 @@ import { adapterScaffold } from '../src/adapter-scaffold.ts'
|
|
|
23
24
|
import { balance } from '../src/balance.ts'
|
|
24
25
|
import { calibrate } from '../src/calibrate.ts'
|
|
25
26
|
import { capture } from '../src/capture.ts'
|
|
27
|
+
import { headless } from '../src/headless.ts'
|
|
26
28
|
import { schemasCli } from '../src/schemas-cli.ts'
|
|
27
29
|
import { summarize } from '../src/summarize.ts'
|
|
28
30
|
import { trials } from '../src/trials.ts'
|
|
@@ -43,6 +45,7 @@ Commands:
|
|
|
43
45
|
validate-refs Check reference solutions against grader
|
|
44
46
|
balance Analyze test set coverage
|
|
45
47
|
schemas Export JSON schemas for non-TypeScript users
|
|
48
|
+
headless Schema-driven adapter for any headless CLI agent
|
|
46
49
|
adapter:scaffold Scaffold a new ACP adapter project
|
|
47
50
|
adapter:check Validate adapter ACP compliance
|
|
48
51
|
|
|
@@ -70,6 +73,14 @@ Examples:
|
|
|
70
73
|
# Validate adapter compliance
|
|
71
74
|
acp-harness adapter:check bun ./my-adapter/src/main.ts
|
|
72
75
|
|
|
76
|
+
# Run headless adapter with schema
|
|
77
|
+
acp-harness headless --schema ./claude-headless.json
|
|
78
|
+
|
|
79
|
+
# Capture with headless adapter
|
|
80
|
+
acp-harness capture prompts.jsonl \\
|
|
81
|
+
acp-harness headless --schema ./claude-headless.json \\
|
|
82
|
+
-o results.jsonl
|
|
83
|
+
|
|
73
84
|
Documentation: https://github.com/plaited/acp-harness
|
|
74
85
|
`)
|
|
75
86
|
}
|
|
@@ -104,6 +115,10 @@ const main = async () => {
|
|
|
104
115
|
await schemasCli(args)
|
|
105
116
|
break
|
|
106
117
|
|
|
118
|
+
case 'headless':
|
|
119
|
+
await headless(args)
|
|
120
|
+
break
|
|
121
|
+
|
|
107
122
|
case 'adapter:scaffold':
|
|
108
123
|
await adapterScaffold(args)
|
|
109
124
|
break
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@plaited/acp-harness",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "CLI tool for capturing agent trajectories from ACP-compatible agents",
|
|
5
5
|
"license": "ISC",
|
|
6
6
|
"engines": {
|
|
@@ -28,10 +28,8 @@
|
|
|
28
28
|
"./bin/**",
|
|
29
29
|
"!./src/**/tests/*",
|
|
30
30
|
"!./src/**/*.spec.ts",
|
|
31
|
-
"!./src/**/*.docker.ts",
|
|
32
31
|
"!./bin/**/tests/*",
|
|
33
|
-
"!./bin/**/*.spec.ts"
|
|
34
|
-
"!./bin/**/*.docker.ts"
|
|
32
|
+
"!./bin/**/*.spec.ts"
|
|
35
33
|
],
|
|
36
34
|
"publishConfig": {
|
|
37
35
|
"access": "public"
|
|
@@ -43,8 +41,8 @@
|
|
|
43
41
|
"check:types": "tsc --noEmit",
|
|
44
42
|
"check:write": "biome check --write && format-package --write",
|
|
45
43
|
"prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
|
|
46
|
-
"test": "bun test
|
|
47
|
-
"test:
|
|
44
|
+
"test": "bun test ./**/tests/*.spec.ts",
|
|
45
|
+
"test:integration": "bun test ./**/integration_tests/*.spec.ts"
|
|
48
46
|
},
|
|
49
47
|
"lint-staged": {
|
|
50
48
|
"*.{js,cjs,jsx,tsx,ts}": [
|
|
@@ -56,7 +54,7 @@
|
|
|
56
54
|
},
|
|
57
55
|
"dependencies": {
|
|
58
56
|
"zod": "^4.3.5",
|
|
59
|
-
"@plaited/development-skills": "0.6.
|
|
57
|
+
"@plaited/development-skills": "0.6.3"
|
|
60
58
|
},
|
|
61
59
|
"peerDependencies": {
|
|
62
60
|
"typescript-language-server": "^5.1.3",
|
package/src/acp-client.ts
CHANGED
|
@@ -22,7 +22,6 @@ import type {
|
|
|
22
22
|
Implementation,
|
|
23
23
|
InitializeRequest,
|
|
24
24
|
InitializeResponse,
|
|
25
|
-
McpServer,
|
|
26
25
|
PromptRequest,
|
|
27
26
|
PromptResponse,
|
|
28
27
|
RequestPermissionRequest,
|
|
@@ -277,18 +276,22 @@ export const createACPClient = (config: ACPClientConfig) => {
|
|
|
277
276
|
/**
|
|
278
277
|
* Creates a new conversation session.
|
|
279
278
|
*
|
|
280
|
-
* @
|
|
279
|
+
* @remarks
|
|
280
|
+
* MCP servers are auto-discovered by the agent from configuration files
|
|
281
|
+
* in the working directory (e.g., `.mcp.json`, `.gemini/settings.json`).
|
|
282
|
+
*
|
|
283
|
+
* @param params - Session parameters with working directory
|
|
281
284
|
* @returns The created session
|
|
282
285
|
* @throws {ACPClientError} If not connected
|
|
283
286
|
*/
|
|
284
|
-
const createSession = async (params: { cwd: string
|
|
287
|
+
const createSession = async (params: { cwd: string }): Promise<Session> => {
|
|
285
288
|
if (!transport?.isConnected()) {
|
|
286
289
|
throw new ACPClientError('Not connected')
|
|
287
290
|
}
|
|
288
291
|
|
|
289
292
|
const response = await transport.request<{ sessionId: string }>(ACP_METHODS.CREATE_SESSION, {
|
|
290
293
|
cwd: params.cwd,
|
|
291
|
-
mcpServers:
|
|
294
|
+
mcpServers: [], // Required field - empty array lets agents auto-discover from cwd
|
|
292
295
|
})
|
|
293
296
|
return { id: response.sessionId }
|
|
294
297
|
}
|
package/src/adapter-check.ts
CHANGED
package/src/adapter-scaffold.ts
CHANGED
|
@@ -296,7 +296,6 @@ import { sessionManager } from '../session-manager.ts'
|
|
|
296
296
|
|
|
297
297
|
type SessionNewParams = {
|
|
298
298
|
cwd: string
|
|
299
|
-
mcpServers?: unknown[]
|
|
300
299
|
}
|
|
301
300
|
|
|
302
301
|
type SessionNewResult = {
|
|
@@ -304,12 +303,11 @@ type SessionNewResult = {
|
|
|
304
303
|
}
|
|
305
304
|
|
|
306
305
|
export const handleSessionNew = async (params: unknown): Promise<SessionNewResult> => {
|
|
307
|
-
const { cwd
|
|
306
|
+
const { cwd } = params as SessionNewParams
|
|
308
307
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
})
|
|
308
|
+
// MCP servers are discovered from cwd configuration files
|
|
309
|
+
// (e.g., .mcp.json, .gemini/settings.json)
|
|
310
|
+
const sessionId = sessionManager.createSession({ cwd })
|
|
313
311
|
|
|
314
312
|
return { sessionId }
|
|
315
313
|
}
|
|
@@ -438,19 +436,17 @@ import { randomUUID } from 'node:crypto'
|
|
|
438
436
|
type Session = {
|
|
439
437
|
id: string
|
|
440
438
|
cwd: string
|
|
441
|
-
mcpServers: unknown[]
|
|
442
439
|
createdAt: Date
|
|
443
440
|
}
|
|
444
441
|
|
|
445
442
|
class SessionManager {
|
|
446
443
|
#sessions = new Map<string, Session>()
|
|
447
444
|
|
|
448
|
-
createSession(params: { cwd: string
|
|
445
|
+
createSession(params: { cwd: string }): string {
|
|
449
446
|
const id = \`sess_\${randomUUID().slice(0, 8)}\`
|
|
450
447
|
this.#sessions.set(id, {
|
|
451
448
|
id,
|
|
452
449
|
cwd: params.cwd,
|
|
453
|
-
mcpServers: params.mcpServers,
|
|
454
450
|
createdAt: new Date(),
|
|
455
451
|
})
|
|
456
452
|
return id
|
|
@@ -550,13 +546,15 @@ from typing import Any, Dict, Optional
|
|
|
550
546
|
sessions: Dict[str, Dict[str, Any]] = {}
|
|
551
547
|
|
|
552
548
|
|
|
553
|
-
def create_session(cwd: str
|
|
554
|
-
"""Create a new session.
|
|
549
|
+
def create_session(cwd: str) -> str:
|
|
550
|
+
"""Create a new session.
|
|
551
|
+
|
|
552
|
+
MCP servers are discovered from cwd configuration files.
|
|
553
|
+
"""
|
|
555
554
|
session_id = f"sess_{uuid.uuid4().hex[:8]}"
|
|
556
555
|
sessions[session_id] = {
|
|
557
556
|
"id": session_id,
|
|
558
557
|
"cwd": cwd,
|
|
559
|
-
"mcp_servers": mcp_servers,
|
|
560
558
|
}
|
|
561
559
|
return session_id
|
|
562
560
|
|
|
@@ -597,10 +595,13 @@ def handle_initialize(params: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
597
595
|
|
|
598
596
|
|
|
599
597
|
def handle_session_new(params: Dict[str, Any]) -> Dict[str, Any]:
|
|
600
|
-
"""Handle session/new request.
|
|
598
|
+
"""Handle session/new request.
|
|
599
|
+
|
|
600
|
+
MCP servers are discovered from cwd configuration files
|
|
601
|
+
(e.g., .mcp.json, .gemini/settings.json).
|
|
602
|
+
"""
|
|
601
603
|
cwd = params.get("cwd", ".")
|
|
602
|
-
|
|
603
|
-
session_id = create_session(cwd, mcp_servers)
|
|
604
|
+
session_id = create_session(cwd)
|
|
604
605
|
return {"sessionId": session_id}
|
|
605
606
|
|
|
606
607
|
|
package/src/calibrate.ts
CHANGED
|
@@ -57,17 +57,37 @@ const loadResults = async (path: string): Promise<CaptureResult[]> => {
|
|
|
57
57
|
}
|
|
58
58
|
|
|
59
59
|
/**
|
|
60
|
-
*
|
|
60
|
+
* Randomly sample n elements from an array using Fisher-Yates shuffle.
|
|
61
61
|
*
|
|
62
62
|
* @param arr - Array to sample from
|
|
63
63
|
* @param n - Number of samples to take
|
|
64
|
-
* @returns Array of sampled elements
|
|
64
|
+
* @returns Array of sampled elements in random order
|
|
65
|
+
*
|
|
66
|
+
* @remarks
|
|
67
|
+
* Uses Fisher-Yates (Knuth) shuffle for uniform distribution.
|
|
68
|
+
* Creates a copy to avoid mutating the input array.
|
|
69
|
+
* O(n) time complexity with O(n) space for the copy.
|
|
70
|
+
* Not cryptographically secure (uses Math.random).
|
|
65
71
|
*
|
|
66
72
|
* @public
|
|
67
73
|
*/
|
|
68
74
|
export const sampleArray = <T>(arr: T[], n: number): T[] => {
|
|
69
|
-
|
|
70
|
-
|
|
75
|
+
if (n <= 0) return []
|
|
76
|
+
if (n >= arr.length) return [...arr]
|
|
77
|
+
|
|
78
|
+
const copy = [...arr]
|
|
79
|
+
|
|
80
|
+
// Fisher-Yates shuffle working backwards through array
|
|
81
|
+
// Only shuffle enough elements to get n samples
|
|
82
|
+
const limit = copy.length - n
|
|
83
|
+
for (let i = copy.length - 1; i >= limit && i > 0; i--) {
|
|
84
|
+
// Random index from 0 to i (inclusive)
|
|
85
|
+
const j = Math.floor(Math.random() * (i + 1))
|
|
86
|
+
// Swap elements
|
|
87
|
+
;[copy[i], copy[j]] = [copy[j]!, copy[i]!]
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
return copy.slice(-n)
|
|
71
91
|
}
|
|
72
92
|
|
|
73
93
|
/**
|
|
@@ -129,8 +149,8 @@ const formatCalibrationMarkdown = (samples: CalibrationSample[]): string => {
|
|
|
129
149
|
lines.push(`**Input:** ${sample.input}`)
|
|
130
150
|
lines.push('')
|
|
131
151
|
|
|
132
|
-
if (sample.
|
|
133
|
-
lines.push(`**
|
|
152
|
+
if (sample.hint) {
|
|
153
|
+
lines.push(`**Hint:** ${sample.hint}`)
|
|
134
154
|
lines.push('')
|
|
135
155
|
}
|
|
136
156
|
|
|
@@ -212,7 +232,7 @@ export const runCalibrate = async (config: CalibrateConfig): Promise<Calibration
|
|
|
212
232
|
id: result.id,
|
|
213
233
|
input: result.input,
|
|
214
234
|
output: result.output,
|
|
215
|
-
|
|
235
|
+
hint: result.hint,
|
|
216
236
|
originalScore: result.score as GraderResult,
|
|
217
237
|
trajectorySnippet: getTrajectorySnippet(result.trajectory),
|
|
218
238
|
}
|
|
@@ -222,7 +242,7 @@ export const runCalibrate = async (config: CalibrateConfig): Promise<Calibration
|
|
|
222
242
|
calibrationSample.rescoredResult = await grader({
|
|
223
243
|
input: result.input,
|
|
224
244
|
output: result.output,
|
|
225
|
-
|
|
245
|
+
hint: result.hint,
|
|
226
246
|
trajectory: result.trajectory,
|
|
227
247
|
})
|
|
228
248
|
}
|