claude-overnight 1.51.1 → 1.51.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +188 -74
- package/dist/cli/help.js +1 -1
- package/dist/cli/settings.js +3 -3
- package/dist/core/_version.d.ts +1 -1
- package/dist/core/_version.js +1 -1
- package/dist/prompt-evolution/adapters/mcp-browser.d.ts +70 -0
- package/dist/prompt-evolution/adapters/mcp-browser.js +358 -0
- package/dist/prompt-evolution/curator.d.ts +22 -0
- package/dist/prompt-evolution/curator.js +123 -0
- package/dist/prompt-evolution/evaluator.d.ts +34 -0
- package/dist/prompt-evolution/evaluator.js +192 -0
- package/dist/prompt-evolution/fixtures/plan-cases.d.ts +13 -0
- package/dist/prompt-evolution/fixtures/plan-cases.js +163 -0
- package/dist/prompt-evolution/index.d.ts +50 -0
- package/dist/prompt-evolution/index.js +178 -0
- package/dist/prompt-evolution/mutator.d.ts +21 -0
- package/dist/prompt-evolution/mutator.js +124 -0
- package/dist/prompt-evolution/scorer.d.ts +14 -0
- package/dist/prompt-evolution/scorer.js +120 -0
- package/dist/prompt-evolution/types.d.ts +113 -0
- package/dist/prompt-evolution/types.js +11 -0
- package/dist/providers/index.js +4 -4
- package/package.json +16 -3
- package/plugins/claude-overnight/.claude-plugin/plugin.json +2 -2
- package/plugins/claude-overnight/skills/claude-overnight/SKILL.md +36 -32
- package/plugins/claude-overnight/skills/claude-overnight/authoring.md +13 -13
- package/plugins/claude-overnight/skills/claude-overnight/recipes.md +8 -8
- package/plugins/claude-overnight/skills/coach/SKILL.md +34 -34
package/README.md
CHANGED
|
@@ -1,16 +1,62 @@
|
|
|
1
1
|
# claude-overnight
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Overnight coding swarms in isolated git worktrees that plan, execute, review, and steer themselves until the objective is met. Hand it a goal and a budget, walk away, review the diff in the morning.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Every agent runs in its own worktree on its own branch, so a misbehaving session cannot trash your working tree. Unmerged branches are preserved for manual review, never discarded. Set a usage cap (say 90%) and your interactive Claude Code still has headroom to answer questions while the swarm runs.
|
|
6
6
|
|
|
7
|
-
Built on the [Claude Agent SDK](https://www.npmjs.com/package/@anthropic-ai/claude-agent-sdk)
|
|
7
|
+
Built on the [Claude Agent SDK](https://www.npmjs.com/package/@anthropic-ai/claude-agent-sdk): every planner, worker, reviewer, and verifier session runs on the SDK's agent harness with full session resume, streaming, and transcripts. `claude-overnight` is the orchestrator around that harness. It plans, routes, curates, resumes, and persists many SDK sessions at once. Because the harness speaks the Anthropic Messages API, any compatible endpoint plugs in as a role.
|
|
8
|
+
|
|
9
|
+
## Three execution layers, mix per run
|
|
10
|
+
|
|
11
|
+
| Layer | Runs on | What it does |
|
|
12
|
+
|---|---|---|
|
|
13
|
+
| Planner (harness) | Opus 4.6, Sonnet 4.6 | Thinking wave, orchestration, steering, post-wave review, final gate |
|
|
14
|
+
| Main worker | Sonnet, Gemini 2.5, Qwen 3.6 Plus, DeepSeek, any Anthropic-compatible endpoint | Bulk implementation |
|
|
15
|
+
| Fast worker (optional) | Kimi 2.6 Coding, Cursor composer-2, Haiku | Cheap well-scoped tasks, double-checked by later waves |
|
|
16
|
+
|
|
17
|
+
A common recipe: **Opus planner + Sonnet bulk worker + Cursor composer-2 fast worker**. Another: **Opus planner + Kimi 2.6 bulk worker + Haiku fast worker**. Providers are saved once to `~/.claude/claude-overnight/providers.json` and appear in every future run. The bundled `cursor-composer-in-claude` proxy makes Cursor-hosted models (`auto`, `composer`, `composer-2`) look like a normal provider.
|
|
18
|
+
|
|
19
|
+
## What this recipe does that others do not
|
|
20
|
+
|
|
21
|
+
**Self-curating skill memory that improves mid-run.** Workers emit memory candidates when they discover something reusable: a repo-specific quirk, a recovery path, a command sequence that worked, a tool recipe worth saving. A scribe appends each candidate to disk without blocking the run. At the end of every wave, a **librarian** pass curates the queue. It promotes candidates into canon, patches existing skills via diff-style edits, or quarantines stale ones. **Wave N+1 of the same run starts with a better skill library than wave N.** Across runs, the library compounds. Inspired by Nous Research's Hermes Agent (Feb 2026), with progressive disclosure (L0 stub in every prompt, L1 body loaded on demand, L2 references on request), SQLite FTS5 retrieval, and per-skill win-rate tracking that auto-quarantines rot.
|
|
22
|
+
|
|
23
|
+
**Self-fixing, not just self-running.** Every task agent reviews its own `git diff` via SDK session resume (same session, full task context, no re-prompting) and runs a simplify pass before the commit lands. After each wave a dedicated review agent scans the consolidated diff for cross-agent issues the individual sessions could not see: missed reuse, copy-paste variations, leaky abstractions. When steering declares the objective done, a final gate reviews the full `git diff main` for architecture coherence before anything reaches your working tree.
|
|
24
|
+
|
|
25
|
+
**Multi-wave autonomous loop, not fire-and-forget.** After each wave a steering pass asks "how good is this?" and chooses between executing more tasks, spinning up a deeper reflection wave, or declaring done. The loop keeps going until steering is satisfied, the budget is exhausted, or the usage cap trips. Long runs keep a living status snapshot, archived milestones every five waves, and an evolving goal file, so steering picks up exactly where it left off after a rate limit or an overnight stop.
|
|
26
|
+
|
|
27
|
+
**Headroom-aware usage cap.** Set the cap to 90% of your 5h window and the swarm stops accepting new work there. Your interactive Claude Code keeps the remaining 10% to answer questions or run its own sessions while the overnight run grinds on.
|
|
28
|
+
|
|
29
|
+
**Crash-safe by design.** Planner state, the task plan, design docs, per-query NDJSON transcripts, steering decisions, and wave milestones all land on disk as they are produced. If the process dies mid-plan, the next resume salvages `tasks.json` and skips the expensive thinking wave. Planner crashes do not lose the $2 to $4 of orchestration work that already happened.
|
|
30
|
+
|
|
31
|
+
## Run on Kimi 2.6
|
|
32
|
+
|
|
33
|
+
Want a cheap Anthropic-compatible worker with a simple shell setup? Kimi 2.6 via Kimi's coding endpoint is a drop-in worker that speaks the Anthropic Messages API: same client, same flow, just a different base URL.
|
|
34
|
+
|
|
35
|
+
1. **Configure the provider.** Run `claude-overnight`, choose `Other…` on the worker step, and fill in:
|
|
36
|
+
|
|
37
|
+
| Field | Value |
|
|
38
|
+
|---|---|
|
|
39
|
+
| Name | `Kimi 2.6` |
|
|
40
|
+
| Base URL | `https://api.kimi.com/coding/` |
|
|
41
|
+
| Model id | `kimi-for-coding` |
|
|
42
|
+
| API key | your Kimi coding key |
|
|
43
|
+
|
|
44
|
+
2. That's it. Planner runs on Sonnet (or Opus), worker runs on Kimi.
|
|
45
|
+
|
|
46
|
+
Or set it via env directly:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
export ANTHROPIC_BASE_URL="https://api.kimi.com/coding/"
|
|
50
|
+
export ANTHROPIC_API_KEY="sk-kimi-..."
|
|
51
|
+
export ANTHROPIC_MODEL="kimi-for-coding"
|
|
52
|
+
claude-overnight
|
|
53
|
+
```
|
|
8
54
|
|
|
9
55
|
## Run on Qwen 3.6 Plus
|
|
10
56
|
|
|
11
|
-
Hit your Claude Max plan limits? Running on a tight budget? Qwen 3.6 Plus via Alibaba Cloud's DashScope gateway is a drop-in worker that speaks the Anthropic Messages API
|
|
57
|
+
Hit your Claude Max plan limits? Running on a tight budget? Qwen 3.6 Plus via Alibaba Cloud's DashScope gateway is a drop-in worker that speaks the Anthropic Messages API: same client, same flow, pennies per run.
|
|
12
58
|
|
|
13
|
-
1. **Get an API key.** Sign up at [Alibaba Cloud](https://account.alibabacloud.com/login/login.htm?oauth_callback=https%3A%2F%2Fmodelstudio.console.alibabacloud.com%2Fap-southeast-1%3Ftab%3Ddashboard%23%2Fapi-key&clearRedirectCookie=1)
|
|
59
|
+
1. **Get an API key.** Sign up at [Alibaba Cloud](https://account.alibabacloud.com/login/login.htm?oauth_callback=https%3A%2F%2Fmodelstudio.console.alibabacloud.com%2Fap-southeast-1%3Ftab%3Ddashboard%23%2Fapi-key&clearRedirectCookie=1). The link takes you straight to the API key dashboard.
|
|
14
60
|
2. **Configure the provider.** Run `claude-overnight`, choose `Other…` on the worker step, and fill in:
|
|
15
61
|
|
|
16
62
|
| Field | Value |
|
|
@@ -31,9 +77,9 @@ export ANTHROPIC_MODEL="qwen3.6-plus"
|
|
|
31
77
|
claude-overnight
|
|
32
78
|
```
|
|
33
79
|
|
|
34
|
-
## Run via Cursor
|
|
80
|
+
## Run via Bundled Cursor Proxy
|
|
35
81
|
|
|
36
|
-
Use Cursor
|
|
82
|
+
Use Cursor-hosted models (`auto`, `composer`, `composer-2`, etc.) through the bundled `cursor-composer-in-claude` proxy. `claude-overnight` auto-starts that local Anthropic-compatible proxy, injects the per-worktree workspace header, and treats Cursor as just another provider for the planner, main worker, or fast worker.
|
|
37
83
|
|
|
38
84
|
### macOS: Cursor agent shell patch
|
|
39
85
|
|
|
@@ -56,50 +102,44 @@ alias agent="run_cursor_agent"
|
|
|
56
102
|
|
|
57
103
|
`claude-overnight` prints a one-time notice when you use the Cursor proxy and this snippet is not detected in `~/.zshrc` or `~/.zprofile`. The bundled proxy also sets `CURSOR_AGENT_NODE` / `CURSOR_AGENT_SCRIPT` when it can find `node` and `cursor-agent`, but your interactive shell still benefits from the alias.
|
|
58
104
|
|
|
59
|
-
1. **Install the Cursor CLI
|
|
105
|
+
1. **Install the Cursor CLI:**
|
|
60
106
|
|
|
61
107
|
```bash
|
|
62
108
|
curl https://cursor.com/install -fsS | bash
|
|
63
|
-
npm install -g cursor-api-proxy
|
|
64
109
|
```
|
|
65
110
|
|
|
66
111
|
2. **Get an API key.** Visit [cursor.com/dashboard/integrations](https://cursor.com/dashboard/integrations) and scroll to the "API Keys" section.
|
|
67
112
|
|
|
68
|
-
3. **Set up.** Run `claude-overnight` and when prompted to pick a model, choose **Cursor…**. It walks you through a one-time setup: CLI check, API key entry (persisted to `providers.json`),
|
|
113
|
+
3. **Set up.** Run `claude-overnight` and when prompted to pick a model, choose **Cursor…**. It walks you through a one-time setup: CLI check, API key entry (persisted to `providers.json`), bundled proxy verification, and health check.
|
|
69
114
|
|
|
70
|
-
4.
|
|
71
|
-
|
|
72
|
-
```bash
|
|
73
|
-
npx cursor-api-proxy
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
5. Pick your model (`auto`, `composer`, `composer-2`, etc.). The provider is saved and reappears in every future run.
|
|
115
|
+
4. Pick your model (`auto`, `composer`, `composer-2`, etc.). The provider is saved and reappears in every future run.
|
|
77
116
|
|
|
78
117
|
Or configure the key manually:
|
|
79
118
|
|
|
80
119
|
```bash
|
|
81
|
-
export CURSOR_BRIDGE_API_KEY="
|
|
82
|
-
npx cursor-api-proxy &
|
|
120
|
+
export CURSOR_BRIDGE_API_KEY="crsr_..."
|
|
83
121
|
claude-overnight
|
|
84
122
|
```
|
|
85
123
|
|
|
86
|
-
|
|
124
|
+
If the bundled proxy cannot auto-start, the setup wizard prints the exact `node ".../cursor-composer-in-claude/dist/cli.js"` command for this install so you can launch the same embedded proxy manually.
|
|
125
|
+
|
|
126
|
+
**Tip:** once a Cursor provider is saved, run `claude-overnight` with the `--model=cursor-auto` flag in non-interactive mode to skip the picker. If the proxy isn't running at startup, the tool attempts to restart it automatically.
|
|
87
127
|
|
|
88
128
|
### macOS: “Keychain Not Found” / `cursor-user`
|
|
89
129
|
|
|
90
|
-
The Cursor **`agent`** binary stores an interactive login as **`cursor-user`** in your **login** keychain. For automation, use a **[User API key](https://cursor.com/docs/cli/headless)** (`export CURSOR_API_KEY=...` from [Integrations](https://cursor.com/dashboard/integrations))
|
|
130
|
+
The Cursor **`agent`** binary stores an interactive login as **`cursor-user`** in your **login** keychain. For automation, use a **[User API key](https://cursor.com/docs/cli/headless)** (`export CURSOR_API_KEY=...` from [Integrations](https://cursor.com/dashboard/integrations)); the bundled proxy then does not need Keychain. `claude-overnight` forces `CURSOR_SKIP_KEYCHAIN=1` and `CI=true`. If System Settings still shows **“A keychain cannot be found to store …”**, the login keychain is often missing or damaged: open **Keychain Access → First Aid** on **login**, or use **Reset To Defaults** in the dialog. Some users fix a stuck keychain with:
|
|
91
131
|
|
|
92
132
|
```bash
|
|
93
133
|
security unlock-keychain ~/Library/Keychains/login.keychain-db
|
|
94
134
|
```
|
|
95
135
|
|
|
96
|
-
**Automation:** Saving a key via **Cursor…** in `claude-overnight` is enough
|
|
136
|
+
**Automation:** Saving a key via **Cursor…** in `claude-overnight` is enough. It is written to `providers.json` and injected into both the Claude SDK env and the bundled proxy (including `CURSOR_API_KEY` for the native `agent`). You do not need to `export` variables unless you want to override for one shell.
|
|
97
137
|
|
|
98
138
|
**Advanced:** If something else must share port `8765` and you manage the proxy yourself, set `CURSOR_OVERNIGHT_NO_PROXY_RESTART=1` to skip the automatic “replace listener” step when a Cursor API token is present.
|
|
99
139
|
|
|
100
140
|
**How headless Cursor + macOS Keychain actually works (discovery):** We documented the full investigation: why ACP was the wrong path for opus/sonnet `*-thinking-*` variants (model-name mismatch → silent `exit 1`), how **chat-only workspace** (default in cursor-composer) fakes `HOME` and triggers **Keychain timeouts** despite a User API key, and how a cloned **account pool** makes parallel cursor-agent spawns race-free. See **[docs/CURSOR_PROXY_MACOS_DISCOVERY.md](docs/CURSOR_PROXY_MACOS_DISCOVERY.md)**.
|
|
101
141
|
|
|
102
|
-
**Quick reference
|
|
142
|
+
**Quick reference, bundled proxy env:** `CURSOR_BRIDGE_USE_ACP=0` (CLI streaming path accepts all friendly model names), `CURSOR_BRIDGE_CHAT_ONLY_WORKSPACE=false`, `CURSOR_CONFIG_DIRS=<5 cloned pool dirs>` (parallel-safe), plus `CURSOR_API_KEY` / `CURSOR_AUTH_TOKEN` / `CURSOR_BRIDGE_API_KEY` and `CURSOR_SKIP_KEYCHAIN=1` / `CI=true`. Details and tables are in the doc above.
|
|
103
143
|
|
|
104
144
|
**Regression / stress test:** `npm run matrix:cursor-proxy` (optional `--quick`, `--include-danger`). Use `MATRIX_MODELS=composer-2,claude-opus-4-7-thinking-high` to compare models; override `MATRIX_PORT_BASE`, `MATRIX_MODEL`, `MATRIX_MSG_TIMEOUT_MS` as needed.
|
|
105
145
|
|
|
@@ -109,7 +149,7 @@ security unlock-keychain ~/Library/Keychains/login.keychain-db
|
|
|
109
149
|
npm install -g claude-overnight
|
|
110
150
|
```
|
|
111
151
|
|
|
112
|
-
Requires Node.js ≥ 20
|
|
152
|
+
Requires Node.js ≥ 20. For Anthropic-direct roles, use `claude auth login` or `ANTHROPIC_API_KEY`. For provider-backed roles, save a Kimi / Qwen / Cursor / OpenRouter-compatible provider instead. No Anthropic plan or key? See **Run on Kimi 2.6** or **Run on Qwen 3.6 Plus** above for cheap drop-in alternatives.
|
|
113
153
|
|
|
114
154
|
## Quick start
|
|
115
155
|
|
|
@@ -126,13 +166,13 @@ claude-overnight
|
|
|
126
166
|
|
|
127
167
|
② Budget [10]: 200
|
|
128
168
|
|
|
129
|
-
④ Planner model (thinking, steering
|
|
130
|
-
● Opus
|
|
131
|
-
○ Sonnet
|
|
169
|
+
④ Planner model (thinking, steering; use your strongest):
|
|
170
|
+
● Opus · Opus 4.6 · Most capable
|
|
171
|
+
○ Sonnet · Sonnet 4.6 · Best for everyday tasks
|
|
132
172
|
|
|
133
|
-
⑤ Worker model (
|
|
134
|
-
● Sonnet
|
|
135
|
-
○ Opus
|
|
173
|
+
⑤ Worker model (runs the tasks; Kimi 2.6 / Qwen 3.6 Plus / OpenRouter / etc via Other…):
|
|
174
|
+
● Sonnet · Sonnet 4.6 · Best for everyday tasks
|
|
175
|
+
○ Opus · Opus 4.6 · Most capable
|
|
136
176
|
○ Other… · custom OpenAI/Anthropic-compatible endpoint
|
|
137
177
|
|
|
138
178
|
⑥ Usage cap:
|
|
@@ -159,49 +199,93 @@ claude-overnight
|
|
|
159
199
|
◆ Assessing... ✓ Done
|
|
160
200
|
```
|
|
161
201
|
|
|
162
|
-
You interact once (objective, budget, model, review themes), then the rest runs unattended
|
|
202
|
+
You interact once (objective, budget, model, review themes), then the rest runs unattended: thinking, planning, executing, curating memory, reflecting, steering. Rate-limited? It waits and retries. Crash? Resume where you left off. Capped at usage limit? Pick up next time with full context preserved.
|
|
163
203
|
|
|
164
204
|
## Use cases
|
|
165
205
|
|
|
166
206
|
Overnight refactors, batch feature implementation, codebase-wide cleanups, test generation, documentation sprints, framework migrations, quality audits, long research runs. One objective + a budget + walk away.
|
|
167
207
|
|
|
168
|
-
##
|
|
208
|
+
## Typical flow
|
|
169
209
|
|
|
170
|
-
|
|
210
|
+
```
|
|
211
|
+
┌─ Setup + planning ──────────────────────────────────────────────┐
|
|
212
|
+
│ start/resume → coach rewrites objective → pick planner, │
|
|
213
|
+
│ worker, fast worker → provider preflight → theme review │
|
|
214
|
+
│ → thinking wave (parallel architects) → task orchestration │
|
|
215
|
+
└──────────────────────┬──────────────────────────────────────────┘
|
|
216
|
+
│
|
|
217
|
+
┌─ Wave loop ──────────▼──────────────────────────────────────────┐
|
|
218
|
+
│ beforeWave hook → execution wave (workers in worktrees) │
|
|
219
|
+
│ → per-agent simplify pass (session resume on same context) │
|
|
220
|
+
│ → debrief + afterWave hook → post-wave review agent │
|
|
221
|
+
│ → librarian curates skill candidates into canon │
|
|
222
|
+
│ → steering decides: execute more │ reflect deeper │ done │
|
|
223
|
+
│ ↑ │
|
|
224
|
+
│ └── loop until done, budget out, or cap hit │
|
|
225
|
+
│ → final gate reviews full `git diff main` │
|
|
226
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
227
|
+
|
|
228
|
+
┌─ Skill memory (compounds within a run and across runs) ─────────┐
|
|
229
|
+
│ workers emit candidates → scribe writes to disk │
|
|
230
|
+
│ → librarian curates at wave end → canon markdown + │
|
|
231
|
+
│ SQLite FTS5 index updated → next wave gets an L0 stub, │
|
|
232
|
+
│ hydrates L1 body on demand, L2 references on request │
|
|
233
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
This is the main user-visible lifecycle. Engine-internal branches (health-check heal tasks, A/B skill assignment across sibling branches, zero-work retry, budget-extension prompts, resume salvage after planning crashes) are omitted for clarity.
|
|
237
|
+
|
|
238
|
+
### 1. Thinking phase: parallel architect sessions
|
|
171
239
|
|
|
172
240
|
For budgets > 15, the tool launches **architect agents** that explore your codebase before any code is written. Each one gets a different research angle (architecture, data models, APIs, testing, etc.) and writes a structured design document. The number scales with budget: 5 for budget=50, 10 for budget=2000.
|
|
173
241
|
|
|
174
242
|
### 2. Task orchestration
|
|
175
243
|
|
|
176
|
-
An orchestrator session reads all design documents and synthesizes concrete execution tasks
|
|
244
|
+
An orchestrator session reads all design documents and synthesizes concrete execution tasks, grounded in real files and patterns the architects found. The task plan is also written to a file for resilience: if orchestration is interrupted, partial results survive.
|
|
177
245
|
|
|
178
246
|
### 3. Parallel execution waves
|
|
179
247
|
|
|
180
|
-
Tasks run in parallel agent sessions (each in its own git worktree). After completing its task, each session automatically runs a **simplify pass
|
|
248
|
+
Tasks run in parallel agent sessions (each in its own git worktree). After completing its task, each session automatically runs a **simplify pass**, reviewing its own `git diff` for code reuse opportunities, quality issues, and inefficiencies, then fixing them before the framework commits. This is done via the SDK's **session resume** mechanism: the same agent session continues with a follow-up prompt, so the agent's full context from its task is still available, with no need to re-instruct or re-fill context. If a fast worker is configured, steering can route cheaper, well-scoped tasks there while the main worker handles heavier implementation.
|
|
181
249
|
|
|
182
250
|
### 4. Post-wave review
|
|
183
251
|
|
|
184
|
-
After each wave (flex mode, budget remaining), a dedicated **review agent** inspects the consolidated diff for issues the individual agents may have blind-spotted: missed reuse opportunities, copy-paste variations, leaky abstractions, efficiency regressions. Runs as a single-agent wave
|
|
252
|
+
After each wave (flex mode, budget remaining), a dedicated **review agent** inspects the consolidated diff for issues the individual agents may have blind-spotted: missed reuse opportunities, copy-paste variations, leaky abstractions, efficiency regressions. It runs as a single-agent wave: one session reviews what the swarm just produced.
|
|
185
253
|
|
|
186
|
-
### 5.
|
|
254
|
+
### 5. Librarian and dynamic memory
|
|
187
255
|
|
|
188
|
-
|
|
256
|
+
During execution, workers can emit **memory candidates** when they discover something reusable: a repo-specific quirk, a recovery path, a command sequence that worked, or a tool recipe worth reusing later. The scribe writes those candidates to `~/.claude-overnight/skills/<repo-fingerprint>/candidates/` without blocking the run.
|
|
257
|
+
|
|
258
|
+
At the end of each wave, a **librarian** pass curates that queue. It can promote a candidate into canon, patch an existing skill, quarantine stale skills, or reject weak / duplicated candidates. Canon lives on disk as markdown; SQLite is only the ranked index. This is what makes the memory system dynamic rather than a fixed prompt blob.
|
|
189
259
|
|
|
190
260
|
### 6. Steering
|
|
191
261
|
|
|
192
|
-
After each wave, steering
|
|
262
|
+
After each wave, steering asks "how good is this?" rather than "what's missing?". It can:
|
|
193
263
|
|
|
194
264
|
- **Execute** more tasks to build features, fix bugs, polish UX
|
|
195
265
|
- **Reflect** by spinning up 1-2 review sessions for deep quality/architecture audits
|
|
196
266
|
- **Declare done** when the vision is met at high quality
|
|
197
267
|
|
|
198
|
-
###
|
|
268
|
+
### 7. Post-run final gate
|
|
269
|
+
|
|
270
|
+
When the run completes (steering declares done), a final **comprehensive review** runs against the full `git diff main`. It checks architecture coherence, consistency with existing patterns, build integrity, and that the tests pass. This is the last quality gate before the diff lands.
|
|
271
|
+
|
|
272
|
+
### Run-memory layers
|
|
273
|
+
|
|
274
|
+
Long runs stay sharp because steering maintains three run-memory layers:
|
|
275
|
+
|
|
276
|
+
- **Status**: a living project snapshot, updated every wave. Compressed, never truncated.
|
|
277
|
+
- **Milestones**: strategic snapshots archived every ~5 waves. Long-term memory.
|
|
278
|
+
- **Goal**: the evolving north star. What quality means for this codebase.
|
|
199
279
|
|
|
200
|
-
|
|
280
|
+
### Progressive-disclosure repo memory
|
|
201
281
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
- **
|
|
282
|
+
The repo memory system is separate from the run folder and is designed around three disclosure layers so context stays small:
|
|
283
|
+
|
|
284
|
+
- **L0**: a tiny ranked stub injected into planner and worker prompts. It lists only the names and descriptions of the most relevant project-specific skills and tool recipes.
|
|
285
|
+
- **L1**: the full skill body, loaded on demand with `skill_read(name)` when an agent wants the actual recipe or guidance.
|
|
286
|
+
- **L2**: attached references for deeper context. The library is structured for them even though most runs only need the L0 stub plus occasional L1 hydration.
|
|
287
|
+
|
|
288
|
+
That progressive disclosure matters: the planner and workers do not carry the full memory library in every prompt. They get a compact overview, call `skill_search(query)` if they need to narrow it, and hydrate only the bodies that matter for the task in front of them.
|
|
205
289
|
|
|
206
290
|
## Run history, resume, and knowledge carryforward
|
|
207
291
|
|
|
@@ -222,7 +306,7 @@ Every run gets its own folder in `.claude-overnight/runs/`. Nothing is ever over
|
|
|
222
306
|
run.json, transcripts/themes.ndjson ← see exactly what the planner was doing
|
|
223
307
|
```
|
|
224
308
|
|
|
225
|
-
Any run that stops before the steering system declares the objective complete
|
|
309
|
+
Any run that stops before the steering system declares the objective complete (capped at usage limit, Ctrl+C, crash, rate limit timeout, steering failure) is automatically resumable:
|
|
226
310
|
|
|
227
311
|
```
|
|
228
312
|
⚠ Unfinished run
|
|
@@ -237,7 +321,7 @@ Any run that stops before the steering system declares the objective complete -
|
|
|
237
321
|
|
|
238
322
|
On resume: unmerged branches auto-merge, the wave loop continues, all context is preserved. Designs and reflections stay on disk until the objective is truly complete.
|
|
239
323
|
|
|
240
|
-
If the thinking phase succeeds but orchestration crashes, the next run detects the orphaned design docs and reuses them
|
|
324
|
+
If the thinking phase succeeds but orchestration crashes, the next run detects the orphaned design docs and reuses them instead of re-running $9 worth of architect sessions:
|
|
241
325
|
|
|
242
326
|
```
|
|
243
327
|
✓ Reusing 5 design docs (from prior attempt)
|
|
@@ -247,25 +331,25 @@ If the thinking phase succeeds but orchestration crashes, the next run detects t
|
|
|
247
331
|
...
|
|
248
332
|
```
|
|
249
333
|
|
|
250
|
-
**Knowledge carries forward
|
|
334
|
+
**Knowledge carries forward.** New runs inherit knowledge from completed previous runs. Thinking sessions and steering see what past runs built. Run 2 knows run 1 already built the auth system.
|
|
251
335
|
|
|
252
336
|
### Transcripts and streaming
|
|
253
337
|
|
|
254
|
-
Every planner/steering query streams through the Agent SDK with `includePartialMessages: true`, so tool calls, thinking, and text deltas are captured as they happen. Each query also appends an NDJSON transcript under `runs/<ts>/transcripts/<name>.ndjson
|
|
338
|
+
Every planner/steering query streams through the Agent SDK with `includePartialMessages: true`, so tool calls, thinking, and text deltas are captured as they happen. Each query also appends an NDJSON transcript under `runs/<ts>/transcripts/<name>.ndjson`, so if the planner crashes mid-think you still have the forensic trail (prompt preview, every tool use, every text/thinking delta, rate-limit events, and the final result or error). `themes.md` is also written as a human-readable summary right after the thinking wave.
|
|
255
339
|
|
|
256
340
|
Not every provider delivers the same streaming granularity:
|
|
257
341
|
|
|
258
342
|
| Provider | Tool-use events | Thinking deltas | Text deltas |
|
|
259
343
|
| --- | --- | --- | --- |
|
|
260
344
|
| Anthropic (direct) | ✓ | ✓ | ✓ |
|
|
261
|
-
| Cursor proxy (`cursor-composer-in-claude`) |
|
|
262
|
-
| Qwen / OpenRouter / custom Anthropic-compatible | depends on upstream | depends | usually ✓ |
|
|
345
|
+
| Cursor proxy (`cursor-composer-in-claude`) | no | no | ✓ (final answer only) |
|
|
346
|
+
| Kimi / Qwen / OpenRouter / custom Anthropic-compatible | depends on upstream | depends | usually ✓ |
|
|
263
347
|
|
|
264
|
-
When a provider doesn't stream partials (or the model is a reasoning model on the Cursor proxy
|
|
348
|
+
When a provider doesn't stream partials (or the model is a reasoning model on the Cursor proxy, where the proxy suppresses the thinking phase and only emits the final answer), the ticker shows elapsed time with no live text, then the completed result lands in one go. The UI, transcripts, and the resume flow all behave identically either way: streaming is used when available, never required.
|
|
265
349
|
|
|
266
|
-
Add `.claude-overnight/` to your `.gitignore` (with the trailing slash
|
|
350
|
+
Add `.claude-overnight/` to your `.gitignore` (with the trailing slash; see below).
|
|
267
351
|
|
|
268
|
-
A separate, tiny `claude-overnight.log.md` is also written at the repo root on every run. It's human-readable, append-only, one block per run (objective, start/finish, cost, outcome, branch), and is designed to be **committed
|
|
352
|
+
A separate, tiny `claude-overnight.log.md` is also written at the repo root on every run. It's human-readable, append-only, one block per run (objective, start/finish, cost, outcome, branch), and is designed to be **committed**, so even after `.claude-overnight/` is cleaned up you can still recover which prompt produced which commits. Use `.claude-overnight/` (with trailing slash) in your gitignore so this file isn't matched by accident.
|
|
269
353
|
|
|
270
354
|
## Task file and inline modes
|
|
271
355
|
|
|
@@ -309,20 +393,20 @@ claude-overnight "fix auth bug in src/auth.ts" "add tests for user model"
|
|
|
309
393
|
|---|---|---|
|
|
310
394
|
| `--budget=N` | `10` | Total agent sessions |
|
|
311
395
|
| `--concurrency=N` | `5` | Parallel agents |
|
|
312
|
-
| `--model=NAME` | prompted | Worker model
|
|
396
|
+
| `--model=NAME` | prompted | Worker model. Interactive picks planner and worker separately; `Other…` adds Kimi / Qwen / OpenRouter / any Anthropic-compat endpoint. In non-interactive mode, a saved provider's model id is auto-resolved to the provider. |
|
|
313
397
|
| `--usage-cap=N` | unlimited | Stop at N% utilization |
|
|
314
398
|
| `--allow-extra-usage` | off | Allow extra/overage usage (billed separately) |
|
|
315
|
-
| `--extra-usage-budget=N` |
|
|
399
|
+
| `--extra-usage-budget=N` | | Max $ for extra usage (implies `--allow-extra-usage`) |
|
|
316
400
|
| `--timeout=SECONDS` | `900` | Inactivity timeout per agent (nudges at timeout, kills at 2×) |
|
|
317
|
-
| `--no-flex` |
|
|
318
|
-
| `--dry-run` |
|
|
401
|
+
| `--no-flex` | | Disable multi-wave steering |
|
|
402
|
+
| `--dry-run` | | Show planned tasks without running |
|
|
319
403
|
|
|
320
404
|
## Task file fields
|
|
321
405
|
|
|
322
406
|
| Field | Type | Default | Description |
|
|
323
407
|
|---|---|---|---|
|
|
324
408
|
| `tasks` | `(string \| {prompt, cwd?, model?})[]` | required | Tasks to run |
|
|
325
|
-
| `objective` | `string` |
|
|
409
|
+
| `objective` | `string` | | High-level goal for steering |
|
|
326
410
|
| `flexiblePlan` | `boolean` | `false` | Enable multi-wave planning |
|
|
327
411
|
| `model` | `string` | prompted | Worker model |
|
|
328
412
|
| `concurrency` | `number` | `5` | Parallel agents |
|
|
@@ -331,35 +415,42 @@ claude-overnight "fix auth bug in src/auth.ts" "add tests for user model"
|
|
|
331
415
|
| `mergeStrategy` | `"yolo" \| "branch"` | `"yolo"` | Merge into HEAD or new branch |
|
|
332
416
|
| `usageCap` | `number (0-100)` | unlimited | Stop at N% utilization |
|
|
333
417
|
|
|
334
|
-
## Custom providers (Qwen, OpenRouter, any Anthropic-compatible endpoint)
|
|
418
|
+
## Custom providers (Kimi, Qwen, OpenRouter, any Anthropic-compatible endpoint)
|
|
335
419
|
|
|
336
|
-
Planner, main worker, and optional fast worker are each picked separately
|
|
420
|
+
Planner, main worker, and optional fast worker are each picked separately. Pair Opus-on-Anthropic for the planner/thinker with a cheaper model on another provider for the bulk of work. The fast worker is a real worker (same tools, same env), just on a cheaper/faster model, and steering routes well-scoped tasks to it by default.
|
|
337
421
|
|
|
338
422
|
From the interactive picker, choose `Other…` on the planner, worker, or fast step:
|
|
339
423
|
|
|
340
424
|
```
|
|
341
|
-
⑤ Worker model (
|
|
425
|
+
⑤ Worker model (runs the tasks; Kimi 2.6 / Qwen 3.6 Plus / OpenRouter / etc via Other…):
|
|
342
426
|
○ Sonnet
|
|
343
427
|
○ Opus
|
|
344
428
|
● Other…
|
|
345
429
|
|
|
346
|
-
Name:
|
|
347
|
-
Base URL: https://
|
|
348
|
-
Model id:
|
|
430
|
+
Name: Kimi 2.6
|
|
431
|
+
Base URL: https://api.kimi.com/coding/
|
|
432
|
+
Model id: kimi-for-coding
|
|
349
433
|
API key source:
|
|
350
434
|
● Paste key now · stored plaintext in ~/.claude/claude-overnight/providers.json (0600)
|
|
351
435
|
○ Read from env var · nothing written to disk
|
|
352
436
|
```
|
|
353
437
|
|
|
438
|
+
Common examples:
|
|
439
|
+
|
|
440
|
+
| Name | Base URL | Model id |
|
|
441
|
+
|---|---|---|
|
|
442
|
+
| `Kimi 2.6` | `https://api.kimi.com/coding/` | `kimi-for-coding` |
|
|
443
|
+
| `Qwen 3.6 Plus` | `https://dashscope-intl.aliyuncs.com/apps/anthropic` | `qwen3.6-plus` |
|
|
444
|
+
|
|
354
445
|
Saved providers live user-level at `~/.claude/claude-overnight/providers.json` (mode 0600) and show up automatically in every repo. No per-project config.
|
|
355
446
|
|
|
356
|
-
**How routing works.** Each `query()` gets its own env override (`ANTHROPIC_BASE_URL` + `ANTHROPIC_AUTH_TOKEN`)
|
|
447
|
+
**How routing works.** Each `query()` gets its own env override (`ANTHROPIC_BASE_URL` + `ANTHROPIC_AUTH_TOKEN`): planner queries use the planner provider, main-worker queries use the worker provider, and fast-worker queries use the fast provider. No global shell env, no proxy daemon, no `process.env` pollution between calls.
|
|
357
448
|
|
|
358
449
|
**Pre-flight.** Before the swarm starts, each custom provider is pinged with a 1-turn auth check. Bad keys fail fast with `✗ worker preflight failed: ...` instead of N scattered mid-run errors.
|
|
359
450
|
|
|
360
451
|
**Resume.** Provider ids are persisted in `run.json` and rehydrated on resume. If you deleted a provider between runs, resume refuses to start and tells you exactly which id is missing.
|
|
361
452
|
|
|
362
|
-
**Non-interactive / CI.** `claude-overnight --model=qwen3.6-plus` auto-resolves the model id to a saved provider
|
|
453
|
+
**Non-interactive / CI.** `claude-overnight --model=kimi-for-coding` (or `--model=qwen3.6-plus`) auto-resolves the model id to a saved provider; no separate `--provider` flag is needed.
|
|
363
454
|
|
|
364
455
|
## Parallel Playwright Testing
|
|
365
456
|
|
|
@@ -397,8 +488,8 @@ See `QUICKSHEET_PLAYWRIGHT.md` for full config examples.
|
|
|
397
488
|
|
|
398
489
|
By default, extra/overage usage is **blocked**. When your plan's rate limits are exhausted, the run stops cleanly and is resumable. You control this in the interactive prompt (step ⑤) or via CLI flags:
|
|
399
490
|
|
|
400
|
-
- `--allow-extra-usage
|
|
401
|
-
- `--extra-usage-budget=20
|
|
491
|
+
- `--allow-extra-usage`: opt in to extra usage (billed separately)
|
|
492
|
+
- `--extra-usage-budget=20`: allow up to $20 of extra usage, then stop
|
|
402
493
|
|
|
403
494
|
### Live controls during execution
|
|
404
495
|
|
|
@@ -410,11 +501,11 @@ Press these keys while agents are running:
|
|
|
410
501
|
| `t` | Change usage cap threshold (0-100%) |
|
|
411
502
|
| `q` | Graceful stop (press twice to force quit) |
|
|
412
503
|
|
|
413
|
-
Changes take effect between waves
|
|
504
|
+
Changes take effect between waves; active agents finish their current task.
|
|
414
505
|
|
|
415
506
|
### Multi-window usage display
|
|
416
507
|
|
|
417
|
-
The usage bar cycles through all rate limit windows (5h, 7d, etc.) every 3 seconds, showing utilization per window. Usage info is shown during all phases
|
|
508
|
+
The usage bar cycles through all rate limit windows (5h, 7d, etc.) every 3 seconds, showing utilization per window. Usage info is shown during all phases: thinking, orchestration, steering, and execution.
|
|
418
509
|
|
|
419
510
|
When using extra usage with a budget, a dedicated progress bar shows spend vs limit with color-coded fill (magenta → yellow → red).
|
|
420
511
|
|
|
@@ -422,14 +513,14 @@ When using extra usage with a budget, a dedicated progress bar shows spend vs li
|
|
|
422
513
|
|
|
423
514
|
Built for unattended runs lasting hours or days.
|
|
424
515
|
|
|
425
|
-
- **Smooth overage transition**: when extra usage is allowed, plan limit rejection is seamless
|
|
426
|
-
- **Interrupt + resume**: agents and planner queries that go silent are interrupted and resumed with full conversation context via SDK session resume
|
|
516
|
+
- **Smooth overage transition**: when extra usage is allowed, a plan-limit rejection is handled seamlessly: dispatch is not blocked and agents continue into overage
|
|
517
|
+
- **Interrupt + resume**: agents and planner queries that go silent are interrupted and resumed with full conversation context via SDK session resume, not killed and restarted from scratch
|
|
427
518
|
- **Hard block**: pauses until the rate limit window resets, then resumes
|
|
428
519
|
- **Soft throttle**: slows dispatch at >75% utilization
|
|
429
520
|
- **Extra usage guard**: detects overage billing and stops unless explicitly allowed
|
|
430
521
|
- **Cooldown between phases**: waits for rate limit reset after thinking before starting orchestration
|
|
431
522
|
- **Retry with backoff**: transient errors (429, overloaded) retry automatically
|
|
432
|
-
- **Usage cap**: set a ceiling, active agents finish, no new ones start
|
|
523
|
+
- **Usage cap**: set a ceiling; once it is reached, active agents finish, no new ones start, and the run is resumable
|
|
433
524
|
- **Planner retries**: steering and orchestration retry on rate limits (30s/60s/120s backoff) with full context
|
|
434
525
|
|
|
435
526
|
## Git worktrees and branch merging
|
|
@@ -443,13 +534,36 @@ Conflicts retry with `-X theirs`. Unresolved branches are preserved for manual m
|
|
|
443
534
|
|
|
444
535
|
## Claude Code plugin
|
|
445
536
|
|
|
446
|
-
This repo
|
|
537
|
+
This repo ships a Claude Code plugin so any Claude instance (inside this repo or any other) knows how to use, inspect, and resume `claude-overnight` runs:
|
|
447
538
|
|
|
448
539
|
```
|
|
449
540
|
/plugin marketplace add Fornace/claude-overnight
|
|
450
541
|
/plugin install claude-overnight
|
|
451
542
|
```
|
|
452
543
|
|
|
544
|
+
The plugin includes a skill for **authoring runs outside the CLI**. Claude can help you pick the run shape, critique the budget and decomposition, and write a `tasks.json` file before you ever invoke the CLI.
|
|
545
|
+
|
|
546
|
+
### Writing `tasks.json` externally
|
|
547
|
+
|
|
548
|
+
When you pass a pre-written `tasks.json` to the CLI, it **skips the thinking wave and planning phase** and starts executing immediately:
|
|
549
|
+
|
|
550
|
+
```bash
|
|
551
|
+
claude-overnight tasks.json
|
|
552
|
+
```
|
|
553
|
+
|
|
554
|
+
This is useful when:
|
|
555
|
+
- You already have a concrete task list and don't need the planner to explore the codebase.
|
|
556
|
+
- You want to save the planner cost ($2–4) on a straightforward, mechanical job.
|
|
557
|
+
- You used the Claude skill to design the run and want to lock the plan before executing.
|
|
558
|
+
|
|
559
|
+
A fixed-plan `tasks.json` (without `flexiblePlan: true`) bypasses orchestration entirely. A flex-plan `tasks.json` (with `objective` + `flexiblePlan: true` + seed tasks) still uses steering across waves, but skips the initial thinking wave if the tasks are already concrete.
|
|
560
|
+
|
|
561
|
+
### What happens when `tasks.json` exists
|
|
562
|
+
|
|
563
|
+
- **Crash resilience.** During normal planning, the orchestrator writes `tasks.json` to disk as soon as it generates the tasks. If the planner crashes or the process dies before the run state is persisted, the next resume salvages the tasks from `tasks.json` instead of re-running the expensive planning query.
|
|
564
|
+
- **Resume fallback.** If a run's state file is missing or incomplete, the resume flow falls back to `tasks.json` to reconstruct the task list. This also covers legacy runs from before v1.11.7 where the agent wrote the file but the orchestrator didn't save `run.json`.
|
|
565
|
+
- **Orphan recovery.** The state scanner backfills minimal run metadata for any run directory that contains a `tasks.json` but no `run.json`, so incomplete planning shells still show up in `claude-overnight --list`.
|
|
566
|
+
|
|
453
567
|
## Exit codes
|
|
454
568
|
|
|
455
569
|
| Code | Meaning |
|
package/dist/cli/help.js
CHANGED
|
@@ -25,7 +25,7 @@ export function printHelp() {
|
|
|
25
25
|
--dry-run Show planned tasks without running them
|
|
26
26
|
--budget=N Target number of agent runs ${chalk.dim("(default: 10)")}
|
|
27
27
|
--concurrency=N Max parallel agents ${chalk.dim("(default: 5)")}
|
|
28
|
-
--model=NAME Worker model override ${chalk.dim("(interactive mode picks planner + worker separately -- supports 'Other…' for Qwen / OpenRouter / etc.)")}
|
|
28
|
+
--model=NAME Worker model override ${chalk.dim("(interactive mode picks planner + worker separately -- supports 'Other…' for Kimi / Qwen / OpenRouter / etc.)")}
|
|
29
29
|
--fast-model=NAME Fast worker model for quick tasks ${chalk.dim("(optional -- checked by next wave's workers)")}
|
|
30
30
|
--usage-cap=N Stop at N% utilization ${chalk.dim("(e.g. 90 to save 10% for other work)")}
|
|
31
31
|
--allow-extra-usage Allow extra/overage usage ${chalk.dim("(default: stop when plan limits hit)")}
|
package/dist/cli/settings.js
CHANGED
|
@@ -21,13 +21,13 @@ export async function editRunSettings(options) {
|
|
|
21
21
|
const plannerPick = await pickModel(`${chalk.cyan("①")} Planner model ${chalk.dim("(thinking, steering -- use your strongest)")}:`, models, options.defaults?.plannerModel ?? s.plannerModel);
|
|
22
22
|
s.plannerModel = plannerPick.model;
|
|
23
23
|
s.plannerProviderId = plannerPick.providerId;
|
|
24
|
-
const workerPick = await pickModel(`${chalk.cyan("②")} Worker model ${chalk.dim("(what runs the tasks -- Qwen 3.6 Plus / OpenRouter / etc via Other…)")}:`, models, options.defaults?.workerModel ?? s.workerModel);
|
|
24
|
+
const workerPick = await pickModel(`${chalk.cyan("②")} Worker model ${chalk.dim("(what runs the tasks -- Kimi 2.6 / Qwen 3.6 Plus / OpenRouter / etc via Other…)")}:`, models, options.defaults?.workerModel ?? s.workerModel);
|
|
25
25
|
s.workerModel = workerPick.model;
|
|
26
26
|
s.workerProviderId = workerPick.providerId;
|
|
27
27
|
const suggestFast = !!(options.defaults?.fastModel);
|
|
28
|
-
const fastChoice = await select(`${chalk.cyan("③")} Fast worker model ${chalk.dim("(optional -- Haiku/Qwen for well-scoped tasks, checked by next wave's workers)")}:`, [
|
|
28
|
+
const fastChoice = await select(`${chalk.cyan("③")} Fast worker model ${chalk.dim("(optional -- Haiku/Kimi/Qwen for well-scoped tasks, checked by next wave's workers)")}:`, [
|
|
29
29
|
{ name: "Skip", value: "skip", hint: "single-worker mode (main worker handles everything)" },
|
|
30
|
-
{ name: "Pick a fast worker", value: "pick", hint: "Haiku, Qwen, or any provider -- a cheaper, faster second worker" },
|
|
30
|
+
{ name: "Pick a fast worker", value: "pick", hint: "Haiku, Kimi, Qwen, or any provider -- a cheaper, faster second worker" },
|
|
31
31
|
], suggestFast ? 1 : 0);
|
|
32
32
|
if (fastChoice === "pick") {
|
|
33
33
|
const fastPick = await pickModel(`${chalk.cyan("③b")} Fast worker model:`, models, options.defaults?.fastModel ?? s.fastModel);
|
package/dist/core/_version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "1.51.
|
|
1
|
+
export declare const VERSION = "1.51.3";
|
package/dist/core/_version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// Auto-generated by build — do not edit manually.
|
|
2
|
-
export const VERSION = "1.51.
|
|
2
|
+
export const VERSION = "1.51.3";
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP-browser prompt adapter.
|
|
3
|
+
*
|
|
4
|
+
* MCP-browser stores prompts as inline template literals in
|
|
5
|
+
* platform/supervisor/gemini-client.ts. This adapter:
|
|
6
|
+
* 1. Extracts those prompt strings by parsing the TS file
|
|
7
|
+
* 2. Defines benchmark cases for each prompt type
|
|
8
|
+
* 3. Provides repo contexts for planning/refinement evaluation
|
|
9
|
+
*
|
|
10
|
+
* The prompts are evaluated by sending them to a model (via OpenRouter
|
|
11
|
+
* or any Anthropic-compatible proxy) and scoring the structured output.
|
|
12
|
+
*/
|
|
13
|
+
import type { BenchmarkCase } from "../types.js";
|
|
14
|
+
/** Prompt kinds we can benchmark */
|
|
15
|
+
export type McpPromptKind = "planning" | "review" | "evolution" | "goal-refinement" | "plan-supervision" | "simple-supervision" | "stuck-analysis";
|
|
16
|
+
/** Extract a const prompt string from gemini-client.ts by name */
|
|
17
|
+
export declare function extractPrompt(kind: McpPromptKind): string;
|
|
18
|
+
/** Build a synthetic user prompt for a given kind and scenario */
|
|
19
|
+
export declare function buildUserPrompt(kind: McpPromptKind, scenario: McpScenario): string;
|
|
20
|
+
export interface McpScenario {
|
|
21
|
+
name: string;
|
|
22
|
+
repoContext?: RepoContext;
|
|
23
|
+
stepContext?: StepContext;
|
|
24
|
+
terminalContext?: TerminalContext;
|
|
25
|
+
reviewContext?: ReviewContext;
|
|
26
|
+
evolutionContext?: EvolutionContext;
|
|
27
|
+
goalContext?: GoalContext;
|
|
28
|
+
}
|
|
29
|
+
export interface RepoContext {
|
|
30
|
+
goal: string;
|
|
31
|
+
fileTree: string;
|
|
32
|
+
readmeSnippet: string;
|
|
33
|
+
hasCiCd: boolean;
|
|
34
|
+
}
|
|
35
|
+
export interface StepContext {
|
|
36
|
+
stepTitle: string;
|
|
37
|
+
stepDescription: string;
|
|
38
|
+
acceptanceCriteria: string[];
|
|
39
|
+
phaseTitle: string;
|
|
40
|
+
progress: string;
|
|
41
|
+
}
|
|
42
|
+
export interface TerminalContext {
|
|
43
|
+
state: "idle" | "error" | "context_limit" | "completed" | "working";
|
|
44
|
+
recentOutput: string;
|
|
45
|
+
projectGoal: string;
|
|
46
|
+
}
|
|
47
|
+
export interface ReviewContext {
|
|
48
|
+
stepTitle: string;
|
|
49
|
+
stepDescription: string;
|
|
50
|
+
acceptanceCriteria: string[];
|
|
51
|
+
terminalOutput: string;
|
|
52
|
+
}
|
|
53
|
+
export interface EvolutionContext {
|
|
54
|
+
completedPlanSummary: string;
|
|
55
|
+
reviewNotes: string;
|
|
56
|
+
evolutionNumber: number;
|
|
57
|
+
}
|
|
58
|
+
export interface GoalContext {
|
|
59
|
+
originalTitle: string;
|
|
60
|
+
originalDescription: string;
|
|
61
|
+
gitHistory: string;
|
|
62
|
+
fileTree: string;
|
|
63
|
+
}
|
|
64
|
+
export declare const PLANNING_SCENARIOS: McpScenario[];
|
|
65
|
+
export declare const REVIEW_SCENARIOS: McpScenario[];
|
|
66
|
+
export declare const SUPERVISION_SCENARIOS: McpScenario[];
|
|
67
|
+
export declare const STUCK_SCENARIOS: McpScenario[];
|
|
68
|
+
/** Convert scenarios to benchmark cases for a given prompt kind */
|
|
69
|
+
export declare function scenariosToCases(kind: McpPromptKind, scenarios: McpScenario[]): BenchmarkCase[];
|
|
70
|
+
export declare function hydrateCases(cases: BenchmarkCase[]): BenchmarkCase[];
|