claude-overnight 1.50.6 → 1.51.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +138 -33
- package/dist/cli/help.js +1 -1
- package/dist/cli/settings.js +3 -3
- package/dist/core/_version.d.ts +1 -1
- package/dist/core/_version.js +1 -1
- package/dist/prompt-evolution/curator.d.ts +22 -0
- package/dist/prompt-evolution/curator.js +123 -0
- package/dist/prompt-evolution/evaluator.d.ts +34 -0
- package/dist/prompt-evolution/evaluator.js +157 -0
- package/dist/prompt-evolution/fixtures/plan-cases.d.ts +13 -0
- package/dist/prompt-evolution/fixtures/plan-cases.js +163 -0
- package/dist/prompt-evolution/index.d.ts +48 -0
- package/dist/prompt-evolution/index.js +176 -0
- package/dist/prompt-evolution/mutator.d.ts +21 -0
- package/dist/prompt-evolution/mutator.js +124 -0
- package/dist/prompt-evolution/scorer.d.ts +14 -0
- package/dist/prompt-evolution/scorer.js +120 -0
- package/dist/prompt-evolution/types.d.ts +111 -0
- package/dist/prompt-evolution/types.js +11 -0
- package/dist/providers/index.js +4 -4
- package/dist/run/run.js +11 -3
- package/dist/run/summary.d.ts +2 -0
- package/dist/run/summary.js +52 -28
- package/dist/run/wave-loop.js +2 -1
- package/dist/skills/librarian.js +28 -29
- package/package.json +2 -1
- package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1
package/README.md
CHANGED
|
@@ -4,7 +4,50 @@ Parallel Claude agents in isolated git worktrees. Set a usage cap so your intera
|
|
|
4
4
|
|
|
5
5
|
Hand it an objective and a session budget, walk away, review the diff when the run ends. Every agent runs in its own worktree on its own branch — a misbehaving agent can't trash your working tree. Unmerged branches are preserved for manual review, never discarded.
|
|
6
6
|
|
|
7
|
-
Built on the [Claude Agent SDK](https://www.npmjs.com/package/@anthropic-ai/claude-agent-sdk) — every session runs on the SDK's agent harness.
|
|
7
|
+
Built on the [Claude Agent SDK](https://www.npmjs.com/package/@anthropic-ai/claude-agent-sdk) — every planner, worker, reviewer, and verifier session runs on the SDK's agent harness. `claude-overnight` is the orchestrator around that harness: it plans, routes, resumes, reviews, and persists many SDK sessions at once. Because the harness speaks Anthropic Messages, each role can run on Anthropic direct or any compatible endpoint.
|
|
8
|
+
|
|
9
|
+
## Three execution layers
|
|
10
|
+
|
|
11
|
+
Every run can mix and match three execution layers:
|
|
12
|
+
|
|
13
|
+
| Layer | What it does | Typical choice |
|
|
14
|
+
|---|---|---|
|
|
15
|
+
| Planner | Thinking wave, orchestration, steering, review, final gate | your strongest model |
|
|
16
|
+
| Main worker | Bulk implementation | a reliable coding model |
|
|
17
|
+
| Fast worker (optional) | Cheap, well-scoped tasks checked by later waves | a cheaper/faster model |
|
|
18
|
+
|
|
19
|
+
The layers are configured independently. A common setup is Claude on the planner, Kimi or Qwen on the main worker, and Cursor or Haiku as the fast worker.
|
|
20
|
+
|
|
21
|
+
## First-class features
|
|
22
|
+
|
|
23
|
+
- **Harness-first orchestration.** This is not a replacement runtime. It is a multi-session control plane for the Claude Agent SDK harness, so you keep the same tool loop, session resume behavior, streaming model, and transcript format across the whole swarm.
|
|
24
|
+
- **Dynamic repo memory.** Agents can propose reusable memory candidates during execution. A librarian curates them at the end of each wave, updates the skill index, and future waves see only a compact stub plus on-demand hydration instead of a giant static prompt.
|
|
25
|
+
- **Run memory that compounds.** Long runs keep a live status snapshot, archived milestones, and an evolving goal file so steering can pick up exactly where it left off, even after rate limits, crashes, or an overnight stop.
|
|
26
|
+
- **Embedded Cursor flexibility.** Cursor-hosted models are routed through a bundled `cursor-composer-in-claude` proxy, so Cursor becomes just another planner / worker / fast-worker option instead of a separate workflow.
|
|
27
|
+
|
|
28
|
+
## Run on Kimi 2.6
|
|
29
|
+
|
|
30
|
+
Want a cheap Anthropic-compatible worker with a simple shell setup? Kimi 2.6 via Kimi's coding endpoint is a drop-in worker that speaks the Anthropic Messages API -- same client, same flow, just a different base URL.
|
|
31
|
+
|
|
32
|
+
1. **Configure the provider.** Run `claude-overnight`, choose `Other…` on the worker step, and fill in:
|
|
33
|
+
|
|
34
|
+
| Field | Value |
|
|
35
|
+
|---|---|
|
|
36
|
+
| Name | `Kimi 2.6` |
|
|
37
|
+
| Base URL | `https://api.kimi.com/coding/` |
|
|
38
|
+
| Model id | `kimi-for-coding` |
|
|
39
|
+
| API key | your Kimi coding key |
|
|
40
|
+
|
|
41
|
+
2. That's it. Planner runs on Sonnet (or Opus), worker runs on Kimi.
|
|
42
|
+
|
|
43
|
+
Or set it via env directly:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
export ANTHROPIC_BASE_URL="https://api.kimi.com/coding/"
|
|
47
|
+
export ANTHROPIC_API_KEY="sk-kimi-..."
|
|
48
|
+
export ANTHROPIC_MODEL="kimi-for-coding"
|
|
49
|
+
claude-overnight
|
|
50
|
+
```
|
|
8
51
|
|
|
9
52
|
## Run on Qwen 3.6 Plus
|
|
10
53
|
|
|
@@ -31,9 +74,9 @@ export ANTHROPIC_MODEL="qwen3.6-plus"
|
|
|
31
74
|
claude-overnight
|
|
32
75
|
```
|
|
33
76
|
|
|
34
|
-
## Run via Cursor
|
|
77
|
+
## Run via Bundled Cursor Proxy
|
|
35
78
|
|
|
36
|
-
Use Cursor
|
|
79
|
+
Use Cursor-hosted models (`auto`, `composer`, `composer-2`, etc.) through the bundled `cursor-composer-in-claude` proxy. `claude-overnight` auto-starts that local Anthropic-compatible proxy, injects the per-worktree workspace header, and treats Cursor as just another provider for the planner, main worker, or fast worker.
|
|
37
80
|
|
|
38
81
|
### macOS: Cursor agent shell patch
|
|
39
82
|
|
|
@@ -56,34 +99,28 @@ alias agent="run_cursor_agent"
|
|
|
56
99
|
|
|
57
100
|
`claude-overnight` prints a one-time notice when you use the Cursor proxy and this snippet is not detected in `~/.zshrc` or `~/.zprofile`. The bundled proxy also sets `CURSOR_AGENT_NODE` / `CURSOR_AGENT_SCRIPT` when it can find `node` and `cursor-agent`, but your interactive shell still benefits from the alias.
|
|
58
101
|
|
|
59
|
-
1. **Install the Cursor CLI
|
|
102
|
+
1. **Install the Cursor CLI:**
|
|
60
103
|
|
|
61
104
|
```bash
|
|
62
105
|
curl https://cursor.com/install -fsS | bash
|
|
63
|
-
npm install -g cursor-api-proxy
|
|
64
106
|
```
|
|
65
107
|
|
|
66
108
|
2. **Get an API key.** Visit [cursor.com/dashboard/integrations](https://cursor.com/dashboard/integrations) and scroll to the "API Keys" section.
|
|
67
109
|
|
|
68
|
-
3. **Set up.** Run `claude-overnight` and when prompted to pick a model, choose **Cursor…**. It walks you through a one-time setup: CLI check, API key entry (persisted to `providers.json`),
|
|
110
|
+
3. **Set up.** Run `claude-overnight` and when prompted to pick a model, choose **Cursor…**. It walks you through a one-time setup: CLI check, API key entry (persisted to `providers.json`), bundled proxy verification, and health check.
|
|
69
111
|
|
|
70
|
-
4.
|
|
71
|
-
|
|
72
|
-
```bash
|
|
73
|
-
npx cursor-api-proxy
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
5. Pick your model (`auto`, `composer`, `composer-2`, etc.). The provider is saved and reappears in every future run.
|
|
112
|
+
4. Pick your model (`auto`, `composer`, `composer-2`, etc.). The provider is saved and reappears in every future run.
|
|
77
113
|
|
|
78
114
|
Or configure the key manually:
|
|
79
115
|
|
|
80
116
|
```bash
|
|
81
|
-
export CURSOR_BRIDGE_API_KEY="
|
|
82
|
-
npx cursor-api-proxy &
|
|
117
|
+
export CURSOR_BRIDGE_API_KEY="crsr_..."
|
|
83
118
|
claude-overnight
|
|
84
119
|
```
|
|
85
120
|
|
|
86
|
-
|
|
121
|
+
If the bundled proxy cannot auto-start, the setup wizard prints the exact `node ".../cursor-composer-in-claude/dist/cli.js"` command for this install so you can launch the same embedded proxy manually.
|
|
122
|
+
|
|
123
|
+
**Tip:** once a Cursor provider is saved, run `claude-overnight` with the `--model=cursor-auto` flag in non-interactive mode to skip the picker. If the proxy isn't running at startup, the tool attempts to restart it automatically.
|
|
87
124
|
|
|
88
125
|
### macOS: “Keychain Not Found” / `cursor-user`
|
|
89
126
|
|
|
@@ -109,7 +146,7 @@ security unlock-keychain ~/Library/Keychains/login.keychain-db
|
|
|
109
146
|
npm install -g claude-overnight
|
|
110
147
|
```
|
|
111
148
|
|
|
112
|
-
Requires Node.js ≥ 20
|
|
149
|
+
Requires Node.js ≥ 20. For Anthropic-direct roles, use `claude auth login` or `ANTHROPIC_API_KEY`. For provider-backed roles, save a Kimi / Qwen / Cursor / OpenRouter-compatible provider instead. No Anthropic plan or key? See **Run on Kimi 2.6** or **Run on Qwen 3.6 Plus** above -- cheap, drop-in alternatives.
|
|
113
150
|
|
|
114
151
|
## Quick start
|
|
115
152
|
|
|
@@ -130,7 +167,7 @@ claude-overnight
|
|
|
130
167
|
● Opus -- Opus 4.6 · Most capable
|
|
131
168
|
○ Sonnet -- Sonnet 4.6 · Best for everyday tasks
|
|
132
169
|
|
|
133
|
-
⑤ Worker model (what runs the tasks -- Qwen 3.6 Plus / OpenRouter / etc via Other…):
|
|
170
|
+
⑤ Worker model (what runs the tasks -- Kimi 2.6 / Qwen 3.6 Plus / OpenRouter / etc via Other…):
|
|
134
171
|
● Sonnet -- Sonnet 4.6 · Best for everyday tasks
|
|
135
172
|
○ Opus -- Opus 4.6 · Most capable
|
|
136
173
|
○ Other… · custom OpenAI/Anthropic-compatible endpoint
|
|
@@ -159,13 +196,58 @@ claude-overnight
|
|
|
159
196
|
◆ Assessing... ✓ Done
|
|
160
197
|
```
|
|
161
198
|
|
|
162
|
-
You interact once (objective, budget, model, review themes), then the rest runs unattended -- thinking, planning, executing, reflecting, steering. Rate-limited? It waits and retries. Crash? Resume where you left off. Capped at usage limit? Pick up next time with full context preserved.
|
|
199
|
+
You interact once (objective, budget, model, review themes), then the rest runs unattended -- thinking, planning, executing, curating memory, reflecting, steering. Rate-limited? It waits and retries. Crash? Resume where you left off. Capped at usage limit? Pick up next time with full context preserved.
|
|
163
200
|
|
|
164
201
|
## Use cases
|
|
165
202
|
|
|
166
203
|
Overnight refactors, batch feature implementation, codebase-wide cleanups, test generation, documentation sprints, framework migrations, quality audits, long research runs. One objective + a budget + walk away.
|
|
167
204
|
|
|
168
|
-
##
|
|
205
|
+
## Typical flow
|
|
206
|
+
|
|
207
|
+
```mermaid
|
|
208
|
+
flowchart TD
|
|
209
|
+
subgraph Setup["Setup + planning"]
|
|
210
|
+
A["Start or resume run"] --> B["Optional setup coach<br/>rewrite objective + suggest settings"]
|
|
211
|
+
B --> C["Pick planner / worker / fast worker<br/>budget + concurrency + worktree mode"]
|
|
212
|
+
C --> D["Optional provider preflight<br/>real auth / write probes"]
|
|
213
|
+
D --> E["Theme discovery + user review/edit/chat"]
|
|
214
|
+
E --> F["Thinking wave<br/>planner explores the codebase"]
|
|
215
|
+
F --> G["Task orchestration<br/>planner writes concrete tasks"]
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
subgraph Wave["Per-wave loop"]
|
|
219
|
+
G --> H["beforeWave hook<br/>optional shell commands"]
|
|
220
|
+
H --> I["Execution wave<br/>main worker + optional fast worker<br/>isolated git worktrees"]
|
|
221
|
+
I --> J["Per-agent simplify pass<br/>same SDK session resumes"]
|
|
222
|
+
J --> K["Debrief + afterWave hook"]
|
|
223
|
+
K --> L["Post-wave review<br/>flex mode"]
|
|
224
|
+
L --> M["Wave-end librarian pass"]
|
|
225
|
+
M --> N{"Flex mode?"}
|
|
226
|
+
N -->|yes| O["Steering<br/>update status / milestones / goal"]
|
|
227
|
+
N -->|no| P["Verifier<br/>fixed-plan gate between waves"]
|
|
228
|
+
O -->|execute more| H
|
|
229
|
+
O -->|reflect deeper| Q["Reflection wave<br/>extra review / audit"]
|
|
230
|
+
Q --> O
|
|
231
|
+
O -->|done| R["Final gate<br/>review full diff"]
|
|
232
|
+
P -->|more work| H
|
|
233
|
+
P -->|done| R
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
subgraph Memory["Dynamic repo memory"]
|
|
237
|
+
S["Workers discover reusable patterns"] --> T["Scribe writes memory candidates"]
|
|
238
|
+
T --> U["Librarian curates candidates"]
|
|
239
|
+
U --> V["Canon markdown + SQLite index updated"]
|
|
240
|
+
V --> W["Future waves get L0 stub<br/>hydrate L1/L2 on demand"]
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
J -. emits candidates .-> S
|
|
244
|
+
M -. curates queue .-> U
|
|
245
|
+
W -. informs later waves .-> I
|
|
246
|
+
W -. informs planner decisions .-> O
|
|
247
|
+
R --> X["afterRun hook<br/>optional shell commands"]
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
The chart above shows the main user-visible lifecycle. It intentionally omits some engine-internal branches such as health-check heal tasks, A/B skill assignment, zero-work retry, budget-extension prompts, and resume salvage after planning crashes.
|
|
169
251
|
|
|
170
252
|
### 1. Thinking phase -- parallel architect sessions
|
|
171
253
|
|
|
@@ -177,15 +259,17 @@ An orchestrator session reads all design documents and synthesizes concrete exec
|
|
|
177
259
|
|
|
178
260
|
### 3. Parallel execution waves
|
|
179
261
|
|
|
180
|
-
Tasks run in parallel agent sessions (each in its own git worktree). After completing its task, each session automatically runs a **simplify pass** -- reviewing its own `git diff` for code reuse opportunities, quality issues, and inefficiencies, then fixing them before the framework commits. This is done via the SDK's **session resume** mechanism: the same agent session continues with a follow-up prompt, so the agent's full context from its task is still available -- no need to re-instruct or re-fill context.
|
|
262
|
+
Tasks run in parallel agent sessions (each in its own git worktree). After completing its task, each session automatically runs a **simplify pass** -- reviewing its own `git diff` for code reuse opportunities, quality issues, and inefficiencies, then fixing them before the framework commits. This is done via the SDK's **session resume** mechanism: the same agent session continues with a follow-up prompt, so the agent's full context from its task is still available -- no need to re-instruct or re-fill context. If a fast worker is configured, steering can route cheaper, well-scoped tasks there while the main worker handles heavier implementation.
|
|
181
263
|
|
|
182
264
|
### 4. Post-wave review
|
|
183
265
|
|
|
184
266
|
After each wave (flex mode, budget remaining), a dedicated **review agent** inspects the consolidated diff for issues the individual agents may have missed: overlooked reuse opportunities, copy-paste variations, leaky abstractions, efficiency regressions. It runs as a single-agent wave -- one session reviews what the swarm just produced.
|
|
185
267
|
|
|
186
|
-
### 5.
|
|
268
|
+
### 5. Librarian and dynamic memory
|
|
187
269
|
|
|
188
|
-
|
|
270
|
+
During execution, workers can emit **memory candidates** when they discover something reusable: a repo-specific quirk, a recovery path, a command sequence that worked, or a tool recipe worth reusing later. The scribe writes those candidates to `~/.claude-overnight/skills/<repo-fingerprint>/candidates/` without blocking the run.
|
|
271
|
+
|
|
272
|
+
At the end of each wave, a **librarian** pass curates that queue. It can promote a candidate into canon, patch an existing skill, quarantine stale skills, or reject weak / duplicated candidates. Canon lives on disk as markdown; SQLite is only the ranked index. This is what makes the memory system dynamic rather than a fixed prompt blob.
|
|
189
273
|
|
|
190
274
|
### 6. Steering
|
|
191
275
|
|
|
@@ -195,14 +279,28 @@ After each wave, steering assesses: "how good is this?" -- not "what's missing?
|
|
|
195
279
|
- **Reflect** by spinning up 1-2 review sessions for deep quality/architecture audits
|
|
196
280
|
- **Declare done** when the vision is met at high quality
|
|
197
281
|
|
|
198
|
-
###
|
|
282
|
+
### 7. Post-run final gate
|
|
199
283
|
|
|
200
|
-
|
|
284
|
+
When the run completes (steering declares done), a final **comprehensive review** runs against the full `git diff main`. It checks architecture coherence, consistency with existing patterns, build integrity, and that the tests pass. This is the last quality gate before the diff lands.
|
|
285
|
+
|
|
286
|
+
### Run-memory layers
|
|
287
|
+
|
|
288
|
+
Long runs stay sharp because steering maintains three run-memory layers:
|
|
201
289
|
|
|
202
290
|
- **Status** -- a living project snapshot, updated every wave. Compressed, never truncated.
|
|
203
291
|
- **Milestones** -- strategic snapshots archived every ~5 waves. Long-term memory.
|
|
204
292
|
- **Goal** -- the evolving north star. What quality means for this codebase.
|
|
205
293
|
|
|
294
|
+
### Progressive-disclosure repo memory
|
|
295
|
+
|
|
296
|
+
The repo memory system is separate from the run folder and is designed around three disclosure layers so context stays small:
|
|
297
|
+
|
|
298
|
+
- **L0** -- a tiny ranked stub injected into planner and worker prompts. It lists only the names and descriptions of the most relevant project-specific skills and tool recipes.
|
|
299
|
+
- **L1** -- the full skill body, loaded on demand with `skill_read(name)` when an agent wants the actual recipe or guidance.
|
|
300
|
+
- **L2** -- attached references for deeper context. The library is structured for them even though most runs only need the L0 stub plus occasional L1 hydration.
|
|
301
|
+
|
|
302
|
+
That progressive disclosure matters: the planner and workers do not carry the full memory library in every prompt. They get a compact overview, call `skill_search(query)` if they need to narrow it, and hydrate only the bodies that matter for the task in front of them.
|
|
303
|
+
|
|
206
304
|
## Run history, resume, and knowledge carryforward
|
|
207
305
|
|
|
208
306
|
Every run gets its own folder in `.claude-overnight/runs/`. Nothing is ever overwritten.
|
|
@@ -259,7 +357,7 @@ Not every provider delivers the same streaming granularity:
|
|
|
259
357
|
| --- | --- | --- | --- |
|
|
260
358
|
| Anthropic (direct) | ✓ | ✓ | ✓ |
|
|
261
359
|
| Cursor proxy (`cursor-composer-in-claude`) | — | — | ✓ (final answer only) |
|
|
262
|
-
| Qwen / OpenRouter / custom Anthropic-compatible | depends on upstream | depends | usually ✓ |
|
|
360
|
+
| Kimi / Qwen / OpenRouter / custom Anthropic-compatible | depends on upstream | depends | usually ✓ |
|
|
263
361
|
|
|
264
362
|
When a provider doesn't stream partials (or the model is a reasoning model on the Cursor proxy — the proxy suppresses the thinking phase and only emits the final answer), the ticker shows elapsed time with no live text, then the completed result lands in one go. The UI, transcripts, and the resume flow all behave identically either way — streaming is used when available, never required.
|
|
265
363
|
|
|
@@ -309,7 +407,7 @@ claude-overnight "fix auth bug in src/auth.ts" "add tests for user model"
|
|
|
309
407
|
|---|---|---|
|
|
310
408
|
| `--budget=N` | `10` | Total agent sessions |
|
|
311
409
|
| `--concurrency=N` | `5` | Parallel agents |
|
|
312
|
-
| `--model=NAME` | prompted | Worker model -- interactive picks planner + worker separately; `Other…` adds Qwen / OpenRouter / any Anthropic-compat endpoint. In non-interactive mode, a saved provider's model id is auto-resolved to the provider. |
|
|
410
|
+
| `--model=NAME` | prompted | Worker model -- interactive picks planner + worker separately; `Other…` adds Kimi / Qwen / OpenRouter / any Anthropic-compat endpoint. In non-interactive mode, a saved provider's model id is auto-resolved to the provider. |
|
|
313
411
|
| `--usage-cap=N` | unlimited | Stop at N% utilization |
|
|
314
412
|
| `--allow-extra-usage` | off | Allow extra/overage usage (billed separately) |
|
|
315
413
|
| `--extra-usage-budget=N` | -- | Max $ for extra usage (implies --allow-extra-usage) |
|
|
@@ -331,26 +429,33 @@ claude-overnight "fix auth bug in src/auth.ts" "add tests for user model"
|
|
|
331
429
|
| `mergeStrategy` | `"yolo" \| "branch"` | `"yolo"` | Merge into HEAD or new branch |
|
|
332
430
|
| `usageCap` | `number (0-100)` | unlimited | Stop at N% utilization |
|
|
333
431
|
|
|
334
|
-
## Custom providers (Qwen, OpenRouter, any Anthropic-compatible endpoint)
|
|
432
|
+
## Custom providers (Kimi, Qwen, OpenRouter, any Anthropic-compatible endpoint)
|
|
335
433
|
|
|
336
434
|
Planner, main worker, and optional fast worker are each picked separately -- pair Opus-on-Anthropic for the planner/thinker with a cheaper model on another provider for the bulk of work. The fast worker is a real worker (same tools, same env), just on a cheaper/faster model — steering routes well-scoped tasks to it by default.
|
|
337
435
|
|
|
338
436
|
From the interactive picker, choose `Other…` on the planner, worker, or fast step:
|
|
339
437
|
|
|
340
438
|
```
|
|
341
|
-
⑤ Worker model (what runs the tasks -- Qwen 3.6 Plus / OpenRouter / etc via Other…):
|
|
439
|
+
⑤ Worker model (what runs the tasks -- Kimi 2.6 / Qwen 3.6 Plus / OpenRouter / etc via Other…):
|
|
342
440
|
○ Sonnet
|
|
343
441
|
○ Opus
|
|
344
442
|
● Other…
|
|
345
443
|
|
|
346
|
-
Name:
|
|
347
|
-
Base URL: https://
|
|
348
|
-
Model id:
|
|
444
|
+
Name: Kimi 2.6
|
|
445
|
+
Base URL: https://api.kimi.com/coding/
|
|
446
|
+
Model id: kimi-for-coding
|
|
349
447
|
API key source:
|
|
350
448
|
● Paste key now · stored plaintext in ~/.claude/claude-overnight/providers.json (0600)
|
|
351
449
|
○ Read from env var · nothing written to disk
|
|
352
450
|
```
|
|
353
451
|
|
|
452
|
+
Common examples:
|
|
453
|
+
|
|
454
|
+
| Name | Base URL | Model id |
|
|
455
|
+
|---|---|---|
|
|
456
|
+
| `Kimi 2.6` | `https://api.kimi.com/coding/` | `kimi-for-coding` |
|
|
457
|
+
| `Qwen 3.6 Plus` | `https://dashscope-intl.aliyuncs.com/apps/anthropic` | `qwen3.6-plus` |
|
|
458
|
+
|
|
354
459
|
Saved providers live user-level at `~/.claude/claude-overnight/providers.json` (mode 0600) and show up automatically in every repo. No per-project config.
|
|
355
460
|
|
|
356
461
|
**How routing works.** Each `query()` gets its own env override (`ANTHROPIC_BASE_URL` + `ANTHROPIC_AUTH_TOKEN`) -- planner queries use the planner provider, main-worker queries use the worker provider, fast-worker queries use the fast provider. No global shell env, no proxy daemon, no `process.env` pollution between calls.
|
|
@@ -359,7 +464,7 @@ Saved providers live user-level at `~/.claude/claude-overnight/providers.json` (
|
|
|
359
464
|
|
|
360
465
|
**Resume.** Provider ids are persisted in `run.json` and rehydrated on resume. If you deleted a provider between runs, resume refuses to start and tells you exactly which id is missing.
|
|
361
466
|
|
|
362
|
-
**Non-interactive / CI.** `claude-overnight --model=qwen3.6-plus` auto-resolves the model id to a saved provider -- no separate `--provider` flag.
|
|
467
|
+
**Non-interactive / CI.** `claude-overnight --model=kimi-for-coding` (or `qwen3.6-plus`) auto-resolves the model id to a saved provider -- no separate `--provider` flag.
|
|
363
468
|
|
|
364
469
|
## Parallel Playwright Testing
|
|
365
470
|
|
package/dist/cli/help.js
CHANGED
|
@@ -25,7 +25,7 @@ export function printHelp() {
|
|
|
25
25
|
--dry-run Show planned tasks without running them
|
|
26
26
|
--budget=N Target number of agent runs ${chalk.dim("(default: 10)")}
|
|
27
27
|
--concurrency=N Max parallel agents ${chalk.dim("(default: 5)")}
|
|
28
|
-
--model=NAME Worker model override ${chalk.dim("(interactive mode picks planner + worker separately -- supports 'Other…' for Qwen / OpenRouter / etc.)")}
|
|
28
|
+
--model=NAME Worker model override ${chalk.dim("(interactive mode picks planner + worker separately -- supports 'Other…' for Kimi / Qwen / OpenRouter / etc.)")}
|
|
29
29
|
--fast-model=NAME Fast worker model for quick tasks ${chalk.dim("(optional -- checked by next wave's workers)")}
|
|
30
30
|
--usage-cap=N Stop at N% utilization ${chalk.dim("(e.g. 90 to save 10% for other work)")}
|
|
31
31
|
--allow-extra-usage Allow extra/overage usage ${chalk.dim("(default: stop when plan limits hit)")}
|
package/dist/cli/settings.js
CHANGED
|
@@ -21,13 +21,13 @@ export async function editRunSettings(options) {
|
|
|
21
21
|
const plannerPick = await pickModel(`${chalk.cyan("①")} Planner model ${chalk.dim("(thinking, steering -- use your strongest)")}:`, models, options.defaults?.plannerModel ?? s.plannerModel);
|
|
22
22
|
s.plannerModel = plannerPick.model;
|
|
23
23
|
s.plannerProviderId = plannerPick.providerId;
|
|
24
|
-
const workerPick = await pickModel(`${chalk.cyan("②")} Worker model ${chalk.dim("(what runs the tasks -- Qwen 3.6 Plus / OpenRouter / etc via Other…)")}:`, models, options.defaults?.workerModel ?? s.workerModel);
|
|
24
|
+
const workerPick = await pickModel(`${chalk.cyan("②")} Worker model ${chalk.dim("(what runs the tasks -- Kimi 2.6 / Qwen 3.6 Plus / OpenRouter / etc via Other…)")}:`, models, options.defaults?.workerModel ?? s.workerModel);
|
|
25
25
|
s.workerModel = workerPick.model;
|
|
26
26
|
s.workerProviderId = workerPick.providerId;
|
|
27
27
|
const suggestFast = !!(options.defaults?.fastModel);
|
|
28
|
-
const fastChoice = await select(`${chalk.cyan("③")} Fast worker model ${chalk.dim("(optional -- Haiku/Qwen for well-scoped tasks, checked by next wave's workers)")}:`, [
|
|
28
|
+
const fastChoice = await select(`${chalk.cyan("③")} Fast worker model ${chalk.dim("(optional -- Haiku/Kimi/Qwen for well-scoped tasks, checked by next wave's workers)")}:`, [
|
|
29
29
|
{ name: "Skip", value: "skip", hint: "single-worker mode (main worker handles everything)" },
|
|
30
|
-
{ name: "Pick a fast worker", value: "pick", hint: "Haiku, Qwen, or any provider -- a cheaper, faster second worker" },
|
|
30
|
+
{ name: "Pick a fast worker", value: "pick", hint: "Haiku, Kimi, Qwen, or any provider -- a cheaper, faster second worker" },
|
|
31
31
|
], suggestFast ? 1 : 0);
|
|
32
32
|
if (fastChoice === "pick") {
|
|
33
33
|
const fastPick = await pickModel(`${chalk.cyan("③b")} Fast worker model:`, models, options.defaults?.fastModel ?? s.fastModel);
|
package/dist/core/_version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "1.
|
|
1
|
+
export declare const VERSION = "1.51.1";
|
package/dist/core/_version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// Auto-generated by build — do not edit manually.
|
|
2
|
-
export const VERSION = "1.
|
|
2
|
+
export const VERSION = "1.51.1";
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Curator — selects which prompt variants survive to the next generation.
|
|
3
|
+
*
|
|
4
|
+
* Strategy: Pareto-frontier selection on the multi-objective score vector.
|
|
5
|
+
* This keeps diversity (don't collapse to a single local optimum) while
|
|
6
|
+
* still promoting the best variants.
|
|
7
|
+
*
|
|
8
|
+
* Also applies a novelty bonus so variants that explore different strategies
|
|
9
|
+
* aren't immediately crushed by a dominant but narrow winner.
|
|
10
|
+
*/
|
|
11
|
+
import type { VariantRow, CuratorDecision } from "./types.js";
|
|
12
|
+
export interface CurateOpts {
|
|
13
|
+
/** Number of top variants to keep (elite) */
|
|
14
|
+
eliteCount?: number;
|
|
15
|
+
/** Number of additional diverse variants to keep via novelty */
|
|
16
|
+
diversityCount?: number;
|
|
17
|
+
/** Minimum gmean improvement over current canon to promote */
|
|
18
|
+
promoteThreshold?: number;
|
|
19
|
+
}
|
|
20
|
+
export declare function curate(rows: VariantRow[], currentCanonGmean: number, opts?: CurateOpts): CuratorDecision;
|
|
21
|
+
/** Pretty-print a matrix for human review */
|
|
22
|
+
export declare function formatMatrix(rows: VariantRow[], caseNames: string[]): string;
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Curator — selects which prompt variants survive to the next generation.
|
|
3
|
+
*
|
|
4
|
+
* Strategy: Pareto-frontier selection on the multi-objective score vector.
|
|
5
|
+
* This keeps diversity (don't collapse to a single local optimum) while
|
|
6
|
+
* still promoting the best variants.
|
|
7
|
+
*
|
|
8
|
+
* Also applies a novelty bonus so variants that explore different strategies
|
|
9
|
+
* aren't immediately crushed by a dominant but narrow winner.
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Decide which prompt variants survive, which one (if any) is promoted,
 * and which are quarantined.
 *
 * @param rows              Evaluated variants; each row is read for
 *                          `variantId`, `gmean` and `aggregate`.
 * @param currentCanonGmean gmean of the current canon prompt.
 * @param opts              Optional tuning: `eliteCount` (default 3),
 *                          `diversityCount` (default 2) and
 *                          `promoteThreshold` (default 0.02).
 * @returns `{ promoted, quarantined, kept }`, each an array of variant ids.
 */
export function curate(rows, currentCanonGmean, opts = {}) {
    const maxElites = opts.eliteCount ?? 3;
    const maxDiverse = opts.diversityCount ?? 2;
    const minGain = opts.promoteThreshold ?? 0.02;
    if (rows.length === 0) {
        return { promoted: [], quarantined: [], kept: [] };
    }
    // Novelty = cosineDistance between a row's score vector and the centroid
    // of all score vectors.
    const vectors = rows.map((row) => vectorize(row.aggregate));
    const center = computeCentroid(vectors);
    const scored = rows.map((row, idx) => ({
        ...row,
        novelty: cosineDistance(vectors[idx], center),
    }));
    // Pareto frontier: ids of rows that no *other* row dominates.
    const isDominated = (candidate) => scored.some((other) => other.variantId !== candidate.variantId &&
        dominates(other.aggregate, candidate.aggregate));
    const frontierIds = new Set();
    for (const row of scored) {
        if (!isDominated(row)) {
            frontierIds.add(row.variantId);
        }
    }
    // Elites: frontier rows ranked by gmean, best first.
    const frontier = scored
        .filter((row) => frontierIds.has(row.variantId))
        .sort((left, right) => right.gmean - left.gmean);
    const elites = frontier.slice(0, maxElites);
    const eliteIds = new Set(elites.map((row) => row.variantId));
    // Diversity picks: the most novel rows that are not already elites.
    const diverse = scored
        .filter((row) => !eliteIds.has(row.variantId))
        .sort((left, right) => right.novelty - left.novelty)
        .slice(0, maxDiverse);
    const keptIds = new Set([...elites, ...diverse].map((row) => row.variantId));
    // Promotion: the top frontier row must beat canon by more than the threshold.
    const [best] = frontier;
    const promoted = best && best.gmean > currentCanonGmean + minGain ? [best.variantId] : [];
    // Quarantine: every input row that was not kept.
    const quarantined = rows
        .filter((row) => !keptIds.has(row.variantId))
        .map((row) => row.variantId);
    return { promoted, quarantined, kept: [...keptIds] };
}
|
|
58
|
+
/**
 * Pareto dominance test over the five score dimensions.
 *
 * @returns true when `a` is at least as good as `b` on every dimension and
 *          strictly better on at least one; false otherwise.
 */
function dominates(a, b) {
    const dims = ["parse", "schema", "content", "costEfficiency", "speed"];
    // Losing on any single dimension rules out dominance.
    if (dims.some((dim) => a[dim] < b[dim])) {
        return false;
    }
    // No losses: dominance needs at least one strict win.
    return dims.some((dim) => a[dim] > b[dim]);
}
|
|
70
|
+
/**
 * Flatten a score object into a fixed-order numeric vector:
 * [parse, schema, content, costEfficiency, speed].
 */
function vectorize(s) {
    const { parse, schema, content, costEfficiency, speed } = s;
    return [parse, schema, content, costEfficiency, speed];
}
|
|
73
|
+
/**
 * Component-wise mean of a list of equal-length vectors.
 *
 * @returns the mean vector, or a five-element zero vector when the list is
 *          empty.
 */
function computeCentroid(vectors) {
    const count = vectors.length;
    if (count === 0) {
        return [0, 0, 0, 0, 0];
    }
    const width = vectors[0].length;
    const centroid = [];
    for (let axis = 0; axis < width; axis++) {
        let total = 0;
        for (const vec of vectors) {
            total = total + vec[axis];
        }
        centroid.push(total / count);
    }
    return centroid;
}
|
|
79
|
+
/**
 * Cosine distance (1 - cosine similarity) between two numeric vectors.
 *
 * Iterates over the indices of `a`; `b` is read at the same indices.
 *
 * @returns a value in [0, 2] for finite input. Returns 0 when either vector
 *          has zero magnitude, since no direction is defined.
 */
function cosineDistance(a, b) {
    let dot = 0;
    let normSqA = 0;
    let normSqB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normSqA += a[i] * a[i];
        normSqB += b[i] * b[i];
    }
    if (normSqA === 0 || normSqB === 0)
        return 0;
    const similarity = dot / (Math.sqrt(normSqA) * Math.sqrt(normSqB));
    // Rounding can nudge the ratio just outside [-1, 1] (e.g. 1.0000000000000002
    // for identical vectors), which would yield a slightly negative distance.
    const clamped = Math.min(1, Math.max(-1, similarity));
    return 1 - clamped; // distance = 1 - similarity
}
|
|
93
|
+
/**
 * Pretty-print a matrix for human review.
 *
 * Emits a markdown-style summary table (one line per variant, highest gmean
 * first) followed by a per-case breakdown for each variant.
 *
 * @param rows      Variant rows; read for `variantId`, `generation`, `gmean`,
 *                  `aggregate` and `results`. The array is NOT reordered.
 * @param caseNames Case names to list, in order, under each variant; names
 *                  with no matching result are skipped.
 * @returns the report as a single newline-joined string.
 */
export function formatMatrix(rows, caseNames) {
    // Array.prototype.sort sorts in place, so rank a copy: printing a report
    // must not reorder the caller's array.
    const ranked = [...rows].sort((a, b) => b.gmean - a.gmean);
    const lines = [];
    lines.push(`| variant | gen | gmean | parse | schema | content | cost | speed |`);
    lines.push(`|---------|-----|-------|-------|--------|---------|------|-------|`);
    for (const r of ranked) {
        const s = r.aggregate;
        // NOTE(review): ids are cut to 12 chars but padded to 11 — widths look
        // inconsistent; left as-is to keep the output unchanged.
        lines.push(`| ${r.variantId.slice(0, 12).padEnd(11)} | ${String(r.generation).padStart(3)} | ` +
            `${(r.gmean * 100).toFixed(1).padStart(5)} | ${(s.parse * 100).toFixed(0).padStart(5)} | ` +
            `${(s.schema * 100).toFixed(0).padStart(6)} | ${(s.content * 100).toFixed(0).padStart(7)} | ` +
            `${(s.costEfficiency * 100).toFixed(0).padStart(4)} | ${(s.speed * 100).toFixed(0).padStart(5)} |`);
    }
    lines.push("");
    lines.push("Per-case breakdown:");
    for (const r of ranked) {
        lines.push(` ${r.variantId} (gen ${r.generation}):`);
        for (const name of caseNames) {
            const c = [...r.results.values()].find((x) => x.caseName === name);
            if (!c)
                continue;
            // Any note on a case marks it with a warning flag.
            const flag = c.notes.length > 0 ? "⚠" : "✓";
            lines.push(` ${flag} ${name}: gmean=${(gmean(c.scores) * 100).toFixed(0)}% notes=${c.notes.slice(0, 2).join("; ") || "ok"}`);
        }
    }
    return lines.join("\n");
}
|
|
119
|
+
/**
 * Geometric mean of the five score dimensions.
 *
 * Each value is floored at 0.001 before multiplying, so a single zero score
 * does not collapse the whole mean to zero.
 */
function gmean(scores) {
    const vals = [scores.parse, scores.schema, scores.content, scores.costEfficiency, scores.speed];
    let product = 1;
    for (const value of vals) {
        product = product * Math.max(value, 0.001);
    }
    return Math.pow(product, 1 / vals.length);
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation matrix runner.
|
|
3
|
+
*
|
|
4
|
+
* Given a set of prompt variants and benchmark cases, produces a matrix:
|
|
5
|
+
* rows = variants
|
|
6
|
+
* columns = cases
|
|
7
|
+
* cells = EvaluationResult with multi-dimensional scores
|
|
8
|
+
*
|
|
9
|
+
* Uses direct HTTP fetch (not the full Agent SDK) so it's fast and works with
|
|
10
|
+
* any Anthropic-compatible endpoint (OpenRouter, local proxies, etc.).
|
|
11
|
+
*/
|
|
12
|
+
import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
|
|
13
|
+
export interface EvalOpts {
|
|
14
|
+
/** Model to run evaluations with. Should be fast/cheap (haiku, flash, etc.) */
|
|
15
|
+
model: string;
|
|
16
|
+
/** Base URL for the API endpoint */
|
|
17
|
+
baseUrl?: string;
|
|
18
|
+
/** Auth token */
|
|
19
|
+
authToken?: string;
|
|
20
|
+
/** Max tokens per evaluation */
|
|
21
|
+
maxTokens?: number;
|
|
22
|
+
/** Concurrency for parallel case evaluation */
|
|
23
|
+
concurrency?: number;
|
|
24
|
+
/** Optional callback for progress */
|
|
25
|
+
onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
|
|
26
|
+
}
|
|
27
|
+
export declare function buildMatrix(variants: Array<{
|
|
28
|
+
id: string;
|
|
29
|
+
promptPath: string;
|
|
30
|
+
generation: number;
|
|
31
|
+
text: string;
|
|
32
|
+
}>, cases: BenchmarkCase[], opts: EvalOpts): Promise<VariantRow[]>;
|
|
33
|
+
/** Render a prompt variant given its source path and optional variant name */
|
|
34
|
+
export declare function renderVariant(promptPath: string, variant: string | undefined, vars: PromptVars): string;
|