claude-overnight 1.3.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,6 +49,7 @@ claude-overnight
49
49
  ◆ Thinking: 5 agents exploring... ← architects analyze your codebase
50
50
  ◆ Orchestrating plan... ← synthesizes 50 concrete tasks
51
51
  ◆ Wave 1 · 50 tasks · $4.20 spent ← fully autonomous from here
52
+ ↑ 1.2M in ↓ 340K out $4.20 / $4.24 total
52
53
  ◆ Assessing... how close to amazing?
53
54
  ◆ Wave 2 · 30 tasks · $18.50 spent ← improvements from assessment
54
55
  ◆ Reflection: 2 agents reviewing ← deep quality audit
@@ -70,7 +71,9 @@ An orchestrator agent reads all design documents and synthesizes concrete execut
70
71
 
71
72
  ### 3. Iterative execution
72
73
 
73
- Tasks run in parallel (each agent in its own git worktree). After each wave, steering assesses: "how good is this?" — not "what's missing?" It can:
74
+ Tasks run in parallel (each agent in its own git worktree). After completing its task, each agent automatically runs a **simplify pass** reviewing its own `git diff` for code reuse opportunities, quality issues, and inefficiencies, then fixing them before the framework commits.
75
+
76
+ After each wave, steering assesses: "how good is this?" — not "what's missing?" It can:
74
77
 
75
78
  - **Execute** more tasks to build features, fix bugs, polish UX
76
79
  - **Reflect** by spinning up 1-2 review agents for deep quality/architecture audits
@@ -221,6 +224,8 @@ Changes take effect between waves — active agents finish their current task.
221
224
 
222
225
  The usage bar cycles through all rate limit windows (5h, 7d, etc.) every 3 seconds, showing utilization per window. Usage info is shown during all phases — thinking, orchestration, steering, and execution.
223
226
 
227
+ When using extra usage with a budget, a dedicated progress bar shows spend vs limit with color-coded fill (magenta → yellow → red).
228
+
224
229
  ## Rate limits
225
230
 
226
231
  Built for unattended runs lasting hours or days.
package/dist/index.js CHANGED
@@ -7,7 +7,7 @@ import { createInterface } from "readline";
7
7
  import chalk from "chalk";
8
8
  import { query } from "@anthropic-ai/claude-agent-sdk";
9
9
  import { Swarm } from "./swarm.js";
10
- import { planTasks, refinePlan, detectModelTier, steerWave, identifyThemes, buildThinkingTasks, buildReflectionTasks, orchestrate } from "./planner.js";
10
+ import { planTasks, refinePlan, detectModelTier, steerWave, identifyThemes, buildThinkingTasks, orchestrate } from "./planner.js";
11
11
  import { startRenderLoop, renderSummary } from "./ui.js";
12
12
  // ── CLI flag parsing ──
13
13
  function parseCliFlags(argv) {
@@ -295,6 +295,7 @@ function readRunMemory(runDir, previousRuns) {
295
295
  return {
296
296
  designs: readMdDir(join(runDir, "designs")),
297
297
  reflections: readMdDir(join(runDir, "reflections")),
298
+ verifications: readMdDir(join(runDir, "verifications")),
298
299
  milestones: readMdDir(join(runDir, "milestones")),
299
300
  status,
300
301
  goal,
@@ -385,6 +386,7 @@ function createRunDir(rootDir) {
385
386
  const runDir = join(rootDir, "runs", ts);
386
387
  mkdirSync(join(runDir, "designs"), { recursive: true });
387
388
  mkdirSync(join(runDir, "reflections"), { recursive: true });
389
+ mkdirSync(join(runDir, "verifications"), { recursive: true });
388
390
  mkdirSync(join(runDir, "milestones"), { recursive: true });
389
391
  mkdirSync(join(runDir, "sessions"), { recursive: true });
390
392
  return runDir;
@@ -1079,7 +1081,7 @@ async function main() {
1079
1081
  let accIn = 0, accOut = 0;
1080
1082
  let lastCapped = false, lastAborted = false, objectiveComplete = false;
1081
1083
  let lastWaveKind;
1082
- let reflectionBudgetUsed;
1084
+ let overheadBudgetUsed;
1083
1085
  const branches = [];
1084
1086
  if (resuming && resumeState) {
1085
1087
  // Restore ALL config from saved state
@@ -1091,7 +1093,7 @@ async function main() {
1091
1093
  accFailed = resumeState.accFailed;
1092
1094
  accTools = 0;
1093
1095
  lastWaveKind = resumeState.lastWaveKind;
1094
- reflectionBudgetUsed = resumeState.reflectionBudgetUsed;
1096
+ overheadBudgetUsed = resumeState.overheadBudgetUsed ?? (resumeState.reflectionBudgetUsed ?? 0) + (resumeState.verificationBudgetUsed ?? 0);
1095
1097
  branches.push(...resumeState.branches);
1096
1098
  objective = resumeState.objective;
1097
1099
  workerModel = resumeState.workerModel;
@@ -1121,11 +1123,11 @@ async function main() {
1121
1123
  accIn = thinkingIn;
1122
1124
  accOut = thinkingOut;
1123
1125
  lastWaveKind = "execute";
1124
- reflectionBudgetUsed = 0;
1126
+ overheadBudgetUsed = 0;
1125
1127
  }
1126
1128
  liveConfig.remaining = remaining;
1127
1129
  liveConfig.usageCap = usageCap;
1128
- const maxReflectionBudget = Math.max(2, Math.ceil((budget ?? 10) * 0.05));
1130
+ const maxOverheadBudget = Math.max(4, Math.ceil((budget ?? 10) * 0.15));
1129
1131
  // For flex + branch strategy: create one target branch, waves merge via yolo into it
1130
1132
  let runBranch;
1131
1133
  let originalRef;
@@ -1168,6 +1170,7 @@ async function main() {
1168
1170
  const swarm = new Swarm({
1169
1171
  tasks: currentTasks, concurrency, cwd, model: workerModel, permissionMode, allowedTools,
1170
1172
  useWorktrees, mergeStrategy: waveMerge, agentTimeoutMs, usageCap, allowExtraUsage, extraUsageBudget,
1173
+ baseCostUsd: accCost,
1171
1174
  });
1172
1175
  currentSwarm = swarm;
1173
1176
  const stopRender = startRenderLoop(swarm, liveConfig);
@@ -1210,7 +1213,7 @@ async function main() {
1210
1213
  id: `run-${new Date().toISOString().slice(0, 19)}`, objective: objective, budget: budget ?? tasks.length,
1211
1214
  remaining, workerModel, plannerModel, concurrency, permissionMode,
1212
1215
  usageCap, allowExtraUsage, extraUsageBudget, flex, useWorktrees, mergeStrategy, waveNum, currentTasks,
1213
- lastWaveKind, reflectionBudgetUsed, accCost, accCompleted, accFailed,
1216
+ lastWaveKind, overheadBudgetUsed, accCost, accCompleted, accFailed,
1214
1217
  branches, phase: "steering", startedAt: new Date(runStartedAt).toISOString(), cwd,
1215
1218
  });
1216
1219
  waveHistory.push({
@@ -1225,11 +1228,10 @@ async function main() {
1225
1228
  });
1226
1229
  if (!flex || remaining <= 0 || swarm.aborted || swarm.cappedOut)
1227
1230
  break;
1228
- // ── Steer: assess quality and decide next action ──
1229
- // May loop through reflect→re-steer cycles before producing execution tasks
1230
- let steerDone = false;
1231
+ // ── Steer: assess and compose the next wave ──
1232
+ let steered = false;
1231
1233
  let steerAttempts = 0;
1232
- while (!steerDone && remaining > 0 && !stopping && steerAttempts < 4) {
1234
+ while (!steered && remaining > 0 && !stopping && steerAttempts < 3) {
1233
1235
  steerAttempts++;
1234
1236
  console.log(chalk.cyan(`\n ◆ Assessing...\n`));
1235
1237
  process.stdout.write("\x1B[?25l");
@@ -1238,81 +1240,46 @@ async function main() {
1238
1240
  const steer = await steerWave(objective, waveHistory, remaining, cwd, plannerModel, workerModel, permissionMode, concurrency, makeProgressLog(), memory);
1239
1241
  process.stdout.write(`\x1B[2K\r`);
1240
1242
  process.stdout.write("\x1B[?25h");
1241
- // Persist context layers
1242
1243
  if (steer.statusUpdate)
1243
1244
  writeStatus(runDir, steer.statusUpdate);
1244
1245
  if (steer.goalUpdate) {
1245
1246
  writeGoalUpdate(runDir, steer.goalUpdate);
1246
1247
  console.log(chalk.dim(` Goal refined: ${steer.goalUpdate.slice(0, 100)}\n`));
1247
1248
  }
1248
- // Archive milestone every ~5 execution waves
1249
1249
  const execWaves = waveHistory.filter(w => w.kind === "execute").length;
1250
1250
  if (execWaves > 0 && execWaves % 5 === 0)
1251
1251
  archiveMilestone(runDir, waveNum);
1252
- if (steer.done || steer.action === "done") {
1253
- console.log(chalk.green(` \u2713 ${steer.reasoning}\n`));
1254
- steerDone = true;
1255
- objectiveComplete = true;
1256
- remaining = 0; // exit outer loop too
1257
- break;
1258
- }
1259
- if (steer.action === "reflect") {
1260
- // Safety: no consecutive reflections, budget cap
1261
- const canReflect = lastWaveKind !== "reflect" && reflectionBudgetUsed + 2 <= maxReflectionBudget;
1262
- if (!canReflect) {
1252
+ if (steer.done || steer.tasks.length === 0) {
1253
+ const hasVerification = waveHistory.some(w => w.kind.includes("verif"));
1254
+ if (!hasVerification && remaining >= 1) {
1263
1255
  console.log(chalk.dim(` ${steer.reasoning}`));
1264
- console.log(chalk.yellow(` Reflection skipped (${lastWaveKind === "reflect" ? "consecutive" : "budget cap"}) — re-assessing\n`));
1265
- lastWaveKind = "execute"; // allow next steer to see non-reflect
1266
- continue; // re-steer in this inner loop
1267
- }
1268
- // Run reflection wave
1269
- console.log(chalk.dim(` ${steer.reasoning}`));
1270
- console.log(chalk.cyan(`\n ◆ Reflection: 2 agents reviewing...\n`));
1271
- const reflectionDir = join(runDir, "reflections");
1272
- waveNum++;
1273
- const reflTasks = buildReflectionTasks(objective, memory.goal, reflectionDir, waveNum, plannerModel);
1274
- const reflSwarm = new Swarm({
1275
- tasks: reflTasks, concurrency: 2, cwd,
1276
- model: plannerModel, permissionMode,
1277
- useWorktrees: false, mergeStrategy: "yolo",
1278
- agentTimeoutMs, usageCap, allowExtraUsage, extraUsageBudget,
1279
- });
1280
- currentSwarm = reflSwarm;
1281
- const stopReflRender = startRenderLoop(reflSwarm, liveConfig);
1282
- try {
1283
- await reflSwarm.run();
1284
- }
1285
- finally {
1286
- stopReflRender();
1256
+ console.log(chalk.yellow(` Done blocked verification required before completion\n`));
1257
+ lastWaveKind = "done-blocked";
1258
+ continue; // re-steer — steerer will see the hint
1287
1259
  }
1288
- console.log(renderSummary(reflSwarm));
1289
- accCost += reflSwarm.totalCostUsd;
1290
- accIn += reflSwarm.totalInputTokens;
1291
- accOut += reflSwarm.totalOutputTokens;
1292
- accCompleted += reflSwarm.completed;
1293
- accFailed += reflSwarm.failed;
1294
- accTools += reflSwarm.agents.reduce((sum, a) => sum + a.toolCalls, 0);
1295
- remaining -= reflSwarm.completed + reflSwarm.failed;
1296
- reflectionBudgetUsed += reflSwarm.completed + reflSwarm.failed;
1297
- waveHistory.push({
1298
- wave: waveNum,
1299
- kind: "reflect",
1300
- tasks: reflSwarm.agents.map(a => ({ prompt: a.task.prompt, status: a.status, filesChanged: a.filesChanged, error: a.error })),
1301
- });
1302
- lastWaveKind = "reflect";
1303
- continue; // re-steer with reflection artifacts
1304
- }
1305
- // action === "execute"
1306
- if (steer.tasks.length === 0) {
1307
1260
  console.log(chalk.green(` \u2713 ${steer.reasoning}\n`));
1308
1261
  objectiveComplete = true;
1309
1262
  remaining = 0;
1310
1263
  break;
1311
1264
  }
1265
+ const isOverhead = steer.waveKind !== "execute";
1266
+ if (isOverhead && overheadBudgetUsed + steer.tasks.length > maxOverheadBudget) {
1267
+ console.log(chalk.dim(` ${steer.reasoning}`));
1268
+ console.log(chalk.yellow(` Overhead budget exhausted (${overheadBudgetUsed}/${maxOverheadBudget}) — re-assessing\n`));
1269
+ lastWaveKind = "overhead-capped";
1270
+ continue; // re-steer
1271
+ }
1312
1272
  console.log(chalk.dim(` ${steer.reasoning}\n`));
1313
- currentTasks = steer.tasks;
1314
- lastWaveKind = "execute";
1315
- steerDone = true; // exit inner loop, outer loop runs the tasks
1273
+ // Resolve model aliases: "planner" → plannerModel, "worker" → workerModel
1274
+ currentTasks = steer.tasks.map(t => ({
1275
+ ...t,
1276
+ model: t.model === "planner" ? plannerModel : t.model === "worker" ? workerModel
1277
+ : isOverhead && !t.model ? plannerModel : t.model,
1278
+ }));
1279
+ lastWaveKind = steer.waveKind;
1280
+ if (isOverhead)
1281
+ overheadBudgetUsed += currentTasks.length;
1282
+ steered = true;
1316
1283
  }
1317
1284
  catch (err) {
1318
1285
  process.stdout.write("\x1B[?25h");
@@ -1330,7 +1297,7 @@ async function main() {
1330
1297
  id: `run-${new Date().toISOString().slice(0, 19)}`, objective: objective ?? "", budget: budget ?? tasks.length,
1331
1298
  remaining, workerModel, plannerModel, concurrency, permissionMode,
1332
1299
  usageCap, allowExtraUsage, extraUsageBudget, flex, useWorktrees, mergeStrategy, waveNum, currentTasks: [],
1333
- lastWaveKind, reflectionBudgetUsed, accCost, accCompleted, accFailed,
1300
+ lastWaveKind, overheadBudgetUsed, accCost, accCompleted, accFailed,
1334
1301
  branches, phase: finalPhase, startedAt: new Date(runStartedAt).toISOString(), cwd,
1335
1302
  });
1336
1303
  if (trulyDone) {
@@ -1364,8 +1331,8 @@ async function main() {
1364
1331
  boxLines.push(`${elapsedStr} · ${fmtTokens(accIn)} in / ${fmtTokens(accOut)} out · ${accTools} tools`);
1365
1332
  if (totalMerged > 0 || totalConflicts > 0)
1366
1333
  boxLines.push(`${totalMerged} merged${totalConflicts > 0 ? ` · ${totalConflicts} conflicts` : ""}`);
1367
- if (reflectionBudgetUsed > 0)
1368
- boxLines.push(`${reflectionBudgetUsed} reflection agents`);
1334
+ if (overheadBudgetUsed > 0)
1335
+ boxLines.push(`${overheadBudgetUsed} overhead agents (review/verify/explore)`);
1369
1336
  if (lastCapped)
1370
1337
  boxLines.push(chalk.yellow(`Capped at ${usageCap != null ? Math.round(usageCap * 100) : 100}%`));
1371
1338
  const boxW = Math.max(...boxLines.map(l => l.replace(/\x1B\[[0-9;]*m/g, "").length)) + 4;
package/dist/planner.d.ts CHANGED
@@ -10,7 +10,7 @@ export interface PlannerRateLimitInfo {
10
10
  }
11
11
  export interface WaveSummary {
12
12
  wave: number;
13
- kind: "execute" | "reflect" | "think";
13
+ kind: string;
14
14
  tasks: {
15
15
  prompt: string;
16
16
  status: string;
@@ -20,15 +20,16 @@ export interface WaveSummary {
20
20
  }
21
21
  export interface SteerResult {
22
22
  done: boolean;
23
- action: "execute" | "reflect" | "done";
24
23
  tasks: Task[];
25
24
  reasoning: string;
25
+ waveKind: string;
26
26
  goalUpdate?: string;
27
27
  statusUpdate?: string;
28
28
  }
29
29
  export interface RunMemory {
30
30
  designs: string;
31
31
  reflections: string;
32
+ verifications: string;
32
33
  milestones: string;
33
34
  status: string;
34
35
  goal: string;
@@ -40,7 +41,6 @@ export declare function getPlannerRateLimitInfo(): PlannerRateLimitInfo;
40
41
  export declare function planTasks(objective: string, cwd: string, plannerModel: string, workerModel: string, permissionMode: PermMode, budget: number | undefined, concurrency: number, onLog: (text: string) => void, flexNote?: string, outFile?: string): Promise<Task[]>;
41
42
  export declare function identifyThemes(objective: string, count: number, model: string, permissionMode: PermMode, onLog?: (text: string) => void): Promise<string[]>;
42
43
  export declare function buildThinkingTasks(objective: string, themes: string[], designDir: string, plannerModel: string, previousKnowledge?: string): Task[];
43
- export declare function buildReflectionTasks(objective: string, goal: string, reflectionDir: string, waveNum: number, plannerModel: string): Task[];
44
44
  export declare function orchestrate(objective: string, designDocs: string, cwd: string, plannerModel: string, workerModel: string, permissionMode: PermMode, budget: number, concurrency: number, onLog: (text: string) => void, flexNote?: string, outFile?: string): Promise<Task[]>;
45
45
  export declare function refinePlan(objective: string, previousTasks: Task[], feedback: string, cwd: string, plannerModel: string, workerModel: string, permissionMode: PermMode, budget: number | undefined, concurrency: number, onLog: (text: string) => void): Promise<Task[]>;
46
46
  export declare function steerWave(objective: string, history: WaveSummary[], remainingBudget: number, cwd: string, plannerModel: string, workerModel: string, permissionMode: PermMode, concurrency: number, onLog: (text: string) => void, runMemory?: RunMemory): Promise<SteerResult>;
package/dist/planner.js CHANGED
@@ -1,6 +1,18 @@
1
1
  import { query } from "@anthropic-ai/claude-agent-sdk";
2
2
  import { readFileSync } from "fs";
3
3
  import { NudgeError } from "./types.js";
4
+ // The core framing for all planning. Not a checklist — a way of thinking.
5
+ const DESIGN_THINKING = `
6
+ HOW TO THINK ABOUT EVERY TASK:
7
+
8
+ Start from the user's job. What is someone hiring this product to do? "I need to send money abroad cheaply" — not "I need a currency conversion API." Every decision — what to build, how fast it responds, what happens on error — flows from the job.
9
+
10
+ The experience IS the product. A 200ms server response is not a "performance metric" — it's the difference between an app that feels alive and one that feels broken. A loading state is not "polish" — it's the user knowing the app heard them. An error message is not "error handling" — it's the app being honest. There is no line between backend and UX. The server, the API, the database query, the render — they're all one experience the user either trusts or doesn't.
11
+
12
+ Build the core, verify it works, learn, iterate. Don't plan 20 features and build them all. Build the ONE thing that matters most, run it, see if it actually works from a user's chair. What you learn from seeing it run will change what you build next. Each wave should make what exists better before adding what doesn't exist yet.
13
+
14
+ Consistency is what makes complex things feel simple. One design system, rigid rules, no exceptions. This is how Revolut ships a super-app with 30+ features that doesn't feel like chaos.
15
+ `;
4
16
  const NUDGE_MS = 15 * 60 * 1000; // 15 min — close & restart with "continue"
5
17
  const HARD_TIMEOUT_MS = 30 * 60 * 1000; // 30 min — give up
6
18
  export function detectModelTier(model) {
@@ -412,17 +424,20 @@ export function buildThinkingTasks(objective, themes, designDir, plannerModel, p
412
424
  OVERALL OBJECTIVE: ${objective}
413
425
  ${prevBlock}
414
426
  YOUR FOCUS: ${theme}
415
-
427
+ ${DESIGN_THINKING}
416
428
  Explore the codebase thoroughly using Read, Glob, and Grep. Then write a design document to ${designDir}/focus-${i}.md with these sections:
417
429
 
418
430
  ## Findings
419
431
  Key files, patterns, and architecture you discovered. Cite specific file paths and function names.
420
432
 
433
+ ## The Job
434
+ What is someone hiring this product to do? Not the feature — the outcome. Frame everything below through this lens.
435
+
421
436
  ## Proposed Work Items
422
437
  For each item:
423
438
  - **What**: What to build or change
424
439
  - **Where**: Specific file paths
425
- - **Why**: Why this matters
440
+ - **Why**: How this serves the job — including how fast it needs to respond and what happens when it fails
426
441
  - **Risk**: Conflicts or complications
427
442
 
428
443
  ## Key Files
@@ -432,44 +447,6 @@ Be thorough — your findings drive the execution plan.`,
432
447
  model: plannerModel,
433
448
  }));
434
449
  }
435
- export function buildReflectionTasks(objective, goal, reflectionDir, waveNum, plannerModel) {
436
- const goalBlock = goal ? `\nEVOLVED GOAL:\n${goal}\n` : "";
437
- return [
438
- {
439
- id: "review-0",
440
- prompt: `You are a senior code reviewer performing a deep quality audit.
441
-
442
- OBJECTIVE: ${objective}
443
- ${goalBlock}
444
- Read the codebase thoroughly. Assess:
445
- - **Correctness**: Bugs, missing error handling, broken flows?
446
- - **Architecture**: Clean design? Unnecessary or missing abstractions?
447
- - **Code quality**: Readability, naming, duplication, dead code?
448
- - **Completeness**: What's missing vs. the objective? Half-done work?
449
- - **Polish**: Edge cases, error messages, loading states?
450
-
451
- Write findings to ${reflectionDir}/wave-${waveNum}-quality.md.
452
- End with a ## Verdict: is this closer to "good enough" or "amazing"? What would make the biggest difference?`,
453
- model: plannerModel,
454
- },
455
- {
456
- id: "review-1",
457
- prompt: `You are a UX and integration reviewer.
458
-
459
- OBJECTIVE: ${objective}
460
- ${goalBlock}
461
- Read the codebase. Assess:
462
- - **UX coherence**: Do user-facing flows make sense end-to-end? Consistent experience?
463
- - **Integration**: Do pieces fit together? Seams, inconsistencies, broken contracts?
464
- - **Testing**: Meaningful coverage? Testing the right things?
465
- - **Gaps**: Unhandled use cases? What would surprise a user?
466
-
467
- Write findings to ${reflectionDir}/wave-${waveNum}-ux.md.
468
- End with ## Priorities: rank the top 3 things that would most improve the result.`,
469
- model: plannerModel,
470
- },
471
- ];
472
- }
473
450
  export async function orchestrate(objective, designDocs, cwd, plannerModel, workerModel, permissionMode, budget, concurrency, onLog, flexNote, outFile) {
474
451
  const capability = modelCapabilityBlock(workerModel);
475
452
  const flexLine = flexNote ? `\n\n${flexNote}` : "";
@@ -483,7 +460,7 @@ Your architects explored the codebase and found:
483
460
  ${designDocs}
484
461
 
485
462
  AGENT CAPABILITY: ${capability}
486
-
463
+ ${DESIGN_THINKING}
487
464
  Create exactly ~${budget} concrete execution tasks based on these findings.
488
465
 
489
466
  Requirements:
@@ -492,7 +469,8 @@ Requirements:
492
469
  - ${concurrency} agents run in parallel — tasks must touch DIFFERENT files
493
470
  - Trust the research — don't tell agents to re-explore what's documented
494
471
  - Reference specific files and patterns from the findings
495
- - Priority order: foundational first, polish last${flexLine}
472
+ - Build the core user job first, then expand. Each task should produce something complete and usable — not scaffolding for later
473
+ - There is no separate "polish" phase. Loading states, error handling, sub-200ms responses, and edge cases are part of every task${flexLine}
496
474
 
497
475
  Respond with ONLY a JSON object (no markdown fences):
498
476
  {"tasks": [{"prompt": "..."}]}${fileInstruction}`;
@@ -655,24 +633,25 @@ async function extractTaskJson(raw, retry, onLog, outFile) {
655
633
  // ── Wave steering ──
656
634
  export async function steerWave(objective, history, remainingBudget, cwd, plannerModel, workerModel, permissionMode, concurrency, onLog, runMemory) {
657
635
  const capability = modelCapabilityBlock(workerModel);
658
- // Three-layer context: status (current), milestones (strategic), recent waves (tactical)
659
636
  const recentWaves = history.slice(-3);
660
637
  const recentText = recentWaves.length > 0 ? recentWaves.map(w => {
661
- const tag = w.kind === "reflect" ? " (reflection)" : w.kind === "think" ? " (thinking)" : "";
662
638
  const lines = w.tasks.map(t => {
663
639
  const files = t.filesChanged ? ` (${t.filesChanged} files)` : "";
664
640
  const err = t.error ? ` — ${t.error}` : "";
665
641
  return ` - [${t.status}] ${t.prompt.slice(0, 120)}${files}${err}`;
666
642
  }).join("\n");
667
- return `Wave ${w.wave + 1}${tag}:\n${lines}`;
643
+ return `Wave ${w.wave + 1} (${w.kind}):\n${lines}`;
668
644
  }).join("\n\n") : "(first wave)";
669
- const lastWasReflection = history.length > 0 && history[history.length - 1].kind === "reflect";
670
- const noReflectHint = lastWasReflection ? `\nIMPORTANT: The previous wave was a reflection. You MUST choose "execute" or "done" — not "reflect" again.\n` : "";
645
+ const lastKind = history.length > 0 ? history[history.length - 1].kind : "";
646
+ const repeatHint = lastKind && lastKind !== "execute"
647
+ ? `\nThe previous wave was "${lastKind}". Don't repeat the same wave kind unless you have a strong reason.\n`
648
+ : "";
671
649
  const cap = (s, max) => s.length > max ? s.slice(0, max) + "\n...(truncated)" : s;
672
650
  const statusBlock = runMemory?.status ? `\nCurrent project status:\n${runMemory.status}\n` : "";
673
651
  const milestoneBlock = runMemory?.milestones ? `\nMilestone snapshots:\n${cap(runMemory.milestones, 4000)}\n` : "";
674
652
  const designBlock = runMemory?.designs ? `\nArchitectural research:\n${cap(runMemory.designs, 4000)}\n` : "";
675
653
  const reflectionBlock = runMemory?.reflections ? `\nLatest quality reports:\n${cap(runMemory.reflections, 3000)}\n` : "";
654
+ const verificationBlock = runMemory?.verifications ? `\nVerification results (from actually running the app):\n${cap(runMemory.verifications, 3000)}\n` : "";
676
655
  const goalBlock = runMemory?.goal ? `\nNorth star — what "amazing" means:\n${runMemory.goal}\n` : "";
677
656
  const prevRunBlock = runMemory?.previousRuns ? `\nKnowledge from previous runs:\n${cap(runMemory.previousRuns, 3000)}\n` : "";
678
657
  const prompt = `You are the quality director for an autonomous multi-wave agent system. Your job is to push the work toward "amazing," not just "done."
@@ -681,38 +660,63 @@ Objective: ${objective}
681
660
  ${goalBlock}${statusBlock}${milestoneBlock}${prevRunBlock}
682
661
  Recent waves:
683
662
  ${recentText}
684
- ${designBlock}${reflectionBlock}
663
+ ${designBlock}${reflectionBlock}${verificationBlock}
685
664
  Remaining budget: ${remainingBudget} agent sessions. ${concurrency} agents run in parallel — tasks must touch DIFFERENT files.
686
665
  ${capability}
666
+ ${DESIGN_THINKING}
687
667
  Total waves completed: ${history.length}
688
668
 
689
- Read the codebase. Assess: how close is this to the VISION? Not "what's missing" — "how good is what we built?"
669
+ Read the codebase. Assess from the user's chair: does this product do the job someone would hire it for? Does it feel fast, honest, and trustworthy? Not "is the code clean" — "would I use this?"
670
+
671
+ If verification found issues, those are the priority. Fix what's broken before building what's missing. Iterate on what exists before expanding scope.
672
+
673
+ ## Compose the next wave
674
+
675
+ You have full creative freedom. Design the wave that will have the highest impact right now. Here are archetypes to draw from — mix, adapt, or invent your own:
690
676
 
691
- Then choose ONE action:
677
+ **Execute** Agents implement concrete changes in parallel. Each touches different files. The bread and butter.
678
+ Example: 5 agents each owning a different feature or fix
692
679
 
693
- **"reflect"** — Spin up 1-2 review agents for a deep quality audit. Choose when:
694
- - Substantial new code shipped and hasn't been reviewed
695
- - You're unsure about quality and need expert eyes
696
- - A subsystem just "completed" and deserves verification
680
+ **Explore** — Multiple agents independently tackle the same problem from different angles. Each writes a design/approach to a separate file. Use when you need creative alternatives before committing.
681
+ Example: 3 agents each design a different navigation approach, writing to designs/nav-{approach}.md
697
682
 
698
- **"execute"** — Plan the next batch of tasks. Choose when:
699
- - You know what needs doing (from reviews or your own assessment)
700
- - There are clear gaps, bugs, or improvements to make
683
+ **Critique** — Agents review what exists as skeptical experts. They read the codebase and write findings to files. Use after substantial new code ships.
684
+ Example: 1 code quality reviewer, 1 UX reviewer examining flows end-to-end
701
685
 
702
- **"done"** — The objective is met at high quality. Choose when:
703
- - The code works correctly and handles edge cases
704
- - The architecture is clean and pieces fit together
705
- - Further work would be diminishing returns
706
- ${noReflectHint}
686
+ **Synthesize** — An agent reads multiple alternatives or review findings and makes a decision. Writes the chosen approach or prioritized fix list.
687
+ Example: 1 agent reads 3 design docs and writes the implementation plan
688
+
689
+ **Verify** Agents actually RUN the application: build it, start it, navigate it, click things, try edge cases. They report what works and what's broken. Not code reading — real testing.
690
+ Example: 1 agent does end-to-end QA, writing a report with reproduction steps
691
+
692
+ **User-test** — Agents emulate specific user personas interacting with the product. "First-time user who just downloaded this." "Power user trying to do X fast." They test from that perspective and report friction.
693
+ Example: 2 agents, one new user, one power user, each writing a report
694
+
695
+ **Polish** — Agents focus purely on feel: loading states, error messages, micro-interactions, empty states, responsiveness. Not features — the texture that makes users trust the product.
696
+ Example: 2 agents, one on happy paths, one on error/edge states
697
+
698
+ You can combine these. A wave can have 3 execute agents + 1 verification agent. Or 2 divergent explorers. Whatever the situation calls for.
699
+
700
+ For non-execute tasks (critique, verify, user-test, synthesize), tell agents to write their output to files in the run directory so findings persist for future waves. Use paths like: .claude-overnight/latest/reflections/wave-N-{topic}.md or .claude-overnight/latest/verifications/wave-N-{topic}.md.
701
+
702
+ IMPORTANT: You cannot declare "done" unless at least one verification wave has confirmed the app works. If you're considering done but haven't verified, compose a verification wave first.
703
+ ${repeatHint}
707
704
  Respond with ONLY a JSON object (no markdown fences):
708
705
  {
709
- "action": "execute" | "reflect" | "done",
710
- "done": true/false,
711
- "reasoning": "your assessment and why you chose this action",
706
+ "done": false,
707
+ "waveKind": "execute",
708
+ "reasoning": "your assessment and why you chose this wave composition",
712
709
  "goalUpdate": "optional — refine what 'amazing' means as you learn more",
713
- "statusUpdate": "REQUIRED — write a concise project status: what's built, what works, what's rough, quality level, key gaps. This replaces the previous status and is your memory for future waves.",
714
- "tasks": [{"prompt": "..."}]
715
- }`;
710
+ "statusUpdate": "REQUIRED — concise project status: what's built, what works, what's rough, quality level, key gaps. This replaces the previous status.",
711
+ "tasks": [
712
+ {"prompt": "task instruction...", "model": "worker"},
713
+ {"prompt": "review task...", "model": "planner"}
714
+ ]
715
+ }
716
+
717
+ The "model" field on each task: use "worker" (${workerModel}) for implementation tasks, "planner" (${plannerModel}) for review/analysis/verification tasks. Default is "worker".
718
+
719
+ If done: {"done": true, "waveKind": "done", "reasoning": "...", "statusUpdate": "...", "tasks": []}`;
716
720
  onLog("Assessing...");
717
721
  const resultText = await runPlannerQuery(prompt, { cwd, model: plannerModel, permissionMode }, onLog);
718
722
  const parsed = await (async () => {
@@ -720,21 +724,20 @@ Respond with ONLY a JSON object (no markdown fences):
720
724
  if (first)
721
725
  return first;
722
726
  onLog("Retrying...");
723
- const retryText = await runPlannerQuery(`Your previous response was not valid JSON. Respond with ONLY a JSON object {"action":"execute"|"reflect"|"done","done":true/false,"reasoning":"...","tasks":[{"prompt":"..."}]}.\n\n${prompt}`, { cwd, model: plannerModel, permissionMode }, onLog);
724
- return attemptJsonParse(retryText) ?? { action: "done", done: true, reasoning: "Could not parse steering response" };
727
+ const retryText = await runPlannerQuery(`Your previous response was not valid JSON. Respond with ONLY a JSON object {"done":false,"waveKind":"execute","reasoning":"...","statusUpdate":"...","tasks":[{"prompt":"..."}]}.\n\n${prompt}`, { cwd, model: plannerModel, permissionMode }, onLog);
728
+ return attemptJsonParse(retryText) ?? { done: true, waveKind: "done", reasoning: "Could not parse steering response" };
725
729
  })();
726
- const action = parsed.action || (parsed.done ? "done" : "execute");
730
+ const isDone = parsed.done === true;
731
+ const waveKind = parsed.waveKind || parsed.action || (isDone ? "done" : "execute");
727
732
  const statusUpdate = parsed.statusUpdate || undefined;
728
- if (action === "done") {
729
- return { done: true, action: "done", tasks: [], reasoning: parsed.reasoning || "Objective complete", goalUpdate: parsed.goalUpdate, statusUpdate };
730
- }
731
- if (action === "reflect") {
732
- return { done: false, action: "reflect", tasks: [], reasoning: parsed.reasoning || "Quality audit needed", goalUpdate: parsed.goalUpdate, statusUpdate };
733
+ if (isDone) {
734
+ return { done: true, tasks: [], reasoning: parsed.reasoning || "Objective complete", waveKind: "done", goalUpdate: parsed.goalUpdate, statusUpdate };
733
735
  }
734
736
  let tasks = (parsed.tasks || []).map((t, i) => ({
735
737
  id: String(i),
736
738
  prompt: typeof t === "string" ? t : t.prompt,
739
+ ...(t.model && { model: t.model }),
737
740
  }));
738
741
  tasks = postProcess(tasks, remainingBudget, onLog);
739
- return { done: tasks.length === 0, action: tasks.length === 0 ? "done" : "execute", tasks, reasoning: parsed.reasoning || "", goalUpdate: parsed.goalUpdate, statusUpdate };
742
+ return { done: tasks.length === 0, tasks, reasoning: parsed.reasoning || "", waveKind: tasks.length === 0 ? "done" : waveKind, goalUpdate: parsed.goalUpdate, statusUpdate };
740
743
  }
package/dist/swarm.d.ts CHANGED
@@ -16,6 +16,8 @@ export interface SwarmConfig {
16
16
  allowExtraUsage?: boolean;
17
17
  /** Max $ to spend on extra usage before stopping. Only applies when allowExtraUsage is true. */
18
18
  extraUsageBudget?: number;
19
+ /** Cost from previous waves — lets the UI show an accurate running total. */
20
+ baseCostUsd?: number;
19
21
  }
20
22
  export interface MergeResult {
21
23
  branch: string;
@@ -64,6 +66,7 @@ export declare class Swarm {
64
66
  usageCap: number | undefined;
65
67
  readonly allowExtraUsage: boolean;
66
68
  readonly extraUsageBudget: number | undefined;
69
+ readonly baseCostUsd: number;
67
70
  constructor(config: SwarmConfig);
68
71
  get active(): number;
69
72
  get pending(): number;
package/dist/swarm.js CHANGED
@@ -4,6 +4,15 @@ import { join } from "path";
4
4
  import { tmpdir } from "os";
5
5
  import { query } from "@anthropic-ai/claude-agent-sdk";
6
6
  import { NudgeError } from "./types.js";
7
+ const SIMPLIFY_PROMPT = `You just finished your task. Now review and simplify your changes.
8
+
9
+ Run \`git diff\` to see what you changed, then fix any issues:
10
+
11
+ 1. **Reuse**: Search the codebase — did you write something that already exists? Use existing utilities, helpers, patterns instead.
12
+ 2. **Quality**: Redundant state, copy-paste with slight variation, leaky abstractions, unnecessary wrappers/nesting, comments that narrate what the code does? Delete them.
13
+ 3. **Efficiency**: Redundant computations, sequential operations that could be parallel, unnecessary existence checks before operations, unbounded data structures, missing cleanup?
14
+
15
+ Less code is better. Delete and simplify rather than add. Fix directly — no need to explain.`;
7
16
  export class Swarm {
8
17
  agents = [];
9
18
  logs = [];
@@ -41,6 +50,7 @@ export class Swarm {
41
50
  usageCap; // mutable — can be changed live
42
51
  allowExtraUsage;
43
52
  extraUsageBudget;
53
+ baseCostUsd;
44
54
  constructor(config) {
45
55
  if (!config.tasks.length) {
46
56
  throw new Error("SwarmConfig: tasks array must not be empty");
@@ -64,6 +74,7 @@ export class Swarm {
64
74
  this.usageCap = config.usageCap;
65
75
  this.allowExtraUsage = config.allowExtraUsage ?? false;
66
76
  this.extraUsageBudget = config.extraUsageBudget;
77
+ this.baseCostUsd = config.baseCostUsd ?? 0;
67
78
  this.queue = [...config.tasks];
68
79
  this.total = config.tasks.length;
69
80
  }
@@ -225,9 +236,10 @@ export class Swarm {
225
236
  try {
226
237
  const perm = this.config.permissionMode ?? "auto";
227
238
  let resumeSessionId;
239
+ let resumePrompt = "Continue. Complete the task.";
228
240
  const runOnce = async (isResume) => {
229
241
  const agentPrompt = isResume
230
- ? "Continue. Complete the task."
242
+ ? resumePrompt
231
243
  : this.config.useWorktrees
232
244
  ? `You are working in an isolated git worktree. Focus only on this task. Do NOT commit your changes — the framework handles that.\n\n${task.prompt}`
233
245
  : task.prompt;
@@ -301,6 +313,17 @@ export class Swarm {
301
313
  throw nudgeErr;
302
314
  }
303
315
  }
316
+ // Simplify pass: resume session with review prompt
317
+ if (resumeSessionId && agent.status === "running") {
318
+ try {
319
+ this.log(id, "Simplify pass");
320
+ resumePrompt = SIMPLIFY_PROMPT;
321
+ await runOnce(true);
322
+ }
323
+ catch {
324
+ this.log(id, "Simplify pass skipped");
325
+ }
326
+ }
304
327
  if (agent.status === "running") {
305
328
  agent.finishedAt = Date.now();
306
329
  const duration = agent.finishedAt - (agent.startedAt || agent.finishedAt);
package/dist/types.d.ts CHANGED
@@ -133,13 +133,13 @@ export interface RunState {
133
133
  mergeStrategy: MergeStrategy;
134
134
  waveNum: number;
135
135
  currentTasks: Task[];
136
- lastWaveKind: "execute" | "reflect" | "think";
137
- reflectionBudgetUsed: number;
136
+ lastWaveKind: string;
137
+ overheadBudgetUsed: number;
138
138
  accCost: number;
139
139
  accCompleted: number;
140
140
  accFailed: number;
141
141
  branches: BranchRecord[];
142
- phase: "executing" | "steering" | "reflecting" | "capped" | "done";
142
+ phase: "executing" | "steering" | "reflecting" | "verifying" | "capped" | "done";
143
143
  startedAt: string;
144
144
  cwd: string;
145
145
  }
package/dist/ui.js CHANGED
@@ -39,14 +39,19 @@ export function renderFrame(swarm, showHotkeys = false) {
39
39
  chalk.gray(`${swarm.pending} queued`) +
40
40
  " " +
41
41
  chalk.gray(`\u23F1 ${fmtDur(Date.now() - swarm.startedAt)}`));
42
- // Stats line
42
+ // Stats line — show wave cost + overall if there's a base
43
43
  const tokIn = fmtTokens(swarm.totalInputTokens);
44
44
  const tokOut = fmtTokens(swarm.totalOutputTokens);
45
- const cost = swarm.totalCostUsd > 0
46
- ? chalk.yellow(`$${swarm.totalCostUsd.toFixed(3)}`)
47
- : "";
45
+ const waveCost = swarm.totalCostUsd;
46
+ const totalCost = swarm.baseCostUsd + waveCost;
47
+ let costStr = "";
48
+ if (totalCost > 0) {
49
+ costStr = swarm.baseCostUsd > 0
50
+ ? chalk.yellow(`$${waveCost.toFixed(3)}`) + chalk.dim(` / $${totalCost.toFixed(2)} total`)
51
+ : chalk.yellow(`$${waveCost.toFixed(3)}`);
52
+ }
48
53
  out.push(chalk.gray(` \u2191 ${tokIn} in \u2193 ${tokOut} out`) +
49
- (cost ? ` ${cost}` : ""));
54
+ (costStr ? ` ${costStr}` : ""));
50
55
  // ── Usage bar(s) — cycle through windows every 3s ──
51
56
  const windows = Array.from(swarm.rateLimitWindows.values());
52
57
  const rlPct = swarm.rateLimitUtilization;
@@ -82,10 +87,7 @@ export function renderFrame(swarm, showHotkeys = false) {
82
87
  label = chalk.red(`Waiting for reset ${mm > 0 ? `${mm}m ${ss}s` : `${ss}s`}`);
83
88
  }
84
89
  if (swarm.isUsingOverage && !swarm.cappedOut) {
85
- const budgetInfo = swarm.extraUsageBudget != null
86
- ? ` $${swarm.overageCostUsd.toFixed(2)}/$${swarm.extraUsageBudget}`
87
- : "";
88
- label += chalk.red(` [EXTRA USAGE${budgetInfo}]`);
90
+ label += chalk.red(" [EXTRA USAGE]");
89
91
  }
90
92
  const prefix = windowLabel ? chalk.dim(windowLabel.padEnd(6)) : chalk.dim("Usage ");
91
93
  out.push(` ${prefix}${barStr} ${label}`);
@@ -104,6 +106,23 @@ export function renderFrame(swarm, showHotkeys = false) {
104
106
  renderBar(rlPct);
105
107
  }
106
108
  }
109
+ // ── Extra usage budget bar ──
110
+ if (swarm.isUsingOverage && swarm.extraUsageBudget != null && swarm.extraUsageBudget > 0) {
111
+ const barW = Math.min(30, w - 40);
112
+ const pct = Math.min(1, swarm.overageCostUsd / swarm.extraUsageBudget);
113
+ const filled = Math.round(pct * barW);
114
+ let barStr = "";
115
+ for (let i = 0; i < barW; i++) {
116
+ if (i < filled)
117
+ barStr += pct > 0.9 ? chalk.red("\u2588") : pct > 0.75 ? chalk.yellow("\u2588") : chalk.magenta("\u2588");
118
+ else
119
+ barStr += chalk.gray("\u2591");
120
+ }
121
+ const label = swarm.cappedOut
122
+ ? chalk.red(`$${swarm.overageCostUsd.toFixed(2)}/$${swarm.extraUsageBudget} — budget hit`)
123
+ : `$${swarm.overageCostUsd.toFixed(2)}/$${swarm.extraUsageBudget}`;
124
+ out.push(` ${chalk.dim("Extra ")}${barStr} ${label}`);
125
+ }
107
126
  out.push("");
108
127
  // ── Agent table ──
109
128
  const running = swarm.agents.filter((a) => a.status === "running");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-overnight",
3
- "version": "1.3.0",
3
+ "version": "1.5.1",
4
4
  "description": "Run 10, 100, or 1000 Claude agents overnight. Parallel autonomous AI coding with thinking waves, iterative quality steering, crash recovery, and rate limit handling. Built on the Claude Agent SDK.",
5
5
  "type": "module",
6
6
  "bin": {