npm - claude-overnight - Versions diffs - 1.3.0 → 1.5.1 - Mend

claude-overnight 1.3.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md CHANGED Viewed

@@ -49,6 +49,7 @@ claude-overnight
 ◆ Thinking: 5 agents exploring...         ← architects analyze your codebase
 ◆ Orchestrating plan...                   ← synthesizes 50 concrete tasks
 ◆ Wave 1 · 50 tasks · $4.20 spent        ← fully autonomous from here
+  ↑ 1.2M in  ↓ 340K out  $4.20 / $4.24 total
 ◆ Assessing... how close to amazing?
 ◆ Wave 2 · 30 tasks · $18.50 spent       ← improvements from assessment
 ◆ Reflection: 2 agents reviewing          ← deep quality audit
@@ -70,7 +71,9 @@ An orchestrator agent reads all design documents and synthesizes concrete execut
 ### 3. Iterative execution
-Tasks run in parallel (each agent in its own git worktree). After each wave, steering assesses: "how good is this?" — not "what's missing?" It can:
+Tasks run in parallel (each agent in its own git worktree). After completing its task, each agent automatically runs a **simplify pass** — reviewing its own `git diff` for code reuse opportunities, quality issues, and inefficiencies, then fixing them before the framework commits.
+After each wave, steering assesses: "how good is this?" — not "what's missing?" It can:
 - **Execute** more tasks to build features, fix bugs, polish UX
 - **Reflect** by spinning up 1-2 review agents for deep quality/architecture audits
@@ -221,6 +224,8 @@ Changes take effect between waves — active agents finish their current task.
 The usage bar cycles through all rate limit windows (5h, 7d, etc.) every 3 seconds, showing utilization per window. Usage info is shown during all phases — thinking, orchestration, steering, and execution.
+When using extra usage with a budget, a dedicated progress bar shows spend vs limit with color-coded fill (magenta → yellow → red).
 ## Rate limits
 Built for unattended runs lasting hours or days.

package/dist/index.js CHANGED Viewed

@@ -7,7 +7,7 @@ import { createInterface } from "readline";
 import chalk from "chalk";
 import { query } from "@anthropic-ai/claude-agent-sdk";
 import { Swarm } from "./swarm.js";
-import { planTasks, refinePlan, detectModelTier, steerWave, identifyThemes, buildThinkingTasks, buildReflectionTasks, orchestrate } from "./planner.js";
+import { planTasks, refinePlan, detectModelTier, steerWave, identifyThemes, buildThinkingTasks, orchestrate } from "./planner.js";
 import { startRenderLoop, renderSummary } from "./ui.js";
 // ── CLI flag parsing ──
 function parseCliFlags(argv) {
@@ -295,6 +295,7 @@ function readRunMemory(runDir, previousRuns) {
     return {
         designs: readMdDir(join(runDir, "designs")),
         reflections: readMdDir(join(runDir, "reflections")),
+        verifications: readMdDir(join(runDir, "verifications")),
         milestones: readMdDir(join(runDir, "milestones")),
         status,
         goal,
@@ -385,6 +386,7 @@ function createRunDir(rootDir) {
     const runDir = join(rootDir, "runs", ts);
     mkdirSync(join(runDir, "designs"), { recursive: true });
     mkdirSync(join(runDir, "reflections"), { recursive: true });
+    mkdirSync(join(runDir, "verifications"), { recursive: true });
     mkdirSync(join(runDir, "milestones"), { recursive: true });
     mkdirSync(join(runDir, "sessions"), { recursive: true });
     return runDir;
@@ -1079,7 +1081,7 @@ async function main() {
     let accIn = 0, accOut = 0;
     let lastCapped = false, lastAborted = false, objectiveComplete = false;
     let lastWaveKind;
-    let reflectionBudgetUsed;
+    let overheadBudgetUsed;
     const branches = [];
     if (resuming && resumeState) {
         // Restore ALL config from saved state
@@ -1091,7 +1093,7 @@ async function main() {
         accFailed = resumeState.accFailed;
         accTools = 0;
         lastWaveKind = resumeState.lastWaveKind;
-        reflectionBudgetUsed = resumeState.reflectionBudgetUsed;
+        overheadBudgetUsed = resumeState.overheadBudgetUsed ?? (resumeState.reflectionBudgetUsed ?? 0) + (resumeState.verificationBudgetUsed ?? 0);
         branches.push(...resumeState.branches);
         objective = resumeState.objective;
         workerModel = resumeState.workerModel;
@@ -1121,11 +1123,11 @@ async function main() {
         accIn = thinkingIn;
         accOut = thinkingOut;
         lastWaveKind = "execute";
-        reflectionBudgetUsed = 0;
+        overheadBudgetUsed = 0;
     }
     liveConfig.remaining = remaining;
     liveConfig.usageCap = usageCap;
-    const maxReflectionBudget = Math.max(2, Math.ceil((budget ?? 10) * 0.05));
+    const maxOverheadBudget = Math.max(4, Math.ceil((budget ?? 10) * 0.15));
     // For flex + branch strategy: create one target branch, waves merge via yolo into it
     let runBranch;
     let originalRef;
@@ -1168,6 +1170,7 @@ async function main() {
         const swarm = new Swarm({
             tasks: currentTasks, concurrency, cwd, model: workerModel, permissionMode, allowedTools,
             useWorktrees, mergeStrategy: waveMerge, agentTimeoutMs, usageCap, allowExtraUsage, extraUsageBudget,
+            baseCostUsd: accCost,
         });
         currentSwarm = swarm;
         const stopRender = startRenderLoop(swarm, liveConfig);
@@ -1210,7 +1213,7 @@ async function main() {
             id: `run-${new Date().toISOString().slice(0, 19)}`, objective: objective, budget: budget ?? tasks.length,
             remaining, workerModel, plannerModel, concurrency, permissionMode,
             usageCap, allowExtraUsage, extraUsageBudget, flex, useWorktrees, mergeStrategy, waveNum, currentTasks,
-            lastWaveKind, reflectionBudgetUsed, accCost, accCompleted, accFailed,
+            lastWaveKind, overheadBudgetUsed, accCost, accCompleted, accFailed,
             branches, phase: "steering", startedAt: new Date(runStartedAt).toISOString(), cwd,
         });
         waveHistory.push({
@@ -1225,11 +1228,10 @@ async function main() {
         });
         if (!flex || remaining <= 0 || swarm.aborted || swarm.cappedOut)
             break;
-        // ── Steer: assess quality and decide next action ──
-        // May loop through reflect→re-steer cycles before producing execution tasks
-        let steerDone = false;
+        // ── Steer: assess and compose the next wave ──
+        let steered = false;
         let steerAttempts = 0;
-        while (!steerDone && remaining > 0 && !stopping && steerAttempts < 4) {
+        while (!steered && remaining > 0 && !stopping && steerAttempts < 3) {
             steerAttempts++;
             console.log(chalk.cyan(`\n  ◆ Assessing...\n`));
             process.stdout.write("\x1B[?25l");
@@ -1238,81 +1240,46 @@ async function main() {
                 const steer = await steerWave(objective, waveHistory, remaining, cwd, plannerModel, workerModel, permissionMode, concurrency, makeProgressLog(), memory);
                 process.stdout.write(`\x1B[2K\r`);
                 process.stdout.write("\x1B[?25h");
-                // Persist context layers
                 if (steer.statusUpdate)
                     writeStatus(runDir, steer.statusUpdate);
                 if (steer.goalUpdate) {
                     writeGoalUpdate(runDir, steer.goalUpdate);
                     console.log(chalk.dim(`  Goal refined: ${steer.goalUpdate.slice(0, 100)}\n`));
                 }
-                // Archive milestone every ~5 execution waves
                 const execWaves = waveHistory.filter(w => w.kind === "execute").length;
                 if (execWaves > 0 && execWaves % 5 === 0)
                     archiveMilestone(runDir, waveNum);
-                if (steer.done || steer.action === "done") {
-                    console.log(chalk.green(`  \u2713 ${steer.reasoning}\n`));
-                    steerDone = true;
-                    objectiveComplete = true;
-                    remaining = 0; // exit outer loop too
-                    break;
-                }
-                if (steer.action === "reflect") {
-                    // Safety: no consecutive reflections, budget cap
-                    const canReflect = lastWaveKind !== "reflect" && reflectionBudgetUsed + 2 <= maxReflectionBudget;
-                    if (!canReflect) {
+                if (steer.done || steer.tasks.length === 0) {
+                    const hasVerification = waveHistory.some(w => w.kind.includes("verif"));
+                    if (!hasVerification && remaining >= 1) {
                         console.log(chalk.dim(`  ${steer.reasoning}`));
-                        console.log(chalk.yellow(`  Reflection skipped (${lastWaveKind === "reflect" ? "consecutive" : "budget cap"}) — re-assessing\n`));
-                        lastWaveKind = "execute"; // allow next steer to see non-reflect
-                        continue; // re-steer in this inner loop
-                    }
-                    // Run reflection wave
-                    console.log(chalk.dim(`  ${steer.reasoning}`));
-                    console.log(chalk.cyan(`\n  ◆ Reflection: 2 agents reviewing...\n`));
-                    const reflectionDir = join(runDir, "reflections");
-                    waveNum++;
-                    const reflTasks = buildReflectionTasks(objective, memory.goal, reflectionDir, waveNum, plannerModel);
-                    const reflSwarm = new Swarm({
-                        tasks: reflTasks, concurrency: 2, cwd,
-                        model: plannerModel, permissionMode,
-                        useWorktrees: false, mergeStrategy: "yolo",
-                        agentTimeoutMs, usageCap, allowExtraUsage, extraUsageBudget,
-                    });
-                    currentSwarm = reflSwarm;
-                    const stopReflRender = startRenderLoop(reflSwarm, liveConfig);
-                    try {
-                        await reflSwarm.run();
-                    }
-                    finally {
-                        stopReflRender();
+                        console.log(chalk.yellow(`  Done blocked — verification required before completion\n`));
+                        lastWaveKind = "done-blocked";
+                        continue; // re-steer — steerer will see the hint
                     }
-                    console.log(renderSummary(reflSwarm));
-                    accCost += reflSwarm.totalCostUsd;
-                    accIn += reflSwarm.totalInputTokens;
-                    accOut += reflSwarm.totalOutputTokens;
-                    accCompleted += reflSwarm.completed;
-                    accFailed += reflSwarm.failed;
-                    accTools += reflSwarm.agents.reduce((sum, a) => sum + a.toolCalls, 0);
-                    remaining -= reflSwarm.completed + reflSwarm.failed;
-                    reflectionBudgetUsed += reflSwarm.completed + reflSwarm.failed;
-                    waveHistory.push({
-                        wave: waveNum,
-                        kind: "reflect",
-                        tasks: reflSwarm.agents.map(a => ({ prompt: a.task.prompt, status: a.status, filesChanged: a.filesChanged, error: a.error })),
-                    });
-                    lastWaveKind = "reflect";
-                    continue; // re-steer with reflection artifacts
-                }
-                // action === "execute"
-                if (steer.tasks.length === 0) {
                     console.log(chalk.green(`  \u2713 ${steer.reasoning}\n`));
                     objectiveComplete = true;
                     remaining = 0;
                     break;
                 }
+                const isOverhead = steer.waveKind !== "execute";
+                if (isOverhead && overheadBudgetUsed + steer.tasks.length > maxOverheadBudget) {
+                    console.log(chalk.dim(`  ${steer.reasoning}`));
+                    console.log(chalk.yellow(`  Overhead budget exhausted (${overheadBudgetUsed}/${maxOverheadBudget}) — re-assessing\n`));
+                    lastWaveKind = "overhead-capped";
+                    continue; // re-steer
+                }
                 console.log(chalk.dim(`  ${steer.reasoning}\n`));
-                currentTasks = steer.tasks;
-                lastWaveKind = "execute";
-                steerDone = true; // exit inner loop, outer loop runs the tasks
+                // Resolve model aliases: "planner" → plannerModel, "worker" → workerModel
+                currentTasks = steer.tasks.map(t => ({
+                    ...t,
+                    model: t.model === "planner" ? plannerModel : t.model === "worker" ? workerModel
+                        : isOverhead && !t.model ? plannerModel : t.model,
+                }));
+                lastWaveKind = steer.waveKind;
+                if (isOverhead)
+                    overheadBudgetUsed += currentTasks.length;
+                steered = true;
             }
             catch (err) {
                 process.stdout.write("\x1B[?25h");
@@ -1330,7 +1297,7 @@ async function main() {
         id: `run-${new Date().toISOString().slice(0, 19)}`, objective: objective ?? "", budget: budget ?? tasks.length,
         remaining, workerModel, plannerModel, concurrency, permissionMode,
         usageCap, allowExtraUsage, extraUsageBudget, flex, useWorktrees, mergeStrategy, waveNum, currentTasks: [],
-        lastWaveKind, reflectionBudgetUsed, accCost, accCompleted, accFailed,
+        lastWaveKind, overheadBudgetUsed, accCost, accCompleted, accFailed,
         branches, phase: finalPhase, startedAt: new Date(runStartedAt).toISOString(), cwd,
     });
     if (trulyDone) {
@@ -1364,8 +1331,8 @@ async function main() {
     boxLines.push(`${elapsedStr} · ${fmtTokens(accIn)} in / ${fmtTokens(accOut)} out · ${accTools} tools`);
     if (totalMerged > 0 || totalConflicts > 0)
         boxLines.push(`${totalMerged} merged${totalConflicts > 0 ? ` · ${totalConflicts} conflicts` : ""}`);
-    if (reflectionBudgetUsed > 0)
-        boxLines.push(`${reflectionBudgetUsed} reflection agents`);
+    if (overheadBudgetUsed > 0)
+        boxLines.push(`${overheadBudgetUsed} overhead agents (review/verify/explore)`);
     if (lastCapped)
         boxLines.push(chalk.yellow(`Capped at ${usageCap != null ? Math.round(usageCap * 100) : 100}%`));
     const boxW = Math.max(...boxLines.map(l => l.replace(/\x1B\[[0-9;]*m/g, "").length)) + 4;

package/dist/planner.d.ts CHANGED Viewed

@@ -10,7 +10,7 @@ export interface PlannerRateLimitInfo {
 }
 export interface WaveSummary {
     wave: number;
-    kind: "execute" | "reflect" | "think";
+    kind: string;
     tasks: {
         prompt: string;
         status: string;
@@ -20,15 +20,16 @@ export interface WaveSummary {
 }
 export interface SteerResult {
     done: boolean;
-    action: "execute" | "reflect" | "done";
     tasks: Task[];
     reasoning: string;
+    waveKind: string;
     goalUpdate?: string;
     statusUpdate?: string;
 }
 export interface RunMemory {
     designs: string;
     reflections: string;
+    verifications: string;
     milestones: string;
     status: string;
     goal: string;
@@ -40,7 +41,6 @@ export declare function getPlannerRateLimitInfo(): PlannerRateLimitInfo;
 export declare function planTasks(objective: string, cwd: string, plannerModel: string, workerModel: string, permissionMode: PermMode, budget: number | undefined, concurrency: number, onLog: (text: string) => void, flexNote?: string, outFile?: string): Promise<Task[]>;
 export declare function identifyThemes(objective: string, count: number, model: string, permissionMode: PermMode, onLog?: (text: string) => void): Promise<string[]>;
 export declare function buildThinkingTasks(objective: string, themes: string[], designDir: string, plannerModel: string, previousKnowledge?: string): Task[];
-export declare function buildReflectionTasks(objective: string, goal: string, reflectionDir: string, waveNum: number, plannerModel: string): Task[];
 export declare function orchestrate(objective: string, designDocs: string, cwd: string, plannerModel: string, workerModel: string, permissionMode: PermMode, budget: number, concurrency: number, onLog: (text: string) => void, flexNote?: string, outFile?: string): Promise<Task[]>;
 export declare function refinePlan(objective: string, previousTasks: Task[], feedback: string, cwd: string, plannerModel: string, workerModel: string, permissionMode: PermMode, budget: number | undefined, concurrency: number, onLog: (text: string) => void): Promise<Task[]>;
 export declare function steerWave(objective: string, history: WaveSummary[], remainingBudget: number, cwd: string, plannerModel: string, workerModel: string, permissionMode: PermMode, concurrency: number, onLog: (text: string) => void, runMemory?: RunMemory): Promise<SteerResult>;

package/dist/planner.js CHANGED Viewed

@@ -1,6 +1,18 @@
 import { query } from "@anthropic-ai/claude-agent-sdk";
 import { readFileSync } from "fs";
 import { NudgeError } from "./types.js";
+// The core framing for all planning. Not a checklist — a way of thinking.
+const DESIGN_THINKING = `
+HOW TO THINK ABOUT EVERY TASK:
+Start from the user's job. What is someone hiring this product to do? "I need to send money abroad cheaply" — not "I need a currency conversion API." Every decision — what to build, how fast it responds, what happens on error — flows from the job.
+The experience IS the product. A 200ms server response is not a "performance metric" — it's the difference between an app that feels alive and one that feels broken. A loading state is not "polish" — it's the user knowing the app heard them. An error message is not "error handling" — it's the app being honest. There is no line between backend and UX. The server, the API, the database query, the render — they're all one experience the user either trusts or doesn't.
+Build the core, verify it works, learn, iterate. Don't plan 20 features and build them all. Build the ONE thing that matters most, run it, see if it actually works from a user's chair. What you learn from seeing it run will change what you build next. Each wave should make what exists better before adding what doesn't exist yet.
+Consistency is what makes complex things feel simple. One design system, rigid rules, no exceptions. This is how Revolut ships a super-app with 30+ features that doesn't feel like chaos.
+`;
 const NUDGE_MS = 15 * 60 * 1000; // 15 min — close & restart with "continue"
 const HARD_TIMEOUT_MS = 30 * 60 * 1000; // 30 min — give up
 export function detectModelTier(model) {
@@ -412,17 +424,20 @@ export function buildThinkingTasks(objective, themes, designDir, plannerModel, p
 OVERALL OBJECTIVE: ${objective}
 ${prevBlock}
 YOUR FOCUS: ${theme}
+${DESIGN_THINKING}
 Explore the codebase thoroughly using Read, Glob, and Grep. Then write a design document to ${designDir}/focus-${i}.md with these sections:
 ## Findings
 Key files, patterns, and architecture you discovered. Cite specific file paths and function names.
+## The Job
+What is someone hiring this product to do? Not the feature — the outcome. Frame everything below through this lens.
 ## Proposed Work Items
 For each item:
 - **What**: What to build or change
 - **Where**: Specific file paths
-- **Why**: Why this matters
+- **Why**: How this serves the job — including how fast it needs to respond and what happens when it fails
 - **Risk**: Conflicts or complications
 ## Key Files
@@ -432,44 +447,6 @@ Be thorough — your findings drive the execution plan.`,
         model: plannerModel,
     }));
 }
-export function buildReflectionTasks(objective, goal, reflectionDir, waveNum, plannerModel) {
-    const goalBlock = goal ? `\nEVOLVED GOAL:\n${goal}\n` : "";
-    return [
-        {
-            id: "review-0",
-            prompt: `You are a senior code reviewer performing a deep quality audit.
-OBJECTIVE: ${objective}
-${goalBlock}
-Read the codebase thoroughly. Assess:
-- **Correctness**: Bugs, missing error handling, broken flows?
-- **Architecture**: Clean design? Unnecessary or missing abstractions?
-- **Code quality**: Readability, naming, duplication, dead code?
-- **Completeness**: What's missing vs. the objective? Half-done work?
-- **Polish**: Edge cases, error messages, loading states?
-Write findings to ${reflectionDir}/wave-${waveNum}-quality.md.
-End with a ## Verdict: is this closer to "good enough" or "amazing"? What would make the biggest difference?`,
-            model: plannerModel,
-        },
-        {
-            id: "review-1",
-            prompt: `You are a UX and integration reviewer.
-OBJECTIVE: ${objective}
-${goalBlock}
-Read the codebase. Assess:
-- **UX coherence**: Do user-facing flows make sense end-to-end? Consistent experience?
-- **Integration**: Do pieces fit together? Seams, inconsistencies, broken contracts?
-- **Testing**: Meaningful coverage? Testing the right things?
-- **Gaps**: Unhandled use cases? What would surprise a user?
-Write findings to ${reflectionDir}/wave-${waveNum}-ux.md.
-End with ## Priorities: rank the top 3 things that would most improve the result.`,
-            model: plannerModel,
-        },
-    ];
-}
 export async function orchestrate(objective, designDocs, cwd, plannerModel, workerModel, permissionMode, budget, concurrency, onLog, flexNote, outFile) {
     const capability = modelCapabilityBlock(workerModel);
     const flexLine = flexNote ? `\n\n${flexNote}` : "";
@@ -483,7 +460,7 @@ Your architects explored the codebase and found:
 ${designDocs}
 AGENT CAPABILITY: ${capability}
+${DESIGN_THINKING}
 Create exactly ~${budget} concrete execution tasks based on these findings.
 Requirements:
@@ -492,7 +469,8 @@ Requirements:
 - ${concurrency} agents run in parallel — tasks must touch DIFFERENT files
 - Trust the research — don't tell agents to re-explore what's documented
 - Reference specific files and patterns from the findings
-- Priority order: foundational first, polish last${flexLine}
+- Build the core user job first, then expand. Each task should produce something complete and usable — not scaffolding for later
+- There is no separate "polish" phase. Loading states, error handling, sub-200ms responses, and edge cases are part of every task${flexLine}
 Respond with ONLY a JSON object (no markdown fences):
 {"tasks": [{"prompt": "..."}]}${fileInstruction}`;
@@ -655,24 +633,25 @@ async function extractTaskJson(raw, retry, onLog, outFile) {
 // ── Wave steering ──
 export async function steerWave(objective, history, remainingBudget, cwd, plannerModel, workerModel, permissionMode, concurrency, onLog, runMemory) {
     const capability = modelCapabilityBlock(workerModel);
-    // Three-layer context: status (current), milestones (strategic), recent waves (tactical)
     const recentWaves = history.slice(-3);
     const recentText = recentWaves.length > 0 ? recentWaves.map(w => {
-        const tag = w.kind === "reflect" ? " (reflection)" : w.kind === "think" ? " (thinking)" : "";
         const lines = w.tasks.map(t => {
             const files = t.filesChanged ? ` (${t.filesChanged} files)` : "";
             const err = t.error ? ` — ${t.error}` : "";
             return `  - [${t.status}] ${t.prompt.slice(0, 120)}${files}${err}`;
         }).join("\n");
-        return `Wave ${w.wave + 1}${tag}:\n${lines}`;
+        return `Wave ${w.wave + 1} (${w.kind}):\n${lines}`;
     }).join("\n\n") : "(first wave)";
-    const lastWasReflection = history.length > 0 && history[history.length - 1].kind === "reflect";
-    const noReflectHint = lastWasReflection ? `\nIMPORTANT: The previous wave was a reflection. You MUST choose "execute" or "done" — not "reflect" again.\n` : "";
+    const lastKind = history.length > 0 ? history[history.length - 1].kind : "";
+    const repeatHint = lastKind && lastKind !== "execute"
+        ? `\nThe previous wave was "${lastKind}". Don't repeat the same wave kind unless you have a strong reason.\n`
+        : "";
     const cap = (s, max) => s.length > max ? s.slice(0, max) + "\n...(truncated)" : s;
     const statusBlock = runMemory?.status ? `\nCurrent project status:\n${runMemory.status}\n` : "";
     const milestoneBlock = runMemory?.milestones ? `\nMilestone snapshots:\n${cap(runMemory.milestones, 4000)}\n` : "";
     const designBlock = runMemory?.designs ? `\nArchitectural research:\n${cap(runMemory.designs, 4000)}\n` : "";
     const reflectionBlock = runMemory?.reflections ? `\nLatest quality reports:\n${cap(runMemory.reflections, 3000)}\n` : "";
+    const verificationBlock = runMemory?.verifications ? `\nVerification results (from actually running the app):\n${cap(runMemory.verifications, 3000)}\n` : "";
     const goalBlock = runMemory?.goal ? `\nNorth star — what "amazing" means:\n${runMemory.goal}\n` : "";
     const prevRunBlock = runMemory?.previousRuns ? `\nKnowledge from previous runs:\n${cap(runMemory.previousRuns, 3000)}\n` : "";
     const prompt = `You are the quality director for an autonomous multi-wave agent system. Your job is to push the work toward "amazing," not just "done."
@@ -681,38 +660,63 @@ Objective: ${objective}
 ${goalBlock}${statusBlock}${milestoneBlock}${prevRunBlock}
 Recent waves:
 ${recentText}
-${designBlock}${reflectionBlock}
+${designBlock}${reflectionBlock}${verificationBlock}
 Remaining budget: ${remainingBudget} agent sessions. ${concurrency} agents run in parallel — tasks must touch DIFFERENT files.
 ${capability}
+${DESIGN_THINKING}
 Total waves completed: ${history.length}
-Read the codebase. Assess: how close is this to the VISION? Not "what's missing" — "how good is what we built?"
+Read the codebase. Assess from the user's chair: does this product do the job someone would hire it for? Does it feel fast, honest, and trustworthy? Not "is the code clean" — "would I use this?"
+If verification found issues, those are the priority. Fix what's broken before building what's missing. Iterate on what exists before expanding scope.
+## Compose the next wave
+You have full creative freedom. Design the wave that will have the highest impact right now. Here are archetypes to draw from — mix, adapt, or invent your own:
-Then choose ONE action:
+**Execute** — Agents implement concrete changes in parallel. Each touches different files. The bread and butter.
+  Example: 5 agents each owning a different feature or fix
-**"reflect"** — Spin up 1-2 review agents for a deep quality audit. Choose when:
-  - Substantial new code shipped and hasn't been reviewed
-  - You're unsure about quality and need expert eyes
-  - A subsystem just "completed" and deserves verification
+**Explore** — Multiple agents independently tackle the same problem from different angles. Each writes a design/approach to a separate file. Use when you need creative alternatives before committing.
+  Example: 3 agents each design a different navigation approach, writing to designs/nav-{approach}.md
-**"execute"** — Plan the next batch of tasks. Choose when:
-  - You know what needs doing (from reviews or your own assessment)
-  - There are clear gaps, bugs, or improvements to make
+**Critique** — Agents review what exists as skeptical experts. They read the codebase and write findings to files. Use after substantial new code ships.
+  Example: 1 code quality reviewer, 1 UX reviewer examining flows end-to-end
-**"done"** — The objective is met at high quality. Choose when:
-  - The code works correctly and handles edge cases
-  - The architecture is clean and pieces fit together
-  - Further work would be diminishing returns
-${noReflectHint}
+**Synthesize** — An agent reads multiple alternatives or review findings and makes a decision. Writes the chosen approach or prioritized fix list.
+  Example: 1 agent reads 3 design docs and writes the implementation plan
+**Verify** — Agents actually RUN the application: build it, start it, navigate it, click things, try edge cases. They report what works and what's broken. Not code reading — real testing.
+  Example: 1 agent does end-to-end QA, writing a report with reproduction steps
+**User-test** — Agents emulate specific user personas interacting with the product. "First-time user who just downloaded this." "Power user trying to do X fast." They test from that perspective and report friction.
+  Example: 2 agents, one new user, one power user, each writing a report
+**Polish** — Agents focus purely on feel: loading states, error messages, micro-interactions, empty states, responsiveness. Not features — the texture that makes users trust the product.
+  Example: 2 agents, one on happy paths, one on error/edge states
+You can combine these. A wave can have 3 execute agents + 1 verification agent. Or 2 divergent explorers. Whatever the situation calls for.
+For non-execute tasks (critique, verify, user-test, synthesize), tell agents to write their output to files in the run directory so findings persist for future waves. Use paths like: .claude-overnight/latest/reflections/wave-N-{topic}.md or .claude-overnight/latest/verifications/wave-N-{topic}.md.
+IMPORTANT: You cannot declare "done" unless at least one verification wave has confirmed the app works. If you're considering done but haven't verified, compose a verification wave first.
+${repeatHint}
 Respond with ONLY a JSON object (no markdown fences):
 {
-  "action": "execute" | "reflect" | "done",
-  "done": true/false,
-  "reasoning": "your assessment and why you chose this action",
+  "done": false,
+  "waveKind": "execute",
+  "reasoning": "your assessment and why you chose this wave composition",
   "goalUpdate": "optional — refine what 'amazing' means as you learn more",
-  "statusUpdate": "REQUIRED — write a concise project status: what's built, what works, what's rough, quality level, key gaps. This replaces the previous status and is your memory for future waves.",
-  "tasks": [{"prompt": "..."}]
-}`;
+  "statusUpdate": "REQUIRED — concise project status: what's built, what works, what's rough, quality level, key gaps. This replaces the previous status.",
+  "tasks": [
+    {"prompt": "task instruction...", "model": "worker"},
+    {"prompt": "review task...", "model": "planner"}
+  ]
+}
+The "model" field on each task: use "worker" (${workerModel}) for implementation tasks, "planner" (${plannerModel}) for review/analysis/verification tasks. Default is "worker".
+If done: {"done": true, "waveKind": "done", "reasoning": "...", "statusUpdate": "...", "tasks": []}`;
     onLog("Assessing...");
     const resultText = await runPlannerQuery(prompt, { cwd, model: plannerModel, permissionMode }, onLog);
     const parsed = await (async () => {
@@ -720,21 +724,20 @@ Respond with ONLY a JSON object (no markdown fences):
         if (first)
             return first;
         onLog("Retrying...");
-        const retryText = await runPlannerQuery(`Your previous response was not valid JSON. Respond with ONLY a JSON object {"action":"execute"|"reflect"|"done","done":true/false,"reasoning":"...","tasks":[{"prompt":"..."}]}.\n\n${prompt}`, { cwd, model: plannerModel, permissionMode }, onLog);
-        return attemptJsonParse(retryText) ?? { action: "done", done: true, reasoning: "Could not parse steering response" };
+        const retryText = await runPlannerQuery(`Your previous response was not valid JSON. Respond with ONLY a JSON object {"done":false,"waveKind":"execute","reasoning":"...","statusUpdate":"...","tasks":[{"prompt":"..."}]}.\n\n${prompt}`, { cwd, model: plannerModel, permissionMode }, onLog);
+        return attemptJsonParse(retryText) ?? { done: true, waveKind: "done", reasoning: "Could not parse steering response" };
     })();
-    const action = parsed.action || (parsed.done ? "done" : "execute");
+    const isDone = parsed.done === true;
+    const waveKind = parsed.waveKind || parsed.action || (isDone ? "done" : "execute");
     const statusUpdate = parsed.statusUpdate || undefined;
-    if (action === "done") {
-        return { done: true, action: "done", tasks: [], reasoning: parsed.reasoning || "Objective complete", goalUpdate: parsed.goalUpdate, statusUpdate };
-    }
-    if (action === "reflect") {
-        return { done: false, action: "reflect", tasks: [], reasoning: parsed.reasoning || "Quality audit needed", goalUpdate: parsed.goalUpdate, statusUpdate };
+    if (isDone) {
+        return { done: true, tasks: [], reasoning: parsed.reasoning || "Objective complete", waveKind: "done", goalUpdate: parsed.goalUpdate, statusUpdate };
     }
     let tasks = (parsed.tasks || []).map((t, i) => ({
         id: String(i),
         prompt: typeof t === "string" ? t : t.prompt,
+        ...(t.model && { model: t.model }),
     }));
     tasks = postProcess(tasks, remainingBudget, onLog);
-    return { done: tasks.length === 0, action: tasks.length === 0 ? "done" : "execute", tasks, reasoning: parsed.reasoning || "", goalUpdate: parsed.goalUpdate, statusUpdate };
+    return { done: tasks.length === 0, tasks, reasoning: parsed.reasoning || "", waveKind: tasks.length === 0 ? "done" : waveKind, goalUpdate: parsed.goalUpdate, statusUpdate };
 }

package/dist/swarm.d.ts CHANGED Viewed

@@ -16,6 +16,8 @@ export interface SwarmConfig {
     allowExtraUsage?: boolean;
     /** Max $ to spend on extra usage before stopping. Only applies when allowExtraUsage is true. */
     extraUsageBudget?: number;
+    /** Cost from previous waves — lets the UI show an accurate running total. */
+    baseCostUsd?: number;
 }
 export interface MergeResult {
     branch: string;
@@ -64,6 +66,7 @@ export declare class Swarm {
     usageCap: number | undefined;
     readonly allowExtraUsage: boolean;
     readonly extraUsageBudget: number | undefined;
+    readonly baseCostUsd: number;
     constructor(config: SwarmConfig);
     get active(): number;
     get pending(): number;

package/dist/swarm.js CHANGED Viewed

@@ -4,6 +4,15 @@ import { join } from "path";
 import { tmpdir } from "os";
 import { query } from "@anthropic-ai/claude-agent-sdk";
 import { NudgeError } from "./types.js";
+const SIMPLIFY_PROMPT = `You just finished your task. Now review and simplify your changes.
+Run \`git diff\` to see what you changed, then fix any issues:
+1. **Reuse**: Search the codebase — did you write something that already exists? Use existing utilities, helpers, patterns instead.
+2. **Quality**: Redundant state, copy-paste with slight variation, leaky abstractions, unnecessary wrappers/nesting, comments that narrate what the code does? Delete them.
+3. **Efficiency**: Redundant computations, sequential operations that could be parallel, unnecessary existence checks before operations, unbounded data structures, missing cleanup?
+Less code is better. Delete and simplify rather than add. Fix directly — no need to explain.`;
 export class Swarm {
     agents = [];
     logs = [];
@@ -41,6 +50,7 @@ export class Swarm {
     usageCap; // mutable — can be changed live
     allowExtraUsage;
     extraUsageBudget;
+    baseCostUsd;
     constructor(config) {
         if (!config.tasks.length) {
             throw new Error("SwarmConfig: tasks array must not be empty");
@@ -64,6 +74,7 @@ export class Swarm {
         this.usageCap = config.usageCap;
         this.allowExtraUsage = config.allowExtraUsage ?? false;
         this.extraUsageBudget = config.extraUsageBudget;
+        this.baseCostUsd = config.baseCostUsd ?? 0;
         this.queue = [...config.tasks];
         this.total = config.tasks.length;
     }
@@ -225,9 +236,10 @@ export class Swarm {
             try {
                 const perm = this.config.permissionMode ?? "auto";
                 let resumeSessionId;
+                let resumePrompt = "Continue. Complete the task.";
                 const runOnce = async (isResume) => {
                     const agentPrompt = isResume
-                        ? "Continue. Complete the task."
+                        ? resumePrompt
                         : this.config.useWorktrees
                             ? `You are working in an isolated git worktree. Focus only on this task. Do NOT commit your changes — the framework handles that.\n\n${task.prompt}`
                             : task.prompt;
@@ -301,6 +313,17 @@ export class Swarm {
                         throw nudgeErr;
                     }
                 }
+                // Simplify pass: resume session with review prompt
+                if (resumeSessionId && agent.status === "running") {
+                    try {
+                        this.log(id, "Simplify pass");
+                        resumePrompt = SIMPLIFY_PROMPT;
+                        await runOnce(true);
+                    }
+                    catch {
+                        this.log(id, "Simplify pass skipped");
+                    }
+                }
                 if (agent.status === "running") {
                     agent.finishedAt = Date.now();
                     const duration = agent.finishedAt - (agent.startedAt || agent.finishedAt);

package/dist/types.d.ts CHANGED Viewed

@@ -133,13 +133,13 @@ export interface RunState {
     mergeStrategy: MergeStrategy;
     waveNum: number;
     currentTasks: Task[];
-    lastWaveKind: "execute" | "reflect" | "think";
-    reflectionBudgetUsed: number;
+    lastWaveKind: string;
+    overheadBudgetUsed: number;
     accCost: number;
     accCompleted: number;
     accFailed: number;
     branches: BranchRecord[];
-    phase: "executing" | "steering" | "reflecting" | "capped" | "done";
+    phase: "executing" | "steering" | "reflecting" | "verifying" | "capped" | "done";
     startedAt: string;
     cwd: string;
 }

package/dist/ui.js CHANGED Viewed

@@ -39,14 +39,19 @@ export function renderFrame(swarm, showHotkeys = false) {
         chalk.gray(`${swarm.pending} queued`) +
         "  " +
         chalk.gray(`\u23F1 ${fmtDur(Date.now() - swarm.startedAt)}`));
-    // Stats line
+    // Stats line — show wave cost + overall if there's a base
     const tokIn = fmtTokens(swarm.totalInputTokens);
     const tokOut = fmtTokens(swarm.totalOutputTokens);
-    const cost = swarm.totalCostUsd > 0
-        ? chalk.yellow(`$${swarm.totalCostUsd.toFixed(3)}`)
-        : "";
+    const waveCost = swarm.totalCostUsd;
+    const totalCost = swarm.baseCostUsd + waveCost;
+    let costStr = "";
+    if (totalCost > 0) {
+        costStr = swarm.baseCostUsd > 0
+            ? chalk.yellow(`$${waveCost.toFixed(3)}`) + chalk.dim(` / $${totalCost.toFixed(2)} total`)
+            : chalk.yellow(`$${waveCost.toFixed(3)}`);
+    }
     out.push(chalk.gray(`  \u2191 ${tokIn} in  \u2193 ${tokOut} out`) +
-        (cost ? `  ${cost}` : ""));
+        (costStr ? `  ${costStr}` : ""));
     // ── Usage bar(s) — cycle through windows every 3s ──
     const windows = Array.from(swarm.rateLimitWindows.values());
     const rlPct = swarm.rateLimitUtilization;
@@ -82,10 +87,7 @@ export function renderFrame(swarm, showHotkeys = false) {
                 label = chalk.red(`Waiting for reset ${mm > 0 ? `${mm}m ${ss}s` : `${ss}s`}`);
             }
             if (swarm.isUsingOverage && !swarm.cappedOut) {
-                const budgetInfo = swarm.extraUsageBudget != null
-                    ? ` $${swarm.overageCostUsd.toFixed(2)}/$${swarm.extraUsageBudget}`
-                    : "";
-                label += chalk.red(` [EXTRA USAGE${budgetInfo}]`);
+                label += chalk.red(" [EXTRA USAGE]");
             }
             const prefix = windowLabel ? chalk.dim(windowLabel.padEnd(6)) : chalk.dim("Usage ");
             out.push(`  ${prefix}${barStr}  ${label}`);
@@ -104,6 +106,23 @@ export function renderFrame(swarm, showHotkeys = false) {
             renderBar(rlPct);
         }
     }
+    // ── Extra usage budget bar ──
+    if (swarm.isUsingOverage && swarm.extraUsageBudget != null && swarm.extraUsageBudget > 0) {
+        const barW = Math.min(30, w - 40);
+        const pct = Math.min(1, swarm.overageCostUsd / swarm.extraUsageBudget);
+        const filled = Math.round(pct * barW);
+        let barStr = "";
+        for (let i = 0; i < barW; i++) {
+            if (i < filled)
+                barStr += pct > 0.9 ? chalk.red("\u2588") : pct > 0.75 ? chalk.yellow("\u2588") : chalk.magenta("\u2588");
+            else
+                barStr += chalk.gray("\u2591");
+        }
+        const label = swarm.cappedOut
+            ? chalk.red(`$${swarm.overageCostUsd.toFixed(2)}/$${swarm.extraUsageBudget} — budget hit`)
+            : `$${swarm.overageCostUsd.toFixed(2)}/$${swarm.extraUsageBudget}`;
+        out.push(`  ${chalk.dim("Extra ")}${barStr}  ${label}`);
+    }
     out.push("");
     // ── Agent table ──
     const running = swarm.agents.filter((a) => a.status === "running");

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-overnight",
-  "version": "1.3.0",
+  "version": "1.5.1",
   "description": "Run 10, 100, or 1000 Claude agents overnight. Parallel autonomous AI coding with thinking waves, iterative quality steering, crash recovery, and rate limit handling. Built on the Claude Agent SDK.",
   "type": "module",
   "bin": {