npm - oh-my-opencode - Versions diffs - 3.5.4 → 3.5.6 - Mend

oh-my-opencode 3.5.4 → 3.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/agents/prometheus/identity-constraints.d.ts +1 -1
package/dist/agents/prometheus/plan-template.d.ts +1 -1
package/dist/agents/prometheus/system-prompt.d.ts +1 -1
package/dist/cli/index.js +15 -14
package/dist/hooks/todo-continuation-enforcer/constants.d.ts +2 -0
package/dist/hooks/todo-continuation-enforcer/types.d.ts +1 -0
package/dist/index.js +273 -248
package/dist/tools/delegate-task/constants.d.ts +1 -1
package/package.json +8 -8

package/dist/index.js CHANGED

@@ -4963,6 +4963,16 @@ Approach:
 - Draft with care
 - Polish for clarity and impact
 - Documentation, READMEs, articles, technical writing
+ANTI-AI-SLOP RULES (NON-NEGOTIABLE):
+- NEVER use em dashes (\u2014) or en dashes (\u2013). Use commas, periods, ellipses, or line breaks instead. Zero tolerance.
+- Remove AI-sounding phrases: "delve", "it's important to note", "I'd be happy to", "certainly", "please don't hesitate", "leverage", "utilize", "in order to", "moving forward", "circle back", "at the end of the day", "robust", "streamline", "facilitate"
+- Pick plain words. "Use" not "utilize". "Start" not "commence". "Help" not "facilitate".
+- Use contractions naturally: "don't" not "do not", "it's" not "it is".
+- Vary sentence length. Don't make every sentence the same length.
+- NEVER start consecutive sentences with the same word.
+- No filler openings: skip "In today's world...", "As we all know...", "It goes without saying..."
+- Write like a human, not a corporate template.
 </Category_Context>`, DEEP_CATEGORY_PROMPT_APPEND = `<Category_Context>
 You are working on GOAL-ORIENTED AUTONOMOUS tasks.
@@ -5238,14 +5248,14 @@ WHY THIS FORMAT IS MANDATORY:
 `, PLAN_AGENT_NAMES, PLAN_FAMILY_NAMES;
 var init_constants = __esm(() => {
   DEFAULT_CATEGORIES = {
-    "visual-engineering": { model: "google/gemini-3-pro" },
+    "visual-engineering": { model: "google/gemini-3-pro", variant: "high" },
     ultrabrain: { model: "openai/gpt-5.3-codex", variant: "xhigh" },
     deep: { model: "openai/gpt-5.3-codex", variant: "medium" },
     artistry: { model: "google/gemini-3-pro", variant: "high" },
     quick: { model: "anthropic/claude-haiku-4-5" },
     "unspecified-low": { model: "anthropic/claude-sonnet-4-5" },
     "unspecified-high": { model: "anthropic/claude-opus-4-6", variant: "max" },
-    writing: { model: "google/gemini-3-flash" }
+    writing: { model: "kimi-for-coding/k2p5" }
   };
   CATEGORY_PROMPT_APPENDS = {
     "visual-engineering": VISUAL_CATEGORY_PROMPT_APPEND,
@@ -12230,6 +12240,8 @@ var TOAST_DURATION_MS = 900;
 var COUNTDOWN_GRACE_PERIOD_MS = 500;
 var ABORT_WINDOW_MS = 3000;
 var CONTINUATION_COOLDOWN_MS = 30000;
+var MAX_CONSECUTIVE_FAILURES = 5;
+var FAILURE_RESET_WINDOW_MS = 5 * 60 * 1000;
 // src/hooks/todo-continuation-enforcer/handler.ts
 init_logger();
@@ -12454,11 +12466,14 @@ ${todoList}`;
     if (injectionState) {
       injectionState.inFlight = false;
       injectionState.lastInjectedAt = Date.now();
+      injectionState.consecutiveFailures = 0;
     }
   } catch (error) {
     log(`[${HOOK_NAME}] Injection failed`, { sessionID, error: String(error) });
     if (injectionState) {
       injectionState.inFlight = false;
+      injectionState.lastInjectedAt = Date.now();
+      injectionState.consecutiveFailures = (injectionState.consecutiveFailures ?? 0) + 1;
     }
   }
 }
@@ -12577,8 +12592,28 @@ async function handleSessionIdle(args) {
     log(`[${HOOK_NAME}] Skipped: injection in flight`, { sessionID });
     return;
   }
-  if (state.lastInjectedAt && Date.now() - state.lastInjectedAt < CONTINUATION_COOLDOWN_MS) {
-    log(`[${HOOK_NAME}] Skipped: cooldown active`, { sessionID });
+  if (state.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES && state.lastInjectedAt && Date.now() - state.lastInjectedAt >= FAILURE_RESET_WINDOW_MS) {
+    state.consecutiveFailures = 0;
+    log(`[${HOOK_NAME}] Reset consecutive failures after recovery window`, {
+      sessionID,
+      failureResetWindowMs: FAILURE_RESET_WINDOW_MS
+    });
+  }
+  if (state.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
+    log(`[${HOOK_NAME}] Skipped: max consecutive failures reached`, {
+      sessionID,
+      consecutiveFailures: state.consecutiveFailures,
+      maxConsecutiveFailures: MAX_CONSECUTIVE_FAILURES
+    });
+    return;
+  }
+  const effectiveCooldown = CONTINUATION_COOLDOWN_MS * Math.pow(2, Math.min(state.consecutiveFailures, 5));
+  if (state.lastInjectedAt && Date.now() - state.lastInjectedAt < effectiveCooldown) {
+    log(`[${HOOK_NAME}] Skipped: cooldown active`, {
+      sessionID,
+      effectiveCooldown,
+      consecutiveFailures: state.consecutiveFailures
+    });
     return;
   }
   let resolvedInfo;
@@ -12767,7 +12802,9 @@ function createSessionStateStore() {
       existing.lastAccessedAt = Date.now();
       return existing.state;
     }
-    const state = {};
+    const state = {
+      consecutiveFailures: 0
+    };
     sessions.set(sessionID, { state, lastAccessedAt: Date.now() });
     return state;
   }
@@ -28285,11 +28322,17 @@ function createRecoveryState() {
 function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
   const autoCompactState = createRecoveryState();
   const experimental = options?.experimental;
+  const pendingCompactionTimeoutBySession = new Map;
   const eventHandler = async ({ event }) => {
     const props = event.properties;
     if (event.type === "session.deleted") {
       const sessionInfo = props?.info;
       if (sessionInfo?.id) {
+        const timeoutID = pendingCompactionTimeoutBySession.get(sessionInfo.id);
+        if (timeoutID !== undefined) {
+          clearTimeout(timeoutID);
+          pendingCompactionTimeoutBySession.delete(sessionInfo.id);
+        }
         autoCompactState.pendingCompact.delete(sessionInfo.id);
         autoCompactState.errorDataBySession.delete(sessionInfo.id);
         autoCompactState.retryStateBySession.delete(sessionInfo.id);
@@ -28324,9 +28367,11 @@ function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
             duration: 3000
           }
         }).catch(() => {});
-        setTimeout(() => {
+        const timeoutID = setTimeout(() => {
+          pendingCompactionTimeoutBySession.delete(sessionID);
           executeCompact(sessionID, { providerID, modelID }, autoCompactState, ctx.client, ctx.directory, experimental);
         }, 300);
+        pendingCompactionTimeoutBySession.set(sessionID, timeoutID);
       }
       return;
     }
@@ -28352,6 +28397,11 @@ function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
         return;
       if (!autoCompactState.pendingCompact.has(sessionID))
         return;
+      const timeoutID = pendingCompactionTimeoutBySession.get(sessionID);
+      if (timeoutID !== undefined) {
+        clearTimeout(timeoutID);
+        pendingCompactionTimeoutBySession.delete(sessionID);
+      }
       const errorData = autoCompactState.errorDataBySession.get(sessionID);
       const lastAssistant = await getLastAssistant(sessionID, ctx.client, ctx.directory);
       if (lastAssistant?.summary === true) {
@@ -33130,9 +33180,10 @@ var AGENT_MODEL_REQUIREMENTS = {
 var CATEGORY_MODEL_REQUIREMENTS = {
   "visual-engineering": {
     fallbackChain: [
-      { providers: ["google", "github-copilot", "opencode"], model: "gemini-3-pro" },
+      { providers: ["google", "github-copilot", "opencode"], model: "gemini-3-pro", variant: "high" },
+      { providers: ["zai-coding-plan"], model: "glm-5" },
       { providers: ["anthropic", "github-copilot", "opencode"], model: "claude-opus-4-6", variant: "max" },
-      { providers: ["zai-coding-plan"], model: "glm-4.7" }
+      { providers: ["kimi-for-coding"], model: "k2p5" }
     ]
   },
   ultrabrain: {
@@ -33181,10 +33232,9 @@ var CATEGORY_MODEL_REQUIREMENTS = {
   },
   writing: {
     fallbackChain: [
+      { providers: ["kimi-for-coding"], model: "k2p5" },
       { providers: ["google", "github-copilot", "opencode"], model: "gemini-3-flash" },
-      { providers: ["anthropic", "github-copilot", "opencode"], model: "claude-sonnet-4-5" },
-      { providers: ["zai-coding-plan"], model: "glm-4.7" },
-      { providers: ["openai", "github-copilot", "opencode"], model: "gpt-5.2" }
+      { providers: ["anthropic", "github-copilot", "opencode"], model: "claude-sonnet-4-5" }
     ]
   }
 };
@@ -44976,6 +45026,7 @@ function createUnstableAgentBabysitterHook(ctx, options) {
   };
 }
 // src/hooks/preemptive-compaction.ts
+init_logger();
 var DEFAULT_ACTUAL_LIMIT = 200000;
 var ANTHROPIC_ACTUAL_LIMIT3 = process.env.ANTHROPIC_1M_CONTEXT === "true" || process.env.VERTEX_ANTHROPIC_1M_CONTEXT === "true" ? 1e6 : DEFAULT_ACTUAL_LIMIT;
 var PREEMPTIVE_COMPACTION_THRESHOLD = 0.78;
@@ -45007,7 +45058,9 @@ function createPreemptiveCompactionHook(ctx) {
         query: { directory: ctx.directory }
       });
       compactedSessions.add(sessionID);
-    } catch {} finally {
+    } catch (error45) {
+      log("[preemptive-compaction] Compaction failed", { sessionID, error: String(error45) });
+    } finally {
       compactionInProgress.delete(sessionID);
     }
   };
@@ -51135,6 +51188,7 @@ Task ID: ${task.id}`;
     const pollStart = Date.now();
     let lastMsgCount = 0;
     let stablePolls = 0;
+    let terminalStatus;
     while (Date.now() - pollStart < timingCfg.MAX_POLL_TIME_MS) {
       if (ctx.abort?.aborted) {
         return `Task aborted (was running in background mode).
@@ -51142,6 +51196,11 @@ Task ID: ${task.id}`;
 Session ID: ${sessionID}`;
       }
       await new Promise((resolve10) => setTimeout(resolve10, timingCfg.POLL_INTERVAL_MS));
+      const currentTask = manager.getTask(task.id);
+      if (currentTask && (currentTask.status === "interrupt" || currentTask.status === "error" || currentTask.status === "cancelled")) {
+        terminalStatus = { status: currentTask.status, error: currentTask.error };
+        break;
+      }
       const statusResult = await client2.session.status();
       const allStatuses = statusResult.data ?? {};
       const sessionStatus = allStatuses[sessionID];
@@ -51164,6 +51223,23 @@ Session ID: ${sessionID}`;
         lastMsgCount = currentMsgCount;
       }
     }
+    if (terminalStatus) {
+      const duration4 = formatDuration2(startTime);
+      return `SUPERVISED TASK FAILED (${terminalStatus.status})
+Task was interrupted/failed while running in monitored background mode.
+${terminalStatus.error ? `Error: ${terminalStatus.error}` : ""}
+Duration: ${duration4}
+Agent: ${agentToUse}${args.category ? ` (category: ${args.category})` : ""}
+Model: ${actualModel}
+The task session may contain partial results.
+<task_metadata>
+session_id: ${sessionID}
+</task_metadata>`;
+    }
     const messagesResult = await client2.session.messages({ path: { id: sessionID } });
     const messages = messagesResult.data ?? messagesResult;
     const assistantMessages = messages.filter((m) => m.info?.role === "assistant").sort((a, b) => (b.info?.time?.created ?? 0) - (a.info?.time?.created ?? 0));
@@ -53497,6 +53573,11 @@ class BackgroundManager {
       });
       return existingTask;
     }
+    const completionTimer = this.completionTimers.get(existingTask.id);
+    if (completionTimer) {
+      clearTimeout(completionTimer);
+      this.completionTimers.delete(existingTask.id);
+    }
     const concurrencyKey = existingTask.concurrencyGroup ?? existingTask.agent;
     await this.concurrencyManager.acquire(concurrencyKey);
     existingTask.concurrencyKey = concurrencyKey;
@@ -53600,7 +53681,7 @@ class BackgroundManager {
   }
   handleEvent(event) {
     const props = event.properties;
-    if (event.type === "message.part.updated") {
+    if (event.type === "message.part.updated" || event.type === "message.part.delta") {
       if (!props || typeof props !== "object" || !("sessionID" in props))
         return;
       const partInfo = props;
@@ -53704,6 +53785,10 @@ class BackgroundManager {
       this.cleanupPendingByParent(task);
       this.tasks.delete(task.id);
       this.clearNotificationsForTask(task.id);
+      const toastManager = getTaskToastManager();
+      if (toastManager) {
+        toastManager.removeTask(task.id);
+      }
       if (task.sessionID) {
         subagentSessions.delete(task.sessionID);
       }
@@ -53746,6 +53831,10 @@ class BackgroundManager {
         this.cleanupPendingByParent(task);
         this.tasks.delete(task.id);
         this.clearNotificationsForTask(task.id);
+        const toastManager = getTaskToastManager();
+        if (toastManager) {
+          toastManager.removeTask(task.id);
+        }
         if (task.sessionID) {
           subagentSessions.delete(task.sessionID);
         }
@@ -53860,6 +53949,10 @@ class BackgroundManager {
       }).catch(() => {});
     }
     if (options?.skipNotification) {
+      const toastManager = getTaskToastManager();
+      if (toastManager) {
+        toastManager.removeTask(task.id);
+      }
       log(`[background-agent] Task cancelled via ${source} (notification skipped):`, task.id);
       return true;
     }
@@ -54037,11 +54130,10 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
       }
     } catch (error45) {
       if (this.isAbortedSessionError(error45)) {
-        log("[background-agent] Parent session aborted, skipping notification:", {
+        log("[background-agent] Parent session aborted while loading messages; using messageDir fallback:", {
           taskId: task.id,
           parentSessionID: task.parentSessionID
         });
-        return;
       }
       const messageDir = getMessageDir12(task.parentSessionID);
       const currentMessage = messageDir ? findNearestMessageWithFields(messageDir) : null;
@@ -54071,13 +54163,13 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
       });
     } catch (error45) {
       if (this.isAbortedSessionError(error45)) {
-        log("[background-agent] Parent session aborted, skipping notification:", {
+        log("[background-agent] Parent session aborted while sending notification; continuing cleanup:", {
           taskId: task.id,
           parentSessionID: task.parentSessionID
         });
-        return;
+      } else {
+        log("[background-agent] Failed to send notification:", error45);
       }
-      log("[background-agent] Failed to send notification:", error45);
     }
     if (allComplete) {
       for (const completedTask of completedTasks) {
@@ -54190,6 +54282,10 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
           }
         }
         this.clearNotificationsForTask(taskId);
+        const toastManager = getTaskToastManager();
+        if (toastManager) {
+          toastManager.removeTask(taskId);
+        }
         this.tasks.delete(taskId);
         if (task.sessionID) {
           subagentSessions.delete(task.sessionID);
@@ -54225,7 +54321,8 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
       const sessionID = task.sessionID;
       if (!startedAt || !sessionID)
         continue;
-      const sessionIsRunning = allStatuses[sessionID]?.type === "running";
+      const sessionStatus = allStatuses[sessionID]?.type;
+      const sessionIsRunning = sessionStatus !== undefined && sessionStatus !== "idle";
       const runtime = now - startedAt.getTime();
       if (!task.progress?.lastUpdate) {
         if (sessionIsRunning)
@@ -64430,7 +64527,21 @@ Your ONLY valid output locations are \`.sisyphus/plans/*.md\` and \`.sisyphus/dr
 Example: \`.sisyphus/plans/auth-refactor.md\`
-### 5. SINGLE PLAN MANDATE (CRITICAL)
+### 5. MAXIMUM PARALLELISM PRINCIPLE (NON-NEGOTIABLE)
+Your plans MUST maximize parallel execution. This is a core planning quality metric.
+**Granularity Rule**: One task = one module/concern = 1-3 files.
+If a task touches 4+ files or 2+ unrelated concerns, SPLIT IT.
+**Parallelism Target**: Aim for 5-8 tasks per wave.
+If any wave has fewer than 3 tasks (except the final integration), you under-split.
+**Dependency Minimization**: Structure tasks so shared dependencies
+(types, interfaces, configs) are extracted as early Wave-1 tasks,
+unblocking maximum parallelism in subsequent waves.
+### 6. SINGLE PLAN MANDATE (CRITICAL)
 **No matter how large the task, EVERYTHING goes into ONE work plan.**
 **NEVER:**
@@ -64453,7 +64564,7 @@ Example: \`.sisyphus/plans/auth-refactor.md\`
 **The plan can have 50+ TODOs. That's OK. ONE PLAN.**
-### 5.1 SINGLE ATOMIC WRITE (CRITICAL - Prevents Content Loss)
+### 6.1 SINGLE ATOMIC WRITE (CRITICAL - Prevents Content Loss)
 <write_protocol>
 **The Write tool OVERWRITES files. It does NOT append.**
@@ -64496,7 +64607,7 @@ Example: \`.sisyphus/plans/auth-refactor.md\`
 - [ ] File already exists with my content? \u2192 Use Edit to append, NOT Write
 </write_protocol>
-### 6. DRAFT AS WORKING MEMORY (MANDATORY)
+### 7. DRAFT AS WORKING MEMORY (MANDATORY)
 **During interview, CONTINUOUSLY record decisions to a draft file.**
 **Draft Location**: \`.sisyphus/drafts/{name}.md\`
@@ -65303,108 +65414,25 @@ Generate plan to: \`.sisyphus/plans/{name}.md\`
 ## Verification Strategy (MANDATORY)
-> **UNIVERSAL RULE: ZERO HUMAN INTERVENTION**
->
-> ALL tasks in this plan MUST be verifiable WITHOUT any human action.
-> This is NOT conditional \u2014 it applies to EVERY task, regardless of test strategy.
->
-> **FORBIDDEN** \u2014 acceptance criteria that require:
-> - "User manually tests..." / "\uC0AC\uC6A9\uC790\uAC00 \uC9C1\uC811 \uD14C\uC2A4\uD2B8..."
-> - "User visually confirms..." / "\uC0AC\uC6A9\uC790\uAC00 \uB208\uC73C\uB85C \uD655\uC778..."
-> - "User interacts with..." / "\uC0AC\uC6A9\uC790\uAC00 \uC9C1\uC811 \uC870\uC791..."
-> - "Ask user to verify..." / "\uC0AC\uC6A9\uC790\uC5D0\uAC8C \uD655\uC778 \uC694\uCCAD..."
-> - ANY step where a human must perform an action
->
-> **ALL verification is executed by the agent** using tools (Playwright, interactive_bash, curl, etc.). No exceptions.
+> **ZERO HUMAN INTERVENTION** \u2014 ALL verification is agent-executed. No exceptions.
+> Acceptance criteria requiring "user manually tests/confirms" are FORBIDDEN.
 ### Test Decision
 - **Infrastructure exists**: [YES/NO]
 - **Automated tests**: [TDD / Tests-after / None]
 - **Framework**: [bun test / vitest / jest / pytest / none]
+- **If TDD**: Each task follows RED (failing test) \u2192 GREEN (minimal impl) \u2192 REFACTOR
-### If TDD Enabled
-Each TODO follows RED-GREEN-REFACTOR:
-**Task Structure:**
-1. **RED**: Write failing test first
-   - Test file: \`[path].test.ts\`
-   - Test command: \`bun test [file]\`
-   - Expected: FAIL (test exists, implementation doesn't)
-2. **GREEN**: Implement minimum code to pass
-   - Command: \`bun test [file]\`
-   - Expected: PASS
-3. **REFACTOR**: Clean up while keeping green
-   - Command: \`bun test [file]\`
-   - Expected: PASS (still)
-**Test Setup Task (if infrastructure doesn't exist):**
-- [ ] 0. Setup Test Infrastructure
-  - Install: \`bun add -d [test-framework]\`
-  - Config: Create \`[config-file]\`
-  - Verify: \`bun test --help\` \u2192 shows help
-  - Example: Create \`src/__tests__/example.test.ts\`
-  - Verify: \`bun test\` \u2192 1 test passes
-### Agent-Executed QA Scenarios (MANDATORY \u2014 ALL tasks)
-> Whether TDD is enabled or not, EVERY task MUST include Agent-Executed QA Scenarios.
-> - **With TDD**: QA scenarios complement unit tests at integration/E2E level
-> - **Without TDD**: QA scenarios are the PRIMARY verification method
->
-> These describe how the executing agent DIRECTLY verifies the deliverable
-> by running it \u2014 opening browsers, executing commands, sending API requests.
-> The agent performs what a human tester would do, but automated via tools.
-**Verification Tool by Deliverable Type:**
-| Type | Tool | How Agent Verifies |
-|------|------|-------------------|
-| **Frontend/UI** | Playwright (playwright skill) | Navigate, interact, assert DOM, screenshot |
-| **TUI/CLI** | interactive_bash (tmux) | Run command, send keystrokes, validate output |
-| **API/Backend** | Bash (curl/httpie) | Send requests, parse responses, assert fields |
-| **Library/Module** | Bash (bun/node REPL) | Import, call functions, compare output |
-| **Config/Infra** | Bash (shell commands) | Apply config, run state checks, validate |
-**Each Scenario MUST Follow This Format:**
-\`\`\`
-Scenario: [Descriptive name \u2014 what user action/flow is being verified]
-  Tool: [Playwright / interactive_bash / Bash]
-  Preconditions: [What must be true before this scenario runs]
-  Steps:
-    1. [Exact action with specific selector/command/endpoint]
-    2. [Next action with expected intermediate state]
-    3. [Assertion with exact expected value]
-  Expected Result: [Concrete, observable outcome]
-  Failure Indicators: [What would indicate failure]
-  Evidence: [Screenshot path / output capture / response body path]
-\`\`\`
+### QA Policy
+Every task MUST include agent-executed QA scenarios (see TODO template below).
+Evidence saved to \`.sisyphus/evidence/task-{N}-{scenario-slug}.{ext}\`.
-**Scenario Detail Requirements:**
-- **Selectors**: Specific CSS selectors (\`.login-button\`, not "the login button")
-- **Data**: Concrete test data (\`"test@example.com"\`, not \`"[email]"\`)
-- **Assertions**: Exact values (\`text contains "Welcome back"\`, not "verify it works")
-- **Timing**: Include wait conditions where relevant (\`Wait for .dashboard (timeout: 10s)\`)
-- **Negative Scenarios**: At least ONE failure/error scenario per feature
-- **Evidence Paths**: Specific file paths (\`.sisyphus/evidence/task-N-scenario-name.png\`)
-**Anti-patterns (NEVER write scenarios like this):**
-- \u274C "Verify the login page works correctly"
-- \u274C "Check that the API returns the right data"
-- \u274C "Test the form validation"
-- \u274C "User opens browser and confirms..."
-**Write scenarios like this instead:**
-- \u2705 \`Navigate to /login \u2192 Fill input[name="email"] with "test@example.com" \u2192 Fill input[name="password"] with "Pass123!" \u2192 Click button[type="submit"] \u2192 Wait for /dashboard \u2192 Assert h1 contains "Welcome"\`
-- \u2705 \`POST /api/users {"name":"Test","email":"new@test.com"} \u2192 Assert status 201 \u2192 Assert response.id is UUID \u2192 GET /api/users/{id} \u2192 Assert name equals "Test"\`
-- \u2705 \`Run ./cli --config test.yaml \u2192 Wait for "Loaded" in stdout \u2192 Send "q" \u2192 Assert exit code 0 \u2192 Assert stdout contains "Goodbye"\`
-**Evidence Requirements:**
-- Screenshots: \`.sisyphus/evidence/\` for all UI verifications
-- Terminal output: Captured for CLI/TUI verifications
-- Response bodies: Saved for API verifications
-- All evidence referenced by specific file path in acceptance criteria
+| Deliverable Type | Verification Tool | Method |
+|------------------|-------------------|--------|
+| Frontend/UI | Playwright (playwright skill) | Navigate, interact, assert DOM, screenshot |
+| TUI/CLI | interactive_bash (tmux) | Run command, send keystrokes, validate output |
+| API/Backend | Bash (curl) | Send requests, assert status + response fields |
+| Library/Module | Bash (bun/node REPL) | Import, call functions, compare output |
 ---
@@ -65414,49 +65442,82 @@ Scenario: [Descriptive name \u2014 what user action/flow is being verified]
 > Maximize throughput by grouping independent tasks into parallel waves.
 > Each wave completes before the next begins.
+> Target: 5-8 tasks per wave. Fewer than 3 per wave (except final) = under-splitting.
 \`\`\`
-Wave 1 (Start Immediately):
-\u251C\u2500\u2500 Task 1: [no dependencies]
-\u2514\u2500\u2500 Task 5: [no dependencies]
-Wave 2 (After Wave 1):
-\u251C\u2500\u2500 Task 2: [depends: 1]
-\u251C\u2500\u2500 Task 3: [depends: 1]
-\u2514\u2500\u2500 Task 6: [depends: 5]
-Wave 3 (After Wave 2):
-\u2514\u2500\u2500 Task 4: [depends: 2, 3]
-Critical Path: Task 1 \u2192 Task 2 \u2192 Task 4
-Parallel Speedup: ~40% faster than sequential
+Wave 1 (Start Immediately \u2014 foundation + scaffolding):
+\u251C\u2500\u2500 Task 1: Project scaffolding + config [quick]
+\u251C\u2500\u2500 Task 2: Design system tokens [quick]
+\u251C\u2500\u2500 Task 3: Type definitions [quick]
+\u251C\u2500\u2500 Task 4: Schema definitions [quick]
+\u251C\u2500\u2500 Task 5: Storage interface + in-memory impl [quick]
+\u251C\u2500\u2500 Task 6: Auth middleware [quick]
+\u2514\u2500\u2500 Task 7: Client module [quick]
+Wave 2 (After Wave 1 \u2014 core modules, MAX PARALLEL):
+\u251C\u2500\u2500 Task 8: Core business logic (depends: 3, 5, 7) [deep]
+\u251C\u2500\u2500 Task 9: API endpoints (depends: 4, 5) [unspecified-high]
+\u251C\u2500\u2500 Task 10: Secondary storage impl (depends: 5) [unspecified-high]
+\u251C\u2500\u2500 Task 11: Retry/fallback logic (depends: 8) [deep]
+\u251C\u2500\u2500 Task 12: UI layout + navigation (depends: 2) [visual-engineering]
+\u251C\u2500\u2500 Task 13: API client + hooks (depends: 4) [quick]
+\u2514\u2500\u2500 Task 14: Telemetry middleware (depends: 5, 10) [unspecified-high]
+Wave 3 (After Wave 2 \u2014 integration + UI):
+\u251C\u2500\u2500 Task 15: Main route combining modules (depends: 6, 11, 14) [deep]
+\u251C\u2500\u2500 Task 16: UI data visualization (depends: 12, 13) [visual-engineering]
+\u251C\u2500\u2500 Task 17: Deployment config A (depends: 15) [quick]
+\u251C\u2500\u2500 Task 18: Deployment config B (depends: 15) [quick]
+\u251C\u2500\u2500 Task 19: Deployment config C (depends: 15) [quick]
+\u2514\u2500\u2500 Task 20: UI request log + build (depends: 16) [visual-engineering]
+Wave 4 (After Wave 3 \u2014 verification):
+\u251C\u2500\u2500 Task 21: Integration tests (depends: 15) [deep]
+\u251C\u2500\u2500 Task 22: UI QA - Playwright (depends: 20) [unspecified-high]
+\u251C\u2500\u2500 Task 23: E2E QA (depends: 21) [deep]
+\u2514\u2500\u2500 Task 24: Git cleanup + tagging (depends: 21) [git]
+Wave FINAL (After ALL tasks \u2014 independent review, 4 parallel):
+\u251C\u2500\u2500 Task F1: Plan compliance audit (oracle)
+\u251C\u2500\u2500 Task F2: Code quality review (unspecified-high)
+\u251C\u2500\u2500 Task F3: Real manual QA (unspecified-high)
+\u2514\u2500\u2500 Task F4: Scope fidelity check (deep)
+Critical Path: Task 1 \u2192 Task 5 \u2192 Task 8 \u2192 Task 11 \u2192 Task 15 \u2192 Task 21 \u2192 F1-F4
+Parallel Speedup: ~70% faster than sequential
+Max Concurrent: 7 (Waves 1 & 2)
 \`\`\`
-### Dependency Matrix
+### Dependency Matrix (abbreviated \u2014 show ALL tasks in your generated plan)
-| Task | Depends On | Blocks | Can Parallelize With |
-|------|------------|--------|---------------------|
-| 1 | None | 2, 3 | 5 |
-| 2 | 1 | 4 | 3, 6 |
-| 3 | 1 | 4 | 2, 6 |
-| 4 | 2, 3 | None | None (final) |
-| 5 | None | 6 | 1 |
-| 6 | 5 | None | 2, 3 |
+| Task | Depends On | Blocks | Wave |
+|------|------------|--------|------|
+| 1-7 | \u2014 | 8-14 | 1 |
+| 8 | 3, 5, 7 | 11, 15 | 2 |
+| 11 | 8 | 15 | 2 |
+| 14 | 5, 10 | 15 | 2 |
+| 15 | 6, 11, 14 | 17-19, 21 | 3 |
+| 21 | 15 | 23, 24 | 4 |
+> This is abbreviated for reference. YOUR generated plan must include the FULL matrix for ALL tasks.
 ### Agent Dispatch Summary
-| Wave | Tasks | Recommended Agents |
-|------|-------|-------------------|
-| 1 | 1, 5 | task(category="...", load_skills=[...], run_in_background=false) |
-| 2 | 2, 3, 6 | dispatch parallel after Wave 1 completes |
-| 3 | 4 | final integration task |
+| Wave | # Parallel | Tasks \u2192 Agent Category |
+|------|------------|----------------------|
+| 1 | **7** | T1-T4 \u2192 \`quick\`, T5 \u2192 \`quick\`, T6 \u2192 \`quick\`, T7 \u2192 \`quick\` |
+| 2 | **7** | T8 \u2192 \`deep\`, T9 \u2192 \`unspecified-high\`, T10 \u2192 \`unspecified-high\`, T11 \u2192 \`deep\`, T12 \u2192 \`visual-engineering\`, T13 \u2192 \`quick\`, T14 \u2192 \`unspecified-high\` |
+| 3 | **6** | T15 \u2192 \`deep\`, T16 \u2192 \`visual-engineering\`, T17-T19 \u2192 \`quick\`, T20 \u2192 \`visual-engineering\` |
+| 4 | **4** | T21 \u2192 \`deep\`, T22 \u2192 \`unspecified-high\`, T23 \u2192 \`deep\`, T24 \u2192 \`git\` |
+| FINAL | **4** | F1 \u2192 \`oracle\`, F2 \u2192 \`unspecified-high\`, F3 \u2192 \`unspecified-high\`, F4 \u2192 \`deep\` |
 ---
 ## TODOs
 > Implementation + Test = ONE Task. Never separate.
-> EVERY task MUST have: Recommended Agent Profile + Parallelization info.
+> EVERY task MUST have: Recommended Agent Profile + Parallelization info + QA Scenarios.
+> **A task WITHOUT QA Scenarios is INCOMPLETE. No exceptions.**
 - [ ] 1. [Task Title]
@@ -65490,22 +65551,15 @@ Parallel Speedup: ~40% faster than sequential
   **Pattern References** (existing code to follow):
   - \`src/services/auth.ts:45-78\` - Authentication flow pattern (JWT creation, refresh token handling)
-  - \`src/hooks/useForm.ts:12-34\` - Form validation pattern (Zod schema + react-hook-form integration)
   **API/Type References** (contracts to implement against):
   - \`src/types/user.ts:UserDTO\` - Response shape for user endpoints
-  - \`src/api/schema.ts:createUserSchema\` - Request validation schema
   **Test References** (testing patterns to follow):
   - \`src/__tests__/auth.test.ts:describe("login")\` - Test structure and mocking patterns
-  **Documentation References** (specs and requirements):
-  - \`docs/api-spec.md#authentication\` - API contract details
-  - \`ARCHITECTURE.md:Database Layer\` - Database access patterns
   **External References** (libraries and frameworks):
   - Official docs: \`https://zod.dev/?id=basic-usage\` - Zod validation syntax
-  - Example repo: \`github.com/example/project/src/auth\` - Reference implementation
   **WHY Each Reference Matters** (explain the relevance):
   - Don't just list files - explain what pattern/information the executor should extract
@@ -65516,113 +65570,60 @@ Parallel Speedup: ~40% faster than sequential
   > **AGENT-EXECUTABLE VERIFICATION ONLY** \u2014 No human action permitted.
   > Every criterion MUST be verifiable by running a command or using a tool.
-  > REPLACE all placeholders with actual values from task context.
   **If TDD (tests enabled):**
   - [ ] Test file created: src/auth/login.test.ts
-  - [ ] Test covers: successful login returns JWT token
   - [ ] bun test src/auth/login.test.ts \u2192 PASS (3 tests, 0 failures)
-  **Agent-Executed QA Scenarios (MANDATORY \u2014 per-scenario, ultra-detailed):**
-  > Write MULTIPLE named scenarios per task: happy path AND failure cases.
-  > Each scenario = exact tool + steps with real selectors/data + evidence path.
-  **Example \u2014 Frontend/UI (Playwright):**
+  **QA Scenarios (MANDATORY \u2014 task is INCOMPLETE without these):**
-  \\\`\\\`\\\`
-  Scenario: Successful login redirects to dashboard
-    Tool: Playwright (playwright skill)
-    Preconditions: Dev server running on localhost:3000, test user exists
-    Steps:
-      1. Navigate to: http://localhost:3000/login
-      2. Wait for: input[name="email"] visible (timeout: 5s)
-      3. Fill: input[name="email"] \u2192 "test@example.com"
-      4. Fill: input[name="password"] \u2192 "ValidPass123!"
-      5. Click: button[type="submit"]
-      6. Wait for: navigation to /dashboard (timeout: 10s)
-      7. Assert: h1 text contains "Welcome back"
-      8. Assert: cookie "session_token" exists
-      9. Screenshot: .sisyphus/evidence/task-1-login-success.png
-    Expected Result: Dashboard loads with welcome message
-    Evidence: .sisyphus/evidence/task-1-login-success.png
-  Scenario: Login fails with invalid credentials
-    Tool: Playwright (playwright skill)
-    Preconditions: Dev server running, no valid user with these credentials
-    Steps:
-      1. Navigate to: http://localhost:3000/login
-      2. Fill: input[name="email"] \u2192 "wrong@example.com"
-      3. Fill: input[name="password"] \u2192 "WrongPass"
-      4. Click: button[type="submit"]
-      5. Wait for: .error-message visible (timeout: 5s)
-      6. Assert: .error-message text contains "Invalid credentials"
-      7. Assert: URL is still /login (no redirect)
-      8. Screenshot: .sisyphus/evidence/task-1-login-failure.png
-    Expected Result: Error message shown, stays on login page
-    Evidence: .sisyphus/evidence/task-1-login-failure.png
-  \\\`\\\`\\\`
-  **Example \u2014 API/Backend (curl):**
+  > **This is NOT optional. A task without QA scenarios WILL BE REJECTED.**
+  >
+  > Write scenario tests that verify the ACTUAL BEHAVIOR of what you built.
+  > Minimum: 1 happy path + 1 failure/edge case per task.
+  > Each scenario = exact tool + exact steps + exact assertions + evidence path.
+  >
+  > **The executing agent MUST run these scenarios after implementation.**
+  > **The orchestrator WILL verify evidence files exist before marking task complete.**
   \\\`\\\`\\\`
-  Scenario: Create user returns 201 with UUID
-    Tool: Bash (curl)
-    Preconditions: Server running on localhost:8080
+  Scenario: [Happy path \u2014 what SHOULD work]
+    Tool: [Playwright / interactive_bash / Bash (curl)]
+    Preconditions: [Exact setup state]
     Steps:
-      1. curl -s -w "\\n%{http_code}" -X POST http://localhost:8080/api/users \\
-           -H "Content-Type: application/json" \\
-           -d '{"email":"new@test.com","name":"Test User"}'
-      2. Assert: HTTP status is 201
-      3. Assert: response.id matches UUID format
-      4. GET /api/users/{returned-id} \u2192 Assert name equals "Test User"
-    Expected Result: User created and retrievable
-    Evidence: Response bodies captured
-  Scenario: Duplicate email returns 409
-    Tool: Bash (curl)
-    Preconditions: User with email "new@test.com" already exists
+      1. [Exact action \u2014 specific command/selector/endpoint, no vagueness]
+      2. [Next action \u2014 with expected intermediate state]
+      3. [Assertion \u2014 exact expected value, not "verify it works"]
+    Expected Result: [Concrete, observable, binary pass/fail]
+    Failure Indicators: [What specifically would mean this failed]
+    Evidence: .sisyphus/evidence/task-{N}-{scenario-slug}.{ext}
+  Scenario: [Failure/edge case \u2014 what SHOULD fail gracefully]
+    Tool: [same format]
+    Preconditions: [Invalid input / missing dependency / error state]
     Steps:
-      1. Repeat POST with same email
-      2. Assert: HTTP status is 409
-      3. Assert: response.error contains "already exists"
-    Expected Result: Conflict error returned
-    Evidence: Response body captured
+      1. [Trigger the error condition]
+      2. [Assert error is handled correctly]
+    Expected Result: [Graceful failure with correct error message/code]
+    Evidence: .sisyphus/evidence/task-{N}-{scenario-slug}-error.{ext}
   \\\`\\\`\\\`
-  **Example \u2014 TUI/CLI (interactive_bash):**
-  \\\`\\\`\\\`
-  Scenario: CLI loads config and displays menu
-    Tool: interactive_bash (tmux)
-    Preconditions: Binary built, test config at ./test.yaml
-    Steps:
-      1. tmux new-session: ./my-cli --config test.yaml
-      2. Wait for: "Configuration loaded" in output (timeout: 5s)
-      3. Assert: Menu items visible ("1. Create", "2. List", "3. Exit")
-      4. Send keys: "3" then Enter
-      5. Assert: "Goodbye" in output
-      6. Assert: Process exited with code 0
-    Expected Result: CLI starts, shows menu, exits cleanly
-    Evidence: Terminal output captured
-  Scenario: CLI handles missing config gracefully
-    Tool: interactive_bash (tmux)
-    Preconditions: No config file at ./nonexistent.yaml
-    Steps:
-      1. tmux new-session: ./my-cli --config nonexistent.yaml
-      2. Wait for: output (timeout: 3s)
-      3. Assert: stderr contains "Config file not found"
-      4. Assert: Process exited with code 1
-    Expected Result: Meaningful error, non-zero exit
-    Evidence: Error output captured
-  \\\`\\\`\\\`
+  > **Specificity requirements \u2014 every scenario MUST use:**
+  > - **Selectors**: Specific CSS selectors (\`.login-button\`, not "the login button")
+  > - **Data**: Concrete test data (\`"test@example.com"\`, not \`"[email]"\`)
+  > - **Assertions**: Exact values (\`text contains "Welcome back"\`, not "verify it works")
+  > - **Timing**: Wait conditions where relevant (\`timeout: 10s\`)
+  > - **Negative**: At least ONE failure/error scenario per task
+  >
+  > **Anti-patterns (your scenario is INVALID if it looks like this):**
+  > - \u274C "Verify it works correctly" \u2014 HOW? What does "correctly" mean?
+  > - \u274C "Check the API returns data" \u2014 WHAT data? What fields? What values?
+  > - \u274C "Test the component renders" \u2014 WHERE? What selector? What content?
+  > - \u274C Any scenario without an evidence path
   **Evidence to Capture:**
-  - [ ] Screenshots in .sisyphus/evidence/ for UI scenarios
-  - [ ] Terminal output for CLI/TUI scenarios
-  - [ ] Response bodies for API scenarios
   - [ ] Each evidence file named: task-{N}-{scenario-slug}.{ext}
+  - [ ] Screenshots for UI, terminal output for CLI, response bodies for API
   **Commit**: YES | NO (groups with N)
   - Message: \`type(scope): desc\`
@@ -65631,6 +65632,28 @@ Parallel Speedup: ~40% faster than sequential
 ---
+## Final Verification Wave (MANDATORY \u2014 after ALL implementation tasks)
+> 4 review agents run in PARALLEL. ALL must APPROVE. Rejection \u2192 fix \u2192 re-run.
+- [ ] F1. **Plan Compliance Audit** \u2014 \`oracle\`
+  Read the plan end-to-end. For each "Must Have": verify implementation exists (read file, curl endpoint, run command). For each "Must NOT Have": search codebase for forbidden patterns \u2014 reject with file:line if found. Check evidence files exist in .sisyphus/evidence/. Compare deliverables against plan.
+  Output: \`Must Have [N/N] | Must NOT Have [N/N] | Tasks [N/N] | VERDICT: APPROVE/REJECT\`
+- [ ] F2. **Code Quality Review** \u2014 \`unspecified-high\`
+  Run \`tsc --noEmit\` + linter + \`bun test\`. Review all changed files for: \`as any\`/\`@ts-ignore\`, empty catches, console.log in prod, commented-out code, unused imports. Check AI slop: excessive comments, over-abstraction, generic names (data/result/item/temp).
+  Output: \`Build [PASS/FAIL] | Lint [PASS/FAIL] | Tests [N pass/N fail] | Files [N clean/N issues] | VERDICT\`
+- [ ] F3. **Real Manual QA** \u2014 \`unspecified-high\` (+ \`playwright\` skill if UI)
+  Start from clean state. Execute EVERY QA scenario from EVERY task \u2014 follow exact steps, capture evidence. Test cross-task integration (features working together, not in isolation). Test edge cases: empty state, invalid input, rapid actions. Save to \`.sisyphus/evidence/final-qa/\`.
+  Output: \`Scenarios [N/N pass] | Integration [N/N] | Edge Cases [N tested] | VERDICT\`
+- [ ] F4. **Scope Fidelity Check** \u2014 \`deep\`
+  For each task: read "What to do", read actual diff (git log/diff). Verify 1:1 \u2014 everything in spec was built (nothing missing), nothing beyond spec was built (no creep). Check "Must NOT do" compliance. Detect cross-task contamination: Task N touching Task M's files. Flag unaccounted changes.
+  Output: \`Tasks [N/N compliant] | Contamination [CLEAN/N issues] | Unaccounted [CLEAN/N files] | VERDICT\`
+---
 ## Commit Strategy
 | After Task | Message | Files | Verification |
@@ -67540,9 +67563,11 @@ function createChatMessageHandler2(args) {
     }
     const message = output.message;
     if (firstMessageVariantGate.shouldOverride(input.sessionID)) {
-      const variant = input.model && input.agent ? resolveVariantForModel(pluginConfig, input.agent, input.model) : resolveAgentVariant(pluginConfig, input.agent);
-      if (variant !== undefined) {
-        message["variant"] = variant;
+      if (message["variant"] === undefined) {
+        const variant = input.model && input.agent ? resolveVariantForModel(pluginConfig, input.agent, input.model) : resolveAgentVariant(pluginConfig, input.agent);
+        if (variant !== undefined) {
+          message["variant"] = variant;
+        }
       }
       firstMessageVariantGate.markApplied(input.sessionID);
     } else {