oh-my-opencode 3.5.4 → 3.5.6

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4963,6 +4963,16 @@ Approach:
4963
4963
  - Draft with care
4964
4964
  - Polish for clarity and impact
4965
4965
  - Documentation, READMEs, articles, technical writing
4966
+
4967
+ ANTI-AI-SLOP RULES (NON-NEGOTIABLE):
4968
+ - NEVER use em dashes (\u2014) or en dashes (\u2013). Use commas, periods, ellipses, or line breaks instead. Zero tolerance.
4969
+ - Remove AI-sounding phrases: "delve", "it's important to note", "I'd be happy to", "certainly", "please don't hesitate", "leverage", "utilize", "in order to", "moving forward", "circle back", "at the end of the day", "robust", "streamline", "facilitate"
4970
+ - Pick plain words. "Use" not "utilize". "Start" not "commence". "Help" not "facilitate".
4971
+ - Use contractions naturally: "don't" not "do not", "it's" not "it is".
4972
+ - Vary sentence length. Don't make every sentence the same length.
4973
+ - NEVER start consecutive sentences with the same word.
4974
+ - No filler openings: skip "In today's world...", "As we all know...", "It goes without saying..."
4975
+ - Write like a human, not a corporate template.
4966
4976
  </Category_Context>`, DEEP_CATEGORY_PROMPT_APPEND = `<Category_Context>
4967
4977
  You are working on GOAL-ORIENTED AUTONOMOUS tasks.
4968
4978
 
@@ -5238,14 +5248,14 @@ WHY THIS FORMAT IS MANDATORY:
5238
5248
  `, PLAN_AGENT_NAMES, PLAN_FAMILY_NAMES;
5239
5249
  var init_constants = __esm(() => {
5240
5250
  DEFAULT_CATEGORIES = {
5241
- "visual-engineering": { model: "google/gemini-3-pro" },
5251
+ "visual-engineering": { model: "google/gemini-3-pro", variant: "high" },
5242
5252
  ultrabrain: { model: "openai/gpt-5.3-codex", variant: "xhigh" },
5243
5253
  deep: { model: "openai/gpt-5.3-codex", variant: "medium" },
5244
5254
  artistry: { model: "google/gemini-3-pro", variant: "high" },
5245
5255
  quick: { model: "anthropic/claude-haiku-4-5" },
5246
5256
  "unspecified-low": { model: "anthropic/claude-sonnet-4-5" },
5247
5257
  "unspecified-high": { model: "anthropic/claude-opus-4-6", variant: "max" },
5248
- writing: { model: "google/gemini-3-flash" }
5258
+ writing: { model: "kimi-for-coding/k2p5" }
5249
5259
  };
5250
5260
  CATEGORY_PROMPT_APPENDS = {
5251
5261
  "visual-engineering": VISUAL_CATEGORY_PROMPT_APPEND,
@@ -12230,6 +12240,8 @@ var TOAST_DURATION_MS = 900;
12230
12240
  var COUNTDOWN_GRACE_PERIOD_MS = 500;
12231
12241
  var ABORT_WINDOW_MS = 3000;
12232
12242
  var CONTINUATION_COOLDOWN_MS = 30000;
12243
+ var MAX_CONSECUTIVE_FAILURES = 5;
12244
+ var FAILURE_RESET_WINDOW_MS = 5 * 60 * 1000;
12233
12245
 
12234
12246
  // src/hooks/todo-continuation-enforcer/handler.ts
12235
12247
  init_logger();
@@ -12454,11 +12466,14 @@ ${todoList}`;
12454
12466
  if (injectionState) {
12455
12467
  injectionState.inFlight = false;
12456
12468
  injectionState.lastInjectedAt = Date.now();
12469
+ injectionState.consecutiveFailures = 0;
12457
12470
  }
12458
12471
  } catch (error) {
12459
12472
  log(`[${HOOK_NAME}] Injection failed`, { sessionID, error: String(error) });
12460
12473
  if (injectionState) {
12461
12474
  injectionState.inFlight = false;
12475
+ injectionState.lastInjectedAt = Date.now();
12476
+ injectionState.consecutiveFailures = (injectionState.consecutiveFailures ?? 0) + 1;
12462
12477
  }
12463
12478
  }
12464
12479
  }
@@ -12577,8 +12592,28 @@ async function handleSessionIdle(args) {
12577
12592
  log(`[${HOOK_NAME}] Skipped: injection in flight`, { sessionID });
12578
12593
  return;
12579
12594
  }
12580
- if (state.lastInjectedAt && Date.now() - state.lastInjectedAt < CONTINUATION_COOLDOWN_MS) {
12581
- log(`[${HOOK_NAME}] Skipped: cooldown active`, { sessionID });
12595
+ if (state.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES && state.lastInjectedAt && Date.now() - state.lastInjectedAt >= FAILURE_RESET_WINDOW_MS) {
12596
+ state.consecutiveFailures = 0;
12597
+ log(`[${HOOK_NAME}] Reset consecutive failures after recovery window`, {
12598
+ sessionID,
12599
+ failureResetWindowMs: FAILURE_RESET_WINDOW_MS
12600
+ });
12601
+ }
12602
+ if (state.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
12603
+ log(`[${HOOK_NAME}] Skipped: max consecutive failures reached`, {
12604
+ sessionID,
12605
+ consecutiveFailures: state.consecutiveFailures,
12606
+ maxConsecutiveFailures: MAX_CONSECUTIVE_FAILURES
12607
+ });
12608
+ return;
12609
+ }
12610
+ const effectiveCooldown = CONTINUATION_COOLDOWN_MS * Math.pow(2, Math.min(state.consecutiveFailures, 5));
12611
+ if (state.lastInjectedAt && Date.now() - state.lastInjectedAt < effectiveCooldown) {
12612
+ log(`[${HOOK_NAME}] Skipped: cooldown active`, {
12613
+ sessionID,
12614
+ effectiveCooldown,
12615
+ consecutiveFailures: state.consecutiveFailures
12616
+ });
12582
12617
  return;
12583
12618
  }
12584
12619
  let resolvedInfo;
@@ -12767,7 +12802,9 @@ function createSessionStateStore() {
12767
12802
  existing.lastAccessedAt = Date.now();
12768
12803
  return existing.state;
12769
12804
  }
12770
- const state = {};
12805
+ const state = {
12806
+ consecutiveFailures: 0
12807
+ };
12771
12808
  sessions.set(sessionID, { state, lastAccessedAt: Date.now() });
12772
12809
  return state;
12773
12810
  }
@@ -28285,11 +28322,17 @@ function createRecoveryState() {
28285
28322
  function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
28286
28323
  const autoCompactState = createRecoveryState();
28287
28324
  const experimental = options?.experimental;
28325
+ const pendingCompactionTimeoutBySession = new Map;
28288
28326
  const eventHandler = async ({ event }) => {
28289
28327
  const props = event.properties;
28290
28328
  if (event.type === "session.deleted") {
28291
28329
  const sessionInfo = props?.info;
28292
28330
  if (sessionInfo?.id) {
28331
+ const timeoutID = pendingCompactionTimeoutBySession.get(sessionInfo.id);
28332
+ if (timeoutID !== undefined) {
28333
+ clearTimeout(timeoutID);
28334
+ pendingCompactionTimeoutBySession.delete(sessionInfo.id);
28335
+ }
28293
28336
  autoCompactState.pendingCompact.delete(sessionInfo.id);
28294
28337
  autoCompactState.errorDataBySession.delete(sessionInfo.id);
28295
28338
  autoCompactState.retryStateBySession.delete(sessionInfo.id);
@@ -28324,9 +28367,11 @@ function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
28324
28367
  duration: 3000
28325
28368
  }
28326
28369
  }).catch(() => {});
28327
- setTimeout(() => {
28370
+ const timeoutID = setTimeout(() => {
28371
+ pendingCompactionTimeoutBySession.delete(sessionID);
28328
28372
  executeCompact(sessionID, { providerID, modelID }, autoCompactState, ctx.client, ctx.directory, experimental);
28329
28373
  }, 300);
28374
+ pendingCompactionTimeoutBySession.set(sessionID, timeoutID);
28330
28375
  }
28331
28376
  return;
28332
28377
  }
@@ -28352,6 +28397,11 @@ function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
28352
28397
  return;
28353
28398
  if (!autoCompactState.pendingCompact.has(sessionID))
28354
28399
  return;
28400
+ const timeoutID = pendingCompactionTimeoutBySession.get(sessionID);
28401
+ if (timeoutID !== undefined) {
28402
+ clearTimeout(timeoutID);
28403
+ pendingCompactionTimeoutBySession.delete(sessionID);
28404
+ }
28355
28405
  const errorData = autoCompactState.errorDataBySession.get(sessionID);
28356
28406
  const lastAssistant = await getLastAssistant(sessionID, ctx.client, ctx.directory);
28357
28407
  if (lastAssistant?.summary === true) {
@@ -33130,9 +33180,10 @@ var AGENT_MODEL_REQUIREMENTS = {
33130
33180
  var CATEGORY_MODEL_REQUIREMENTS = {
33131
33181
  "visual-engineering": {
33132
33182
  fallbackChain: [
33133
- { providers: ["google", "github-copilot", "opencode"], model: "gemini-3-pro" },
33183
+ { providers: ["google", "github-copilot", "opencode"], model: "gemini-3-pro", variant: "high" },
33184
+ { providers: ["zai-coding-plan"], model: "glm-5" },
33134
33185
  { providers: ["anthropic", "github-copilot", "opencode"], model: "claude-opus-4-6", variant: "max" },
33135
- { providers: ["zai-coding-plan"], model: "glm-4.7" }
33186
+ { providers: ["kimi-for-coding"], model: "k2p5" }
33136
33187
  ]
33137
33188
  },
33138
33189
  ultrabrain: {
@@ -33181,10 +33232,9 @@ var CATEGORY_MODEL_REQUIREMENTS = {
33181
33232
  },
33182
33233
  writing: {
33183
33234
  fallbackChain: [
33235
+ { providers: ["kimi-for-coding"], model: "k2p5" },
33184
33236
  { providers: ["google", "github-copilot", "opencode"], model: "gemini-3-flash" },
33185
- { providers: ["anthropic", "github-copilot", "opencode"], model: "claude-sonnet-4-5" },
33186
- { providers: ["zai-coding-plan"], model: "glm-4.7" },
33187
- { providers: ["openai", "github-copilot", "opencode"], model: "gpt-5.2" }
33237
+ { providers: ["anthropic", "github-copilot", "opencode"], model: "claude-sonnet-4-5" }
33188
33238
  ]
33189
33239
  }
33190
33240
  };
@@ -44976,6 +45026,7 @@ function createUnstableAgentBabysitterHook(ctx, options) {
44976
45026
  };
44977
45027
  }
44978
45028
  // src/hooks/preemptive-compaction.ts
45029
+ init_logger();
44979
45030
  var DEFAULT_ACTUAL_LIMIT = 200000;
44980
45031
  var ANTHROPIC_ACTUAL_LIMIT3 = process.env.ANTHROPIC_1M_CONTEXT === "true" || process.env.VERTEX_ANTHROPIC_1M_CONTEXT === "true" ? 1e6 : DEFAULT_ACTUAL_LIMIT;
44981
45032
  var PREEMPTIVE_COMPACTION_THRESHOLD = 0.78;
@@ -45007,7 +45058,9 @@ function createPreemptiveCompactionHook(ctx) {
45007
45058
  query: { directory: ctx.directory }
45008
45059
  });
45009
45060
  compactedSessions.add(sessionID);
45010
- } catch {} finally {
45061
+ } catch (error45) {
45062
+ log("[preemptive-compaction] Compaction failed", { sessionID, error: String(error45) });
45063
+ } finally {
45011
45064
  compactionInProgress.delete(sessionID);
45012
45065
  }
45013
45066
  };
@@ -51135,6 +51188,7 @@ Task ID: ${task.id}`;
51135
51188
  const pollStart = Date.now();
51136
51189
  let lastMsgCount = 0;
51137
51190
  let stablePolls = 0;
51191
+ let terminalStatus;
51138
51192
  while (Date.now() - pollStart < timingCfg.MAX_POLL_TIME_MS) {
51139
51193
  if (ctx.abort?.aborted) {
51140
51194
  return `Task aborted (was running in background mode).
@@ -51142,6 +51196,11 @@ Task ID: ${task.id}`;
51142
51196
  Session ID: ${sessionID}`;
51143
51197
  }
51144
51198
  await new Promise((resolve10) => setTimeout(resolve10, timingCfg.POLL_INTERVAL_MS));
51199
+ const currentTask = manager.getTask(task.id);
51200
+ if (currentTask && (currentTask.status === "interrupt" || currentTask.status === "error" || currentTask.status === "cancelled")) {
51201
+ terminalStatus = { status: currentTask.status, error: currentTask.error };
51202
+ break;
51203
+ }
51145
51204
  const statusResult = await client2.session.status();
51146
51205
  const allStatuses = statusResult.data ?? {};
51147
51206
  const sessionStatus = allStatuses[sessionID];
@@ -51164,6 +51223,23 @@ Session ID: ${sessionID}`;
51164
51223
  lastMsgCount = currentMsgCount;
51165
51224
  }
51166
51225
  }
51226
+ if (terminalStatus) {
51227
+ const duration4 = formatDuration2(startTime);
51228
+ return `SUPERVISED TASK FAILED (${terminalStatus.status})
51229
+
51230
+ Task was interrupted/failed while running in monitored background mode.
51231
+ ${terminalStatus.error ? `Error: ${terminalStatus.error}` : ""}
51232
+
51233
+ Duration: ${duration4}
51234
+ Agent: ${agentToUse}${args.category ? ` (category: ${args.category})` : ""}
51235
+ Model: ${actualModel}
51236
+
51237
+ The task session may contain partial results.
51238
+
51239
+ <task_metadata>
51240
+ session_id: ${sessionID}
51241
+ </task_metadata>`;
51242
+ }
51167
51243
  const messagesResult = await client2.session.messages({ path: { id: sessionID } });
51168
51244
  const messages = messagesResult.data ?? messagesResult;
51169
51245
  const assistantMessages = messages.filter((m) => m.info?.role === "assistant").sort((a, b) => (b.info?.time?.created ?? 0) - (a.info?.time?.created ?? 0));
@@ -53497,6 +53573,11 @@ class BackgroundManager {
53497
53573
  });
53498
53574
  return existingTask;
53499
53575
  }
53576
+ const completionTimer = this.completionTimers.get(existingTask.id);
53577
+ if (completionTimer) {
53578
+ clearTimeout(completionTimer);
53579
+ this.completionTimers.delete(existingTask.id);
53580
+ }
53500
53581
  const concurrencyKey = existingTask.concurrencyGroup ?? existingTask.agent;
53501
53582
  await this.concurrencyManager.acquire(concurrencyKey);
53502
53583
  existingTask.concurrencyKey = concurrencyKey;
@@ -53600,7 +53681,7 @@ class BackgroundManager {
53600
53681
  }
53601
53682
  handleEvent(event) {
53602
53683
  const props = event.properties;
53603
- if (event.type === "message.part.updated") {
53684
+ if (event.type === "message.part.updated" || event.type === "message.part.delta") {
53604
53685
  if (!props || typeof props !== "object" || !("sessionID" in props))
53605
53686
  return;
53606
53687
  const partInfo = props;
@@ -53704,6 +53785,10 @@ class BackgroundManager {
53704
53785
  this.cleanupPendingByParent(task);
53705
53786
  this.tasks.delete(task.id);
53706
53787
  this.clearNotificationsForTask(task.id);
53788
+ const toastManager = getTaskToastManager();
53789
+ if (toastManager) {
53790
+ toastManager.removeTask(task.id);
53791
+ }
53707
53792
  if (task.sessionID) {
53708
53793
  subagentSessions.delete(task.sessionID);
53709
53794
  }
@@ -53746,6 +53831,10 @@ class BackgroundManager {
53746
53831
  this.cleanupPendingByParent(task);
53747
53832
  this.tasks.delete(task.id);
53748
53833
  this.clearNotificationsForTask(task.id);
53834
+ const toastManager = getTaskToastManager();
53835
+ if (toastManager) {
53836
+ toastManager.removeTask(task.id);
53837
+ }
53749
53838
  if (task.sessionID) {
53750
53839
  subagentSessions.delete(task.sessionID);
53751
53840
  }
@@ -53860,6 +53949,10 @@ class BackgroundManager {
53860
53949
  }).catch(() => {});
53861
53950
  }
53862
53951
  if (options?.skipNotification) {
53952
+ const toastManager = getTaskToastManager();
53953
+ if (toastManager) {
53954
+ toastManager.removeTask(task.id);
53955
+ }
53863
53956
  log(`[background-agent] Task cancelled via ${source} (notification skipped):`, task.id);
53864
53957
  return true;
53865
53958
  }
@@ -54037,11 +54130,10 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
54037
54130
  }
54038
54131
  } catch (error45) {
54039
54132
  if (this.isAbortedSessionError(error45)) {
54040
- log("[background-agent] Parent session aborted, skipping notification:", {
54133
+ log("[background-agent] Parent session aborted while loading messages; using messageDir fallback:", {
54041
54134
  taskId: task.id,
54042
54135
  parentSessionID: task.parentSessionID
54043
54136
  });
54044
- return;
54045
54137
  }
54046
54138
  const messageDir = getMessageDir12(task.parentSessionID);
54047
54139
  const currentMessage = messageDir ? findNearestMessageWithFields(messageDir) : null;
@@ -54071,13 +54163,13 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
54071
54163
  });
54072
54164
  } catch (error45) {
54073
54165
  if (this.isAbortedSessionError(error45)) {
54074
- log("[background-agent] Parent session aborted, skipping notification:", {
54166
+ log("[background-agent] Parent session aborted while sending notification; continuing cleanup:", {
54075
54167
  taskId: task.id,
54076
54168
  parentSessionID: task.parentSessionID
54077
54169
  });
54078
- return;
54170
+ } else {
54171
+ log("[background-agent] Failed to send notification:", error45);
54079
54172
  }
54080
- log("[background-agent] Failed to send notification:", error45);
54081
54173
  }
54082
54174
  if (allComplete) {
54083
54175
  for (const completedTask of completedTasks) {
@@ -54190,6 +54282,10 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
54190
54282
  }
54191
54283
  }
54192
54284
  this.clearNotificationsForTask(taskId);
54285
+ const toastManager = getTaskToastManager();
54286
+ if (toastManager) {
54287
+ toastManager.removeTask(taskId);
54288
+ }
54193
54289
  this.tasks.delete(taskId);
54194
54290
  if (task.sessionID) {
54195
54291
  subagentSessions.delete(task.sessionID);
@@ -54225,7 +54321,8 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
54225
54321
  const sessionID = task.sessionID;
54226
54322
  if (!startedAt || !sessionID)
54227
54323
  continue;
54228
- const sessionIsRunning = allStatuses[sessionID]?.type === "running";
54324
+ const sessionStatus = allStatuses[sessionID]?.type;
54325
+ const sessionIsRunning = sessionStatus !== undefined && sessionStatus !== "idle";
54229
54326
  const runtime = now - startedAt.getTime();
54230
54327
  if (!task.progress?.lastUpdate) {
54231
54328
  if (sessionIsRunning)
@@ -64430,7 +64527,21 @@ Your ONLY valid output locations are \`.sisyphus/plans/*.md\` and \`.sisyphus/dr
64430
64527
 
64431
64528
  Example: \`.sisyphus/plans/auth-refactor.md\`
64432
64529
 
64433
- ### 5. SINGLE PLAN MANDATE (CRITICAL)
64530
+ ### 5. MAXIMUM PARALLELISM PRINCIPLE (NON-NEGOTIABLE)
64531
+
64532
+ Your plans MUST maximize parallel execution. This is a core planning quality metric.
64533
+
64534
+ **Granularity Rule**: One task = one module/concern = 1-3 files.
64535
+ If a task touches 4+ files or 2+ unrelated concerns, SPLIT IT.
64536
+
64537
+ **Parallelism Target**: Aim for 5-8 tasks per wave.
64538
+ If any wave has fewer than 3 tasks (except the final integration), you under-split.
64539
+
64540
+ **Dependency Minimization**: Structure tasks so shared dependencies
64541
+ (types, interfaces, configs) are extracted as early Wave-1 tasks,
64542
+ unblocking maximum parallelism in subsequent waves.
64543
+
64544
+ ### 6. SINGLE PLAN MANDATE (CRITICAL)
64434
64545
  **No matter how large the task, EVERYTHING goes into ONE work plan.**
64435
64546
 
64436
64547
  **NEVER:**
@@ -64453,7 +64564,7 @@ Example: \`.sisyphus/plans/auth-refactor.md\`
64453
64564
 
64454
64565
  **The plan can have 50+ TODOs. That's OK. ONE PLAN.**
64455
64566
 
64456
- ### 5.1 SINGLE ATOMIC WRITE (CRITICAL - Prevents Content Loss)
64567
+ ### 6.1 SINGLE ATOMIC WRITE (CRITICAL - Prevents Content Loss)
64457
64568
 
64458
64569
  <write_protocol>
64459
64570
  **The Write tool OVERWRITES files. It does NOT append.**
@@ -64496,7 +64607,7 @@ Example: \`.sisyphus/plans/auth-refactor.md\`
64496
64607
  - [ ] File already exists with my content? \u2192 Use Edit to append, NOT Write
64497
64608
  </write_protocol>
64498
64609
 
64499
- ### 6. DRAFT AS WORKING MEMORY (MANDATORY)
64610
+ ### 7. DRAFT AS WORKING MEMORY (MANDATORY)
64500
64611
  **During interview, CONTINUOUSLY record decisions to a draft file.**
64501
64612
 
64502
64613
  **Draft Location**: \`.sisyphus/drafts/{name}.md\`
@@ -65303,108 +65414,25 @@ Generate plan to: \`.sisyphus/plans/{name}.md\`
65303
65414
 
65304
65415
  ## Verification Strategy (MANDATORY)
65305
65416
 
65306
- > **UNIVERSAL RULE: ZERO HUMAN INTERVENTION**
65307
- >
65308
- > ALL tasks in this plan MUST be verifiable WITHOUT any human action.
65309
- > This is NOT conditional \u2014 it applies to EVERY task, regardless of test strategy.
65310
- >
65311
- > **FORBIDDEN** \u2014 acceptance criteria that require:
65312
- > - "User manually tests..." / "\uC0AC\uC6A9\uC790\uAC00 \uC9C1\uC811 \uD14C\uC2A4\uD2B8..."
65313
- > - "User visually confirms..." / "\uC0AC\uC6A9\uC790\uAC00 \uB208\uC73C\uB85C \uD655\uC778..."
65314
- > - "User interacts with..." / "\uC0AC\uC6A9\uC790\uAC00 \uC9C1\uC811 \uC870\uC791..."
65315
- > - "Ask user to verify..." / "\uC0AC\uC6A9\uC790\uC5D0\uAC8C \uD655\uC778 \uC694\uCCAD..."
65316
- > - ANY step where a human must perform an action
65317
- >
65318
- > **ALL verification is executed by the agent** using tools (Playwright, interactive_bash, curl, etc.). No exceptions.
65417
+ > **ZERO HUMAN INTERVENTION** \u2014 ALL verification is agent-executed. No exceptions.
65418
+ > Acceptance criteria requiring "user manually tests/confirms" are FORBIDDEN.
65319
65419
 
65320
65420
  ### Test Decision
65321
65421
  - **Infrastructure exists**: [YES/NO]
65322
65422
  - **Automated tests**: [TDD / Tests-after / None]
65323
65423
  - **Framework**: [bun test / vitest / jest / pytest / none]
65424
+ - **If TDD**: Each task follows RED (failing test) \u2192 GREEN (minimal impl) \u2192 REFACTOR
65324
65425
 
65325
- ### If TDD Enabled
65326
-
65327
- Each TODO follows RED-GREEN-REFACTOR:
65328
-
65329
- **Task Structure:**
65330
- 1. **RED**: Write failing test first
65331
- - Test file: \`[path].test.ts\`
65332
- - Test command: \`bun test [file]\`
65333
- - Expected: FAIL (test exists, implementation doesn't)
65334
- 2. **GREEN**: Implement minimum code to pass
65335
- - Command: \`bun test [file]\`
65336
- - Expected: PASS
65337
- 3. **REFACTOR**: Clean up while keeping green
65338
- - Command: \`bun test [file]\`
65339
- - Expected: PASS (still)
65340
-
65341
- **Test Setup Task (if infrastructure doesn't exist):**
65342
- - [ ] 0. Setup Test Infrastructure
65343
- - Install: \`bun add -d [test-framework]\`
65344
- - Config: Create \`[config-file]\`
65345
- - Verify: \`bun test --help\` \u2192 shows help
65346
- - Example: Create \`src/__tests__/example.test.ts\`
65347
- - Verify: \`bun test\` \u2192 1 test passes
65348
-
65349
- ### Agent-Executed QA Scenarios (MANDATORY \u2014 ALL tasks)
65350
-
65351
- > Whether TDD is enabled or not, EVERY task MUST include Agent-Executed QA Scenarios.
65352
- > - **With TDD**: QA scenarios complement unit tests at integration/E2E level
65353
- > - **Without TDD**: QA scenarios are the PRIMARY verification method
65354
- >
65355
- > These describe how the executing agent DIRECTLY verifies the deliverable
65356
- > by running it \u2014 opening browsers, executing commands, sending API requests.
65357
- > The agent performs what a human tester would do, but automated via tools.
65358
-
65359
- **Verification Tool by Deliverable Type:**
65360
-
65361
- | Type | Tool | How Agent Verifies |
65362
- |------|------|-------------------|
65363
- | **Frontend/UI** | Playwright (playwright skill) | Navigate, interact, assert DOM, screenshot |
65364
- | **TUI/CLI** | interactive_bash (tmux) | Run command, send keystrokes, validate output |
65365
- | **API/Backend** | Bash (curl/httpie) | Send requests, parse responses, assert fields |
65366
- | **Library/Module** | Bash (bun/node REPL) | Import, call functions, compare output |
65367
- | **Config/Infra** | Bash (shell commands) | Apply config, run state checks, validate |
65368
-
65369
- **Each Scenario MUST Follow This Format:**
65370
-
65371
- \`\`\`
65372
- Scenario: [Descriptive name \u2014 what user action/flow is being verified]
65373
- Tool: [Playwright / interactive_bash / Bash]
65374
- Preconditions: [What must be true before this scenario runs]
65375
- Steps:
65376
- 1. [Exact action with specific selector/command/endpoint]
65377
- 2. [Next action with expected intermediate state]
65378
- 3. [Assertion with exact expected value]
65379
- Expected Result: [Concrete, observable outcome]
65380
- Failure Indicators: [What would indicate failure]
65381
- Evidence: [Screenshot path / output capture / response body path]
65382
- \`\`\`
65426
+ ### QA Policy
65427
+ Every task MUST include agent-executed QA scenarios (see TODO template below).
65428
+ Evidence saved to \`.sisyphus/evidence/task-{N}-{scenario-slug}.{ext}\`.
65383
65429
 
65384
- **Scenario Detail Requirements:**
65385
- - **Selectors**: Specific CSS selectors (\`.login-button\`, not "the login button")
65386
- - **Data**: Concrete test data (\`"test@example.com"\`, not \`"[email]"\`)
65387
- - **Assertions**: Exact values (\`text contains "Welcome back"\`, not "verify it works")
65388
- - **Timing**: Include wait conditions where relevant (\`Wait for .dashboard (timeout: 10s)\`)
65389
- - **Negative Scenarios**: At least ONE failure/error scenario per feature
65390
- - **Evidence Paths**: Specific file paths (\`.sisyphus/evidence/task-N-scenario-name.png\`)
65391
-
65392
- **Anti-patterns (NEVER write scenarios like this):**
65393
- - \u274C "Verify the login page works correctly"
65394
- - \u274C "Check that the API returns the right data"
65395
- - \u274C "Test the form validation"
65396
- - \u274C "User opens browser and confirms..."
65397
-
65398
- **Write scenarios like this instead:**
65399
- - \u2705 \`Navigate to /login \u2192 Fill input[name="email"] with "test@example.com" \u2192 Fill input[name="password"] with "Pass123!" \u2192 Click button[type="submit"] \u2192 Wait for /dashboard \u2192 Assert h1 contains "Welcome"\`
65400
- - \u2705 \`POST /api/users {"name":"Test","email":"new@test.com"} \u2192 Assert status 201 \u2192 Assert response.id is UUID \u2192 GET /api/users/{id} \u2192 Assert name equals "Test"\`
65401
- - \u2705 \`Run ./cli --config test.yaml \u2192 Wait for "Loaded" in stdout \u2192 Send "q" \u2192 Assert exit code 0 \u2192 Assert stdout contains "Goodbye"\`
65402
-
65403
- **Evidence Requirements:**
65404
- - Screenshots: \`.sisyphus/evidence/\` for all UI verifications
65405
- - Terminal output: Captured for CLI/TUI verifications
65406
- - Response bodies: Saved for API verifications
65407
- - All evidence referenced by specific file path in acceptance criteria
65430
+ | Deliverable Type | Verification Tool | Method |
65431
+ |------------------|-------------------|--------|
65432
+ | Frontend/UI | Playwright (playwright skill) | Navigate, interact, assert DOM, screenshot |
65433
+ | TUI/CLI | interactive_bash (tmux) | Run command, send keystrokes, validate output |
65434
+ | API/Backend | Bash (curl) | Send requests, assert status + response fields |
65435
+ | Library/Module | Bash (bun/node REPL) | Import, call functions, compare output |
65408
65436
 
65409
65437
  ---
65410
65438
 
@@ -65414,49 +65442,82 @@ Scenario: [Descriptive name \u2014 what user action/flow is being verified]
65414
65442
 
65415
65443
  > Maximize throughput by grouping independent tasks into parallel waves.
65416
65444
  > Each wave completes before the next begins.
65445
+ > Target: 5-8 tasks per wave. Fewer than 3 per wave (except final) = under-splitting.
65417
65446
 
65418
65447
  \`\`\`
65419
- Wave 1 (Start Immediately):
65420
- \u251C\u2500\u2500 Task 1: [no dependencies]
65421
- \u2514\u2500\u2500 Task 5: [no dependencies]
65422
-
65423
- Wave 2 (After Wave 1):
65424
- \u251C\u2500\u2500 Task 2: [depends: 1]
65425
- \u251C\u2500\u2500 Task 3: [depends: 1]
65426
- \u2514\u2500\u2500 Task 6: [depends: 5]
65427
-
65428
- Wave 3 (After Wave 2):
65429
- \u2514\u2500\u2500 Task 4: [depends: 2, 3]
65430
-
65431
- Critical Path: Task 1 \u2192 Task 2 \u2192 Task 4
65432
- Parallel Speedup: ~40% faster than sequential
65448
+ Wave 1 (Start Immediately \u2014 foundation + scaffolding):
65449
+ \u251C\u2500\u2500 Task 1: Project scaffolding + config [quick]
65450
+ \u251C\u2500\u2500 Task 2: Design system tokens [quick]
65451
+ \u251C\u2500\u2500 Task 3: Type definitions [quick]
65452
+ \u251C\u2500\u2500 Task 4: Schema definitions [quick]
65453
+ \u251C\u2500\u2500 Task 5: Storage interface + in-memory impl [quick]
65454
+ \u251C\u2500\u2500 Task 6: Auth middleware [quick]
65455
+ \u2514\u2500\u2500 Task 7: Client module [quick]
65456
+
65457
+ Wave 2 (After Wave 1 \u2014 core modules, MAX PARALLEL):
65458
+ \u251C\u2500\u2500 Task 8: Core business logic (depends: 3, 5, 7) [deep]
65459
+ \u251C\u2500\u2500 Task 9: API endpoints (depends: 4, 5) [unspecified-high]
65460
+ \u251C\u2500\u2500 Task 10: Secondary storage impl (depends: 5) [unspecified-high]
65461
+ \u251C\u2500\u2500 Task 11: Retry/fallback logic (depends: 8) [deep]
65462
+ \u251C\u2500\u2500 Task 12: UI layout + navigation (depends: 2) [visual-engineering]
65463
+ \u251C\u2500\u2500 Task 13: API client + hooks (depends: 4) [quick]
65464
+ \u2514\u2500\u2500 Task 14: Telemetry middleware (depends: 5, 10) [unspecified-high]
65465
+
65466
+ Wave 3 (After Wave 2 \u2014 integration + UI):
65467
+ \u251C\u2500\u2500 Task 15: Main route combining modules (depends: 6, 11, 14) [deep]
65468
+ \u251C\u2500\u2500 Task 16: UI data visualization (depends: 12, 13) [visual-engineering]
65469
+ \u251C\u2500\u2500 Task 17: Deployment config A (depends: 15) [quick]
65470
+ \u251C\u2500\u2500 Task 18: Deployment config B (depends: 15) [quick]
65471
+ \u251C\u2500\u2500 Task 19: Deployment config C (depends: 15) [quick]
65472
+ \u2514\u2500\u2500 Task 20: UI request log + build (depends: 16) [visual-engineering]
65473
+
65474
+ Wave 4 (After Wave 3 \u2014 verification):
65475
+ \u251C\u2500\u2500 Task 21: Integration tests (depends: 15) [deep]
65476
+ \u251C\u2500\u2500 Task 22: UI QA - Playwright (depends: 20) [unspecified-high]
65477
+ \u251C\u2500\u2500 Task 23: E2E QA (depends: 21) [deep]
65478
+ \u2514\u2500\u2500 Task 24: Git cleanup + tagging (depends: 21) [git]
65479
+
65480
+ Wave FINAL (After ALL tasks \u2014 independent review, 4 parallel):
65481
+ \u251C\u2500\u2500 Task F1: Plan compliance audit (oracle)
65482
+ \u251C\u2500\u2500 Task F2: Code quality review (unspecified-high)
65483
+ \u251C\u2500\u2500 Task F3: Real manual QA (unspecified-high)
65484
+ \u2514\u2500\u2500 Task F4: Scope fidelity check (deep)
65485
+
65486
+ Critical Path: Task 1 \u2192 Task 5 \u2192 Task 8 \u2192 Task 11 \u2192 Task 15 \u2192 Task 21 \u2192 F1-F4
65487
+ Parallel Speedup: ~70% faster than sequential
65488
+ Max Concurrent: 7 (Waves 1 & 2)
65433
65489
  \`\`\`
65434
65490
 
65435
- ### Dependency Matrix
65491
+ ### Dependency Matrix (abbreviated \u2014 show ALL tasks in your generated plan)
65436
65492
 
65437
- | Task | Depends On | Blocks | Can Parallelize With |
65438
- |------|------------|--------|---------------------|
65439
- | 1 | None | 2, 3 | 5 |
65440
- | 2 | 1 | 4 | 3, 6 |
65441
- | 3 | 1 | 4 | 2, 6 |
65442
- | 4 | 2, 3 | None | None (final) |
65443
- | 5 | None | 6 | 1 |
65444
- | 6 | 5 | None | 2, 3 |
65493
+ | Task | Depends On | Blocks | Wave |
65494
+ |------|------------|--------|------|
65495
+ | 1-7 | \u2014 | 8-14 | 1 |
65496
+ | 8 | 3, 5, 7 | 11, 15 | 2 |
65497
+ | 11 | 8 | 15 | 2 |
65498
+ | 14 | 5, 10 | 15 | 2 |
65499
+ | 15 | 6, 11, 14 | 17-19, 21 | 3 |
65500
+ | 21 | 15 | 23, 24 | 4 |
65501
+
65502
+ > This is abbreviated for reference. YOUR generated plan must include the FULL matrix for ALL tasks.
65445
65503
 
65446
65504
  ### Agent Dispatch Summary
65447
65505
 
65448
- | Wave | Tasks | Recommended Agents |
65449
- |------|-------|-------------------|
65450
- | 1 | 1, 5 | task(category="...", load_skills=[...], run_in_background=false) |
65451
- | 2 | 2, 3, 6 | dispatch parallel after Wave 1 completes |
65452
- | 3 | 4 | final integration task |
65506
+ | Wave | # Parallel | Tasks \u2192 Agent Category |
65507
+ |------|------------|----------------------|
65508
+ | 1 | **7** | T1-T4 \u2192 \`quick\`, T5 \u2192 \`quick\`, T6 \u2192 \`quick\`, T7 \u2192 \`quick\` |
65509
+ | 2 | **7** | T8 \u2192 \`deep\`, T9 \u2192 \`unspecified-high\`, T10 \u2192 \`unspecified-high\`, T11 \u2192 \`deep\`, T12 \u2192 \`visual-engineering\`, T13 \u2192 \`quick\`, T14 \u2192 \`unspecified-high\` |
65510
+ | 3 | **6** | T15 \u2192 \`deep\`, T16 \u2192 \`visual-engineering\`, T17-T19 \u2192 \`quick\`, T20 \u2192 \`visual-engineering\` |
65511
+ | 4 | **4** | T21 \u2192 \`deep\`, T22 \u2192 \`unspecified-high\`, T23 \u2192 \`deep\`, T24 \u2192 \`git\` |
65512
+ | FINAL | **4** | F1 \u2192 \`oracle\`, F2 \u2192 \`unspecified-high\`, F3 \u2192 \`unspecified-high\`, F4 \u2192 \`deep\` |
65453
65513
 
65454
65514
  ---
65455
65515
 
65456
65516
  ## TODOs
65457
65517
 
65458
65518
  > Implementation + Test = ONE Task. Never separate.
65459
- > EVERY task MUST have: Recommended Agent Profile + Parallelization info.
65519
+ > EVERY task MUST have: Recommended Agent Profile + Parallelization info + QA Scenarios.
65520
+ > **A task WITHOUT QA Scenarios is INCOMPLETE. No exceptions.**
65460
65521
 
65461
65522
  - [ ] 1. [Task Title]
65462
65523
 
@@ -65490,22 +65551,15 @@ Parallel Speedup: ~40% faster than sequential
65490
65551
 
65491
65552
  **Pattern References** (existing code to follow):
65492
65553
  - \`src/services/auth.ts:45-78\` - Authentication flow pattern (JWT creation, refresh token handling)
65493
- - \`src/hooks/useForm.ts:12-34\` - Form validation pattern (Zod schema + react-hook-form integration)
65494
65554
 
65495
65555
  **API/Type References** (contracts to implement against):
65496
65556
  - \`src/types/user.ts:UserDTO\` - Response shape for user endpoints
65497
- - \`src/api/schema.ts:createUserSchema\` - Request validation schema
65498
65557
 
65499
65558
  **Test References** (testing patterns to follow):
65500
65559
  - \`src/__tests__/auth.test.ts:describe("login")\` - Test structure and mocking patterns
65501
65560
 
65502
- **Documentation References** (specs and requirements):
65503
- - \`docs/api-spec.md#authentication\` - API contract details
65504
- - \`ARCHITECTURE.md:Database Layer\` - Database access patterns
65505
-
65506
65561
  **External References** (libraries and frameworks):
65507
65562
  - Official docs: \`https://zod.dev/?id=basic-usage\` - Zod validation syntax
65508
- - Example repo: \`github.com/example/project/src/auth\` - Reference implementation
65509
65563
 
65510
65564
  **WHY Each Reference Matters** (explain the relevance):
65511
65565
  - Don't just list files - explain what pattern/information the executor should extract
@@ -65516,113 +65570,60 @@ Parallel Speedup: ~40% faster than sequential
65516
65570
 
65517
65571
  > **AGENT-EXECUTABLE VERIFICATION ONLY** \u2014 No human action permitted.
65518
65572
  > Every criterion MUST be verifiable by running a command or using a tool.
65519
- > REPLACE all placeholders with actual values from task context.
65520
65573
 
65521
65574
  **If TDD (tests enabled):**
65522
65575
  - [ ] Test file created: src/auth/login.test.ts
65523
- - [ ] Test covers: successful login returns JWT token
65524
65576
  - [ ] bun test src/auth/login.test.ts \u2192 PASS (3 tests, 0 failures)
65525
65577
 
65526
- **Agent-Executed QA Scenarios (MANDATORY \u2014 per-scenario, ultra-detailed):**
65527
-
65528
- > Write MULTIPLE named scenarios per task: happy path AND failure cases.
65529
- > Each scenario = exact tool + steps with real selectors/data + evidence path.
65530
-
65531
- **Example \u2014 Frontend/UI (Playwright):**
65578
+ **QA Scenarios (MANDATORY \u2014 task is INCOMPLETE without these):**
65532
65579
 
65533
- \\\`\\\`\\\`
65534
- Scenario: Successful login redirects to dashboard
65535
- Tool: Playwright (playwright skill)
65536
- Preconditions: Dev server running on localhost:3000, test user exists
65537
- Steps:
65538
- 1. Navigate to: http://localhost:3000/login
65539
- 2. Wait for: input[name="email"] visible (timeout: 5s)
65540
- 3. Fill: input[name="email"] \u2192 "test@example.com"
65541
- 4. Fill: input[name="password"] \u2192 "ValidPass123!"
65542
- 5. Click: button[type="submit"]
65543
- 6. Wait for: navigation to /dashboard (timeout: 10s)
65544
- 7. Assert: h1 text contains "Welcome back"
65545
- 8. Assert: cookie "session_token" exists
65546
- 9. Screenshot: .sisyphus/evidence/task-1-login-success.png
65547
- Expected Result: Dashboard loads with welcome message
65548
- Evidence: .sisyphus/evidence/task-1-login-success.png
65549
-
65550
- Scenario: Login fails with invalid credentials
65551
- Tool: Playwright (playwright skill)
65552
- Preconditions: Dev server running, no valid user with these credentials
65553
- Steps:
65554
- 1. Navigate to: http://localhost:3000/login
65555
- 2. Fill: input[name="email"] \u2192 "wrong@example.com"
65556
- 3. Fill: input[name="password"] \u2192 "WrongPass"
65557
- 4. Click: button[type="submit"]
65558
- 5. Wait for: .error-message visible (timeout: 5s)
65559
- 6. Assert: .error-message text contains "Invalid credentials"
65560
- 7. Assert: URL is still /login (no redirect)
65561
- 8. Screenshot: .sisyphus/evidence/task-1-login-failure.png
65562
- Expected Result: Error message shown, stays on login page
65563
- Evidence: .sisyphus/evidence/task-1-login-failure.png
65564
- \\\`\\\`\\\`
65565
-
65566
- **Example \u2014 API/Backend (curl):**
65580
+ > **This is NOT optional. A task without QA scenarios WILL BE REJECTED.**
65581
+ >
65582
+ > Write scenario tests that verify the ACTUAL BEHAVIOR of what you built.
65583
+ > Minimum: 1 happy path + 1 failure/edge case per task.
65584
+ > Each scenario = exact tool + exact steps + exact assertions + evidence path.
65585
+ >
65586
+ > **The executing agent MUST run these scenarios after implementation.**
65587
+ > **The orchestrator WILL verify evidence files exist before marking task complete.**
65567
65588
 
65568
65589
  \\\`\\\`\\\`
65569
- Scenario: Create user returns 201 with UUID
65570
- Tool: Bash (curl)
65571
- Preconditions: Server running on localhost:8080
65590
+ Scenario: [Happy path \u2014 what SHOULD work]
65591
+ Tool: [Playwright / interactive_bash / Bash (curl)]
65592
+ Preconditions: [Exact setup state]
65572
65593
  Steps:
65573
- 1. curl -s -w "\\n%{http_code}" -X POST http://localhost:8080/api/users \\
65574
- -H "Content-Type: application/json" \\
65575
- -d '{"email":"new@test.com","name":"Test User"}'
65576
- 2. Assert: HTTP status is 201
65577
- 3. Assert: response.id matches UUID format
65578
- 4. GET /api/users/{returned-id} \u2192 Assert name equals "Test User"
65579
- Expected Result: User created and retrievable
65580
- Evidence: Response bodies captured
65581
-
65582
- Scenario: Duplicate email returns 409
65583
- Tool: Bash (curl)
65584
- Preconditions: User with email "new@test.com" already exists
65594
+ 1. [Exact action \u2014 specific command/selector/endpoint, no vagueness]
65595
+ 2. [Next action \u2014 with expected intermediate state]
65596
+ 3. [Assertion \u2014 exact expected value, not "verify it works"]
65597
+ Expected Result: [Concrete, observable, binary pass/fail]
65598
+ Failure Indicators: [What specifically would mean this failed]
65599
+ Evidence: .sisyphus/evidence/task-{N}-{scenario-slug}.{ext}
65600
+
65601
+ Scenario: [Failure/edge case \u2014 what SHOULD fail gracefully]
65602
+ Tool: [same format]
65603
+ Preconditions: [Invalid input / missing dependency / error state]
65585
65604
  Steps:
65586
- 1. Repeat POST with same email
65587
- 2. Assert: HTTP status is 409
65588
- 3. Assert: response.error contains "already exists"
65589
- Expected Result: Conflict error returned
65590
- Evidence: Response body captured
65605
+ 1. [Trigger the error condition]
65606
+ 2. [Assert error is handled correctly]
65607
+ Expected Result: [Graceful failure with correct error message/code]
65608
+ Evidence: .sisyphus/evidence/task-{N}-{scenario-slug}-error.{ext}
65591
65609
  \\\`\\\`\\\`
65592
65610
 
65593
- **Example \u2014 TUI/CLI (interactive_bash):**
65594
-
65595
- \\\`\\\`\\\`
65596
- Scenario: CLI loads config and displays menu
65597
- Tool: interactive_bash (tmux)
65598
- Preconditions: Binary built, test config at ./test.yaml
65599
- Steps:
65600
- 1. tmux new-session: ./my-cli --config test.yaml
65601
- 2. Wait for: "Configuration loaded" in output (timeout: 5s)
65602
- 3. Assert: Menu items visible ("1. Create", "2. List", "3. Exit")
65603
- 4. Send keys: "3" then Enter
65604
- 5. Assert: "Goodbye" in output
65605
- 6. Assert: Process exited with code 0
65606
- Expected Result: CLI starts, shows menu, exits cleanly
65607
- Evidence: Terminal output captured
65608
-
65609
- Scenario: CLI handles missing config gracefully
65610
- Tool: interactive_bash (tmux)
65611
- Preconditions: No config file at ./nonexistent.yaml
65612
- Steps:
65613
- 1. tmux new-session: ./my-cli --config nonexistent.yaml
65614
- 2. Wait for: output (timeout: 3s)
65615
- 3. Assert: stderr contains "Config file not found"
65616
- 4. Assert: Process exited with code 1
65617
- Expected Result: Meaningful error, non-zero exit
65618
- Evidence: Error output captured
65619
- \\\`\\\`\\\`
65611
+ > **Specificity requirements \u2014 every scenario MUST use:**
65612
+ > - **Selectors**: Specific CSS selectors (\`.login-button\`, not "the login button")
65613
+ > - **Data**: Concrete test data (\`"test@example.com"\`, not \`"[email]"\`)
65614
+ > - **Assertions**: Exact values (\`text contains "Welcome back"\`, not "verify it works")
65615
+ > - **Timing**: Wait conditions where relevant (\`timeout: 10s\`)
65616
+ > - **Negative**: At least ONE failure/error scenario per task
65617
+ >
65618
+ > **Anti-patterns (your scenario is INVALID if it looks like this):**
65619
+ > - \u274C "Verify it works correctly" \u2014 HOW? What does "correctly" mean?
65620
+ > - \u274C "Check the API returns data" \u2014 WHAT data? What fields? What values?
65621
+ > - \u274C "Test the component renders" \u2014 WHERE? What selector? What content?
65622
+ > - \u274C Any scenario without an evidence path
65620
65623
 
65621
65624
  **Evidence to Capture:**
65622
- - [ ] Screenshots in .sisyphus/evidence/ for UI scenarios
65623
- - [ ] Terminal output for CLI/TUI scenarios
65624
- - [ ] Response bodies for API scenarios
65625
65625
  - [ ] Each evidence file named: task-{N}-{scenario-slug}.{ext}
65626
+ - [ ] Screenshots for UI, terminal output for CLI, response bodies for API
65626
65627
 
65627
65628
  **Commit**: YES | NO (groups with N)
65628
65629
  - Message: \`type(scope): desc\`
@@ -65631,6 +65632,28 @@ Parallel Speedup: ~40% faster than sequential
65631
65632
 
65632
65633
  ---
65633
65634
 
65635
+ ## Final Verification Wave (MANDATORY \u2014 after ALL implementation tasks)
65636
+
65637
+ > 4 review agents run in PARALLEL. ALL must APPROVE. Rejection \u2192 fix \u2192 re-run.
65638
+
65639
+ - [ ] F1. **Plan Compliance Audit** \u2014 \`oracle\`
65640
+ Read the plan end-to-end. For each "Must Have": verify implementation exists (read file, curl endpoint, run command). For each "Must NOT Have": search codebase for forbidden patterns \u2014 reject with file:line if found. Check evidence files exist in .sisyphus/evidence/. Compare deliverables against plan.
65641
+ Output: \`Must Have [N/N] | Must NOT Have [N/N] | Tasks [N/N] | VERDICT: APPROVE/REJECT\`
65642
+
65643
+ - [ ] F2. **Code Quality Review** \u2014 \`unspecified-high\`
65644
+ Run \`tsc --noEmit\` + linter + \`bun test\`. Review all changed files for: \`as any\`/\`@ts-ignore\`, empty catches, console.log in prod, commented-out code, unused imports. Check AI slop: excessive comments, over-abstraction, generic names (data/result/item/temp).
65645
+ Output: \`Build [PASS/FAIL] | Lint [PASS/FAIL] | Tests [N pass/N fail] | Files [N clean/N issues] | VERDICT\`
65646
+
65647
+ - [ ] F3. **Real Manual QA** \u2014 \`unspecified-high\` (+ \`playwright\` skill if UI)
65648
+ Start from clean state. Execute EVERY QA scenario from EVERY task \u2014 follow exact steps, capture evidence. Test cross-task integration (features working together, not in isolation). Test edge cases: empty state, invalid input, rapid actions. Save to \`.sisyphus/evidence/final-qa/\`.
65649
+ Output: \`Scenarios [N/N pass] | Integration [N/N] | Edge Cases [N tested] | VERDICT\`
65650
+
65651
+ - [ ] F4. **Scope Fidelity Check** \u2014 \`deep\`
65652
+ For each task: read "What to do", read actual diff (git log/diff). Verify 1:1 \u2014 everything in spec was built (nothing missing), nothing beyond spec was built (no creep). Check "Must NOT do" compliance. Detect cross-task contamination: Task N touching Task M's files. Flag unaccounted changes.
65653
+ Output: \`Tasks [N/N compliant] | Contamination [CLEAN/N issues] | Unaccounted [CLEAN/N files] | VERDICT\`
65654
+
65655
+ ---
65656
+
65634
65657
  ## Commit Strategy
65635
65658
 
65636
65659
  | After Task | Message | Files | Verification |
@@ -67540,9 +67563,11 @@ function createChatMessageHandler2(args) {
67540
67563
  }
67541
67564
  const message = output.message;
67542
67565
  if (firstMessageVariantGate.shouldOverride(input.sessionID)) {
67543
- const variant = input.model && input.agent ? resolveVariantForModel(pluginConfig, input.agent, input.model) : resolveAgentVariant(pluginConfig, input.agent);
67544
- if (variant !== undefined) {
67545
- message["variant"] = variant;
67566
+ if (message["variant"] === undefined) {
67567
+ const variant = input.model && input.agent ? resolveVariantForModel(pluginConfig, input.agent, input.model) : resolveAgentVariant(pluginConfig, input.agent);
67568
+ if (variant !== undefined) {
67569
+ message["variant"] = variant;
67570
+ }
67546
67571
  }
67547
67572
  firstMessageVariantGate.markApplied(input.sessionID);
67548
67573
  } else {