oh-my-opencode 3.5.5 → 3.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -12240,6 +12240,8 @@ var TOAST_DURATION_MS = 900;
12240
12240
  var COUNTDOWN_GRACE_PERIOD_MS = 500;
12241
12241
  var ABORT_WINDOW_MS = 3000;
12242
12242
  var CONTINUATION_COOLDOWN_MS = 30000;
12243
+ var MAX_CONSECUTIVE_FAILURES = 5;
12244
+ var FAILURE_RESET_WINDOW_MS = 5 * 60 * 1000;
12243
12245
 
12244
12246
  // src/hooks/todo-continuation-enforcer/handler.ts
12245
12247
  init_logger();
@@ -12464,11 +12466,14 @@ ${todoList}`;
12464
12466
  if (injectionState) {
12465
12467
  injectionState.inFlight = false;
12466
12468
  injectionState.lastInjectedAt = Date.now();
12469
+ injectionState.consecutiveFailures = 0;
12467
12470
  }
12468
12471
  } catch (error) {
12469
12472
  log(`[${HOOK_NAME}] Injection failed`, { sessionID, error: String(error) });
12470
12473
  if (injectionState) {
12471
12474
  injectionState.inFlight = false;
12475
+ injectionState.lastInjectedAt = Date.now();
12476
+ injectionState.consecutiveFailures = (injectionState.consecutiveFailures ?? 0) + 1;
12472
12477
  }
12473
12478
  }
12474
12479
  }
@@ -12587,8 +12592,28 @@ async function handleSessionIdle(args) {
12587
12592
  log(`[${HOOK_NAME}] Skipped: injection in flight`, { sessionID });
12588
12593
  return;
12589
12594
  }
12590
- if (state.lastInjectedAt && Date.now() - state.lastInjectedAt < CONTINUATION_COOLDOWN_MS) {
12591
- log(`[${HOOK_NAME}] Skipped: cooldown active`, { sessionID });
12595
+ if (state.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES && state.lastInjectedAt && Date.now() - state.lastInjectedAt >= FAILURE_RESET_WINDOW_MS) {
12596
+ state.consecutiveFailures = 0;
12597
+ log(`[${HOOK_NAME}] Reset consecutive failures after recovery window`, {
12598
+ sessionID,
12599
+ failureResetWindowMs: FAILURE_RESET_WINDOW_MS
12600
+ });
12601
+ }
12602
+ if (state.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
12603
+ log(`[${HOOK_NAME}] Skipped: max consecutive failures reached`, {
12604
+ sessionID,
12605
+ consecutiveFailures: state.consecutiveFailures,
12606
+ maxConsecutiveFailures: MAX_CONSECUTIVE_FAILURES
12607
+ });
12608
+ return;
12609
+ }
12610
+ const effectiveCooldown = CONTINUATION_COOLDOWN_MS * Math.pow(2, Math.min(state.consecutiveFailures, 5));
12611
+ if (state.lastInjectedAt && Date.now() - state.lastInjectedAt < effectiveCooldown) {
12612
+ log(`[${HOOK_NAME}] Skipped: cooldown active`, {
12613
+ sessionID,
12614
+ effectiveCooldown,
12615
+ consecutiveFailures: state.consecutiveFailures
12616
+ });
12592
12617
  return;
12593
12618
  }
12594
12619
  let resolvedInfo;
@@ -12777,7 +12802,9 @@ function createSessionStateStore() {
12777
12802
  existing.lastAccessedAt = Date.now();
12778
12803
  return existing.state;
12779
12804
  }
12780
- const state = {};
12805
+ const state = {
12806
+ consecutiveFailures: 0
12807
+ };
12781
12808
  sessions.set(sessionID, { state, lastAccessedAt: Date.now() });
12782
12809
  return state;
12783
12810
  }
@@ -28295,11 +28322,17 @@ function createRecoveryState() {
28295
28322
  function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
28296
28323
  const autoCompactState = createRecoveryState();
28297
28324
  const experimental = options?.experimental;
28325
+ const pendingCompactionTimeoutBySession = new Map;
28298
28326
  const eventHandler = async ({ event }) => {
28299
28327
  const props = event.properties;
28300
28328
  if (event.type === "session.deleted") {
28301
28329
  const sessionInfo = props?.info;
28302
28330
  if (sessionInfo?.id) {
28331
+ const timeoutID = pendingCompactionTimeoutBySession.get(sessionInfo.id);
28332
+ if (timeoutID !== undefined) {
28333
+ clearTimeout(timeoutID);
28334
+ pendingCompactionTimeoutBySession.delete(sessionInfo.id);
28335
+ }
28303
28336
  autoCompactState.pendingCompact.delete(sessionInfo.id);
28304
28337
  autoCompactState.errorDataBySession.delete(sessionInfo.id);
28305
28338
  autoCompactState.retryStateBySession.delete(sessionInfo.id);
@@ -28334,9 +28367,11 @@ function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
28334
28367
  duration: 3000
28335
28368
  }
28336
28369
  }).catch(() => {});
28337
- setTimeout(() => {
28370
+ const timeoutID = setTimeout(() => {
28371
+ pendingCompactionTimeoutBySession.delete(sessionID);
28338
28372
  executeCompact(sessionID, { providerID, modelID }, autoCompactState, ctx.client, ctx.directory, experimental);
28339
28373
  }, 300);
28374
+ pendingCompactionTimeoutBySession.set(sessionID, timeoutID);
28340
28375
  }
28341
28376
  return;
28342
28377
  }
@@ -28362,6 +28397,11 @@ function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
28362
28397
  return;
28363
28398
  if (!autoCompactState.pendingCompact.has(sessionID))
28364
28399
  return;
28400
+ const timeoutID = pendingCompactionTimeoutBySession.get(sessionID);
28401
+ if (timeoutID !== undefined) {
28402
+ clearTimeout(timeoutID);
28403
+ pendingCompactionTimeoutBySession.delete(sessionID);
28404
+ }
28365
28405
  const errorData = autoCompactState.errorDataBySession.get(sessionID);
28366
28406
  const lastAssistant = await getLastAssistant(sessionID, ctx.client, ctx.directory);
28367
28407
  if (lastAssistant?.summary === true) {
@@ -44986,6 +45026,7 @@ function createUnstableAgentBabysitterHook(ctx, options) {
44986
45026
  };
44987
45027
  }
44988
45028
  // src/hooks/preemptive-compaction.ts
45029
+ init_logger();
44989
45030
  var DEFAULT_ACTUAL_LIMIT = 200000;
44990
45031
  var ANTHROPIC_ACTUAL_LIMIT3 = process.env.ANTHROPIC_1M_CONTEXT === "true" || process.env.VERTEX_ANTHROPIC_1M_CONTEXT === "true" ? 1e6 : DEFAULT_ACTUAL_LIMIT;
44991
45032
  var PREEMPTIVE_COMPACTION_THRESHOLD = 0.78;
@@ -45017,7 +45058,9 @@ function createPreemptiveCompactionHook(ctx) {
45017
45058
  query: { directory: ctx.directory }
45018
45059
  });
45019
45060
  compactedSessions.add(sessionID);
45020
- } catch {} finally {
45061
+ } catch (error45) {
45062
+ log("[preemptive-compaction] Compaction failed", { sessionID, error: String(error45) });
45063
+ } finally {
45021
45064
  compactionInProgress.delete(sessionID);
45022
45065
  }
45023
45066
  };
@@ -51145,6 +51188,7 @@ Task ID: ${task.id}`;
51145
51188
  const pollStart = Date.now();
51146
51189
  let lastMsgCount = 0;
51147
51190
  let stablePolls = 0;
51191
+ let terminalStatus;
51148
51192
  while (Date.now() - pollStart < timingCfg.MAX_POLL_TIME_MS) {
51149
51193
  if (ctx.abort?.aborted) {
51150
51194
  return `Task aborted (was running in background mode).
@@ -51152,6 +51196,11 @@ Task ID: ${task.id}`;
51152
51196
  Session ID: ${sessionID}`;
51153
51197
  }
51154
51198
  await new Promise((resolve10) => setTimeout(resolve10, timingCfg.POLL_INTERVAL_MS));
51199
+ const currentTask = manager.getTask(task.id);
51200
+ if (currentTask && (currentTask.status === "interrupt" || currentTask.status === "error" || currentTask.status === "cancelled")) {
51201
+ terminalStatus = { status: currentTask.status, error: currentTask.error };
51202
+ break;
51203
+ }
51155
51204
  const statusResult = await client2.session.status();
51156
51205
  const allStatuses = statusResult.data ?? {};
51157
51206
  const sessionStatus = allStatuses[sessionID];
@@ -51174,6 +51223,23 @@ Session ID: ${sessionID}`;
51174
51223
  lastMsgCount = currentMsgCount;
51175
51224
  }
51176
51225
  }
51226
+ if (terminalStatus) {
51227
+ const duration4 = formatDuration2(startTime);
51228
+ return `SUPERVISED TASK FAILED (${terminalStatus.status})
51229
+
51230
+ Task was interrupted/failed while running in monitored background mode.
51231
+ ${terminalStatus.error ? `Error: ${terminalStatus.error}` : ""}
51232
+
51233
+ Duration: ${duration4}
51234
+ Agent: ${agentToUse}${args.category ? ` (category: ${args.category})` : ""}
51235
+ Model: ${actualModel}
51236
+
51237
+ The task session may contain partial results.
51238
+
51239
+ <task_metadata>
51240
+ session_id: ${sessionID}
51241
+ </task_metadata>`;
51242
+ }
51177
51243
  const messagesResult = await client2.session.messages({ path: { id: sessionID } });
51178
51244
  const messages = messagesResult.data ?? messagesResult;
51179
51245
  const assistantMessages = messages.filter((m) => m.info?.role === "assistant").sort((a, b) => (b.info?.time?.created ?? 0) - (a.info?.time?.created ?? 0));
@@ -53507,6 +53573,11 @@ class BackgroundManager {
53507
53573
  });
53508
53574
  return existingTask;
53509
53575
  }
53576
+ const completionTimer = this.completionTimers.get(existingTask.id);
53577
+ if (completionTimer) {
53578
+ clearTimeout(completionTimer);
53579
+ this.completionTimers.delete(existingTask.id);
53580
+ }
53510
53581
  const concurrencyKey = existingTask.concurrencyGroup ?? existingTask.agent;
53511
53582
  await this.concurrencyManager.acquire(concurrencyKey);
53512
53583
  existingTask.concurrencyKey = concurrencyKey;
@@ -53714,6 +53785,10 @@ class BackgroundManager {
53714
53785
  this.cleanupPendingByParent(task);
53715
53786
  this.tasks.delete(task.id);
53716
53787
  this.clearNotificationsForTask(task.id);
53788
+ const toastManager = getTaskToastManager();
53789
+ if (toastManager) {
53790
+ toastManager.removeTask(task.id);
53791
+ }
53717
53792
  if (task.sessionID) {
53718
53793
  subagentSessions.delete(task.sessionID);
53719
53794
  }
@@ -53756,6 +53831,10 @@ class BackgroundManager {
53756
53831
  this.cleanupPendingByParent(task);
53757
53832
  this.tasks.delete(task.id);
53758
53833
  this.clearNotificationsForTask(task.id);
53834
+ const toastManager = getTaskToastManager();
53835
+ if (toastManager) {
53836
+ toastManager.removeTask(task.id);
53837
+ }
53759
53838
  if (task.sessionID) {
53760
53839
  subagentSessions.delete(task.sessionID);
53761
53840
  }
@@ -53870,6 +53949,10 @@ class BackgroundManager {
53870
53949
  }).catch(() => {});
53871
53950
  }
53872
53951
  if (options?.skipNotification) {
53952
+ const toastManager = getTaskToastManager();
53953
+ if (toastManager) {
53954
+ toastManager.removeTask(task.id);
53955
+ }
53873
53956
  log(`[background-agent] Task cancelled via ${source} (notification skipped):`, task.id);
53874
53957
  return true;
53875
53958
  }
@@ -54047,11 +54130,10 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
54047
54130
  }
54048
54131
  } catch (error45) {
54049
54132
  if (this.isAbortedSessionError(error45)) {
54050
- log("[background-agent] Parent session aborted, skipping notification:", {
54133
+ log("[background-agent] Parent session aborted while loading messages; using messageDir fallback:", {
54051
54134
  taskId: task.id,
54052
54135
  parentSessionID: task.parentSessionID
54053
54136
  });
54054
- return;
54055
54137
  }
54056
54138
  const messageDir = getMessageDir12(task.parentSessionID);
54057
54139
  const currentMessage = messageDir ? findNearestMessageWithFields(messageDir) : null;
@@ -54081,13 +54163,13 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
54081
54163
  });
54082
54164
  } catch (error45) {
54083
54165
  if (this.isAbortedSessionError(error45)) {
54084
- log("[background-agent] Parent session aborted, skipping notification:", {
54166
+ log("[background-agent] Parent session aborted while sending notification; continuing cleanup:", {
54085
54167
  taskId: task.id,
54086
54168
  parentSessionID: task.parentSessionID
54087
54169
  });
54088
- return;
54170
+ } else {
54171
+ log("[background-agent] Failed to send notification:", error45);
54089
54172
  }
54090
- log("[background-agent] Failed to send notification:", error45);
54091
54173
  }
54092
54174
  if (allComplete) {
54093
54175
  for (const completedTask of completedTasks) {
@@ -54200,6 +54282,10 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
54200
54282
  }
54201
54283
  }
54202
54284
  this.clearNotificationsForTask(taskId);
54285
+ const toastManager = getTaskToastManager();
54286
+ if (toastManager) {
54287
+ toastManager.removeTask(taskId);
54288
+ }
54203
54289
  this.tasks.delete(taskId);
54204
54290
  if (task.sessionID) {
54205
54291
  subagentSessions.delete(task.sessionID);
@@ -64441,7 +64527,21 @@ Your ONLY valid output locations are \`.sisyphus/plans/*.md\` and \`.sisyphus/dr
64441
64527
 
64442
64528
  Example: \`.sisyphus/plans/auth-refactor.md\`
64443
64529
 
64444
- ### 5. SINGLE PLAN MANDATE (CRITICAL)
64530
+ ### 5. MAXIMUM PARALLELISM PRINCIPLE (NON-NEGOTIABLE)
64531
+
64532
+ Your plans MUST maximize parallel execution. This is a core planning quality metric.
64533
+
64534
+ **Granularity Rule**: One task = one module/concern = 1-3 files.
64535
+ If a task touches 4+ files or 2+ unrelated concerns, SPLIT IT.
64536
+
64537
+ **Parallelism Target**: Aim for 5-8 tasks per wave.
64538
+ If any wave has fewer than 3 tasks (except the final integration), you under-split.
64539
+
64540
+ **Dependency Minimization**: Structure tasks so shared dependencies
64541
+ (types, interfaces, configs) are extracted as early Wave-1 tasks,
64542
+ unblocking maximum parallelism in subsequent waves.
64543
+
64544
+ ### 6. SINGLE PLAN MANDATE (CRITICAL)
64445
64545
  **No matter how large the task, EVERYTHING goes into ONE work plan.**
64446
64546
 
64447
64547
  **NEVER:**
@@ -64464,7 +64564,7 @@ Example: \`.sisyphus/plans/auth-refactor.md\`
64464
64564
 
64465
64565
  **The plan can have 50+ TODOs. That's OK. ONE PLAN.**
64466
64566
 
64467
- ### 5.1 SINGLE ATOMIC WRITE (CRITICAL - Prevents Content Loss)
64567
+ ### 6.1 SINGLE ATOMIC WRITE (CRITICAL - Prevents Content Loss)
64468
64568
 
64469
64569
  <write_protocol>
64470
64570
  **The Write tool OVERWRITES files. It does NOT append.**
@@ -64507,7 +64607,7 @@ Example: \`.sisyphus/plans/auth-refactor.md\`
64507
64607
  - [ ] File already exists with my content? \u2192 Use Edit to append, NOT Write
64508
64608
  </write_protocol>
64509
64609
 
64510
- ### 6. DRAFT AS WORKING MEMORY (MANDATORY)
64610
+ ### 7. DRAFT AS WORKING MEMORY (MANDATORY)
64511
64611
  **During interview, CONTINUOUSLY record decisions to a draft file.**
64512
64612
 
64513
64613
  **Draft Location**: \`.sisyphus/drafts/{name}.md\`
@@ -65314,108 +65414,25 @@ Generate plan to: \`.sisyphus/plans/{name}.md\`
65314
65414
 
65315
65415
  ## Verification Strategy (MANDATORY)
65316
65416
 
65317
- > **UNIVERSAL RULE: ZERO HUMAN INTERVENTION**
65318
- >
65319
- > ALL tasks in this plan MUST be verifiable WITHOUT any human action.
65320
- > This is NOT conditional \u2014 it applies to EVERY task, regardless of test strategy.
65321
- >
65322
- > **FORBIDDEN** \u2014 acceptance criteria that require:
65323
- > - "User manually tests..." / "\uC0AC\uC6A9\uC790\uAC00 \uC9C1\uC811 \uD14C\uC2A4\uD2B8..."
65324
- > - "User visually confirms..." / "\uC0AC\uC6A9\uC790\uAC00 \uB208\uC73C\uB85C \uD655\uC778..."
65325
- > - "User interacts with..." / "\uC0AC\uC6A9\uC790\uAC00 \uC9C1\uC811 \uC870\uC791..."
65326
- > - "Ask user to verify..." / "\uC0AC\uC6A9\uC790\uC5D0\uAC8C \uD655\uC778 \uC694\uCCAD..."
65327
- > - ANY step where a human must perform an action
65328
- >
65329
- > **ALL verification is executed by the agent** using tools (Playwright, interactive_bash, curl, etc.). No exceptions.
65417
+ > **ZERO HUMAN INTERVENTION** \u2014 ALL verification is agent-executed. No exceptions.
65418
+ > Acceptance criteria requiring "user manually tests/confirms" are FORBIDDEN.
65330
65419
 
65331
65420
  ### Test Decision
65332
65421
  - **Infrastructure exists**: [YES/NO]
65333
65422
  - **Automated tests**: [TDD / Tests-after / None]
65334
65423
  - **Framework**: [bun test / vitest / jest / pytest / none]
65424
+ - **If TDD**: Each task follows RED (failing test) \u2192 GREEN (minimal impl) \u2192 REFACTOR
65335
65425
 
65336
- ### If TDD Enabled
65337
-
65338
- Each TODO follows RED-GREEN-REFACTOR:
65339
-
65340
- **Task Structure:**
65341
- 1. **RED**: Write failing test first
65342
- - Test file: \`[path].test.ts\`
65343
- - Test command: \`bun test [file]\`
65344
- - Expected: FAIL (test exists, implementation doesn't)
65345
- 2. **GREEN**: Implement minimum code to pass
65346
- - Command: \`bun test [file]\`
65347
- - Expected: PASS
65348
- 3. **REFACTOR**: Clean up while keeping green
65349
- - Command: \`bun test [file]\`
65350
- - Expected: PASS (still)
65351
-
65352
- **Test Setup Task (if infrastructure doesn't exist):**
65353
- - [ ] 0. Setup Test Infrastructure
65354
- - Install: \`bun add -d [test-framework]\`
65355
- - Config: Create \`[config-file]\`
65356
- - Verify: \`bun test --help\` \u2192 shows help
65357
- - Example: Create \`src/__tests__/example.test.ts\`
65358
- - Verify: \`bun test\` \u2192 1 test passes
65359
-
65360
- ### Agent-Executed QA Scenarios (MANDATORY \u2014 ALL tasks)
65361
-
65362
- > Whether TDD is enabled or not, EVERY task MUST include Agent-Executed QA Scenarios.
65363
- > - **With TDD**: QA scenarios complement unit tests at integration/E2E level
65364
- > - **Without TDD**: QA scenarios are the PRIMARY verification method
65365
- >
65366
- > These describe how the executing agent DIRECTLY verifies the deliverable
65367
- > by running it \u2014 opening browsers, executing commands, sending API requests.
65368
- > The agent performs what a human tester would do, but automated via tools.
65369
-
65370
- **Verification Tool by Deliverable Type:**
65371
-
65372
- | Type | Tool | How Agent Verifies |
65373
- |------|------|-------------------|
65374
- | **Frontend/UI** | Playwright (playwright skill) | Navigate, interact, assert DOM, screenshot |
65375
- | **TUI/CLI** | interactive_bash (tmux) | Run command, send keystrokes, validate output |
65376
- | **API/Backend** | Bash (curl/httpie) | Send requests, parse responses, assert fields |
65377
- | **Library/Module** | Bash (bun/node REPL) | Import, call functions, compare output |
65378
- | **Config/Infra** | Bash (shell commands) | Apply config, run state checks, validate |
65379
-
65380
- **Each Scenario MUST Follow This Format:**
65381
-
65382
- \`\`\`
65383
- Scenario: [Descriptive name \u2014 what user action/flow is being verified]
65384
- Tool: [Playwright / interactive_bash / Bash]
65385
- Preconditions: [What must be true before this scenario runs]
65386
- Steps:
65387
- 1. [Exact action with specific selector/command/endpoint]
65388
- 2. [Next action with expected intermediate state]
65389
- 3. [Assertion with exact expected value]
65390
- Expected Result: [Concrete, observable outcome]
65391
- Failure Indicators: [What would indicate failure]
65392
- Evidence: [Screenshot path / output capture / response body path]
65393
- \`\`\`
65426
+ ### QA Policy
65427
+ Every task MUST include agent-executed QA scenarios (see TODO template below).
65428
+ Evidence saved to \`.sisyphus/evidence/task-{N}-{scenario-slug}.{ext}\`.
65394
65429
 
65395
- **Scenario Detail Requirements:**
65396
- - **Selectors**: Specific CSS selectors (\`.login-button\`, not "the login button")
65397
- - **Data**: Concrete test data (\`"test@example.com"\`, not \`"[email]"\`)
65398
- - **Assertions**: Exact values (\`text contains "Welcome back"\`, not "verify it works")
65399
- - **Timing**: Include wait conditions where relevant (\`Wait for .dashboard (timeout: 10s)\`)
65400
- - **Negative Scenarios**: At least ONE failure/error scenario per feature
65401
- - **Evidence Paths**: Specific file paths (\`.sisyphus/evidence/task-N-scenario-name.png\`)
65402
-
65403
- **Anti-patterns (NEVER write scenarios like this):**
65404
- - \u274C "Verify the login page works correctly"
65405
- - \u274C "Check that the API returns the right data"
65406
- - \u274C "Test the form validation"
65407
- - \u274C "User opens browser and confirms..."
65408
-
65409
- **Write scenarios like this instead:**
65410
- - \u2705 \`Navigate to /login \u2192 Fill input[name="email"] with "test@example.com" \u2192 Fill input[name="password"] with "Pass123!" \u2192 Click button[type="submit"] \u2192 Wait for /dashboard \u2192 Assert h1 contains "Welcome"\`
65411
- - \u2705 \`POST /api/users {"name":"Test","email":"new@test.com"} \u2192 Assert status 201 \u2192 Assert response.id is UUID \u2192 GET /api/users/{id} \u2192 Assert name equals "Test"\`
65412
- - \u2705 \`Run ./cli --config test.yaml \u2192 Wait for "Loaded" in stdout \u2192 Send "q" \u2192 Assert exit code 0 \u2192 Assert stdout contains "Goodbye"\`
65413
-
65414
- **Evidence Requirements:**
65415
- - Screenshots: \`.sisyphus/evidence/\` for all UI verifications
65416
- - Terminal output: Captured for CLI/TUI verifications
65417
- - Response bodies: Saved for API verifications
65418
- - All evidence referenced by specific file path in acceptance criteria
65430
+ | Deliverable Type | Verification Tool | Method |
65431
+ |------------------|-------------------|--------|
65432
+ | Frontend/UI | Playwright (playwright skill) | Navigate, interact, assert DOM, screenshot |
65433
+ | TUI/CLI | interactive_bash (tmux) | Run command, send keystrokes, validate output |
65434
+ | API/Backend | Bash (curl) | Send requests, assert status + response fields |
65435
+ | Library/Module | Bash (bun/node REPL) | Import, call functions, compare output |
65419
65436
 
65420
65437
  ---
65421
65438
 
@@ -65425,49 +65442,82 @@ Scenario: [Descriptive name \u2014 what user action/flow is being verified]
65425
65442
 
65426
65443
  > Maximize throughput by grouping independent tasks into parallel waves.
65427
65444
  > Each wave completes before the next begins.
65445
+ > Target: 5-8 tasks per wave. Fewer than 3 per wave (except final) = under-splitting.
65428
65446
 
65429
65447
  \`\`\`
65430
- Wave 1 (Start Immediately):
65431
- \u251C\u2500\u2500 Task 1: [no dependencies]
65432
- \u2514\u2500\u2500 Task 5: [no dependencies]
65433
-
65434
- Wave 2 (After Wave 1):
65435
- \u251C\u2500\u2500 Task 2: [depends: 1]
65436
- \u251C\u2500\u2500 Task 3: [depends: 1]
65437
- \u2514\u2500\u2500 Task 6: [depends: 5]
65438
-
65439
- Wave 3 (After Wave 2):
65440
- \u2514\u2500\u2500 Task 4: [depends: 2, 3]
65441
-
65442
- Critical Path: Task 1 \u2192 Task 2 \u2192 Task 4
65443
- Parallel Speedup: ~40% faster than sequential
65448
+ Wave 1 (Start Immediately \u2014 foundation + scaffolding):
65449
+ \u251C\u2500\u2500 Task 1: Project scaffolding + config [quick]
65450
+ \u251C\u2500\u2500 Task 2: Design system tokens [quick]
65451
+ \u251C\u2500\u2500 Task 3: Type definitions [quick]
65452
+ \u251C\u2500\u2500 Task 4: Schema definitions [quick]
65453
+ \u251C\u2500\u2500 Task 5: Storage interface + in-memory impl [quick]
65454
+ \u251C\u2500\u2500 Task 6: Auth middleware [quick]
65455
+ \u2514\u2500\u2500 Task 7: Client module [quick]
65456
+
65457
+ Wave 2 (After Wave 1 \u2014 core modules, MAX PARALLEL):
65458
+ \u251C\u2500\u2500 Task 8: Core business logic (depends: 3, 5, 7) [deep]
65459
+ \u251C\u2500\u2500 Task 9: API endpoints (depends: 4, 5) [unspecified-high]
65460
+ \u251C\u2500\u2500 Task 10: Secondary storage impl (depends: 5) [unspecified-high]
65461
+ \u251C\u2500\u2500 Task 11: Retry/fallback logic (depends: 8) [deep]
65462
+ \u251C\u2500\u2500 Task 12: UI layout + navigation (depends: 2) [visual-engineering]
65463
+ \u251C\u2500\u2500 Task 13: API client + hooks (depends: 4) [quick]
65464
+ \u2514\u2500\u2500 Task 14: Telemetry middleware (depends: 5, 10) [unspecified-high]
65465
+
65466
+ Wave 3 (After Wave 2 \u2014 integration + UI):
65467
+ \u251C\u2500\u2500 Task 15: Main route combining modules (depends: 6, 11, 14) [deep]
65468
+ \u251C\u2500\u2500 Task 16: UI data visualization (depends: 12, 13) [visual-engineering]
65469
+ \u251C\u2500\u2500 Task 17: Deployment config A (depends: 15) [quick]
65470
+ \u251C\u2500\u2500 Task 18: Deployment config B (depends: 15) [quick]
65471
+ \u251C\u2500\u2500 Task 19: Deployment config C (depends: 15) [quick]
65472
+ \u2514\u2500\u2500 Task 20: UI request log + build (depends: 16) [visual-engineering]
65473
+
65474
+ Wave 4 (After Wave 3 \u2014 verification):
65475
+ \u251C\u2500\u2500 Task 21: Integration tests (depends: 15) [deep]
65476
+ \u251C\u2500\u2500 Task 22: UI QA - Playwright (depends: 20) [unspecified-high]
65477
+ \u251C\u2500\u2500 Task 23: E2E QA (depends: 21) [deep]
65478
+ \u2514\u2500\u2500 Task 24: Git cleanup + tagging (depends: 21) [git]
65479
+
65480
+ Wave FINAL (After ALL tasks \u2014 independent review, 4 parallel):
65481
+ \u251C\u2500\u2500 Task F1: Plan compliance audit (oracle)
65482
+ \u251C\u2500\u2500 Task F2: Code quality review (unspecified-high)
65483
+ \u251C\u2500\u2500 Task F3: Real manual QA (unspecified-high)
65484
+ \u2514\u2500\u2500 Task F4: Scope fidelity check (deep)
65485
+
65486
+ Critical Path: Task 1 \u2192 Task 5 \u2192 Task 8 \u2192 Task 11 \u2192 Task 15 \u2192 Task 21 \u2192 F1-F4
65487
+ Parallel Speedup: ~70% faster than sequential
65488
+ Max Concurrent: 7 (Waves 1 & 2)
65444
65489
  \`\`\`
65445
65490
 
65446
- ### Dependency Matrix
65491
+ ### Dependency Matrix (abbreviated \u2014 show ALL tasks in your generated plan)
65447
65492
 
65448
- | Task | Depends On | Blocks | Can Parallelize With |
65449
- |------|------------|--------|---------------------|
65450
- | 1 | None | 2, 3 | 5 |
65451
- | 2 | 1 | 4 | 3, 6 |
65452
- | 3 | 1 | 4 | 2, 6 |
65453
- | 4 | 2, 3 | None | None (final) |
65454
- | 5 | None | 6 | 1 |
65455
- | 6 | 5 | None | 2, 3 |
65493
+ | Task | Depends On | Blocks | Wave |
65494
+ |------|------------|--------|------|
65495
+ | 1-7 | \u2014 | 8-14 | 1 |
65496
+ | 8 | 3, 5, 7 | 11, 15 | 2 |
65497
+ | 11 | 8 | 15 | 2 |
65498
+ | 14 | 5, 10 | 15 | 2 |
65499
+ | 15 | 6, 11, 14 | 17-19, 21 | 3 |
65500
+ | 21 | 15 | 23, 24 | 4 |
65501
+
65502
+ > This is abbreviated for reference. YOUR generated plan must include the FULL matrix for ALL tasks.
65456
65503
 
65457
65504
  ### Agent Dispatch Summary
65458
65505
 
65459
- | Wave | Tasks | Recommended Agents |
65460
- |------|-------|-------------------|
65461
- | 1 | 1, 5 | task(category="...", load_skills=[...], run_in_background=false) |
65462
- | 2 | 2, 3, 6 | dispatch parallel after Wave 1 completes |
65463
- | 3 | 4 | final integration task |
65506
+ | Wave | # Parallel | Tasks \u2192 Agent Category |
65507
+ |------|------------|----------------------|
65508
+ | 1 | **7** | T1-T4 \u2192 \`quick\`, T5 \u2192 \`quick\`, T6 \u2192 \`quick\`, T7 \u2192 \`quick\` |
65509
+ | 2 | **7** | T8 \u2192 \`deep\`, T9 \u2192 \`unspecified-high\`, T10 \u2192 \`unspecified-high\`, T11 \u2192 \`deep\`, T12 \u2192 \`visual-engineering\`, T13 \u2192 \`quick\`, T14 \u2192 \`unspecified-high\` |
65510
+ | 3 | **6** | T15 \u2192 \`deep\`, T16 \u2192 \`visual-engineering\`, T17-T19 \u2192 \`quick\`, T20 \u2192 \`visual-engineering\` |
65511
+ | 4 | **4** | T21 \u2192 \`deep\`, T22 \u2192 \`unspecified-high\`, T23 \u2192 \`deep\`, T24 \u2192 \`git\` |
65512
+ | FINAL | **4** | F1 \u2192 \`oracle\`, F2 \u2192 \`unspecified-high\`, F3 \u2192 \`unspecified-high\`, F4 \u2192 \`deep\` |
65464
65513
 
65465
65514
  ---
65466
65515
 
65467
65516
  ## TODOs
65468
65517
 
65469
65518
  > Implementation + Test = ONE Task. Never separate.
65470
- > EVERY task MUST have: Recommended Agent Profile + Parallelization info.
65519
+ > EVERY task MUST have: Recommended Agent Profile + Parallelization info + QA Scenarios.
65520
+ > **A task WITHOUT QA Scenarios is INCOMPLETE. No exceptions.**
65471
65521
 
65472
65522
  - [ ] 1. [Task Title]
65473
65523
 
@@ -65501,22 +65551,15 @@ Parallel Speedup: ~40% faster than sequential
65501
65551
 
65502
65552
  **Pattern References** (existing code to follow):
65503
65553
  - \`src/services/auth.ts:45-78\` - Authentication flow pattern (JWT creation, refresh token handling)
65504
- - \`src/hooks/useForm.ts:12-34\` - Form validation pattern (Zod schema + react-hook-form integration)
65505
65554
 
65506
65555
  **API/Type References** (contracts to implement against):
65507
65556
  - \`src/types/user.ts:UserDTO\` - Response shape for user endpoints
65508
- - \`src/api/schema.ts:createUserSchema\` - Request validation schema
65509
65557
 
65510
65558
  **Test References** (testing patterns to follow):
65511
65559
  - \`src/__tests__/auth.test.ts:describe("login")\` - Test structure and mocking patterns
65512
65560
 
65513
- **Documentation References** (specs and requirements):
65514
- - \`docs/api-spec.md#authentication\` - API contract details
65515
- - \`ARCHITECTURE.md:Database Layer\` - Database access patterns
65516
-
65517
65561
  **External References** (libraries and frameworks):
65518
65562
  - Official docs: \`https://zod.dev/?id=basic-usage\` - Zod validation syntax
65519
- - Example repo: \`github.com/example/project/src/auth\` - Reference implementation
65520
65563
 
65521
65564
  **WHY Each Reference Matters** (explain the relevance):
65522
65565
  - Don't just list files - explain what pattern/information the executor should extract
@@ -65527,113 +65570,60 @@ Parallel Speedup: ~40% faster than sequential
65527
65570
 
65528
65571
  > **AGENT-EXECUTABLE VERIFICATION ONLY** \u2014 No human action permitted.
65529
65572
  > Every criterion MUST be verifiable by running a command or using a tool.
65530
- > REPLACE all placeholders with actual values from task context.
65531
65573
 
65532
65574
  **If TDD (tests enabled):**
65533
65575
  - [ ] Test file created: src/auth/login.test.ts
65534
- - [ ] Test covers: successful login returns JWT token
65535
65576
  - [ ] bun test src/auth/login.test.ts \u2192 PASS (3 tests, 0 failures)
65536
65577
 
65537
- **Agent-Executed QA Scenarios (MANDATORY \u2014 per-scenario, ultra-detailed):**
65538
-
65539
- > Write MULTIPLE named scenarios per task: happy path AND failure cases.
65540
- > Each scenario = exact tool + steps with real selectors/data + evidence path.
65541
-
65542
- **Example \u2014 Frontend/UI (Playwright):**
65543
-
65544
- \\\`\\\`\\\`
65545
- Scenario: Successful login redirects to dashboard
65546
- Tool: Playwright (playwright skill)
65547
- Preconditions: Dev server running on localhost:3000, test user exists
65548
- Steps:
65549
- 1. Navigate to: http://localhost:3000/login
65550
- 2. Wait for: input[name="email"] visible (timeout: 5s)
65551
- 3. Fill: input[name="email"] \u2192 "test@example.com"
65552
- 4. Fill: input[name="password"] \u2192 "ValidPass123!"
65553
- 5. Click: button[type="submit"]
65554
- 6. Wait for: navigation to /dashboard (timeout: 10s)
65555
- 7. Assert: h1 text contains "Welcome back"
65556
- 8. Assert: cookie "session_token" exists
65557
- 9. Screenshot: .sisyphus/evidence/task-1-login-success.png
65558
- Expected Result: Dashboard loads with welcome message
65559
- Evidence: .sisyphus/evidence/task-1-login-success.png
65560
-
65561
- Scenario: Login fails with invalid credentials
65562
- Tool: Playwright (playwright skill)
65563
- Preconditions: Dev server running, no valid user with these credentials
65564
- Steps:
65565
- 1. Navigate to: http://localhost:3000/login
65566
- 2. Fill: input[name="email"] \u2192 "wrong@example.com"
65567
- 3. Fill: input[name="password"] \u2192 "WrongPass"
65568
- 4. Click: button[type="submit"]
65569
- 5. Wait for: .error-message visible (timeout: 5s)
65570
- 6. Assert: .error-message text contains "Invalid credentials"
65571
- 7. Assert: URL is still /login (no redirect)
65572
- 8. Screenshot: .sisyphus/evidence/task-1-login-failure.png
65573
- Expected Result: Error message shown, stays on login page
65574
- Evidence: .sisyphus/evidence/task-1-login-failure.png
65575
- \\\`\\\`\\\`
65578
+ **QA Scenarios (MANDATORY \u2014 task is INCOMPLETE without these):**
65576
65579
 
65577
- **Example \u2014 API/Backend (curl):**
65580
+ > **This is NOT optional. A task without QA scenarios WILL BE REJECTED.**
65581
+ >
65582
+ > Write scenario tests that verify the ACTUAL BEHAVIOR of what you built.
65583
+ > Minimum: 1 happy path + 1 failure/edge case per task.
65584
+ > Each scenario = exact tool + exact steps + exact assertions + evidence path.
65585
+ >
65586
+ > **The executing agent MUST run these scenarios after implementation.**
65587
+ > **The orchestrator WILL verify evidence files exist before marking task complete.**
65578
65588
 
65579
65589
  \\\`\\\`\\\`
65580
- Scenario: Create user returns 201 with UUID
65581
- Tool: Bash (curl)
65582
- Preconditions: Server running on localhost:8080
65590
+ Scenario: [Happy path \u2014 what SHOULD work]
65591
+ Tool: [Playwright / interactive_bash / Bash (curl)]
65592
+ Preconditions: [Exact setup state]
65583
65593
  Steps:
65584
- 1. curl -s -w "\\n%{http_code}" -X POST http://localhost:8080/api/users \\
65585
- -H "Content-Type: application/json" \\
65586
- -d '{"email":"new@test.com","name":"Test User"}'
65587
- 2. Assert: HTTP status is 201
65588
- 3. Assert: response.id matches UUID format
65589
- 4. GET /api/users/{returned-id} \u2192 Assert name equals "Test User"
65590
- Expected Result: User created and retrievable
65591
- Evidence: Response bodies captured
65592
-
65593
- Scenario: Duplicate email returns 409
65594
- Tool: Bash (curl)
65595
- Preconditions: User with email "new@test.com" already exists
65594
+ 1. [Exact action \u2014 specific command/selector/endpoint, no vagueness]
65595
+ 2. [Next action \u2014 with expected intermediate state]
65596
+ 3. [Assertion \u2014 exact expected value, not "verify it works"]
65597
+ Expected Result: [Concrete, observable, binary pass/fail]
65598
+ Failure Indicators: [What specifically would mean this failed]
65599
+ Evidence: .sisyphus/evidence/task-{N}-{scenario-slug}.{ext}
65600
+
65601
+ Scenario: [Failure/edge case \u2014 what SHOULD fail gracefully]
65602
+ Tool: [same format]
65603
+ Preconditions: [Invalid input / missing dependency / error state]
65596
65604
  Steps:
65597
- 1. Repeat POST with same email
65598
- 2. Assert: HTTP status is 409
65599
- 3. Assert: response.error contains "already exists"
65600
- Expected Result: Conflict error returned
65601
- Evidence: Response body captured
65605
+ 1. [Trigger the error condition]
65606
+ 2. [Assert error is handled correctly]
65607
+ Expected Result: [Graceful failure with correct error message/code]
65608
+ Evidence: .sisyphus/evidence/task-{N}-{scenario-slug}-error.{ext}
65602
65609
  \\\`\\\`\\\`
65603
65610
 
65604
- **Example \u2014 TUI/CLI (interactive_bash):**
65605
-
65606
- \\\`\\\`\\\`
65607
- Scenario: CLI loads config and displays menu
65608
- Tool: interactive_bash (tmux)
65609
- Preconditions: Binary built, test config at ./test.yaml
65610
- Steps:
65611
- 1. tmux new-session: ./my-cli --config test.yaml
65612
- 2. Wait for: "Configuration loaded" in output (timeout: 5s)
65613
- 3. Assert: Menu items visible ("1. Create", "2. List", "3. Exit")
65614
- 4. Send keys: "3" then Enter
65615
- 5. Assert: "Goodbye" in output
65616
- 6. Assert: Process exited with code 0
65617
- Expected Result: CLI starts, shows menu, exits cleanly
65618
- Evidence: Terminal output captured
65619
-
65620
- Scenario: CLI handles missing config gracefully
65621
- Tool: interactive_bash (tmux)
65622
- Preconditions: No config file at ./nonexistent.yaml
65623
- Steps:
65624
- 1. tmux new-session: ./my-cli --config nonexistent.yaml
65625
- 2. Wait for: output (timeout: 3s)
65626
- 3. Assert: stderr contains "Config file not found"
65627
- 4. Assert: Process exited with code 1
65628
- Expected Result: Meaningful error, non-zero exit
65629
- Evidence: Error output captured
65630
- \\\`\\\`\\\`
65611
+ > **Specificity requirements \u2014 every scenario MUST use:**
65612
+ > - **Selectors**: Specific CSS selectors (\`.login-button\`, not "the login button")
65613
+ > - **Data**: Concrete test data (\`"test@example.com"\`, not \`"[email]"\`)
65614
+ > - **Assertions**: Exact values (\`text contains "Welcome back"\`, not "verify it works")
65615
+ > - **Timing**: Wait conditions where relevant (\`timeout: 10s\`)
65616
+ > - **Negative**: At least ONE failure/error scenario per task
65617
+ >
65618
+ > **Anti-patterns (your scenario is INVALID if it looks like this):**
65619
+ > - \u274C "Verify it works correctly" \u2014 HOW? What does "correctly" mean?
65620
+ > - \u274C "Check the API returns data" \u2014 WHAT data? What fields? What values?
65621
+ > - \u274C "Test the component renders" \u2014 WHERE? What selector? What content?
65622
+ > - \u274C Any scenario without an evidence path
65631
65623
 
65632
65624
  **Evidence to Capture:**
65633
- - [ ] Screenshots in .sisyphus/evidence/ for UI scenarios
65634
- - [ ] Terminal output for CLI/TUI scenarios
65635
- - [ ] Response bodies for API scenarios
65636
65625
  - [ ] Each evidence file named: task-{N}-{scenario-slug}.{ext}
65626
+ - [ ] Screenshots for UI, terminal output for CLI, response bodies for API
65637
65627
 
65638
65628
  **Commit**: YES | NO (groups with N)
65639
65629
  - Message: \`type(scope): desc\`
@@ -65642,6 +65632,28 @@ Parallel Speedup: ~40% faster than sequential
65642
65632
 
65643
65633
  ---
65644
65634
 
65635
+ ## Final Verification Wave (MANDATORY \u2014 after ALL implementation tasks)
65636
+
65637
+ > 4 review agents run in PARALLEL. ALL must APPROVE. Rejection \u2192 fix \u2192 re-run.
65638
+
65639
+ - [ ] F1. **Plan Compliance Audit** \u2014 \`oracle\`
65640
+ Read the plan end-to-end. For each "Must Have": verify implementation exists (read file, curl endpoint, run command). For each "Must NOT Have": search codebase for forbidden patterns \u2014 reject with file:line if found. Check evidence files exist in .sisyphus/evidence/. Compare deliverables against plan.
65641
+ Output: \`Must Have [N/N] | Must NOT Have [N/N] | Tasks [N/N] | VERDICT: APPROVE/REJECT\`
65642
+
65643
+ - [ ] F2. **Code Quality Review** \u2014 \`unspecified-high\`
65644
+ Run \`tsc --noEmit\` + linter + \`bun test\`. Review all changed files for: \`as any\`/\`@ts-ignore\`, empty catches, console.log in prod, commented-out code, unused imports. Check AI slop: excessive comments, over-abstraction, generic names (data/result/item/temp).
65645
+ Output: \`Build [PASS/FAIL] | Lint [PASS/FAIL] | Tests [N pass/N fail] | Files [N clean/N issues] | VERDICT\`
65646
+
65647
+ - [ ] F3. **Real Manual QA** \u2014 \`unspecified-high\` (+ \`playwright\` skill if UI)
65648
+ Start from clean state. Execute EVERY QA scenario from EVERY task \u2014 follow exact steps, capture evidence. Test cross-task integration (features working together, not in isolation). Test edge cases: empty state, invalid input, rapid actions. Save to \`.sisyphus/evidence/final-qa/\`.
65649
+ Output: \`Scenarios [N/N pass] | Integration [N/N] | Edge Cases [N tested] | VERDICT\`
65650
+
65651
+ - [ ] F4. **Scope Fidelity Check** \u2014 \`deep\`
65652
+ For each task: read "What to do", read actual diff (git log/diff). Verify 1:1 \u2014 everything in spec was built (nothing missing), nothing beyond spec was built (no creep). Check "Must NOT do" compliance. Detect cross-task contamination: Task N touching Task M's files. Flag unaccounted changes.
65653
+ Output: \`Tasks [N/N compliant] | Contamination [CLEAN/N issues] | Unaccounted [CLEAN/N files] | VERDICT\`
65654
+
65655
+ ---
65656
+
65645
65657
  ## Commit Strategy
65646
65658
 
65647
65659
  | After Task | Message | Files | Verification |
@@ -67551,9 +67563,11 @@ function createChatMessageHandler2(args) {
67551
67563
  }
67552
67564
  const message = output.message;
67553
67565
  if (firstMessageVariantGate.shouldOverride(input.sessionID)) {
67554
- const variant = input.model && input.agent ? resolveVariantForModel(pluginConfig, input.agent, input.model) : resolveAgentVariant(pluginConfig, input.agent);
67555
- if (variant !== undefined) {
67556
- message["variant"] = variant;
67566
+ if (message["variant"] === undefined) {
67567
+ const variant = input.model && input.agent ? resolveVariantForModel(pluginConfig, input.agent, input.model) : resolveAgentVariant(pluginConfig, input.agent);
67568
+ if (variant !== undefined) {
67569
+ message["variant"] = variant;
67570
+ }
67557
67571
  }
67558
67572
  firstMessageVariantGate.markApplied(input.sessionID);
67559
67573
  } else {