oh-my-opencode 3.5.5 → 3.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/prometheus/identity-constraints.d.ts +1 -1
- package/dist/agents/prometheus/plan-template.d.ts +1 -1
- package/dist/agents/prometheus/system-prompt.d.ts +1 -1
- package/dist/cli/index.js +10 -9
- package/dist/hooks/todo-continuation-enforcer/constants.d.ts +2 -0
- package/dist/hooks/todo-continuation-enforcer/types.d.ts +1 -0
- package/dist/index.js +253 -239
- package/package.json +8 -8
package/dist/index.js
CHANGED
|
@@ -12240,6 +12240,8 @@ var TOAST_DURATION_MS = 900;
|
|
|
12240
12240
|
var COUNTDOWN_GRACE_PERIOD_MS = 500;
|
|
12241
12241
|
var ABORT_WINDOW_MS = 3000;
|
|
12242
12242
|
var CONTINUATION_COOLDOWN_MS = 30000;
|
|
12243
|
+
var MAX_CONSECUTIVE_FAILURES = 5;
|
|
12244
|
+
var FAILURE_RESET_WINDOW_MS = 5 * 60 * 1000;
|
|
12243
12245
|
|
|
12244
12246
|
// src/hooks/todo-continuation-enforcer/handler.ts
|
|
12245
12247
|
init_logger();
|
|
@@ -12464,11 +12466,14 @@ ${todoList}`;
|
|
|
12464
12466
|
if (injectionState) {
|
|
12465
12467
|
injectionState.inFlight = false;
|
|
12466
12468
|
injectionState.lastInjectedAt = Date.now();
|
|
12469
|
+
injectionState.consecutiveFailures = 0;
|
|
12467
12470
|
}
|
|
12468
12471
|
} catch (error) {
|
|
12469
12472
|
log(`[${HOOK_NAME}] Injection failed`, { sessionID, error: String(error) });
|
|
12470
12473
|
if (injectionState) {
|
|
12471
12474
|
injectionState.inFlight = false;
|
|
12475
|
+
injectionState.lastInjectedAt = Date.now();
|
|
12476
|
+
injectionState.consecutiveFailures = (injectionState.consecutiveFailures ?? 0) + 1;
|
|
12472
12477
|
}
|
|
12473
12478
|
}
|
|
12474
12479
|
}
|
|
@@ -12587,8 +12592,28 @@ async function handleSessionIdle(args) {
|
|
|
12587
12592
|
log(`[${HOOK_NAME}] Skipped: injection in flight`, { sessionID });
|
|
12588
12593
|
return;
|
|
12589
12594
|
}
|
|
12590
|
-
if (state.lastInjectedAt && Date.now() - state.lastInjectedAt
|
|
12591
|
-
|
|
12595
|
+
if (state.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES && state.lastInjectedAt && Date.now() - state.lastInjectedAt >= FAILURE_RESET_WINDOW_MS) {
|
|
12596
|
+
state.consecutiveFailures = 0;
|
|
12597
|
+
log(`[${HOOK_NAME}] Reset consecutive failures after recovery window`, {
|
|
12598
|
+
sessionID,
|
|
12599
|
+
failureResetWindowMs: FAILURE_RESET_WINDOW_MS
|
|
12600
|
+
});
|
|
12601
|
+
}
|
|
12602
|
+
if (state.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
|
|
12603
|
+
log(`[${HOOK_NAME}] Skipped: max consecutive failures reached`, {
|
|
12604
|
+
sessionID,
|
|
12605
|
+
consecutiveFailures: state.consecutiveFailures,
|
|
12606
|
+
maxConsecutiveFailures: MAX_CONSECUTIVE_FAILURES
|
|
12607
|
+
});
|
|
12608
|
+
return;
|
|
12609
|
+
}
|
|
12610
|
+
const effectiveCooldown = CONTINUATION_COOLDOWN_MS * Math.pow(2, Math.min(state.consecutiveFailures, 5));
|
|
12611
|
+
if (state.lastInjectedAt && Date.now() - state.lastInjectedAt < effectiveCooldown) {
|
|
12612
|
+
log(`[${HOOK_NAME}] Skipped: cooldown active`, {
|
|
12613
|
+
sessionID,
|
|
12614
|
+
effectiveCooldown,
|
|
12615
|
+
consecutiveFailures: state.consecutiveFailures
|
|
12616
|
+
});
|
|
12592
12617
|
return;
|
|
12593
12618
|
}
|
|
12594
12619
|
let resolvedInfo;
|
|
@@ -12777,7 +12802,9 @@ function createSessionStateStore() {
|
|
|
12777
12802
|
existing.lastAccessedAt = Date.now();
|
|
12778
12803
|
return existing.state;
|
|
12779
12804
|
}
|
|
12780
|
-
const state = {
|
|
12805
|
+
const state = {
|
|
12806
|
+
consecutiveFailures: 0
|
|
12807
|
+
};
|
|
12781
12808
|
sessions.set(sessionID, { state, lastAccessedAt: Date.now() });
|
|
12782
12809
|
return state;
|
|
12783
12810
|
}
|
|
@@ -28295,11 +28322,17 @@ function createRecoveryState() {
|
|
|
28295
28322
|
function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
|
|
28296
28323
|
const autoCompactState = createRecoveryState();
|
|
28297
28324
|
const experimental = options?.experimental;
|
|
28325
|
+
const pendingCompactionTimeoutBySession = new Map;
|
|
28298
28326
|
const eventHandler = async ({ event }) => {
|
|
28299
28327
|
const props = event.properties;
|
|
28300
28328
|
if (event.type === "session.deleted") {
|
|
28301
28329
|
const sessionInfo = props?.info;
|
|
28302
28330
|
if (sessionInfo?.id) {
|
|
28331
|
+
const timeoutID = pendingCompactionTimeoutBySession.get(sessionInfo.id);
|
|
28332
|
+
if (timeoutID !== undefined) {
|
|
28333
|
+
clearTimeout(timeoutID);
|
|
28334
|
+
pendingCompactionTimeoutBySession.delete(sessionInfo.id);
|
|
28335
|
+
}
|
|
28303
28336
|
autoCompactState.pendingCompact.delete(sessionInfo.id);
|
|
28304
28337
|
autoCompactState.errorDataBySession.delete(sessionInfo.id);
|
|
28305
28338
|
autoCompactState.retryStateBySession.delete(sessionInfo.id);
|
|
@@ -28334,9 +28367,11 @@ function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
|
|
|
28334
28367
|
duration: 3000
|
|
28335
28368
|
}
|
|
28336
28369
|
}).catch(() => {});
|
|
28337
|
-
setTimeout(() => {
|
|
28370
|
+
const timeoutID = setTimeout(() => {
|
|
28371
|
+
pendingCompactionTimeoutBySession.delete(sessionID);
|
|
28338
28372
|
executeCompact(sessionID, { providerID, modelID }, autoCompactState, ctx.client, ctx.directory, experimental);
|
|
28339
28373
|
}, 300);
|
|
28374
|
+
pendingCompactionTimeoutBySession.set(sessionID, timeoutID);
|
|
28340
28375
|
}
|
|
28341
28376
|
return;
|
|
28342
28377
|
}
|
|
@@ -28362,6 +28397,11 @@ function createAnthropicContextWindowLimitRecoveryHook(ctx, options) {
|
|
|
28362
28397
|
return;
|
|
28363
28398
|
if (!autoCompactState.pendingCompact.has(sessionID))
|
|
28364
28399
|
return;
|
|
28400
|
+
const timeoutID = pendingCompactionTimeoutBySession.get(sessionID);
|
|
28401
|
+
if (timeoutID !== undefined) {
|
|
28402
|
+
clearTimeout(timeoutID);
|
|
28403
|
+
pendingCompactionTimeoutBySession.delete(sessionID);
|
|
28404
|
+
}
|
|
28365
28405
|
const errorData = autoCompactState.errorDataBySession.get(sessionID);
|
|
28366
28406
|
const lastAssistant = await getLastAssistant(sessionID, ctx.client, ctx.directory);
|
|
28367
28407
|
if (lastAssistant?.summary === true) {
|
|
@@ -44986,6 +45026,7 @@ function createUnstableAgentBabysitterHook(ctx, options) {
|
|
|
44986
45026
|
};
|
|
44987
45027
|
}
|
|
44988
45028
|
// src/hooks/preemptive-compaction.ts
|
|
45029
|
+
init_logger();
|
|
44989
45030
|
var DEFAULT_ACTUAL_LIMIT = 200000;
|
|
44990
45031
|
var ANTHROPIC_ACTUAL_LIMIT3 = process.env.ANTHROPIC_1M_CONTEXT === "true" || process.env.VERTEX_ANTHROPIC_1M_CONTEXT === "true" ? 1e6 : DEFAULT_ACTUAL_LIMIT;
|
|
44991
45032
|
var PREEMPTIVE_COMPACTION_THRESHOLD = 0.78;
|
|
@@ -45017,7 +45058,9 @@ function createPreemptiveCompactionHook(ctx) {
|
|
|
45017
45058
|
query: { directory: ctx.directory }
|
|
45018
45059
|
});
|
|
45019
45060
|
compactedSessions.add(sessionID);
|
|
45020
|
-
} catch
|
|
45061
|
+
} catch (error45) {
|
|
45062
|
+
log("[preemptive-compaction] Compaction failed", { sessionID, error: String(error45) });
|
|
45063
|
+
} finally {
|
|
45021
45064
|
compactionInProgress.delete(sessionID);
|
|
45022
45065
|
}
|
|
45023
45066
|
};
|
|
@@ -51145,6 +51188,7 @@ Task ID: ${task.id}`;
|
|
|
51145
51188
|
const pollStart = Date.now();
|
|
51146
51189
|
let lastMsgCount = 0;
|
|
51147
51190
|
let stablePolls = 0;
|
|
51191
|
+
let terminalStatus;
|
|
51148
51192
|
while (Date.now() - pollStart < timingCfg.MAX_POLL_TIME_MS) {
|
|
51149
51193
|
if (ctx.abort?.aborted) {
|
|
51150
51194
|
return `Task aborted (was running in background mode).
|
|
@@ -51152,6 +51196,11 @@ Task ID: ${task.id}`;
|
|
|
51152
51196
|
Session ID: ${sessionID}`;
|
|
51153
51197
|
}
|
|
51154
51198
|
await new Promise((resolve10) => setTimeout(resolve10, timingCfg.POLL_INTERVAL_MS));
|
|
51199
|
+
const currentTask = manager.getTask(task.id);
|
|
51200
|
+
if (currentTask && (currentTask.status === "interrupt" || currentTask.status === "error" || currentTask.status === "cancelled")) {
|
|
51201
|
+
terminalStatus = { status: currentTask.status, error: currentTask.error };
|
|
51202
|
+
break;
|
|
51203
|
+
}
|
|
51155
51204
|
const statusResult = await client2.session.status();
|
|
51156
51205
|
const allStatuses = statusResult.data ?? {};
|
|
51157
51206
|
const sessionStatus = allStatuses[sessionID];
|
|
@@ -51174,6 +51223,23 @@ Session ID: ${sessionID}`;
|
|
|
51174
51223
|
lastMsgCount = currentMsgCount;
|
|
51175
51224
|
}
|
|
51176
51225
|
}
|
|
51226
|
+
if (terminalStatus) {
|
|
51227
|
+
const duration4 = formatDuration2(startTime);
|
|
51228
|
+
return `SUPERVISED TASK FAILED (${terminalStatus.status})
|
|
51229
|
+
|
|
51230
|
+
Task was interrupted/failed while running in monitored background mode.
|
|
51231
|
+
${terminalStatus.error ? `Error: ${terminalStatus.error}` : ""}
|
|
51232
|
+
|
|
51233
|
+
Duration: ${duration4}
|
|
51234
|
+
Agent: ${agentToUse}${args.category ? ` (category: ${args.category})` : ""}
|
|
51235
|
+
Model: ${actualModel}
|
|
51236
|
+
|
|
51237
|
+
The task session may contain partial results.
|
|
51238
|
+
|
|
51239
|
+
<task_metadata>
|
|
51240
|
+
session_id: ${sessionID}
|
|
51241
|
+
</task_metadata>`;
|
|
51242
|
+
}
|
|
51177
51243
|
const messagesResult = await client2.session.messages({ path: { id: sessionID } });
|
|
51178
51244
|
const messages = messagesResult.data ?? messagesResult;
|
|
51179
51245
|
const assistantMessages = messages.filter((m) => m.info?.role === "assistant").sort((a, b) => (b.info?.time?.created ?? 0) - (a.info?.time?.created ?? 0));
|
|
@@ -53507,6 +53573,11 @@ class BackgroundManager {
|
|
|
53507
53573
|
});
|
|
53508
53574
|
return existingTask;
|
|
53509
53575
|
}
|
|
53576
|
+
const completionTimer = this.completionTimers.get(existingTask.id);
|
|
53577
|
+
if (completionTimer) {
|
|
53578
|
+
clearTimeout(completionTimer);
|
|
53579
|
+
this.completionTimers.delete(existingTask.id);
|
|
53580
|
+
}
|
|
53510
53581
|
const concurrencyKey = existingTask.concurrencyGroup ?? existingTask.agent;
|
|
53511
53582
|
await this.concurrencyManager.acquire(concurrencyKey);
|
|
53512
53583
|
existingTask.concurrencyKey = concurrencyKey;
|
|
@@ -53714,6 +53785,10 @@ class BackgroundManager {
|
|
|
53714
53785
|
this.cleanupPendingByParent(task);
|
|
53715
53786
|
this.tasks.delete(task.id);
|
|
53716
53787
|
this.clearNotificationsForTask(task.id);
|
|
53788
|
+
const toastManager = getTaskToastManager();
|
|
53789
|
+
if (toastManager) {
|
|
53790
|
+
toastManager.removeTask(task.id);
|
|
53791
|
+
}
|
|
53717
53792
|
if (task.sessionID) {
|
|
53718
53793
|
subagentSessions.delete(task.sessionID);
|
|
53719
53794
|
}
|
|
@@ -53756,6 +53831,10 @@ class BackgroundManager {
|
|
|
53756
53831
|
this.cleanupPendingByParent(task);
|
|
53757
53832
|
this.tasks.delete(task.id);
|
|
53758
53833
|
this.clearNotificationsForTask(task.id);
|
|
53834
|
+
const toastManager = getTaskToastManager();
|
|
53835
|
+
if (toastManager) {
|
|
53836
|
+
toastManager.removeTask(task.id);
|
|
53837
|
+
}
|
|
53759
53838
|
if (task.sessionID) {
|
|
53760
53839
|
subagentSessions.delete(task.sessionID);
|
|
53761
53840
|
}
|
|
@@ -53870,6 +53949,10 @@ class BackgroundManager {
|
|
|
53870
53949
|
}).catch(() => {});
|
|
53871
53950
|
}
|
|
53872
53951
|
if (options?.skipNotification) {
|
|
53952
|
+
const toastManager = getTaskToastManager();
|
|
53953
|
+
if (toastManager) {
|
|
53954
|
+
toastManager.removeTask(task.id);
|
|
53955
|
+
}
|
|
53873
53956
|
log(`[background-agent] Task cancelled via ${source} (notification skipped):`, task.id);
|
|
53874
53957
|
return true;
|
|
53875
53958
|
}
|
|
@@ -54047,11 +54130,10 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
|
|
|
54047
54130
|
}
|
|
54048
54131
|
} catch (error45) {
|
|
54049
54132
|
if (this.isAbortedSessionError(error45)) {
|
|
54050
|
-
log("[background-agent] Parent session aborted
|
|
54133
|
+
log("[background-agent] Parent session aborted while loading messages; using messageDir fallback:", {
|
|
54051
54134
|
taskId: task.id,
|
|
54052
54135
|
parentSessionID: task.parentSessionID
|
|
54053
54136
|
});
|
|
54054
|
-
return;
|
|
54055
54137
|
}
|
|
54056
54138
|
const messageDir = getMessageDir12(task.parentSessionID);
|
|
54057
54139
|
const currentMessage = messageDir ? findNearestMessageWithFields(messageDir) : null;
|
|
@@ -54081,13 +54163,13 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
|
|
|
54081
54163
|
});
|
|
54082
54164
|
} catch (error45) {
|
|
54083
54165
|
if (this.isAbortedSessionError(error45)) {
|
|
54084
|
-
log("[background-agent] Parent session aborted
|
|
54166
|
+
log("[background-agent] Parent session aborted while sending notification; continuing cleanup:", {
|
|
54085
54167
|
taskId: task.id,
|
|
54086
54168
|
parentSessionID: task.parentSessionID
|
|
54087
54169
|
});
|
|
54088
|
-
|
|
54170
|
+
} else {
|
|
54171
|
+
log("[background-agent] Failed to send notification:", error45);
|
|
54089
54172
|
}
|
|
54090
|
-
log("[background-agent] Failed to send notification:", error45);
|
|
54091
54173
|
}
|
|
54092
54174
|
if (allComplete) {
|
|
54093
54175
|
for (const completedTask of completedTasks) {
|
|
@@ -54200,6 +54282,10 @@ Use \`background_output(task_id="${task.id}")\` to retrieve this result when rea
|
|
|
54200
54282
|
}
|
|
54201
54283
|
}
|
|
54202
54284
|
this.clearNotificationsForTask(taskId);
|
|
54285
|
+
const toastManager = getTaskToastManager();
|
|
54286
|
+
if (toastManager) {
|
|
54287
|
+
toastManager.removeTask(taskId);
|
|
54288
|
+
}
|
|
54203
54289
|
this.tasks.delete(taskId);
|
|
54204
54290
|
if (task.sessionID) {
|
|
54205
54291
|
subagentSessions.delete(task.sessionID);
|
|
@@ -64441,7 +64527,21 @@ Your ONLY valid output locations are \`.sisyphus/plans/*.md\` and \`.sisyphus/dr
|
|
|
64441
64527
|
|
|
64442
64528
|
Example: \`.sisyphus/plans/auth-refactor.md\`
|
|
64443
64529
|
|
|
64444
|
-
### 5.
|
|
64530
|
+
### 5. MAXIMUM PARALLELISM PRINCIPLE (NON-NEGOTIABLE)
|
|
64531
|
+
|
|
64532
|
+
Your plans MUST maximize parallel execution. This is a core planning quality metric.
|
|
64533
|
+
|
|
64534
|
+
**Granularity Rule**: One task = one module/concern = 1-3 files.
|
|
64535
|
+
If a task touches 4+ files or 2+ unrelated concerns, SPLIT IT.
|
|
64536
|
+
|
|
64537
|
+
**Parallelism Target**: Aim for 5-8 tasks per wave.
|
|
64538
|
+
If any wave has fewer than 3 tasks (except the final integration), you under-split.
|
|
64539
|
+
|
|
64540
|
+
**Dependency Minimization**: Structure tasks so shared dependencies
|
|
64541
|
+
(types, interfaces, configs) are extracted as early Wave-1 tasks,
|
|
64542
|
+
unblocking maximum parallelism in subsequent waves.
|
|
64543
|
+
|
|
64544
|
+
### 6. SINGLE PLAN MANDATE (CRITICAL)
|
|
64445
64545
|
**No matter how large the task, EVERYTHING goes into ONE work plan.**
|
|
64446
64546
|
|
|
64447
64547
|
**NEVER:**
|
|
@@ -64464,7 +64564,7 @@ Example: \`.sisyphus/plans/auth-refactor.md\`
|
|
|
64464
64564
|
|
|
64465
64565
|
**The plan can have 50+ TODOs. That's OK. ONE PLAN.**
|
|
64466
64566
|
|
|
64467
|
-
###
|
|
64567
|
+
### 6.1 SINGLE ATOMIC WRITE (CRITICAL - Prevents Content Loss)
|
|
64468
64568
|
|
|
64469
64569
|
<write_protocol>
|
|
64470
64570
|
**The Write tool OVERWRITES files. It does NOT append.**
|
|
@@ -64507,7 +64607,7 @@ Example: \`.sisyphus/plans/auth-refactor.md\`
|
|
|
64507
64607
|
- [ ] File already exists with my content? \u2192 Use Edit to append, NOT Write
|
|
64508
64608
|
</write_protocol>
|
|
64509
64609
|
|
|
64510
|
-
###
|
|
64610
|
+
### 7. DRAFT AS WORKING MEMORY (MANDATORY)
|
|
64511
64611
|
**During interview, CONTINUOUSLY record decisions to a draft file.**
|
|
64512
64612
|
|
|
64513
64613
|
**Draft Location**: \`.sisyphus/drafts/{name}.md\`
|
|
@@ -65314,108 +65414,25 @@ Generate plan to: \`.sisyphus/plans/{name}.md\`
|
|
|
65314
65414
|
|
|
65315
65415
|
## Verification Strategy (MANDATORY)
|
|
65316
65416
|
|
|
65317
|
-
> **
|
|
65318
|
-
>
|
|
65319
|
-
> ALL tasks in this plan MUST be verifiable WITHOUT any human action.
|
|
65320
|
-
> This is NOT conditional \u2014 it applies to EVERY task, regardless of test strategy.
|
|
65321
|
-
>
|
|
65322
|
-
> **FORBIDDEN** \u2014 acceptance criteria that require:
|
|
65323
|
-
> - "User manually tests..." / "\uC0AC\uC6A9\uC790\uAC00 \uC9C1\uC811 \uD14C\uC2A4\uD2B8..."
|
|
65324
|
-
> - "User visually confirms..." / "\uC0AC\uC6A9\uC790\uAC00 \uB208\uC73C\uB85C \uD655\uC778..."
|
|
65325
|
-
> - "User interacts with..." / "\uC0AC\uC6A9\uC790\uAC00 \uC9C1\uC811 \uC870\uC791..."
|
|
65326
|
-
> - "Ask user to verify..." / "\uC0AC\uC6A9\uC790\uC5D0\uAC8C \uD655\uC778 \uC694\uCCAD..."
|
|
65327
|
-
> - ANY step where a human must perform an action
|
|
65328
|
-
>
|
|
65329
|
-
> **ALL verification is executed by the agent** using tools (Playwright, interactive_bash, curl, etc.). No exceptions.
|
|
65417
|
+
> **ZERO HUMAN INTERVENTION** \u2014 ALL verification is agent-executed. No exceptions.
|
|
65418
|
+
> Acceptance criteria requiring "user manually tests/confirms" are FORBIDDEN.
|
|
65330
65419
|
|
|
65331
65420
|
### Test Decision
|
|
65332
65421
|
- **Infrastructure exists**: [YES/NO]
|
|
65333
65422
|
- **Automated tests**: [TDD / Tests-after / None]
|
|
65334
65423
|
- **Framework**: [bun test / vitest / jest / pytest / none]
|
|
65424
|
+
- **If TDD**: Each task follows RED (failing test) \u2192 GREEN (minimal impl) \u2192 REFACTOR
|
|
65335
65425
|
|
|
65336
|
-
###
|
|
65337
|
-
|
|
65338
|
-
|
|
65339
|
-
|
|
65340
|
-
**Task Structure:**
|
|
65341
|
-
1. **RED**: Write failing test first
|
|
65342
|
-
- Test file: \`[path].test.ts\`
|
|
65343
|
-
- Test command: \`bun test [file]\`
|
|
65344
|
-
- Expected: FAIL (test exists, implementation doesn't)
|
|
65345
|
-
2. **GREEN**: Implement minimum code to pass
|
|
65346
|
-
- Command: \`bun test [file]\`
|
|
65347
|
-
- Expected: PASS
|
|
65348
|
-
3. **REFACTOR**: Clean up while keeping green
|
|
65349
|
-
- Command: \`bun test [file]\`
|
|
65350
|
-
- Expected: PASS (still)
|
|
65351
|
-
|
|
65352
|
-
**Test Setup Task (if infrastructure doesn't exist):**
|
|
65353
|
-
- [ ] 0. Setup Test Infrastructure
|
|
65354
|
-
- Install: \`bun add -d [test-framework]\`
|
|
65355
|
-
- Config: Create \`[config-file]\`
|
|
65356
|
-
- Verify: \`bun test --help\` \u2192 shows help
|
|
65357
|
-
- Example: Create \`src/__tests__/example.test.ts\`
|
|
65358
|
-
- Verify: \`bun test\` \u2192 1 test passes
|
|
65359
|
-
|
|
65360
|
-
### Agent-Executed QA Scenarios (MANDATORY \u2014 ALL tasks)
|
|
65361
|
-
|
|
65362
|
-
> Whether TDD is enabled or not, EVERY task MUST include Agent-Executed QA Scenarios.
|
|
65363
|
-
> - **With TDD**: QA scenarios complement unit tests at integration/E2E level
|
|
65364
|
-
> - **Without TDD**: QA scenarios are the PRIMARY verification method
|
|
65365
|
-
>
|
|
65366
|
-
> These describe how the executing agent DIRECTLY verifies the deliverable
|
|
65367
|
-
> by running it \u2014 opening browsers, executing commands, sending API requests.
|
|
65368
|
-
> The agent performs what a human tester would do, but automated via tools.
|
|
65369
|
-
|
|
65370
|
-
**Verification Tool by Deliverable Type:**
|
|
65371
|
-
|
|
65372
|
-
| Type | Tool | How Agent Verifies |
|
|
65373
|
-
|------|------|-------------------|
|
|
65374
|
-
| **Frontend/UI** | Playwright (playwright skill) | Navigate, interact, assert DOM, screenshot |
|
|
65375
|
-
| **TUI/CLI** | interactive_bash (tmux) | Run command, send keystrokes, validate output |
|
|
65376
|
-
| **API/Backend** | Bash (curl/httpie) | Send requests, parse responses, assert fields |
|
|
65377
|
-
| **Library/Module** | Bash (bun/node REPL) | Import, call functions, compare output |
|
|
65378
|
-
| **Config/Infra** | Bash (shell commands) | Apply config, run state checks, validate |
|
|
65379
|
-
|
|
65380
|
-
**Each Scenario MUST Follow This Format:**
|
|
65381
|
-
|
|
65382
|
-
\`\`\`
|
|
65383
|
-
Scenario: [Descriptive name \u2014 what user action/flow is being verified]
|
|
65384
|
-
Tool: [Playwright / interactive_bash / Bash]
|
|
65385
|
-
Preconditions: [What must be true before this scenario runs]
|
|
65386
|
-
Steps:
|
|
65387
|
-
1. [Exact action with specific selector/command/endpoint]
|
|
65388
|
-
2. [Next action with expected intermediate state]
|
|
65389
|
-
3. [Assertion with exact expected value]
|
|
65390
|
-
Expected Result: [Concrete, observable outcome]
|
|
65391
|
-
Failure Indicators: [What would indicate failure]
|
|
65392
|
-
Evidence: [Screenshot path / output capture / response body path]
|
|
65393
|
-
\`\`\`
|
|
65426
|
+
### QA Policy
|
|
65427
|
+
Every task MUST include agent-executed QA scenarios (see TODO template below).
|
|
65428
|
+
Evidence saved to \`.sisyphus/evidence/task-{N}-{scenario-slug}.{ext}\`.
|
|
65394
65429
|
|
|
65395
|
-
|
|
65396
|
-
|
|
65397
|
-
|
|
65398
|
-
|
|
65399
|
-
|
|
65400
|
-
|
|
65401
|
-
- **Evidence Paths**: Specific file paths (\`.sisyphus/evidence/task-N-scenario-name.png\`)
|
|
65402
|
-
|
|
65403
|
-
**Anti-patterns (NEVER write scenarios like this):**
|
|
65404
|
-
- \u274C "Verify the login page works correctly"
|
|
65405
|
-
- \u274C "Check that the API returns the right data"
|
|
65406
|
-
- \u274C "Test the form validation"
|
|
65407
|
-
- \u274C "User opens browser and confirms..."
|
|
65408
|
-
|
|
65409
|
-
**Write scenarios like this instead:**
|
|
65410
|
-
- \u2705 \`Navigate to /login \u2192 Fill input[name="email"] with "test@example.com" \u2192 Fill input[name="password"] with "Pass123!" \u2192 Click button[type="submit"] \u2192 Wait for /dashboard \u2192 Assert h1 contains "Welcome"\`
|
|
65411
|
-
- \u2705 \`POST /api/users {"name":"Test","email":"new@test.com"} \u2192 Assert status 201 \u2192 Assert response.id is UUID \u2192 GET /api/users/{id} \u2192 Assert name equals "Test"\`
|
|
65412
|
-
- \u2705 \`Run ./cli --config test.yaml \u2192 Wait for "Loaded" in stdout \u2192 Send "q" \u2192 Assert exit code 0 \u2192 Assert stdout contains "Goodbye"\`
|
|
65413
|
-
|
|
65414
|
-
**Evidence Requirements:**
|
|
65415
|
-
- Screenshots: \`.sisyphus/evidence/\` for all UI verifications
|
|
65416
|
-
- Terminal output: Captured for CLI/TUI verifications
|
|
65417
|
-
- Response bodies: Saved for API verifications
|
|
65418
|
-
- All evidence referenced by specific file path in acceptance criteria
|
|
65430
|
+
| Deliverable Type | Verification Tool | Method |
|
|
65431
|
+
|------------------|-------------------|--------|
|
|
65432
|
+
| Frontend/UI | Playwright (playwright skill) | Navigate, interact, assert DOM, screenshot |
|
|
65433
|
+
| TUI/CLI | interactive_bash (tmux) | Run command, send keystrokes, validate output |
|
|
65434
|
+
| API/Backend | Bash (curl) | Send requests, assert status + response fields |
|
|
65435
|
+
| Library/Module | Bash (bun/node REPL) | Import, call functions, compare output |
|
|
65419
65436
|
|
|
65420
65437
|
---
|
|
65421
65438
|
|
|
@@ -65425,49 +65442,82 @@ Scenario: [Descriptive name \u2014 what user action/flow is being verified]
|
|
|
65425
65442
|
|
|
65426
65443
|
> Maximize throughput by grouping independent tasks into parallel waves.
|
|
65427
65444
|
> Each wave completes before the next begins.
|
|
65445
|
+
> Target: 5-8 tasks per wave. Fewer than 3 per wave (except final) = under-splitting.
|
|
65428
65446
|
|
|
65429
65447
|
\`\`\`
|
|
65430
|
-
Wave 1 (Start Immediately):
|
|
65431
|
-
\u251C\u2500\u2500 Task 1: [
|
|
65432
|
-
\
|
|
65433
|
-
|
|
65434
|
-
|
|
65435
|
-
\u251C\u2500\u2500 Task
|
|
65436
|
-
\u251C\u2500\u2500 Task
|
|
65437
|
-
\u2514\u2500\u2500 Task
|
|
65438
|
-
|
|
65439
|
-
Wave
|
|
65440
|
-
\
|
|
65441
|
-
|
|
65442
|
-
|
|
65443
|
-
|
|
65448
|
+
Wave 1 (Start Immediately \u2014 foundation + scaffolding):
|
|
65449
|
+
\u251C\u2500\u2500 Task 1: Project scaffolding + config [quick]
|
|
65450
|
+
\u251C\u2500\u2500 Task 2: Design system tokens [quick]
|
|
65451
|
+
\u251C\u2500\u2500 Task 3: Type definitions [quick]
|
|
65452
|
+
\u251C\u2500\u2500 Task 4: Schema definitions [quick]
|
|
65453
|
+
\u251C\u2500\u2500 Task 5: Storage interface + in-memory impl [quick]
|
|
65454
|
+
\u251C\u2500\u2500 Task 6: Auth middleware [quick]
|
|
65455
|
+
\u2514\u2500\u2500 Task 7: Client module [quick]
|
|
65456
|
+
|
|
65457
|
+
Wave 2 (After Wave 1 \u2014 core modules, MAX PARALLEL):
|
|
65458
|
+
\u251C\u2500\u2500 Task 8: Core business logic (depends: 3, 5, 7) [deep]
|
|
65459
|
+
\u251C\u2500\u2500 Task 9: API endpoints (depends: 4, 5) [unspecified-high]
|
|
65460
|
+
\u251C\u2500\u2500 Task 10: Secondary storage impl (depends: 5) [unspecified-high]
|
|
65461
|
+
\u251C\u2500\u2500 Task 11: Retry/fallback logic (depends: 8) [deep]
|
|
65462
|
+
\u251C\u2500\u2500 Task 12: UI layout + navigation (depends: 2) [visual-engineering]
|
|
65463
|
+
\u251C\u2500\u2500 Task 13: API client + hooks (depends: 4) [quick]
|
|
65464
|
+
\u2514\u2500\u2500 Task 14: Telemetry middleware (depends: 5, 10) [unspecified-high]
|
|
65465
|
+
|
|
65466
|
+
Wave 3 (After Wave 2 \u2014 integration + UI):
|
|
65467
|
+
\u251C\u2500\u2500 Task 15: Main route combining modules (depends: 6, 11, 14) [deep]
|
|
65468
|
+
\u251C\u2500\u2500 Task 16: UI data visualization (depends: 12, 13) [visual-engineering]
|
|
65469
|
+
\u251C\u2500\u2500 Task 17: Deployment config A (depends: 15) [quick]
|
|
65470
|
+
\u251C\u2500\u2500 Task 18: Deployment config B (depends: 15) [quick]
|
|
65471
|
+
\u251C\u2500\u2500 Task 19: Deployment config C (depends: 15) [quick]
|
|
65472
|
+
\u2514\u2500\u2500 Task 20: UI request log + build (depends: 16) [visual-engineering]
|
|
65473
|
+
|
|
65474
|
+
Wave 4 (After Wave 3 \u2014 verification):
|
|
65475
|
+
\u251C\u2500\u2500 Task 21: Integration tests (depends: 15) [deep]
|
|
65476
|
+
\u251C\u2500\u2500 Task 22: UI QA - Playwright (depends: 20) [unspecified-high]
|
|
65477
|
+
\u251C\u2500\u2500 Task 23: E2E QA (depends: 21) [deep]
|
|
65478
|
+
\u2514\u2500\u2500 Task 24: Git cleanup + tagging (depends: 21) [git]
|
|
65479
|
+
|
|
65480
|
+
Wave FINAL (After ALL tasks \u2014 independent review, 4 parallel):
|
|
65481
|
+
\u251C\u2500\u2500 Task F1: Plan compliance audit (oracle)
|
|
65482
|
+
\u251C\u2500\u2500 Task F2: Code quality review (unspecified-high)
|
|
65483
|
+
\u251C\u2500\u2500 Task F3: Real manual QA (unspecified-high)
|
|
65484
|
+
\u2514\u2500\u2500 Task F4: Scope fidelity check (deep)
|
|
65485
|
+
|
|
65486
|
+
Critical Path: Task 1 \u2192 Task 5 \u2192 Task 8 \u2192 Task 11 \u2192 Task 15 \u2192 Task 21 \u2192 F1-F4
|
|
65487
|
+
Parallel Speedup: ~70% faster than sequential
|
|
65488
|
+
Max Concurrent: 7 (Waves 1 & 2)
|
|
65444
65489
|
\`\`\`
|
|
65445
65490
|
|
|
65446
|
-
### Dependency Matrix
|
|
65491
|
+
### Dependency Matrix (abbreviated \u2014 show ALL tasks in your generated plan)
|
|
65447
65492
|
|
|
65448
|
-
| Task | Depends On | Blocks |
|
|
65449
|
-
|
|
65450
|
-
| 1 |
|
|
65451
|
-
|
|
|
65452
|
-
|
|
|
65453
|
-
|
|
|
65454
|
-
|
|
|
65455
|
-
|
|
|
65493
|
+
| Task | Depends On | Blocks | Wave |
|
|
65494
|
+
|------|------------|--------|------|
|
|
65495
|
+
| 1-7 | \u2014 | 8-14 | 1 |
|
|
65496
|
+
| 8 | 3, 5, 7 | 11, 15 | 2 |
|
|
65497
|
+
| 11 | 8 | 15 | 2 |
|
|
65498
|
+
| 14 | 5, 10 | 15 | 2 |
|
|
65499
|
+
| 15 | 6, 11, 14 | 17-19, 21 | 3 |
|
|
65500
|
+
| 21 | 15 | 23, 24 | 4 |
|
|
65501
|
+
|
|
65502
|
+
> This is abbreviated for reference. YOUR generated plan must include the FULL matrix for ALL tasks.
|
|
65456
65503
|
|
|
65457
65504
|
### Agent Dispatch Summary
|
|
65458
65505
|
|
|
65459
|
-
| Wave |
|
|
65460
|
-
|
|
65461
|
-
| 1 |
|
|
65462
|
-
| 2 |
|
|
65463
|
-
| 3 |
|
|
65506
|
+
| Wave | # Parallel | Tasks \u2192 Agent Category |
|
|
65507
|
+
|------|------------|----------------------|
|
|
65508
|
+
| 1 | **7** | T1-T4 \u2192 \`quick\`, T5 \u2192 \`quick\`, T6 \u2192 \`quick\`, T7 \u2192 \`quick\` |
|
|
65509
|
+
| 2 | **7** | T8 \u2192 \`deep\`, T9 \u2192 \`unspecified-high\`, T10 \u2192 \`unspecified-high\`, T11 \u2192 \`deep\`, T12 \u2192 \`visual-engineering\`, T13 \u2192 \`quick\`, T14 \u2192 \`unspecified-high\` |
|
|
65510
|
+
| 3 | **6** | T15 \u2192 \`deep\`, T16 \u2192 \`visual-engineering\`, T17-T19 \u2192 \`quick\`, T20 \u2192 \`visual-engineering\` |
|
|
65511
|
+
| 4 | **4** | T21 \u2192 \`deep\`, T22 \u2192 \`unspecified-high\`, T23 \u2192 \`deep\`, T24 \u2192 \`git\` |
|
|
65512
|
+
| FINAL | **4** | F1 \u2192 \`oracle\`, F2 \u2192 \`unspecified-high\`, F3 \u2192 \`unspecified-high\`, F4 \u2192 \`deep\` |
|
|
65464
65513
|
|
|
65465
65514
|
---
|
|
65466
65515
|
|
|
65467
65516
|
## TODOs
|
|
65468
65517
|
|
|
65469
65518
|
> Implementation + Test = ONE Task. Never separate.
|
|
65470
|
-
> EVERY task MUST have: Recommended Agent Profile + Parallelization info.
|
|
65519
|
+
> EVERY task MUST have: Recommended Agent Profile + Parallelization info + QA Scenarios.
|
|
65520
|
+
> **A task WITHOUT QA Scenarios is INCOMPLETE. No exceptions.**
|
|
65471
65521
|
|
|
65472
65522
|
- [ ] 1. [Task Title]
|
|
65473
65523
|
|
|
@@ -65501,22 +65551,15 @@ Parallel Speedup: ~40% faster than sequential
|
|
|
65501
65551
|
|
|
65502
65552
|
**Pattern References** (existing code to follow):
|
|
65503
65553
|
- \`src/services/auth.ts:45-78\` - Authentication flow pattern (JWT creation, refresh token handling)
|
|
65504
|
-
- \`src/hooks/useForm.ts:12-34\` - Form validation pattern (Zod schema + react-hook-form integration)
|
|
65505
65554
|
|
|
65506
65555
|
**API/Type References** (contracts to implement against):
|
|
65507
65556
|
- \`src/types/user.ts:UserDTO\` - Response shape for user endpoints
|
|
65508
|
-
- \`src/api/schema.ts:createUserSchema\` - Request validation schema
|
|
65509
65557
|
|
|
65510
65558
|
**Test References** (testing patterns to follow):
|
|
65511
65559
|
- \`src/__tests__/auth.test.ts:describe("login")\` - Test structure and mocking patterns
|
|
65512
65560
|
|
|
65513
|
-
**Documentation References** (specs and requirements):
|
|
65514
|
-
- \`docs/api-spec.md#authentication\` - API contract details
|
|
65515
|
-
- \`ARCHITECTURE.md:Database Layer\` - Database access patterns
|
|
65516
|
-
|
|
65517
65561
|
**External References** (libraries and frameworks):
|
|
65518
65562
|
- Official docs: \`https://zod.dev/?id=basic-usage\` - Zod validation syntax
|
|
65519
|
-
- Example repo: \`github.com/example/project/src/auth\` - Reference implementation
|
|
65520
65563
|
|
|
65521
65564
|
**WHY Each Reference Matters** (explain the relevance):
|
|
65522
65565
|
- Don't just list files - explain what pattern/information the executor should extract
|
|
@@ -65527,113 +65570,60 @@ Parallel Speedup: ~40% faster than sequential
|
|
|
65527
65570
|
|
|
65528
65571
|
> **AGENT-EXECUTABLE VERIFICATION ONLY** \u2014 No human action permitted.
|
|
65529
65572
|
> Every criterion MUST be verifiable by running a command or using a tool.
|
|
65530
|
-
> REPLACE all placeholders with actual values from task context.
|
|
65531
65573
|
|
|
65532
65574
|
**If TDD (tests enabled):**
|
|
65533
65575
|
- [ ] Test file created: src/auth/login.test.ts
|
|
65534
|
-
- [ ] Test covers: successful login returns JWT token
|
|
65535
65576
|
- [ ] bun test src/auth/login.test.ts \u2192 PASS (3 tests, 0 failures)
|
|
65536
65577
|
|
|
65537
|
-
**
|
|
65538
|
-
|
|
65539
|
-
> Write MULTIPLE named scenarios per task: happy path AND failure cases.
|
|
65540
|
-
> Each scenario = exact tool + steps with real selectors/data + evidence path.
|
|
65541
|
-
|
|
65542
|
-
**Example \u2014 Frontend/UI (Playwright):**
|
|
65543
|
-
|
|
65544
|
-
\\\`\\\`\\\`
|
|
65545
|
-
Scenario: Successful login redirects to dashboard
|
|
65546
|
-
Tool: Playwright (playwright skill)
|
|
65547
|
-
Preconditions: Dev server running on localhost:3000, test user exists
|
|
65548
|
-
Steps:
|
|
65549
|
-
1. Navigate to: http://localhost:3000/login
|
|
65550
|
-
2. Wait for: input[name="email"] visible (timeout: 5s)
|
|
65551
|
-
3. Fill: input[name="email"] \u2192 "test@example.com"
|
|
65552
|
-
4. Fill: input[name="password"] \u2192 "ValidPass123!"
|
|
65553
|
-
5. Click: button[type="submit"]
|
|
65554
|
-
6. Wait for: navigation to /dashboard (timeout: 10s)
|
|
65555
|
-
7. Assert: h1 text contains "Welcome back"
|
|
65556
|
-
8. Assert: cookie "session_token" exists
|
|
65557
|
-
9. Screenshot: .sisyphus/evidence/task-1-login-success.png
|
|
65558
|
-
Expected Result: Dashboard loads with welcome message
|
|
65559
|
-
Evidence: .sisyphus/evidence/task-1-login-success.png
|
|
65560
|
-
|
|
65561
|
-
Scenario: Login fails with invalid credentials
|
|
65562
|
-
Tool: Playwright (playwright skill)
|
|
65563
|
-
Preconditions: Dev server running, no valid user with these credentials
|
|
65564
|
-
Steps:
|
|
65565
|
-
1. Navigate to: http://localhost:3000/login
|
|
65566
|
-
2. Fill: input[name="email"] \u2192 "wrong@example.com"
|
|
65567
|
-
3. Fill: input[name="password"] \u2192 "WrongPass"
|
|
65568
|
-
4. Click: button[type="submit"]
|
|
65569
|
-
5. Wait for: .error-message visible (timeout: 5s)
|
|
65570
|
-
6. Assert: .error-message text contains "Invalid credentials"
|
|
65571
|
-
7. Assert: URL is still /login (no redirect)
|
|
65572
|
-
8. Screenshot: .sisyphus/evidence/task-1-login-failure.png
|
|
65573
|
-
Expected Result: Error message shown, stays on login page
|
|
65574
|
-
Evidence: .sisyphus/evidence/task-1-login-failure.png
|
|
65575
|
-
\\\`\\\`\\\`
|
|
65578
|
+
**QA Scenarios (MANDATORY \u2014 task is INCOMPLETE without these):**
|
|
65576
65579
|
|
|
65577
|
-
**
|
|
65580
|
+
> **This is NOT optional. A task without QA scenarios WILL BE REJECTED.**
|
|
65581
|
+
>
|
|
65582
|
+
> Write scenario tests that verify the ACTUAL BEHAVIOR of what you built.
|
|
65583
|
+
> Minimum: 1 happy path + 1 failure/edge case per task.
|
|
65584
|
+
> Each scenario = exact tool + exact steps + exact assertions + evidence path.
|
|
65585
|
+
>
|
|
65586
|
+
> **The executing agent MUST run these scenarios after implementation.**
|
|
65587
|
+
> **The orchestrator WILL verify evidence files exist before marking task complete.**
|
|
65578
65588
|
|
|
65579
65589
|
\\\`\\\`\\\`
|
|
65580
|
-
Scenario:
|
|
65581
|
-
Tool: Bash (curl)
|
|
65582
|
-
Preconditions:
|
|
65590
|
+
Scenario: [Happy path \u2014 what SHOULD work]
|
|
65591
|
+
Tool: [Playwright / interactive_bash / Bash (curl)]
|
|
65592
|
+
Preconditions: [Exact setup state]
|
|
65583
65593
|
Steps:
|
|
65584
|
-
1.
|
|
65585
|
-
|
|
65586
|
-
|
|
65587
|
-
|
|
65588
|
-
|
|
65589
|
-
|
|
65590
|
-
|
|
65591
|
-
|
|
65592
|
-
|
|
65593
|
-
|
|
65594
|
-
Tool: Bash (curl)
|
|
65595
|
-
Preconditions: User with email "new@test.com" already exists
|
|
65594
|
+
1. [Exact action \u2014 specific command/selector/endpoint, no vagueness]
|
|
65595
|
+
2. [Next action \u2014 with expected intermediate state]
|
|
65596
|
+
3. [Assertion \u2014 exact expected value, not "verify it works"]
|
|
65597
|
+
Expected Result: [Concrete, observable, binary pass/fail]
|
|
65598
|
+
Failure Indicators: [What specifically would mean this failed]
|
|
65599
|
+
Evidence: .sisyphus/evidence/task-{N}-{scenario-slug}.{ext}
|
|
65600
|
+
|
|
65601
|
+
Scenario: [Failure/edge case \u2014 what SHOULD fail gracefully]
|
|
65602
|
+
Tool: [same format]
|
|
65603
|
+
Preconditions: [Invalid input / missing dependency / error state]
|
|
65596
65604
|
Steps:
|
|
65597
|
-
1.
|
|
65598
|
-
2. Assert
|
|
65599
|
-
|
|
65600
|
-
|
|
65601
|
-
Evidence: Response body captured
|
|
65605
|
+
1. [Trigger the error condition]
|
|
65606
|
+
2. [Assert error is handled correctly]
|
|
65607
|
+
Expected Result: [Graceful failure with correct error message/code]
|
|
65608
|
+
Evidence: .sisyphus/evidence/task-{N}-{scenario-slug}-error.{ext}
|
|
65602
65609
|
\\\`\\\`\\\`
|
|
65603
65610
|
|
|
65604
|
-
**
|
|
65605
|
-
|
|
65606
|
-
|
|
65607
|
-
|
|
65608
|
-
|
|
65609
|
-
|
|
65610
|
-
|
|
65611
|
-
|
|
65612
|
-
|
|
65613
|
-
|
|
65614
|
-
|
|
65615
|
-
|
|
65616
|
-
6. Assert: Process exited with code 0
|
|
65617
|
-
Expected Result: CLI starts, shows menu, exits cleanly
|
|
65618
|
-
Evidence: Terminal output captured
|
|
65619
|
-
|
|
65620
|
-
Scenario: CLI handles missing config gracefully
|
|
65621
|
-
Tool: interactive_bash (tmux)
|
|
65622
|
-
Preconditions: No config file at ./nonexistent.yaml
|
|
65623
|
-
Steps:
|
|
65624
|
-
1. tmux new-session: ./my-cli --config nonexistent.yaml
|
|
65625
|
-
2. Wait for: output (timeout: 3s)
|
|
65626
|
-
3. Assert: stderr contains "Config file not found"
|
|
65627
|
-
4. Assert: Process exited with code 1
|
|
65628
|
-
Expected Result: Meaningful error, non-zero exit
|
|
65629
|
-
Evidence: Error output captured
|
|
65630
|
-
\\\`\\\`\\\`
|
|
65611
|
+
> **Specificity requirements \u2014 every scenario MUST use:**
|
|
65612
|
+
> - **Selectors**: Specific CSS selectors (\`.login-button\`, not "the login button")
|
|
65613
|
+
> - **Data**: Concrete test data (\`"test@example.com"\`, not \`"[email]"\`)
|
|
65614
|
+
> - **Assertions**: Exact values (\`text contains "Welcome back"\`, not "verify it works")
|
|
65615
|
+
> - **Timing**: Wait conditions where relevant (\`timeout: 10s\`)
|
|
65616
|
+
> - **Negative**: At least ONE failure/error scenario per task
|
|
65617
|
+
>
|
|
65618
|
+
> **Anti-patterns (your scenario is INVALID if it looks like this):**
|
|
65619
|
+
> - \u274C "Verify it works correctly" \u2014 HOW? What does "correctly" mean?
|
|
65620
|
+
> - \u274C "Check the API returns data" \u2014 WHAT data? What fields? What values?
|
|
65621
|
+
> - \u274C "Test the component renders" \u2014 WHERE? What selector? What content?
|
|
65622
|
+
> - \u274C Any scenario without an evidence path
|
|
65631
65623
|
|
|
65632
65624
|
**Evidence to Capture:**
|
|
65633
|
-
- [ ] Screenshots in .sisyphus/evidence/ for UI scenarios
|
|
65634
|
-
- [ ] Terminal output for CLI/TUI scenarios
|
|
65635
|
-
- [ ] Response bodies for API scenarios
|
|
65636
65625
|
- [ ] Each evidence file named: task-{N}-{scenario-slug}.{ext}
|
|
65626
|
+
- [ ] Screenshots for UI, terminal output for CLI, response bodies for API
|
|
65637
65627
|
|
|
65638
65628
|
**Commit**: YES | NO (groups with N)
|
|
65639
65629
|
- Message: \`type(scope): desc\`
|
|
@@ -65642,6 +65632,28 @@ Parallel Speedup: ~40% faster than sequential
|
|
|
65642
65632
|
|
|
65643
65633
|
---
|
|
65644
65634
|
|
|
65635
|
+
## Final Verification Wave (MANDATORY \u2014 after ALL implementation tasks)
|
|
65636
|
+
|
|
65637
|
+
> 4 review agents run in PARALLEL. ALL must APPROVE. Rejection \u2192 fix \u2192 re-run.
|
|
65638
|
+
|
|
65639
|
+
- [ ] F1. **Plan Compliance Audit** \u2014 \`oracle\`
|
|
65640
|
+
Read the plan end-to-end. For each "Must Have": verify implementation exists (read file, curl endpoint, run command). For each "Must NOT Have": search codebase for forbidden patterns \u2014 reject with file:line if found. Check evidence files exist in .sisyphus/evidence/. Compare deliverables against plan.
|
|
65641
|
+
Output: \`Must Have [N/N] | Must NOT Have [N/N] | Tasks [N/N] | VERDICT: APPROVE/REJECT\`
|
|
65642
|
+
|
|
65643
|
+
- [ ] F2. **Code Quality Review** \u2014 \`unspecified-high\`
|
|
65644
|
+
Run \`tsc --noEmit\` + linter + \`bun test\`. Review all changed files for: \`as any\`/\`@ts-ignore\`, empty catches, console.log in prod, commented-out code, unused imports. Check for AI slop: excessive comments, over-abstraction, generic names (data/result/item/temp).
|
|
65645
|
+
Output: \`Build [PASS/FAIL] | Lint [PASS/FAIL] | Tests [N pass/N fail] | Files [N clean/N issues] | VERDICT\`
|
|
65646
|
+
|
|
65647
|
+
- [ ] F3. **Real Manual QA** \u2014 \`unspecified-high\` (+ \`playwright\` skill if UI)
|
|
65648
|
+
Start from a clean state. Execute EVERY QA scenario from EVERY task \u2014 follow exact steps, capture evidence. Test cross-task integration (features working together, not in isolation). Test edge cases: empty state, invalid input, rapid actions. Save evidence to \`.sisyphus/evidence/final-qa/\`.
|
|
65649
|
+
Output: \`Scenarios [N/N pass] | Integration [N/N] | Edge Cases [N tested] | VERDICT\`
|
|
65650
|
+
|
|
65651
|
+
- [ ] F4. **Scope Fidelity Check** \u2014 \`deep\`
|
|
65652
|
+
For each task: read "What to do", read the actual diff (git log/diff). Verify 1:1 \u2014 everything in spec was built (nothing missing), nothing beyond spec was built (no scope creep). Check "Must NOT do" compliance. Detect cross-task contamination: Task N touching Task M's files. Flag unaccounted-for changes.
|
|
65653
|
+
Output: \`Tasks [N/N compliant] | Contamination [CLEAN/N issues] | Unaccounted [CLEAN/N files] | VERDICT\`
|
|
65654
|
+
|
|
65655
|
+
---
|
|
65656
|
+
|
|
65645
65657
|
## Commit Strategy
|
|
65646
65658
|
|
|
65647
65659
|
| After Task | Message | Files | Verification |
|
|
@@ -67551,9 +67563,11 @@ function createChatMessageHandler2(args) {
|
|
|
67551
67563
|
}
|
|
67552
67564
|
const message = output.message;
|
|
67553
67565
|
if (firstMessageVariantGate.shouldOverride(input.sessionID)) {
|
|
67554
|
-
|
|
67555
|
-
|
|
67556
|
-
|
|
67566
|
+
if (message["variant"] === undefined) {
|
|
67567
|
+
const variant = input.model && input.agent ? resolveVariantForModel(pluginConfig, input.agent, input.model) : resolveAgentVariant(pluginConfig, input.agent);
|
|
67568
|
+
if (variant !== undefined) {
|
|
67569
|
+
message["variant"] = variant;
|
|
67570
|
+
}
|
|
67557
67571
|
}
|
|
67558
67572
|
firstMessageVariantGate.markApplied(input.sessionID);
|
|
67559
67573
|
} else {
|