@ai-dev-methodologies/rlp-desk 0.9.2 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/docs/blueprints/blueprint-flywheel-enhancement.md +352 -0
- package/docs/blueprints/plan-flywheel-enhancement.md +817 -0
- package/docs/protocol-reference.md +1 -1
- package/docs/superpowers/plans/2026-04-24-gpt-5-5-default.md +517 -0
- package/docs/superpowers/specs/2026-04-24-gpt-5-5-default.md +107 -0
- package/package.json +1 -1
- package/src/commands/rlp-desk.md +27 -23
- package/src/governance.md +20 -7
- package/src/model-upgrade-table.md +5 -5
- package/src/node/run.mjs +14 -2
- package/src/node/runner/campaign-main-loop.mjs +95 -5
package/src/commands/rlp-desk.md
CHANGED
|
@@ -75,13 +75,13 @@ Ask about these items one by one (or in small groups):
|
|
|
75
75
|
|
|
76
76
|
| Complexity | Worker | per-US Verifier | Final Verifier | Consensus |
|
|
77
77
|
|------------|--------|-----------------|----------------|-----------|
|
|
78
|
-
| LOW | gpt-5.
|
|
79
|
-
| MEDIUM | gpt-5.
|
|
80
|
-
| HIGH | gpt-5.
|
|
81
|
-
| CRITICAL | gpt-5.
|
|
78
|
+
| LOW | gpt-5.5:medium | sonnet | opus | final-only |
|
|
79
|
+
| MEDIUM | gpt-5.5:medium | opus | opus | final-only |
|
|
80
|
+
| HIGH | gpt-5.5:high | opus | opus | all |
|
|
81
|
+
| CRITICAL | gpt-5.5:high | opus | opus + human | all |
|
|
82
82
|
|
|
83
83
|
**Worker model selection** (cross-engine):
|
|
84
|
-
- **gpt-5.
|
|
84
|
+
- **gpt-5.5:medium** — default recommendation (full context window, progressive upgrade handles harder US)
|
|
85
85
|
- **spark:high** — only when US is small enough for spark's 100k context (single-file, AC count <= 4, simple logic). Do NOT use as primary recommendation — spark context window is too small for most tasks
|
|
86
86
|
|
|
87
87
|
Present complexity score with evidence to the user, e.g.: "I rate this MEDIUM because: US count=4 (MEDIUM), file scope=2 (MEDIUM), logic=conditionals (MEDIUM), deps=none (LOW), impact=modify (MEDIUM). Highest=MEDIUM."
|
|
@@ -91,8 +91,8 @@ Ask about these items one by one (or in small groups):
|
|
|
91
91
|
**If codex is NOT installed** — say: "Codex is not installed. Defaulting to claude-only Worker. Note: without a second engine, your Verifier shares the same perspective as the Worker — there is a risk of blind spots where both Worker and Verifier miss the same issue. To unlock cross-engine coverage: `npm install -g @openai/codex`"
|
|
92
92
|
|
|
93
93
|
8. **Batch Capacity Check** — when verify-mode is batch and PRD is large:
|
|
94
|
-
- batch + spark + AC > 4 → warn "spark 100k context limit — switch to gpt-5.
|
|
95
|
-
- batch + gpt-5.
|
|
94
|
+
- batch + spark + AC > 4 → warn "spark 100k context limit — switch to gpt-5.5 or split smaller"
|
|
95
|
+
- batch + gpt-5.5 + AC > 15 → warn "too many ACs for single batch — consider wave split (3-4 US per wave)"
|
|
96
96
|
- per-us → no warning (US-level processing, no limit concern)
|
|
97
97
|
9. **Verify Mode** — per-us (default) or batch. Ask: "Verify after each user story (per-us, recommended) or only after all stories are done (batch)?" Default recommendation: per-us for 2+ stories.
|
|
98
98
|
10. **Consensus** — Ask: "Use cross-engine consensus? off (single engine), final-only (cross-engine on final verify only), or all (cross-engine on every verify). Requires codex CLI." Default: off. Recommended: final-only when codex is installed.
|
|
@@ -164,26 +164,26 @@ Tell the user:
|
|
|
164
164
|
Available run commands (copy the one you want):
|
|
165
165
|
|
|
166
166
|
# ★ Recommended: cross-engine + final-consensus (full context + blind-spot coverage):
|
|
167
|
-
/rlp-desk run <actual-slug> --mode tmux --worker-model gpt-5.
|
|
167
|
+
/rlp-desk run <actual-slug> --mode tmux --worker-model gpt-5.5:medium --consensus final-only --debug
|
|
168
168
|
|
|
169
169
|
# Small tasks only (single-file, AC <= 4, simple logic — spark 100k context limit):
|
|
170
170
|
/rlp-desk run <actual-slug> --mode tmux --worker-model spark:high --consensus final-only --debug
|
|
171
171
|
|
|
172
172
|
# Critical (full consensus on every verify):
|
|
173
|
-
/rlp-desk run <actual-slug> --mode tmux --worker-model gpt-5.
|
|
173
|
+
/rlp-desk run <actual-slug> --mode tmux --worker-model gpt-5.5:high --consensus all --debug
|
|
174
174
|
|
|
175
175
|
# Claude-only:
|
|
176
176
|
/rlp-desk run <actual-slug> --debug
|
|
177
177
|
|
|
178
178
|
# Full options reference:
|
|
179
179
|
# --mode agent|tmux (default: agent)
|
|
180
|
-
# --worker-model MODEL haiku|sonnet|opus or gpt-5.
|
|
180
|
+
# --worker-model MODEL haiku|sonnet|opus or gpt-5.5:high|spark:high (default: haiku)
|
|
181
181
|
# --lock-worker-model disable auto model upgrade
|
|
182
182
|
# --verifier-model MODEL per-US verifier (default: sonnet)
|
|
183
183
|
# --final-verifier-model MODEL final ALL verifier (default: opus)
|
|
184
184
|
# --consensus off|all|final-only cross-engine consensus (default: off)
|
|
185
|
-
# --consensus-model MODEL per-US cross-verifier (default: gpt-5.
|
|
186
|
-
# --final-consensus-model MODEL final cross-verifier (default: gpt-5.
|
|
185
|
+
# --consensus-model MODEL per-US cross-verifier (default: gpt-5.5:medium)
|
|
186
|
+
# --final-consensus-model MODEL final cross-verifier (default: gpt-5.5:high)
|
|
187
187
|
# --verify-mode per-us|batch (default: per-us)
|
|
188
188
|
# --cb-threshold N (default: 6)
|
|
189
189
|
# --max-iter N (default: 100)
|
|
@@ -192,6 +192,8 @@ Tell the user:
|
|
|
192
192
|
# --with-self-verification post-campaign SV report
|
|
193
193
|
# --flywheel off|on-fail direction review on fail (default: off)
|
|
194
194
|
# --flywheel-model MODEL flywheel reviewer model (default: opus)
|
|
195
|
+
# --flywheel-guard off|on guard validates flywheel decisions (default: off)
|
|
196
|
+
# --flywheel-guard-model MODEL guard reviewer model (default: opus)
|
|
195
197
|
```
|
|
196
198
|
|
|
197
199
|
**If codex is NOT installed** — show claude-only presets + install recommendation:
|
|
@@ -222,6 +224,8 @@ Tell the user:
|
|
|
222
224
|
# --with-self-verification post-campaign SV report
|
|
223
225
|
# --flywheel off|on-fail direction review on fail (default: off)
|
|
224
226
|
# --flywheel-model MODEL flywheel reviewer model (default: opus)
|
|
227
|
+
# --flywheel-guard off|on guard validates flywheel decisions (default: off)
|
|
228
|
+
# --flywheel-guard-model MODEL guard reviewer model (default: opus)
|
|
225
229
|
```
|
|
226
230
|
|
|
227
231
|
Replace `<actual-slug>` with the real slug from this init (e.g. `auth-refactor`).
|
|
@@ -236,16 +240,16 @@ Tell the user:
|
|
|
236
240
|
|
|
237
241
|
Options (parse from `$ARGUMENTS`):
|
|
238
242
|
- `--mode agent|tmux` (default: `agent`) — execution mode
|
|
239
|
-
- `--worker-model MODEL` (default: `haiku`) — Worker model. Format: `model` = claude engine, `model:reasoning` = codex engine. Examples: `haiku`, `sonnet`, `opus`, `spark:high`, `gpt-5.
|
|
240
|
-
- `--lock-worker-model` — disable automatic model upgrade on failure
|
|
243
|
+
- `--worker-model MODEL` (default: `haiku`) — Worker model. Format: `model` = claude engine, `model:reasoning` = codex engine. Examples: `haiku`, `sonnet`, `opus`, `spark:high`, `gpt-5.5:high`. Parsed by `parse_model_flag()` which auto-splits engine/model/reasoning.
|
|
244
|
+
- `--lock-worker-model` — disable automatic model upgrade on failure. Worker stays on the specified model regardless of consecutive failures.
|
|
241
245
|
- `--verifier-model MODEL` (default: `sonnet`) — per-US verification model. Campaign-fixed (no progressive upgrade). Lighter than final verifier.
|
|
242
246
|
- `--final-verifier-model MODEL` (default: `opus`) — final ALL verification model. Independent from per-US verifier. Used only for the final full-AC verify pass.
|
|
243
247
|
- `--consensus off|all|final-only` (default: `off`) — cross-engine consensus verification mode.
|
|
244
248
|
- `off`: single-engine verification only
|
|
245
249
|
- `all`: cross-engine consensus on every verify (per-US and final)
|
|
246
250
|
- `final-only`: cross-engine consensus only on the final ALL verify
|
|
247
|
-
- `--consensus-model MODEL` (default: `gpt-5.
|
|
248
|
-
- `--final-consensus-model MODEL` (default: `gpt-5.
|
|
251
|
+
- `--consensus-model MODEL` (default: `gpt-5.5:medium`) — per-US cross-verifier model. Lighter weight for cost efficiency.
|
|
252
|
+
- `--final-consensus-model MODEL` (default: `gpt-5.5:high`) — final cross-verifier model. Stricter. Note: spark is not allowed here (100k context limit).
|
|
249
253
|
- `--verify-mode per-us|batch` (default: `per-us`) — verification strategy
|
|
250
254
|
- `per-us`: verify after each US, then final full verify of all AC
|
|
251
255
|
- `batch`: verify only after all US done (legacy behavior)
|
|
@@ -288,8 +292,8 @@ VERIFIER_MODEL=<--verifier-model value, default: sonnet> \
|
|
|
288
292
|
FINAL_VERIFIER_MODEL=<--final-verifier-model value, default: opus> \
|
|
289
293
|
VERIFY_MODE=<--verify-mode value, default: per-us> \
|
|
290
294
|
CONSENSUS_MODE=<--consensus value, default: off> \
|
|
291
|
-
CONSENSUS_MODEL=<--consensus-model value, default: gpt-5.
|
|
292
|
-
FINAL_CONSENSUS_MODEL=<--final-consensus-model value, default: gpt-5.
|
|
295
|
+
CONSENSUS_MODEL=<--consensus-model value, default: gpt-5.5:medium> \
|
|
296
|
+
FINAL_CONSENSUS_MODEL=<--final-consensus-model value, default: gpt-5.5:high> \
|
|
293
297
|
CB_THRESHOLD=<--cb-threshold value, default: 6> \
|
|
294
298
|
ITER_TIMEOUT=<--iter-timeout value, default: 600> \
|
|
295
299
|
DEBUG=<1 if --debug, else 0> \
|
|
@@ -469,8 +473,8 @@ Bash("codex exec --model <codex_model> --reasoning-effort <codex_reasoning> <ful
|
|
|
469
473
|
**⑦b Consensus Verification** (when `--consensus` is `all`, or `final-only` and scope is ALL):
|
|
470
474
|
After the primary verifier runs, run a cross-engine second verifier:
|
|
471
475
|
- Determine cross-verifier model based on scope:
|
|
472
|
-
- per-US verify → use `--consensus-model` (default: gpt-5.
|
|
473
|
-
- final ALL verify → use `--final-consensus-model` (default: gpt-5.
|
|
476
|
+
- per-US verify → use `--consensus-model` (default: gpt-5.5:medium)
|
|
477
|
+
- final ALL verify → use `--final-consensus-model` (default: gpt-5.5:high)
|
|
474
478
|
- If primary engine is claude → cross-verifier uses codex (the consensus model)
|
|
475
479
|
- If primary engine is codex → cross-verifier uses claude `opus` (fixed)
|
|
476
480
|
- Both produce `verify-verdict.json` (Leader renames to `verify-verdict-claude.json` and `verify-verdict-codex.json`)
|
|
@@ -728,13 +732,13 @@ Example:
|
|
|
728
732
|
|
|
729
733
|
Run options:
|
|
730
734
|
--mode agent|tmux Execution mode (default: agent)
|
|
731
|
-
--worker-model MODEL Worker model: haiku|sonnet|opus or gpt-5.
|
|
735
|
+
--worker-model MODEL Worker model: haiku|sonnet|opus or gpt-5.5:high|spark:high (default: haiku)
|
|
732
736
|
--lock-worker-model Disable auto model upgrade on failure
|
|
733
737
|
--verifier-model MODEL per-US verifier (default: sonnet)
|
|
734
738
|
--final-verifier-model MODEL Final ALL verifier (default: opus)
|
|
735
739
|
--consensus off|all|final-only Cross-engine consensus (default: off)
|
|
736
|
-
--consensus-model MODEL per-US cross-verifier (default: gpt-5.
|
|
737
|
-
--final-consensus-model MODEL Final cross-verifier (default: gpt-5.
|
|
740
|
+
--consensus-model MODEL per-US cross-verifier (default: gpt-5.5:medium)
|
|
741
|
+
--final-consensus-model MODEL Final cross-verifier (default: gpt-5.5:high)
|
|
738
742
|
--verify-mode per-us|batch Verification strategy (default: per-us)
|
|
739
743
|
--cb-threshold N Consecutive failures before BLOCKED (default: 6)
|
|
740
744
|
--max-iter N Max iterations (default: 100)
|
package/src/governance.md
CHANGED
|
@@ -14,7 +14,7 @@ The Leader orchestrates, while Worker/Verifier run in isolated fresh contexts ev
|
|
|
14
14
|
- **Worker must NEVER modify Claude Code settings** (settings.json, settings.local.json). Permission prompts must be reported as blocked, not bypassed by editing settings.
|
|
15
15
|
- **Verifier is independent**: The Verifier judges based on evidence alone, without knowledge of the Worker's reasoning process.
|
|
16
16
|
- **Sentinels are Leader-owned**: Only the Leader writes COMPLETE/BLOCKED sentinels.
|
|
17
|
-
- **Supported engines**: claude (default; models: haiku, sonnet, opus) and codex (opt-in via `--worker-model spark:high` or `--worker-model gpt-5.
|
|
17
|
+
- **Supported engines**: claude (default; models: haiku, sonnet, opus) and codex (opt-in via `--worker-model spark:high` or `--worker-model gpt-5.5:high`).
|
|
18
18
|
|
|
19
19
|
## 1a. Iron Laws
|
|
20
20
|
|
|
@@ -300,11 +300,11 @@ The Leader decides each iteration. Decision criteria:
|
|
|
300
300
|
|
|
301
301
|
### Codex (opt-in engine)
|
|
302
302
|
|
|
303
|
-
Model routing uses `--worker-model` and `--verifier-model` with codex format: `spark:high` or `gpt-5.
|
|
303
|
+
Model routing uses `--worker-model` and `--verifier-model` with codex format: `spark:high` or `gpt-5.5:high`.
|
|
304
304
|
|
|
305
305
|
```
|
|
306
306
|
--worker-model spark:high # codex worker, spark model, high reasoning
|
|
307
|
-
--verifier-model gpt-5.
|
|
307
|
+
--verifier-model gpt-5.5:high # codex verifier, gpt-5.5, high reasoning
|
|
308
308
|
```
|
|
309
309
|
|
|
310
310
|
`parse_model_flag()` auto-detects engine from the model name: plain names (haiku, sonnet, opus) = claude; `name:reasoning` format = codex. Claude is the default engine; codex is explicitly opt-in.
|
|
@@ -331,7 +331,7 @@ Agent(
|
|
|
331
331
|
)
|
|
332
332
|
```
|
|
333
333
|
|
|
334
|
-
If `--worker-model` or `--verifier-model` uses codex format (e.g., `spark:high`, `gpt-5.
|
|
334
|
+
If `--worker-model` or `--verifier-model` uses codex format (e.g., `spark:high`, `gpt-5.5:high`) (opt-in):
|
|
335
335
|
```
|
|
336
336
|
# Worker or Verifier (codex engine)
|
|
337
337
|
Bash("codex -m <codex_model> -c model_reasoning_effort=<codex_reasoning> --dangerously-bypass-approvals-and-sandbox <prompt>")
|
|
@@ -377,7 +377,7 @@ claude -p "$(cat /path/to/prompt.md)" \
|
|
|
377
377
|
When `WORKER_ENGINE=codex` or `VERIFIER_ENGINE=codex`, the `codex` CLI is used instead:
|
|
378
378
|
```bash
|
|
379
379
|
# codex engine (opt-in)
|
|
380
|
-
codex -m gpt-5.
|
|
380
|
+
codex -m gpt-5.5 \
|
|
381
381
|
-c model_reasoning_effort="high" \
|
|
382
382
|
--dangerously-bypass-approvals-and-sandbox \
|
|
383
383
|
"$(cat /path/to/prompt.md)"
|
|
@@ -483,6 +483,19 @@ for iteration in 1..max_iter:
|
|
|
483
483
|
parsing memory.md. In Agent() mode, the Leader MAY read iter-signal.json
|
|
484
484
|
as a structured alternative to parsing the Stop Status from memory.md.
|
|
485
485
|
|
|
486
|
+
⑥½ Flywheel direction review (when --flywheel on-fail and consecutive_failures > 0)
|
|
487
|
+
- Dispatch Flywheel agent (fresh context, --flywheel-model)
|
|
488
|
+
- Read flywheel-signal.json for direction decision (hold/pivot/reduce/expand)
|
|
489
|
+
- If --flywheel-guard on:
|
|
490
|
+
- Dispatch Guard agent (fresh context, --flywheel-guard-model)
|
|
491
|
+
- Read flywheel-guard-verdict.json:
|
|
492
|
+
• pass → proceed to Worker with updated contract
|
|
493
|
+
• pass + analysis_only → skip Worker, record analysis, next iteration
|
|
494
|
+
• fail → re-run Flywheel with guard feedback (max 2 retries)
|
|
495
|
+
• fail + retries exhausted → BLOCKED
|
|
496
|
+
• inconclusive → BLOCKED (escalate to user)
|
|
497
|
+
- Guard count tracked per-US in status.json
|
|
498
|
+
|
|
486
499
|
⑦ Execute Verifier (see §7a for per-US and §7b for consensus details)
|
|
487
500
|
- Build prompt (scoped to us_id if per-us mode) → log
|
|
488
501
|
- Agent(subagent_type="executor", model=selected, prompt=prompt)
|
|
@@ -556,9 +569,9 @@ Worker completes US → signal verify
|
|
|
556
569
|
|
|
557
570
|
| Scenario | Primary verifier | Cross verifier |
|
|
558
571
|
|----------|-----------------|----------------|
|
|
559
|
-
| per-US, primary=claude | `--verifier-model` (sonnet) | `--consensus-model` (gpt-5.
|
|
572
|
+
| per-US, primary=claude | `--verifier-model` (sonnet) | `--consensus-model` (gpt-5.5:medium) |
|
|
560
573
|
| per-US, primary=codex | `--verifier-model` | claude opus (fixed) |
|
|
561
|
-
| final, primary=claude | `--final-verifier-model` (opus) | `--final-consensus-model` (gpt-5.
|
|
574
|
+
| final, primary=claude | `--final-verifier-model` (opus) | `--final-consensus-model` (gpt-5.5:high) |
|
|
562
575
|
| final, primary=codex | `--final-verifier-model` | claude opus (fixed) |
|
|
563
576
|
|
|
564
577
|
- Both must pass. No engine priority.
|
|
package/src/model-upgrade-table.md
CHANGED
|
@@ -18,14 +18,14 @@ CB default: 6. Override: `--cb-threshold N`. Worker only — Verifier fixed at c
|
|
|
18
18
|
| HIGH | gpt-5.3-codex-spark:high | gpt-5.3-codex-spark:xhigh | gpt-5.3-codex-spark:xhigh | BLOCKED |
|
|
19
19
|
| CRITICAL | gpt-5.3-codex-spark:xhigh | gpt-5.3-codex-spark:xhigh | gpt-5.3-codex-spark:xhigh | BLOCKED |
|
|
20
20
|
|
|
21
|
-
## Non-Pro (gpt-5.
|
|
21
|
+
## Non-Pro (gpt-5.5)
|
|
22
22
|
|
|
23
23
|
| Complexity | 1-2 | 3-4 | 5-6 | 7+ |
|
|
24
24
|
|------------|-----|-----|-----|-----|
|
|
25
|
-
| LOW | gpt-5.
|
|
26
|
-
| MEDIUM | gpt-5.
|
|
27
|
-
| HIGH | gpt-5.
|
|
28
|
-
| CRITICAL | gpt-5.
|
|
25
|
+
| LOW | gpt-5.5:low | gpt-5.5:medium | gpt-5.5:high | BLOCKED |
|
|
26
|
+
| MEDIUM | gpt-5.5:medium | gpt-5.5:high | gpt-5.5:xhigh | BLOCKED |
|
|
27
|
+
| HIGH | gpt-5.5:high | gpt-5.5:xhigh | gpt-5.5:xhigh | BLOCKED |
|
|
28
|
+
| CRITICAL | gpt-5.5:xhigh | gpt-5.5:xhigh | gpt-5.5:xhigh | BLOCKED |
|
|
29
29
|
|
|
30
30
|
## Claude-only
|
|
31
31
|
|
package/src/node/run.mjs
CHANGED
|
@@ -11,8 +11,8 @@ const RUN_DEFAULTS = {
|
|
|
11
11
|
verifierModel: 'sonnet',
|
|
12
12
|
finalVerifierModel: 'opus',
|
|
13
13
|
consensusMode: 'off',
|
|
14
|
-
consensusModel: 'gpt-5.
|
|
15
|
-
finalConsensusModel: 'gpt-5.
|
|
14
|
+
consensusModel: 'gpt-5.5:medium',
|
|
15
|
+
finalConsensusModel: 'gpt-5.5:high',
|
|
16
16
|
verifyMode: 'per-us',
|
|
17
17
|
cbThreshold: 6,
|
|
18
18
|
maxIterations: 100,
|
|
@@ -23,6 +23,8 @@ const RUN_DEFAULTS = {
|
|
|
23
23
|
withSelfVerification: false,
|
|
24
24
|
flywheel: 'off',
|
|
25
25
|
flywheelModel: 'opus',
|
|
26
|
+
flywheelGuard: 'off',
|
|
27
|
+
flywheelGuardModel: 'opus',
|
|
26
28
|
};
|
|
27
29
|
|
|
28
30
|
function write(stream, value) {
|
|
@@ -61,6 +63,8 @@ function buildHelpText() {
|
|
|
61
63
|
' --with-self-verification',
|
|
62
64
|
' --flywheel off|on-fail',
|
|
63
65
|
' --flywheel-model MODEL',
|
|
66
|
+
' --flywheel-guard off|on',
|
|
67
|
+
' --flywheel-guard-model MODEL',
|
|
64
68
|
' --help',
|
|
65
69
|
].join('\n');
|
|
66
70
|
}
|
|
@@ -154,6 +158,14 @@ function parseRunOptions(args, cwd) {
|
|
|
154
158
|
options.flywheelModel = consumeValue(args, index, token);
|
|
155
159
|
index += 1;
|
|
156
160
|
break;
|
|
161
|
+
case '--flywheel-guard':
|
|
162
|
+
options.flywheelGuard = consumeValue(args, index, token);
|
|
163
|
+
index += 1;
|
|
164
|
+
break;
|
|
165
|
+
case '--flywheel-guard-model':
|
|
166
|
+
options.flywheelGuardModel = consumeValue(args, index, token);
|
|
167
|
+
index += 1;
|
|
168
|
+
break;
|
|
157
169
|
default:
|
|
158
170
|
throw new Error(`unknown option: ${token}`);
|
|
159
171
|
}
|
|
package/src/node/runner/campaign-main-loop.mjs
CHANGED
|
@@ -26,9 +26,9 @@ const execFileAsync = promisify(execFile);
|
|
|
26
26
|
const REQUIRED_SCAFFOLD_NAMES = ['workerPrompt', 'verifierPrompt', 'memoryFile', 'prdFile', 'testSpecFile'];
|
|
27
27
|
const CLAUDE_MODELS = new Set(['haiku', 'sonnet', 'opus']);
|
|
28
28
|
const MODEL_UPGRADES = {
|
|
29
|
-
'gpt-5.
|
|
30
|
-
'gpt-5.
|
|
31
|
-
'gpt-5.
|
|
29
|
+
'gpt-5.5:medium': 'gpt-5.5:high',
|
|
30
|
+
'gpt-5.5:high': 'gpt-5.5:xhigh',
|
|
31
|
+
'gpt-5.5:xhigh': 'BLOCKED',
|
|
32
32
|
'gpt-5.3-codex-spark:medium': 'gpt-5.3-codex-spark:high',
|
|
33
33
|
'gpt-5.3-codex-spark:high': 'gpt-5.3-codex-spark:xhigh',
|
|
34
34
|
'gpt-5.3-codex-spark:xhigh': 'BLOCKED',
|
|
@@ -63,6 +63,8 @@ function buildPaths(rootDir, slug) {
|
|
|
63
63
|
statusFile: path.join(campaignLogDir, 'runtime', 'status.json'),
|
|
64
64
|
flywheelPromptFile: path.join(deskRoot, 'prompts', `${slug}.flywheel.prompt.md`),
|
|
65
65
|
flywheelSignalFile: path.join(deskRoot, 'memos', `${slug}-flywheel-signal.json`),
|
|
66
|
+
flywheelGuardPromptFile: path.join(deskRoot, 'prompts', `${slug}.flywheel-guard.prompt.md`),
|
|
67
|
+
flywheelGuardVerdictFile: path.join(deskRoot, 'memos', `${slug}-flywheel-guard-verdict.json`),
|
|
66
68
|
};
|
|
67
69
|
}
|
|
68
70
|
|
|
@@ -257,6 +259,7 @@ async function readCurrentState(paths, slug, options) {
|
|
|
257
259
|
leader_pane_id: status.leader_pane_id ?? null,
|
|
258
260
|
worker_pane_id: status.worker_pane_id ?? null,
|
|
259
261
|
verifier_pane_id: status.verifier_pane_id ?? null,
|
|
262
|
+
flywheel_guard_count: status.flywheel_guard_count ?? {},
|
|
260
263
|
started_at_utc: startedAt,
|
|
261
264
|
};
|
|
262
265
|
}
|
|
@@ -412,12 +415,32 @@ async function dispatchFlywheel({ paths, sendKeys, flywheelPaneId, flywheelModel
|
|
|
412
415
|
await sendKeys(flywheelPaneId, triggerCmd);
|
|
413
416
|
}
|
|
414
417
|
|
|
418
|
+
function buildGuardTriggerCmd({ guardPromptFile, guardModel, rootDir }) {
|
|
419
|
+
return `cd ${JSON.stringify(rootDir)} && DISABLE_OMC=1 claude --model ${guardModel} --no-mcp -p "$(cat ${JSON.stringify(guardPromptFile)})"`;
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
async function dispatchGuard({ paths, sendKeys, guardPaneId, guardModel, rootDir }) {
|
|
423
|
+
const triggerCmd = buildGuardTriggerCmd({
|
|
424
|
+
guardPromptFile: paths.flywheelGuardPromptFile,
|
|
425
|
+
guardModel,
|
|
426
|
+
rootDir,
|
|
427
|
+
});
|
|
428
|
+
await sendKeys(guardPaneId, triggerCmd);
|
|
429
|
+
}
|
|
430
|
+
|
|
415
431
|
export function shouldRunFlywheel(flywheelMode, state) {
|
|
416
432
|
if (flywheelMode === 'off') return false;
|
|
417
433
|
if (flywheelMode === 'on-fail' && (state.consecutive_failures ?? 0) > 0) return true;
|
|
418
434
|
return false;
|
|
419
435
|
}
|
|
420
436
|
|
|
437
|
+
export function shouldRunGuard(flywheelGuard, state, usId) {
|
|
438
|
+
if (flywheelGuard !== 'on') return false;
|
|
439
|
+
const count = (state.flywheel_guard_count ?? {})[usId] ?? 0;
|
|
440
|
+
if (count >= 3) return false;
|
|
441
|
+
return true;
|
|
442
|
+
}
|
|
443
|
+
|
|
421
444
|
export async function run(slug, options = {}) {
|
|
422
445
|
const rootDir = path.resolve(options.rootDir ?? process.cwd());
|
|
423
446
|
const paths = buildPaths(rootDir, slug);
|
|
@@ -553,9 +576,76 @@ export async function run(slug, options = {}) {
|
|
|
553
576
|
});
|
|
554
577
|
|
|
555
578
|
state.last_flywheel_decision = flywheelSignal.decision;
|
|
556
|
-
// Campaign memory already updated by flywheel agent
|
|
557
|
-
// Clean signal file for next iteration
|
|
558
579
|
await fs.unlink(paths.flywheelSignalFile).catch(() => {});
|
|
580
|
+
|
|
581
|
+
// Flywheel Guard (independent validation of flywheel decision)
|
|
582
|
+
if (shouldRunGuard(options.flywheelGuard ?? 'off', state, state.current_us)) {
|
|
583
|
+
state.phase = 'guard';
|
|
584
|
+
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
585
|
+
|
|
586
|
+
const guardPaneId = state.flywheel_pane_id ?? state.verifier_pane_id;
|
|
587
|
+
const guardModel = options.flywheelGuardModel ?? 'opus';
|
|
588
|
+
|
|
589
|
+
await dispatchGuard({ paths, sendKeys, guardPaneId, guardModel, rootDir });
|
|
590
|
+
|
|
591
|
+
const guardVerdict = await pollForSignal(paths.flywheelGuardVerdictFile, {
|
|
592
|
+
mode: 'claude',
|
|
593
|
+
paneId: guardPaneId,
|
|
594
|
+
});
|
|
595
|
+
|
|
596
|
+
if (!state.flywheel_guard_count[state.current_us]) {
|
|
597
|
+
state.flywheel_guard_count[state.current_us] = 0;
|
|
598
|
+
}
|
|
599
|
+
state.flywheel_guard_count[state.current_us] += 1;
|
|
600
|
+
|
|
601
|
+
await fs.unlink(paths.flywheelGuardVerdictFile).catch(() => {});
|
|
602
|
+
|
|
603
|
+
if (guardVerdict.verdict === 'inconclusive') {
|
|
604
|
+
state.phase = 'blocked';
|
|
605
|
+
await writeSentinel(paths.blockedSentinel, 'blocked', state.current_us);
|
|
606
|
+
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
607
|
+
return {
|
|
608
|
+
status: 'blocked',
|
|
609
|
+
usId: state.current_us,
|
|
610
|
+
reason: 'flywheel-guard-escalate-inconclusive',
|
|
611
|
+
guardIssues: guardVerdict.issues,
|
|
612
|
+
statusFile: paths.statusFile,
|
|
613
|
+
};
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
if (guardVerdict.verdict === 'fail') {
|
|
617
|
+
if (state.flywheel_guard_count[state.current_us] >= 3) {
|
|
618
|
+
state.phase = 'blocked';
|
|
619
|
+
await writeSentinel(paths.blockedSentinel, 'blocked', state.current_us);
|
|
620
|
+
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
621
|
+
return {
|
|
622
|
+
status: 'blocked',
|
|
623
|
+
usId: state.current_us,
|
|
624
|
+
reason: 'flywheel-guard-retries-exhausted',
|
|
625
|
+
guardIssues: guardVerdict.issues,
|
|
626
|
+
statusFile: paths.statusFile,
|
|
627
|
+
};
|
|
628
|
+
}
|
|
629
|
+
// Retry: skip Worker, continue to next iteration (flywheel will re-run)
|
|
630
|
+
state.phase = 'worker';
|
|
631
|
+
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
632
|
+
state.iteration += 1;
|
|
633
|
+
continue;
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
// verdict === 'pass'
|
|
637
|
+
if (guardVerdict.analysis_only) {
|
|
638
|
+
state.phase = 'worker';
|
|
639
|
+
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
640
|
+
state.iteration += 1;
|
|
641
|
+
continue;
|
|
642
|
+
}
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
// Reset guard count on pass (flywheel direction accepted)
|
|
646
|
+
if (state.flywheel_guard_count[state.current_us]) {
|
|
647
|
+
state.flywheel_guard_count[state.current_us] = 0;
|
|
648
|
+
}
|
|
559
649
|
}
|
|
560
650
|
|
|
561
651
|
state.phase = 'worker';
|