@ai-dev-methodologies/rlp-desk 0.9.3 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,13 +75,13 @@ Ask about these items one by one (or in small groups):
75
75
 
76
76
  | Complexity | Worker | per-US Verifier | Final Verifier | Consensus |
77
77
  |------------|--------|-----------------|----------------|-----------|
78
- | LOW | gpt-5.4:medium | sonnet | opus | final-only |
79
- | MEDIUM | gpt-5.4:medium | opus | opus | final-only |
80
- | HIGH | gpt-5.4:high | opus | opus | all |
81
- | CRITICAL | gpt-5.4:high | opus | opus + human | all |
78
+ | LOW | gpt-5.5:medium | sonnet | opus | final-only |
79
+ | MEDIUM | gpt-5.5:medium | opus | opus | final-only |
80
+ | HIGH | gpt-5.5:high | opus | opus | all |
81
+ | CRITICAL | gpt-5.5:high | opus | opus + human | all |
82
82
 
83
83
  **Worker model selection** (cross-engine):
84
- - **gpt-5.4:medium** — default recommendation (full context window, progressive upgrade handles harder US)
84
+ - **gpt-5.5:medium** — default recommendation (full context window, progressive upgrade handles harder US)
85
85
  - **spark:high** — only when the US is small enough for spark's 100k context (single-file, AC count <= 4, simple logic). Do NOT use as the primary recommendation — spark's context window is too small for most tasks
86
86
 
87
87
  Present complexity score with evidence to the user, e.g.: "I rate this MEDIUM because: US count=4 (MEDIUM), file scope=2 (MEDIUM), logic=conditionals (MEDIUM), deps=none (LOW), impact=modify (MEDIUM). Highest=MEDIUM."
@@ -91,8 +91,8 @@ Ask about these items one by one (or in small groups):
91
91
  **If codex is NOT installed** — say: "Codex is not installed. Defaulting to claude-only Worker. Note: without a second engine, your Verifier shares the same perspective as the Worker — there is a risk of blind spots where both Worker and Verifier miss the same issue. To unlock cross-engine coverage: `npm install -g @openai/codex`"
92
92
 
93
93
  8. **Batch Capacity Check** — when verify-mode is batch and PRD is large:
94
- - batch + spark + AC > 4 → warn "spark 100k context limit — switch to gpt-5.4 or split smaller"
95
- - batch + gpt-5.4 + AC > 15 → warn "too many ACs for single batch — consider wave split (3-4 US per wave)"
94
+ - batch + spark + AC > 4 → warn "spark 100k context limit — switch to gpt-5.5 or split smaller"
95
+ - batch + gpt-5.5 + AC > 15 → warn "too many ACs for single batch — consider wave split (3-4 US per wave)"
96
96
  - per-us → no warning (US-level processing, no limit concern)
97
97
  9. **Verify Mode** — per-us (default) or batch. Ask: "Verify after each user story (per-us, recommended) or only after all stories are done (batch)?" Default recommendation: per-us for 2+ stories.
98
98
  10. **Consensus** — Ask: "Use cross-engine consensus? off (single engine), final-only (cross-engine on final verify only), or all (cross-engine on every verify). Requires codex CLI." Default: off. Recommended: final-only when codex is installed.
@@ -164,26 +164,26 @@ Tell the user:
164
164
  Available run commands (copy the one you want):
165
165
 
166
166
  # ★ Recommended: cross-engine + final-consensus (full context + blind-spot coverage):
167
- /rlp-desk run <actual-slug> --mode tmux --worker-model gpt-5.4:medium --consensus final-only --debug
167
+ /rlp-desk run <actual-slug> --mode tmux --worker-model gpt-5.5:medium --consensus final-only --debug
168
168
 
169
169
  # Small tasks only (single-file, AC <= 4, simple logic — spark 100k context limit):
170
170
  /rlp-desk run <actual-slug> --mode tmux --worker-model spark:high --consensus final-only --debug
171
171
 
172
172
  # Critical (full consensus on every verify):
173
- /rlp-desk run <actual-slug> --mode tmux --worker-model gpt-5.4:high --consensus all --debug
173
+ /rlp-desk run <actual-slug> --mode tmux --worker-model gpt-5.5:high --consensus all --debug
174
174
 
175
175
  # Claude-only:
176
176
  /rlp-desk run <actual-slug> --debug
177
177
 
178
178
  # Full options reference:
179
179
  # --mode agent|tmux (default: agent)
180
- # --worker-model MODEL haiku|sonnet|opus or gpt-5.4:high|spark:high (default: haiku)
180
+ # --worker-model MODEL haiku|sonnet|opus or gpt-5.5:high|spark:high (default: haiku)
181
181
  # --lock-worker-model disable auto model upgrade
182
182
  # --verifier-model MODEL per-US verifier (default: sonnet)
183
183
  # --final-verifier-model MODEL final ALL verifier (default: opus)
184
184
  # --consensus off|all|final-only cross-engine consensus (default: off)
185
- # --consensus-model MODEL per-US cross-verifier (default: gpt-5.4:medium)
186
- # --final-consensus-model MODEL final cross-verifier (default: gpt-5.4:high)
185
+ # --consensus-model MODEL per-US cross-verifier (default: gpt-5.5:medium)
186
+ # --final-consensus-model MODEL final cross-verifier (default: gpt-5.5:high)
187
187
  # --verify-mode per-us|batch (default: per-us)
188
188
  # --cb-threshold N (default: 6)
189
189
  # --max-iter N (default: 100)
@@ -240,16 +240,16 @@ Tell the user:
240
240
 
241
241
  Options (parse from `$ARGUMENTS`):
242
242
  - `--mode agent|tmux` (default: `agent`) — execution mode
243
- - `--worker-model MODEL` (default: `haiku`) — Worker model. Format: `model` = claude engine, `model:reasoning` = codex engine. Examples: `haiku`, `sonnet`, `opus`, `spark:high`, `gpt-5.4:high`. Parsed by `parse_model_flag()` which auto-splits engine/model/reasoning.
244
- - `--lock-worker-model` — disable automatic model upgrade on failure (check_model_upgrade). Worker stays on the specified model regardless of consecutive failures.
243
+ - `--worker-model MODEL` (default: `haiku`) — Worker model. Format: `model` = claude engine, `model:reasoning` = codex engine. Examples: `haiku`, `sonnet`, `opus`, `spark:high`, `gpt-5.5:high`. Parsed by `parse_model_flag()` which auto-splits engine/model/reasoning.
244
+ - `--lock-worker-model` — disable automatic model upgrade on failure. Worker stays on the specified model regardless of consecutive failures.
245
245
  - `--verifier-model MODEL` (default: `sonnet`) — per-US verification model. Campaign-fixed (no progressive upgrade). Lighter than final verifier.
246
246
  - `--final-verifier-model MODEL` (default: `opus`) — final ALL verification model. Independent from per-US verifier. Used only for the final full-AC verify pass.
247
247
  - `--consensus off|all|final-only` (default: `off`) — cross-engine consensus verification mode.
248
248
  - `off`: single-engine verification only
249
249
  - `all`: cross-engine consensus on every verify (per-US and final)
250
250
  - `final-only`: cross-engine consensus only on the final ALL verify
251
- - `--consensus-model MODEL` (default: `gpt-5.4:medium`) — per-US cross-verifier model. Lighter weight for cost efficiency.
252
- - `--final-consensus-model MODEL` (default: `gpt-5.4:high`) — final cross-verifier model. Stricter. Note: spark is not allowed here (100k output limit).
251
+ - `--consensus-model MODEL` (default: `gpt-5.5:medium`) — per-US cross-verifier model. Lighter weight for cost efficiency.
252
+ - `--final-consensus-model MODEL` (default: `gpt-5.5:high`) — final cross-verifier model. Stricter. Note: spark is not allowed here (100k context limit).
253
253
  - `--verify-mode per-us|batch` (default: `per-us`) — verification strategy
254
254
  - `per-us`: verify after each US, then final full verify of all AC
255
255
  - `batch`: verify only after all US are done (legacy behavior)
@@ -292,8 +292,8 @@ VERIFIER_MODEL=<--verifier-model value, default: sonnet> \
292
292
  FINAL_VERIFIER_MODEL=<--final-verifier-model value, default: opus> \
293
293
  VERIFY_MODE=<--verify-mode value, default: per-us> \
294
294
  CONSENSUS_MODE=<--consensus value, default: off> \
295
- CONSENSUS_MODEL=<--consensus-model value, default: gpt-5.4:medium> \
296
- FINAL_CONSENSUS_MODEL=<--final-consensus-model value, default: gpt-5.4:high> \
295
+ CONSENSUS_MODEL=<--consensus-model value, default: gpt-5.5:medium> \
296
+ FINAL_CONSENSUS_MODEL=<--final-consensus-model value, default: gpt-5.5:high> \
297
297
  CB_THRESHOLD=<--cb-threshold value, default: 6> \
298
298
  ITER_TIMEOUT=<--iter-timeout value, default: 600> \
299
299
  DEBUG=<1 if --debug, else 0> \
@@ -473,8 +473,8 @@ Bash("codex exec --model <codex_model> --reasoning-effort <codex_reasoning> <ful
473
473
  **⑦b Consensus Verification** (when `--consensus` is `all`, or when it is `final-only` and the verify scope is ALL):
474
474
  After the primary verifier runs, run a cross-engine second verifier:
475
475
  - Determine cross-verifier model based on scope:
476
- - per-US verify → use `--consensus-model` (default: gpt-5.4:medium)
477
- - final ALL verify → use `--final-consensus-model` (default: gpt-5.4:high)
476
+ - per-US verify → use `--consensus-model` (default: gpt-5.5:medium)
477
+ - final ALL verify → use `--final-consensus-model` (default: gpt-5.5:high)
478
478
  - If primary engine is claude → cross-verifier uses codex (the consensus model)
479
479
  - If primary engine is codex → cross-verifier uses claude `opus` (fixed)
480
480
  - Both produce `verify-verdict.json` (Leader renames to `verify-verdict-claude.json` and `verify-verdict-codex.json`)
@@ -732,13 +732,13 @@ Example:
732
732
 
733
733
  Run options:
734
734
  --mode agent|tmux Execution mode (default: agent)
735
- --worker-model MODEL Worker model: haiku|sonnet|opus or gpt-5.4:high|spark:high (default: haiku)
735
+ --worker-model MODEL Worker model: haiku|sonnet|opus or gpt-5.5:high|spark:high (default: haiku)
736
736
  --lock-worker-model Disable auto model upgrade on failure
737
737
  --verifier-model MODEL per-US verifier (default: sonnet)
738
738
  --final-verifier-model MODEL Final ALL verifier (default: opus)
739
739
  --consensus off|all|final-only Cross-engine consensus (default: off)
740
- --consensus-model MODEL per-US cross-verifier (default: gpt-5.4:medium)
741
- --final-consensus-model MODEL Final cross-verifier (default: gpt-5.4:high)
740
+ --consensus-model MODEL per-US cross-verifier (default: gpt-5.5:medium)
741
+ --final-consensus-model MODEL Final cross-verifier (default: gpt-5.5:high)
742
742
  --verify-mode per-us|batch Verification strategy (default: per-us)
743
743
  --cb-threshold N Consecutive failures before BLOCKED (default: 6)
744
744
  --max-iter N Max iterations (default: 100)
package/src/governance.md CHANGED
@@ -14,7 +14,7 @@ The Leader orchestrates, while Worker/Verifier run in isolated fresh contexts ev
14
14
  - **Worker must NEVER modify Claude Code settings** (settings.json, settings.local.json). Permission prompts must be reported as blocked, not bypassed by editing settings.
15
15
  - **Verifier is independent**: The Verifier judges based on evidence alone, without knowledge of the Worker's reasoning process.
16
16
  - **Sentinels are Leader-owned**: Only the Leader writes COMPLETE/BLOCKED sentinels.
17
- - **Supported engines**: claude (default; models: haiku, sonnet, opus) and codex (opt-in via `--worker-model spark:high` or `--worker-model gpt-5.4:high`).
17
+ - **Supported engines**: claude (default; models: haiku, sonnet, opus) and codex (opt-in via `--worker-model spark:high` or `--worker-model gpt-5.5:high`).
18
18
 
19
19
  ## 1a. Iron Laws
20
20
 
@@ -300,11 +300,11 @@ The Leader decides each iteration. Decision criteria:
300
300
 
301
301
  ### Codex (opt-in engine)
302
302
 
303
- Model routing uses `--worker-model` and `--verifier-model` with codex format: `spark:high` or `gpt-5.4:high`.
303
+ Model routing uses `--worker-model` and `--verifier-model` with codex format: `spark:high` or `gpt-5.5:high`.
304
304
 
305
305
  ```
306
306
  --worker-model spark:high # codex worker, spark model, high reasoning
307
- --verifier-model gpt-5.4:high # codex verifier, gpt-5.4, high reasoning
307
+ --verifier-model gpt-5.5:high # codex verifier, gpt-5.5, high reasoning
308
308
  ```
309
309
 
310
310
  `parse_model_flag()` auto-detects engine from the model name: plain names (haiku, sonnet, opus) = claude; `name:reasoning` format = codex. Claude is the default engine; codex is explicitly opt-in.
@@ -331,7 +331,7 @@ Agent(
331
331
  )
332
332
  ```
333
333
 
334
- If `--worker-model` or `--verifier-model` uses codex format (e.g., `spark:high`, `gpt-5.4:high`) (opt-in):
334
+ If `--worker-model` or `--verifier-model` uses codex format (e.g., `spark:high`, `gpt-5.5:high`) (opt-in):
335
335
  ```
336
336
  # Worker or Verifier (codex engine)
337
337
  Bash("codex -m <codex_model> -c model_reasoning_effort=<codex_reasoning> --dangerously-bypass-approvals-and-sandbox <prompt>")
@@ -377,7 +377,7 @@ claude -p "$(cat /path/to/prompt.md)" \
377
377
  When `WORKER_ENGINE=codex` or `VERIFIER_ENGINE=codex`, the `codex` CLI is used instead:
378
378
  ```bash
379
379
  # codex engine (opt-in)
380
- codex -m gpt-5.4 \
380
+ codex -m gpt-5.5 \
381
381
  -c model_reasoning_effort="high" \
382
382
  --dangerously-bypass-approvals-and-sandbox \
383
383
  "$(cat /path/to/prompt.md)"
@@ -569,9 +569,9 @@ Worker completes US → signal verify
569
569
 
570
570
  | Scenario | Primary verifier | Cross verifier |
571
571
  |----------|-----------------|----------------|
572
- | per-US, primary=claude | `--verifier-model` (sonnet) | `--consensus-model` (gpt-5.4:medium) |
572
+ | per-US, primary=claude | `--verifier-model` (sonnet) | `--consensus-model` (gpt-5.5:medium) |
573
573
  | per-US, primary=codex | `--verifier-model` | claude opus (fixed) |
574
- | final, primary=claude | `--final-verifier-model` (opus) | `--final-consensus-model` (gpt-5.4:high) |
574
+ | final, primary=claude | `--final-verifier-model` (opus) | `--final-consensus-model` (gpt-5.5:high) |
575
575
  | final, primary=codex | `--final-verifier-model` | claude opus (fixed) |
576
576
 
577
577
  - Both must pass. No engine priority.
@@ -18,14 +18,14 @@ CB default: 6. Override: `--cb-threshold N`. Worker only — Verifier fixed at c
18
18
  | HIGH | gpt-5.3-codex-spark:high | gpt-5.3-codex-spark:xhigh | gpt-5.3-codex-spark:xhigh | BLOCKED |
19
19
  | CRITICAL | gpt-5.3-codex-spark:xhigh | gpt-5.3-codex-spark:xhigh | gpt-5.3-codex-spark:xhigh | BLOCKED |
20
20
 
21
- ## Non-Pro (gpt-5.4)
21
+ ## Non-Pro (gpt-5.5)
22
22
 
23
23
  | Complexity | 1-2 | 3-4 | 5-6 | 7+ |
24
24
  |------------|-----|-----|-----|-----|
25
- | LOW | gpt-5.4:low | gpt-5.4:medium | gpt-5.4:high | BLOCKED |
26
- | MEDIUM | gpt-5.4:medium | gpt-5.4:high | gpt-5.4:xhigh | BLOCKED |
27
- | HIGH | gpt-5.4:high | gpt-5.4:xhigh | gpt-5.4:xhigh | BLOCKED |
28
- | CRITICAL | gpt-5.4:xhigh | gpt-5.4:xhigh | gpt-5.4:xhigh | BLOCKED |
25
+ | LOW | gpt-5.5:low | gpt-5.5:medium | gpt-5.5:high | BLOCKED |
26
+ | MEDIUM | gpt-5.5:medium | gpt-5.5:high | gpt-5.5:xhigh | BLOCKED |
27
+ | HIGH | gpt-5.5:high | gpt-5.5:xhigh | gpt-5.5:xhigh | BLOCKED |
28
+ | CRITICAL | gpt-5.5:xhigh | gpt-5.5:xhigh | gpt-5.5:xhigh | BLOCKED |
29
29
 
30
30
  ## Claude-only
31
31
 
package/src/node/run.mjs CHANGED
@@ -11,8 +11,8 @@ const RUN_DEFAULTS = {
11
11
  verifierModel: 'sonnet',
12
12
  finalVerifierModel: 'opus',
13
13
  consensusMode: 'off',
14
- consensusModel: 'gpt-5.4:medium',
15
- finalConsensusModel: 'gpt-5.4:high',
14
+ consensusModel: 'gpt-5.5:medium',
15
+ finalConsensusModel: 'gpt-5.5:high',
16
16
  verifyMode: 'per-us',
17
17
  cbThreshold: 6,
18
18
  maxIterations: 100,
@@ -24,11 +24,10 @@ import {
24
24
 
25
25
  const execFileAsync = promisify(execFile);
26
26
  const REQUIRED_SCAFFOLD_NAMES = ['workerPrompt', 'verifierPrompt', 'memoryFile', 'prdFile', 'testSpecFile'];
27
- const CLAUDE_MODELS = new Set(['haiku', 'sonnet', 'opus']);
28
27
  const MODEL_UPGRADES = {
29
- 'gpt-5.4:medium': 'gpt-5.4:high',
30
- 'gpt-5.4:high': 'gpt-5.4:xhigh',
31
- 'gpt-5.4:xhigh': 'BLOCKED',
28
+ 'gpt-5.5:medium': 'gpt-5.5:high',
29
+ 'gpt-5.5:high': 'gpt-5.5:xhigh',
30
+ 'gpt-5.5:xhigh': 'BLOCKED',
32
31
  'gpt-5.3-codex-spark:medium': 'gpt-5.3-codex-spark:high',
33
32
  'gpt-5.3-codex-spark:high': 'gpt-5.3-codex-spark:xhigh',
34
33
  'gpt-5.3-codex-spark:xhigh': 'BLOCKED',