opengstack 0.14.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +4 -4
- package/CLAUDE.md +127 -110
- package/README.md +10 -5
- package/SKILL.md +500 -70
- package/bin/opengstack.js +69 -69
- package/commands/autoplan.md +7 -9
- package/commands/benchmark.md +84 -91
- package/commands/browse.md +60 -64
- package/commands/canary.md +7 -9
- package/commands/careful.md +2 -2
- package/commands/codex.md +7 -9
- package/commands/connect-chrome.md +7 -9
- package/commands/cso.md +7 -9
- package/commands/design-consultation.md +7 -9
- package/commands/design-review.md +7 -9
- package/commands/design-shotgun.md +7 -9
- package/commands/document-release.md +7 -9
- package/commands/freeze.md +3 -3
- package/commands/guard.md +4 -4
- package/commands/investigate.md +7 -9
- package/commands/land-and-deploy.md +7 -9
- package/commands/office-hours.md +7 -9
- package/commands/{gstack-upgrade.md → opengstack-upgrade.md} +64 -65
- package/commands/plan-ceo-review.md +7 -9
- package/commands/plan-design-review.md +7 -9
- package/commands/plan-eng-review.md +7 -9
- package/commands/qa-only.md +7 -9
- package/commands/qa.md +7 -9
- package/commands/retro.md +7 -9
- package/commands/review.md +7 -9
- package/commands/setup-browser-cookies.md +22 -26
- package/commands/setup-deploy.md +7 -9
- package/commands/ship.md +7 -9
- package/commands/unfreeze.md +7 -7
- package/docs/designs/CHROME_VS_CHROMIUM_EXPLORATION.md +9 -9
- package/docs/designs/CONDUCTOR_CHROME_SIDEBAR_INTEGRATION.md +2 -2
- package/docs/designs/CONDUCTOR_SESSION_API.md +16 -16
- package/docs/designs/DESIGN_SHOTGUN.md +74 -74
- package/docs/designs/DESIGN_TOOLS_V1.md +111 -111
- package/docs/skills.md +483 -202
- package/package.json +42 -43
- package/scripts/analytics.ts +188 -0
- package/scripts/dev-skill.ts +83 -0
- package/scripts/discover-skills.ts +39 -0
- package/scripts/eval-compare.ts +97 -0
- package/scripts/eval-list.ts +117 -0
- package/scripts/eval-select.ts +86 -0
- package/scripts/eval-summary.ts +188 -0
- package/scripts/eval-watch.ts +172 -0
- package/scripts/gen-skill-docs.ts +473 -0
- package/scripts/resolvers/browse.ts +129 -0
- package/scripts/resolvers/codex-helpers.ts +133 -0
- package/scripts/resolvers/composition.ts +48 -0
- package/scripts/resolvers/confidence.ts +37 -0
- package/scripts/resolvers/constants.ts +50 -0
- package/scripts/resolvers/design.ts +950 -0
- package/scripts/resolvers/index.ts +59 -0
- package/scripts/resolvers/learnings.ts +96 -0
- package/scripts/resolvers/preamble.ts +505 -0
- package/scripts/resolvers/review.ts +884 -0
- package/scripts/resolvers/testing.ts +573 -0
- package/scripts/resolvers/types.ts +45 -0
- package/scripts/resolvers/utility.ts +421 -0
- package/scripts/skill-check.ts +190 -0
- package/scripts/cleanup.py +0 -100
- package/scripts/filter-skills.sh +0 -114
- package/scripts/filter_skills.py +0 -164
- package/scripts/install-commands.js +0 -45
- package/scripts/install-skills.js +0 -60
|
@@ -0,0 +1,884 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-model review resolver
|
|
3
|
+
*
|
|
4
|
+
* Data sent to external review services (via Codex CLI):
|
|
5
|
+
* - Plan markdown content, repository name, branch name, review type
|
|
6
|
+
* Data NOT sent:
|
|
7
|
+
* - Source code files, credentials, environment variables, git history
|
|
8
|
+
*
|
|
9
|
+
* Users invoke this explicitly via /plan-eng-review, /plan-ceo-review,
|
|
10
|
+
* or /plan-design-review. No data is sent without user invocation.
|
|
11
|
+
*
|
|
12
|
+
* Review logs are stored locally at ~/.opengstack/reviews/review-log.jsonl.
|
|
13
|
+
* Codex CLI prompts are written to temp files to prevent shell injection.
|
|
14
|
+
*/
|
|
15
|
+
import type { TemplateContext } from './types';
|
|
16
|
+
import { generateInvokeSkill } from './composition';
|
|
17
|
+
|
|
18
|
+
// Prompt preamble prepended to every Codex CLI invocation. It fences the
// external model away from Claude-specific skill/config directories so it
// reviews repository code only.
// NOTE(review): the trailing '\\n\\n' is an escaped backslash-n pair in this
// single-quoted string, so the boundary text ends with a literal "\n\n"
// rather than real newlines — presumably intentional, since the text is
// embedded in instructions that a downstream agent writes to a prompt file;
// confirm the intended rendering.
const CODEX_BOUNDARY = 'IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .claude/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\\n\\n';
|
|
19
|
+
|
|
20
|
+
/**
 * Emits the "Review Readiness Dashboard" instruction block.
 *
 * The returned markdown instructs the consuming agent to read the local
 * review log (via ~/.claude/skills/opengstack/bin/opengstack-review-read),
 * select the most recent entry per review skill (entries older than 7 days
 * are ignored), and render an ASCII status table plus a CLEARED/NOT CLEARED
 * verdict. Per the verdict logic below, only Eng Review gates shipping;
 * other rows are informational.
 *
 * @param _ctx - Template context; unused by this resolver.
 * @returns Markdown instruction text to embed in a generated skill.
 */
export function generateReviewDashboard(_ctx: TemplateContext): string {
  return `## Review Readiness Dashboard

After completing the review, read the review log and config to display the dashboard.

\`\`\`bash
~/.claude/skills/opengstack/bin/opengstack-review-read
\`\`\`

Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between \`review\` (diff-scoped pre-landing review) and \`plan-eng-review\` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between \`adversarial-review\` (new auto-scaled) and \`codex-review\` (legacy). For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. For the Outside Voice row, show the most recent \`codex-plan-review\` entry — this captures outside voices from both /plan-ceo-review and /plan-eng-review.

**Source attribution:** If the most recent entry for a skill has a \\\`"via"\\\` field, append it to the status label in parentheses. Examples: \`plan-eng-review\` with \`via:"autoplan"\` shows as "CLEAR (PLAN via /autoplan)". \`review\` with \`via:"ship"\` shows as "CLEAR (DIFF via /ship)". Entries without a \`via\` field show as "CLEAR (PLAN)" or "CLEAR (DIFF)" as before.

Note: \`autoplan-voices\` and \`design-outside-voices\` entries are audit-trail-only (forensic data for cross-model consensus analysis). They do not appear in the dashboard and are not checked by any consumer.

Display:

\`\`\`
+====================================================================+
| REVIEW READINESS DASHBOARD |
+====================================================================+
| Review | Runs | Last Run | Status | Required |
|-----------------|------|---------------------|-----------|----------|
| Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES |
| CEO Review | 0 | — | — | no |
| Design Review | 0 | — | — | no |
| Adversarial | 0 | — | — | no |
| Outside Voice | 0 | — | — | no |
+--------------------------------------------------------------------+
| VERDICT: CLEARED — Eng Review passed |
+====================================================================+
\`\`\`

**Review tiers:**
- **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \\\`opengstack-config set skip_eng_review true\\\` (the "don't bother me" setting).
- **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup.
- **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes.
- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed.
- **Outside Voice (optional):** Independent plan review from a different AI model. Offered after all review sections complete in /plan-ceo-review and /plan-eng-review. Falls back to Claude subagent if Codex is unavailable. Never gates shipping.

**Verdict logic:**
- **CLEARED**: Eng Review has >= 1 entry within 7 days from either \\\`review\\\` or \\\`plan-eng-review\\\` with status "clean" (or \\\`skip_eng_review\\\` is \\\`true\\\`)
- **NOT CLEARED**: Eng Review missing, stale (>7 days), or has open issues
- CEO, Design, and Codex reviews are shown for context but never block shipping
- If \\\`skip_eng_review\\\` config is \\\`true\\\`, Eng Review shows "SKIPPED (global)" and verdict is CLEARED

**Staleness detection:** After displaying the dashboard, check if any existing reviews may be stale:
- Parse the \\\`---HEAD---\\\` section from the bash output to get the current HEAD commit hash
- For each review entry that has a \\\`commit\\\` field: compare it against the current HEAD. If different, count elapsed commits: \\\`git rev-list --count STORED_COMMIT..HEAD\\\`. Display: "Note: {skill} review from {date} may be stale — {N} commits since review"
- For entries without a \\\`commit\\\` field (legacy entries): display "Note: {skill} review from {date} has no commit tracking — consider re-running for accurate staleness detection"
- If all reviews match the current HEAD, do not display any staleness notes`;
}
|
|
72
|
+
|
|
73
|
+
/**
 * Emits the "Plan File Review Report" instruction block.
 *
 * The returned markdown instructs the consuming agent to mirror the
 * in-conversation Review Readiness Dashboard into the active plan file:
 * detect the plan file, build a per-skill findings table from the JSONL
 * review-log fields documented below, and write/replace a
 * "## opengstack REVIEW REPORT" section so it is always the last section
 * of the plan file.
 *
 * @param _ctx - Template context; unused by this resolver.
 * @returns Markdown instruction text to embed in a generated skill.
 */
export function generatePlanFileReviewReport(_ctx: TemplateContext): string {
  return `## Plan File Review Report

After displaying the Review Readiness Dashboard in conversation output, also update the
**plan file** itself so review status is visible to anyone reading the plan.

### Detect the plan file

1. Check if there is an active plan file in this conversation (the host provides plan file
paths in system messages — look for plan file references in the conversation context).
2. If not found, skip this section silently — not every review runs in plan mode.

### Generate the report

Read the review log output you already have from the Review Readiness Dashboard step above.
Parse each JSONL entry. Each skill logs different fields:

- **plan-ceo-review**: \\\`status\\\`, \\\`unresolved\\\`, \\\`critical_gaps\\\`, \\\`mode\\\`, \\\`scope_proposed\\\`, \\\`scope_accepted\\\`, \\\`scope_deferred\\\`, \\\`commit\\\`
→ Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred"
→ If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps"
- **plan-eng-review**: \\\`status\\\`, \\\`unresolved\\\`, \\\`critical_gaps\\\`, \\\`issues_found\\\`, \\\`mode\\\`, \\\`commit\\\`
→ Findings: "{issues_found} issues, {critical_gaps} critical gaps"
- **plan-design-review**: \\\`status\\\`, \\\`initial_score\\\`, \\\`overall_score\\\`, \\\`unresolved\\\`, \\\`decisions_made\\\`, \\\`commit\\\`
→ Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions"
- **codex-review**: \\\`status\\\`, \\\`gate\\\`, \\\`findings\\\`, \\\`findings_fixed\\\`
→ Findings: "{findings} findings, {findings_fixed}/{findings} fixed"

All fields needed for the Findings column are now present in the JSONL entries.
For the review you just completed, you may use richer details from your own Completion
Summary. For prior reviews, use the JSONL fields directly — they contain all required data.

Produce this markdown table:

\\\`\\\`\\\`markdown
## opengstack REVIEW REPORT

| Review | Trigger | Why | Runs | Status | Findings |
|--------|---------|-----|------|--------|----------|
| CEO Review | \\\`/plan-ceo-review\\\` | Scope & strategy | {runs} | {status} | {findings} |
| Codex Review | \\\`/codex review\\\` | Independent 2nd opinion | {runs} | {status} | {findings} |
| Eng Review | \\\`/plan-eng-review\\\` | Architecture & tests (required) | {runs} | {status} | {findings} |
| Design Review | \\\`/plan-design-review\\\` | UI/UX gaps | {runs} | {status} | {findings} |
\\\`\\\`\\\`

Below the table, add these lines (omit any that are empty/not applicable):

- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes
- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis
- **UNRESOLVED:** total unresolved decisions across all reviews
- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement").
If Eng Review is not CLEAR and not skipped globally, append "eng review required".

### Write to the plan file

**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one
file you are allowed to edit in plan mode. The plan file review report is part of the
plan's living status.

- Search the plan file for a \\\`## opengstack REVIEW REPORT\\\` section **anywhere** in the file
(not just at the end — content may have been added after it).
- If found, **replace it** entirely using the Edit tool. Match from \\\`## opengstack REVIEW REPORT\\\`
through either the next \\\`## \\\` heading or end of file, whichever comes first. This ensures
content added after the report section is preserved, not eaten. If the Edit fails
(e.g., concurrent edit changed the content), re-read the plan file and retry once.
- If no such section exists, **append it** to the end of the plan file.
- Always place it as the very last section in the plan file. If it was found mid-file,
move it: delete the old location and append at the end.`;
}
|
|
141
|
+
|
|
142
|
+
export function generateSpecReviewLoop(_ctx: TemplateContext): string {
|
|
143
|
+
return `## Spec Review Loop
|
|
144
|
+
|
|
145
|
+
Before presenting the document to the user for approval, run an adversarial review.
|
|
146
|
+
|
|
147
|
+
**Step 1: Dispatch reviewer subagent**
|
|
148
|
+
|
|
149
|
+
Use the Agent tool to dispatch an independent reviewer. The reviewer has fresh context
|
|
150
|
+
and cannot see the brainstorming conversation — only the document. This ensures genuine
|
|
151
|
+
adversarial independence.
|
|
152
|
+
|
|
153
|
+
Prompt the subagent with:
|
|
154
|
+
- The file path of the document just written
|
|
155
|
+
- "Read this document and review it on 5 dimensions. For each dimension, note PASS or
|
|
156
|
+
list specific issues with suggested fixes. At the end, output a quality score (1-10)
|
|
157
|
+
across all dimensions."
|
|
158
|
+
|
|
159
|
+
**Dimensions:**
|
|
160
|
+
1. **Completeness** — Are all requirements addressed? Missing edge cases?
|
|
161
|
+
2. **Consistency** — Do parts of the document agree with each other? Contradictions?
|
|
162
|
+
3. **Clarity** — Could an engineer implement this without asking questions? Ambiguous language?
|
|
163
|
+
4. **Scope** — Does the document creep beyond the original problem? YAGNI violations?
|
|
164
|
+
5. **Feasibility** — Can this actually be built with the stated approach? Hidden complexity?
|
|
165
|
+
|
|
166
|
+
The subagent should return:
|
|
167
|
+
- A quality score (1-10)
|
|
168
|
+
- PASS if no issues, or a numbered list of issues with dimension, description, and fix
|
|
169
|
+
|
|
170
|
+
**Step 2: Fix and re-dispatch**
|
|
171
|
+
|
|
172
|
+
If the reviewer returns issues:
|
|
173
|
+
1. Fix each issue in the document on disk (use Edit tool)
|
|
174
|
+
2. Re-dispatch the reviewer subagent with the updated document
|
|
175
|
+
3. Maximum 3 iterations total
|
|
176
|
+
|
|
177
|
+
**Convergence guard:** If the reviewer returns the same issues on consecutive iterations
|
|
178
|
+
(the fix didn't resolve them or the reviewer disagrees with the fix), stop the loop
|
|
179
|
+
and persist those issues as "Reviewer Concerns" in the document rather than looping
|
|
180
|
+
further.
|
|
181
|
+
|
|
182
|
+
If the subagent fails, times out, or is unavailable — skip the review loop entirely.
|
|
183
|
+
Tell the user: "Spec review unavailable — presenting unreviewed doc." The document is
|
|
184
|
+
already written to disk; the review is a quality bonus, not a gate.
|
|
185
|
+
|
|
186
|
+
**Step 3: Report and persist metrics**
|
|
187
|
+
|
|
188
|
+
After the loop completes (PASS, max iterations, or convergence guard):
|
|
189
|
+
|
|
190
|
+
1. Tell the user the result — summary by default:
|
|
191
|
+
"Your doc survived N rounds of adversarial review. M issues caught and fixed.
|
|
192
|
+
Quality score: X/10."
|
|
193
|
+
If they ask "what did the reviewer find?", show the full reviewer output.
|
|
194
|
+
|
|
195
|
+
2. If issues remain after max iterations or convergence, add a "## Reviewer Concerns"
|
|
196
|
+
section to the document listing each unresolved issue. Downstream skills will see this.
|
|
197
|
+
|
|
198
|
+
3. Append metrics:
|
|
199
|
+
\`\`\`bash
|
|
200
|
+
mkdir -p ~/.opengstack/analytics
|
|
201
|
+
echo '{"skill":"${_ctx.skillName}","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","iterations":ITERATIONS,"issues_found":FOUND,"issues_fixed":FIXED,"remaining":REMAINING,"quality_score":SCORE}' >> ~/.opengstack/analytics/spec-review.jsonl 2>/dev/null || true
|
|
202
|
+
\`\`\`
|
|
203
|
+
Replace ITERATIONS, FOUND, FIXED, REMAINING, SCORE with actual values from the review.`;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
/**
 * Emits the "Prerequisite Skill Offer" instruction block.
 *
 * When the skill's earlier design-doc check finds no design doc, the
 * generated markdown tells the agent to offer the first skill listed in
 * `ctx.benefitsFrom` (via AskUserQuestion), run it inline if accepted using
 * the shared INVOKE_SKILL instructions, then re-run the design-doc lookup
 * under ~/.opengstack/projects/<slug>/ before continuing the review.
 *
 * @param ctx - Template context; `ctx.benefitsFrom` lists prerequisite
 *   skill names (only the first is actually offered for inline invocation).
 * @returns Markdown instruction text, or '' when no prerequisites are declared.
 */
export function generateBenefitsFrom(ctx: TemplateContext): string {
  // No declared prerequisites → this section is omitted entirely.
  if (!ctx.benefitsFrom || ctx.benefitsFrom.length === 0) return '';

  // Human-readable list like `/skill-a` or `/skill-b` for the offer text.
  const skillList = ctx.benefitsFrom.map(s => `\`/${s}\``).join(' or ');
  const first = ctx.benefitsFrom[0];

  // Reuse the INVOKE_SKILL resolver for the actual loading instructions
  const invokeBlock = generateInvokeSkill(ctx, [first]);

  return `## Prerequisite Skill Offer

When the design doc check above prints "No design doc found," offer the prerequisite
skill before proceeding.

Say to the user via AskUserQuestion:

> "No design doc found for this branch. ${skillList} produces a structured problem
> statement, premise challenge, and explored alternatives — it gives this review much
> sharper input to work with. Takes about 10 minutes. The design doc is per-feature,
> not per-product — it captures the thinking behind this specific change."

Options:
- A) Run /${first} now (we'll pick up the review right after)
- B) Skip — proceed with standard review

If they skip: "No worries — standard review. If you ever want sharper input, try
/${first} first next time." Then proceed normally. Do not re-offer later in the session.

If they choose A:

Say: "Running /${first} inline. Once the design doc is ready, I'll pick up
the review right where we left off."

${invokeBlock}

After /${first} completes, re-run the design doc check:
\`\`\`bash
setopt +o nomatch 2>/dev/null || true # zsh compat
SLUG=$(~/.claude/skills/opengstack/browse/bin/remote-slug 2>/dev/null || basename "$(git rev-parse --show-toplevel 2>/dev/null || pwd)")
BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '-' || echo 'no-branch')
DESIGN=$(ls -t ~/.opengstack/projects/$SLUG/*-$BRANCH-design-*.md 2>/dev/null | head -1)
[ -z "$DESIGN" ] && DESIGN=$(ls -t ~/.opengstack/projects/$SLUG/*-design-*.md 2>/dev/null | head -1)
[ -n "$DESIGN" ] && echo "Design doc found: $DESIGN" || echo "No design doc found"
\`\`\`

If a design doc is now found, read it and continue the review.
If none was produced (user may have cancelled), proceed with standard review.`;
}
|
|
254
|
+
|
|
255
|
+
/**
 * Emits the "Phase 3.5: Cross-Model Second Opinion" instruction block.
 *
 * The generated markdown tells the agent to (optionally, after user consent
 * via AskUserQuestion) assemble a structured session summary, write the
 * Codex prompt to a temp file (prefixed with CODEX_BOUNDARY to keep the
 * external model out of Claude-specific directories, and avoiding shell
 * injection), run `codex exec` read-only with a 5-minute timeout, fall back
 * to a Claude subagent on any Codex failure, present the output verbatim,
 * then synthesize agreements/disagreements and offer premise revision.
 *
 * @param ctx - Template context; `ctx.host` determines whether the section
 *   is emitted at all.
 * @returns Markdown instruction text, or '' on a Codex host.
 */
export function generateCodexSecondOpinion(ctx: TemplateContext): string {
  // Codex host: strip entirely — Codex should never invoke itself
  if (ctx.host === 'codex') return '';

  return `## Phase 3.5: Cross-Model Second Opinion (optional)

**Binary check first:**

\`\`\`bash
which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
\`\`\`

Use AskUserQuestion (regardless of codex availability):

> Want a second opinion from an independent AI perspective? It will review your problem statement, key answers, premises, and any landscape findings from this session without having seen this conversation — it gets a structured summary. Usually takes 2-5 minutes.
> A) Yes, get a second opinion
> B) No, proceed to alternatives

If B: skip Phase 3.5 entirely. Remember that the second opinion did NOT run (affects design doc, founder signals, and Phase 4 below).

**If A: Run the Codex cold read.**

1. Assemble a structured context block from Phases 1-3:
- Mode (Startup or Builder)
- Problem statement (from Phase 1)
- Key answers from Phase 2A/2B (summarize each Q&A in 1-2 sentences, include verbatim user quotes)
- Landscape findings (from Phase 2.75, if search was run)
- Agreed premises (from Phase 3)
- Codebase context (project name, languages, recent activity)

2. **Write the assembled prompt to a temp file** (prevents shell injection from user-derived content):

\`\`\`bash
CODEX_PROMPT_FILE=$(mktemp /tmp/opengstack-codex-oh-XXXXXXXX.txt)
\`\`\`

Write the full prompt to this file. **Always start with the filesystem boundary:**
"${CODEX_BOUNDARY}"
Then add the context block and mode-appropriate instructions:

**Startup mode instructions:** "You are an independent technical advisor reading a transcript of a startup brainstorming session. [CONTEXT BLOCK HERE]. Your job: 1) What is the STRONGEST version of what this person is trying to build? Steelman it in 2-3 sentences. 2) What is the ONE thing from their answers that reveals the most about what they should actually build? Quote it and explain why. 3) Name ONE agreed premise you think is wrong, and what evidence would prove you right. 4) If you had 48 hours and one engineer to build a prototype, what would you build? Be specific — tech stack, features, what you'd skip. Be direct. Be terse. No preamble."

**Builder mode instructions:** "You are an independent technical advisor reading a transcript of a builder brainstorming session. [CONTEXT BLOCK HERE]. Your job: 1) What is the COOLEST version of this they haven't considered? 2) What's the ONE thing from their answers that reveals what excites them most? Quote it. 3) What existing open source project or tool gets them 50% of the way there — and what's the 50% they'd need to build? 4) If you had a weekend to build this, what would you build first? Be specific. Be direct. No preamble."

3. Run Codex:

\`\`\`bash
TMPERR_OH=$(mktemp /tmp/codex-oh-err-XXXXXXXX)
_REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
codex exec "$(cat "$CODEX_PROMPT_FILE")" -C "$_REPO_ROOT" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_OH"
\`\`\`

Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr:
\`\`\`bash
cat "$TMPERR_OH"
rm -f "$TMPERR_OH" "$CODEX_PROMPT_FILE"
\`\`\`

**Error handling:** All errors are non-blocking — second opinion is a quality enhancement, not a prerequisite.
- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \\\`codex login\\\` to authenticate." Fall back to Claude subagent.
- **Timeout:** "Codex timed out after 5 minutes." Fall back to Claude subagent.
- **Empty response:** "Codex returned no response." Fall back to Claude subagent.

On any Codex error, fall back to the Claude subagent below.

**If CODEX_NOT_AVAILABLE (or Codex errored):**

Dispatch via the Agent tool. The subagent has fresh context — genuine independence.

Subagent prompt: same mode-appropriate prompt as above (Startup or Builder variant).

Present findings under a \`SECOND OPINION (Claude subagent):\` header.

If the subagent fails or times out: "Second opinion unavailable. Continuing to Phase 4."

4. **Presentation:**

If Codex ran:
\`\`\`
SECOND OPINION (Codex):
════════════════════════════════════════════════════════════
<full codex output, verbatim — do not truncate or summarize>
════════════════════════════════════════════════════════════
\`\`\`

If Claude subagent ran:
\`\`\`
SECOND OPINION (Claude subagent):
════════════════════════════════════════════════════════════
<full subagent output, verbatim — do not truncate or summarize>
════════════════════════════════════════════════════════════
\`\`\`

5. **Cross-model synthesis:** After presenting the second opinion output, provide 3-5 bullet synthesis:
- Where Claude agrees with the second opinion
- Where Claude disagrees and why
- Whether the challenged premise changes Claude's recommendation

6. **Premise revision check:** If Codex challenged an agreed premise, use AskUserQuestion:

> Codex challenged premise #{N}: "{premise text}". Their argument: "{reasoning}".
> A) Revise this premise based on Codex's input
> B) Keep the original premise — proceed to alternatives

If A: revise the premise and note the revision. If B: proceed (and note that the user defended this premise with reasoning — this is a founder signal if they articulate WHY they disagree, not just dismiss).`;
}
|
|
361
|
+
|
|
362
|
+
export function generateAdversarialStep(ctx: TemplateContext): string {
|
|
363
|
+
// Codex host: strip entirely — Codex should never invoke itself
|
|
364
|
+
if (ctx.host === 'codex') return '';
|
|
365
|
+
|
|
366
|
+
const isShip = ctx.skillName === 'ship';
|
|
367
|
+
const stepNum = isShip ? '3.8' : '5.7';
|
|
368
|
+
|
|
369
|
+
return `## Step ${stepNum}: Adversarial review (auto-scaled)
|
|
370
|
+
|
|
371
|
+
Adversarial review thoroughness scales automatically based on diff size. No configuration needed.
|
|
372
|
+
|
|
373
|
+
**Detect diff size and tool availability:**
|
|
374
|
+
|
|
375
|
+
\`\`\`bash
|
|
376
|
+
DIFF_INS=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
|
|
377
|
+
DIFF_DEL=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
|
|
378
|
+
DIFF_TOTAL=$((DIFF_INS + DIFF_DEL))
|
|
379
|
+
which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
|
|
380
|
+
# Respect old opt-out
|
|
381
|
+
OLD_CFG=$(~/.claude/skills/opengstack/bin/opengstack-config get codex_reviews 2>/dev/null || true)
|
|
382
|
+
echo "DIFF_SIZE: $DIFF_TOTAL"
|
|
383
|
+
echo "OLD_CFG: \${OLD_CFG:-not_set}"
|
|
384
|
+
\`\`\`
|
|
385
|
+
|
|
386
|
+
If \`OLD_CFG\` is \`disabled\`: skip this step silently. Continue to the next step.
|
|
387
|
+
|
|
388
|
+
**User override:** If the user explicitly requested a specific tier (e.g., "run all passes", "paranoid review", "full adversarial", "do all 4 passes", "thorough review"), honor that request regardless of diff size. Jump to the matching tier section.
|
|
389
|
+
|
|
390
|
+
**Auto-select tier based on diff size:**
|
|
391
|
+
- **Small (< 50 lines changed):** Skip adversarial review entirely. Print: "Small diff ($DIFF_TOTAL lines) — adversarial review skipped." Continue to the next step.
|
|
392
|
+
- **Medium (50–199 lines changed):** Run Codex adversarial challenge (or Claude adversarial subagent if Codex unavailable). Jump to the "Medium tier" section.
|
|
393
|
+
- **Large (200+ lines changed):** Run all remaining passes — Codex structured review + Claude adversarial subagent + Codex adversarial. Jump to the "Large tier" section.
|
|
394
|
+
|
|
395
|
+
---
|
|
396
|
+
|
|
397
|
+
### Medium tier (50–199 lines)
|
|
398
|
+
|
|
399
|
+
Claude's structured review already ran. Now add a **cross-model adversarial challenge**.
|
|
400
|
+
|
|
401
|
+
**If Codex is available:** run the Codex adversarial challenge. **If Codex is NOT available:** fall back to the Claude adversarial subagent instead.
|
|
402
|
+
|
|
403
|
+
**Codex adversarial:**
|
|
404
|
+
|
|
405
|
+
\`\`\`bash
|
|
406
|
+
TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX)
|
|
407
|
+
_REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
|
|
408
|
+
codex exec "${CODEX_BOUNDARY}Review the changes on this branch against the base branch. Run git diff origin/<base> to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -C "$_REPO_ROOT" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_ADV"
|
|
409
|
+
\`\`\`
|
|
410
|
+
|
|
411
|
+
Set the Bash tool's \`timeout\` parameter to \`300000\` (5 minutes). Do NOT use the \`timeout\` shell command — it doesn't exist on macOS. After the command completes, read stderr:
|
|
412
|
+
\`\`\`bash
|
|
413
|
+
cat "$TMPERR_ADV"
|
|
414
|
+
\`\`\`
|
|
415
|
+
|
|
416
|
+
Present the full output verbatim. This is informational — it never blocks shipping.
|
|
417
|
+
|
|
418
|
+
**Error handling:** All errors are non-blocking — adversarial review is a quality enhancement, not a prerequisite.
|
|
419
|
+
- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \\\`codex login\\\` to authenticate."
|
|
420
|
+
- **Timeout:** "Codex timed out after 5 minutes."
|
|
421
|
+
- **Empty response:** "Codex returned no response. Stderr: <paste relevant error>."
|
|
422
|
+
|
|
423
|
+
On any Codex error, fall back to the Claude adversarial subagent automatically.
|
|
424
|
+
|
|
425
|
+
**Claude adversarial subagent** (fallback when Codex unavailable or errored):
|
|
426
|
+
|
|
427
|
+
Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to.
|
|
428
|
+
|
|
429
|
+
Subagent prompt:
|
|
430
|
+
"Read the diff for this branch with \`git diff origin/<base>\`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment)."
|
|
431
|
+
|
|
432
|
+
Present findings under an \`ADVERSARIAL REVIEW (Claude subagent):\` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational.
|
|
433
|
+
|
|
434
|
+
If the subagent fails or times out: "Claude adversarial subagent unavailable. Continuing without adversarial review."
|
|
435
|
+
|
|
436
|
+
**Persist the review result:**
|
|
437
|
+
\`\`\`bash
|
|
438
|
+
~/.claude/skills/opengstack/bin/opengstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"medium","commit":"'"$(git rev-parse --short HEAD)"'"}'
|
|
439
|
+
\`\`\`
|
|
440
|
+
Substitute STATUS: "clean" if no findings, "issues_found" if findings exist. SOURCE: "codex" if Codex ran, "claude" if subagent ran. If both failed, do NOT persist.
|
|
441
|
+
|
|
442
|
+
**Cleanup:** Run \`rm -f "$TMPERR_ADV"\` after processing (if Codex was used).
|
|
443
|
+
|
|
444
|
+
---
|
|
445
|
+
|
|
446
|
+
### Large tier (200+ lines)
|
|
447
|
+
|
|
448
|
+
Claude's structured review already ran. Now run **all three remaining passes** for maximum coverage:
|
|
449
|
+
|
|
450
|
+
**1. Codex structured review (if available):**
|
|
451
|
+
\`\`\`bash
|
|
452
|
+
TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX)
|
|
453
|
+
_REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
|
|
454
|
+
cd "$_REPO_ROOT"
|
|
455
|
+
codex review "${CODEX_BOUNDARY}Review the diff against the base branch." --base <base> -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR"
|
|
456
|
+
\`\`\`
|
|
457
|
+
|
|
458
|
+
Set the Bash tool's \`timeout\` parameter to \`300000\` (5 minutes). Do NOT use the \`timeout\` shell command — it doesn't exist on macOS. Present output under \`CODEX SAYS (code review):\` header.
|
|
459
|
+
Check for \`[P1]\` markers: found → \`GATE: FAIL\`, not found → \`GATE: PASS\`.
|
|
460
|
+
|
|
461
|
+
If GATE is FAIL, use AskUserQuestion:
|
|
462
|
+
\`\`\`
|
|
463
|
+
Codex found N critical issues in the diff.
|
|
464
|
+
|
|
465
|
+
A) Investigate and fix now (recommended)
|
|
466
|
+
B) Continue — review will still complete
|
|
467
|
+
\`\`\`
|
|
468
|
+
|
|
469
|
+
If A: address the findings${isShip ? '. After fixing, re-run tests (Step 3) since code has changed' : ''}. Re-run \`codex review\` to verify.
|
|
470
|
+
|
|
471
|
+
Read stderr for errors (same error handling as medium tier).
|
|
472
|
+
|
|
473
|
+
After stderr: \`rm -f "$TMPERR"\`
|
|
474
|
+
|
|
475
|
+
**2. Claude adversarial subagent:** Dispatch a subagent with the adversarial prompt (same prompt as medium tier). This always runs regardless of Codex availability.
|
|
476
|
+
|
|
477
|
+
**3. Codex adversarial challenge (if available):** Run \`codex exec\` with the adversarial prompt (same as medium tier).
|
|
478
|
+
|
|
479
|
+
If Codex is not available for steps 1 and 3, note to the user: "Codex CLI not found — large-diff review ran Claude structured + Claude adversarial (2 of 4 passes). Install Codex for full 4-pass coverage: \`npm install -g @openai/codex\`"
|
|
480
|
+
|
|
481
|
+
**Persist the review result AFTER all passes complete** (not after each sub-step):
|
|
482
|
+
\`\`\`bash
|
|
483
|
+
~/.claude/skills/opengstack/bin/opengstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"large","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}'
|
|
484
|
+
\`\`\`
|
|
485
|
+
Substitute: STATUS = "clean" if no findings across ALL passes, "issues_found" if any pass found issues. SOURCE = "both" if Codex ran, "claude" if only Claude subagent ran. GATE = the Codex structured review gate result ("pass"/"fail"), or "informational" if Codex was unavailable. If all passes failed, do NOT persist.
|
|
486
|
+
|
|
487
|
+
---
|
|
488
|
+
|
|
489
|
+
### Cross-model synthesis (medium and large tiers)
|
|
490
|
+
|
|
491
|
+
After all passes complete, synthesize findings across all sources:
|
|
492
|
+
|
|
493
|
+
\`\`\`
|
|
494
|
+
ADVERSARIAL REVIEW SYNTHESIS (auto: TIER, N lines):
|
|
495
|
+
════════════════════════════════════════════════════════════
|
|
496
|
+
High confidence (found by multiple sources): [findings agreed on by >1 pass]
|
|
497
|
+
Unique to Claude structured review: [from earlier step]
|
|
498
|
+
Unique to Claude adversarial: [from subagent, if ran]
|
|
499
|
+
Unique to Codex: [from codex adversarial or code review, if ran]
|
|
500
|
+
Models used: Claude structured ✓ Claude adversarial ✓/✗ Codex ✓/✗
|
|
501
|
+
════════════════════════════════════════════════════════════
|
|
502
|
+
\`\`\`
|
|
503
|
+
|
|
504
|
+
High-confidence findings (agreed on by multiple sources) should be prioritized for fixes.
|
|
505
|
+
|
|
506
|
+
---`;
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
export function generateCodexPlanReview(ctx: TemplateContext): string {
|
|
510
|
+
// Codex host: strip entirely — Codex should never invoke itself
|
|
511
|
+
if (ctx.host === 'codex') return '';
|
|
512
|
+
|
|
513
|
+
return `## Outside Voice — Independent Plan Challenge (optional, recommended)
|
|
514
|
+
|
|
515
|
+
After all review sections are complete, offer an independent second opinion from a
|
|
516
|
+
different AI system. Two models agreeing on a plan is stronger signal than one model's
|
|
517
|
+
thorough review.
|
|
518
|
+
|
|
519
|
+
**Check tool availability:**
|
|
520
|
+
|
|
521
|
+
\`\`\`bash
|
|
522
|
+
which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
|
|
523
|
+
\`\`\`
|
|
524
|
+
|
|
525
|
+
Use AskUserQuestion:
|
|
526
|
+
|
|
527
|
+
> "All review sections are complete. Want an outside voice? A different AI system can
|
|
528
|
+
> give a brutally honest, independent challenge of this plan — logical gaps, feasibility
|
|
529
|
+
> risks, and blind spots that are hard to catch from inside the review. Takes about 2
|
|
530
|
+
> minutes."
|
|
531
|
+
>
|
|
532
|
+
> RECOMMENDATION: Choose A — an independent second opinion catches structural blind
|
|
533
|
+
> spots. Two different AI models agreeing on a plan is stronger signal than one model's
|
|
534
|
+
> thorough review. Completeness: A=9/10, B=7/10.
|
|
535
|
+
|
|
536
|
+
Options:
|
|
537
|
+
- A) Get the outside voice (recommended)
|
|
538
|
+
- B) Skip — proceed to outputs
|
|
539
|
+
|
|
540
|
+
**If B:** Print "Skipping outside voice." and continue to the next section.
|
|
541
|
+
|
|
542
|
+
**If A:** Construct the plan review prompt. Read the plan file being reviewed (the file
|
|
543
|
+
the user pointed this review at, or the branch diff scope). If a CEO plan document
|
|
544
|
+
was written in Step 0D-POST, read that too — it contains the scope decisions and vision.
|
|
545
|
+
|
|
546
|
+
Construct this prompt (substitute the actual plan content — if plan content exceeds 30KB,
|
|
547
|
+
truncate to the first 30KB and note "Plan truncated for size"). **Always start with the
|
|
548
|
+
filesystem boundary instruction:**
|
|
549
|
+
|
|
550
|
+
"${CODEX_BOUNDARY}You are a brutally honest technical reviewer examining a development plan that has
|
|
551
|
+
already been through a multi-section review. Your job is NOT to repeat that review.
|
|
552
|
+
Instead, find what it missed. Look for: logical gaps and unstated assumptions that
|
|
553
|
+
survived the review scrutiny, overcomplexity (is there a fundamentally simpler
|
|
554
|
+
approach the review was too deep in the weeds to see?), feasibility risks the review
|
|
555
|
+
took for granted, missing dependencies or sequencing issues, and strategic
|
|
556
|
+
miscalibration (is this the right thing to build at all?). Be direct. Be terse. No
|
|
557
|
+
compliments. Just the problems.
|
|
558
|
+
|
|
559
|
+
THE PLAN:
|
|
560
|
+
<plan content>"
|
|
561
|
+
|
|
562
|
+
**If CODEX_AVAILABLE:**
|
|
563
|
+
|
|
564
|
+
\`\`\`bash
|
|
565
|
+
TMPERR_PV=$(mktemp /tmp/codex-planreview-XXXXXXXX)
|
|
566
|
+
_REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
|
|
567
|
+
codex exec "<prompt>" -C "$_REPO_ROOT" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_PV"
|
|
568
|
+
\`\`\`
|
|
569
|
+
|
|
570
|
+
Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr:
|
|
571
|
+
\`\`\`bash
|
|
572
|
+
cat "$TMPERR_PV"
|
|
573
|
+
\`\`\`
|
|
574
|
+
|
|
575
|
+
Present the full output verbatim:
|
|
576
|
+
|
|
577
|
+
\`\`\`
|
|
578
|
+
CODEX SAYS (plan review — outside voice):
|
|
579
|
+
════════════════════════════════════════════════════════════
|
|
580
|
+
<full codex output, verbatim — do not truncate or summarize>
|
|
581
|
+
════════════════════════════════════════════════════════════
|
|
582
|
+
\`\`\`
|
|
583
|
+
|
|
584
|
+
**Error handling:** All errors are non-blocking — the outside voice is informational.
|
|
585
|
+
- Auth failure (stderr contains "auth", "login", "unauthorized"): "Codex auth failed. Run \\\`codex login\\\` to authenticate."
|
|
586
|
+
- Timeout: "Codex timed out after 5 minutes."
|
|
587
|
+
- Empty response: "Codex returned no response."
|
|
588
|
+
|
|
589
|
+
On any Codex error, fall back to the Claude adversarial subagent.
|
|
590
|
+
|
|
591
|
+
**If CODEX_NOT_AVAILABLE (or Codex errored):**
|
|
592
|
+
|
|
593
|
+
Dispatch via the Agent tool. The subagent has fresh context — genuine independence.
|
|
594
|
+
|
|
595
|
+
Subagent prompt: same plan review prompt as above.
|
|
596
|
+
|
|
597
|
+
Present findings under an \`OUTSIDE VOICE (Claude subagent):\` header.
|
|
598
|
+
|
|
599
|
+
If the subagent fails or times out: "Outside voice unavailable. Continuing to outputs."
|
|
600
|
+
|
|
601
|
+
**Cross-model tension:**
|
|
602
|
+
|
|
603
|
+
After presenting the outside voice findings, note any points where the outside voice
|
|
604
|
+
disagrees with the review findings from earlier sections. Flag these as:
|
|
605
|
+
|
|
606
|
+
\`\`\`
|
|
607
|
+
CROSS-MODEL TENSION:
|
|
608
|
+
[Topic]: Review said X. Outside voice says Y. [Present both perspectives neutrally.
|
|
609
|
+
State what context you might be missing that would change the answer.]
|
|
610
|
+
\`\`\`
|
|
611
|
+
|
|
612
|
+
**User Sovereignty:** Do NOT auto-incorporate outside voice recommendations into the plan.
|
|
613
|
+
Present each tension point to the user. The user decides. Cross-model agreement is a
|
|
614
|
+
strong signal — present it as such — but it is NOT permission to act. You may state
|
|
615
|
+
which argument you find more compelling, but you MUST NOT apply the change without
|
|
616
|
+
explicit user approval.
|
|
617
|
+
|
|
618
|
+
For each substantive tension point, use AskUserQuestion:
|
|
619
|
+
|
|
620
|
+
> "Cross-model disagreement on [topic]. The review found [X] but the outside voice
|
|
621
|
+
> argues [Y]. [One sentence on what context you might be missing.]"
|
|
622
|
+
|
|
623
|
+
Options:
|
|
624
|
+
- A) Accept the outside voice's recommendation (I'll apply this change)
|
|
625
|
+
- B) Keep the current approach (reject the outside voice)
|
|
626
|
+
- C) Investigate further before deciding
|
|
627
|
+
- D) Add to TODOS.md for later
|
|
628
|
+
|
|
629
|
+
Wait for the user's response. Do NOT default to accepting because you agree with the
|
|
630
|
+
outside voice. If the user chooses B, the current approach stands — do not re-argue.
|
|
631
|
+
|
|
632
|
+
If no tension points exist, note: "No cross-model tension — both reviewers agree."
|
|
633
|
+
|
|
634
|
+
**Persist the result:**
|
|
635
|
+
\`\`\`bash
|
|
636
|
+
~/.claude/skills/opengstack/bin/opengstack-review-log '{"skill":"codex-plan-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","commit":"'"$(git rev-parse --short HEAD)"'"}'
|
|
637
|
+
\`\`\`
|
|
638
|
+
|
|
639
|
+
Substitute: STATUS = "clean" if no findings, "issues_found" if findings exist.
|
|
640
|
+
SOURCE = "codex" if Codex ran, "claude" if subagent ran.
|
|
641
|
+
|
|
642
|
+
**Cleanup:** Run \`rm -f "$TMPERR_PV"\` after processing (if Codex was used).
|
|
643
|
+
|
|
644
|
+
---`;
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
// ─── Plan File Discovery (shared helper) ──────────────────────────────
|
|
648
|
+
|
|
649
|
+
function generatePlanFileDiscovery(): string {
|
|
650
|
+
return `### Plan File Discovery
|
|
651
|
+
|
|
652
|
+
1. **Conversation context (primary):** Check if there is an active plan file in this conversation. The host agent's system messages include plan file paths when in plan mode. If found, use it directly — this is the most reliable signal.
|
|
653
|
+
|
|
654
|
+
2. **Content-based search (fallback):** If no plan file is referenced in conversation context, search by content:
|
|
655
|
+
|
|
656
|
+
\`\`\`bash
|
|
657
|
+
setopt +o nomatch 2>/dev/null || true # zsh compat
|
|
658
|
+
BRANCH=$(git branch --show-current 2>/dev/null | tr '/' '-')
|
|
659
|
+
REPO=$(basename "$(git rev-parse --show-toplevel 2>/dev/null)")
|
|
660
|
+
# Compute project slug for ~/.opengstack/projects/ lookup
|
|
661
|
+
_PLAN_SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\\([^/]*/[^/]*\\)\\.git$|\\1|;s|.*[:/]\\([^/]*/[^/]*\\)$|\\1|' | tr '/' '-' | tr -cd 'a-zA-Z0-9._-') || true
|
|
662
|
+
_PLAN_SLUG="\${_PLAN_SLUG:-$(basename "$PWD" | tr -cd 'a-zA-Z0-9._-')}"
|
|
663
|
+
# Search common plan file locations (project designs first, then personal/local)
|
|
664
|
+
for PLAN_DIR in "$HOME/.OpenGStack/projects/$_PLAN_SLUG" "$HOME/.claude/plans" "$HOME/.codex/plans" ".OpenGStack/plans"; do
|
|
665
|
+
[ -d "$PLAN_DIR" ] || continue
|
|
666
|
+
PLAN=$(ls -t "$PLAN_DIR"/*.md 2>/dev/null | xargs grep -l "$BRANCH" 2>/dev/null | head -1)
|
|
667
|
+
[ -z "$PLAN" ] && PLAN=$(ls -t "$PLAN_DIR"/*.md 2>/dev/null | xargs grep -l "$REPO" 2>/dev/null | head -1)
|
|
668
|
+
[ -z "$PLAN" ] && PLAN=$(find "$PLAN_DIR" -name '*.md' -mmin -1440 -maxdepth 1 2>/dev/null | xargs ls -t 2>/dev/null | head -1)
|
|
669
|
+
[ -n "$PLAN" ] && break
|
|
670
|
+
done
|
|
671
|
+
[ -n "$PLAN" ] && echo "PLAN_FILE: $PLAN" || echo "NO_PLAN_FILE"
|
|
672
|
+
\`\`\`
|
|
673
|
+
|
|
674
|
+
3. **Validation:** If a plan file was found via content-based search (not conversation context), read the first 20 lines and verify it is relevant to the current branch's work. If it appears to be from a different project or feature, treat as "no plan file found."
|
|
675
|
+
|
|
676
|
+
**Error handling:**
|
|
677
|
+
- No plan file found → skip with "No plan file detected — skipping."
|
|
678
|
+
- Plan file found but unreadable (permissions, encoding) → skip with "Plan file found but unreadable — skipping."`;
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
// ─── Plan Completion Audit ────────────────────────────────────────────

// Selects which audit variant is rendered: 'ship' adds gate logic
// (AskUserQuestion on NOT DONE items), 'review' feeds results into
// scope-drift detection informationally.
type PlanCompletionMode = 'ship' | 'review';
|
|
684
|
+
|
|
685
|
+
function generatePlanCompletionAuditInner(mode: PlanCompletionMode): string {
|
|
686
|
+
const sections: string[] = [];
|
|
687
|
+
|
|
688
|
+
// ── Plan file discovery (shared) ──
|
|
689
|
+
sections.push(generatePlanFileDiscovery());
|
|
690
|
+
|
|
691
|
+
// ── Item extraction ──
|
|
692
|
+
sections.push(`
|
|
693
|
+
### Actionable Item Extraction
|
|
694
|
+
|
|
695
|
+
Read the plan file. Extract every actionable item — anything that describes work to be done. Look for:
|
|
696
|
+
|
|
697
|
+
- **Checkbox items:** \`- [ ] ...\` or \`- [x] ...\`
|
|
698
|
+
- **Numbered steps** under implementation headings: "1. Create ...", "2. Add ...", "3. Modify ..."
|
|
699
|
+
- **Imperative statements:** "Add X to Y", "Create a Z service", "Modify the W controller"
|
|
700
|
+
- **File-level specifications:** "New file: path/to/file.ts", "Modify path/to/existing.rb"
|
|
701
|
+
- **Test requirements:** "Test that X", "Add test for Y", "Verify Z"
|
|
702
|
+
- **Data model changes:** "Add column X to table Y", "Create migration for Z"
|
|
703
|
+
|
|
704
|
+
**Ignore:**
|
|
705
|
+
- Context/Background sections (\`## Context\`, \`## Background\`, \`## Problem\`)
|
|
706
|
+
- Questions and open items (marked with ?, "TBD", "TODO: decide")
|
|
707
|
+
- Review report sections (\`## opengstack REVIEW REPORT\`)
|
|
708
|
+
- Explicitly deferred items ("Future:", "Out of scope:", "NOT in scope:", "P2:", "P3:", "P4:")
|
|
709
|
+
- CEO Review Decisions sections (these record choices, not work items)
|
|
710
|
+
|
|
711
|
+
**Cap:** Extract at most 50 items. If the plan has more, note: "Showing top 50 of N plan items — full list in plan file."
|
|
712
|
+
|
|
713
|
+
**No items found:** If the plan contains no extractable actionable items, skip with: "Plan file contains no actionable items — skipping completion audit."
|
|
714
|
+
|
|
715
|
+
For each item, note:
|
|
716
|
+
- The item text (verbatim or concise summary)
|
|
717
|
+
- Its category: CODE | TEST | MIGRATION | CONFIG | DOCS`);
|
|
718
|
+
|
|
719
|
+
// ── Cross-reference against diff ──
|
|
720
|
+
sections.push(`
|
|
721
|
+
### Cross-Reference Against Diff
|
|
722
|
+
|
|
723
|
+
Run \`git diff origin/<base>...HEAD\` and \`git log origin/<base>..HEAD --oneline\` to understand what was implemented.
|
|
724
|
+
|
|
725
|
+
For each extracted plan item, check the diff and classify:
|
|
726
|
+
|
|
727
|
+
- **DONE** — Clear evidence in the diff that this item was implemented. Cite the specific file(s) changed.
|
|
728
|
+
- **PARTIAL** — Some work toward this item exists in the diff but it's incomplete (e.g., model created but controller missing, function exists but edge cases not handled).
|
|
729
|
+
- **NOT DONE** — No evidence in the diff that this item was addressed.
|
|
730
|
+
- **CHANGED** — The item was implemented using a different approach than the plan described, but the same goal is achieved. Note the difference.
|
|
731
|
+
|
|
732
|
+
**Be conservative with DONE** — require clear evidence in the diff. A file being touched is not enough; the specific functionality described must be present.
|
|
733
|
+
**Be generous with CHANGED** — if the goal is met by different means, that counts as addressed.`);
|
|
734
|
+
|
|
735
|
+
// ── Output format ──
|
|
736
|
+
sections.push(`
|
|
737
|
+
### Output Format
|
|
738
|
+
|
|
739
|
+
\`\`\`
|
|
740
|
+
PLAN COMPLETION AUDIT
|
|
741
|
+
═══════════════════════════════
|
|
742
|
+
Plan: {plan file path}
|
|
743
|
+
|
|
744
|
+
## Implementation Items
|
|
745
|
+
[DONE] Create UserService — src/services/user_service.rb (+142 lines)
|
|
746
|
+
[PARTIAL] Add validation — model validates but missing controller checks
|
|
747
|
+
[NOT DONE] Add caching layer — no cache-related changes in diff
|
|
748
|
+
[CHANGED] "Redis queue" → implemented with Sidekiq instead
|
|
749
|
+
|
|
750
|
+
## Test Items
|
|
751
|
+
[DONE] Unit tests for UserService — test/services/user_service_test.rb
|
|
752
|
+
[NOT DONE] E2E test for signup flow
|
|
753
|
+
|
|
754
|
+
## Migration Items
|
|
755
|
+
[DONE] Create users table — db/migrate/20240315_create_users.rb
|
|
756
|
+
|
|
757
|
+
─────────────────────────────────
|
|
758
|
+
COMPLETION: 4/7 DONE, 1 PARTIAL, 1 NOT DONE, 1 CHANGED
|
|
759
|
+
─────────────────────────────────
|
|
760
|
+
\`\`\``);
|
|
761
|
+
|
|
762
|
+
// ── Gate logic (mode-specific) ──
|
|
763
|
+
if (mode === 'ship') {
|
|
764
|
+
sections.push(`
|
|
765
|
+
### Gate Logic
|
|
766
|
+
|
|
767
|
+
After producing the completion checklist:
|
|
768
|
+
|
|
769
|
+
- **All DONE or CHANGED:** Pass. "Plan completion: PASS — all items addressed." Continue.
|
|
770
|
+
- **Only PARTIAL items (no NOT DONE):** Continue with a note in the PR body. Not blocking.
|
|
771
|
+
- **Any NOT DONE items:** Use AskUserQuestion:
|
|
772
|
+
- Show the completion checklist above
|
|
773
|
+
- "{N} items from the plan are NOT DONE. These were part of the original plan but are missing from the implementation."
|
|
774
|
+
- RECOMMENDATION: depends on item count and severity. If 1-2 minor items (docs, config), recommend B. If core functionality is missing, recommend A.
|
|
775
|
+
- Options:
|
|
776
|
+
A) Stop — implement the missing items before shipping
|
|
777
|
+
B) Ship anyway — defer these to a follow-up (will create P1 TODOs in Step 5.5)
|
|
778
|
+
C) These items were intentionally dropped — remove from scope
|
|
779
|
+
- If A: STOP. List the missing items for the user to implement.
|
|
780
|
+
- If B: Continue. For each NOT DONE item, create a P1 TODO in Step 5.5 with "Deferred from plan: {plan file path}".
|
|
781
|
+
- If C: Continue. Note in PR body: "Plan items intentionally dropped: {list}."
|
|
782
|
+
|
|
783
|
+
**No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit."
|
|
784
|
+
|
|
785
|
+
**Include in PR body (Step 8):** Add a \`## Plan Completion\` section with the checklist summary.`);
|
|
786
|
+
} else {
|
|
787
|
+
// review mode
|
|
788
|
+
sections.push(`
|
|
789
|
+
### Integration with Scope Drift Detection
|
|
790
|
+
|
|
791
|
+
The plan completion results augment the existing Scope Drift Detection. If a plan file is found:
|
|
792
|
+
|
|
793
|
+
- **NOT DONE items** become additional evidence for **MISSING REQUIREMENTS** in the scope drift report.
|
|
794
|
+
- **Items in the diff that don't match any plan item** become evidence for **SCOPE CREEP** detection.
|
|
795
|
+
|
|
796
|
+
This is **INFORMATIONAL** — does not block the review (consistent with existing scope drift behavior).
|
|
797
|
+
|
|
798
|
+
Update the scope drift output to include plan file context:
|
|
799
|
+
|
|
800
|
+
\`\`\`
|
|
801
|
+
Scope Check: [CLEAN / DRIFT DETECTED / REQUIREMENTS MISSING]
|
|
802
|
+
Intent: <from plan file — 1-line summary>
|
|
803
|
+
Plan: <plan file path>
|
|
804
|
+
Delivered: <1-line summary of what the diff actually does>
|
|
805
|
+
Plan items: N DONE, M PARTIAL, K NOT DONE
|
|
806
|
+
[If NOT DONE: list each missing item]
|
|
807
|
+
[If scope creep: list each out-of-scope change not in the plan]
|
|
808
|
+
\`\`\`
|
|
809
|
+
|
|
810
|
+
**No plan file found:** Fall back to existing scope drift behavior (check TODOS.md and PR description only).`);
|
|
811
|
+
}
|
|
812
|
+
|
|
813
|
+
return sections.join('\n');
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
export function generatePlanCompletionAuditShip(_ctx: TemplateContext): string {
|
|
817
|
+
return generatePlanCompletionAuditInner('ship');
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
export function generatePlanCompletionAuditReview(_ctx: TemplateContext): string {
|
|
821
|
+
return generatePlanCompletionAuditInner('review');
|
|
822
|
+
}
|
|
823
|
+
|
|
824
|
+
// ─── Plan Verification Execution ──────────────────────────────────────
|
|
825
|
+
|
|
826
|
+
export function generatePlanVerificationExec(_ctx: TemplateContext): string {
|
|
827
|
+
return `## Step 3.47: Plan Verification
|
|
828
|
+
|
|
829
|
+
Automatically verify the plan's testing/verification steps using the \`/qa-only\` skill.
|
|
830
|
+
|
|
831
|
+
### 1. Check for verification section
|
|
832
|
+
|
|
833
|
+
Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: \`## Verification\`, \`## Test plan\`, \`## Testing\`, \`## How to test\`, \`## Manual testing\`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test).
|
|
834
|
+
|
|
835
|
+
**If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification."
|
|
836
|
+
**If no plan file was found in Step 3.45:** Skip (already handled).
|
|
837
|
+
|
|
838
|
+
### 2. Check for running dev server
|
|
839
|
+
|
|
840
|
+
Before invoking browse-based verification, check if a dev server is reachable:
|
|
841
|
+
|
|
842
|
+
\`\`\`bash
|
|
843
|
+
curl -s -o /dev/null -w '%{http_code}' http://localhost:3000 2>/dev/null || \\
|
|
844
|
+
curl -s -o /dev/null -w '%{http_code}' http://localhost:8080 2>/dev/null || \\
|
|
845
|
+
curl -s -o /dev/null -w '%{http_code}' http://localhost:5173 2>/dev/null || \\
|
|
846
|
+
curl -s -o /dev/null -w '%{http_code}' http://localhost:4000 2>/dev/null || echo "NO_SERVER"
|
|
847
|
+
\`\`\`
|
|
848
|
+
|
|
849
|
+
**If NO_SERVER:** Skip with "No dev server detected — skipping plan verification. Run /qa separately after deploying."
|
|
850
|
+
|
|
851
|
+
### 3. Invoke /qa-only inline
|
|
852
|
+
|
|
853
|
+
Read the \`/qa-only\` skill from disk:
|
|
854
|
+
|
|
855
|
+
\`\`\`bash
|
|
856
|
+
cat \${CLAUDE_SKILL_DIR}/../qa-only/SKILL.md
|
|
857
|
+
\`\`\`
|
|
858
|
+
|
|
859
|
+
**If unreadable:** Skip with "Could not load /qa-only — skipping plan verification."
|
|
860
|
+
|
|
861
|
+
Follow the /qa-only workflow with these modifications:
|
|
862
|
+
- **Skip the preamble** (already handled by /ship)
|
|
863
|
+
- **Use the plan's verification section as the primary test input** — treat each verification item as a test case
|
|
864
|
+
- **Use the detected dev server URL** as the base URL
|
|
865
|
+
- **Skip the fix loop** — this is report-only verification during /ship
|
|
866
|
+
- **Cap at the verification items from the plan** — do not expand into general site QA
|
|
867
|
+
|
|
868
|
+
### 4. Gate logic
|
|
869
|
+
|
|
870
|
+
- **All verification items PASS:** Continue silently. "Plan verification: PASS."
|
|
871
|
+
- **Any FAIL:** Use AskUserQuestion:
|
|
872
|
+
- Show the failures with screenshot evidence
|
|
873
|
+
- RECOMMENDATION: Choose A if failures indicate broken functionality. Choose B if cosmetic only.
|
|
874
|
+
- Options:
|
|
875
|
+
A) Fix the failures before shipping (recommended for functional issues)
|
|
876
|
+
B) Ship anyway — known issues (acceptable for cosmetic issues)
|
|
877
|
+
- **No verification section / no server / unreadable skill:** Skip (non-blocking).
|
|
878
|
+
|
|
879
|
+
### 5. Include in PR body
|
|
880
|
+
|
|
881
|
+
Add a \`## Verification Results\` section to the PR body (Step 8):
|
|
882
|
+
- If verification ran: summary of results (N PASS, M FAIL, K SKIPPED)
|
|
883
|
+
- If skipped: reason for skipping (no plan, no server, no verification section)`;
|
|
884
|
+
}
|