create-sdd-project 0.16.7 → 0.16.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/doctor.js CHANGED
@@ -73,6 +73,9 @@ function runDoctor(cwd) {
73
73
  // 12. Gemini Settings Format
74
74
  results.push(checkGeminiSettings(cwd, aiTools));
75
75
 
76
+ // 13. Gemini TOML Commands Format
77
+ results.push(checkGeminiCommands(cwd, aiTools));
78
+
76
79
  return results;
77
80
  }
78
81
 
@@ -613,6 +616,330 @@ function checkGeminiSettings(cwd, aiTools) {
613
616
  };
614
617
  }
615
618
 
619
/**
 * Validate a .gemini/commands/*.toml file using a strict subset of TOML
 * grammar sufficient for our narrow use case.
 *
 * Scope: the templates we ship only use two top-level keys (`description`,
 * `prompt`) with string values — standard quoted (`"..."`), single-quoted
 * literal (`'...'`), or triple-quoted multiline (`"""..."""` / `'''...'''`).
 * This validator enforces that subset strictly:
 *
 * - Each non-blank, non-comment line must be either a top-level assignment
 *   `key = <string-literal>` or the start of a multiline string
 * - Top-level keys must match `[A-Za-z][A-Za-z0-9_-]*` (bare keys only —
 *   quoted keys like `"prompt" = "x"` are flagged as invalid; our templates
 *   never use them)
 * - Duplicate top-level keys are rejected (TOML spec forbids them)
 * - Strings must be properly closed on the same line (except triple-quoted,
 *   which can span lines)
 * - Trailing content after a closed string is rejected (only a `#` comment
 *   is allowed after the value)
 * - Values that are not string literals (numbers, booleans, arrays, etc.)
 *   are flagged as non-string
 * - Assignments inside `[table]` or `[[array-table]]` sections are not
 *   considered top-level and the scan stops there (our templates don't use
 *   tables)
 *
 * This validator is intentionally stricter than full TOML and looser in a
 * few edge cases (e.g., escape sequences inside basic strings are accepted
 * as `\\.`). The goal is to catch files that Gemini CLI's FileCommandLoader
 * would silently skip — not to be a general-purpose TOML parser. If our
 * templates ever need richer TOML features, upgrade to `@iarna/toml` as
 * a runtime dependency at that point.
 *
 * @param {string} content - Raw TOML file text.
 * @returns {{ ok: true, keys: Object<string, 'string' | 'non-string'> }
 *         | { ok: false, error: string, line: number }}
 *   On success, `keys` maps each top-level key to whether its value was a
 *   string literal. On failure, `error` is a human-readable message and
 *   `line` is the 1-based line number where the problem was found.
 */
function validateTomlCommandFile(content) {
  // Null-prototype map: bare keys like `constructor` or `toString` match the
  // key regex, and with a plain `{}` the inherited Object.prototype members
  // would make `keysSeen[key] !== undefined` falsely report a duplicate.
  const keysSeen = Object.create(null);
  const lines = content.split(/\r?\n|\r/);
  let i = 0;

  // Shared scanner for triple-quoted strings ("""...""" and '''...''').
  // Returns null on success (having advanced `i` past the string), or an
  // { ok: false, ... } error object to propagate.
  function consumeTriple(delim, label, value, key) {
    const after = value.slice(3);
    const closeIdx = after.indexOf(delim);
    if (closeIdx !== -1) {
      // Closed on same line — only an optional comment may follow.
      const trailing = after.slice(closeIdx + 3).trim();
      if (trailing !== '' && !trailing.startsWith('#')) {
        return {
          ok: false,
          error: `line ${i + 1}: trailing content after ${delim} close: ${trailing.slice(0, 40)}`,
          line: i + 1,
        };
      }
      keysSeen[key] = 'string';
      i++;
      return null;
    }
    // Scan forward for the closing delimiter on a later line.
    let j = i + 1;
    while (j < lines.length) {
      const idx2 = lines[j].indexOf(delim);
      if (idx2 !== -1) {
        const trailing2 = lines[j].slice(idx2 + 3).trim();
        if (trailing2 !== '' && !trailing2.startsWith('#')) {
          return {
            ok: false,
            error: `line ${j + 1}: trailing content after ${delim} close: ${trailing2.slice(0, 40)}`,
            line: j + 1,
          };
        }
        keysSeen[key] = 'string';
        i = j + 1;
        return null;
      }
      j++;
    }
    return {
      ok: false,
      error: `line ${i + 1}: unterminated triple-quoted ${label} string (${delim} never closed)`,
      line: i + 1,
    };
  }

  while (i < lines.length) {
    const raw = lines[i];
    const trimmed = raw.trim();

    // Blank line or full-line comment
    if (trimmed === '' || trimmed.startsWith('#')) {
      i++;
      continue;
    }

    // Table / array-table — end of top-level scope, stop scanning
    if (/^\[\[?/.test(trimmed)) {
      break;
    }

    // Top-level assignment: bare key = value
    const keyMatch = trimmed.match(/^([A-Za-z][A-Za-z0-9_-]*)\s*=\s*(.*)$/);
    if (!keyMatch) {
      return {
        ok: false,
        error: `line ${i + 1}: not a valid top-level assignment: ${trimmed.slice(0, 60)}`,
        line: i + 1,
      };
    }

    const key = keyMatch[1];
    const value = keyMatch[2];

    if (keysSeen[key] !== undefined) {
      return {
        ok: false,
        error: `line ${i + 1}: duplicate top-level key '${key}'`,
        line: i + 1,
      };
    }

    // Multi-line strings — must be checked before the single-line forms,
    // since `"""` also starts with `"` (and `'''` with `'`).
    if (value.startsWith('"""')) {
      const err = consumeTriple('"""', 'basic', value, key);
      if (err) return err;
      continue;
    }
    if (value.startsWith("'''")) {
      const err = consumeTriple("'''", 'literal', value, key);
      if (err) return err;
      continue;
    }

    // Basic string: "..." with standard escapes; must close on same line
    // and allow only a trailing comment after the closing quote.
    if (value.startsWith('"')) {
      const basicMatch = value.match(/^"((?:[^"\\]|\\.)*)"(?:\s*(?:#.*)?)?$/);
      if (!basicMatch) {
        return {
          ok: false,
          error: `line ${i + 1}: invalid basic string value (unterminated or trailing content): ${value.slice(0, 60)}`,
          line: i + 1,
        };
      }
      keysSeen[key] = 'string';
      i++;
      continue;
    }

    // Literal string: '...' with no escapes; must close on same line
    if (value.startsWith("'")) {
      const litMatch = value.match(/^'([^']*)'(?:\s*(?:#.*)?)?$/);
      if (!litMatch) {
        return {
          ok: false,
          error: `line ${i + 1}: invalid literal string value (unterminated or trailing content): ${value.slice(0, 60)}`,
          line: i + 1,
        };
      }
      keysSeen[key] = 'string';
      i++;
      continue;
    }

    // Any other value is not a string literal (int, bool, array, table, etc.)
    keysSeen[key] = 'non-string';
    i++;
  }

  // Copy into a normal object so callers get ordinary Object.prototype
  // behavior on the returned `keys` map.
  return { ok: true, keys: { ...keysSeen } };
}
817
+
818
/**
 * Doctor check #13: validate every `.gemini/commands/*.toml` file against the
 * strict TOML subset accepted by `validateTomlCommandFile`, mirroring Gemini
 * CLI's FileCommandLoader schema:
 *   z.object({ prompt: z.string(), description: z.string().optional() })
 *
 * Skipped (PASS) for Claude-only projects. Warns when the commands directory
 * or any .toml files are missing; fails when any file is empty, invalid TOML,
 * or lacks a string `prompt`.
 *
 * @param {string} cwd - Project root to inspect.
 * @param {string} aiTools - Configured AI tooling ('claude' disables check).
 * @returns {{ status: *, message: string, details: string[] }}
 */
function checkGeminiCommands(cwd, aiTools) {
  if (aiTools === 'claude') {
    return {
      status: PASS,
      message: 'Gemini commands: N/A (Claude only)',
      details: [],
    };
  }

  const commandsDir = path.join(cwd, '.gemini', 'commands');
  if (!fs.existsSync(commandsDir)) {
    return {
      status: WARN,
      message: 'Gemini commands: .gemini/commands/ missing',
      details: ['Run: npx create-sdd-project --upgrade to recreate template commands'],
    };
  }

  // readdirSync with withFileTypes so we can filter symlinks before reading.
  // Symlinks in .gemini/commands/ would make doctor read arbitrary files on
  // the user's machine — low severity in a local CLI, but worth guarding.
  const tomlEntries = fs
    .readdirSync(commandsDir, { withFileTypes: true })
    .filter((dirent) => dirent.name.endsWith('.toml'))
    .sort((a, b) => a.name.localeCompare(b.name));

  if (tomlEntries.length === 0) {
    return {
      status: WARN,
      message: 'Gemini commands: no .toml files in .gemini/commands/',
      details: ['Gemini CLI slash commands require .toml files. Run: npx create-sdd-project --upgrade'],
    };
  }

  const problems = [];
  let okCount = 0;

  for (const dirent of tomlEntries) {
    const name = dirent.name;
    const fullPath = path.join(commandsDir, name);

    // Reject symlinks (Dirent can lie about isFile() when followed; use lstat).
    let stats;
    try {
      stats = fs.lstatSync(fullPath);
    } catch (err) {
      problems.push(`${name}: cannot lstat (${err.code || err.message})`);
      continue;
    }
    if (stats.isSymbolicLink()) {
      problems.push(`${name}: is a symlink — refusing to follow (security). Delete and run --upgrade to restore template`);
      continue;
    }
    if (!stats.isFile()) {
      problems.push(`${name}: not a regular file`);
      continue;
    }

    let text;
    try {
      text = fs.readFileSync(fullPath, 'utf8');
    } catch (err) {
      problems.push(`${name}: cannot read (${err.code || err.message})`);
      continue;
    }

    if (text.trim() === '') {
      problems.push(`${name}: empty file (Gemini CLI will skip this command silently)`);
      continue;
    }

    // Validate using the strict grammar subset for our templates.
    // Gemini CLI's FileCommandLoader schema is:
    //   z.object({ prompt: z.string(), description: z.string().optional() })
    const parsed = validateTomlCommandFile(text);
    if (!parsed.ok) {
      problems.push(`${name}: ${parsed.error}`);
      continue;
    }

    const promptKind = parsed.keys.prompt;
    const descriptionKind = parsed.keys.description;

    if (promptKind === undefined) {
      problems.push(
        `${name}: missing required field 'prompt' (Gemini CLI will silently skip this command)`
      );
      continue;
    }
    if (promptKind !== 'string') {
      problems.push(
        `${name}: 'prompt' field must be a string (Gemini CLI requires z.string())`
      );
      continue;
    }
    if (descriptionKind !== undefined && descriptionKind !== 'string') {
      problems.push(
        `${name}: 'description' field is present but is not a string`
      );
      continue;
    }

    okCount++;
  }

  if (problems.length > 0) {
    return {
      status: FAIL,
      message: `Gemini commands: ${problems.length} invalid TOML file${problems.length > 1 ? 's' : ''}`,
      details: [
        ...problems,
        'Gemini CLI silently skips invalid TOML commands — they will not appear as slash commands in the UI.',
        'Run: npx create-sdd-project --upgrade to restore template commands.',
      ],
    };
  }

  return {
    status: PASS,
    message: `Gemini commands: ${okCount}/${tomlEntries.length} valid`,
    details: [],
  };
}
942
+
616
943
  module.exports = {
617
944
  runDoctor,
618
945
  printResults,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "create-sdd-project",
3
- "version": "0.16.7",
3
+ "version": "0.16.9",
4
4
  "description": "Create a new SDD DevFlow project with AI-assisted development workflow",
5
5
  "bin": {
6
6
  "create-sdd-project": "bin/cli.js"
@@ -56,6 +56,37 @@ Write the following sections into the ticket's `## Implementation Plan` section:
56
56
  - Specific patterns from the codebase to follow (with file references)
57
57
  - Any gotchas or constraints the developer should know
58
58
 
59
+ ## Pre-Emission Verification (MANDATORY)
60
+
61
+ Before writing the final plan, **verify every structural claim empirically against the actual code**. Planners that emit claims without verification produce plans with mechanical bugs (wrong paths, stale types, obsolete schemas, missing files, wrong primary key types) that block TDD and force re-planning.
62
+
63
+ **IMPORTANT — do NOT hallucinate verification**: You MUST use your environment tools (`Grep`, `Read`, `Bash`) to actually execute these checks against the real code. Do NOT write fake commands or fabricated output to satisfy the format. If you have not executed the check, do not list it. Leaving the `Verification commands run` subsection empty is better than fabricating it — the downstream review-plan command is configured to treat empty verification as a flag for stricter review, not as a failure.
64
+
65
+ For every item you intend to list under `Files to Modify`, `Files to Create`, `Key Patterns`, or `Existing Code to Reuse`:
66
+
67
+ 1. **Grep or read the referenced files** to confirm they exist at the path you cite
68
+ 2. **Verify types, enums, and validation schemas** mentioned match the current code. Use `Grep` on exported symbol names across the workspace — shared schemas often live in multiple packages, so one rewrite can leave dangling references
69
+ 3. **Verify primary keys, IDs, and foreign keys** by reading the ORM schema file (or equivalent) — don't assume `id` is a positive int when it's a `uuid`, and vice versa. Validator types MUST match the DB column type
70
+ 4. **Verify the current state of enums before proposing to drop or replace them** — enum types are often referenced in 2-3 places (TypeScript type, validation schema, ORM enum, DB column). ALL references must be cleaned in the SAME commit or the workspace breaks mid-migration
71
+ 5. **For any migration that DROPs a table or type**, confirm the table is either unused or its data has been backed up — add a pre-flight safety check to the plan
72
+
73
+ After finishing the plan, append a final subsection to the ticket:
74
+
75
+ ### Verification commands run
76
+
77
+ List every empirical check you executed using this format: `<command> → <observed fact> → <impact on plan>`. One line per check. **Every entry must have all three fields** — a bare command without an observed fact is not verification, it's cargo-culting.
78
+
79
+ Example format:
80
+
81
+ - `Grep: "PortionContext" in packages/` → 2 hits: `shared/src/schemas/enums.ts:18`, `shared/src/schemas/standardPortion.ts:4` → both must be deleted in the migration commit, listed under "Files to Modify"
82
+ - `Read: packages/api/prisma/schema.prisma:318-330` → confirmed `dishId String @db.Uuid` (not int) → Seed CSV validator must use `z.string().uuid()`, NOT `z.number().int()`
83
+ - `Grep: "formatPortionTermLabel" in packages/shared/` → helper does not yet exist → list under "Files to Create" for commit 1 of the TDD order
84
+ - (continue with every empirical check)
85
+
86
+ **If this subsection is empty or missing**, prepend the plan with a warning: `⚠ This plan is text-only and has not been empirically verified against the code. Cross-model reviewers MUST run empirical checks before approving.`
87
+
88
+ The `review-plan` command reads this subsection to calibrate reviewer effort. An empty or missing subsection is treated as a flag for stricter review.
89
+
59
90
  ## Rules
60
91
 
61
92
  - **NEVER** write implementation code — only the plan
@@ -56,6 +56,37 @@ Write the following sections into the ticket's `## Implementation Plan` section:
56
56
  - Specific patterns from the codebase to follow (with file references)
57
57
  - Any gotchas or constraints the developer should know
58
58
 
59
+ ## Pre-Emission Verification (MANDATORY)
60
+
61
+ Before writing the final plan, **verify every structural claim empirically against the actual code**. Planners that emit claims without verification produce plans with mechanical bugs (wrong component paths, stale prop types, missing exported helpers, inconsistent helper usage between packages) that block TDD and force re-planning.
62
+
63
+ **IMPORTANT — do NOT hallucinate verification**: You MUST use your environment tools (`Grep`, `Read`, `Bash`) to actually execute these checks. Do NOT fabricate commands or output to satisfy the format. An empty `Verification commands run` subsection is better than a fake one — the downstream review-plan command flags empty sections for stricter review, not for failure.
64
+
65
+ For every item you intend to list under `Files to Modify`, `Files to Create`, `Key Patterns`, or `Existing Code to Reuse`:
66
+
67
+ 1. **Grep or read the referenced files** to confirm they exist at the path you cite
68
+ 2. **Verify component prop types and shared helpers** — before proposing a helper inline, check if one already exists in `packages/shared/` or equivalent. Helpers used by both web and bot MUST live in `shared/` and be imported; do NOT duplicate inline in each package
69
+ 3. **Verify API response shapes** by reading the shared validation schemas — the frontend MUST match the backend contract, not invent fields
70
+ 4. **Verify existing CSS tokens, Tailwind utilities, and component library primitives** before proposing new classes — design tokens (colors, spacing, typography) live in `tailwind.config.ts` or `globals.css`, not in component files
71
+ 5. **Verify accessibility semantics** — if the plan proposes `aria-*` attributes, confirm the pattern against existing accessible components in the codebase
72
+
73
+ After finishing the plan, append a final subsection to the ticket:
74
+
75
+ ### Verification commands run
76
+
77
+ List every empirical check using this format: `<command> → <observed fact> → <impact on plan>`. One line per check. **Every entry must have all three fields** — a bare command without an observed fact is cargo-culting.
78
+
79
+ Example format:
80
+
81
+ - `Grep: "formatPortionTermLabel" in packages/` → helper exists in `packages/shared/src/portion/portionLabel.ts:32` → do not duplicate inline, import from `@foodxplorer/shared`, list under "Existing Code to Reuse"
82
+ - `Read: packages/shared/src/schemas/estimate.ts:180-205` → confirmed `portionAssumption` field is optional with `source: "per_dish" | "generic"` → NutritionCard must handle both branches, listed under "Key Patterns"
83
+ - `Grep: "aria-labelledby" in packages/web/src/components/` → existing pattern uses `useId()` for hook-generated IDs → reuse same pattern in new component, not hardcoded strings
84
+ - (continue with every empirical check)
85
+
86
+ **If this subsection is empty or missing**, prepend the plan with a warning: `⚠ This plan is text-only and has not been empirically verified against the code. Cross-model reviewers MUST run empirical checks before approving.`
87
+
88
+ The `review-plan` command reads this subsection to calibrate reviewer effort. An empty or missing subsection is treated as a flag for stricter review.
89
+
59
90
  ## Rules
60
91
 
61
92
  - **NEVER** write implementation code — only the plan
@@ -26,16 +26,47 @@ mkdir -p "$REVIEW_DIR"
26
26
  cat > "$REVIEW_DIR/input.txt" <<'CRITERIA'
27
27
  You are reviewing an Implementation Plan for a software feature. Your job is to find real problems, not praise. But if the plan is solid, say APPROVED — do not manufacture issues that are not there.
28
28
 
29
+ ## CRITICAL: This is an EMPIRICAL review, not a text-only review
30
+
31
+ Before reporting findings, you MUST verify structural claims against the actual code. Plans often have subtle mechanical bugs (wrong paths, stale types, obsolete schemas, incorrect primary key types, dangling references in shared packages) that only surface when you verify empirically. A text-only review that reads the plan alone will miss them.
32
+
33
+ Required empirical checks before emitting findings:
34
+
35
+ 1. **Read every file path the plan cites** — confirm it exists. If the plan says `packages/api/src/foo.ts`, open it. If it doesn't exist, that's a CRITICAL finding.
36
+ 2. **Grep for every exported symbol** the plan claims to reuse, modify, or delete (types, enums, Zod schemas, functions, classes). Shared symbols often live in 2-3 places — one rewrite leaves dangling references if the others aren't cleaned in the same commit.
37
+ 3. **Verify primary key and foreign key types** by reading the actual schema file (`prisma/schema.prisma` or equivalent). Don't assume `id` is a positive int when it's a `uuid`, and vice versa. Validator types MUST match the DB column type — this is one of the most common mechanical bugs in plans.
38
+ 4. **For any DROP / DELETE / CASCADE operation**, grep the workspace for ALL references to the dropped symbol. The plan must clean them all in the same commit or the workspace breaks mid-migration.
39
+ 5. **If the plan cites "Existing Code to Reuse"**, read those files to confirm they actually provide what the plan claims. Plans frequently cite helpers that don't exist or whose signature is different.
40
+
41
+ Do NOT rely on the plan's assertions alone. Do NOT assume file paths, types, or schemas are correct without verifying.
42
+
43
+ **Ticket-level signal to calibrate your review depth**: Look for a `### Verification commands run` subsection inside `## Implementation Plan`. If the planner listed empirical commands there, the plan was self-verified; focus your review on higher-order issues (gaps, vagueness, over-engineering). If that subsection is missing or empty, the plan is text-only and you MUST run the empirical checks above with extra rigor.
44
+
45
+ ## Review criteria
46
+
29
47
  Below you will find the Spec (what to build) and the Implementation Plan (how to build it). Review the plan and report:
30
- 1. Errors — Wrong assumptions, impossible steps, missing dependencies, plan contradicts the spec
48
+
49
+ 1. Errors — Wrong assumptions, impossible steps, missing dependencies, plan contradicts the spec, **path/type/schema mismatches against the actual code (verified empirically)**
31
50
  2. Gaps — Missing error handling, edge cases, rollback scenarios
32
51
  3. Vagueness — Steps too ambiguous to implement with TDD (no clear input/output)
33
52
  4. Over-engineering — Unnecessary abstractions, premature optimization
34
53
  5. Order issues — Steps that depend on later steps
35
54
 
36
- For each issue, state: [CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix.
55
+ For each issue, state: `[CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix — cite the file:line you read to discover the issue when applicable`.
56
+
57
+ ## Output format — mandatory sections
58
+
59
+ At the END of your review, include these two subsections:
60
+
61
+ ### Files read during review
62
+ (list every file you opened, with brief note of what each confirmed or contradicted)
37
63
 
38
- End with: VERDICT: APPROVED | VERDICT: REVISE (if any CRITICAL or 2+ IMPORTANT issues)
64
+ ### Commands executed
65
+ (list every grep / find / sed / read command you ran, with the pattern)
66
+
67
+ If BOTH subsections are empty, prepend your review with: `⚠ TEXT-ONLY REVIEW — no empirical verification performed. Findings are based on plan text alone.`
68
+
69
+ End with: `VERDICT: APPROVED` | `VERDICT: REVISE` (if any CRITICAL or 2+ IMPORTANT issues)
39
70
 
40
71
  ---
41
72
  SPEC AND PLAN:
@@ -63,6 +94,71 @@ echo "=== CODEX REVIEW ===" && cat "$REVIEW_DIR/codex.txt"
63
94
 
64
95
  Consolidate findings — issues flagged by both models independently carry higher weight. Deduplicate and prioritize. Ignore output from any reviewer that failed.
65
96
 
97
+ ### Meta-check: reviewer empirical asymmetry
98
+
99
+ After both reviews are in, check for empirical asymmetry. This guards against one model being text-only while the other does real empirical verification — a pattern observed in practice where different reviewers have different agentic habits.
100
+
101
+ **Primary check — qualitative, agent-driven**: Read both reviews yourself. If one review has a populated `### Files read during review` section with real file paths and line numbers, and the other review has an empty or missing section (or quotes the example text without adding real entries), the light review is incomplete. This is the authoritative check — the shell heuristic below is a secondary confirmation only.
102
+
103
+ **Secondary check — shell heuristic**: The following bash block is a mechanical double-check. It only flags **missing empirical evidence**, NOT low finding counts (a clean plan legitimately produces 0 CRITICAL/IMPORTANT findings; re-prompting on that is noise).
104
+
105
+ ```bash
106
+ # Count REAL empirical markers — anchored to the mandatory markdown headers only,
107
+ # not to substring mentions elsewhere in the review text. Implemented as a
108
+ # single awk pass (avoids the `grep -c || echo 0` pitfall, which prints "0\n0" when grep exits non-zero).
109
+ count_empirical() {
110
+ local file="$1"
111
+ [ -r "$file" ] || { echo 0; return; }
112
+ # Look for non-empty lines under the two mandatory markdown headers.
113
+ awk '
114
+ /^### Files read during review$/ { in_files=1; in_cmds=0; next }
115
+ /^### Commands executed$/ { in_files=0; in_cmds=1; next }
116
+ /^### / { in_files=0; in_cmds=0 }
117
+ (in_files || in_cmds) && NF > 0 && $0 !~ /^\(list/ { n++ }
118
+ END { print n+0 }
119
+ ' "$file"
120
+ }
121
+
122
+ GEMINI_EMPIRICAL=$(count_empirical "$REVIEW_DIR/gemini.txt")
123
+ CODEX_EMPIRICAL=$(count_empirical "$REVIEW_DIR/codex.txt")
124
+
125
+ echo "Empirical evidence — Gemini: $GEMINI_EMPIRICAL entries, Codex: $CODEX_EMPIRICAL entries"
126
+ ```
127
+
128
+ **Trigger re-prompt ONLY when one reviewer has zero empirical entries**. Do NOT trigger based on finding counts — a clean plan legitimately produces zero findings, and re-prompting on that wastes time.
129
+
130
+ If re-prompt is needed, write a concrete reprompt file and re-run the light reviewer. Use CONCRETE shell variables, not literal `<angle>` placeholders (those are bash input redirects and will fail at runtime):
131
+
132
+ ```bash
133
+ # Example — ONLY the SKELETON. Replace LIGHT_CLI, LIGHT_NAME, and OTHER_CLI with
134
+ # the actual reviewer details for your environment. This is documentation, not
135
+ # a cargo-cult-runnable block. Run the concrete version manually.
136
+
137
+ cat > "$REVIEW_DIR/reprompt.txt" <<'REPROMPT'
138
+ Your previous review was text-only: the `### Files read during review` section was empty or missing. Plans frequently have subtle mechanical bugs (wrong file paths, stale type references, primary key type mismatches, dangling shared-package references) that only appear with empirical verification.
139
+
140
+ Re-review the plan with EMPIRICAL verification. You MUST use your environment tools to read and grep real files. Do NOT hallucinate commands or output. You MUST:
141
+ 1. Read every file path the plan cites and confirm it exists
142
+ 2. Grep the workspace for every type/enum/schema/function the plan references
143
+ 3. Verify primary and foreign key types against the actual schema file
144
+ 4. For any DROP/DELETE/CASCADE, grep ALL references to confirm atomic cleanup
145
+ 5. List the files you opened and the commands you ran at the END of your review, with real observed facts for each (not just the command string)
146
+
147
+ Look for: path mismatches, stale type references, primary key type mismatches, and dangling references in shared packages that need cleanup in the same commit.
148
+ REPROMPT
149
+
150
+ # Then re-run the LIGHT reviewer with the reprompt prepended. Example for Gemini:
151
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | gemini > "$REVIEW_DIR/gemini_reprompted.txt" 2>&1
152
+ # cat "$REVIEW_DIR/gemini_reprompted.txt"
153
+ # Example for Codex:
154
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | codex exec - > "$REVIEW_DIR/codex_reprompted.txt" 2>&1
155
+ # cat "$REVIEW_DIR/codex_reprompted.txt"
156
+ ```
157
+
158
+ Merge the re-prompted findings into your consolidation. Do NOT skip this meta-check — it catches real bugs that would otherwise enter TDD.
159
+
160
+ See `.claude/skills/development-workflow/references/cross-model-review.md` for calibration notes on reviewer patterns (which models tend toward empirical verification vs standards-compliance checking).
161
+
66
162
  ### Path B: One CLI available
67
163
 
68
164
  ```bash
@@ -26,18 +26,45 @@ mkdir -p "$REVIEW_DIR"
26
26
  cat > "$REVIEW_DIR/input.txt" <<'CRITERIA'
27
27
  You are reviewing a Feature Specification for a software feature. Your job is to find real problems in the REQUIREMENTS — not the implementation (there is no implementation yet). If the spec is solid, say APPROVED — do not manufacture issues.
28
28
 
29
+ ## This is a CONTEXTUAL review — verify consistency against the codebase
30
+
31
+ A spec review is not "how would I implement this" — that's for the plan phase. But a spec review IS responsible for detecting inconsistencies between what the spec claims and what already exists in the project. To do that rigorously, you MUST read project context files:
32
+
33
+ 1. **Read the project standards** referenced in the spec (`ai-specs/specs/base-standards.mdc`, `backend-standards.mdc`, `frontend-standards.mdc`) — confirm the spec doesn't contradict them
34
+ 2. **Read the key_facts.md and decisions.md** — confirm the spec doesn't reintroduce patterns the project explicitly rejected in a prior ADR
35
+ 3. **Read the existing API spec** (`docs/specs/api-spec.yaml`) — confirm new endpoints don't collide with existing ones and follow the same conventions
36
+ 4. **Grep for existing similar features** — if the spec proposes "add metrics for X", grep the workspace for existing metrics implementations to see if the spec is compatible with what's already there
37
+ 5. **For any field, type, or enum the spec proposes**, grep shared schemas to see if a similar concept already exists under a different name
38
+
39
+ Do NOT review the spec as isolated text. A spec that looks internally consistent but contradicts the existing architecture is worse than one with obvious gaps.
40
+
41
+ ## Review criteria
42
+
29
43
  Below you will find the Spec (what to build), the Acceptance Criteria, and project context (architecture, decisions). Review the spec and report:
44
+
30
45
  1. Completeness — Are all user needs covered? Missing requirements?
31
46
  2. Ambiguity — Are requirements clear enough to plan and implement with TDD?
32
47
  3. Edge cases — Are failure modes, boundary conditions, and error responses specified?
33
48
  4. API contract — Are endpoints, fields, types, status codes well-defined? (if applicable)
34
49
  5. Scope — Is the spec doing too much or too little for one feature?
35
- 6. Consistency — Does the spec conflict with existing architecture, patterns, or decisions?
50
+ 6. Consistency — Does the spec conflict with existing architecture, patterns, or prior ADRs? **(verify by reading the referenced files, not by inference)**
36
51
  7. Testability — Can each acceptance criterion be verified with an automated test?
37
52
 
38
- For each issue, state: [CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix.
53
+ For each issue, state: `[CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix — cite the file:line you read when the issue involves conflict with existing code`.
54
+
55
+ ## Output format — mandatory sections
39
56
 
40
- End with: VERDICT: APPROVED | VERDICT: REVISE (if any CRITICAL or 2+ IMPORTANT issues)
57
+ At the END of your review, include these two subsections:
58
+
59
+ ### Files read during review
60
+ (list every file you opened, with brief note of what each confirmed or contradicted)
61
+
62
+ ### Commands executed
63
+ (list every grep / find / sed / read command you ran, with the pattern)
64
+
65
+ If BOTH subsections are empty, prepend your review with: `⚠ TEXT-ONLY REVIEW — no empirical verification of architectural consistency. Findings are based on spec text alone.`
66
+
67
+ End with: `VERDICT: APPROVED` | `VERDICT: REVISE` (if any CRITICAL or 2+ IMPORTANT issues)
41
68
 
42
69
  ---
43
70
  SPEC AND ACCEPTANCE CRITERIA:
@@ -70,6 +97,59 @@ echo "=== CODEX REVIEW ===" && cat "$REVIEW_DIR/codex.txt"
70
97
 
71
98
  Consolidate findings — issues flagged by both models independently carry higher weight. Deduplicate and prioritize. Ignore output from any reviewer that failed.
72
99
 
100
+ ### Meta-check: reviewer empirical asymmetry
101
+
102
+ After both reviews are in, check for empirical asymmetry. One reviewer may be text-only while the other reads files — re-prompt the light reviewer if so.
103
+
104
+ **Primary check — qualitative, agent-driven**: Read both reviews yourself. If one review cites actual files from the project (standards, decisions, existing schemas) and the other review contains only generic commentary, the light review is incomplete. This is the authoritative check.
105
+
106
+ **Secondary check — shell heuristic**: The block below only flags **missing empirical evidence**, NOT low finding counts. A clean spec legitimately produces zero findings; re-prompting on that wastes time.
107
+
108
+ ```bash
109
+ count_empirical() {
110
+ local file="$1"
111
+ [ -r "$file" ] || { echo 0; return; }
112
+ awk '
113
+ /^### Files read during review$/ { in_files=1; in_cmds=0; next }
114
+ /^### Commands executed$/ { in_files=0; in_cmds=1; next }
115
+ /^### / { in_files=0; in_cmds=0 }
116
+ (in_files || in_cmds) && NF > 0 && $0 !~ /^\(list/ { n++ }
117
+ END { print n+0 }
118
+ ' "$file"
119
+ }
120
+
121
+ GEMINI_EMPIRICAL=$(count_empirical "$REVIEW_DIR/gemini.txt")
122
+ CODEX_EMPIRICAL=$(count_empirical "$REVIEW_DIR/codex.txt")
123
+
124
+ echo "Empirical evidence — Gemini: $GEMINI_EMPIRICAL entries, Codex: $CODEX_EMPIRICAL entries"
125
+ ```
126
+
127
+ **Trigger re-prompt ONLY when one reviewer has zero empirical entries**. If so, write a concrete reprompt file and re-run the light reviewer. Use CONCRETE shell variables, not literal `<angle>` placeholders:
128
+
129
+ ```bash
130
+ # Documentation skeleton — replace with concrete reviewer invocation.
131
+
132
+ cat > "$REVIEW_DIR/reprompt.txt" <<'REPROMPT'
133
+ Your previous review was text-only: the `### Files read during review` section was empty or missing. Specs can silently contradict existing architecture (prior ADRs, established patterns, conflicting schemas) in ways that only surface when you read project context files.
134
+
135
+ Re-review the spec with CONTEXTUAL verification. You MUST use your environment tools to read real files. Do NOT hallucinate commands or output. You MUST:
136
+ 1. Read ai-specs/specs/base-standards.mdc, backend-standards.mdc, frontend-standards.mdc
137
+ 2. Read docs/project_notes/key_facts.md and decisions.md
138
+ 3. Grep the workspace for existing similar features to check for collision or duplication
139
+ 4. Verify proposed fields/types/enums don't already exist under different names in shared schemas
140
+ 5. List the files you opened at the END of your review, with real observed facts for each
141
+
142
+ Look for: contradictions with prior ADRs, collisions with existing APIs, duplication of concepts that already exist under different names, spec vocabulary that doesn't match the project's existing terminology.
143
+ REPROMPT
144
+
145
+ # Example for Gemini:
146
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | gemini > "$REVIEW_DIR/gemini_reprompted.txt" 2>&1
147
+ # Example for Codex:
148
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | codex exec - > "$REVIEW_DIR/codex_reprompted.txt" 2>&1
149
+ ```
150
+
151
+ Merge the re-prompted findings. See `.claude/skills/development-workflow/references/cross-model-review.md` for calibration notes on reviewer patterns.
152
+
73
153
  ### Path B: One CLI available
74
154
 
75
155
  ```bash
@@ -0,0 +1,86 @@
1
+ # Cross-Model Review — Calibration Notes
2
+
3
+ Reference for interpreting cross-model review output during spec/plan review (Steps 0 and 2 of the workflow).
4
+
5
+ ## What cross-model review is for
6
+
7
+ Cross-model review runs the same spec or plan through TWO independent AI models (typically Codex CLI + Gemini CLI, or Codex + Claude). The goal is to surface issues that a single model would miss — not because the models are bad, but because different models have different failure modes.
8
+
9
+ Key insight: **two reviewers catching the same issue is strong signal. One reviewer catching an issue the other missed is also valuable — but you need to know *why* one missed it to calibrate correctly.**
10
+
11
+ ## Reviewer patterns observed in practice
12
+
13
+ These patterns are not absolute — they're tendencies observed over many reviews. Use them to calibrate expectations, not to dismiss findings from any model.
14
+
15
+ ### Codex CLI
16
+
17
+ - **Tends to be agentic** — runs shell commands (`rg`, `sed`, `find`, file reads) during review
18
+ - **Primary bug-finder** — catches mechanical mismatches (wrong file paths, stale type references, primary key type mismatches, dangling references in shared packages)
19
+ - **Cross-references plan claims against code** — high empirical rigor
20
+ - **Weakness**: can produce long, exhaustive output that buries the highest-severity findings in noise
21
+ - **Typical severity distribution for Standard/Complex plans**: 2-4 M1/M2 findings, 3-6 M3 suggestions
22
+
23
+ ### Gemini CLI
24
+
25
+ - **Tends to be text-aware but less agentic** — reads the spec/plan and project standards, but may not grep the actual code
26
+ - **Primary standards-compliance checker** — catches inconsistencies between the plan and documented standards (base-standards.mdc, decisions.md)
27
+ - **Reads project context** via the `instructions` field of `.gemini/settings.json` — references `ai-specs/specs/`, `.gemini/agents/`, standards
28
+ - **Weakness**: can approve plans that look internally consistent but have mechanical bugs only visible through empirical verification
29
+ - **Typical severity distribution for Standard/Complex plans**: 1-2 M2 findings, 2-4 M3 suggestions
30
+
31
+ ### Claude CLI (when used as third reviewer)
32
+
33
+ - **Tends to be analytical** — reasons through scope, ordering, edge cases
34
+ - **Primary scope-and-structure checker** — catches over-engineering, out-of-scope additions, feature creep
35
+ - **Weakness**: similar to Gemini, less empirical by default
36
+
37
+ ## What the calibration means for you
38
+
39
+ ### When reviews agree
40
+
41
+ If both reviewers flag the same finding, weight is high. Address first.
42
+
43
+ ### When reviews disagree
44
+
45
+ Don't arbitrate from authority — arbitrate from evidence. If Codex cites `packages/api/prisma/schema.prisma:323` showing `id String @db.Uuid` and Gemini didn't read that file, Codex's finding carries the empirical weight. Resolve in Codex's direction.
46
+
47
+ If the disagreement is about scope or over-engineering (e.g., Codex says "add caching layer" and Gemini says "out of scope for this feature"), read both rationales carefully — this is where different models genuinely produce different takes. Lean toward YAGNI by default.
48
+
49
+ ### When reviews are asymmetric (one light, one heavy)
50
+
51
+ Asymmetry is the most important signal. If Codex produces 3 M1 blockers and Gemini produces 0 M1 + 2 M3, **do NOT** conclude "Gemini approved so half of Codex's findings are wrong". Conclude: **"Gemini did a text-only review and missed the empirical bugs"**. The `review-plan` command includes an automated asymmetry check that re-prompts the light reviewer with stricter empirical instructions. Use it.
52
+
53
+ ### When both reviews are light
54
+
55
+ If both produce 0 M1/M2 findings AND both produce empty "Files read during review" sections, the review is text-only. This is acceptable for trivial changes (one-line fixes, typo corrections) but NOT for Standard/Complex features. For non-trivial work, re-run the review with both reviewers and explicitly invoke the empirical verification checklist from the `review-plan` / `review-spec` commands.
56
+
57
+ ## Historical calibration data
58
+
59
+ These are real examples from this project's history — add more as patterns emerge.
60
+
61
+ ### F-UX-B Plan review (2026-04-13)
62
+
63
+ **Gemini** produced 48 lines, 2 M3 + 1 P2 findings, verdict APPROVE WITH CHANGES. Cited `ai-specs/specs/base-standards.mdc` section "5. Implementation Workflow" — demonstrably read project context.
64
+
65
+ **Codex** produced 829 lines, 3 M1 + 1 M2 + 2 M3, verdict REJECT. Ran `rg` and `sed` during review. Cited `packages/shared/src/schemas/standardPortion.ts:1-36`, `enums.ts:18-25`, `packages/api/prisma/schema.prisma:323`, `.gemini/agents/backend-planner.md:1-34`.
66
+
67
+ **Codex M1s were real bugs**: helper fallback produced "Media_racion" instead of "Media ración" (underscore bug); shared schema drift (PortionContextSchema existed in 2 places, one would become dangling); dishId validator expected positive int but DB column was uuid.
68
+
69
+ **Takeaway**: Gemini's context-loading fix (sdd-devflow v0.16.7) worked — it read standards correctly. But context loading ≠ empirical verification. Codex's agentic habit of running commands against the code caught bugs Gemini's text review missed. Both reviewers are needed and complementary.
70
+
71
+ ## When to override the calibration
72
+
73
+ These are tendencies, not certainties. Override when:
74
+
75
+ - A "light" reviewer produces a specific, high-quality finding with cited evidence — don't dismiss it
76
+ - A "heavy" reviewer produces verbose output with few actionable findings — don't over-weight length
77
+ - A new model version changes behavior significantly — update this file with the new observation
78
+
79
+ The calibration is a tool, not a rule. Always read both reviews carefully.
80
+
81
+ ## Related files
82
+
83
+ - `.claude/commands/review-spec.md` — spec review command with empirical checklist
84
+ - `.claude/commands/review-plan.md` — plan review command with empirical checklist + asymmetry meta-check
85
+ - `.claude/agents/backend-planner.md` — planner with Pre-Emission Verification section
86
+ - `.claude/agents/frontend-planner.md` — planner with Pre-Emission Verification section
@@ -28,6 +28,29 @@ Generate a detailed Implementation Plan and write it into the ticket's `## Imple
28
28
  - Implementation Order (Domain > Application > Infrastructure > Presentation > Tests)
29
29
  - Testing Strategy
30
30
  - Key Patterns
31
+ - **Verification commands run** (see Pre-Emission Verification below)
32
+
33
+ ## Pre-Emission Verification (MANDATORY)
34
+
35
+ Before emitting the final plan, verify every structural claim empirically against the actual code. Plans emitted without verification produce mechanical bugs (wrong paths, stale types, obsolete schemas, wrong PK types) that block TDD.
36
+
37
+ **Do NOT hallucinate**: You MUST use your environment tools to execute the checks against the real code. Do NOT fabricate commands or output. An empty `Verification commands run` subsection is better than a fake one — the downstream review-plan command flags empty sections for stricter review, not as failure.
38
+
39
+ Required checks:
40
+
41
+ 1. Grep or read every file you cite in `Files to Modify`, `Files to Create`, `Key Patterns`, `Existing Code to Reuse` — confirm it exists at that path
42
+ 2. Grep exported symbol names (types, enums, validation schemas) across the workspace. Shared schemas often live in 2-3 places; one rewrite leaves dangling references if the others aren't cleaned in the same commit
43
+ 3. Read `prisma/schema.prisma` (or equivalent) before asserting primary key types. Validators MUST match the DB column type (uuid vs int vs cuid). Do NOT assume
44
+ 4. Before proposing to DROP an enum or table, grep workspace for all references AND confirm the table is unused or add a pre-flight safety check (SELECT COUNT + pg_dump backup)
45
+
46
+ Append to the ticket a final subsection `### Verification commands run`. Use this exact 3-field format per entry: `<command> → <observed fact> → <impact on plan>`. Every entry must have all three fields — a bare command without an observed fact is not verification. Example:
47
+
48
+ - `Grep: "PortionContext" in packages/` → 2 hits (`enums.ts:18`, `standardPortion.ts:4`) → both must be deleted in the migration commit
49
+ - `Read: packages/api/prisma/schema.prisma:323` → `dishId String @db.Uuid` (not int) → validator uses `z.string().uuid()`
50
+
51
+ If the subsection is empty or missing, prepend the plan with `⚠ This plan is text-only and has not been empirically verified. Cross-model reviewers MUST run empirical checks.`
52
+
53
+ The `review-plan` command reads this subsection to calibrate reviewer effort. Empty = stricter review.
31
54
 
32
55
  ## Rules
33
56
 
@@ -29,6 +29,30 @@ Generate a detailed Implementation Plan and write it into the ticket's `## Imple
29
29
  - Implementation Order (Types > Services > Stores > Components > Pages > Tests)
30
30
  - Testing Strategy
31
31
  - Key Patterns
32
+ - **Verification commands run** (see Pre-Emission Verification below)
33
+
34
+ ## Pre-Emission Verification (MANDATORY)
35
+
36
+ Before emitting the final plan, verify every structural claim empirically against the actual code. Plans emitted without verification produce mechanical bugs (wrong component paths, stale prop types, duplicated helpers between packages, invented API fields) that block TDD.
37
+
38
+ **Do NOT hallucinate**: You MUST use your environment tools to execute the checks. Do NOT fabricate commands or output. An empty `Verification commands run` subsection is better than a fake one.
39
+
40
+ Required checks:
41
+
42
+ 1. Grep or read every file you cite — confirm path exists
43
+ 2. Before proposing an inline helper, grep `packages/shared/` for an existing equivalent. Helpers used by BOTH web and bot MUST live in `shared/` and be imported; do NOT duplicate inline per package
44
+ 3. Read the shared validation schema for any API response the frontend renders. Frontend MUST match the backend contract, not invent fields
45
+ 4. Verify CSS tokens and component primitives exist before proposing new classes. Design tokens live in `tailwind.config.ts` or `globals.css`, not in component files
46
+ 5. Verify accessibility semantics (`aria-*`, role, labelled-by) against existing accessible components in the codebase
47
+
48
+ Append to the ticket a final subsection `### Verification commands run`. Use this exact 3-field format per entry: `<command> → <observed fact> → <impact on plan>`. Every entry must have all three fields. Example:
49
+
50
+ - `Grep: "formatPortionTermLabel" in packages/` → helper exists in `packages/shared/src/portion/portionLabel.ts:32` → import from `@foodxplorer/shared`, do not duplicate
51
+ - `Read: packages/shared/src/schemas/estimate.ts:180-205` → `portionAssumption` is optional with `source: "per_dish" | "generic"` → component handles both branches
52
+
53
+ If empty or missing, prepend plan with `⚠ This plan is text-only and has not been empirically verified. Cross-model reviewers MUST run empirical checks.`
54
+
55
+ The `review-plan` command reads this subsection to calibrate reviewer effort. Empty = stricter review.
32
56
 
33
57
  ## Rules
34
58
 
@@ -28,16 +28,47 @@ mkdir -p "$REVIEW_DIR"
28
28
  cat > "$REVIEW_DIR/input.txt" <<'CRITERIA'
29
29
  You are reviewing an Implementation Plan for a software feature. Your job is to find real problems, not praise. But if the plan is solid, say APPROVED — do not manufacture issues that are not there.
30
30
 
31
- Below you will find the Spec (what to build) and the Implementation Plan (how to build it). Review the plan and report:
32
- 1. Errors — Wrong assumptions, impossible steps, missing dependencies, plan contradicts the spec
31
+ ## CRITICAL: This is an EMPIRICAL review, not a text-only review
32
+
33
+ Before reporting findings, you MUST verify structural claims against the actual code. Plans often have subtle mechanical bugs (wrong paths, stale types, obsolete schemas, incorrect primary key types, dangling references in shared packages) that only surface when you verify empirically.
34
+
35
+ Required empirical checks:
36
+
37
+ 1. Read every file path the plan cites — confirm it exists
38
+ 2. Grep for every exported symbol the plan claims to reuse, modify, or delete (types, enums, Zod schemas, functions). Shared symbols often live in 2-3 places; one rewrite leaves dangling references unless all are cleaned in the same commit
39
+ 3. Verify primary and foreign key types by reading the actual schema file (`prisma/schema.prisma` or equivalent). Validators MUST match the DB column type (uuid vs int vs cuid)
40
+ 4. For any DROP / DELETE / CASCADE, grep ALL references to the dropped symbol — the plan must clean them atomically
41
+ 5. If the plan cites "Existing Code to Reuse", read those files to confirm they actually provide what the plan claims
42
+
43
+ Do NOT rely on the plan's assertions alone. Do NOT assume file paths, types, or schemas are correct without verifying.
44
+
45
+ **Ticket-level signal**: Look for a `### Verification commands run` subsection inside `## Implementation Plan`. If present and populated, the planner self-verified; focus your review on higher-order issues. If missing or empty, the plan is text-only and you MUST run the empirical checks above with extra rigor.
46
+
47
+ ## Review criteria
48
+
49
+ Below you will find the Spec and the Implementation Plan. Review the plan and report:
50
+
51
+ 1. Errors — Wrong assumptions, impossible steps, missing dependencies, plan contradicts the spec, **path/type/schema mismatches against the actual code (verified empirically)**
33
52
  2. Gaps — Missing error handling, edge cases, rollback scenarios
34
- 3. Vagueness — Steps too ambiguous to implement with TDD (no clear input/output)
53
+ 3. Vagueness — Steps too ambiguous to implement with TDD
35
54
  4. Over-engineering — Unnecessary abstractions, premature optimization
36
55
  5. Order issues — Steps that depend on later steps
37
56
 
38
- For each issue, state: [CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix.
57
+ For each issue: `[CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix — cite file:line when applicable`.
58
+
59
+ ## Output format — mandatory
60
+
61
+ At the END of your review:
62
+
63
+ ### Files read during review
64
+ (list every file you opened, with brief note of what each confirmed or contradicted)
65
+
66
+ ### Commands executed
67
+ (list every grep / find / sed / read command you ran, with the pattern)
68
+
69
+ If BOTH are empty, prepend: `⚠ TEXT-ONLY REVIEW — no empirical verification performed.`
39
70
 
40
- End with: VERDICT: APPROVED | VERDICT: REVISE (if any CRITICAL or 2+ IMPORTANT issues)
71
+ End with: `VERDICT: APPROVED` | `VERDICT: REVISE` (if any CRITICAL or 2+ IMPORTANT issues)
41
72
 
42
73
  ---
43
74
  SPEC AND PLAN:
@@ -65,6 +96,57 @@ echo "=== CODEX REVIEW ===" && cat "$REVIEW_DIR/codex.txt"
65
96
 
66
97
  Consolidate findings — issues flagged by both models independently carry higher weight. Deduplicate and prioritize. Ignore output from any reviewer that failed.
67
98
 
99
+ #### Meta-check: reviewer empirical asymmetry
100
+
101
+ After both reviews are in, check for empirical asymmetry. One reviewer may be text-only while the other runs empirical checks — re-prompt the light reviewer if so.
102
+
103
+ **Primary check — qualitative, agent-driven**: Read both reviews yourself. If one review has a populated `### Files read during review` section with real file paths and line numbers, and the other review has an empty or missing section, the light review is incomplete. This is the authoritative check.
104
+
105
+ **Secondary check — shell heuristic**: The block below flags **missing empirical evidence only**, NOT low finding counts (a clean plan legitimately produces zero findings).
106
+
107
+ ```bash
108
+ count_empirical() {
109
+ local file="$1"
110
+ [ -r "$file" ] || { echo 0; return; }
111
+ awk '
112
+ /^### Files read during review$/ { in_files=1; in_cmds=0; next }
113
+ /^### Commands executed$/ { in_files=0; in_cmds=1; next }
114
+ /^### / { in_files=0; in_cmds=0 }
115
+ (in_files || in_cmds) && NF > 0 && $0 !~ /^\(list/ { n++ }
116
+ END { print n+0 }
117
+ ' "$file"
118
+ }
119
+
120
+ CLAUDE_EMPIRICAL=$(count_empirical "$REVIEW_DIR/claude.txt")
121
+ CODEX_EMPIRICAL=$(count_empirical "$REVIEW_DIR/codex.txt")
122
+
123
+ echo "Empirical evidence — Claude: $CLAUDE_EMPIRICAL entries, Codex: $CODEX_EMPIRICAL entries"
124
+ ```
125
+
126
+ **Trigger re-prompt ONLY when one reviewer has zero empirical entries**. If so, write a concrete reprompt file and re-run the light reviewer. Use concrete shell variables, not literal `<angle>` placeholders:
127
+
128
+ ```bash
129
+ cat > "$REVIEW_DIR/reprompt.txt" <<'REPROMPT'
130
+ Your previous review was text-only: the `### Files read during review` section was empty or missing. Plans frequently have subtle mechanical bugs (wrong file paths, stale type references, primary key type mismatches, dangling shared-package references) that only appear with empirical verification.
131
+
132
+ Re-review the plan with EMPIRICAL verification. You MUST use your environment tools to read and grep real files. Do NOT hallucinate commands or output. You MUST:
133
+ 1. Read every file path the plan cites and confirm it exists
134
+ 2. Grep the workspace for every type/enum/schema/function the plan references
135
+ 3. Verify primary and foreign key types against the actual schema file
136
+ 4. For any DROP/DELETE/CASCADE, grep ALL references to confirm atomic cleanup
137
+ 5. List the files you opened and the commands you ran at the END of your review, with real observed facts for each (not just the command string)
138
+
139
+ Look for: path mismatches, stale type references, primary key type mismatches, and dangling references in shared packages that need cleanup in the same commit.
140
+ REPROMPT
141
+
142
+ # Example for Claude:
143
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | claude --print > "$REVIEW_DIR/claude_reprompted.txt" 2>&1
144
+ # Example for Codex:
145
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | codex exec - > "$REVIEW_DIR/codex_reprompted.txt" 2>&1
146
+ ```
147
+
148
+ Merge the re-prompted findings into your consolidation. See `.gemini/skills/development-workflow/references/cross-model-review.md` for calibration notes on reviewer patterns.
149
+
68
150
  #### Path B: One CLI available
69
151
 
70
152
  ```bash
@@ -28,18 +28,43 @@ mkdir -p "$REVIEW_DIR"
28
28
  cat > "$REVIEW_DIR/input.txt" <<'CRITERIA'
29
29
  You are reviewing a Feature Specification for a software feature. Your job is to find real problems in the REQUIREMENTS — not the implementation (there is no implementation yet). If the spec is solid, say APPROVED — do not manufacture issues.
30
30
 
31
- Below you will find the Spec (what to build), the Acceptance Criteria, and project context (architecture, decisions). Review the spec and report:
31
+ ## This is a CONTEXTUAL review — verify consistency against the codebase
32
+
33
+ A spec can look internally consistent but contradict existing architecture (prior ADRs, established patterns, conflicting schemas). Text-only review misses those. You MUST read project context files:
34
+
35
+ 1. Read the standards referenced in the spec (`base-standards.mdc`, `backend-standards.mdc`, `frontend-standards.mdc`) — confirm no contradiction
36
+ 2. Read `docs/project_notes/key_facts.md` and `decisions.md` — confirm no regression against prior ADRs
37
+ 3. Read `docs/specs/api-spec.yaml` — confirm new endpoints don't collide with existing ones and follow the same conventions
38
+ 4. Grep for existing similar features — if the spec proposes "add metrics for X", grep for existing metrics implementations to check for collision or duplication
39
+ 5. For any field/type/enum proposed, grep shared schemas to see if the concept already exists under a different name
40
+
41
+ ## Review criteria
42
+
43
+ Below you will find the Spec (what to build), the Acceptance Criteria, and project context. Review the spec and report:
44
+
32
45
  1. Completeness — Are all user needs covered? Missing requirements?
33
46
  2. Ambiguity — Are requirements clear enough to plan and implement with TDD?
34
47
  3. Edge cases — Are failure modes, boundary conditions, and error responses specified?
35
48
  4. API contract — Are endpoints, fields, types, status codes well-defined? (if applicable)
36
49
  5. Scope — Is the spec doing too much or too little for one feature?
37
- 6. Consistency — Does the spec conflict with existing architecture, patterns, or decisions?
50
+ 6. Consistency — Does the spec conflict with existing architecture, patterns, or prior ADRs? **(verify by reading the referenced files, not by inference)**
38
51
  7. Testability — Can each acceptance criterion be verified with an automated test?
39
52
 
40
- For each issue, state: [CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix.
53
+ For each issue: `[CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix — cite file:line when the issue involves conflict with existing code`.
54
+
55
+ ## Output format — mandatory
41
56
 
42
- End with: VERDICT: APPROVED | VERDICT: REVISE (if any CRITICAL or 2+ IMPORTANT issues)
57
+ At the END of your review:
58
+
59
+ ### Files read during review
60
+ (list every file you opened, with brief note of what each confirmed or contradicted)
61
+
62
+ ### Commands executed
63
+ (list every grep / find / sed / read command you ran, with the pattern)
64
+
65
+ If BOTH are empty, prepend: `⚠ TEXT-ONLY REVIEW — no empirical verification of architectural consistency.`
66
+
67
+ End with: `VERDICT: APPROVED` | `VERDICT: REVISE` (if any CRITICAL or 2+ IMPORTANT issues)
43
68
 
44
69
  ---
45
70
  SPEC AND ACCEPTANCE CRITERIA:
@@ -72,6 +97,57 @@ echo "=== CODEX REVIEW ===" && cat "$REVIEW_DIR/codex.txt"
72
97
 
73
98
  Consolidate findings — issues flagged by both models independently carry higher weight. Deduplicate and prioritize. Ignore output from any reviewer that failed.
74
99
 
100
+ #### Meta-check: reviewer empirical asymmetry
101
+
102
+ After both reviews are in, check for empirical asymmetry.
103
+
104
+ **Primary check — qualitative, agent-driven**: Read both reviews yourself. If one review cites actual files from the project (standards, decisions, existing schemas) and the other contains only generic commentary, the light review is incomplete. This is the authoritative check.
105
+
106
+ **Secondary check — shell heuristic**: flags **missing empirical evidence only**, not low finding counts.
107
+
108
+ ```bash
109
+ count_empirical() {
110
+ local file="$1"
111
+ [ -r "$file" ] || { echo 0; return; }
112
+ awk '
113
+ /^### Files read during review$/ { in_files=1; in_cmds=0; next }
114
+ /^### Commands executed$/ { in_files=0; in_cmds=1; next }
115
+ /^### / { in_files=0; in_cmds=0 }
116
+ (in_files || in_cmds) && NF > 0 && $0 !~ /^\(list/ { n++ }
117
+ END { print n+0 }
118
+ ' "$file"
119
+ }
120
+
121
+ CLAUDE_EMPIRICAL=$(count_empirical "$REVIEW_DIR/claude.txt")
122
+ CODEX_EMPIRICAL=$(count_empirical "$REVIEW_DIR/codex.txt")
123
+
124
+ echo "Empirical evidence — Claude: $CLAUDE_EMPIRICAL entries, Codex: $CODEX_EMPIRICAL entries"
125
+ ```
126
+
127
+ **Trigger re-prompt ONLY when one reviewer has zero empirical entries**. If so:
128
+
129
+ ```bash
130
+ cat > "$REVIEW_DIR/reprompt.txt" <<'REPROMPT'
131
+ Your previous review was text-only: the `### Files read during review` section was empty or missing. Specs can silently contradict existing architecture in ways that only surface when you read project context files.
132
+
133
+ Re-review the spec with CONTEXTUAL verification. You MUST use your environment tools to read real files. Do NOT hallucinate commands or output. You MUST:
134
+ 1. Read ai-specs/specs/base-standards.mdc, backend-standards.mdc, frontend-standards.mdc
135
+ 2. Read docs/project_notes/key_facts.md and decisions.md
136
+ 3. Grep the workspace for existing similar features to check for collision or duplication
137
+ 4. Verify proposed fields/types/enums don't already exist under different names in shared schemas
138
+ 5. List the files you opened at the END of your review, with real observed facts for each
139
+
140
+ Look for: contradictions with prior ADRs, collisions with existing APIs, duplication of concepts that already exist under different names, spec vocabulary that doesn't match the project's existing terminology.
141
+ REPROMPT
142
+
143
+ # Example for Claude:
144
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | claude --print > "$REVIEW_DIR/claude_reprompted.txt" 2>&1
145
+ # Example for Codex:
146
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | codex exec - > "$REVIEW_DIR/codex_reprompted.txt" 2>&1
147
+ ```
148
+
149
+ Merge the re-prompted findings. See `.gemini/skills/development-workflow/references/cross-model-review.md` for calibration notes.
150
+
75
151
  #### Path B: One CLI available
76
152
 
77
153
  ```bash
@@ -0,0 +1,62 @@
1
+ # Cross-Model Review — Calibration Notes
2
+
3
+ Reference for interpreting cross-model review output during spec/plan review.
4
+
5
+ ## Purpose
6
+
7
+ Cross-model review runs the same spec or plan through two independent AI models to surface issues a single model would miss. Different models have different failure modes — that's the value.
8
+
9
+ ## Reviewer patterns observed in practice
10
+
11
+ These are tendencies, not absolutes; use them to calibrate expectations.
12
+
13
+ ### Codex CLI
14
+ - Agentic: runs `rg`, `sed`, file reads during review
15
+ - Primary bug-finder: catches path mismatches, stale types, wrong primary key types, dangling shared-package references
16
+ - Weakness: verbose output that can bury high-severity findings
17
+
18
+ ### Gemini CLI
19
+ - Text-aware but less agentic: reads spec/plan and standards, may not grep code
20
+ - Primary standards-compliance checker: catches contradictions with ADRs, existing patterns
21
+ - Reads project context via `instructions` field in `.gemini/settings.json`
22
+ - Weakness: can approve plans with mechanical bugs only visible empirically
23
+
24
+ ### Claude CLI (when used as reviewer)
25
+ - Analytical: scope, ordering, edge cases, over-engineering
26
+ - Primary scope checker: catches feature creep, YAGNI violations
27
+ - Weakness: like Gemini, it is less empirical by default
28
+
29
+ ## Interpreting reviews
30
+
31
+ **When reviews agree**: strong signal. Address first.
32
+
33
+ **When reviews disagree**: arbitrate from evidence, not authority. If one reviewer cites a specific file:line and the other never read that file, the cited finding carries empirical weight. For scope/over-engineering disagreements, lean YAGNI.
34
+
35
+ **When reviews are asymmetric**: if one reviewer finds 3 M1 blockers and the other finds 0 M1, do NOT conclude half the findings are wrong. Conclude the light reviewer was text-only and missed empirical bugs. The `review-plan` command auto-detects asymmetry and re-prompts the light reviewer.
36
+
37
+ **When both reviews are light**: acceptable for trivial changes only. For Standard/Complex features, re-run both reviewers with the empirical verification checklist explicitly invoked.
38
+
39
+ ## Historical calibration data
40
+
41
+ ### F-UX-B Plan review (2026-04-13, foodXPlorer)
42
+
43
+ - Gemini: 48 lines, 2 M3 + 1 P2, verdict APPROVE WITH CHANGES. Cited base-standards.mdc section 5 — demonstrably read project context.
44
+ - Codex: 829 lines, 3 M1 + 1 M2 + 2 M3, verdict REJECT. Ran `rg`/`sed`, cited specific file:lines across 4+ files.
45
+ - Codex M1s were all real bugs verified empirically by the agent before applying fixes.
46
+
47
+ Takeaway: context loading (fixed in sdd-devflow v0.16.7) ≠ empirical verification. Both reviewers are complementary.
48
+
49
+ ## When to override the calibration
50
+
51
+ - Light reviewer produces specific high-quality finding with cited evidence — don't dismiss
52
+ - Heavy reviewer produces verbose output with few actionable findings — don't over-weight length
53
+ - New model version changes behavior — update this file
54
+
55
+ The calibration is a tool, not a rule. Read both reviews carefully.
56
+
57
+ ## Related files
58
+
59
+ - `.gemini/commands/review-spec-instructions.md`
60
+ - `.gemini/commands/review-plan-instructions.md`
61
+ - `.gemini/agents/backend-planner.md`
62
+ - `.gemini/agents/frontend-planner.md`