create-sdd-project 0.16.7 → 0.16.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/doctor.js CHANGED
@@ -73,6 +73,9 @@ function runDoctor(cwd) {
73
73
  // 12. Gemini Settings Format
74
74
  results.push(checkGeminiSettings(cwd, aiTools));
75
75
 
76
+ // 13. Gemini TOML Commands Format
77
+ results.push(checkGeminiCommands(cwd, aiTools));
78
+
76
79
  return results;
77
80
  }
78
81
 
@@ -613,6 +616,330 @@ function checkGeminiSettings(cwd, aiTools) {
613
616
  };
614
617
  }
615
618
 
619
/**
 * Validate a .gemini/commands/*.toml file using a strict subset of TOML
 * grammar sufficient for our narrow use case.
 *
 * Scope: the templates we ship only use two top-level keys (`description`,
 * `prompt`) with string values — standard quoted (`"..."`), single-quoted
 * literal (`'...'`), or triple-quoted multiline (`"""..."""` / `'''...'''`).
 * This validator enforces that subset strictly:
 *
 * - Each non-blank, non-comment line must be either a top-level assignment
 *   `key = <string-literal>` or the start of a multiline string
 * - Top-level keys must match `[A-Za-z][A-Za-z0-9_-]*` (bare keys only —
 *   quoted keys like `"prompt" = "x"` are flagged as invalid; our templates
 *   never use them)
 * - Duplicate top-level keys are rejected (TOML spec forbids them)
 * - Strings must be properly closed on the same line (except triple-quoted,
 *   which can span lines)
 * - Trailing content after a closed string is rejected (only a `#` comment
 *   is allowed after the value)
 * - Values that are not string literals (numbers, booleans, arrays, etc.)
 *   are flagged as non-string
 * - Assignments inside `[table]` or `[[array-table]]` sections are not
 *   considered top-level and the scan stops there (our templates don't use
 *   tables)
 *
 * This validator is intentionally stricter than full TOML and looser in a
 * few edge cases (e.g., escape sequences inside basic strings are accepted
 * as `\\.`). The goal is to catch files that Gemini CLI's FileCommandLoader
 * would silently skip — not to be a general-purpose TOML parser. If our
 * templates ever need richer TOML features, upgrade to `@iarna/toml` as
 * a runtime dependency at that point.
 *
 * @param {string} content - Raw TOML file text.
 * @returns {{ ok: true, keys: Object<string, 'string' | 'non-string'> }
 *         | { ok: false, error: string, line: number }}
 *   On success, `keys` maps each top-level key to whether its value was a
 *   string literal. On failure, `error` is a human-readable message and
 *   `line` is the 1-based line number where the problem was found.
 */
function validateTomlCommandFile(content) {
  // Null-prototype map: bare keys like `constructor` or `toString` match the
  // key regex, and with a plain `{}` the inherited Object.prototype members
  // would make `keysSeen[key] !== undefined` falsely report a duplicate.
  const keysSeen = Object.create(null);
  const lines = content.split(/\r?\n|\r/);
  let i = 0;

  // Shared scanner for triple-quoted strings ("""...""" and '''...''').
  // Returns null on success (having advanced `i` past the string), or an
  // { ok: false, ... } error object to propagate.
  function consumeTriple(delim, label, value, key) {
    const after = value.slice(3);
    const closeIdx = after.indexOf(delim);
    if (closeIdx !== -1) {
      // Closed on same line — only an optional comment may follow.
      const trailing = after.slice(closeIdx + 3).trim();
      if (trailing !== '' && !trailing.startsWith('#')) {
        return {
          ok: false,
          error: `line ${i + 1}: trailing content after ${delim} close: ${trailing.slice(0, 40)}`,
          line: i + 1,
        };
      }
      keysSeen[key] = 'string';
      i++;
      return null;
    }
    // Scan forward for the closing delimiter on a later line.
    let j = i + 1;
    while (j < lines.length) {
      const idx2 = lines[j].indexOf(delim);
      if (idx2 !== -1) {
        const trailing2 = lines[j].slice(idx2 + 3).trim();
        if (trailing2 !== '' && !trailing2.startsWith('#')) {
          return {
            ok: false,
            error: `line ${j + 1}: trailing content after ${delim} close: ${trailing2.slice(0, 40)}`,
            line: j + 1,
          };
        }
        keysSeen[key] = 'string';
        i = j + 1;
        return null;
      }
      j++;
    }
    return {
      ok: false,
      error: `line ${i + 1}: unterminated triple-quoted ${label} string (${delim} never closed)`,
      line: i + 1,
    };
  }

  while (i < lines.length) {
    const raw = lines[i];
    const trimmed = raw.trim();

    // Blank line or full-line comment
    if (trimmed === '' || trimmed.startsWith('#')) {
      i++;
      continue;
    }

    // Table / array-table — end of top-level scope, stop scanning
    if (/^\[\[?/.test(trimmed)) {
      break;
    }

    // Top-level assignment: bare key = value
    const keyMatch = trimmed.match(/^([A-Za-z][A-Za-z0-9_-]*)\s*=\s*(.*)$/);
    if (!keyMatch) {
      return {
        ok: false,
        error: `line ${i + 1}: not a valid top-level assignment: ${trimmed.slice(0, 60)}`,
        line: i + 1,
      };
    }

    const key = keyMatch[1];
    const value = keyMatch[2];

    if (keysSeen[key] !== undefined) {
      return {
        ok: false,
        error: `line ${i + 1}: duplicate top-level key '${key}'`,
        line: i + 1,
      };
    }

    // Multi-line strings — must be checked before the single-line forms,
    // since `"""` also starts with `"` (and `'''` with `'`).
    if (value.startsWith('"""')) {
      const err = consumeTriple('"""', 'basic', value, key);
      if (err) return err;
      continue;
    }
    if (value.startsWith("'''")) {
      const err = consumeTriple("'''", 'literal', value, key);
      if (err) return err;
      continue;
    }

    // Basic string: "..." with standard escapes; must close on same line
    // and allow only a trailing comment after the closing quote.
    if (value.startsWith('"')) {
      const basicMatch = value.match(/^"((?:[^"\\]|\\.)*)"(?:\s*(?:#.*)?)?$/);
      if (!basicMatch) {
        return {
          ok: false,
          error: `line ${i + 1}: invalid basic string value (unterminated or trailing content): ${value.slice(0, 60)}`,
          line: i + 1,
        };
      }
      keysSeen[key] = 'string';
      i++;
      continue;
    }

    // Literal string: '...' with no escapes; must close on same line
    if (value.startsWith("'")) {
      const litMatch = value.match(/^'([^']*)'(?:\s*(?:#.*)?)?$/);
      if (!litMatch) {
        return {
          ok: false,
          error: `line ${i + 1}: invalid literal string value (unterminated or trailing content): ${value.slice(0, 60)}`,
          line: i + 1,
        };
      }
      keysSeen[key] = 'string';
      i++;
      continue;
    }

    // Any other value is not a string literal (int, bool, array, table, etc.)
    keysSeen[key] = 'non-string';
    i++;
  }

  // Copy into a normal object so callers get ordinary Object.prototype
  // behavior on the returned `keys` map.
  return { ok: true, keys: { ...keysSeen } };
}
817
+
818
/**
 * Doctor check #13: validate every `.gemini/commands/*.toml` file against the
 * strict TOML subset accepted by `validateTomlCommandFile`, mirroring Gemini
 * CLI's FileCommandLoader schema:
 *   z.object({ prompt: z.string(), description: z.string().optional() })
 *
 * Skipped (PASS) for Claude-only projects. Warns when the commands directory
 * or any .toml files are missing; fails when any file is empty, invalid TOML,
 * or lacks a string `prompt`.
 *
 * @param {string} cwd - Project root to inspect.
 * @param {string} aiTools - Configured AI tooling ('claude' disables check).
 * @returns {{ status: *, message: string, details: string[] }}
 */
function checkGeminiCommands(cwd, aiTools) {
  if (aiTools === 'claude') {
    return {
      status: PASS,
      message: 'Gemini commands: N/A (Claude only)',
      details: [],
    };
  }

  const commandsDir = path.join(cwd, '.gemini', 'commands');
  if (!fs.existsSync(commandsDir)) {
    return {
      status: WARN,
      message: 'Gemini commands: .gemini/commands/ missing',
      details: ['Run: npx create-sdd-project --upgrade to recreate template commands'],
    };
  }

  // readdirSync with withFileTypes so we can filter symlinks before reading.
  // Symlinks in .gemini/commands/ would make doctor read arbitrary files on
  // the user's machine — low severity in a local CLI, but worth guarding.
  const tomlEntries = fs
    .readdirSync(commandsDir, { withFileTypes: true })
    .filter((dirent) => dirent.name.endsWith('.toml'))
    .sort((a, b) => a.name.localeCompare(b.name));

  if (tomlEntries.length === 0) {
    return {
      status: WARN,
      message: 'Gemini commands: no .toml files in .gemini/commands/',
      details: ['Gemini CLI slash commands require .toml files. Run: npx create-sdd-project --upgrade'],
    };
  }

  const problems = [];
  let okCount = 0;

  for (const dirent of tomlEntries) {
    const name = dirent.name;
    const fullPath = path.join(commandsDir, name);

    // Reject symlinks (Dirent can lie about isFile() when followed; use lstat).
    let stats;
    try {
      stats = fs.lstatSync(fullPath);
    } catch (err) {
      problems.push(`${name}: cannot lstat (${err.code || err.message})`);
      continue;
    }
    if (stats.isSymbolicLink()) {
      problems.push(`${name}: is a symlink — refusing to follow (security). Delete and run --upgrade to restore template`);
      continue;
    }
    if (!stats.isFile()) {
      problems.push(`${name}: not a regular file`);
      continue;
    }

    let text;
    try {
      text = fs.readFileSync(fullPath, 'utf8');
    } catch (err) {
      problems.push(`${name}: cannot read (${err.code || err.message})`);
      continue;
    }

    if (text.trim() === '') {
      problems.push(`${name}: empty file (Gemini CLI will skip this command silently)`);
      continue;
    }

    // Validate using the strict grammar subset for our templates.
    // Gemini CLI's FileCommandLoader schema is:
    //   z.object({ prompt: z.string(), description: z.string().optional() })
    const parsed = validateTomlCommandFile(text);
    if (!parsed.ok) {
      problems.push(`${name}: ${parsed.error}`);
      continue;
    }

    const promptKind = parsed.keys.prompt;
    const descriptionKind = parsed.keys.description;

    if (promptKind === undefined) {
      problems.push(
        `${name}: missing required field 'prompt' (Gemini CLI will silently skip this command)`
      );
      continue;
    }
    if (promptKind !== 'string') {
      problems.push(
        `${name}: 'prompt' field must be a string (Gemini CLI requires z.string())`
      );
      continue;
    }
    if (descriptionKind !== undefined && descriptionKind !== 'string') {
      problems.push(
        `${name}: 'description' field is present but is not a string`
      );
      continue;
    }

    okCount++;
  }

  if (problems.length > 0) {
    return {
      status: FAIL,
      message: `Gemini commands: ${problems.length} invalid TOML file${problems.length > 1 ? 's' : ''}`,
      details: [
        ...problems,
        'Gemini CLI silently skips invalid TOML commands — they will not appear as slash commands in the UI.',
        'Run: npx create-sdd-project --upgrade to restore template commands.',
      ],
    };
  }

  return {
    status: PASS,
    message: `Gemini commands: ${okCount}/${tomlEntries.length} valid`,
    details: [],
  };
}
942
+
616
943
  module.exports = {
617
944
  runDoctor,
618
945
  printResults,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "create-sdd-project",
3
- "version": "0.16.7",
3
+ "version": "0.16.9",
4
4
  "description": "Create a new SDD DevFlow project with AI-assisted development workflow",
5
5
  "bin": {
6
6
  "create-sdd-project": "bin/cli.js"
@@ -56,6 +56,37 @@ Write the following sections into the ticket's `## Implementation Plan` section:
56
56
  - Specific patterns from the codebase to follow (with file references)
57
57
  - Any gotchas or constraints the developer should know
58
58
 
59
+ ## Pre-Emission Verification (MANDATORY)
60
+
61
+ Before writing the final plan, **verify every structural claim empirically against the actual code**. Planners that emit claims without verification produce plans with mechanical bugs (wrong paths, stale types, obsolete schemas, missing files, wrong primary key types) that block TDD and force re-planning.
62
+
63
+ **IMPORTANT — do NOT hallucinate verification**: You MUST use your environment tools (`Grep`, `Read`, `Bash`) to actually execute these checks against the real code. Do NOT write fake commands or fabricated output to satisfy the format. If you have not executed the check, do not list it. Leaving the `Verification commands run` subsection empty is better than fabricating it — the downstream review-plan command is configured to treat empty verification as a flag for stricter review, not as a failure.
64
+
65
+ For every item you intend to list under `Files to Modify`, `Files to Create`, `Key Patterns`, or `Existing Code to Reuse`:
66
+
67
+ 1. **Grep or read the referenced files** to confirm they exist at the path you cite
68
+ 2. **Verify types, enums, and validation schemas** mentioned match the current code. Use `Grep` on exported symbol names across the workspace — shared schemas often live in multiple packages, so one rewrite can leave dangling references
69
+ 3. **Verify primary keys, IDs, and foreign keys** by reading the ORM schema file (or equivalent) — don't assume `id` is a positive int when it's a `uuid`, and vice versa. Validator types MUST match the DB column type
70
+ 4. **Verify the current state of enums before proposing to drop or replace them** — enum types are often referenced in 2-3 places (TypeScript type, validation schema, ORM enum, DB column). ALL references must be cleaned in the SAME commit or the workspace breaks mid-migration
71
+ 5. **For any migration that DROPs a table or type**, confirm the table is either unused or its data has been backed up — add a pre-flight safety check to the plan
72
+
73
+ After finishing the plan, append a final subsection to the ticket:
74
+
75
+ ### Verification commands run
76
+
77
+ List every empirical check you executed using this format: `<command> → <observed fact> → <impact on plan>`. One line per check. **Every entry must have all three fields** — a bare command without an observed fact is not verification, it's cargo-culting.
78
+
79
+ Example format:
80
+
81
+ - `Grep: "PortionContext" in packages/` → 2 hits: `shared/src/schemas/enums.ts:18`, `shared/src/schemas/standardPortion.ts:4` → both must be deleted in the migration commit, listed under "Files to Modify"
82
+ - `Read: packages/api/prisma/schema.prisma:318-330` → confirmed `dishId String @db.Uuid` (not int) → Seed CSV validator must use `z.string().uuid()`, NOT `z.number().int()`
83
+ - `Grep: "formatPortionTermLabel" in packages/shared/` → helper does not yet exist → list under "Files to Create" for commit 1 of the TDD order
84
+ - (continue with every empirical check)
85
+
86
+ **If this subsection is empty or missing**, prepend the plan with a warning: `⚠ This plan is text-only and has not been empirically verified against the code. Cross-model reviewers MUST run empirical checks before approving.`
87
+
88
+ The `review-plan` command reads this subsection to calibrate reviewer effort. An empty or missing subsection is treated as a flag for stricter review.
89
+
59
90
  ## Rules
60
91
 
61
92
  - **NEVER** write implementation code — only the plan
@@ -56,6 +56,37 @@ Write the following sections into the ticket's `## Implementation Plan` section:
56
56
  - Specific patterns from the codebase to follow (with file references)
57
57
  - Any gotchas or constraints the developer should know
58
58
 
59
+ ## Pre-Emission Verification (MANDATORY)
60
+
61
+ Before writing the final plan, **verify every structural claim empirically against the actual code**. Planners that emit claims without verification produce plans with mechanical bugs (wrong component paths, stale prop types, missing exported helpers, inconsistent helper usage between packages) that block TDD and force re-planning.
62
+
63
+ **IMPORTANT — do NOT hallucinate verification**: You MUST use your environment tools (`Grep`, `Read`, `Bash`) to actually execute these checks. Do NOT fabricate commands or output to satisfy the format. An empty `Verification commands run` subsection is better than a fake one — the downstream review-plan command flags empty sections for stricter review, not for failure.
64
+
65
+ For every item you intend to list under `Files to Modify`, `Files to Create`, `Key Patterns`, or `Existing Code to Reuse`:
66
+
67
+ 1. **Grep or read the referenced files** to confirm they exist at the path you cite
68
+ 2. **Verify component prop types and shared helpers** — before proposing a helper inline, check if one already exists in `packages/shared/` or equivalent. Helpers used by both web and bot MUST live in `shared/` and be imported; do NOT duplicate inline in each package
69
+ 3. **Verify API response shapes** by reading the shared validation schemas — the frontend MUST match the backend contract, not invent fields
70
+ 4. **Verify existing CSS tokens, Tailwind utilities, and component library primitives** before proposing new classes — design tokens (colors, spacing, typography) live in `tailwind.config.ts` or `globals.css`, not in component files
71
+ 5. **Verify accessibility semantics** — if the plan proposes `aria-*` attributes, confirm the pattern against existing accessible components in the codebase
72
+
73
+ After finishing the plan, append a final subsection to the ticket:
74
+
75
+ ### Verification commands run
76
+
77
+ List every empirical check using this format: `<command> → <observed fact> → <impact on plan>`. One line per check. **Every entry must have all three fields** — a bare command without an observed fact is cargo-culting.
78
+
79
+ Example format:
80
+
81
+ - `Grep: "formatPortionTermLabel" in packages/` → helper exists in `packages/shared/src/portion/portionLabel.ts:32` → do not duplicate inline, import from `@foodxplorer/shared`, list under "Existing Code to Reuse"
82
+ - `Read: packages/shared/src/schemas/estimate.ts:180-205` → confirmed `portionAssumption` field is optional with `source: "per_dish" | "generic"` → NutritionCard must handle both branches, listed under "Key Patterns"
83
+ - `Grep: "aria-labelledby" in packages/web/src/components/` → existing pattern uses `useId()` for hook-generated IDs → reuse same pattern in new component, not hardcoded strings
84
+ - (continue with every empirical check)
85
+
86
+ **If this subsection is empty or missing**, prepend the plan with a warning: `⚠ This plan is text-only and has not been empirically verified against the code. Cross-model reviewers MUST run empirical checks before approving.`
87
+
88
+ The `review-plan` command reads this subsection to calibrate reviewer effort. An empty or missing subsection is treated as a flag for stricter review.
89
+
59
90
  ## Rules
60
91
 
61
92
  - **NEVER** write implementation code — only the plan
@@ -26,16 +26,47 @@ mkdir -p "$REVIEW_DIR"
26
26
  cat > "$REVIEW_DIR/input.txt" <<'CRITERIA'
27
27
  You are reviewing an Implementation Plan for a software feature. Your job is to find real problems, not praise. But if the plan is solid, say APPROVED — do not manufacture issues that are not there.
28
28
 
29
+ ## CRITICAL: This is an EMPIRICAL review, not a text-only review
30
+
31
+ Before reporting findings, you MUST verify structural claims against the actual code. Plans often have subtle mechanical bugs (wrong paths, stale types, obsolete schemas, incorrect primary key types, dangling references in shared packages) that only surface when you verify empirically. A text-only review that reads the plan alone will miss them.
32
+
33
+ Required empirical checks before emitting findings:
34
+
35
+ 1. **Read every file path the plan cites** — confirm it exists. If the plan says `packages/api/src/foo.ts`, open it. If it doesn't exist, that's a CRITICAL finding.
36
+ 2. **Grep for every exported symbol** the plan claims to reuse, modify, or delete (types, enums, Zod schemas, functions, classes). Shared symbols often live in 2-3 places — one rewrite leaves dangling references if the others aren't cleaned in the same commit.
37
+ 3. **Verify primary key and foreign key types** by reading the actual schema file (`prisma/schema.prisma` or equivalent). Don't assume `id` is a positive int when it's a `uuid`, and vice versa. Validator types MUST match the DB column type — this is one of the most common mechanical bugs in plans.
38
+ 4. **For any DROP / DELETE / CASCADE operation**, grep the workspace for ALL references to the dropped symbol. The plan must clean them all in the same commit or the workspace breaks mid-migration.
39
+ 5. **If the plan cites "Existing Code to Reuse"**, read those files to confirm they actually provide what the plan claims. Plans frequently cite helpers that don't exist or whose signature is different.
40
+
41
+ Do NOT rely on the plan's assertions alone. Do NOT assume file paths, types, or schemas are correct without verifying.
42
+
43
+ **Ticket-level signal to calibrate your review depth**: Look for a `### Verification commands run` subsection inside `## Implementation Plan`. If the planner listed empirical commands there, the plan was self-verified; focus your review on higher-order issues (gaps, vagueness, over-engineering). If that subsection is missing or empty, the plan is text-only and you MUST run the empirical checks above with extra rigor.
44
+
45
+ ## Review criteria
46
+
29
47
  Below you will find the Spec (what to build) and the Implementation Plan (how to build it). Review the plan and report:
30
- 1. Errors — Wrong assumptions, impossible steps, missing dependencies, plan contradicts the spec
48
+
49
+ 1. Errors — Wrong assumptions, impossible steps, missing dependencies, plan contradicts the spec, **path/type/schema mismatches against the actual code (verified empirically)**
31
50
  2. Gaps — Missing error handling, edge cases, rollback scenarios
32
51
  3. Vagueness — Steps too ambiguous to implement with TDD (no clear input/output)
33
52
  4. Over-engineering — Unnecessary abstractions, premature optimization
34
53
  5. Order issues — Steps that depend on later steps
35
54
 
36
- For each issue, state: [CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix.
55
+ For each issue, state: `[CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix — cite the file:line you read to discover the issue when applicable`.
56
+
57
+ ## Output format — mandatory sections
58
+
59
+ At the END of your review, include these two subsections:
60
+
61
+ ### Files read during review
62
+ (list every file you opened, with brief note of what each confirmed or contradicted)
37
63
 
38
- End with: VERDICT: APPROVED | VERDICT: REVISE (if any CRITICAL or 2+ IMPORTANT issues)
64
+ ### Commands executed
65
+ (list every grep / find / sed / read command you ran, with the pattern)
66
+
67
+ If BOTH subsections are empty, prepend your review with: `⚠ TEXT-ONLY REVIEW — no empirical verification performed. Findings are based on plan text alone.`
68
+
69
+ End with: `VERDICT: APPROVED` | `VERDICT: REVISE` (if any CRITICAL or 2+ IMPORTANT issues)
39
70
 
40
71
  ---
41
72
  SPEC AND PLAN:
@@ -63,6 +94,71 @@ echo "=== CODEX REVIEW ===" && cat "$REVIEW_DIR/codex.txt"
63
94
 
64
95
  Consolidate findings — issues flagged by both models independently carry higher weight. Deduplicate and prioritize. Ignore output from any reviewer that failed.
65
96
 
97
+ ### Meta-check: reviewer empirical asymmetry
98
+
99
+ After both reviews are in, check for empirical asymmetry. This guards against one model being text-only while the other does real empirical verification — a pattern observed in practice where different reviewers have different agentic habits.
100
+
101
+ **Primary check — qualitative, agent-driven**: Read both reviews yourself. If one review has a populated `### Files read during review` section with real file paths and line numbers, and the other review has an empty or missing section (or quotes the example text without adding real entries), the light review is incomplete. This is the authoritative check — the shell heuristic below is a secondary confirmation only.
102
+
103
+ **Secondary check — shell heuristic**: The following bash block is a mechanical double-check. It only flags **missing empirical evidence**, NOT low finding counts (a clean plan legitimately produces 0 CRITICAL/IMPORTANT findings; re-prompting on that is noise).
104
+
105
+ ```bash
106
+ # Count REAL empirical markers — anchored to the mandatory markdown headers only,
107
+ # not to substring mentions elsewhere in the review text. Implemented as a
108
+ # single awk pass (avoids the `grep -c || echo 0` pitfall, which prints "0\n0" when grep exits non-zero).
109
+ count_empirical() {
110
+ local file="$1"
111
+ [ -r "$file" ] || { echo 0; return; }
112
+ # Look for non-empty lines under the two mandatory markdown headers.
113
+ awk '
114
+ /^### Files read during review$/ { in_files=1; in_cmds=0; next }
115
+ /^### Commands executed$/ { in_files=0; in_cmds=1; next }
116
+ /^### / { in_files=0; in_cmds=0 }
117
+ (in_files || in_cmds) && NF > 0 && $0 !~ /^\(list/ { n++ }
118
+ END { print n+0 }
119
+ ' "$file"
120
+ }
121
+
122
+ GEMINI_EMPIRICAL=$(count_empirical "$REVIEW_DIR/gemini.txt")
123
+ CODEX_EMPIRICAL=$(count_empirical "$REVIEW_DIR/codex.txt")
124
+
125
+ echo "Empirical evidence — Gemini: $GEMINI_EMPIRICAL entries, Codex: $CODEX_EMPIRICAL entries"
126
+ ```
127
+
128
+ **Trigger re-prompt ONLY when one reviewer has zero empirical entries**. Do NOT trigger based on finding counts — a clean plan legitimately produces zero findings, and re-prompting on that wastes time.
129
+
130
+ If re-prompt is needed, write a concrete reprompt file and re-run the light reviewer. Use CONCRETE shell variables, not literal `<angle>` placeholders (those are bash input redirects and will fail at runtime):
131
+
132
+ ```bash
133
+ # Example — ONLY the SKELETON. Replace LIGHT_CLI, LIGHT_NAME, and OTHER_CLI with
134
+ # the actual reviewer details for your environment. This is documentation, not
135
+ # a cargo-cult-runnable block. Run the concrete version manually.
136
+
137
+ cat > "$REVIEW_DIR/reprompt.txt" <<'REPROMPT'
138
+ Your previous review was text-only: the `### Files read during review` section was empty or missing. Plans frequently have subtle mechanical bugs (wrong file paths, stale type references, primary key type mismatches, dangling shared-package references) that only appear with empirical verification.
139
+
140
+ Re-review the plan with EMPIRICAL verification. You MUST use your environment tools to read and grep real files. Do NOT hallucinate commands or output. You MUST:
141
+ 1. Read every file path the plan cites and confirm it exists
142
+ 2. Grep the workspace for every type/enum/schema/function the plan references
143
+ 3. Verify primary and foreign key types against the actual schema file
144
+ 4. For any DROP/DELETE/CASCADE, grep ALL references to confirm atomic cleanup
145
+ 5. List the files you opened and the commands you ran at the END of your review, with real observed facts for each (not just the command string)
146
+
147
+ Look for: path mismatches, stale type references, primary key type mismatches, and dangling references in shared packages that need cleanup in the same commit.
148
+ REPROMPT
149
+
150
+ # Then re-run the LIGHT reviewer with the reprompt prepended. Example for Gemini:
151
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | gemini > "$REVIEW_DIR/gemini_reprompted.txt" 2>&1
152
+ # cat "$REVIEW_DIR/gemini_reprompted.txt"
153
+ # Example for Codex:
154
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | codex exec - > "$REVIEW_DIR/codex_reprompted.txt" 2>&1
155
+ # cat "$REVIEW_DIR/codex_reprompted.txt"
156
+ ```
157
+
158
+ Merge the re-prompted findings into your consolidation. Do NOT skip this meta-check — it catches real bugs that would otherwise enter TDD.
159
+
160
+ See `.claude/skills/development-workflow/references/cross-model-review.md` for calibration notes on reviewer patterns (which models tend toward empirical verification vs standards-compliance checking).
161
+
66
162
  ### Path B: One CLI available
67
163
 
68
164
  ```bash
@@ -26,18 +26,45 @@ mkdir -p "$REVIEW_DIR"
26
26
  cat > "$REVIEW_DIR/input.txt" <<'CRITERIA'
27
27
  You are reviewing a Feature Specification for a software feature. Your job is to find real problems in the REQUIREMENTS — not the implementation (there is no implementation yet). If the spec is solid, say APPROVED — do not manufacture issues.
28
28
 
29
+ ## This is a CONTEXTUAL review — verify consistency against the codebase
30
+
31
+ A spec review is not "how would I implement this" — that's for the plan phase. But a spec review IS responsible for detecting inconsistencies between what the spec claims and what already exists in the project. To do that rigorously, you MUST read project context files:
32
+
33
+ 1. **Read the project standards** referenced in the spec (`ai-specs/specs/base-standards.mdc`, `backend-standards.mdc`, `frontend-standards.mdc`) — confirm the spec doesn't contradict them
34
+ 2. **Read the key_facts.md and decisions.md** — confirm the spec doesn't reintroduce patterns the project explicitly rejected in a prior ADR
35
+ 3. **Read the existing API spec** (`docs/specs/api-spec.yaml`) — confirm new endpoints don't collide with existing ones and follow the same conventions
36
+ 4. **Grep for existing similar features** — if the spec proposes "add metrics for X", grep the workspace for existing metrics implementations to see if the spec is compatible with what's already there
37
+ 5. **For any field, type, or enum the spec proposes**, grep shared schemas to see if a similar concept already exists under a different name
38
+
39
+ Do NOT review the spec as isolated text. A spec that looks internally consistent but contradicts the existing architecture is worse than one with obvious gaps.
40
+
41
+ ## Review criteria
42
+
29
43
  Below you will find the Spec (what to build), the Acceptance Criteria, and project context (architecture, decisions). Review the spec and report:
44
+
30
45
  1. Completeness — Are all user needs covered? Missing requirements?
31
46
  2. Ambiguity — Are requirements clear enough to plan and implement with TDD?
32
47
  3. Edge cases — Are failure modes, boundary conditions, and error responses specified?
33
48
  4. API contract — Are endpoints, fields, types, status codes well-defined? (if applicable)
34
49
  5. Scope — Is the spec doing too much or too little for one feature?
35
- 6. Consistency — Does the spec conflict with existing architecture, patterns, or decisions?
50
+ 6. Consistency — Does the spec conflict with existing architecture, patterns, or prior ADRs? **(verify by reading the referenced files, not by inference)**
36
51
  7. Testability — Can each acceptance criterion be verified with an automated test?
37
52
 
38
- For each issue, state: [CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix.
53
+ For each issue, state: `[CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix — cite the file:line you read when the issue involves conflict with existing code`.
54
+
55
+ ## Output format — mandatory sections
39
56
 
40
- End with: VERDICT: APPROVED | VERDICT: REVISE (if any CRITICAL or 2+ IMPORTANT issues)
57
+ At the END of your review, include these two subsections:
58
+
59
+ ### Files read during review
60
+ (list every file you opened, with brief note of what each confirmed or contradicted)
61
+
62
+ ### Commands executed
63
+ (list every grep / find / sed / read command you ran, with the pattern)
64
+
65
+ If BOTH subsections are empty, prepend your review with: `⚠ TEXT-ONLY REVIEW — no empirical verification of architectural consistency. Findings are based on spec text alone.`
66
+
67
+ End with: `VERDICT: APPROVED` | `VERDICT: REVISE` (if any CRITICAL or 2+ IMPORTANT issues)
41
68
 
42
69
  ---
43
70
  SPEC AND ACCEPTANCE CRITERIA:
@@ -70,6 +97,59 @@ echo "=== CODEX REVIEW ===" && cat "$REVIEW_DIR/codex.txt"
70
97
 
71
98
  Consolidate findings — issues flagged by both models independently carry higher weight. Deduplicate and prioritize. Ignore output from any reviewer that failed.
72
99
 
100
+ ### Meta-check: reviewer empirical asymmetry
101
+
102
+ After both reviews are in, check for empirical asymmetry. One reviewer may be text-only while the other reads files — re-prompt the light reviewer if so.
103
+
104
+ **Primary check — qualitative, agent-driven**: Read both reviews yourself. If one review cites actual files from the project (standards, decisions, existing schemas) and the other review contains only generic commentary, the light review is incomplete. This is the authoritative check.
105
+
106
+ **Secondary check — shell heuristic**: The block below only flags **missing empirical evidence**, NOT low finding counts. A clean spec legitimately produces zero findings; re-prompting on that wastes time.
107
+
108
+ ```bash
109
+ count_empirical() {
110
+ local file="$1"
111
+ [ -r "$file" ] || { echo 0; return; }
112
+ awk '
113
+ /^### Files read during review$/ { in_files=1; in_cmds=0; next }
114
+ /^### Commands executed$/ { in_files=0; in_cmds=1; next }
115
+ /^### / { in_files=0; in_cmds=0 }
116
+ (in_files || in_cmds) && NF > 0 && $0 !~ /^\(list/ { n++ }
117
+ END { print n+0 }
118
+ ' "$file"
119
+ }
120
+
121
+ GEMINI_EMPIRICAL=$(count_empirical "$REVIEW_DIR/gemini.txt")
122
+ CODEX_EMPIRICAL=$(count_empirical "$REVIEW_DIR/codex.txt")
123
+
124
+ echo "Empirical evidence — Gemini: $GEMINI_EMPIRICAL entries, Codex: $CODEX_EMPIRICAL entries"
125
+ ```
126
+
127
+ **Trigger re-prompt ONLY when one reviewer has zero empirical entries**. If so, write a concrete reprompt file and re-run the light reviewer. Use CONCRETE shell variables, not literal `<angle>` placeholders:
128
+
129
+ ```bash
130
+ # Documentation skeleton — replace with concrete reviewer invocation.
131
+
132
+ cat > "$REVIEW_DIR/reprompt.txt" <<'REPROMPT'
133
+ Your previous review was text-only: the `### Files read during review` section was empty or missing. Specs can silently contradict existing architecture (prior ADRs, established patterns, conflicting schemas) in ways that only surface when you read project context files.
134
+
135
+ Re-review the spec with CONTEXTUAL verification. You MUST use your environment tools to read real files. Do NOT hallucinate commands or output. You MUST:
136
+ 1. Read ai-specs/specs/base-standards.mdc, backend-standards.mdc, frontend-standards.mdc
137
+ 2. Read docs/project_notes/key_facts.md and decisions.md
138
+ 3. Grep the workspace for existing similar features to check for collision or duplication
139
+ 4. Verify proposed fields/types/enums don't already exist under different names in shared schemas
140
+ 5. List the files you opened at the END of your review, with real observed facts for each
141
+
142
+ Look for: contradictions with prior ADRs, collisions with existing APIs, duplication of concepts that already exist under different names, spec vocabulary that doesn't match the project's existing terminology.
143
+ REPROMPT
144
+
145
+ # Example for Gemini:
146
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | gemini > "$REVIEW_DIR/gemini_reprompted.txt" 2>&1
147
+ # Example for Codex:
148
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | codex exec - > "$REVIEW_DIR/codex_reprompted.txt" 2>&1
149
+ ```
150
+
151
+ Merge the re-prompted findings. See `.claude/skills/development-workflow/references/cross-model-review.md` for calibration notes on reviewer patterns.
152
+
73
153
  ### Path B: One CLI available
74
154
 
75
155
  ```bash
@@ -0,0 +1,86 @@
1
+ # Cross-Model Review — Calibration Notes
2
+
3
+ Reference for interpreting cross-model review output during spec/plan review (Steps 0 and 2 of the workflow).
4
+
5
+ ## What cross-model review is for
6
+
7
+ Cross-model review runs the same spec or plan through TWO independent AI models (typically Codex CLI + Gemini CLI, or Codex + Claude). The goal is to surface issues that a single model would miss — not because the models are bad, but because different models have different failure modes.
8
+
9
+ Key insight: **two reviewers catching the same issue is strong signal. One reviewer catching an issue the other missed is also valuable — but you need to know *why* one missed it to calibrate correctly.**
10
+
11
+ ## Reviewer patterns observed in practice
12
+
13
+ These patterns are not absolute — they're tendencies observed over many reviews. Use them to calibrate expectations, not to dismiss findings from any model.
14
+
15
+ ### Codex CLI
16
+
17
+ - **Tends to be agentic** — runs shell commands (`rg`, `sed`, `find`, file reads) during review
18
+ - **Primary bug-finder** — catches mechanical mismatches (wrong file paths, stale type references, primary key type mismatches, dangling references in shared packages)
19
+ - **Cross-references plan claims against code** — high empirical rigor
20
+ - **Weakness**: can produce long, exhaustive output that buries the highest-severity findings in noise
21
+ - **Typical severity distribution for Standard/Complex plans**: 2-4 M1/M2 findings, 3-6 M3 suggestions
22
+
23
+ ### Gemini CLI
24
+
25
+ - **Tends to be text-aware but less agentic** — reads the spec/plan and project standards, but may not grep the actual code
26
+ - **Primary standards-compliance checker** — catches inconsistencies between the plan and documented standards (base-standards.mdc, decisions.md)
27
+ - **Reads project context** via the `instructions` field of `.gemini/settings.json` — references `ai-specs/specs/`, `.gemini/agents/`, standards
28
+ - **Weakness**: can approve plans that look internally consistent but have mechanical bugs only visible through empirical verification
29
+ - **Typical severity distribution for Standard/Complex plans**: 1-2 M2 findings, 2-4 M3 suggestions
30
+
31
+ ### Claude CLI (when used as third reviewer)
32
+
33
+ - **Tends to be analytical** — reasons through scope, ordering, edge cases
34
+ - **Primary scope-and-structure checker** — catches over-engineering, out-of-scope additions, feature creep
35
+ - **Weakness**: similar to Gemini, less empirical by default
36
+
37
+ ## What the calibration means for you
38
+
39
+ ### When reviews agree
40
+
41
+ If both reviewers flag the same finding, weight is high. Address first.
42
+
43
+ ### When reviews disagree
44
+
45
+ Don't arbitrate from authority — arbitrate from evidence. If Codex cites `packages/api/prisma/schema.prisma:323` showing `id String @db.Uuid` and Gemini didn't read that file, Codex's finding carries the empirical weight. Resolve in Codex's direction.
46
+
47
+ If the disagreement is about scope or over-engineering (e.g., Codex says "add caching layer" and Gemini says "out of scope for this feature"), read both rationales carefully — this is where different models genuinely produce different takes. Lean toward YAGNI by default.
48
+
49
+ ### When reviews are asymmetric (one light, one heavy)
50
+
51
+ Asymmetry is the most important signal. If Codex produces 3 M1 blockers and Gemini produces 0 M1 + 2 M3, **do NOT** conclude "Gemini approved so half of Codex's findings are wrong". Conclude: **"Gemini did a text-only review and missed the empirical bugs"**. The `review-plan` command includes an automated asymmetry check that re-prompts the light reviewer with stricter empirical instructions. Use it.
52
+
53
+ ### When both reviews are light
54
+
55
+ If both produce 0 M1/M2 findings AND both produce empty "Files read during review" sections, the review is text-only. This is acceptable for trivial changes (one-line fixes, typo corrections) but NOT for Standard/Complex features. For non-trivial work, re-run the review with both reviewers and explicitly invoke the empirical verification checklist from the `review-plan` / `review-spec` commands.
56
+
57
+ ## Historical calibration data
58
+
59
+ These are real examples from this project's history — add more as patterns emerge.
60
+
61
+ ### F-UX-B Plan review (2026-04-13)
62
+
63
+ **Gemini** produced 48 lines, 2 M3 + 1 P2 findings, verdict APPROVE WITH CHANGES. Cited `ai-specs/specs/base-standards.mdc` section "5. Implementation Workflow" — demonstrably read project context.
64
+
65
+ **Codex** produced 829 lines, 3 M1 + 1 M2 + 2 M3, verdict REJECT. Ran `rg` and `sed` during review. Cited `packages/shared/src/schemas/standardPortion.ts:1-36`, `enums.ts:18-25`, `packages/api/prisma/schema.prisma:323`, `.gemini/agents/backend-planner.md:1-34`.
66
+
67
+ **Codex M1s were real bugs**: helper fallback produced "Media_racion" instead of "Media ración" (underscore bug); shared schema drift (PortionContextSchema existed in 2 places, one would become dangling); dishId validator expected positive int but DB column was uuid.
68
+
69
+ **Takeaway**: Gemini's context-loading fix (sdd-devflow v0.16.7) worked — it read standards correctly. But context loading ≠ empirical verification. Codex's agentic habit of running commands against the code caught bugs Gemini's text review missed. Both reviewers are needed and complementary.
70
+
71
+ ## When to override the calibration
72
+
73
+ These are tendencies, not certainties. Override when:
74
+
75
+ - A "light" reviewer produces a specific, high-quality finding with cited evidence — don't dismiss it
76
+ - A "heavy" reviewer produces verbose output with few actionable findings — don't over-weight length
77
+ - A new model version changes behavior significantly — update this file with the new observation
78
+
79
+ The calibration is a tool, not a rule. Always read both reviews carefully.
80
+
81
+ ## Related files
82
+
83
+ - `.claude/commands/review-spec.md` — spec review command with empirical checklist
84
+ - `.claude/commands/review-plan.md` — plan review command with empirical checklist + asymmetry meta-check
85
+ - `.claude/agents/backend-planner.md` — planner with Pre-Emission Verification section
86
+ - `.claude/agents/frontend-planner.md` — planner with Pre-Emission Verification section
@@ -28,6 +28,29 @@ Generate a detailed Implementation Plan and write it into the ticket's `## Imple
28
28
  - Implementation Order (Domain > Application > Infrastructure > Presentation > Tests)
29
29
  - Testing Strategy
30
30
  - Key Patterns
31
+ - **Verification commands run** (see Pre-Emission Verification below)
32
+
33
+ ## Pre-Emission Verification (MANDATORY)
34
+
35
+ Before emitting the final plan, verify every structural claim empirically against the actual code. Plans emitted without verification produce mechanical bugs (wrong paths, stale types, obsolete schemas, wrong PK types) that block TDD.
36
+
37
+ **Do NOT hallucinate**: You MUST use your environment tools to execute the checks against the real code. Do NOT fabricate commands or output. An empty `Verification commands run` subsection is better than a fake one — the downstream review-plan command flags empty sections for stricter review, not as failure.
38
+
39
+ Required checks:
40
+
41
+ 1. Grep or read every file you cite in `Files to Modify`, `Files to Create`, `Key Patterns`, `Existing Code to Reuse` — confirm it exists at that path
42
+ 2. Grep exported symbol names (types, enums, validation schemas) across the workspace. Shared schemas often live in 2-3 places; one rewrite leaves dangling references if the others aren't cleaned in the same commit
43
+ 3. Read `prisma/schema.prisma` (or equivalent) before asserting primary key types. Validators MUST match the DB column type (uuid vs int vs cuid). Do NOT assume
44
+ 4. Before proposing to DROP an enum or table, grep workspace for all references AND confirm the table is unused or add a pre-flight safety check (SELECT COUNT + pg_dump backup)
45
+
46
+ Append to the ticket a final subsection `### Verification commands run`. Use this exact 3-field format per entry: `<command> → <observed fact> → <impact on plan>`. Every entry must have all three fields — a bare command without an observed fact is not verification. Example:
47
+
48
+ - `Grep: "PortionContext" in packages/` → 2 hits (`enums.ts:18`, `standardPortion.ts:4`) → both must be deleted in the migration commit
49
+ - `Read: packages/api/prisma/schema.prisma:323` → `dishId String @db.Uuid` (not int) → validator uses `z.string().uuid()`
50
+
51
+ If the subsection is empty or missing, prepend the plan with `⚠ This plan is text-only and has not been empirically verified. Cross-model reviewers MUST run empirical checks.`
52
+
53
+ The `review-plan` command reads this subsection to calibrate reviewer effort. Empty = stricter review.
31
54
 
32
55
  ## Rules
33
56
 
@@ -29,6 +29,30 @@ Generate a detailed Implementation Plan and write it into the ticket's `## Imple
29
29
  - Implementation Order (Types > Services > Stores > Components > Pages > Tests)
30
30
  - Testing Strategy
31
31
  - Key Patterns
32
+ - **Verification commands run** (see Pre-Emission Verification below)
33
+
34
+ ## Pre-Emission Verification (MANDATORY)
35
+
36
+ Before emitting the final plan, verify every structural claim empirically against the actual code. Plans emitted without verification produce mechanical bugs (wrong component paths, stale prop types, duplicated helpers between packages, invented API fields) that block TDD.
37
+
38
+ **Do NOT hallucinate**: You MUST use your environment tools to execute the checks. Do NOT fabricate commands or output. An empty `Verification commands run` subsection is better than a fake one.
39
+
40
+ Required checks:
41
+
42
+ 1. Grep or read every file you cite — confirm path exists
43
+ 2. Before proposing an inline helper, grep `packages/shared/` for an existing equivalent. Helpers used by BOTH web and bot MUST live in `shared/` and be imported; do NOT duplicate inline per package
44
+ 3. Read the shared validation schema for any API response the frontend renders. Frontend MUST match the backend contract, not invent fields
45
+ 4. Verify CSS tokens and component primitives exist before proposing new classes. Design tokens live in `tailwind.config.ts` or `globals.css`, not in component files
46
+ 5. Verify accessibility semantics (`aria-*`, role, labelled-by) against existing accessible components in the codebase
47
+
48
+ Append to the ticket a final subsection `### Verification commands run`. Use this exact 3-field format per entry: `<command> → <observed fact> → <impact on plan>`. Every entry must have all three fields. Example:
49
+
50
+ - `Grep: "formatPortionTermLabel" in packages/` → helper exists in `packages/shared/src/portion/portionLabel.ts:32` → import from `@foodxplorer/shared`, do not duplicate
51
+ - `Read: packages/shared/src/schemas/estimate.ts:180-205` → `portionAssumption` is optional with `source: "per_dish" | "generic"` → component handles both branches
52
+
53
+ If empty or missing, prepend plan with `⚠ This plan is text-only and has not been empirically verified. Cross-model reviewers MUST run empirical checks.`
54
+
55
+ The `review-plan` command reads this subsection to calibrate reviewer effort. Empty = stricter review.
32
56
 
33
57
  ## Rules
34
58
 
@@ -28,16 +28,47 @@ mkdir -p "$REVIEW_DIR"
28
28
  cat > "$REVIEW_DIR/input.txt" <<'CRITERIA'
29
29
  You are reviewing an Implementation Plan for a software feature. Your job is to find real problems, not praise. But if the plan is solid, say APPROVED — do not manufacture issues that are not there.
30
30
 
31
- Below you will find the Spec (what to build) and the Implementation Plan (how to build it). Review the plan and report:
32
- 1. Errors — Wrong assumptions, impossible steps, missing dependencies, plan contradicts the spec
31
+ ## CRITICAL: This is an EMPIRICAL review, not a text-only review
32
+
33
+ Before reporting findings, you MUST verify structural claims against the actual code. Plans often have subtle mechanical bugs (wrong paths, stale types, obsolete schemas, incorrect primary key types, dangling references in shared packages) that only surface when you verify empirically.
34
+
35
+ Required empirical checks:
36
+
37
+ 1. Read every file path the plan cites — confirm it exists
38
+ 2. Grep for every exported symbol the plan claims to reuse, modify, or delete (types, enums, Zod schemas, functions). Shared symbols often live in 2-3 places; one rewrite leaves dangling references unless all are cleaned in the same commit
39
+ 3. Verify primary and foreign key types by reading the actual schema file (`prisma/schema.prisma` or equivalent). Validators MUST match the DB column type (uuid vs int vs cuid)
40
+ 4. For any DROP / DELETE / CASCADE, grep ALL references to the dropped symbol — the plan must clean them atomically
41
+ 5. If the plan cites "Existing Code to Reuse", read those files to confirm they actually provide what the plan claims
42
+
43
+ Do NOT rely on the plan's assertions alone. Do NOT assume file paths, types, or schemas are correct without verifying.
44
+
45
+ **Ticket-level signal**: Look for a `### Verification commands run` subsection inside `## Implementation Plan`. If present and populated, the planner self-verified; focus your review on higher-order issues. If missing or empty, the plan is text-only and you MUST run the empirical checks above with extra rigor.
46
+
47
+ ## Review criteria
48
+
49
+ Below you will find the Spec and the Implementation Plan. Review the plan and report:
50
+
51
+ 1. Errors — Wrong assumptions, impossible steps, missing dependencies, plan contradicts the spec, **path/type/schema mismatches against the actual code (verified empirically)**
33
52
  2. Gaps — Missing error handling, edge cases, rollback scenarios
34
- 3. Vagueness — Steps too ambiguous to implement with TDD (no clear input/output)
53
+ 3. Vagueness — Steps too ambiguous to implement with TDD
35
54
  4. Over-engineering — Unnecessary abstractions, premature optimization
36
55
  5. Order issues — Steps that depend on later steps
37
56
 
38
- For each issue, state: [CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix.
57
+ For each issue: `[CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix — cite file:line when applicable`.
58
+
59
+ ## Output format — mandatory
60
+
61
+ At the END of your review:
62
+
63
+ ### Files read during review
64
+ (list every file you opened, with brief note of what each confirmed or contradicted)
65
+
66
+ ### Commands executed
67
+ (list every grep / find / sed / read command you ran, with the pattern)
68
+
69
+ If BOTH are empty, prepend: `⚠ TEXT-ONLY REVIEW — no empirical verification performed.`
39
70
 
40
- End with: VERDICT: APPROVED | VERDICT: REVISE (if any CRITICAL or 2+ IMPORTANT issues)
71
+ End with: `VERDICT: APPROVED` | `VERDICT: REVISE` (if any CRITICAL or 2+ IMPORTANT issues)
41
72
 
42
73
  ---
43
74
  SPEC AND PLAN:
@@ -65,6 +96,57 @@ echo "=== CODEX REVIEW ===" && cat "$REVIEW_DIR/codex.txt"
65
96
 
66
97
  Consolidate findings — issues flagged by both models independently carry higher weight. Deduplicate and prioritize. Ignore output from any reviewer that failed.
67
98
 
99
+ #### Meta-check: reviewer empirical asymmetry
100
+
101
+ After both reviews are in, check for empirical asymmetry. One reviewer may be text-only while the other runs empirical checks — re-prompt the light reviewer if so.
102
+
103
+ **Primary check — qualitative, agent-driven**: Read both reviews yourself. If one review has a populated `### Files read during review` section with real file paths and line numbers, and the other review has an empty or missing section, the light review is incomplete. This is the authoritative check.
104
+
105
+ **Secondary check — shell heuristic**: The block below flags **missing empirical evidence only**, NOT low finding counts (a clean plan legitimately produces zero findings).
106
+
107
+ ```bash
108
+ count_empirical() {
109
+ local file="$1"
110
+ [ -r "$file" ] || { echo 0; return; }
111
+ awk '
112
+ /^### Files read during review$/ { in_files=1; in_cmds=0; next }
113
+ /^### Commands executed$/ { in_files=0; in_cmds=1; next }
114
+ /^### / { in_files=0; in_cmds=0 }
115
+ (in_files || in_cmds) && NF > 0 && $0 !~ /^\(list/ { n++ }
116
+ END { print n+0 }
117
+ ' "$file"
118
+ }
119
+
120
+ CLAUDE_EMPIRICAL=$(count_empirical "$REVIEW_DIR/claude.txt")
121
+ CODEX_EMPIRICAL=$(count_empirical "$REVIEW_DIR/codex.txt")
122
+
123
+ echo "Empirical evidence — Claude: $CLAUDE_EMPIRICAL entries, Codex: $CODEX_EMPIRICAL entries"
124
+ ```
125
+
126
+ **Trigger re-prompt ONLY when one reviewer has zero empirical entries**. If so, write a concrete reprompt file and re-run the light reviewer. Use concrete shell variables, not literal `<angle>` placeholders:
127
+
128
+ ```bash
129
+ cat > "$REVIEW_DIR/reprompt.txt" <<'REPROMPT'
130
+ Your previous review was text-only: the `### Files read during review` section was empty or missing. Plans frequently have subtle mechanical bugs (wrong file paths, stale type references, primary key type mismatches, dangling shared-package references) that only appear with empirical verification.
131
+
132
+ Re-review the plan with EMPIRICAL verification. You MUST use your environment tools to read and grep real files. Do NOT hallucinate commands or output. You MUST:
133
+ 1. Read every file path the plan cites and confirm it exists
134
+ 2. Grep the workspace for every type/enum/schema/function the plan references
135
+ 3. Verify primary and foreign key types against the actual schema file
136
+ 4. For any DROP/DELETE/CASCADE, grep ALL references to confirm atomic cleanup
137
+ 5. List the files you opened and the commands you ran at the END of your review, with real observed facts for each (not just the command string)
138
+
139
+ Look for: path mismatches, stale type references, primary key type mismatches, and dangling references in shared packages that need cleanup in the same commit.
140
+ REPROMPT
141
+
142
+ # Example for Claude:
143
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | claude --print > "$REVIEW_DIR/claude_reprompted.txt" 2>&1
144
+ # Example for Codex:
145
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | codex exec - > "$REVIEW_DIR/codex_reprompted.txt" 2>&1
146
+ ```
147
+
148
+ Merge the re-prompted findings into your consolidation. See `.gemini/skills/development-workflow/references/cross-model-review.md` for calibration notes on reviewer patterns.
149
+
68
150
  #### Path B: One CLI available
69
151
 
70
152
  ```bash
@@ -28,18 +28,43 @@ mkdir -p "$REVIEW_DIR"
28
28
  cat > "$REVIEW_DIR/input.txt" <<'CRITERIA'
29
29
  You are reviewing a Feature Specification for a software feature. Your job is to find real problems in the REQUIREMENTS — not the implementation (there is no implementation yet). If the spec is solid, say APPROVED — do not manufacture issues.
30
30
 
31
- Below you will find the Spec (what to build), the Acceptance Criteria, and project context (architecture, decisions). Review the spec and report:
31
+ ## This is a CONTEXTUAL review — verify consistency against the codebase
32
+
33
+ A spec can look internally consistent but contradict existing architecture (prior ADRs, established patterns, conflicting schemas). Text-only review misses those. You MUST read project context files:
34
+
35
+ 1. Read the standards referenced in the spec (`base-standards.mdc`, `backend-standards.mdc`, `frontend-standards.mdc`) — confirm no contradiction
36
+ 2. Read `docs/project_notes/key_facts.md` and `decisions.md` — confirm no regression against prior ADRs
37
+ 3. Read `docs/specs/api-spec.yaml` — confirm new endpoints don't collide with existing ones and follow the same conventions
38
+ 4. Grep for existing similar features — if the spec proposes "add metrics for X", grep for existing metrics implementations to check for collision or duplication
39
+ 5. For any field/type/enum proposed, grep shared schemas to see if the concept already exists under a different name
40
+
41
+ ## Review criteria
42
+
43
+ Below you will find the Spec (what to build), the Acceptance Criteria, and project context. Review the spec and report:
44
+
32
45
  1. Completeness — Are all user needs covered? Missing requirements?
33
46
  2. Ambiguity — Are requirements clear enough to plan and implement with TDD?
34
47
  3. Edge cases — Are failure modes, boundary conditions, and error responses specified?
35
48
  4. API contract — Are endpoints, fields, types, status codes well-defined? (if applicable)
36
49
  5. Scope — Is the spec doing too much or too little for one feature?
37
- 6. Consistency — Does the spec conflict with existing architecture, patterns, or decisions?
50
+ 6. Consistency — Does the spec conflict with existing architecture, patterns, or prior ADRs? **(verify by reading the referenced files, not by inference)**
38
51
  7. Testability — Can each acceptance criterion be verified with an automated test?
39
52
 
40
- For each issue, state: [CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix.
53
+ For each issue: `[CRITICAL/IMPORTANT/SUGGESTION] — description — proposed fix — cite file:line when the issue involves conflict with existing code`.
54
+
55
+ ## Output format — mandatory
41
56
 
42
- End with: VERDICT: APPROVED | VERDICT: REVISE (if any CRITICAL or 2+ IMPORTANT issues)
57
+ At the END of your review:
58
+
59
+ ### Files read during review
60
+ (list every file you opened, with brief note of what each confirmed or contradicted)
61
+
62
+ ### Commands executed
63
+ (list every grep / find / sed / read command you ran, with the pattern)
64
+
65
+ If BOTH are empty, prepend: `⚠ TEXT-ONLY REVIEW — no empirical verification of architectural consistency.`
66
+
67
+ End with: `VERDICT: APPROVED` | `VERDICT: REVISE` (if any CRITICAL or 2+ IMPORTANT issues)
43
68
 
44
69
  ---
45
70
  SPEC AND ACCEPTANCE CRITERIA:
@@ -72,6 +97,57 @@ echo "=== CODEX REVIEW ===" && cat "$REVIEW_DIR/codex.txt"
72
97
 
73
98
  Consolidate findings — issues flagged by both models independently carry higher weight. Deduplicate and prioritize. Ignore output from any reviewer that failed.
74
99
 
100
+ #### Meta-check: reviewer empirical asymmetry
101
+
102
+ After both reviews are in, check for empirical asymmetry.
103
+
104
+ **Primary check — qualitative, agent-driven**: Read both reviews yourself. If one review cites actual files from the project (standards, decisions, existing schemas) and the other contains only generic commentary, the light review is incomplete. This is the authoritative check.
105
+
106
+ **Secondary check — shell heuristic**: flags **missing empirical evidence only**, not low finding counts.
107
+
108
+ ```bash
109
+ count_empirical() {
110
+ local file="$1"
111
+ [ -r "$file" ] || { echo 0; return; }
112
+ awk '
113
+ /^### Files read during review$/ { in_files=1; in_cmds=0; next }
114
+ /^### Commands executed$/ { in_files=0; in_cmds=1; next }
115
+ /^### / { in_files=0; in_cmds=0 }
116
+ (in_files || in_cmds) && NF > 0 && $0 !~ /^\(list/ { n++ }
117
+ END { print n+0 }
118
+ ' "$file"
119
+ }
120
+
121
+ CLAUDE_EMPIRICAL=$(count_empirical "$REVIEW_DIR/claude.txt")
122
+ CODEX_EMPIRICAL=$(count_empirical "$REVIEW_DIR/codex.txt")
123
+
124
+ echo "Empirical evidence — Claude: $CLAUDE_EMPIRICAL entries, Codex: $CODEX_EMPIRICAL entries"
125
+ ```
126
+
127
+ **Trigger re-prompt ONLY when one reviewer has zero empirical entries**. If so:
128
+
129
+ ```bash
130
+ cat > "$REVIEW_DIR/reprompt.txt" <<'REPROMPT'
131
+ Your previous review was text-only: the `### Files read during review` section was empty or missing. Specs can silently contradict existing architecture in ways that only surface when you read project context files.
132
+
133
+ Re-review the spec with CONTEXTUAL verification. You MUST use your environment tools to read real files. Do NOT hallucinate commands or output. You MUST:
134
+ 1. Read ai-specs/specs/base-standards.mdc, backend-standards.mdc, frontend-standards.mdc
135
+ 2. Read docs/project_notes/key_facts.md and decisions.md
136
+ 3. Grep the workspace for existing similar features to check for collision or duplication
137
+ 4. Verify proposed fields/types/enums don't already exist under different names in shared schemas
138
+ 5. List the files you opened at the END of your review, with real observed facts for each
139
+
140
+ Look for: contradictions with prior ADRs, collisions with existing APIs, duplication of concepts that already exist under different names, spec vocabulary that doesn't match the project's existing terminology.
141
+ REPROMPT
142
+
143
+ # Example for Claude:
144
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | claude --print > "$REVIEW_DIR/claude_reprompted.txt" 2>&1
145
+ # Example for Codex:
146
+ # cat "$REVIEW_DIR/reprompt.txt" "$REVIEW_DIR/input.txt" | codex exec - > "$REVIEW_DIR/codex_reprompted.txt" 2>&1
147
+ ```
148
+
149
+ Merge the re-prompted findings. See `.gemini/skills/development-workflow/references/cross-model-review.md` for calibration notes.
150
+
75
151
  #### Path B: One CLI available
76
152
 
77
153
  ```bash
@@ -0,0 +1,62 @@
1
+ # Cross-Model Review — Calibration Notes
2
+
3
+ Reference for interpreting cross-model review output during spec/plan review.
4
+
5
+ ## Purpose
6
+
7
+ Cross-model review runs the same spec or plan through two independent AI models to surface issues a single model would miss. Different models have different failure modes — that's the value.
8
+
9
+ ## Reviewer patterns observed in practice
10
+
11
+ These are tendencies, not absolutes; use them to calibrate expectations.
12
+
13
+ ### Codex CLI
14
+ - Agentic: runs `rg`, `sed`, file reads during review
15
+ - Primary bug-finder: catches path mismatches, stale types, wrong primary key types, dangling shared-package references
16
+ - Weakness: verbose output that can bury high-severity findings
17
+
18
+ ### Gemini CLI
19
+ - Text-aware but less agentic: reads spec/plan and standards, may not grep code
20
+ - Primary standards-compliance checker: catches contradictions with ADRs, existing patterns
21
+ - Reads project context via `instructions` field in `.gemini/settings.json`
22
+ - Weakness: can approve plans with mechanical bugs only visible empirically
23
+
24
+ ### Claude CLI (when used as reviewer)
25
+ - Analytical: scope, ordering, edge cases, over-engineering
26
+ - Primary scope checker: catches feature creep, YAGNI violations
27
+ - Weakness: like Gemini, it is less empirical by default
28
+
29
+ ## Interpreting reviews
30
+
31
+ **When reviews agree**: strong signal. Address first.
32
+
33
+ **When reviews disagree**: arbitrate from evidence, not authority. If one reviewer cites a specific file:line and the other never read that file, the cited finding carries empirical weight. For scope/over-engineering disagreements, lean YAGNI.
34
+
35
+ **When reviews are asymmetric**: if one reviewer finds 3 M1 blockers and the other finds 0 M1, do NOT conclude half the findings are wrong. Conclude the light reviewer was text-only and missed empirical bugs. The `review-plan` command auto-detects asymmetry and re-prompts the light reviewer.
36
+
37
+ **When both reviews are light**: acceptable for trivial changes only. For Standard/Complex features, re-run both reviewers with the empirical verification checklist explicitly invoked.
38
+
39
+ ## Historical calibration data
40
+
41
+ ### F-UX-B Plan review (2026-04-13, foodXPlorer)
42
+
43
+ - Gemini: 48 lines, 2 M3 + 1 P2, verdict APPROVE WITH CHANGES. Cited base-standards.mdc section 5 — demonstrably read project context.
44
+ - Codex: 829 lines, 3 M1 + 1 M2 + 2 M3, verdict REJECT. Ran `rg`/`sed`, cited specific file:lines across 4+ files.
45
+ - Codex M1s were all real bugs verified empirically by the agent before applying fixes.
46
+
47
+ Takeaway: context loading (fixed in sdd-devflow v0.16.7) ≠ empirical verification. Both reviewers are complementary.
48
+
49
+ ## When to override the calibration
50
+
51
+ - Light reviewer produces specific high-quality finding with cited evidence — don't dismiss
52
+ - Heavy reviewer produces verbose output with few actionable findings — don't over-weight length
53
+ - New model version changes behavior — update this file
54
+
55
+ The calibration is a tool, not a rule. Read both reviews carefully.
56
+
57
+ ## Related files
58
+
59
+ - `.gemini/commands/review-spec-instructions.md`
60
+ - `.gemini/commands/review-plan-instructions.md`
61
+ - `.gemini/agents/backend-planner.md`
62
+ - `.gemini/agents/frontend-planner.md`