devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,234 @@
1
+ #!/usr/bin/env bash
2
+ # run-iter-0033c.sh — orchestrate the iter-0033c suite (NEW L2 vs NEW L1).
3
+ #
4
+ # Codex R0.5-infra design: bypass run-suite.sh + ship-gate.py + compile-report.py
5
+ # (those enforce variant/bare semantics that don't apply here). Call run-fixture.sh
6
+ # directly per fixture per arm; per-fixture interleaving for fail-early on hard-floor
7
+ # violations (Codex R0.5-infra Q4).
8
+ #
9
+ # Per Mission 1: serial only, no parallel-fleet.
10
+ #
11
+ # Usage:
12
+ # run-iter-0033c.sh --label <label> [--fixtures F1,F2,...] [--c1-summary <path>] [--f9-judge <path>]
13
+ # [--results-root <path>] [--skip-judge]
14
+ #
15
+ # Pre-flight: smoke 1b (codex availability) — fail-fast.
16
+ # Arms per fixture:
17
+ # - All fixtures: solo_claude (L1 rerun) + l2_gated (L2 natural triggers)
18
+ # - Pair-eligible (per manifest): also l2_forced (L2 diagnostic)
19
+ # After arms: judge.sh per fixture; manifest build; iter-0033c-compare.py.
20
# Strict mode: exit on command failure, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# Print the option synopsis to stderr and exit with status 1.
usage() {
  cat >&2 <<EOF
usage: $0 --label <label>
[--fixtures F1,F2,F3,F4,F5,F6,F7,F8,F9]
[--c1-summary <path>] # default: benchmark/auto-resolve/results/3bc86dd-iter0033c1-new-20260501T004229Z/summary.json
[--f9-judge <path>] # default: benchmark/auto-resolve/results/4e3d89a-iter-0033a-f9-smoke3-20260430T232747Z/F9-e2e-ideate-to-resolve/judge.json
[--results-root <path>] # default: benchmark/auto-resolve/results
[--skip-judge] # skip judge.sh (re-runnable post-hoc)
EOF
  exit 1
}

# Bail out through usage() when a value-taking option is the final argument.
# Without this check, reading "$2" under `set -u` aborts the script with an
# "unbound variable" message that never mentions the offending option.
require_value() {
  [ $# -ge 2 ] || usage
}

# Defaults; each one can be overridden by the matching option below.
LABEL=""
FIXTURES_CSV="F1,F2,F3,F4,F5,F6,F7,F8,F9"
C1_SUMMARY="benchmark/auto-resolve/results/3bc86dd-iter0033c1-new-20260501T004229Z/summary.json"
F9_JUDGE="benchmark/auto-resolve/results/4e3d89a-iter-0033a-f9-smoke3-20260430T232747Z/F9-e2e-ideate-to-resolve/judge.json"
RESULTS_ROOT="benchmark/auto-resolve/results"
SKIP_JUDGE=0

while [ $# -gt 0 ]; do
  case "$1" in
    --label)        require_value "$@"; LABEL="$2"; shift 2;;
    --fixtures)     require_value "$@"; FIXTURES_CSV="$2"; shift 2;;
    --c1-summary)   require_value "$@"; C1_SUMMARY="$2"; shift 2;;
    --f9-judge)     require_value "$@"; F9_JUDGE="$2"; shift 2;;
    --results-root) require_value "$@"; RESULTS_ROOT="$2"; shift 2;;
    --skip-judge)   SKIP_JUDGE=1; shift;;
    *)              usage;;
  esac
done

# --label is the only mandatory option.
[ -n "$LABEL" ] || usage
52
+
53
# Resolve the repository root (three levels above this script) and run every
# later step from there, since the helper scripts are addressed by repo-relative
# paths.
REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
cd "$REPO_ROOT"

# --- Smoke 1b: codex availability fail-fast ---
echo "=== Smoke 1b: Codex availability ==="
command -v codex >/dev/null 2>&1 || {
  echo "FAIL: codex not on PATH — iter-0033c L2 arms cannot run" >&2
  exit 1
}
# Report where codex was found plus the first line of its version banner.
printf 'PASS: %s (%s)\n' "$(command -v codex)" "$(codex --version 2>&1 | head -1)"
63
+
64
# --- Mirror committed skills to .claude/skills (parity with run-suite.sh:111-141) ---
# Skills are committed under config/skills/, but the variant-arm runtime loads
# them from .claude/skills/. Refreshing the mirror here keeps edits to SKILL.md,
# phase prompts, or _shared scripts (e.g. the archive_run.py iter-0033c fix)
# from silently running against a stale copy. The skipped workspace names
# mirror the UNSHIPPED list at bin/devlyn.js:299-304.
SRC_SKILLS="$REPO_ROOT/config/skills"
DST_SKILLS="$REPO_ROOT/.claude/skills"
mkdir -p "$DST_SKILLS"
mirrored=0
for src_dir in "$SRC_SKILLS"/*/; do
  # An unmatched glob is left as a literal pattern; skip anything that is not
  # an existing directory.
  if [ ! -d "$src_dir" ]; then
    continue
  fi
  skill="$(basename "$src_dir")"
  case "$skill" in
    devlyn:auto-resolve-workspace | devlyn:ideate-workspace | preflight-workspace | roadmap-archival-workspace)
      continue
      ;;
  esac
  # Copy into a hidden staging directory first, then swap it into place.
  target="$DST_SKILLS/$skill"
  stage="$DST_SKILLS/.${skill}.staging"
  rm -rf "$stage"
  cp -R "$src_dir" "$stage"
  rm -rf "$target"
  mv "$stage" "$target"
  mirrored=$((mirrored + 1))
done
echo "[run-iter-0033c] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
88
+
89
# --- Setup ---
# The run id combines the short HEAD sha, the caller's label, and a UTC
# timestamp; all artifacts of this run live under RESULTS_ROOT/<run id>.
HEAD_SHA="$(git rev-parse --short HEAD)"
TS="$(date -u +%Y%m%dT%H%M%SZ)"
RUN_ID="$HEAD_SHA-iter0033c-$LABEL-$TS"
RESULTS_DIR="$RESULTS_ROOT/$RUN_ID"
mkdir -p "$RESULTS_DIR"
printf '[run-iter-0033c] RUN_ID=%s\n' "$RUN_ID"
printf '[run-iter-0033c] RESULTS_DIR=%s\n' "$RESULTS_DIR"
97
+
98
# --- Determine pair-eligible set from manifest input bundle ---
# Build a draft manifest with the C1 summary standing in for the L1 rerun
# summary; the manifest is rebuilt from the real rerun summary at the end of
# the run. At this point only the pair-eligible set is needed, to choose the
# arms for each fixture.
DRAFT_MANIFEST="$RESULTS_DIR/manifest-draft.json"
python3 benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
  --c1-summary "$C1_SUMMARY" \
  --f9-judge "$F9_JUDGE" \
  --l1-rerun-summary "$C1_SUMMARY" \
  --output "$DRAFT_MANIFEST"

# Hand the manifest path to Python as an argument rather than splicing it into
# the program text: a quote or backslash in --results-root would otherwise
# change (or break) the Python source being executed.
PAIR_ELIGIBLE=$(python3 -c '
import json
import sys

with open(sys.argv[1]) as fh:
    print(" ".join(json.load(fh)["fixtures_pair_eligible"]))
' "$DRAFT_MANIFEST")
echo "[run-iter-0033c] pair-eligible: $PAIR_ELIGIBLE"
110
+
111
# --- Per-fixture interleaved arm loop ---
# All arms of one fixture (and its judge pass) run before the next fixture
# starts. A failing arm or judge is reported and the loop carries on; the
# comparison step at the end of the script is what surfaces failures.
IFS=',' read -ra FIXTURES <<< "$FIXTURES_CSV"
declare -a TIMINGS=()
fixture_script="benchmark/auto-resolve/scripts/run-fixture.sh"
judge_script="benchmark/auto-resolve/scripts/judge.sh"
for fixture_id in "${FIXTURES[@]}"; do
  # Map the short id onto the canonical fixture directory name.
  case "$fixture_id" in
    F1) fixture="F1-cli-trivial-flag" ;;
    F2) fixture="F2-cli-medium-subcommand" ;;
    F3) fixture="F3-backend-contract-risk" ;;
    F4) fixture="F4-web-browser-design" ;;
    F5) fixture="F5-fix-loop-red-green" ;;
    F6) fixture="F6-dep-audit-native-module" ;;
    F7) fixture="F7-out-of-scope-trap" ;;
    F8) fixture="F8-known-limit-ambiguous" ;;
    F9) fixture="F9-e2e-ideate-to-resolve" ;;
    *)
      echo "[run-iter-0033c] unknown fixture short id: $fixture_id" >&2
      exit 1
      ;;
  esac
  printf '\n=== Fixture %s ===\n' "$fixture"

  # Two arms run for every fixture; l2_forced is added only when the short id
  # appears as a whole word in the space-separated pair-eligible list.
  ARMS=("solo_claude" "l2_gated")
  case " $PAIR_ELIGIBLE " in
    *" $fixture_id "*) ARMS+=("l2_forced") ;;
  esac

  for arm in "${ARMS[@]}"; do
    arm_log="$RESULTS_DIR/${fixture}-${arm}.log"
    echo "[run-iter-0033c] $fixture :: $arm START $(date -u +%FT%TZ)"
    started_at=$(date +%s)
    # Arm output goes to its own log; a failure is noted and the next arm runs.
    bash "$fixture_script" \
      --fixture "$fixture" --arm "$arm" \
      --run-id "$RUN_ID" --resolve-skill new \
      > "$arm_log" 2>&1 \
      || echo "[run-iter-0033c] $fixture :: $arm FAILED — see $arm_log"
    finished_at=$(date +%s)
    elapsed=$((finished_at - started_at))
    TIMINGS+=("$fixture:$arm:${elapsed}s")
    echo "[run-iter-0033c] $fixture :: $arm END elapsed=${elapsed}s"
  done

  # Per-fixture judge pass, unless --skip-judge was given.
  if [ "$SKIP_JUDGE" -eq 0 ]; then
    judge_log="$RESULTS_DIR/${fixture}-judge.log"
    echo "[run-iter-0033c] $fixture :: judge START"
    bash "$judge_script" \
      --fixture "$fixture" --run-id "$RUN_ID" \
      > "$judge_log" 2>&1 \
      || echo "[run-iter-0033c] $fixture :: judge FAILED — see $judge_log"
    echo "[run-iter-0033c] $fixture :: judge END"
  fi
done
161
+
162
# --- Build L1 rerun summary from solo_claude arm result.json + judge.json ---
# The inline Python walks every judged fixture directory under RESULTS_DIR and
# writes one row per fixture, with an entry for each arm named in that
# fixture's judge.json blind mapping.
L1_RERUN_SUMMARY="$RESULTS_DIR/l1-rerun-summary.json"
python3 - "$RESULTS_DIR" "$L1_RERUN_SUMMARY" "$RUN_ID" "$HEAD_SHA" <<'PY'
import json
import sys
from pathlib import Path

results_dir, out_path = Path(sys.argv[1]), Path(sys.argv[2])
run_id, head_sha = sys.argv[3], sys.argv[4]

ARM_NAMES = ("solo_claude", "l2_gated", "l2_forced", "bare")
# (summary key, key read from the arm's result.json), in output order.
RESULT_FIELDS = (
    ("wall_s", "elapsed_seconds"),
    ("verify_score", "verify_score"),
    ("files_changed", "files_changed"),
    ("timed_out", "timed_out"),
    ("disqualifier", "disqualifier"),
)


def read_json(path):
    """Parse the JSON document stored at *path*."""
    return json.loads(path.read_text())


rows = []
for fx_dir in sorted(results_dir.iterdir()):
    judge_path = fx_dir / "judge.json"
    # Only directories that contain a judge.json contribute a row.
    if not (fx_dir.is_dir() and judge_path.is_file()):
        continue
    judge = read_json(judge_path)
    # _blind_mapping is keyed by blind label with the arm name as value; flip
    # it so the label can be looked up by arm name.
    label_for = {
        arm: label
        for label, arm in (judge.get("_blind_mapping") or {}).items()
    }
    arms = {}
    for arm_name in ARM_NAMES:
        label = label_for.get(arm_name)
        if not label:
            continue
        result_path = fx_dir / arm_name / "result.json"
        # A missing result.json leaves every result-derived field as null.
        result = read_json(result_path) if result_path.is_file() else {}
        entry = {"score": judge.get(f"{label}_score")}
        entry.update(
            (out_key, result.get(src_key)) for out_key, src_key in RESULT_FIELDS
        )
        arms[arm_name] = entry
    rows.append({"fixture": fx_dir.name, "arms": arms})

summary = {
    "run_id": run_id,
    "git_sha": head_sha,
    "fixtures_total": len(rows),
    "rows": rows,
}
out_path.write_text(json.dumps(summary, indent=2) + "\n")
print(f"[l1-rerun-summary] wrote {out_path} (fixtures={len(rows)})")
PY
208
+
209
# --- Build final manifest with real L1 rerun summary ---
FINAL_MANIFEST="$RESULTS_DIR/iter-0033c-pair-eligible.json"
manifest_args=(
  --c1-summary "$C1_SUMMARY"
  --f9-judge "$F9_JUDGE"
  --l1-rerun-summary "$L1_RERUN_SUMMARY"
  --output "$FINAL_MANIFEST"
)
python3 benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py "${manifest_args[@]}"

# --- Run iter-0033c gate compare ---
GATES_JSON="$RESULTS_DIR/gates.json"
GATES_MD="$RESULTS_DIR/gates.md"
compare_args=(
  --manifest "$FINAL_MANIFEST"
  --results-dir "$RESULTS_DIR"
  --work-dir-root /tmp
  --run-id "$RUN_ID"
  --out-json "$GATES_JSON"
  --out-md "$GATES_MD"
)
# Gates may FAIL; a non-zero exit is tolerated here and the verdict is read
# from gates.json instead.
python3 benchmark/auto-resolve/scripts/iter-0033c-compare.py "${compare_args[@]}" || true

# Closing summary: artifact locations followed by one wall-time line per arm.
cat <<EOF

=== iter-0033c done ===
RESULTS_DIR=$RESULTS_DIR
MANIFEST=$FINAL_MANIFEST
GATES=$GATES_MD
EOF
printf '\n--- per-arm wall ---\n'
printf '%s\n' "${TIMINGS[@]}"
@@ -0,0 +1,214 @@
1
+ #!/usr/bin/env bash
2
+ # run-suite.sh — the single-command benchmark entry.
3
+ #
4
+ # Orchestrates: fixture setup + arm invocations + blind judge + report + ship
5
+ # gate. Called by `npx devlyn-cli benchmark` as well as directly.
6
+ #
7
+ # Usage:
8
+ # run-suite.sh # all fixtures, n=1 smoke
9
+ # run-suite.sh --n 3 # 3 runs per fixture for ship decisions
10
+ # run-suite.sh F2 F5 # specific fixtures only
11
+ # run-suite.sh --dry-run # skip model invocations, validate setup
12
+ # run-suite.sh --judge-only --run-id X # re-judge an existing run
13
+ # run-suite.sh --label v3.6 # tag this run
14
+ # run-suite.sh --bless # if ship-gate PASS, promote to baselines/shipped.json
15
+ # run-suite.sh --resolve-skill new # invoke /devlyn:resolve --spec (the only supported value post iter-0034 cutover; flag kept as accepted no-op for historical runners)
16
+ #
17
+ # Exits 0 on PASS, 1 on FAIL, 2 when --n is given an unsupported value.
18
+
19
# Strict mode: exit on command failure, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# BENCH_ROOT is the directory above this script's directory; REPO_ROOT is two
# levels above BENCH_ROOT.
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"

# Option defaults.
N=1
LABEL=""
DRY_RUN=0
JUDGE_ONLY=0
RUN_ID_ARG=""
BLESS=0
ACCEPT_MISSING=0
SUITE="golden"
RESOLVE_SKILL="new"
FIXTURES=()

# Print the header comment block of this script: every line after the shebang
# up to the first line that is not a comment. Selecting by content instead of
# by fixed line numbers keeps code lines (e.g. `set -euo pipefail`) out of the
# help text even when the header grows or shrinks.
print_help() {
  awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
}

# Exit with a clear message when a value-taking option is the last argument;
# otherwise reading "$2" under `set -u` fails with "unbound variable".
require_value() {
  [ $# -ge 2 ] || { echo "missing value for $1" >&2; exit 1; }
}

while [ $# -gt 0 ]; do
  case "$1" in
    --n)              require_value "$@"; N="$2"; shift 2;;
    --label)          require_value "$@"; LABEL="$2"; shift 2;;
    --dry-run)        DRY_RUN=1; shift;;
    --judge-only)     JUDGE_ONLY=1; shift;;
    --run-id)         require_value "$@"; RUN_ID_ARG="$2"; shift 2;;
    --bless)          BLESS=1; shift;;
    --accept-missing) ACCEPT_MISSING=1; shift;;
    --suite)          require_value "$@"; SUITE="$2"; shift 2;;
    --resolve-skill)  require_value "$@"; RESOLVE_SKILL="$2"; shift 2;;
    -h|--help)
      print_help; exit 0;;
    # Positional fixture ids: "F" or "S" followed by a digit.
    [FS][0-9]*)       FIXTURES+=("$1"); shift;;
    *)
      echo "unknown arg: $1" >&2; exit 1;;
  esac
done
53
+
54
# iter-0034 Phase 4 cutover (2026-05-03): the OLD `/devlyn:auto-resolve` skill
# was deleted, so `new` (= /devlyn:resolve --spec) is the only accepted value.
# The flag itself survives as a no-op so historical runners such as
# run-iter-0033c.sh keep working unedited. `old` gets a dedicated error that
# points at the cutover; any other value gets the generic one.
case "$RESOLVE_SKILL" in
  new)
    ;;
  old)
    echo "--resolve-skill old is no longer supported: /devlyn:auto-resolve was deleted in the iter-0034 Phase 4 cutover. Use --resolve-skill new (default) or omit the flag." >&2
    exit 1
    ;;
  *)
    echo "--resolve-skill must be 'new' (got '$RESOLVE_SKILL')" >&2
    exit 1
    ;;
esac
64
+
65
# Suite → fixtures directory + discovery glob.
case "$SUITE" in
  golden) FIXTURES_DIR="$BENCH_ROOT/fixtures"; FIXTURES_GLOB="F*";;
  shadow) FIXTURES_DIR="$BENCH_ROOT/shadow-fixtures"; FIXTURES_GLOB="S*";;
  *) echo "error: --suite must be 'golden' or 'shadow' (got '$SUITE')" >&2; exit 1;;
esac

# Reject a non-numeric --n before the numeric comparison below. `[ abc -ne 1 ]`
# is an error, not "true", so without this check a bad value would skip the
# guard and the run would continue.
case "$N" in
  '' | *[!0-9]*)
    echo "error: --n must be a non-negative integer (got '$N')" >&2
    exit 2
    ;;
esac

# n must be 1 while iteration semantics aren't wired through judge/report.
# Remove this block when compile-report.py gains multi-iter aggregation.
if [ "$N" -ne 1 ]; then
  echo "error: --n $N not yet supported — judge/report currently expect a single iteration per fixture." >&2
  echo " Track progress in benchmark/auto-resolve/BENCHMARK-DESIGN.md (#multi-iter-roadmap)." >&2
  exit 2
fi

# Auto-discover fixtures if none were named on the command line: every
# directory under FIXTURES_DIR that matches the suite's glob.
if [ ${#FIXTURES[@]} -eq 0 ]; then
  for d in "$FIXTURES_DIR"/$FIXTURES_GLOB/; do
    [ -d "$d" ] && FIXTURES+=("$(basename "$d")")
  done
fi

# Nothing named and nothing discovered: there is no work to do.
if [ ${#FIXTURES[@]} -eq 0 ]; then
  echo "no fixtures found in $FIXTURES_DIR/ — build the suite first" >&2
  exit 1
fi
91
+
92
# RUN_ID: use the caller's --run-id when given; otherwise derive one from a
# UTC timestamp, the short HEAD sha ("nogit" when git cannot supply one), and
# the optional label.
RUN_ID="$RUN_ID_ARG"
if [ -z "$RUN_ID" ]; then
  TS=$(date -u +%Y%m%dT%H%M%SZ)
  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
  RUN_ID="${TS}-${SHA}${LABEL:+-$LABEL}"
fi

RES_DIR="$BENCH_ROOT/results/$RUN_ID"
mkdir -p "$RES_DIR"

# Banner describing this run.
cat <<EOF

═══ Benchmark Suite Run ═══
Run-id: $RUN_ID
Label: ${LABEL:-(unlabeled)}
Suite: $SUITE ($FIXTURES_DIR)
Fixtures: ${FIXTURES[*]}
n: $N
Resolve skill: $RESOLVE_SKILL
EOF
if [ $DRY_RUN -eq 1 ]; then
  echo "Mode: DRY RUN (no model invocations)"
fi
if [ $JUDGE_ONLY -eq 1 ]; then
  echo "Mode: JUDGE ONLY (re-judging existing artifacts)"
fi
echo ""
115
+
116
# ---- Mirror committed skills into .claude/skills (iter-0017) --------------
# The variant arm reads $REPO_ROOT/.claude/skills/, while iteration commits
# land in config/skills/. Without this step, any checkout or revert touching
# SKILL.md or phase prompts needs a manual `node bin/devlyn.js -y` (or a
# surgical cp); forgetting it silently runs the suite against stale skills.
# This replicates the clean-then-copy semantics of bin/devlyn.js
# (cleanManagedSkillDirs ~L313 + copyRecursive ~L274). Each skill is copied to
# a staging dir and then moved into place, so an interrupted run is less likely
# to leave a managed skill missing.
# The skipped names mirror the UNSHIPPED list at bin/devlyn.js:299-304 — keep
# them in sync. The mirror is skipped only for --judge-only (no model
# invocations); it still runs for --dry-run.
if [ $JUDGE_ONLY -eq 0 ]; then
  SRC_SKILLS="$REPO_ROOT/config/skills"
  DST_SKILLS="$REPO_ROOT/.claude/skills"
  mkdir -p "$DST_SKILLS"
  mirrored=0
  for src_dir in "$SRC_SKILLS"/*/; do
    # An unmatched glob is left as a literal pattern; skip non-directories.
    if [ ! -d "$src_dir" ]; then
      continue
    fi
    skill="$(basename "$src_dir")"
    case "$skill" in
      devlyn:auto-resolve-workspace | devlyn:ideate-workspace | preflight-workspace | roadmap-archival-workspace)
        continue
        ;;
    esac
    target="$DST_SKILLS/$skill"
    stage="$DST_SKILLS/.${skill}.staging"
    rm -rf "$stage"
    cp -R "$src_dir" "$stage"
    rm -rf "$target"
    mv "$stage" "$target"
    mirrored=$((mirrored + 1))
  done
  echo "[suite] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
fi
147
+
148
# Prereq checks. Diagnostics go to stderr, matching the other error paths in
# this script.
# The claude CLI is only needed when arms are actually invoked.
if [ $DRY_RUN -eq 0 ] && [ $JUDGE_ONLY -eq 0 ]; then
  command -v claude >/dev/null 2>&1 || { echo "claude CLI missing; install Claude Code first" >&2; exit 1; }
fi
# Warn about a missing codex CLI in every mode. The warning concerns the judge
# step, which also runs under --judge-only, so that mode must not skip it.
command -v codex >/dev/null 2>&1 || echo "warning: codex CLI missing — judge will fail" >&2
command -v python3 >/dev/null 2>&1 || { echo "python3 missing" >&2; exit 1; }

# Install test-repo deps once per suite run (shared cache). An existing
# node_modules directory is taken to mean the install already happened.
if [ $DRY_RUN -eq 0 ] && [ $JUDGE_ONLY -eq 0 ]; then
  TEST_REPO="$BENCH_ROOT/fixtures/test-repo"
  if [ ! -d "$TEST_REPO/node_modules" ]; then
    echo "[suite] installing test-repo deps (one-time)"
    if ! (cd "$TEST_REPO" && npm install --no-audit --no-fund --loglevel=error); then
      echo "[suite] ✗ npm install in test-repo failed — check network/npm auth. Aborting." >&2
      exit 1
    fi
  fi
fi
168
+
169
# ---- Run arms ---------------------------------------------------------------
# Each fixture is run once per arm (variant, solo_claude, bare). A failing arm
# is reported and the loop continues. The whole section is skipped under
# --judge-only.
if [ $JUDGE_ONLY -eq 0 ]; then
  # --dry-run is forwarded to run-fixture.sh; left empty (and deliberately
  # unquoted at the call site) when not in dry-run mode.
  dry_run_flag=""
  if [ $DRY_RUN -eq 1 ]; then
    dry_run_flag="--dry-run"
  fi
  for fid in "${FIXTURES[@]}"; do
    if [ ! -d "$FIXTURES_DIR/$fid" ]; then
      echo "[suite] skip $fid (missing)"
      continue
    fi
    for arm in variant solo_claude bare; do
      echo "[suite] ► $fid / $arm (resolve-skill=$RESOLVE_SKILL)"
      if ! bash "$BENCH_ROOT/scripts/run-fixture.sh" \
        --fixture "$fid" --arm "$arm" --run-id "$RUN_ID" \
        --resolve-skill "$RESOLVE_SKILL" $dry_run_flag; then
        echo "[suite] ✗ $fid / $arm (arm failure tolerated; artifacts still captured)"
      fi
    done
  done
fi
184
+
185
# ---- Judge ------------------------------------------------------------------
# Judge each fixture that has a results directory for this run. Dry runs skip
# the judge, and a failing judge is reported without stopping the loop.
for fid in "${FIXTURES[@]}"; do
  if [ ! -d "$BENCH_ROOT/results/$RUN_ID/$fid" ]; then
    echo "[suite] skip judge for $fid (no results)"
  elif [ $DRY_RUN -eq 1 ]; then
    echo "[suite] DRY RUN — skipping judge for $fid"
  else
    echo "[suite] ► judge $fid"
    if ! bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID"; then
      echo "[suite] ✗ judge failed for $fid (will appear as NO_JUDGE in report)"
    fi
  fi
done
199
+
200
# ---- Compile report + ship gate --------------------------------------------
# A dry run stops here: there is nothing to report on or gate.
if [ $DRY_RUN -eq 1 ]; then
  printf '\n[suite] DRY RUN complete — results in %s\n' "$RES_DIR"
  echo "Run without --dry-run to invoke models."
  exit 0
fi

echo ""
report_args=(--run-id "$RUN_ID")
if [ -n "$LABEL" ]; then
  report_args+=(--label "$LABEL")
fi
python3 "$BENCH_ROOT/scripts/compile-report.py" "${report_args[@]}"

# Forward --bless / --accept-missing to the ship gate when they were requested.
# The gate is the last command, so its exit status becomes the script's.
gate_args=(--run-id "$RUN_ID")
if [ $BLESS -eq 1 ]; then
  gate_args+=(--bless)
fi
if [ $ACCEPT_MISSING -eq 1 ]; then
  gate_args+=(--accept-missing)
fi
python3 "$BENCH_ROOT/scripts/ship-gate.py" "${gate_args[@]}"
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ship-gate.py — apply RUBRIC.md ship thresholds to a suite run's summary.json.
4
+
5
+ Usage:
6
+ ship-gate.py --run-id <ID> # check gates, return 0/1 via exit code
7
+ ship-gate.py --run-id <ID> --bless # if PASS, promote summary to baselines/shipped.json
8
+
9
+ Exits 0 on PASS, 1 on FAIL.
10
+ """
11
+ from __future__ import annotations
12
+ import argparse, json, pathlib, sys, shutil, datetime
13
+
14
+
15
def _signed(value) -> str:
    """Render a margin/delta with an explicit sign.

    Integers keep the original ``+d`` rendering; floats use one decimal place
    (``+d`` raises ValueError on floats); anything non-numeric (for example a
    ``None`` margin on a row that was never judged) renders as ``n/a``
    instead of raising.
    """
    if isinstance(value, int):
        return f"{value:+d}"
    if isinstance(value, float):
        return f"{value:+.1f}"
    return "n/a"


def _apply_l1_gates(summary: dict, f9_row, accept_missing: bool,
                    failures: list, warnings: list) -> None:
    """Append L1 (solo_claude vs bare) gate results to *failures* / *warnings*.

    Does nothing unless ``summary["arms_present"]["solo_claude"]`` is truthy.
    *f9_row* is the F9 summary row, or None when that fixture is absent.
    """
    # iter-0023 — L1 (solo_claude) gates per NORTH-STAR.md ops test #1.
    # Codex R1 caught that ship-gate enforced only the legacy L2 `variant`
    # margin and never read `solo_over_bare`. NORTH-STAR's documented L1 floor
    # (≥ +5, ≥ 7/9 fixtures, F9 ≥ +5, no L1 disqualifier) is enforced here.
    arms_present = summary.get("arms_present", {})
    margins_avg = summary.get("margins_avg", {})
    if not arms_present.get("solo_claude"):
        return

    l1_avg = margins_avg.get("solo_over_bare")
    if l1_avg is not None and l1_avg < 5:
        warnings.append(
            f"L1 (solo_over_bare) suite avg {l1_avg:+.1f} below NORTH-STAR floor +5 "
            "(reporting only — per-fixture L1 gates below are decisive)"
        )

    # F9 L1 floor
    if f9_row is not None:
        f9_l1 = (f9_row.get("margins") or {}).get("solo_over_bare")
        if f9_l1 is None:
            if not accept_missing:
                failures.append("F9 L1 (solo_over_bare) margin missing — measurement invalid")
        elif f9_l1 < 5:
            failures.append(f"F9 L1 (solo_over_bare) margin {_signed(f9_l1)} < +5 floor")

    # 7-of-9 L1 floor — headroom-aware (added 2026-05-02 per iter-0033 R4
    # Codex collab + NORTH-STAR amendment + RUBRIC hard-floor 3 update).
    # A fixture is excluded from the denominator when 100 - L0_score < 5
    # AND L1_score >= 95 AND the L1 arm has no disqualifier and no
    # out-of-range axis cells. Excluded fixtures become fixture-rotation
    # candidates if RUBRIC's two-shipped-version saturation rule fires.
    # NOTE(review): the pass threshold stays at 7 even when exclusions shrink
    # the denominator below 7, which makes this gate unpassable without
    # --accept-missing in that case — confirm against RUBRIC.md.
    l1_ge_5 = 0
    l1_gated = 0
    l1_excluded_headroom = []
    for r in summary.get("rows", []):
        if (r.get("category") or "").lower() == "known-limit":
            continue
        arms = r.get("arms") or {}
        l0 = arms.get("bare") or {}
        l1 = arms.get("solo_claude") or {}
        l0_score = l0.get("score")
        l1_score = l1.get("score")
        m = (r.get("margins") or {}).get("solo_over_bare")
        if m is None:
            continue
        # Headroom carve-out — must satisfy ALL conditions:
        #   (a) bare ceiling-near (100 - L0 < 5)
        #   (b) L1 also ceiling-near (>= 95)
        #   (c) L1 arm clean (no disqualifier, no axis-invalid cells)
        l1_dq_here = bool(l1.get("disqualifier"))
        l1_axis_inv = (l1.get("_axis_validation_out_of_range_count") or 0) > 0
        if (
            isinstance(l0_score, (int, float)) and isinstance(l1_score, (int, float))
            and (100 - l0_score) < 5 and l1_score >= 95
            and not l1_dq_here and not l1_axis_inv
        ):
            l1_excluded_headroom.append({
                "fixture": r.get("fixture"),
                "l0_score": l0_score,
                "l1_score": l1_score,
                "margin": m,
            })
            continue
        l1_gated += 1
        if m >= 5:
            l1_ge_5 += 1
    if l1_gated > 0 and l1_ge_5 < 7 and not accept_missing:
        failures.append(
            f"L1: only {l1_ge_5} of {l1_gated} headroom-available fixtures have solo_over_bare ≥ +5 (need ≥ 7)"
        )
    if l1_excluded_headroom:
        warnings.append(
            "L1 headroom-excluded (saturation candidates per RUBRIC two-shipped-version rule): "
            + ", ".join(
                f"{x['fixture']} (L0={x['l0_score']} L1={x['l1_score']} margin={_signed(x['margin'])})"
                for x in l1_excluded_headroom
            )
        )

    # L1 disqualifier floor
    l1_dq = sum(
        1 for r in summary.get("rows", [])
        if ((r.get("arms") or {}).get("solo_claude") or {}).get("disqualifier")
    )
    if l1_dq > 0:
        failures.append(f"L1 disqualifier(s): {l1_dq} solo_claude arm(s) hit a disqualifier")

    # L1 axis-validity gate: a row whose solo_claude arm reports out-of-range
    # axis cells has an untrustworthy L1 score.
    l1_axis_invalid = sum(
        1 for r in summary.get("rows", [])
        if (((r.get("arms") or {}).get("solo_claude") or {})
            .get("_axis_validation_out_of_range_count") or 0) > 0
    )
    if l1_axis_invalid > 0:
        failures.append(
            f"L1 axis-invalid: {l1_axis_invalid} fixture(s) have out-of-range axis cells — "
            "re-judge before trusting L1 margins"
        )


def _apply_baseline_gates(summary: dict, baseline: dict,
                          failures: list, warnings: list) -> None:
    """Compare *summary* against the shipped *baseline* (hard floor 4 + soft gates)."""
    # Soft gate: suite average margin drop > 3. Skipped when either average is
    # not a number (e.g. nothing was judged) instead of raising TypeError.
    cur_avg = summary.get("margin_avg")
    prev_avg = baseline.get("margin_avg", 0)
    if isinstance(cur_avg, (int, float)) and isinstance(prev_avg, (int, float)):
        margin_delta = cur_avg - prev_avg
        if margin_delta < -3:
            warnings.append(f"suite margin dropped {margin_delta:+.1f} vs shipped (soft gate: > −3)")

    prev_rows = {r["fixture"]: r for r in baseline.get("rows", []) if "fixture" in r}
    for r in summary["rows"]:
        fid = r.get("fixture")
        prev = prev_rows.get(fid)
        if not prev:
            continue

        # Hard floor 4: no per-fixture regression worse than −5 vs shipped baseline
        cur_score = r.get("variant_score")
        prev_score = prev.get("variant_score")
        if cur_score is not None and prev_score is not None:
            delta = cur_score - prev_score
            if delta < -5:
                failures.append(f"{fid} regressed {_signed(delta)} vs shipped (floor: −5)")

        # Soft gate: any fixture that was > +5 before is now ≤ 0. A missing
        # current margin counts as ≤ 0, so it must be rendered safely.
        prev_margin = prev.get("margin")
        cur_margin = r.get("margin")
        if (prev_margin or 0) > 5 and (cur_margin or 0) <= 0:
            warnings.append(
                f"{fid} lost its margin: was {_signed(prev_margin)}, now {_signed(cur_margin)}"
            )


def main() -> int:
    """Apply the ship-gate thresholds to one suite run and print a verdict.

    Reads ``results/<run-id>/summary.json`` (relative to the parent of this
    script's directory) and, when present, ``history/baselines/shipped.json``.
    With ``--bless`` and a PASS verdict, the summary is copied over the
    shipped baseline.

    Returns:
        0 when no hard-floor failure was recorded (warnings do not fail the
        gate), 1 otherwise — including a missing or unreadable summary.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--run-id", required=True)
    p.add_argument("--bless", action="store_true")
    p.add_argument("--accept-missing", action="store_true",
                   help="skip hard-floor gates that require fixtures not yet implemented "
                        "(F9 and the 7-of-9 count) — only for suites in bootstrap")
    args = p.parse_args()

    root = pathlib.Path(__file__).resolve().parent.parent
    summary_p = root / "results" / args.run_id / "summary.json"
    if not summary_p.exists():
        print(f"no summary at {summary_p}", file=sys.stderr)
        return 1
    try:
        summary = json.loads(summary_p.read_text())
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        print(f"unreadable summary at {summary_p}: {err}", file=sys.stderr)
        return 1

    failures: list[str] = []
    warnings: list[str] = []

    baseline_p = root / "history" / "baselines" / "shipped.json"
    baseline = None
    if baseline_p.exists():
        try:
            baseline = json.loads(baseline_p.read_text())
        except (OSError, ValueError) as err:
            # Best-effort load, but say so: without a baseline the regression
            # gates below cannot run, and that must not pass silently.
            warnings.append(
                f"shipped baseline at {baseline_p} is unreadable ({err}) — "
                "regression gates vs shipped were skipped"
            )
        else:
            if baseline is not None and not isinstance(baseline, dict):
                warnings.append(
                    f"shipped baseline at {baseline_p} is not a JSON object — "
                    "regression gates vs shipped were skipped"
                )
                baseline = None

    # Hard floor 1: no disqualifier in variant
    if summary["hard_floor_violations"] > 0:
        failures.append(f"{summary['hard_floor_violations']} variant disqualifier(s) — see report")

    # Hard floor 2: F9 must pass (skipped during bootstrap via --accept-missing)
    # Variant arm legacy gate kept for L2 baseline comparability.
    # iter-0033a (2026-04-30): renamed F9 dir from -to-preflight to -to-resolve to
    # match the shipped 2-skill contract (no preflight). The OLD pre-rename id
    # is preserved in fixtures/retired/ for replay.
    f9_row = next((r for r in summary["rows"] if r.get("fixture") == "F9-e2e-ideate-to-resolve"), None)
    if f9_row is None:
        if not args.accept_missing:
            failures.append("F9 (E2E novice flow) missing — add fixture or run with --accept-missing")
    elif (f9_row.get("margin") or -999) < 5:
        failures.append("F9 (E2E novice flow) must have variant margin ≥ +5")

    # Hard floor 3: ≥ 7 of 9 gated fixtures with margin ≥ +5
    # (skipped during bootstrap via --accept-missing)
    if summary["gated_fixtures"] > 0 and summary["margin_ge_5_count"] < 7:
        if not args.accept_missing:
            failures.append(
                f"only {summary['margin_ge_5_count']} of {summary['gated_fixtures']} "
                f"gated fixtures have variant margin ≥ +5 (need ≥ 7)"
            )

    _apply_l1_gates(summary, f9_row, args.accept_missing, failures, warnings)

    if baseline:
        _apply_baseline_gates(summary, baseline, failures, warnings)

    verdict = "PASS" if not failures else "FAIL"
    print(f"\n═══ SHIP-GATE VERDICT: {verdict} ═══\n")
    if failures:
        print("Hard-floor failures:")
        for f in failures:
            print(f"  ✗ {f}")
        print()
    if warnings:
        print("Soft-gate warnings:")
        for w in warnings:
            print(f"  ⚠ {w}")
        print()
    if not failures and not warnings:
        print("No gate violations. Suite is ship-ready.")

    # Bless if PASS + --bless — opt-in promotion to shipped baseline.
    # Per BENCHMARK-DESIGN.md Karpathy Check, automatic history mutation is
    # deferred until after the suite format stabilizes; `--bless` stays as
    # the explicit promotion path, and `summary.json` inside the run dir
    # is the durable record for ad-hoc inspection.
    if verdict == "PASS" and args.bless:
        baseline_p.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(summary_p, baseline_p)
        print(f"\nBlessed: {baseline_p}")

    return 0 if verdict == "PASS" else 1
219
+
220
+
221
+ if __name__ == "__main__":
222
+ sys.exit(main())