devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,331 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ compile-report.py — aggregate one run's fixture artifacts into a summary.
4
+
5
+ Usage:
6
+ compile-report.py --run-id <ID> [--label <VERSION>]
7
+
8
+ Reads: benchmark/auto-resolve/results/<run-id>/<fixture>/{judge.json, variant/result.json, bare/result.json}
9
+ Writes:
10
+ results/<run-id>/summary.json (machine)
11
+ results/<run-id>/report.md (human)
12
+
13
+ The report is the output of `npx devlyn-cli benchmark`. Ship-gate.py consumes summary.json.
14
+ """
15
+ from __future__ import annotations
16
+ import argparse, json, pathlib, sys, subprocess, datetime
17
+
18
+
19
def git_sha() -> str:
    """Return the current HEAD commit hash, or "unknown" outside a git checkout."""
    try:
        head = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True)
    except Exception:
        # Best-effort: git missing or not a repo should not break reporting.
        return "unknown"
    return head.strip()
24
+
25
+
26
def git_branch() -> str:
    """Return the current git branch name, or "unknown" outside a git checkout."""
    try:
        branch = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"], text=True
        )
    except Exception:
        # Best-effort: git missing or not a repo should not break reporting.
        return "unknown"
    return branch.strip()
31
+
32
+
33
def main() -> int:
    """Aggregate one run's fixture artifacts into summary.json + report.md.

    Reads results/<run-id>/<fixture>/{judge.json, <arm>/result.json} plus each
    fixture's metadata.json, writes a machine summary (consumed by
    ship-gate.py) and a human-readable markdown report, and prints the report
    to stdout.

    Returns:
        0 on success, 1 when the results directory for --run-id is missing.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--run-id", required=True)
    p.add_argument("--label", default=None, help="version label, e.g. v3.6")
    args = p.parse_args()

    bench_root = pathlib.Path(__file__).resolve().parent.parent
    res_root = bench_root / "results" / args.run_id
    if not res_root.is_dir():
        print(f"no results dir: {res_root}", file=sys.stderr)
        return 1

    def wall_ratio(numer, denom):
        # Missing or zero wall time means "unknown" — a ratio would be
        # meaningless (and a zero denominator would divide by zero).
        if numer and denom:
            return round(numer / denom, 2)
        return None

    fixtures = sorted(d.name for d in res_root.iterdir() if d.is_dir())
    rows = []
    for fid in fixtures:
        fdir = res_root / fid
        judge_path = fdir / "judge.json"
        if not judge_path.exists():
            rows.append({"fixture": fid, "status": "NO_JUDGE", "reason": "judge.json missing"})
            continue
        judge = json.loads(judge_path.read_text(encoding="utf-8"))
        # iter-0019: 3-arm aware. judge.json now carries scores_by_arm /
        # findings_by_arm / disqualifiers_by_arm / margins. Older judge.json
        # (pre-iter-0019, only variant_score + bare_score) is handled by
        # falling back to legacy fields.
        scores_by_arm = judge.get("scores_by_arm") or {}
        if not scores_by_arm:
            if "variant_score" in judge:
                scores_by_arm["variant"] = judge["variant_score"]
            if "bare_score" in judge:
                scores_by_arm["bare"] = judge["bare_score"]

        findings_by_arm = judge.get("findings_by_arm") or {}
        dq_by_arm = judge.get("disqualifiers_by_arm") or {}
        margins = judge.get("margins") or {}

        arm_results = {}
        for arm in ("variant", "solo_claude", "bare"):
            res_p = fdir / arm / "result.json"
            arm_results[arm] = json.loads(res_p.read_text(encoding="utf-8")) if res_p.exists() else {}

        meta_p = bench_root / "fixtures" / fid / "metadata.json"
        category = "unknown"
        if meta_p.exists():
            try:
                category = json.loads(meta_p.read_text(encoding="utf-8")).get("category", "unknown")
            except Exception:
                # Best-effort: a malformed metadata.json must not kill the report.
                pass

        def arm_dq_judge(arm: str) -> bool:
            # Judge-side (subjective) disqualifier: prefer the new dq_by_arm
            # map, fall back to the legacy blind A/B/C-letter shape if present.
            if arm in dq_by_arm:
                return bool(dq_by_arm[arm].get("disqualifier", False))
            mapping = judge.get("_blind_mapping", {}) or {}
            for letter in ("A", "B", "C"):
                if mapping.get(letter) == arm:
                    return bool((judge.get("disqualifiers", {}) or {}).get(letter, False))
            return False

        # Per-arm payload — arm absent = scores_by_arm key absent, downstream
        # consumers null-check.
        arms_block = {}
        for arm in ("variant", "solo_claude", "bare"):
            r = arm_results.get(arm) or {}
            judge_dq = arm_dq_judge(arm)
            det_dq = bool(r.get("disqualifier", False))
            arms_block[arm] = {
                "score": scores_by_arm.get(arm),
                "wall_s": r.get("elapsed_seconds"),
                "verify_score": r.get("verify_score"),
                "files_changed": r.get("files_changed"),
                "timed_out": bool(r.get("timed_out", False)),
                # Final flag = OR of deterministic result.json flag and the
                # judge's subjective flag.
                "disqualifier": judge_dq or det_dq,
                "dq_judge": judge_dq,
                "dq_deterministic": det_dq,
                "critical_findings": findings_by_arm.get(arm, []) if findings_by_arm else [],
            }

        def m(left, right, key):
            # Pairwise margin. Prefer judge-side margins (single calibrated
            # scoring) over arithmetic differences, but fall through to
            # compute from scores_by_arm if the judge didn't emit margins.
            if margins.get(key) is not None:
                return margins[key]
            lhs = scores_by_arm.get(left)
            rhs = scores_by_arm.get(right)
            if lhs is None or rhs is None:
                return None
            return lhs - rhs

        row = {
            "fixture": fid,
            "category": category,
            "arms": arms_block,
            # Pairwise margins (positive = first arm beat second).
            "margins": {
                "variant_over_bare": m("variant", "bare", "variant_over_bare"),
                "solo_over_bare": m("solo_claude", "bare", "solo_over_bare"),
                "variant_over_solo": m("variant", "solo_claude", "variant_over_solo"),
            },
            # Wall ratios per pairwise comparison (NORTH-STAR.md tests #2/#7
            # generalized): each layer must beat previous-layer-best-of-N.
            "wall_ratios": {
                "variant_over_bare": wall_ratio(arms_block["variant"]["wall_s"], arms_block["bare"]["wall_s"]),
                "solo_over_bare": wall_ratio(arms_block["solo_claude"]["wall_s"], arms_block["bare"]["wall_s"]),
                "variant_over_solo": wall_ratio(arms_block["variant"]["wall_s"], arms_block["solo_claude"]["wall_s"]),
            },
            "winner": judge.get("winner_arm"),
            # Legacy fields preserved so older summary readers still parse.
            "variant_score": arms_block["variant"]["score"],
            "bare_score": arms_block["bare"]["score"],
            "margin": m("variant", "bare", "variant_over_bare"),
            "variant_disqualifier": arms_block["variant"]["disqualifier"],
            "variant_dq_judge": arms_block["variant"]["dq_judge"],
            "variant_dq_deterministic": arms_block["variant"]["dq_deterministic"],
            "variant_wall_s": arms_block["variant"]["wall_s"],
            "bare_wall_s": arms_block["bare"]["wall_s"],
            "wall_ratio_variant_over_bare": wall_ratio(
                arms_block["variant"]["wall_s"], arms_block["bare"]["wall_s"]),
            "variant_verify_score": arms_block["variant"]["verify_score"],
            "bare_verify_score": arms_block["bare"]["verify_score"],
            "variant_files_changed": arms_block["variant"]["files_changed"],
            "bare_files_changed": arms_block["bare"]["files_changed"],
            "critical_findings_variant": arms_block["variant"]["critical_findings"],
            "critical_findings_bare": arms_block["bare"]["critical_findings"],
        }
        rows.append(row)

    # ---- Aggregate ----
    scored = [r for r in rows if r.get("variant_score") is not None]
    excluded_known_limit = [r for r in scored if r.get("category") == "edge"]  # F8 and similar
    gated_rows = [r for r in scored if r.get("category") != "edge"]

    def avg(values):
        vals = [v for v in values if v is not None]
        return round(sum(vals) / len(vals), 1) if vals else None

    # iter-0019: per-arm averages (whichever arms ran).
    arm_avg = {}
    for arm in ("variant", "solo_claude", "bare"):
        arm_avg[arm] = avg([(r.get("arms", {}).get(arm) or {}).get("score") for r in scored])

    # Pairwise margin averages (positive = first arm wins on average).
    def margin_avg(key):
        return avg([(r.get("margins") or {}).get(key) for r in scored])

    margins_avg = {
        "variant_over_bare": margin_avg("variant_over_bare"),
        "solo_over_bare": margin_avg("solo_over_bare"),
        "variant_over_solo": margin_avg("variant_over_solo"),
    }

    # Pairwise wall-ratio averages.
    def wall_avg(key):
        return avg([(r.get("wall_ratios") or {}).get(key) for r in scored])

    wall_ratio_avg_by_pair = {
        "variant_over_bare": wall_avg("variant_over_bare"),
        "solo_over_bare": wall_avg("solo_over_bare"),
        "variant_over_solo": wall_avg("variant_over_solo"),
    }

    # margin_ge_5_count over the iter-0018-era variant-vs-bare metric,
    # because the legacy ship-gate.py reads that. Pair-aware gates get
    # added in iter-0021 / 0022 once the data shape stabilizes.
    margin_ge_5 = sum(1 for r in gated_rows if (r.get("margin") or 0) >= 5)
    disqualifier_count = sum(1 for r in scored if r.get("variant_disqualifier"))

    # arm-presence flags so consumers know whether the iter is 2-arm legacy
    # or 3-arm post-iter-0019.
    has_solo = any((r.get("arms", {}).get("solo_claude") or {}).get("score") is not None for r in scored)

    summary = {
        "run_id": args.run_id,
        "label": args.label,
        "git_sha": git_sha(),
        "branch": git_branch(),
        # datetime.utcnow() is deprecated since 3.12; aware-UTC strftime
        # emits the identical "...Z" string.
        "completed_at": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "fixtures_total": len(rows),
        "fixtures_scored": len(scored),
        # Legacy 2-arm fields kept for ship-gate.py + history readers.
        "variant_avg": arm_avg.get("variant"),
        "bare_avg": arm_avg.get("bare"),
        "margin_avg": margins_avg.get("variant_over_bare"),
        "hard_floor_violations": disqualifier_count,
        "margin_ge_5_count": margin_ge_5,
        "gated_fixtures": len(gated_rows),
        "known_limit_fixtures": len(excluded_known_limit),
        "wall_ratio_variant_over_bare_avg": wall_ratio_avg_by_pair.get("variant_over_bare"),
        # iter-0019 — 3-arm aware aggregates.
        "arms_present": {"variant": True, "solo_claude": has_solo, "bare": True},
        "scores_avg_by_arm": arm_avg,
        "margins_avg": margins_avg,
        "wall_ratio_avg_by_pair": wall_ratio_avg_by_pair,
        "rows": rows,
    }
    (res_root / "summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")

    # ---- Render human-readable report ----
    def fmt_score(arm):
        if arm.get("score") is None:
            return "—"
        tag = " ⚠DQ" if arm.get("disqualifier") else (" ⏱TO" if arm.get("timed_out") else "")
        return f"{arm['score']}{tag}"

    def fmt_margin(v):
        # Fix: the original only formatted int margins, so judge-emitted
        # float margins silently rendered as "—".
        if isinstance(v, bool) or v is None:
            return "—"
        if isinstance(v, int):
            return f"{v:+d}"
        if isinstance(v, float):
            return f"{v:+.1f}"
        return "—"

    def fmt_wall(arm):
        # Fix: check "is not None" so a legitimate 0-second wall renders.
        wall = arm.get("wall_s")
        return f"{wall}s" if wall is not None else "?"

    lines = [
        f"# Benchmark Suite Run — {summary['completed_at']}",
        "",
        f"Run-id: `{args.run_id}`",
        f"Label: `{args.label or '(unlabeled)'}`",
        f"Branch: `{summary['branch']}`",
        f"Git SHA: `{summary['git_sha'][:12]}`",
        "",
        "| Fixture | Category | L2 (variant) | L1 (solo_claude) | L0 (bare) | L2-L0 | L1-L0 | L2-L1 | Winner | Wall L2/L1/L0 | Wall L2/L0 |",
        "|---------|----------|--------------|------------------|-----------|-------|-------|-------|--------|---------------|-----------|",
    ]
    for r in rows:
        if r.get("variant_score") is None:
            lines.append(f"| {r['fixture']} | — | — | — | — | — | — | — | NO_JUDGE | — | — |")
            continue
        arms = r.get("arms", {}) or {}
        v = arms.get("variant", {}) or {}
        s = arms.get("solo_claude", {}) or {}
        b = arms.get("bare", {}) or {}
        row_margins = r.get("margins", {}) or {}
        wallr = r.get("wall_ratios", {}) or {}
        vb_ratio = wallr.get("variant_over_bare")
        l2_l0_wall = f"{vb_ratio:.1f}x" if vb_ratio is not None else "—"
        wall_triplet = f"{fmt_wall(v)}/{fmt_wall(s)}/{fmt_wall(b)}"
        lines.append(
            f"| {r['fixture']} | {r['category']} | {fmt_score(v)} | {fmt_score(s)} | {fmt_score(b)} | "
            f"{fmt_margin(row_margins.get('variant_over_bare'))} | {fmt_margin(row_margins.get('solo_over_bare'))} | "
            f"{fmt_margin(row_margins.get('variant_over_solo'))} | {r.get('winner') or '—'} | {wall_triplet} | {l2_l0_wall} |"
        )

    def fmt_avg(v): return f"{v:.1f}" if isinstance(v, (int, float)) else "n/a"
    def fmt_signed(v): return f"{v:+.1f}" if isinstance(v, (int, float)) else "n/a"
    def fmt_ratio(v): return f"{v:.1f}x" if isinstance(v, (int, float)) else "n/a"
    margin_avg_str = fmt_signed(summary.get("margin_avg"))
    wall_ratio_str = fmt_ratio(summary.get("wall_ratio_variant_over_bare_avg"))

    lines += [
        "",
        f"**Suite average variant (L2) score:** {fmt_avg(summary['variant_avg'])}",
    ]
    if summary.get("arms_present", {}).get("solo_claude"):
        lines.append(f"**Suite average solo_claude (L1) score:** {fmt_avg(summary['scores_avg_by_arm'].get('solo_claude'))}")
    lines += [
        f"**Suite average bare (L0) score:** {fmt_avg(summary['bare_avg'])}",
        "",
        f"**L2 vs L0 margin avg:** {margin_avg_str} (ship floor: +5, NORTH-STAR preferred: +8)",
    ]
    if summary.get("arms_present", {}).get("solo_claude"):
        ms = summary.get("margins_avg", {}) or {}
        ws = summary.get("wall_ratio_avg_by_pair", {}) or {}
        lines += [
            f"**L1 vs L0 margin avg:** {fmt_signed(ms.get('solo_over_bare'))} (NORTH-STAR L1 contract: ≥+5)",
            f"**L2 vs L1 margin avg:** {fmt_signed(ms.get('variant_over_solo'))} (NORTH-STAR L2 contract: ≥+5 on pair-eligible)",
            f"**Wall ratio L2/L0:** {fmt_ratio(ws.get('variant_over_bare'))}",
            f"**Wall ratio L1/L0:** {fmt_ratio(ws.get('solo_over_bare'))}",
            f"**Wall ratio L2/L1:** {fmt_ratio(ws.get('variant_over_solo'))}",
        ]
    else:
        lines.append(f"**Wall ratio variant/bare (mean):** {wall_ratio_str} (no solo_claude arm in this run)")
    lines += [
        f"**Hard-floor violations:** {summary['hard_floor_violations']}",
        f"**Fixtures with margin ≥ +5:** {summary['margin_ge_5_count']} / {summary['gated_fixtures']} (gate: ≥ 7 of 9)",
    ]

    # Critical findings digest — per-arm sections.
    def has_findings(arm):
        return bool((arm or {}).get("critical_findings"))

    cf_rows = [r for r in rows if any(has_findings((r.get("arms") or {}).get(a)) for a in ("variant", "solo_claude", "bare"))]
    if cf_rows:
        lines += ["", "## Critical Findings", ""]
        for r in cf_rows:
            lines.append(f"### {r['fixture']}")
            for arm_label, arm_key in [("Variant (L2)", "variant"), ("Solo Claude (L1)", "solo_claude"), ("Bare (L0)", "bare")]:
                arm = (r.get("arms") or {}).get(arm_key) or {}
                if has_findings(arm):
                    lines.append(f"**{arm_label}:**")
                    for finding in arm["critical_findings"]:
                        lines.append(f"- {finding}")
            lines.append("")

    report = "\n".join(lines)
    (res_root / "report.md").write_text(report, encoding="utf-8")
    # Print the rendered string directly instead of re-reading the file.
    print(report)
    return 0
328
+
329
+
330
if __name__ == "__main__":
    # SystemExit carries main()'s return value as the process exit code.
    raise SystemExit(main())