devlyn-cli 1.15.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +104 -0
- package/CLAUDE.md +135 -21
- package/README.md +43 -125
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
- package/benchmark/auto-resolve/README.md +114 -0
- package/benchmark/auto-resolve/RUBRIC.md +162 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
- package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
- package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
- package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
- package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
- package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
- package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
- package/benchmark/auto-resolve/scripts/judge.sh +359 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
- package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
- package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
- package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
- package/bin/devlyn.js +175 -17
- package/config/skills/_shared/adapters/README.md +64 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
- package/config/skills/_shared/adapters/opus-4-7.md +29 -0
- package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
- package/config/skills/_shared/codex-config.md +54 -0
- package/config/skills/_shared/codex-monitored.sh +141 -0
- package/config/skills/_shared/engine-preflight.md +35 -0
- package/config/skills/_shared/expected.schema.json +93 -0
- package/config/skills/_shared/pair-plan-schema.md +298 -0
- package/config/skills/_shared/runtime-principles.md +110 -0
- package/config/skills/_shared/spec-verify-check.py +519 -0
- package/config/skills/devlyn:ideate/SKILL.md +99 -429
- package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
- package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
- package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
- package/config/skills/devlyn:resolve/SKILL.md +172 -184
- package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
- package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
- package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
- package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
- package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
- package/package.json +12 -2
- package/scripts/lint-skills.sh +431 -0
- package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
- package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
- package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
- package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
- package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
- package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
- package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
- package/config/skills/devlyn:clean/SKILL.md +0 -285
- package/config/skills/devlyn:design-ui/SKILL.md +0 -351
- package/config/skills/devlyn:discover-product/SKILL.md +0 -124
- package/config/skills/devlyn:evaluate/SKILL.md +0 -564
- package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
- package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
- package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
- package/config/skills/devlyn:preflight/SKILL.md +0 -355
- package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
- package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
- package/config/skills/devlyn:product-spec/SKILL.md +0 -603
- package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
- package/config/skills/devlyn:review/SKILL.md +0 -161
- package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
- package/config/skills/devlyn:team-review/SKILL.md +0 -493
- package/config/skills/devlyn:update-docs/SKILL.md +0 -463
- package/config/skills/workflow-routing/SKILL.md +0 -73
- package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
- package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
compile-report.py — aggregate one run's fixture artifacts into a summary.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
compile-report.py --run-id <ID> [--label <VERSION>]
|
|
7
|
+
|
|
8
|
+
Reads: benchmark/auto-resolve/results/<run-id>/<fixture>/{judge.json, variant/result.json, bare/result.json}
|
|
9
|
+
Writes:
|
|
10
|
+
results/<run-id>/summary.json (machine)
|
|
11
|
+
results/<run-id>/report.md (human)
|
|
12
|
+
|
|
13
|
+
The report is the output of `npx devlyn-cli benchmark`. Ship-gate.py consumes summary.json.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
import argparse, json, pathlib, sys, subprocess, datetime
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def git_sha() -> str:
    """Return the current git commit hash, or "unknown" outside a repo / without git."""
    try:
        out = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True)
    except Exception:
        # Not a git checkout, or git missing — report a sentinel rather than fail.
        return "unknown"
    return out.strip()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def git_branch() -> str:
    """Return the current git branch name, or "unknown" outside a repo / without git."""
    try:
        out = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], text=True)
    except Exception:
        # Mirror git_sha(): degrade to a sentinel instead of raising.
        return "unknown"
    return out.strip()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main() -> int:
    """Aggregate one benchmark run's per-fixture artifacts into summary.json + report.md.

    Reads results/<run-id>/<fixture>/{judge.json, <arm>/result.json} plus each
    fixture's metadata.json, builds per-fixture rows (3-arm aware with legacy
    2-arm fallbacks), writes the machine summary and the human report, and
    prints the report to stdout.

    Returns 0 on success, 1 when the results directory is missing.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--run-id", required=True)
    p.add_argument("--label", default=None, help="version label, e.g. v3.6")
    args = p.parse_args()

    bench_root = pathlib.Path(__file__).resolve().parent.parent
    res_root = bench_root / "results" / args.run_id
    if not res_root.is_dir():
        print(f"no results dir: {res_root}", file=sys.stderr); return 1

    fixtures = sorted([d.name for d in res_root.iterdir() if d.is_dir()])
    rows = []
    for fid in fixtures:
        fdir = res_root / fid
        judge_path = fdir / "judge.json"
        if not judge_path.exists():
            rows.append({"fixture": fid, "status": "NO_JUDGE", "reason": "judge.json missing"})
            continue
        judge = json.loads(judge_path.read_text())
        # iter-0019: 3-arm aware. judge.json now carries scores_by_arm /
        # findings_by_arm / disqualifiers_by_arm / margins. Older judge.json
        # (pre-iter-0019, only variant_score + bare_score) is handled by
        # falling back to legacy fields.
        scores_by_arm = judge.get("scores_by_arm") or {}
        if not scores_by_arm:
            if "variant_score" in judge:
                scores_by_arm["variant"] = judge["variant_score"]
            if "bare_score" in judge:
                scores_by_arm["bare"] = judge["bare_score"]

        findings_by_arm = judge.get("findings_by_arm") or {}
        dq_by_arm = judge.get("disqualifiers_by_arm") or {}
        margins = judge.get("margins") or {}

        # Per-arm deterministic results; a missing result.json yields {}.
        arm_results = {}
        for arm in ("variant", "solo_claude", "bare"):
            res_p = fdir / arm / "result.json"
            arm_results[arm] = json.loads(res_p.read_text()) if res_p.exists() else {}

        meta_p = bench_root / "fixtures" / fid / "metadata.json"
        category = "unknown"
        if meta_p.exists():
            try:
                category = json.loads(meta_p.read_text()).get("category", "unknown")
            except Exception:
                pass  # malformed metadata.json — keep "unknown" rather than abort the run

        def wall_ratio(numer, denom):
            # None when either wall time is missing (or zero) — downstream renders "—".
            if numer and denom:
                return round(numer / denom, 2)
            return None

        # Disqualifier flags per arm = OR of deterministic result.json flag and
        # judge's subjective flag (from new dq_by_arm map; fall back to legacy
        # A/B-letter shape if present).
        def arm_dq_judge(arm: str):
            if arm in dq_by_arm:
                return bool(dq_by_arm[arm].get("disqualifier", False))
            mapping = judge.get("_blind_mapping", {}) or {}
            for letter in ("A", "B", "C"):
                if mapping.get(letter) == arm:
                    return bool((judge.get("disqualifiers", {}) or {}).get(letter, False))
            return False

        # Per-arm payload — arm absent = scores_by_arm key absent, downstream
        # consumers null-check.
        arms_block = {}
        for arm in ("variant", "solo_claude", "bare"):
            r = arm_results.get(arm) or {}
            score = scores_by_arm.get(arm)
            judge_dq = arm_dq_judge(arm)
            det_dq = bool(r.get("disqualifier", False))
            arms_block[arm] = {
                "score": score,
                "wall_s": r.get("elapsed_seconds"),
                "verify_score": r.get("verify_score"),
                "files_changed": r.get("files_changed"),
                "timed_out": bool(r.get("timed_out", False)),
                "disqualifier": judge_dq or det_dq,
                "dq_judge": judge_dq,
                "dq_deterministic": det_dq,
                "critical_findings": findings_by_arm.get(arm, []) if findings_by_arm else [],
            }

        # Pairwise margins. Prefer judge-side margins (single calibrated
        # scoring) over arithmetic differences, but fall through to compute
        # from scores_by_arm if the judge didn't emit margins.
        def m(left, right, key):
            if margins.get(key) is not None:
                return margins[key]
            l = scores_by_arm.get(left); r2 = scores_by_arm.get(right)
            if l is None or r2 is None:
                return None
            return l - r2

        row = {
            "fixture": fid,
            "category": category,
            "arms": arms_block,
            # Pairwise margins (positive = first arm beat second).
            "margins": {
                "variant_over_bare": m("variant", "bare", "variant_over_bare"),
                "solo_over_bare": m("solo_claude", "bare", "solo_over_bare"),
                "variant_over_solo": m("variant", "solo_claude", "variant_over_solo"),
            },
            # Wall ratios per pairwise comparison (NORTH-STAR.md tests #2/#7
            # generalized): each layer must beat previous-layer-best-of-N.
            "wall_ratios": {
                "variant_over_bare": wall_ratio(arms_block["variant"]["wall_s"], arms_block["bare"]["wall_s"]),
                "solo_over_bare": wall_ratio(arms_block["solo_claude"]["wall_s"], arms_block["bare"]["wall_s"]),
                "variant_over_solo": wall_ratio(arms_block["variant"]["wall_s"], arms_block["solo_claude"]["wall_s"]),
            },
            "winner": judge.get("winner_arm"),
            # Legacy fields preserved so older summary readers still parse.
            "variant_score": arms_block["variant"]["score"],
            "bare_score": arms_block["bare"]["score"],
            "margin": m("variant", "bare", "variant_over_bare"),
            "variant_disqualifier": arms_block["variant"]["disqualifier"],
            "variant_dq_judge": arms_block["variant"]["dq_judge"],
            "variant_dq_deterministic": arms_block["variant"]["dq_deterministic"],
            "variant_wall_s": arms_block["variant"]["wall_s"],
            "bare_wall_s": arms_block["bare"]["wall_s"],
            "wall_ratio_variant_over_bare": wall_ratio(
                arms_block["variant"]["wall_s"], arms_block["bare"]["wall_s"]),
            "variant_verify_score": arms_block["variant"]["verify_score"],
            "bare_verify_score": arms_block["bare"]["verify_score"],
            "variant_files_changed": arms_block["variant"]["files_changed"],
            "bare_files_changed": arms_block["bare"]["files_changed"],
            "critical_findings_variant": arms_block["variant"]["critical_findings"],
            "critical_findings_bare": arms_block["bare"]["critical_findings"],
        }
        rows.append(row)

    # Aggregate
    scored = [r for r in rows if r.get("variant_score") is not None]
    excluded_known_limit = [r for r in scored if r.get("category") == "edge"]  # F8 and similar
    gated_rows = [r for r in scored if r.get("category") != "edge"]

    def avg(values):
        # Mean over non-None values, rounded to one decimal; None if nothing scored.
        vals = [v for v in values if v is not None]
        return round(sum(vals) / len(vals), 1) if vals else None

    # iter-0019: per-arm averages (whichever arms ran).
    arm_avg = {}
    for arm in ("variant", "solo_claude", "bare"):
        arm_avg[arm] = avg([(r.get("arms", {}).get(arm) or {}).get("score") for r in scored])

    # Pairwise margin averages (positive = first arm wins on average).
    def margin_avg(key):
        return avg([(r.get("margins") or {}).get(key) for r in scored])

    margins_avg = {
        "variant_over_bare": margin_avg("variant_over_bare"),
        "solo_over_bare": margin_avg("solo_over_bare"),
        "variant_over_solo": margin_avg("variant_over_solo"),
    }
    # Pairwise wall-ratio averages.
    def wall_avg(key):
        return avg([(r.get("wall_ratios") or {}).get(key) for r in scored])

    wall_ratio_avg_by_pair = {
        "variant_over_bare": wall_avg("variant_over_bare"),
        "solo_over_bare": wall_avg("solo_over_bare"),
        "variant_over_solo": wall_avg("variant_over_solo"),
    }

    # margin_ge_5_count over the iter-0018-era variant-vs-bare metric,
    # because the legacy ship-gate.py reads that. Pair-aware gates get
    # added in iter-0021 / 0022 once the data shape stabilizes.
    margin_ge_5 = sum(1 for r in gated_rows if (r.get("margin") or 0) >= 5)
    disqualifier_count = sum(1 for r in scored if r.get("variant_disqualifier"))

    # arm-presence flags so consumers know whether the iter is 2-arm legacy
    # or 3-arm post-iter-0019.
    has_solo = any((r.get("arms", {}).get("solo_claude") or {}).get("score") is not None for r in scored)

    summary = {
        "run_id": args.run_id,
        "label": args.label,
        "git_sha": git_sha(),
        "branch": git_branch(),
        # Naive-UTC ISO string + "Z" (same shape as the old utcnow() output;
        # utcnow() is deprecated since Python 3.12).
        "completed_at": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat(timespec="seconds") + "Z",
        "fixtures_total": len(rows),
        "fixtures_scored": len(scored),
        # Legacy 2-arm fields kept for ship-gate.py + history readers.
        "variant_avg": arm_avg.get("variant"),
        "bare_avg": arm_avg.get("bare"),
        "margin_avg": margins_avg.get("variant_over_bare"),
        "hard_floor_violations": disqualifier_count,
        "margin_ge_5_count": margin_ge_5,
        "gated_fixtures": len(gated_rows),
        "known_limit_fixtures": len(excluded_known_limit),
        "wall_ratio_variant_over_bare_avg": wall_ratio_avg_by_pair.get("variant_over_bare"),
        # iter-0019 — 3-arm aware aggregates.
        "arms_present": {"variant": True, "solo_claude": has_solo, "bare": True},
        "scores_avg_by_arm": arm_avg,
        "margins_avg": margins_avg,
        "wall_ratio_avg_by_pair": wall_ratio_avg_by_pair,
        "rows": rows,
    }
    (res_root / "summary.json").write_text(json.dumps(summary, indent=2))

    # Render human-readable report
    lines = [
        f"# Benchmark Suite Run — {summary['completed_at']}",
        "",
        f"Run-id: `{args.run_id}`",
        f"Label: `{args.label or '(unlabeled)'}`",
        f"Branch: `{summary['branch']}`",
        f"Git SHA: `{summary['git_sha'][:12]}`",
        "",
        "| Fixture | Category | L2 (variant) | L1 (solo_claude) | L0 (bare) | L2-L0 | L1-L0 | L2-L1 | Winner | Wall L2/L1/L0 | Wall L2/L0 |",
        "|---------|----------|--------------|------------------|-----------|-------|-------|-------|--------|---------------|-----------|",
    ]
    for r in rows:
        if r.get("variant_score") is None:
            lines.append(f"| {r['fixture']} | — | — | — | — | — | — | — | NO_JUDGE | — | — |")
            continue
        arms = r.get("arms", {}) or {}
        v = arms.get("variant", {}) or {}
        s = arms.get("solo_claude", {}) or {}
        b = arms.get("bare", {}) or {}
        margins = r.get("margins", {}) or {}
        wallr = r.get("wall_ratios", {}) or {}
        def fmt_score(arm):
            if arm.get("score") is None:
                return "—"
            tag = " ⚠DQ" if arm.get("disqualifier") else (" ⏱TO" if arm.get("timed_out") else "")
            return f"{arm['score']}{tag}"
        def fmt_margin(v): return f"{v:+d}" if isinstance(v, int) else "—"
        def fmt_wall(arm):
            return f"{arm['wall_s']}s" if arm.get("wall_s") else "?"
        l2_l0_wall = f"{wallr.get('variant_over_bare'):.1f}x" if wallr.get("variant_over_bare") else "—"
        wall_triplet = f"{fmt_wall(v)}/{fmt_wall(s)}/{fmt_wall(b)}"
        lines.append(
            f"| {r['fixture']} | {r['category']} | {fmt_score(v)} | {fmt_score(s)} | {fmt_score(b)} | "
            f"{fmt_margin(margins.get('variant_over_bare'))} | {fmt_margin(margins.get('solo_over_bare'))} | "
            f"{fmt_margin(margins.get('variant_over_solo'))} | {r.get('winner') or '—'} | {wall_triplet} | {l2_l0_wall} |"
        )
    def fmt_avg(v): return f"{v:.1f}" if isinstance(v, (int, float)) else "n/a"
    def fmt_signed(v): return f"{v:+.1f}" if isinstance(v, (int, float)) else "n/a"
    def fmt_ratio(v): return f"{v:.1f}x" if isinstance(v, (int, float)) else "n/a"
    margin_avg_val = summary.get("margin_avg")
    margin_avg_str = fmt_signed(margin_avg_val)
    wall_ratio_str = fmt_ratio(summary.get("wall_ratio_variant_over_bare_avg"))

    lines += [
        "",
        f"**Suite average variant (L2) score:** {fmt_avg(summary['variant_avg'])}",
    ]
    if summary.get("arms_present", {}).get("solo_claude"):
        lines.append(f"**Suite average solo_claude (L1) score:** {fmt_avg(summary['scores_avg_by_arm'].get('solo_claude'))}")
    lines += [
        f"**Suite average bare (L0) score:** {fmt_avg(summary['bare_avg'])}",
        "",
        f"**L2 vs L0 margin avg:** {margin_avg_str} (ship floor: +5, NORTH-STAR preferred: +8)",
    ]
    if summary.get("arms_present", {}).get("solo_claude"):
        ms = summary.get("margins_avg", {}) or {}
        ws = summary.get("wall_ratio_avg_by_pair", {}) or {}
        lines += [
            f"**L1 vs L0 margin avg:** {fmt_signed(ms.get('solo_over_bare'))} (NORTH-STAR L1 contract: ≥+5)",
            f"**L2 vs L1 margin avg:** {fmt_signed(ms.get('variant_over_solo'))} (NORTH-STAR L2 contract: ≥+5 on pair-eligible)",
            f"**Wall ratio L2/L0:** {fmt_ratio(ws.get('variant_over_bare'))}",
            f"**Wall ratio L1/L0:** {fmt_ratio(ws.get('solo_over_bare'))}",
            f"**Wall ratio L2/L1:** {fmt_ratio(ws.get('variant_over_solo'))}",
        ]
    else:
        lines.append(f"**Wall ratio variant/bare (mean):** {wall_ratio_str} (no solo_claude arm in this run)")
    lines += [
        f"**Hard-floor violations:** {summary['hard_floor_violations']}",
        f"**Fixtures with margin ≥ +5:** {summary['margin_ge_5_count']} / {summary['gated_fixtures']} (gate: ≥ 7 of 9)",
    ]
    # Critical findings digest — per-arm sections.
    def has_findings(arm):
        return bool((arm or {}).get("critical_findings"))
    cf_rows = [r for r in rows if any(has_findings((r.get("arms") or {}).get(a)) for a in ("variant", "solo_claude", "bare"))]
    if cf_rows:
        lines += ["", "## Critical Findings", ""]
        for r in cf_rows:
            lines.append(f"### {r['fixture']}")
            for arm_label, arm_key in [("Variant (L2)", "variant"), ("Solo Claude (L1)", "solo_claude"), ("Bare (L0)", "bare")]:
                arm = (r.get("arms") or {}).get(arm_key) or {}
                if has_findings(arm):
                    lines.append(f"**{arm_label}:**")
                    for f in arm["critical_findings"]:
                        lines.append(f"- {f}")
            lines.append("")
    (res_root / "report.md").write_text("\n".join(lines))
    print((res_root / "report.md").read_text())
    return 0
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
if __name__ == "__main__":
    # Propagate main()'s int as the process exit code (0 = report written,
    # 1 = results directory missing).
    sys.exit(main())
|