devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,552 @@
1
+ #!/usr/bin/env python3
2
+ """iter-0033c gate table — NEW L2 vs NEW L1 on /devlyn:resolve.
3
+
4
+ Reads:
5
+ - manifest (immutable; built by build-pair-eligible-manifest.py)
6
+ - per-fixture judge.json files under <results-dir>/<fixture>/judge.json (one file per fixture; scores are keyed by arm inside it)
7
+ - per-fixture timing/result.json files for wall + disqualifier signals
8
+ - per-fixture pipeline.state.json (under <work-dir>/.devlyn/runs/) for
9
+ pair_judge sub-verdict — required for Gate 6 + Gate 8.
10
+
11
+ Emits:
12
+ - gates.json — machine-readable {gate_id, status, evidence}
13
+ - gates.md — human-readable summary table
14
+
15
+ Gates per iter-0033c §"Acceptance gate":
16
+ 1a smoke (mode wiring) — recorded externally; pre-suite gate
17
+ 1b smoke (codex avail) — recorded externally; pre-suite gate
18
+ 1c smoke (impl confound) — recorded externally; pre-suite gate
19
+ 2 no-regression vs L1 (gated arm): every fixture (l2_gated − solo) ≥ −3
20
+ 3 lift on pair-eligible (gated arm, SHIP-BLOCKER): on frozen pair-eligible set,
21
+ count fixtures with (l2_gated − solo) ≥ +5; require count ≥ gate3_threshold_count.
22
+ 4 hard-floor: zero l2_gated disqualifier on previously-clean l1 fixtures;
23
+ zero l2_gated CRITICAL/HIGH design.* / security.* on previously-clean l1;
24
+ zero l2_gated watchdog timeouts (where l1 didn't time out).
25
+ 5 efficiency: per-fixture l2_gated_wall / l1_wall ≤ 2.0× (≤ 3.0× allowed only
26
+ when l2 catches a categorical rescue l1 missed).
27
+ 6 trigger discipline (fixture-level): for each pair-eligible fixture, if
28
+ l2_forced lifts ≥ +5 OR catches categorical rescue, AND forced is not
29
+ impl-confounded, AND forced.pair_judge present → l2_gated MUST also have
30
+ pair_judge present on that fixture.
31
+ 7 attribution (4-class, data-only): per-fixture classify into
32
+ {no_material_lift, implementation_confounded, tool_or_trigger_lift,
33
+ deliberation_lift}. Reporting only; not pass/fail.
34
+ 8 artifact contract: pair_judge non-null for every fixture where pair fired;
35
+ pair findings distinguishable from solo judge findings.
36
+
37
+ Ship-blockers: 1a, 1b, 1c, 2, 3, 4, 6.
38
+ Quality gates: 5, 8 (failure → root-cause iter; Phase 4 holds).
39
+ Data-only: 7.
40
+ """
41
+ import argparse
42
+ import json
43
+ import sys
44
+ from pathlib import Path
45
+
46
+
47
+ def load_judge(results_dir: Path, fixture: str) -> dict | None:
48
+ p = results_dir / fixture / "judge.json"
49
+ if not p.is_file():
50
+ return None
51
+ return json.loads(p.read_text())
52
+
53
+
54
+ def load_result(results_dir: Path, fixture: str, arm: str) -> dict | None:
55
+ p = results_dir / fixture / arm / "result.json"
56
+ if not p.is_file():
57
+ return None
58
+ return json.loads(p.read_text())
59
+
60
+
61
+ def load_state(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> dict | None:
62
+ """state.json lives in /tmp/bench-{run_id}-{fixture}-{arm}/.devlyn/runs/<rs-id>/."""
63
+ work = work_dir_root / f"bench-{run_id}-{fixture}-{arm}"
64
+ runs = work / ".devlyn" / "runs"
65
+ if not runs.is_dir():
66
+ return None
67
+ candidates = sorted(runs.glob("*/pipeline.state.json"))
68
+ if not candidates:
69
+ return None
70
+ return json.loads(candidates[-1].read_text())
71
+
72
+
73
+ def archive_run_dir(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> Path | None:
74
+ """The .devlyn/runs/<rs-id>/ where pipeline archived per-run artifacts."""
75
+ work = work_dir_root / f"bench-{run_id}-{fixture}-{arm}"
76
+ runs = work / ".devlyn" / "runs"
77
+ if not runs.is_dir():
78
+ return None
79
+ cands = sorted(runs.glob("*/pipeline.state.json"))
80
+ if not cands:
81
+ return None
82
+ return cands[-1].parent
83
+
84
+
85
def changed_files(results_dir: Path, fixture: str, arm: str) -> set[str]:
    """Return the set of paths listed in an arm's ``changed-files.txt``.

    The file is read from ``<results_dir>/<fixture>/<arm>/changed-files.txt``,
    one path per line. Surrounding whitespace is stripped and blank lines are
    ignored. A missing file yields an empty set.
    """
    p = results_dir / fixture / arm / "changed-files.txt"
    if not p.is_file():
        return set()
    # Decode explicitly as UTF-8 so non-ASCII paths compare identically
    # regardless of the locale the script happens to run under.
    text = p.read_text(encoding="utf-8")
    return {ln.strip() for ln in text.splitlines() if ln.strip()}
91
+
92
+
93
def pair_findings_distinguishable(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> bool:
    """Report whether the run archive holds pair-judge artifacts from >=2 engines.

    Used for Gate 8 (pair findings must be distinguishable from solo judge
    findings). Files directly inside the archived run directory are matched
    against the per-judge naming conventions recognised here:

      * ``verify-judge-<engine>.md``
      * ``verify.judge.<engine>.findings.jsonl``
      * ``verify.judge.<engine>.summary.json``

    The ``<engine>`` part must consist of lowercase letters, digits and
    underscores. The result is True when at least two different engine
    identifiers are seen across any mix of those patterns, and False when the
    archive directory cannot be located.
    """
    run_dir = archive_run_dir(work_dir_root, run_id, fixture, arm)
    if run_dir is None:
        return False

    import re

    name_patterns = (
        re.compile(r"^verify-judge-(?P<engine>[a-z0-9_]+)\.md$"),
        re.compile(r"^verify\.judge\.(?P<engine>[a-z0-9_]+)\.findings\.jsonl$"),
        re.compile(r"^verify\.judge\.(?P<engine>[a-z0-9_]+)\.summary\.json$"),
    )

    seen_engines = set()
    for entry in run_dir.iterdir():
        if not entry.is_file():
            continue
        # First pattern that matches wins; a file contributes one engine at most.
        hit = next(
            (m for m in (rx.match(entry.name) for rx in name_patterns) if m),
            None,
        )
        if hit is not None:
            seen_engines.add(hit.group("engine"))
    return len(seen_engines) >= 2
129
+
130
+
131
def impl_confounded_for_fixture(results_dir: Path, fixture: str) -> bool:
    """Tell whether the solo and forced-pair arms touched different files.

    Compares the ``changed-files.txt`` sets of the ``solo_claude`` and
    ``l2_forced`` arms. The fixture counts as implementation-confounded when
    the two sets are not identical. If either set is empty (arm missing or no
    recorded changes) no confound is claimed and False is returned.
    """
    solo_paths = changed_files(results_dir, fixture, "solo_claude")
    forced_paths = changed_files(results_dir, fixture, "l2_forced")
    if solo_paths and forced_paths:
        # Any path present on only one side makes the arms non-comparable.
        return bool(solo_paths ^ forced_paths)
    return False
146
+
147
+
148
def fixture_short(name: str) -> str:
    """Return the part of *name* before its first ``-`` (all of it if none)."""
    prefix, _sep, _rest = name.partition("-")
    return prefix
150
+
151
+
152
def find_results_dir_fixtures(results_dir: Path) -> list[str]:
    """List the names of the immediate sub-directories of *results_dir*, sorted."""
    names = []
    for entry in results_dir.iterdir():
        if entry.is_dir():
            names.append(entry.name)
    names.sort()
    return names
154
+
155
+
156
+ def get_score(judge: dict, arm: str) -> int | None:
157
+ """Score for a given arm. Prefer judge.json's `scores_by_arm` (already
158
+ arm-keyed); fall back to blind A/B/C lookup with case-correct `<letter>_score`
159
+ field (judge.sh writes a_score/b_score lowercase, not A_score)."""
160
+ if not judge:
161
+ return None
162
+ sba = judge.get("scores_by_arm") or {}
163
+ if arm in sba:
164
+ return sba[arm]
165
+ mapping = judge.get("_blind_mapping") or {}
166
+ letter = next((k for k, v in mapping.items() if v == arm), None)
167
+ if not letter:
168
+ return None
169
+ return judge.get(f"{letter.lower()}_score")
170
+
171
+
172
def get_disqualifier(judge: dict, arm: str) -> bool:
    """Return the disqualifier flag recorded for *arm*.

    The arm-keyed ``disqualifiers_by_arm`` mapping is preferred: its entry's
    ``disqualifier`` field is coerced to bool. Otherwise the arm's blind
    letter is resolved through ``_blind_mapping`` and looked up, unchanged in
    case, in ``disqualifiers``. Anything missing is treated as False.
    """
    if not judge:
        return False

    by_arm = judge.get("disqualifiers_by_arm") or {}
    if arm in by_arm:
        return bool(by_arm[arm].get("disqualifier", False))

    by_letter = judge.get("disqualifiers") or {}
    blind = judge.get("_blind_mapping") or {}
    letters = [label for label, mapped_arm in blind.items() if mapped_arm == arm]
    # Only the first matching label is considered; an empty label is unusable.
    if not letters or not letters[0]:
        return False
    return bool(by_letter.get(letters[0], False))
186
+
187
+
188
def gate_2_no_regression(rows: list[dict]) -> dict:
    """Gate 2: the gated pair arm must not regress against solo.

    A fixture fails when ``l2_gated_score - solo_score`` is below -3.
    Fixtures missing either score are skipped.

    Gate 2 is listed among the ship-blockers in this module's header, so the
    result is tagged ``ship_blocker: True``. Without that tag a failure would
    be filed under quality gates and would not affect the exit code.

    Returns:
        A gate result dict with ``gate``, ``ship_blocker``, ``status``
        (``"PASS"``/``"FAIL"``), ``rule`` and ``failures`` (one
        ``{"fixture", "delta"}`` entry per regressing fixture).
    """
    failures = []
    for row in rows:
        if row["solo_score"] is None or row["l2_gated_score"] is None:
            continue
        delta = row["l2_gated_score"] - row["solo_score"]
        if delta < -3:
            failures.append({"fixture": row["fixture"], "delta": delta})
    return {
        "gate": "2-no-regression",
        "ship_blocker": True,
        "status": "PASS" if not failures else "FAIL",
        "rule": "every fixture: (l2_gated − solo) ≥ −3",
        "failures": failures,
    }
202
+
203
+
204
def gate_3_lift(rows: list[dict], manifest: dict) -> dict:
    """Gate 3 (ship-blocker): enough pair-eligible fixtures must lift by >= +5.

    Only fixtures whose short id is in ``manifest["fixtures_pair_eligible"]``
    and that carry both a solo and an l2_gated score are considered. The gate
    passes when the number of fixtures with ``l2_gated - solo >= 5`` reaches
    ``manifest["gate3_threshold_count"]``; ``manifest["gate3_total"]`` is
    reported alongside for context.
    """
    pair_eligible = set(manifest["fixtures_pair_eligible"])
    required = manifest["gate3_threshold_count"]
    denominator = manifest["gate3_total"]

    details = []
    for r in rows:
        if fixture_short(r["fixture"]) not in pair_eligible:
            continue
        solo, gated = r["solo_score"], r["l2_gated_score"]
        if solo is None or gated is None:
            continue
        gain = gated - solo
        details.append({"fixture": r["fixture"], "delta": gain, "lift_ge5": gain >= 5})

    lifted = len([d for d in details if d["lift_ge5"]])
    verdict = "PASS" if lifted >= required else "FAIL"
    return {
        "gate": "3-lift-on-pair-eligible",
        "ship_blocker": True,
        "status": verdict,
        "rule": f"lift ≥ +5 on ≥ {required} of {denominator} pair-eligible fixtures",
        "lift_count": lifted,
        "threshold": required,
        "total": denominator,
        "details": details,
    }
228
+
229
+
230
def classify_l2_disqualifier(row: dict, mechanical_findings: list[dict]) -> str:
    """Bucket an L2 disqualification on a fixture that L1 passed cleanly.

    Returns one of:

    - ``"mechanical_failed"`` — some mechanical finding is either
      CRITICAL/HIGH with ``source`` of ``"mechanical"`` or ``"spec-verify"``,
      or has ``disqualifier`` set to exactly ``True``. Gate 4 treats this as
      a failure.
    - ``"pair_sandbox_only"`` — no such finding exists. Gate 4 logs it
      without failing.

    ``"target_env_reproduced"`` is never produced here; the original notes
    say it needs a manual post-hoc override. *row* is accepted but not
    consulted.
    """
    blocking_severities = ("CRITICAL", "HIGH")
    mechanical_sources = ("mechanical", "spec-verify")

    for finding in mechanical_findings:
        severe_and_mechanical = (
            finding.get("severity") in blocking_severities
            and finding.get("source") in mechanical_sources
        )
        if severe_and_mechanical:
            return "mechanical_failed"
        if finding.get("disqualifier") is True:
            return "mechanical_failed"
    return "pair_sandbox_only"
256
+
257
+
258
def load_mechanical_findings(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> list[dict]:
    """Read ``verify-mechanical.findings.jsonl`` from the archived run dir.

    Parsing is deliberately best-effort: blank lines, lines that are not
    valid JSON, and JSON values that are not objects are skipped, so callers
    can rely on every returned element supporting ``.get``.

    Returns:
        The parsed finding objects in file order; an empty list when the
        archive directory or the findings file is missing.
    """
    archive = archive_run_dir(work_dir_root, run_id, fixture, arm)
    if archive is None:
        return []
    p = archive / "verify-mechanical.findings.jsonl"
    if not p.is_file():
        return []
    records = []
    # Decode explicitly as UTF-8 rather than with the process locale.
    for raw in p.read_text(encoding="utf-8").splitlines():
        raw = raw.strip()
        if not raw:
            continue
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            # Tolerate a corrupt line instead of aborting the whole gate run.
            continue
        if isinstance(parsed, dict):
            records.append(parsed)
    return records
275
+
276
+
277
def gate_4_hard_floor(rows: list[dict], work_dir_root: Path, run_id: str) -> dict:
    """Gate 4 (ship-blocker): no new disqualifier or timeout on clean-L1 fixtures.

    Fixtures where solo was itself disqualified are ignored entirely. For the
    rest:

    - an l2_gated disqualifier is classified from the arm's mechanical
      findings; ``mechanical_failed`` / ``target_env_reproduced`` count as
      failures, anything else is only logged under
      ``pair_sandbox_only_logged``;
    - an l2_gated timeout with no solo timeout is always a failure.
    """
    blocking = ("mechanical_failed", "target_env_reproduced")
    hard_failures = []
    logged_only = []

    for row in rows:
        if row["solo_dq"]:
            continue

        if row["l2_gated_dq"]:
            findings = load_mechanical_findings(
                work_dir_root, run_id, row["fixture"], "l2_gated"
            )
            verdict = classify_l2_disqualifier(row, findings)
            record = {
                "fixture": row["fixture"],
                "kind": "l2_gated_dq_on_clean_l1",
                "classification": verdict,
            }
            destination = hard_failures if verdict in blocking else logged_only
            destination.append(record)

        gated_only_timeout = row["l2_gated_timeout"] and not row["solo_timeout"]
        if gated_only_timeout:
            hard_failures.append({
                "fixture": row["fixture"],
                "kind": "l2_gated_timeout_only",
                "classification": "timeout",
            })

    status = "FAIL" if hard_failures else "PASS"
    return {
        "gate": "4-hard-floor",
        "ship_blocker": True,
        "status": status,
        "rule": ("zero l2_gated dq / timeout on previously-clean l1 fixtures, "
                 "where dq is classified as mechanical_failed OR target_env_reproduced "
                 "(pair_sandbox_only logged as Gate 7 evidence per Codex R-final-fdfd Q2)"),
        "failures": hard_failures,
        "pair_sandbox_only_logged": logged_only,
    }
305
+
306
+
307
def gate_5_efficiency(rows: list[dict]) -> dict:
    """Gate 5: l2_gated wall time must stay within 2.0x of solo, per fixture.

    Fixtures lacking either wall time, or with a solo wall time of zero, are
    left out. The comparison uses the unrounded ratio; ratios are rounded to
    two decimals only for reporting. This implementation applies a flat 2.0x
    limit to every fixture.
    """
    measured = []
    for row in rows:
        solo_wall = row["solo_wall"]
        gated_wall = row["l2_gated_wall"]
        if solo_wall is None or gated_wall is None or solo_wall == 0:
            continue
        measured.append((row["fixture"], gated_wall / solo_wall))

    details = [{"fixture": name, "ratio": round(q, 2)} for name, q in measured]
    failures = [
        {"fixture": name, "ratio": round(q, 2)} for name, q in measured if q > 2.0
    ]
    return {
        "gate": "5-efficiency",
        "status": "FAIL" if failures else "PASS",
        "rule": "per-fixture l2_gated_wall / l1_wall ≤ 2.0×",
        "failures": failures,
        "details": details,
    }
326
+
327
+
328
def gate_6_trigger_discipline(rows: list[dict], manifest: dict) -> dict:
    """Gate 6 (ship-blocker): the gated arm must fire pair mode where forcing helps.

    For each pair-eligible fixture (short id listed in
    ``manifest["fixtures_pair_eligible"]``) that has an l2_forced score, the
    fixture fails when all of the following hold:

    - l2_forced lifts the score by >= +5 over solo, OR rescues a fixture solo
      was disqualified on (solo dq, forced not dq);
    - the forced run is not implementation-confounded;
    - the forced run recorded a pair_judge verdict;
    - the gated run did NOT record a pair_judge verdict.

    The confound exclusion follows the gate definition in this module's
    header. A confounded fixture's lift cannot be attributed to pair mode, so
    it must not block shipping. Rows without an ``impl_confounded`` key are
    treated as not confounded.
    """
    eligible = set(manifest["fixtures_pair_eligible"])
    failures = []
    for row in rows:
        if fixture_short(row["fixture"]) not in eligible:
            continue
        if row["l2_forced_score"] is None:
            continue
        if row.get("impl_confounded", False):
            # Forced and solo changed different files: lift is not evidence
            # that the pair trigger should have fired.
            continue
        forced_lift = (
            row["solo_score"] is not None
            and row["l2_forced_score"] - row["solo_score"] >= 5
        )
        forced_rescue = bool(row["solo_dq"] and not row["l2_forced_dq"])
        forced_pair_present = bool(row["l2_forced_pair_judge_present"])
        gated_pair_present = bool(row["l2_gated_pair_judge_present"])
        if (forced_lift or forced_rescue) and forced_pair_present and not gated_pair_present:
            failures.append({
                "fixture": row["fixture"],
                "forced_lift": forced_lift,
                "forced_rescue": forced_rescue,
                "gated_pair_judge_present": gated_pair_present,
            })
    return {
        "gate": "6-trigger-discipline",
        "ship_blocker": True,
        "status": "PASS" if not failures else "FAIL",
        "rule": "if forced lifts ≥ +5 (or rescues) → gated must also fire pair on that fixture",
        "failures": failures,
    }
358
+
359
+
360
def gate_7_attribution(rows: list[dict], manifest: dict) -> dict:
    """Gate 7 (data-only): classify each fixture into one of four classes.

    Classes, checked in this order:

    - ``implementation_confounded`` — the row's ``impl_confounded`` flag is set.
    - ``no_material_lift`` — scores missing, or ``|l2_gated - solo| <= 2``.
    - ``deliberation_lift`` — the row's ``pair_judge_unique_finding`` flag is set.
    - ``tool_or_trigger_lift`` — the row's ``mechanical_finding_drove_change``
      flag is set.
    - otherwise ``deliberation_lift`` when the delta is >= +5, else
      ``no_material_lift``.

    The result always has status ``"DATA"``; it never passes or fails.
    """
    # Built once: the eligible set does not change between rows.
    eligible = set(manifest["fixtures_pair_eligible"])
    classes = []
    for row in rows:
        cls = "no_material_lift"  # default, also used when a score is missing
        if row["impl_confounded"]:
            cls = "implementation_confounded"
        elif row["solo_score"] is not None and row["l2_gated_score"] is not None:
            delta = row["l2_gated_score"] - row["solo_score"]
            if abs(delta) <= 2:
                cls = "no_material_lift"
            elif row["pair_judge_unique_finding"]:
                cls = "deliberation_lift"
            elif row["mechanical_finding_drove_change"]:
                cls = "tool_or_trigger_lift"
            else:
                cls = "deliberation_lift" if delta >= 5 else "no_material_lift"
        classes.append({
            "fixture": row["fixture"],
            "class": cls,
            "pair_eligible": fixture_short(row["fixture"]) in eligible,
        })
    return {
        "gate": "7-attribution",
        "data_only": True,
        "status": "DATA",
        "classes": classes,
    }
392
+
393
+
394
def gate_8_artifact_contract(rows: list[dict]) -> dict:
    """Gate 8: whenever pair mode fired, its findings must be distinguishable.

    A fixture fails when its row has ``pair_fired`` set while
    ``pair_findings_distinguishable`` is falsy.
    """
    offenders = [
        {"fixture": r["fixture"], "missing": "pair_judge_findings_distinguishable"}
        for r in rows
        if r["pair_fired"] and not r["pair_findings_distinguishable"]
    ]
    verdict = "FAIL" if offenders else "PASS"
    return {
        "gate": "8-artifact-contract",
        "status": verdict,
        "rule": "pair_judge non-null when fired; pair findings distinguishable from solo",
        "failures": offenders,
    }
407
+
408
+
409
def _pair_judge_present(state: dict | None) -> bool:
    """True iff ``phases.verify.sub_verdicts.pair_judge`` is non-null in *state*."""
    if not state:
        return False
    verify = (state.get("phases") or {}).get("verify") or {}
    sub = verify.get("sub_verdicts") or {}
    return sub.get("pair_judge") is not None


def build_rows(results_dir: Path, work_dir_root: Path, run_id: str) -> list[dict]:
    """Assemble one comparison row per fixture directory under *results_dir*.

    Each row merges the fixture's judge scores and disqualifier flags for the
    ``solo_claude``, ``l2_gated`` and ``l2_forced`` arms, wall-clock and
    timeout data from each arm's ``result.json``, and pair-judge presence read
    from the arms' pipeline state files.

    Returns:
        A list of row dicts, ordered by fixture directory name.
    """
    rows = []
    for fx in find_results_dir_fixtures(results_dir):
        judge = load_judge(results_dir, fx)
        solo_r = load_result(results_dir, fx, "solo_claude")
        gated_r = load_result(results_dir, fx, "l2_gated")
        forced_r = load_result(results_dir, fx, "l2_forced")
        gated_state = load_state(work_dir_root, run_id, fx, "l2_gated")
        forced_state = load_state(work_dir_root, run_id, fx, "l2_forced")

        gated_pair = _pair_judge_present(gated_state)
        forced_pair = _pair_judge_present(forced_state)

        # Distinguishability is audited on the archive of an arm that
        # actually fired pair mode. l2_forced is preferred; l2_gated is used
        # when only it fired. Anchoring on mere state presence would inspect
        # a forced archive with no pair artifacts and report a false Gate 8
        # failure when only the gated arm fired. If neither fired, keep the
        # presence-based choice (the value is then ignored by Gate 8).
        if forced_pair:
            pair_anchor_arm = "l2_forced"
        elif gated_pair:
            pair_anchor_arm = "l2_gated"
        elif forced_state:
            pair_anchor_arm = "l2_forced"
        elif gated_state:
            pair_anchor_arm = "l2_gated"
        else:
            pair_anchor_arm = None
        pair_findings_ok = (
            pair_findings_distinguishable(work_dir_root, run_id, fx, pair_anchor_arm)
            if pair_anchor_arm else False
        )

        rows.append({
            "fixture": fx,
            "solo_score": get_score(judge, "solo_claude"),
            "l2_gated_score": get_score(judge, "l2_gated"),
            "l2_forced_score": get_score(judge, "l2_forced"),
            "solo_dq": get_disqualifier(judge, "solo_claude"),
            "l2_gated_dq": get_disqualifier(judge, "l2_gated"),
            "l2_forced_dq": get_disqualifier(judge, "l2_forced"),
            "solo_wall": (solo_r or {}).get("elapsed_seconds"),
            "l2_gated_wall": (gated_r or {}).get("elapsed_seconds"),
            "solo_timeout": bool((solo_r or {}).get("timed_out")),
            "l2_gated_timeout": bool((gated_r or {}).get("timed_out")),
            "l2_gated_pair_judge_present": gated_pair,
            "l2_forced_pair_judge_present": forced_pair,
            "pair_fired": gated_pair or forced_pair,
            "pair_findings_distinguishable": pair_findings_ok,
            "impl_confounded": impl_confounded_for_fixture(results_dir, fx),
            # Conservative placeholders: filling these in needs cross-finding
            # diffs between the pair-judge files, which this script does not
            # compute (TODO for a follow-up). With both False, attribution
            # falls back to the score delta alone, so the defaults never
            # inflate the lift count.
            "pair_judge_unique_finding": False,
            "mechanical_finding_drove_change": False,
        })
    return rows
467
+
468
+
469
def render_markdown(gates: list[dict], rows: list[dict]) -> str:
    """Render the per-fixture score table and the gate summary as markdown.

    The table has one line per row with solo / l2_gated scores, their signed
    delta (``-`` if either is missing), the l2_forced score, pair-judge
    presence marks for both L2 arms, and the l2_gated/solo wall ratio rounded
    to two decimals (``-`` when it cannot be computed). The gate section lists
    each gate's status and rule, followed by its failures, if any.
    """
    out = [
        "# iter-0033c gate table\n",
        "| fixture | solo | l2_gated | Δ | l2_forced | l2g pair? | l2f pair? | wall_ratio |",
        "|---|---|---|---|---|---|---|---|",
    ]

    for r in rows:
        solo, gated = r["solo_score"], r["l2_gated_score"]
        if gated is not None and solo is not None:
            diff = gated - solo
            delta_cell = f"+{diff}" if diff > 0 else str(diff)
        else:
            delta_cell = "-"

        if r["l2_gated_wall"] is not None and r["solo_wall"]:
            ratio_cell = round(r["l2_gated_wall"] / r["solo_wall"], 2)
        else:
            ratio_cell = "-"

        gated_mark = "✓" if r["l2_gated_pair_judge_present"] else "✗"
        forced_mark = "✓" if r["l2_forced_pair_judge_present"] else "✗"
        cells = (
            r["fixture"], solo, gated, delta_cell,
            r["l2_forced_score"], gated_mark, forced_mark, ratio_cell,
        )
        out.append("| " + " | ".join(str(c) for c in cells) + " |")

    out.append("\n## Gates\n")
    for gate in gates:
        tag = " (SHIP-BLOCKER)" if gate.get("ship_blocker") else ""
        out.append(f"- **{gate['gate']}{tag}**: {gate['status']} — {gate.get('rule', '')}")
        for failure in gate.get("failures") or []:
            out.append(f" - FAIL: {failure}")
    return "\n".join(out) + "\n"
500
+
501
+
502
def main() -> int:
    """CLI entry point: evaluate all gates and write the JSON + markdown reports.

    Reads the manifest given by ``--manifest``, builds comparison rows from
    ``--results-dir`` / ``--work-dir-root`` / ``--run-id``, evaluates gates 2
    through 8, and writes ``--out-json`` and ``--out-md``.

    Returns:
        1 if any gate tagged ``ship_blocker`` has status ``"FAIL"``, else 0.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--manifest", required=True)
    ap.add_argument("--results-dir", required=True)
    ap.add_argument("--work-dir-root", default="/tmp",
                    help="parent dir of bench-* WORK_DIRs (default: /tmp)")
    ap.add_argument("--run-id", required=True,
                    help="benchmark run id used by run-fixture.sh (matches WORK_DIR prefix)")
    ap.add_argument("--out-json", required=True)
    ap.add_argument("--out-md", required=True)
    args = ap.parse_args()

    # Explicit UTF-8 throughout: the markdown report contains non-ASCII
    # characters (✓ ✗ Δ ≥ −), which a non-UTF-8 locale encoding cannot write.
    manifest = json.loads(Path(args.manifest).read_text(encoding="utf-8"))
    rows = build_rows(Path(args.results_dir), Path(args.work_dir_root), args.run_id)

    gates = [
        gate_2_no_regression(rows),
        gate_3_lift(rows, manifest),
        gate_4_hard_floor(rows, Path(args.work_dir_root), args.run_id),
        gate_5_efficiency(rows),
        gate_6_trigger_discipline(rows, manifest),
        gate_7_attribution(rows, manifest),
        gate_8_artifact_contract(rows),
    ]

    out = {
        "iter": "0033c",
        "manifest_sha256": manifest["manifest_sha256"],
        "manifest_head": manifest.get("head"),
        "rows": rows,
        "gates": gates,
        "ship_blockers_failed": [g["gate"] for g in gates
                                 if g.get("ship_blocker") and g["status"] == "FAIL"],
        "quality_gates_failed": [g["gate"] for g in gates
                                 if not g.get("ship_blocker") and g["status"] == "FAIL"],
    }
    Path(args.out_json).write_text(json.dumps(out, indent=2) + "\n", encoding="utf-8")
    Path(args.out_md).write_text(render_markdown(gates, rows), encoding="utf-8")

    print(f"[compare] gates -> {args.out_json}")
    print(f"[compare] markdown -> {args.out_md}")
    failed = out["ship_blockers_failed"]
    if failed:
        print(f"[compare] SHIP-BLOCKER FAIL: {failed}")
        return 1
    print("[compare] all ship-blockers PASS")
    return 0
549
+
550
+
551
# Script entry: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)