devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +318 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  12. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  18. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  25. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  31. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  40. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  48. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  56. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  64. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  73. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  82. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  91. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  100. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  101. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  102. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  103. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  104. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  105. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  106. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  107. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  110. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  111. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  112. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  113. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  114. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  116. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  117. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  118. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  119. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  120. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  121. package/config/skills/_shared/archive_run.py +3 -0
  122. package/config/skills/_shared/codex-config.md +2 -2
  123. package/config/skills/_shared/codex-monitored.sh +72 -7
  124. package/config/skills/_shared/collect-codex-findings.py +125 -0
  125. package/config/skills/_shared/engine-preflight.md +1 -1
  126. package/config/skills/_shared/expected.schema.json +18 -0
  127. package/config/skills/_shared/spec-verify-check.py +312 -10
  128. package/config/skills/_shared/verify-merge-findings.py +327 -0
  129. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  130. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  131. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  132. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  133. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  134. package/package.json +1 -1
  135. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,265 @@
1
+ #!/usr/bin/env python3
2
+ """Render a SWE-bench frozen VERIFY matrix from compare artifacts."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import re
9
+ from collections import Counter
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+
14
# Verdict severity ladder: a larger number is a worse VERIFY verdict.
RANK = {
    name: severity
    for severity, name in enumerate(("PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED"))
}


def rank(verdict: str | None) -> int:
    """Return the severity of ``verdict`` from ``RANK``.

    Missing, empty, or unrecognised verdicts map to ``-1`` so they sort below
    every known verdict.
    """
    if not verdict:
        return -1
    return RANK.get(verdict, -1)
24
+
25
+
26
def load_json(path: Path) -> dict[str, Any]:
    """Read ``path`` as UTF-8 text and return the decoded JSON document."""
    raw_text = path.read_text(encoding="utf8")
    document = json.loads(raw_text)
    return document
28
+
29
+
30
def transcript_failure_reason(results_root: Path, run_id: str, arm: str) -> str | None:
    """Derive a failure reason from ``<results_root>/<run_id>/<arm>/transcript.txt``.

    Returns ``"provider_limit"`` when the transcript contains the literal text
    ``You've hit your limit``. Returns ``None`` when the transcript is not a
    regular file or does not contain that text. Undecodable bytes are replaced
    rather than raising.
    """
    candidate = results_root / run_id / arm / "transcript.txt"
    if candidate.is_file():
        text = candidate.read_text(encoding="utf8", errors="replace")
        return "provider_limit" if "You've hit your limit" in text else None
    return None
38
+
39
+
40
def infer_fixture_id(results_root: Path, run_id: str) -> str:
    """Recover the fixture id for a run from the prompt saved as ``input.md``.

    The ``pair`` arm is checked first, then ``solo``. The id is the file stem
    in the first ``docs/roadmap/phase-1/<id>.md`` reference found in the
    prompt. Returns ``"unknown"`` when neither arm has a readable prompt
    containing such a reference.
    """
    for arm in ("pair", "solo"):
        input_path = results_root / run_id / arm / "input.md"
        # is_file() (not exists()) so a stray directory named input.md is
        # skipped instead of raising from read_text().
        if not input_path.is_file():
            continue
        # Decode explicitly as UTF-8 and replace undecodable bytes, matching
        # how transcripts are read in this file; the platform default encoding
        # would make the result locale-dependent and could raise.
        prompt = input_path.read_text(encoding="utf8", errors="replace")
        match = re.search(r"docs/roadmap/phase-1/([^`\s]+)\.md", prompt)
        if match:
            return match.group(1)
    return "unknown"
49
+
50
+
51
def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
    """Return ``pair_elapsed / solo_elapsed``, or ``None`` when it is undefined.

    The ratio is undefined when either value is not an ``int``/``float`` or
    when the solo time is zero or negative.
    """
    numeric = (int, float)
    both_numeric = isinstance(pair_elapsed, numeric) and isinstance(solo_elapsed, numeric)
    # "not <= 0" (rather than "> 0") keeps the original comparison semantics.
    if both_numeric and not solo_elapsed <= 0:
        return pair_elapsed / solo_elapsed
    return None
57
+
58
+
59
def load_gate_rows(gate_json: Path | None) -> dict[str, dict[str, Any]]:
    """Index the rows of a gate artifact by their ``run_id``.

    Args:
        gate_json: Path to the gate JSON artifact, or ``None`` when no gate
            artifact was supplied.

    Returns:
        A mapping of ``run_id`` to the full row dict. Empty when ``gate_json``
        is ``None`` or the artifact has no rows (missing key or ``null``).

    Raises:
        KeyError: if a row has no ``run_id`` key.
    """
    if gate_json is None:
        return {}
    doc = load_json(gate_json)
    # "or []" also covers an explicit `"rows": null`, which .get("rows", [])
    # would hand back as None and fail to iterate.
    return {row["run_id"]: row for row in doc.get("rows") or []}
64
+
65
+
66
def min_gate_rate(value: str) -> float:
    """argparse ``type=`` converter for ``--min-gate-rate``.

    Args:
        value: Raw command-line string.

    Returns:
        The parsed rate, guaranteed to lie in ``[0, 1]``.

    Raises:
        ValueError: if ``value`` is not a float literal.
        argparse.ArgumentTypeError: if the rate is outside ``[0, 1]`` or NaN.
    """
    rate = float(value)
    # A chained comparison is False for NaN, so "nan" is rejected too. The
    # separate "< 0 or > 1" checks let NaN through, and a NaN threshold can
    # never fail because every comparison against it is False.
    if not 0 <= rate <= 1:
        raise argparse.ArgumentTypeError("--min-gate-rate must be between 0 and 1")
    return rate
71
+
72
+
73
def non_negative_int(value: str) -> int:
    """argparse ``type=`` converter that accepts integers ``>= 0``.

    Raises ``argparse.ArgumentTypeError`` for negative integers; a string that
    is not an integer literal raises ``ValueError`` from ``int()``.
    """
    number = int(value)
    if number >= 0:
        return number
    raise argparse.ArgumentTypeError("value must be >= 0")
78
+
79
+
80
def classify(row: dict[str, Any], included: bool) -> str:
    """Return the classification label shown for one matrix row.

    Rows included in the gate are labelled by which lifts they carry. All
    other rows are checked in a fixed priority order, and the first matching
    condition wins: runner failure, missing compare, timeout, provider limit,
    nonzero invoke exit, solo mechanical failure, lift excluded by the gate,
    recall-only signals, and finally "no verdict lift".
    """
    if included:
        lifted = [kind for kind in ("external", "internal") if row[f"{kind}_lift"]]
        return f"gate: {' + '.join(lifted)} lift" if lifted else "gate"

    # Failed attempts: the run never produced comparable evidence.
    if row.get("row_failed_before_compare"):
        label = "failed attempt: row runner"
        exit_code = row.get("row_exit")
        if isinstance(exit_code, int):
            label += f" exit={exit_code}"
        return label
    if row.get("compare_missing"):
        return "failed attempt: missing compare"

    arms = ("solo", "pair")
    if any(row.get(f"{arm}_timed_out") for arm in arms):
        return "failed attempt: timeout"
    if any(row.get(f"{arm}_failure_reason") == "provider_limit" for arm in arms):
        return "failed attempt: provider limit"
    if any(row.get(f"{arm}_invoke_exit") not in (None, 0) for arm in arms):
        return "failed attempt: nonzero invoke exit"

    if row["solo_mechanical"] == "FAIL":
        return "excluded: solo mechanical dominated"

    # A lift was observed but the gate did not accept the row.
    if row["external_lift"] or row["internal_lift"]:
        reasons = row.get("gate_failures") or []
        return "lift excluded: " + "; ".join(reasons) if reasons else "lift outside gate"

    # No lift: report whether pair still surfaced something extra.
    if rank(row["pair_verdict"]) > rank(row["solo_verdict"]):
        return "recall-only advisory"
    if row["pair_found_more_low_or_worse"] or row["pair_found_more_findings"]:
        return "recall-only findings"
    return "no verdict lift"
115
+
116
+
117
def build_row(results_root: Path, run_id: str, gate_rows_by_id: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Assemble the matrix row for one run from its compare artifact.

    Reads ``<results_root>/<run_id>/compare.json`` when it exists; otherwise a
    placeholder marked ``compare_missing`` is used so the row can still be
    classified. Verdicts prefer the ``comparison`` section and fall back to
    each arm's ``verify_verdict``. Failure reasons prefer the arm's recorded
    ``invoke_failure_reason`` and fall back to scanning the arm transcript.
    A row counts as included in the gate only when the matching gate row has
    status ``PASS``.
    """
    compare_file = results_root / run_id / "compare.json"
    if compare_file.exists():
        artifact = load_json(compare_file)
    else:
        artifact = {"solo": {}, "pair": {}, "comparison": {"compare_missing": True}}

    solo_arm = artifact.get("solo") or {}
    pair_arm = artifact.get("pair") or {}
    cmp_section = artifact.get("comparison") or {}
    wall_ratio = elapsed_ratio(pair_arm.get("elapsed_seconds"), solo_arm.get("elapsed_seconds"))
    gate_entry = gate_rows_by_id.get(run_id) or {}

    def failure_reason(arm_doc: dict[str, Any], arm_name: str) -> Any:
        # The transcript is only consulted when the arm recorded no reason.
        return arm_doc.get("invoke_failure_reason") or transcript_failure_reason(
            results_root, run_id, arm_name
        )

    def mechanical(arm_doc: dict[str, Any]) -> Any:
        return (arm_doc.get("sub_verdicts") or {}).get("mechanical")

    # Key order is preserved in the JSON report, so it is kept stable here.
    row: dict[str, Any] = {}
    row["fixture_id"] = infer_fixture_id(results_root, run_id)
    row["run_id"] = run_id
    row["solo_verdict"] = cmp_section.get("solo_verdict") or solo_arm.get("verify_verdict")
    row["pair_verdict"] = cmp_section.get("pair_verdict") or pair_arm.get("verify_verdict")
    row["pair_mode"] = bool(pair_arm.get("pair_mode"))
    row["external_lift"] = bool(cmp_section.get("pair_verdict_lift"))
    row["internal_lift"] = bool(cmp_section.get("pair_internal_verdict_lift"))
    row["pair_found_more_findings"] = bool(cmp_section.get("pair_found_more_findings"))
    row["pair_found_more_low_or_worse"] = bool(cmp_section.get("pair_found_more_low_or_worse"))
    row["row_failed_before_compare"] = bool(cmp_section.get("row_failed_before_compare"))
    row["row_exit"] = cmp_section.get("row_exit")
    row["compare_missing"] = bool(cmp_section.get("compare_missing"))
    row["solo_invoke_exit"] = solo_arm.get("invoke_exit")
    row["pair_invoke_exit"] = pair_arm.get("invoke_exit")
    row["solo_failure_reason"] = failure_reason(solo_arm, "solo")
    row["pair_failure_reason"] = failure_reason(pair_arm, "pair")
    row["solo_timed_out"] = bool(solo_arm.get("timed_out"))
    row["pair_timed_out"] = bool(pair_arm.get("timed_out"))
    row["pair_solo_wall_ratio"] = wall_ratio
    row["solo_mechanical"] = mechanical(solo_arm)
    row["pair_mechanical"] = mechanical(pair_arm)
    row["included_in_gate"] = gate_entry.get("status") == "PASS"
    row["gate_failures"] = gate_entry.get("failures") or []
    row["classification"] = classify(row, row["included_in_gate"])
    return row
161
+
162
+
163
def fmt_ratio(value: Any) -> str:
    """Format a numeric ratio as ``"<n.nn>x"``; anything non-numeric is ``"n/a"``."""
    if not isinstance(value, (int, float)):
        return "n/a"
    return f"{value:.2f}x"
165
+
166
+
167
def write_md(path: Path, report: dict[str, Any]) -> None:
    """Render ``report`` as a Markdown summary and write it to ``path`` (UTF-8).

    The document contains the headline counts, the yield verdict (only when
    thresholds were configured), the gate artifact path (when present),
    classification counts sorted by name, and one table row per run.
    """

    def lower(flag: Any) -> str:
        # Booleans are rendered as lowercase "true"/"false" in the table.
        return str(flag).lower()

    doc: list[str] = []
    doc.append(f"# {report['title']}")
    doc.append("")
    doc.append(f"Verdict: **{report['verdict']}**")
    doc.append("")
    doc.append(f"Runs: {report['runs_total']}")
    doc.append(f"Included in gate: {report['gate_rows']}")
    doc.append(f"Excluded/recall/no-lift: {report['excluded_or_recall_rows']}")
    doc.append(f"Gate rate: {report['gate_rate']:.3f}")
    doc.append(f"Trailing non-gate rows: {report['trailing_non_gate_rows']}")

    if report["yield_thresholds"]:
        doc += ["", f"Yield verdict: **{report['yield_verdict']}**"]
        if report["yield_failures"]:
            doc.append("Yield failures:")
            for failure in report["yield_failures"]:
                doc.append(f"- {failure}")

    if report.get("gate_artifact_json"):
        doc += ["", f"Gate artifact: `{report['gate_artifact_json']}`"]

    doc += ["", "Classification counts:"]
    doc += [f"- {name}: {count}" for name, count in sorted(report["classification_counts"].items())]

    doc += [
        "",
        "| Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Included | Classification |",
        "|---|---|---|---|---:|---|---|---|---|",
    ]
    for row in report["rows"]:
        cells = [
            f"{row['fixture_id']}",
            f"{row['solo_verdict']}",
            f"{row['pair_verdict']}",
            lower(row["pair_mode"]),
            fmt_ratio(row.get("pair_solo_wall_ratio")),
            lower(row["external_lift"]),
            lower(row["internal_lift"]),
            lower(row["included_in_gate"]),
            f"{row['classification']}",
        ]
        doc.append("| " + " | ".join(cells) + " |")

    # Trailing empty entry so the file ends with a newline after the join.
    doc.append("")
    path.write_text("\n".join(doc), encoding="utf8")
205
+
206
+
207
def main() -> int:
    """Build the matrix report from CLI arguments and write JSON and Markdown.

    Returns the process exit status: ``2`` when any configured yield threshold
    is violated, otherwise ``0``. The JSON report is also printed to stdout.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--results-root", default="benchmark/auto-resolve/results", type=Path)
    cli.add_argument("--run-id", action="append", required=True)
    cli.add_argument("--gate-json", type=Path)
    cli.add_argument("--title", required=True)
    cli.add_argument("--verdict", required=True)
    cli.add_argument("--min-gate-rate", type=min_gate_rate)
    cli.add_argument("--max-trailing-non-gate", type=non_negative_int)
    cli.add_argument("--out-json", required=True, type=Path)
    cli.add_argument("--out-md", required=True, type=Path)
    opts = cli.parse_args()

    gate_index = load_gate_rows(opts.gate_json)
    rows = [build_row(opts.results_root, run_id, gate_index) for run_id in opts.run_id]
    total = len(rows)
    gate_rows = len([row for row in rows if row["included_in_gate"]])
    # Number of rows after the last gate-included row (all rows if none is included).
    trailing_non_gate_rows = next(
        (offset for offset, row in enumerate(reversed(rows)) if row["included_in_gate"]),
        total,
    )
    gate_rate = gate_rows / total if total else 0.0

    yield_thresholds = {
        "min_gate_rate": opts.min_gate_rate,
        "max_trailing_non_gate": opts.max_trailing_non_gate,
    }
    configured_thresholds = {
        key: value for key, value in yield_thresholds.items() if value is not None
    }

    yield_failures = []
    if opts.min_gate_rate is not None and gate_rate < opts.min_gate_rate:
        yield_failures.append(f"gate rate {gate_rate:.3f} < minimum {opts.min_gate_rate:.3f}")
    if opts.max_trailing_non_gate is not None and trailing_non_gate_rows > opts.max_trailing_non_gate:
        yield_failures.append(
            f"trailing non-gate rows {trailing_non_gate_rows} > maximum {opts.max_trailing_non_gate}"
        )

    if yield_failures:
        yield_verdict = "FAIL"
    elif configured_thresholds:
        yield_verdict = "PASS"
    else:
        yield_verdict = "NOT_CONFIGURED"

    report = {
        "title": opts.title,
        "verdict": opts.verdict,
        "runs_total": total,
        "gate_rows": gate_rows,
        "excluded_or_recall_rows": total - gate_rows,
        "gate_rate": gate_rate,
        "trailing_non_gate_rows": trailing_non_gate_rows,
        "classification_counts": dict(Counter(row["classification"] for row in rows)),
        "yield_thresholds": configured_thresholds,
        "yield_verdict": yield_verdict,
        "yield_failures": yield_failures,
        "gate_artifact_json": str(opts.gate_json) if opts.gate_json else None,
        "rows": rows,
    }
    rendered = json.dumps(report, indent=2)
    opts.out_json.write_text(rendered + "\n", encoding="utf8")
    write_md(opts.out_md, report)
    print(rendered)
    return 2 if yield_failures else 0
262
+
263
+
264
+ if __name__ == "__main__":
265
+ raise SystemExit(main())
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for frozen-verify-gate.py evidence guards.
3
+
4
set -euo pipefail

# Locate the gate script next to this test, and create a private scratch tree
# that is removed on any exit path.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
GATE="$SCRIPT_DIR/frozen-verify-gate.py"
# Honor TMPDIR (sandboxed CI, macOS) instead of assuming /tmp is writable;
# a trailing slash is stripped so the template has no doubled separator.
TMP_BASE="${TMPDIR:-/tmp}"
TMP_DIR="$(mktemp -d "${TMP_BASE%/}/frozen-verify-gate-test.XXXXXX")"
FIXTURES_DIR="$TMP_DIR/fixtures"
trap 'rm -rf -- "$TMP_DIR"' EXIT
mkdir -p "$FIXTURES_DIR"
12
+
13
#######################################
# Create a synthetic run directory with a compare.json for one gate test.
# Globals:   TMP_DIR (read)
# Arguments:
#   $1 - run id (directory name under $TMP_DIR)
#   $2 - fixture id referenced by pair/input.md; empty → no input.md written
#   $3 - solo verdict
#   $4 - pair verdict
#   $5 - JSON boolean for comparison.pair_verdict_lift
#   $6 - JSON boolean for comparison.pair_internal_verdict_lift (default false)
#   $7 - pair primary verdict (default: pair verdict)
#   $8 - pair judge verdict (default: pair verdict)
#######################################
write_run() {
  local run="$1" fixture="${2:-}" solo="$3" pair="$4" ext_lift="$5"
  local int_lift="${6:-false}"
  local primary="${7:-$pair}"
  local judge="${8:-$pair}"
  local run_dir="$TMP_DIR/$run"

  mkdir -p "$run_dir/pair"
  if [[ -n "$fixture" ]]; then
    printf '%s\n' \
      "Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/${fixture}.md." \
      > "$run_dir/pair/input.md"
  fi
  cat > "$run_dir/compare.json" <<EOF
{
"solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "$solo", "elapsed_seconds": 100},
"pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "$pair", "pair_mode": true, "elapsed_seconds": 200},
"comparison": {
"pair_trigger_missed": false,
"pair_verdict_lift": $ext_lift,
"pair_internal_verdict_lift": $int_lift,
"solo_verdict": "$solo",
"pair_verdict": "$pair",
"pair_primary_verdict": "$primary",
"pair_judge_verdict": "$judge"
}
}
EOF
}
44
+
45
#######################################
# Assert that a command fails AND that its output contains a literal string.
# Globals:   TMP_DIR (read) - output is captured in $TMP_DIR/<label>.out
# Arguments:
#   $1   - label naming the test case and its capture file
#   $2   - fixed string that must appear in the combined stdout/stderr
#   $3.. - command to run
# Outputs:   on violation, a diagnostic plus the captured output on stderr
# Returns:   0 when both checks hold; otherwise exits the script with 1
#######################################
expect_fail_contains() {
  local label="$1"
  local needle="$2"
  shift 2
  local out="$TMP_DIR/$label.out"
  if "$@" > "$out" 2>&1; then
    echo "expected failure for $label" >&2
    cat "$out" >&2
    exit 1
  fi
  # "--" ends option parsing so a needle beginning with "-" is matched as
  # text instead of being parsed as grep options.
  if ! grep -Fq -- "$needle" "$out"; then
    echo "missing expected text for $label: $needle" >&2
    cat "$out" >&2
    exit 1
  fi
}
61
+
62
+ write_run pass-a F10-persist-write-collision PASS_WITH_ISSUES NEEDS_WORK true
63
+ write_run pass-b F12-webhook-raw-body-signature PASS_WITH_ISSUES NEEDS_WORK true
64
+ mkdir -p "$FIXTURES_DIR/F10-persist-write-collision" "$FIXTURES_DIR/F12-webhook-raw-body-signature"
65
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
66
+ --run-id pass-a --run-id pass-b --min-runs 2 --max-pair-solo-wall-ratio 3 \
67
+ > "$TMP_DIR/pass.out"
68
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/pass.out"
69
+ grep -Fq '"avg_pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.out"
70
+ grep -Fq '"pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.out"
71
+
72
+ mkdir -p "$TMP_DIR/summary-verdicts/pair"
73
+ cat > "$TMP_DIR/summary-verdicts/pair/input.md" <<'EOF'
74
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/F13-summary-verdict-fallback.md.
75
+ EOF
76
+ cat > "$TMP_DIR/summary-verdicts/compare.json" <<'EOF'
77
+ {
78
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
79
+ "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200},
80
+ "comparison": {"pair_trigger_missed": false, "pair_verdict_lift": true, "pair_internal_verdict_lift": false}
81
+ }
82
+ EOF
83
+ mkdir -p "$FIXTURES_DIR/F13-summary-verdict-fallback"
84
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
85
+ --run-id summary-verdicts --min-runs 1 \
86
+ > "$TMP_DIR/summary-verdicts.out"
87
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/summary-verdicts.out"
88
+
89
+ write_run dup-a F12-webhook-raw-body-signature PASS_WITH_ISSUES NEEDS_WORK true
90
+ write_run dup-b F12-webhook-raw-body-signature PASS_WITH_ISSUES NEEDS_WORK true
91
+ expect_fail_contains duplicate-fixture "duplicate fixture_id=F12-webhook-raw-body-signature" \
92
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
93
+ --run-id dup-a --run-id dup-b --min-runs 2
94
+
95
+ write_run missing-fixture "" PASS_WITH_ISSUES NEEDS_WORK true
96
+ expect_fail_contains missing-fixture "fixture_id missing" \
97
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
98
+ --run-id missing-fixture --min-runs 1
99
+
100
+ write_run unknown-fixture F99-not-a-real-fixture PASS_WITH_ISSUES NEEDS_WORK true
101
+ expect_fail_contains unknown-fixture "fixture_id not found: F99-not-a-real-fixture" \
102
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
103
+ --run-id unknown-fixture --min-runs 1
104
+
105
+ write_run recall-only F11-batch-import-all-or-nothing PASS PASS_WITH_ISSUES false
106
+ mkdir -p "$FIXTURES_DIR/F11-batch-import-all-or-nothing"
107
+ expect_fail_contains recall-only "pair verdict PASS_WITH_ISSUES is not verdict-binding" \
108
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
109
+ --run-id recall-only --min-runs 1
110
+
111
+ write_run internal-lift F14-internal-pair-lift NEEDS_WORK NEEDS_WORK false true PASS_WITH_ISSUES NEEDS_WORK
112
+ mkdir -p "$FIXTURES_DIR/F14-internal-pair-lift"
113
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
114
+ --run-id internal-lift --min-runs 1 \
115
+ > "$TMP_DIR/internal-lift.out"
116
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/internal-lift.out"
117
+
118
+ write_run slow-pair F15-slow-pair PASS_WITH_ISSUES NEEDS_WORK true
119
+ mkdir -p "$FIXTURES_DIR/F15-slow-pair"
120
+ python3 - "$TMP_DIR/slow-pair/compare.json" <<'PY'
121
+ import json
122
+ import sys
123
+ path = sys.argv[1]
124
+ with open(path) as f:
125
+ data = json.load(f)
126
+ data["pair"]["elapsed_seconds"] = 401
127
+ with open(path, "w") as f:
128
+ json.dump(data, f)
129
+ PY
130
+ expect_fail_contains slow-pair "pair/solo wall ratio 4.01 exceeds 3.00" \
131
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
132
+ --run-id slow-pair --min-runs 1 --max-pair-solo-wall-ratio 3
133
+
134
+ mkdir -p "$TMP_DIR/missing-elapsed/pair"
135
+ cat > "$TMP_DIR/missing-elapsed/pair/input.md" <<'EOF'
136
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/F16-missing-elapsed.md.
137
+ EOF
138
+ cat > "$TMP_DIR/missing-elapsed/compare.json" <<'EOF'
139
+ {
140
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES"},
141
+ "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true},
142
+ "comparison": {
143
+ "pair_trigger_missed": false,
144
+ "pair_verdict_lift": true,
145
+ "pair_internal_verdict_lift": false,
146
+ "solo_verdict": "PASS_WITH_ISSUES",
147
+ "pair_verdict": "NEEDS_WORK",
148
+ "pair_primary_verdict": "NEEDS_WORK",
149
+ "pair_judge_verdict": "NEEDS_WORK"
150
+ }
151
+ }
152
+ EOF
153
+ mkdir -p "$FIXTURES_DIR/F16-missing-elapsed"
154
+ expect_fail_contains missing-elapsed "pair/solo wall ratio missing" \
155
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
156
+ --run-id missing-elapsed --min-runs 1 --max-pair-solo-wall-ratio 3
157
+
158
+ mkdir -p "$TMP_DIR/missing-compare/pair"
159
+ cat > "$TMP_DIR/missing-compare/pair/input.md" <<'EOF'
160
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/F17-missing-compare.md.
161
+ EOF
162
+ mkdir -p "$FIXTURES_DIR/F17-missing-compare"
163
+ expect_fail_contains missing-compare "missing compare.json for missing-compare" \
164
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
165
+ --run-id missing-compare --min-runs 1
166
+
167
+ mkdir -p "$TMP_DIR/provider-limit/pair"
168
+ cat > "$TMP_DIR/provider-limit/pair/input.md" <<'EOF'
169
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/F18-provider-limit.md.
170
+ EOF
171
+ cat > "$TMP_DIR/provider-limit/pair/transcript.txt" <<'EOF'
172
+ You've hit your limit · resets 3am (Asia/Seoul)
173
+ EOF
174
+ cat > "$TMP_DIR/provider-limit/compare.json" <<'EOF'
175
+ {
176
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS", "elapsed_seconds": 100},
177
+ "pair": {"invoke_exit": 1, "timed_out": false, "verify_verdict": null, "pair_mode": false, "elapsed_seconds": 1},
178
+ "comparison": {
179
+ "pair_trigger_missed": false,
180
+ "pair_verdict_lift": false,
181
+ "pair_internal_verdict_lift": false,
182
+ "solo_verdict": "PASS",
183
+ "pair_verdict": null
184
+ }
185
+ }
186
+ EOF
187
+ mkdir -p "$FIXTURES_DIR/F18-provider-limit"
188
+ expect_fail_contains provider-limit "pair provider limit" \
189
+ python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
190
+ --run-id provider-limit --min-runs 1
191
+
192
+ echo "✓ test-frozen-verify-gate"
@@ -0,0 +1,131 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for full-pipeline-pair-gate.py.
3
+
4
set -euo pipefail

# Directory containing this script; the gate under test is looked up next to it,
# so the test works from any working directory. `--` protects against paths
# that begin with a dash.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)"
GATE="$SCRIPT_DIR/full-pipeline-pair-gate.py"

# Scratch directory for synthetic fixtures and captured gate output.
# Honours TMPDIR (sandboxed CI, macOS per-user temp dirs) and falls back to /tmp.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/full-pipeline-pair-gate-test.XXXXXX")"

# Clean up on every exit path; `:?` refuses to run rm with an empty path.
trap 'rm -rf -- "${TMP_DIR:?}"' EXIT
10
+
11
#######################################
# Write one synthetic fixture result tree under $TMP_DIR/<run_id>/<fixture>.
# Globals:
#   TMP_DIR (read) - scratch root that receives the tree
# Arguments:
#   $1 run_id       - run directory name
#   $2 fixture      - fixture directory name
#   $3 bare         - score recorded for the bare arm in judge.json
#   $4 solo         - score recorded for the solo_claude arm
#   $5 pair         - score recorded for the pair arm
#   $6 pair_mode    - JSON literal stored as the pair arm's "pair_mode" (default: true)
#   $7 pair_elapsed - pair arm elapsed_seconds (default: 200)
#   $8 solo_elapsed - solo_claude elapsed_seconds (default: 100)
#   $9 pair_arm     - pair arm directory name / score key (default: l2_gated)
# Outputs:
#   judge.json plus verify.json and result.json for each of the three arms.
#######################################
write_fixture() {
  local run_id="$1"
  local fixture="$2"
  local bare="$3"
  local solo="$4"
  local pair="$5"
  local pair_mode="${6:-true}"
  local pair_elapsed="${7:-200}"
  local solo_elapsed="${8:-100}"
  local pair_arm="${9:-l2_gated}"
  local dir="$TMP_DIR/$run_id/$fixture"
  # Loop variable is declared local so it cannot clobber a caller's `arm`.
  local arm

  mkdir -p "$dir/bare" "$dir/solo_claude" "$dir/$pair_arm"

  # Unquoted delimiter: the scores and arm name are interpolated.
  cat > "$dir/judge.json" <<EOF
{
  "scores_by_arm": {"bare": $bare, "solo_claude": $solo, "$pair_arm": $pair},
  "disqualifiers_by_arm": {}
}
EOF

  # Every arm gets the same clean verify.json.
  for arm in bare solo_claude "$pair_arm"; do
    cat > "$dir/$arm/verify.json" <<'EOF'
{"disqualifier": false}
EOF
  done

  # The bare arm has a fixed elapsed time; solo and pair take theirs from the arguments.
  cat > "$dir/bare/result.json" <<'EOF'
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": 20}
EOF
  cat > "$dir/solo_claude/result.json" <<EOF
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $solo_elapsed}
EOF
  cat > "$dir/$pair_arm/result.json" <<EOF
{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $pair_elapsed, "pair_mode": $pair_mode}
EOF
}
44
+
45
#######################################
# Run a command that is expected to fail and check its combined output.
# Globals:
#   TMP_DIR (read) - captured output is stored in $TMP_DIR/<label>.out
# Arguments:
#   $1 label  - scenario name, used for the capture file and in diagnostics
#   $2 needle - fixed string that must appear in the command's stdout/stderr
#   $3...     - command and arguments to execute
# Outputs:
#   On a mismatch, writes a diagnostic and the captured output to stderr.
# Returns:
#   0 when the command fails and the needle is present; otherwise exits 1.
#######################################
expect_fail_contains() {
  local label="$1"
  local needle="$2"
  shift 2
  local out="$TMP_DIR/$label.out"

  # Inside an `if` condition the non-zero exit does not trip `set -e`.
  if "$@" > "$out" 2>&1; then
    echo "expected failure for $label" >&2
    cat "$out" >&2
    exit 1
  fi

  # `--` ends option parsing so a needle that starts with '-' is matched as
  # text instead of being interpreted as grep options.
  if ! grep -Fq -- "$needle" "$out"; then
    echo "missing expected text for $label: $needle" >&2
    cat "$out" >&2
    exit 1
  fi
}
61
+
62
# Common prefix of every gate invocation in the scenarios below.
gate_cmd=(python3 "$GATE" --results-root "$TMP_DIR")

# Scenario pass: two fixtures whose pair/solo wall ratio is 2.0 each
# (220/110 and 280/140); the gate must succeed and report PASS.
write_fixture pass F21 50 75 82 true 220 110
write_fixture pass F22 60 80 88 true 280 140
"${gate_cmd[@]}" --run-id pass \
  --max-pair-solo-wall-ratio 3 \
  --out-json "$TMP_DIR/pass.json" \
  --out-md "$TMP_DIR/pass.md"
for expected in '"verdict": "PASS"' '"avg_pair_solo_wall_ratio": 2.0'; do
  grep -Fq "$expected" "$TMP_DIR/pass.json"
done
grep -Fq 'Verdict: **PASS**' "$TMP_DIR/pass.md"

# Scenario no-headroom: a solo_claude score of 81 must be rejected as above 80.
write_fixture no-headroom F21 50 81 90 true
write_fixture no-headroom F22 60 80 88 true
expect_fail_contains no-headroom "solo_claude score 81 > 80" \
  "${gate_cmd[@]}" --run-id no-headroom

# Scenario no-pair-mode: one fixture records pair_mode=false.
write_fixture no-pair-mode F21 50 75 85 false
write_fixture no-pair-mode F22 60 80 90 true
expect_fail_contains no-pair-mode "l2_gated pair_mode not true" \
  "${gate_cmd[@]}" --run-id no-pair-mode

# Scenario weak-margin: pair beats solo by only 4 points (79 vs 75).
write_fixture weak-margin F21 50 75 79 true
write_fixture weak-margin F22 60 80 88 true
expect_fail_contains weak-margin "l2_gated margin +4 < +5" \
  "${gate_cmd[@]}" --run-id weak-margin

# Scenario custom-pair-arm: same data as the passing run, stored under a
# different arm name that is selected with --pair-arm.
write_fixture custom-pair-arm F21 50 75 82 true 220 110 l2_risk_probes
write_fixture custom-pair-arm F22 60 80 88 true 280 140 l2_risk_probes
"${gate_cmd[@]}" --run-id custom-pair-arm \
  --pair-arm l2_risk_probes \
  --max-pair-solo-wall-ratio 3 \
  --out-json "$TMP_DIR/custom-pair-arm.json" \
  --out-md "$TMP_DIR/custom-pair-arm.md"
grep -Fq '"pair_arm": "l2_risk_probes"' "$TMP_DIR/custom-pair-arm.json"
grep -Fq 'l2_risk_probes - solo_claude >= 5' "$TMP_DIR/custom-pair-arm.md"

# Scenario provider-limit: patch the pair arm's result.json so it records an
# invoke failure caused by a provider limit.
write_fixture provider-limit F21 50 75 85 true 37 100 l2_risk_probes
provider_limit_result="$TMP_DIR/provider-limit/F21/l2_risk_probes/result.json"
python3 - "$provider_limit_result" <<'PY'
import json, sys
path = sys.argv[1]
data = json.load(open(path))
data["invoke_failure"] = True
data["invoke_failure_reason"] = "provider_limit"
json.dump(data, open(path, "w"), indent=2)
PY
expect_fail_contains provider-limit "l2_risk_probes invoke failure (provider_limit)" \
  "${gate_cmd[@]}" --run-id provider-limit \
  --pair-arm l2_risk_probes --min-fixtures 1
# Second run only produces the report files; the gate's failing exit status is
# expected here and deliberately ignored.
"${gate_cmd[@]}" --run-id provider-limit \
  --pair-arm l2_risk_probes --min-fixtures 1 \
  --out-json "$TMP_DIR/provider-limit.json" \
  --out-md "$TMP_DIR/provider-limit.md" >/dev/null 2>&1 || true
for expected in '"pair_margin": null' '"pair_solo_wall_ratio": null'; do
  grep -Fq "$expected" "$TMP_DIR/provider-limit.json"
done
if grep -Fq 'margin -' "$TMP_DIR/provider-limit.md"; then
  echo "provider-limit row must not report quality margin" >&2
  cat "$TMP_DIR/provider-limit.md" >&2
  exit 1
fi

# Scenario slow-pair: a 401/100 wall ratio exceeds the cap of 3.
write_fixture slow-pair F21 50 75 85 true 401 100
write_fixture slow-pair F22 60 80 88 true 280 140
expect_fail_contains slow-pair "pair/solo wall ratio 4.01 > 3.00" \
  "${gate_cmd[@]}" --run-id slow-pair --max-pair-solo-wall-ratio 3

# Scenario one-fixture: a single fixture must fail the fixture-count check.
write_fixture one-fixture F21 50 75 85 true
expect_fail_contains one-fixture "fixture_count_ok" \
  "${gate_cmd[@]}" --run-id one-fixture --out-json "$TMP_DIR/one-fixture.json"
grep -Fq '"fixture_count_ok": false' "$TMP_DIR/one-fixture.json"

echo "PASS test-full-pipeline-pair-gate"
@@ -0,0 +1,84 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for headroom-gate.py candidate-set guards.
3
+
4
set -euo pipefail

# Directory containing this script; the gate under test is looked up next to it,
# so the test works from any working directory. `--` protects against paths
# that begin with a dash.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)"
GATE="$SCRIPT_DIR/headroom-gate.py"

# Scratch directory for synthetic fixtures and captured gate output.
# Honours TMPDIR (sandboxed CI, macOS per-user temp dirs) and falls back to /tmp.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/headroom-gate-test.XXXXXX")"

# Clean up on every exit path; `:?` refuses to run rm with an empty path.
trap 'rm -rf -- "${TMP_DIR:?}"' EXIT
10
+
11
#######################################
# Write a two-arm (bare + solo_claude) fixture result tree under
# $TMP_DIR/<run_id>/<fixture>.
# Globals:
#   TMP_DIR (read) - scratch root that receives the tree
# Arguments:
#   $1 run_id         - run directory name
#   $2 fixture        - fixture directory name
#   $3 bare           - score recorded for the bare arm in judge.json
#   $4 solo           - score recorded for the solo_claude arm
#   $5 solo_timed_out - JSON literal stored as solo_claude's "timed_out" (default: false)
# Outputs:
#   judge.json plus result.json and verify.json for both arms.
#######################################
write_fixture() {
  local run_id="$1"
  local fixture="$2"
  local bare="$3"
  local solo="$4"
  local solo_timed_out="${5:-false}"
  local dir="$TMP_DIR/$run_id/$fixture"
  local clean_verify='{"disqualifier": false}'

  mkdir -p "$dir/bare" "$dir/solo_claude"

  # Unquoted delimiter: both scores are interpolated into the document.
  cat > "$dir/judge.json" <<EOF
{
  "scores_by_arm": {"bare": $bare, "solo_claude": $solo},
  "disqualifiers_by_arm": {}
}
EOF

  # The bare arm is always a clean, completed run.
  printf '%s\n' '{"timed_out": false, "invoke_failure": false}' > "$dir/bare/result.json"
  printf '%s\n' "$clean_verify" > "$dir/bare/verify.json"

  # Only the solo arm's timed_out flag is configurable.
  printf '{"timed_out": %s, "invoke_failure": false}\n' "$solo_timed_out" \
    > "$dir/solo_claude/result.json"
  printf '%s\n' "$clean_verify" > "$dir/solo_claude/verify.json"
}
38
+
39
#######################################
# Run a command that is expected to fail and check its combined output.
# Globals:
#   TMP_DIR (read) - captured output is stored in $TMP_DIR/<label>.out
# Arguments:
#   $1 label  - scenario name, used for the capture file and in diagnostics
#   $2 needle - fixed string that must appear in the command's stdout/stderr
#   $3...     - command and arguments to execute
# Outputs:
#   On a mismatch, writes a diagnostic and the captured output to stderr.
# Returns:
#   0 when the command fails and the needle is present; otherwise exits 1.
#######################################
expect_fail_contains() {
  local label="$1"
  local needle="$2"
  shift 2
  local out="$TMP_DIR/$label.out"

  # Inside an `if` condition the non-zero exit does not trip `set -e`.
  if "$@" > "$out" 2>&1; then
    echo "expected failure for $label" >&2
    cat "$out" >&2
    exit 1
  fi

  # `--` ends option parsing so a needle that starts with '-' is matched as
  # text instead of being interpreted as grep options.
  if ! grep -Fq -- "$needle" "$out"; then
    echo "missing expected text for $label: $needle" >&2
    cat "$out" >&2
    exit 1
  fi
}
55
+
56
# Common prefix of every gate invocation in the scenarios below.
gate_cmd=(python3 "$GATE" --results-root "$TMP_DIR")

# Scenario one-pass: a single fixture must fail the gate and be recorded as
# an insufficient fixture count in the JSON report.
write_fixture one-pass F10 50 75
expect_fail_contains min-fixtures 'Verdict: **FAIL**' \
  "${gate_cmd[@]}" --run-id one-pass --out-json "$TMP_DIR/one-pass.json"
grep -Fq '"fixture_count_ok": false' "$TMP_DIR/one-pass.json"

# Scenario two-pass: two clean fixtures; the gate must exit 0 and report PASS.
write_fixture two-pass F10 50 75
write_fixture two-pass F12 60 80
"${gate_cmd[@]}" --run-id two-pass --out-json "$TMP_DIR/two-pass.json" \
  > "$TMP_DIR/two-pass.out"
for expected in '"verdict": "PASS"' '"fixture_count_ok": true'; do
  grep -Fq "$expected" "$TMP_DIR/two-pass.json"
done

# Scenario solo-ceiling: a solo_claude score of 92 must be rejected as above 80.
write_fixture solo-ceiling F10 50 75
write_fixture solo-ceiling F12 20 92
expect_fail_contains solo-ceiling "solo_claude score 92 > 80" \
  "${gate_cmd[@]}" --run-id solo-ceiling

# Scenario dirty-solo: one fixture's solo arm is marked as timed out.
write_fixture dirty-solo F10 50 75
write_fixture dirty-solo F12 20 70 true
expect_fail_contains dirty-solo "solo_claude timed out" \
  "${gate_cmd[@]}" --run-id dirty-solo

# Scenario missing-artifact: delete one verify.json after writing the fixture.
write_fixture missing-artifact F10 50 75
write_fixture missing-artifact F12 20 70
rm "$TMP_DIR/missing-artifact/F12/solo_claude/verify.json"
expect_fail_contains missing-artifact "solo_claude verify.json missing" \
  "${gate_cmd[@]}" --run-id missing-artifact

echo "✓ test-headroom-gate"