devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -0,0 +1,1256 @@
1
+ #!/usr/bin/env python3
2
+ """Composite audit for pair-evidence readiness.
3
+
4
+ This is the release/handoff guard for solo<pair benchmark evidence. It runs:
5
+ 1. pair-candidate-frontier.py --fail-on-unmeasured
6
+ 2. audit-headroom-rejections.py
7
+
8
+ Both checks are provider-free and operate only on fixtures, the rejected
9
+ registry, and local gate summary artifacts.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import json
15
+ import math
16
+ import pathlib
17
+ import subprocess
18
+ import sys
19
+ import tempfile
20
+
21
+ SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
22
+ if str(SCRIPT_DIR) not in sys.path:
23
+ sys.path.insert(0, str(SCRIPT_DIR))
24
+
25
+ from pair_evidence_contract import (
26
+ best_pair_evidence,
27
+ has_historical_pair_trigger_reason,
28
+ is_historical_pair_trigger_reason,
29
+ is_strict_int,
30
+ is_strict_number,
31
+ loads_strict_json_object,
32
+ )
33
+
34
+
35
# Top-level keys copied from frontier.json into the audit report's
# "frontier_summary" section (see write_audit_report). Order is preserved in
# the emitted JSON, so keep it stable.
FRONTIER_SUMMARY_KEYS = [
    # Gate outcome and the thresholds it was evaluated against.
    "verdict",
    "min_pair_margin",
    "max_pair_solo_wall_ratio",
    # Fixture population counters.
    "fixtures_total",
    "rejected_count",
    "candidate_count",
    "pair_evidence_count",
    "unmeasured_count",
    # Aggregated pair-evidence metrics.
    "pair_margin_avg",
    "pair_margin_min",
    "pair_solo_wall_ratio_avg",
    "pair_solo_wall_ratio_max",
]
49
def run_check(
    name: str,
    args: list[str],
    *,
    stdout_path: pathlib.Path | None = None,
    stderr_path: pathlib.Path | None = None,
) -> int:
    """Run one audit sub-check as a subprocess and relay its output.

    Args:
        name: Label used in the ``[audit] ...`` progress lines.
        args: Argument vector handed to ``subprocess.run``.
        stdout_path: Optional file that receives the captured stdout.
        stderr_path: Optional file that receives the captured stderr.

    Returns:
        The subprocess exit code (0 is reported as PASS, anything else FAIL).
    """
    print(f"[audit] {name}", flush=True)
    completed = subprocess.run(args, text=True, capture_output=True)

    # Persist the captured streams before echoing them, so the artifacts
    # exist even if echoing to the console fails.
    for artifact_path, captured in (
        (stdout_path, completed.stdout),
        (stderr_path, completed.stderr),
    ):
        if artifact_path is not None:
            artifact_path.write_text(captured, encoding="utf8")

    # Replay the child's output on the matching parent stream.
    if completed.stdout:
        print(completed.stdout, end="", flush=True)
    if completed.stderr:
        print(completed.stderr, end="", file=sys.stderr, flush=True)

    exit_code = completed.returncode
    if exit_code != 0:
        print(f"[audit] {name}: FAIL", file=sys.stderr, flush=True)
        return exit_code
    print(f"[audit] {name}: PASS", flush=True)
    return exit_code
71
+
72
+
73
def write_audit_report(
    *,
    out_dir: pathlib.Path,
    frontier_status: int,
    headroom_status: int,
    min_pair_evidence: int,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float,
    frontier_report_status: int,
    frontier_stdout_status: int,
    headroom_report_status: int,
    pair_evidence_status: int,
    pair_evidence_quality_status: int,
    pair_trigger_reason_status: int,
    pair_evidence_hypothesis_status: int,
    pair_evidence_hypothesis_trigger_status: int,
    require_hypothesis_trigger: bool,
    fixtures_root: pathlib.Path,
) -> None:
    """Assemble the composite audit result and write it to ``audit.json``.

    The report combines the exit codes passed in by the caller with data
    re-read from the artifacts in ``out_dir`` (``frontier.json``,
    ``frontier.stdout``, ``headroom-audit.json``). The overall verdict is
    ``PASS`` only when all ten status codes are zero.

    Args:
        out_dir: Directory holding the audit artifacts; ``audit.json`` is
            written here.
        frontier_status: Exit code of the frontier check.
        headroom_status: Exit code of the headroom-rejections check.
        min_pair_evidence: Required pair-evidence row count (echoed into the
            report).
        min_pair_margin: Required minimum pair margin (echoed).
        max_pair_solo_wall_ratio: Allowed maximum pair/solo wall ratio
            (echoed).
        frontier_report_status: Exit code of the frontier report check.
        frontier_stdout_status: Exit code of the frontier stdout check.
        headroom_report_status: Exit code of the headroom report check.
        pair_evidence_status: Exit code of the pair-evidence count check.
        pair_evidence_quality_status: Exit code of the quality check.
        pair_trigger_reason_status: Exit code of the trigger-reason check.
        pair_evidence_hypothesis_status: Exit code of the hypothesis check.
        pair_evidence_hypothesis_trigger_status: Exit code of the
            hypothesis-trigger check.
        require_hypothesis_trigger: Whether an unmatched hypothesis trigger
            is labelled ``FAIL`` (True) or ``WARN`` (False) in the report.
        fixtures_root: Fixture directory forwarded to the hypothesis helpers.
    """

    def _label(exit_code: int) -> str:
        return "PASS" if exit_code == 0 else "FAIL"

    frontier_json = out_dir / "frontier.json"
    headroom_json = out_dir / "headroom-audit.json"
    frontier_stdout = out_dir / "frontier.stdout"

    frontier_summary = load_summary(frontier_json, FRONTIER_SUMMARY_KEYS)
    pair_evidence_rows = load_pair_evidence_rows(frontier_json)
    pair_evidence_count = frontier_summary.get("pair_evidence_count")

    # Classify every pair-evidence row in a single pass.
    pair_margins = []
    wall_ratios = []
    trigger_reason_rows = []
    canonical_trigger_rows = []
    historical_alias_trigger_rows = []
    for row in pair_evidence_rows:
        if is_strict_int(row.get("pair_margin")):
            pair_margins.append(row["pair_margin"])
        if is_strict_number(row.get("pair_solo_wall_ratio")):
            wall_ratios.append(row["pair_solo_wall_ratio"])
        # Canonical / historical classification only applies to rows that
        # actually expose a list of trigger reasons.
        if not isinstance(row.get("pair_trigger_reasons"), list):
            continue
        trigger_reason_rows.append(row)
        if row.get("pair_trigger_has_canonical_reason") is True:
            canonical_trigger_rows.append(row)
        if has_historical_pair_trigger_reason(row["pair_trigger_reasons"]):
            historical_alias_trigger_rows.append(row)

    historical_alias_details = pair_trigger_historical_alias_details(
        historical_alias_trigger_rows,
    )

    hypothesis_rows = pair_evidence_hypothesis_rows(frontier_json, fixtures_root)
    hypothesis_passing_rows = []
    for row in hypothesis_rows:
        if row.get("has_actionable_hypothesis") is True:
            hypothesis_passing_rows.append(row)

    hypothesis_trigger_rows = pair_evidence_hypothesis_trigger_rows(
        frontier_json,
        fixtures_root,
    )
    hypothesis_trigger_matched_rows = []
    for row in hypothesis_trigger_rows:
        if (
            row.get("has_actionable_hypothesis") is True
            and row.get("has_hypothesis_trigger") is True
        ):
            hypothesis_trigger_matched_rows.append(row)
    hypothesis_trigger_gap_details = pair_evidence_hypothesis_trigger_gap_details(
        hypothesis_trigger_rows,
    )

    frontier_stdout_metrics = load_frontier_stdout_metrics(
        frontier_json,
        frontier_stdout,
    )
    headroom_audit_summary = load_headroom_audit_summary(headroom_json)

    total_rows = len(pair_evidence_rows)
    documented_rows = len(hypothesis_passing_rows)
    matched_rows = len(hypothesis_trigger_matched_rows)
    # Reported by two checks: does the summary count agree with the rows?
    rows_match_count = (
        is_strict_int(pair_evidence_count)
        and total_rows == pair_evidence_count
    )

    every_status = (
        frontier_status,
        headroom_status,
        frontier_report_status,
        frontier_stdout_status,
        headroom_report_status,
        pair_evidence_status,
        pair_evidence_quality_status,
        pair_trigger_reason_status,
        pair_evidence_hypothesis_status,
        pair_evidence_hypothesis_trigger_status,
    )
    verdict = "PASS" if all(code == 0 for code in every_status) else "FAIL"

    if headroom_status == 0 and headroom_report_status == 0:
        headroom_label = "PASS"
    else:
        headroom_label = "FAIL"

    # This label is derived from row counts, not the exit code: a mismatch is
    # only a WARN unless the trigger match is required.
    if matched_rows == documented_rows:
        hypothesis_trigger_label = "PASS"
    elif require_hypothesis_trigger:
        hypothesis_trigger_label = "FAIL"
    else:
        hypothesis_trigger_label = "WARN"

    artifacts = {
        "frontier_json": "frontier.json",
        "frontier_stdout": "frontier.stdout",
        "frontier_stderr": "frontier.stderr",
        "headroom_audit_json": "headroom-audit.json",
        "headroom_rejections_stdout": "headroom-rejections.stdout",
        "headroom_rejections_stderr": "headroom-rejections.stderr",
        "audit_json": "audit.json",
    }

    checks = {
        "frontier": {
            "status": _label(frontier_status),
            "exit_code": frontier_status,
            "report": str(frontier_json),
        },
        "headroom_rejections": {
            "status": headroom_label,
            "exit_code": headroom_status,
            "report_check_exit_code": headroom_report_status,
            "report": str(headroom_json),
            **headroom_audit_summary,
        },
        "frontier_report": {
            "status": _label(frontier_report_status),
            "exit_code": frontier_report_status,
            "verdict": frontier_summary.get("verdict"),
            "unmeasured_count": frontier_summary.get("unmeasured_count"),
        },
        "frontier_stdout": {
            "status": _label(frontier_stdout_status),
            "exit_code": frontier_stdout_status,
            "report": str(frontier_stdout),
            **frontier_stdout_metrics,
        },
        "min_pair_evidence": {
            "status": _label(pair_evidence_status),
            "exit_code": pair_evidence_status,
            "required": min_pair_evidence,
            "actual": pair_evidence_count,
            "actual_rows": total_rows,
            "rows_match_count": rows_match_count,
        },
        "pair_evidence_quality": {
            "status": _label(pair_evidence_quality_status),
            "exit_code": pair_evidence_quality_status,
            "min_pair_margin_required": min_pair_margin,
            "min_pair_margin_actual": min(pair_margins) if pair_margins else None,
            "max_pair_solo_wall_ratio_allowed": max_pair_solo_wall_ratio,
            "max_pair_solo_wall_ratio_actual": (
                round(max(wall_ratios), 2) if wall_ratios else None
            ),
            "summary_min_pair_margin": frontier_summary.get("pair_margin_min"),
            "summary_max_pair_solo_wall_ratio": frontier_summary.get(
                "pair_solo_wall_ratio_max"
            ),
        },
        "pair_trigger_reasons": {
            "status": _label(pair_trigger_reason_status),
            "exit_code": pair_trigger_reason_status,
            "summary_pair_evidence_count": pair_evidence_count,
            "canonical_rows": len(canonical_trigger_rows),
            "historical_alias_rows": len(historical_alias_trigger_rows),
            "historical_alias_details": historical_alias_details,
            "exposed_rows": len(trigger_reason_rows),
            "total_rows": total_rows,
            "rows_match_count": rows_match_count,
        },
        "pair_evidence_hypotheses": {
            "status": _label(pair_evidence_hypothesis_status),
            "exit_code": pair_evidence_hypothesis_status,
            "documented_rows": documented_rows,
            "total_rows": total_rows,
            "rows": hypothesis_rows,
        },
        "pair_evidence_hypothesis_triggers": {
            "status": hypothesis_trigger_label,
            "exit_code": pair_evidence_hypothesis_trigger_status,
            "required": require_hypothesis_trigger,
            "matched_rows": matched_rows,
            "documented_rows": documented_rows,
            "total_rows": total_rows,
            "gap_details": hypothesis_trigger_gap_details,
            "rows": hypothesis_trigger_rows,
        },
    }

    report = {
        "verdict": verdict,
        "min_pair_evidence": min_pair_evidence,
        "min_pair_margin": min_pair_margin,
        "max_pair_solo_wall_ratio": max_pair_solo_wall_ratio,
        "frontier_summary": frontier_summary,
        "pair_evidence_rows": pair_evidence_rows,
        "artifacts": artifacts,
        "checks": checks,
    }
    audit_path = out_dir / "audit.json"
    audit_path.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
271
+
272
+
273
def load_summary(path: pathlib.Path, keys: list[str]) -> dict[str, object]:
    """Return the subset of ``keys`` present in the JSON object stored at ``path``.

    The file is read as UTF-8, matching how this module writes and reads its
    other artifacts, instead of depending on the process locale encoding.

    Args:
        path: JSON artifact to read.
        keys: Top-level keys to copy, in the order they should appear.

    Returns:
        ``{key: value}`` for each requested key found in the document, or an
        empty dict when the file cannot be read, cannot be decoded, or is
        rejected by ``loads_strict_json_object``.
    """
    try:
        # Read and parse are separate steps so an unreadable file is reported
        # as "no summary" before any parsing is attempted.
        text = path.read_text(encoding="utf8")
        data = loads_strict_json_object(text)
    except (OSError, ValueError, json.JSONDecodeError):
        # Best effort: a missing or malformed artifact yields an empty summary;
        # the dedicated status checks are what fail the audit.
        return {}
    return {key: data[key] for key in keys if key in data}
279
+
280
+
281
def pair_trigger_historical_alias_details(
    rows: list[dict[str, object]],
) -> list[dict[str, object]]:
    """List, per row, the trigger reasons recognised as historical aliases.

    Rows whose ``pair_trigger_reasons`` is not a list are skipped. Within a
    row, only string reasons accepted by ``is_historical_pair_trigger_reason``
    are kept; a row with no such reason still yields an entry with an empty
    ``aliases`` list.

    Returns:
        One ``{"fixture": ..., "aliases": [...]}`` dict per retained row, in
        input order.
    """
    details: list[dict[str, object]] = []
    for row in rows:
        reasons = row.get("pair_trigger_reasons")
        if not isinstance(reasons, list):
            continue
        fixture = row["fixture"]
        aliases = []
        for reason in reasons:
            if isinstance(reason, str) and is_historical_pair_trigger_reason(reason):
                aliases.append(reason)
        details.append({"fixture": fixture, "aliases": aliases})
    return details
297
+
298
+
299
def pair_evidence_hypothesis_trigger_gap_details(
    rows: list[dict[str, object]],
) -> list[dict[str, object]]:
    """Describe rows that have an actionable hypothesis but no matching trigger.

    A row is reported when ``has_actionable_hypothesis`` is exactly ``True``,
    ``has_hypothesis_trigger`` is anything other than ``True``, and
    ``pair_trigger_reasons`` is a list.

    Returns:
        One ``{"fixture": ..., "pair_trigger_reasons": [...]}`` dict per
        reported row, in input order, keeping only the string reasons.
    """
    gaps: list[dict[str, object]] = []
    for row in rows:
        if row.get("has_actionable_hypothesis") is not True:
            continue
        if row.get("has_hypothesis_trigger") is True:
            continue
        reasons = row.get("pair_trigger_reasons")
        if not isinstance(reasons, list):
            continue
        gaps.append(
            {
                "fixture": row["fixture"],
                "pair_trigger_reasons": [
                    entry for entry in reasons if isinstance(entry, str)
                ],
            }
        )
    return gaps
316
+
317
+
318
def load_headroom_audit_summary(path: pathlib.Path) -> dict[str, object]:
    """Summarise the headroom audit report stored at ``path``.

    The file is read as UTF-8, matching how this module writes and reads its
    other artifacts, instead of depending on the process locale encoding.

    Args:
        path: The ``headroom-audit.json`` artifact.

    Returns:
        A dict with ``verdict`` and the lengths of the ``unrecorded_failures``
        and ``unsupported_registry_rejections`` lists. A count is ``None``
        when its field is not a list; all three values are ``None`` when the
        file cannot be read, cannot be decoded, or is rejected by
        ``loads_strict_json_object``.
    """
    try:
        # Read and parse are separate steps so an unreadable file takes the
        # fallback path before any parsing is attempted.
        text = path.read_text(encoding="utf8")
        data = loads_strict_json_object(text)
    except (OSError, ValueError, json.JSONDecodeError):
        return {
            "verdict": None,
            "unrecorded_failure_count": None,
            "unsupported_registry_rejection_count": None,
        }
    unrecorded = data.get("unrecorded_failures")
    unsupported = data.get("unsupported_registry_rejections")
    return {
        "verdict": data.get("verdict"),
        "unrecorded_failure_count": (
            len(unrecorded) if isinstance(unrecorded, list) else None
        ),
        "unsupported_registry_rejection_count": (
            len(unsupported) if isinstance(unsupported, list) else None
        ),
    }
338
+
339
+
340
def check_headroom_audit_report(headroom_json: pathlib.Path) -> int:
    """Validate the headroom audit report and return a process-style status.

    Checks run in a fixed order and only the first failure is reported on
    stderr: verdict must be ``"PASS"``, both counts must satisfy
    ``is_strict_int``, and both counts must be zero.

    Returns:
        ``0`` when every check passes, ``1`` otherwise.
    """
    summary = load_headroom_audit_summary(headroom_json)
    verdict = summary.get("verdict")
    unrecorded = summary.get("unrecorded_failure_count")
    unsupported = summary.get("unsupported_registry_rejection_count")

    # The branch order decides which message is shown when several checks
    # would fail at once.
    problem = None
    if verdict != "PASS":
        problem = f"headroom audit verdict {verdict!r} is not PASS"
    elif not is_strict_int(unrecorded):
        problem = "headroom audit unrecorded failure count missing or malformed"
    elif not is_strict_int(unsupported):
        problem = (
            "headroom audit unsupported registry rejection count missing or malformed"
        )
    elif unrecorded != 0:
        problem = f"headroom audit has {unrecorded} unrecorded failure(s)"
    elif unsupported != 0:
        problem = (
            f"headroom audit has {unsupported} unsupported registry rejection(s)"
        )

    if problem is None:
        return 0
    print(problem, file=sys.stderr)
    return 1
373
+
374
+
375
def print_headroom_rejections_summary(
    headroom_json: pathlib.Path,
    *,
    status: int,
) -> None:
    """Print a one-line summary of the headroom audit report to stdout.

    ``status`` is the result of the headroom report check: 0 is rendered as
    ``PASS``, anything else as ``FAIL``. A falsy verdict is shown as
    ``MISSING``.
    """
    report = load_headroom_audit_summary(headroom_json)
    template = (
        "headroom_rejections={status} verdict={verdict} "
        "unrecorded={unrecorded} unsupported={unsupported}"
    )
    outcome = "PASS" if status == 0 else "FAIL"
    shown_verdict = report.get("verdict") or "MISSING"
    unrecorded_text = format_count(report.get("unrecorded_failure_count"))
    unsupported_text = format_count(
        report.get("unsupported_registry_rejection_count")
    )
    line = template.format(
        status=outcome,
        verdict=shown_verdict,
        unrecorded=unrecorded_text,
        unsupported=unsupported_text,
    )
    print(line, flush=True)
393
+
394
+
395
def load_frontier_stdout_metrics(
    frontier_json: pathlib.Path,
    frontier_stdout: pathlib.Path,
) -> dict[str, object]:
    """Count how often the expected frontier lines appear in captured stdout.

    Args:
        frontier_json: frontier report used to derive the expected lines.
        frontier_stdout: captured stdout artifact of the frontier run.

    Returns:
        A dict with the number of exact summary, aggregate and final-verdict
        lines (``None`` when the report lacks the fields needed to build the
        expected line), the expected and observed pair-evidence row counts,
        and booleans saying whether the observed counts match. When the
        stdout artifact cannot be read or decoded, every observed count is
        ``None`` and every match flag is ``False``.
    """
    count_fields = [
        "verdict",
        "fixtures_total",
        "rejected_count",
        "candidate_count",
        "pair_evidence_count",
        "unmeasured_count",
    ]
    aggregate_fields = [
        "pair_margin_avg",
        "pair_margin_min",
        "pair_solo_wall_ratio_avg",
        "pair_solo_wall_ratio_max",
    ]

    expected_rows = len(load_pair_evidence_rows(frontier_json))
    try:
        stdout = frontier_stdout.read_text(encoding="utf8")
    except (OSError, UnicodeDecodeError):
        # An undecodable artifact is as unusable as a missing one; report
        # empty metrics instead of crashing the audit.
        return {
            "summary_rows": None,
            "aggregate_rows": None,
            "final_verdict_rows": None,
            "expected_rows": expected_rows,
            "stdout_rows": None,
            "trigger_rows": None,
            "hypothesis_trigger_rows": None,
            "rows_match_count": False,
            "trigger_rows_match_count": False,
            "hypothesis_trigger_rows_match_count": False,
        }
    # Split once; every count below works on the same list of lines.
    lines = stdout.splitlines()

    summary = load_summary(frontier_json, count_fields + aggregate_fields)
    summary_rows = None
    aggregate_rows = None
    final_verdict_rows = None
    if all(key in summary for key in count_fields):
        expected_summary = (
            "fixtures={fixtures_total} rejected={rejected_count} candidates={candidate_count} "
            "pair_evidence={pair_evidence_count} unmeasured={unmeasured_count} verdict={verdict}"
        ).format(**summary)
        summary_rows = lines.count(expected_summary)
        verdict = summary["verdict"]
        # Check the type first: a malformed verdict (e.g. a JSON list) is
        # unhashable and would raise TypeError in a set membership test.
        if isinstance(verdict, str) and verdict in {"PASS", "FAIL"}:
            final_verdict_rows = lines.count(f"{verdict} pair-candidate-frontier")
    if all(key in summary for key in aggregate_fields):
        expected_aggregate = (
            "pair_margin_avg={avg} pair_margin_min={min_margin} "
            "wall_avg={wall_avg} wall_max={wall_max}"
        ).format(
            avg=format_decimal_margin(summary.get("pair_margin_avg")),
            min_margin=format_margin(summary.get("pair_margin_min")),
            wall_avg=format_wall_ratio(summary.get("pair_solo_wall_ratio_avg")),
            wall_max=format_wall_ratio(summary.get("pair_solo_wall_ratio_max")),
        )
        aggregate_rows = lines.count(expected_aggregate)

    passed_lines = [
        line for line in lines if "verdict=pair_evidence_passed" in line
    ]
    stdout_rows = len(passed_lines)
    trigger_rows = sum(1 for line in passed_lines if " triggers=" in line)
    hypothesis_trigger_rows = sum(
        1 for line in passed_lines if " hypothesis_trigger=" in line
    )
    return {
        "summary_rows": summary_rows,
        "aggregate_rows": aggregate_rows,
        "final_verdict_rows": final_verdict_rows,
        "expected_rows": expected_rows,
        "stdout_rows": stdout_rows,
        "trigger_rows": trigger_rows,
        "hypothesis_trigger_rows": hypothesis_trigger_rows,
        "rows_match_count": stdout_rows == expected_rows,
        "trigger_rows_match_count": trigger_rows == expected_rows,
        "hypothesis_trigger_rows_match_count": hypothesis_trigger_rows == expected_rows,
    }
499
+
500
+
501
def load_pair_evidence_rows(path: pathlib.Path) -> list[dict[str, object]]:
    """Extract the passing pair-evidence rows from the frontier report.

    A report row contributes when it is a dict with status
    ``"pair_evidence_passed"``, a string ``fixture`` and a list
    ``passing_pair_evidence`` for which ``best_pair_evidence`` returns a
    non-``None`` mapping. The result merges that mapping with the fixture
    name and verdict. Returns an empty list when the file cannot be read or
    parsed, or when ``rows`` is not a list.
    """
    try:
        payload = loads_strict_json_object(path.read_text())
    except (OSError, ValueError, json.JSONDecodeError):
        return []
    raw_rows = payload.get("rows")
    if not isinstance(raw_rows, list):
        return []

    collected: list[dict[str, object]] = []
    for raw in raw_rows:
        if not isinstance(raw, dict):
            continue
        if raw.get("status") != "pair_evidence_passed":
            continue
        name = raw.get("fixture")
        candidates = raw.get("passing_pair_evidence")
        if not (isinstance(name, str) and isinstance(candidates, list)):
            continue
        winner = best_pair_evidence(candidates)
        if winner is None:
            continue
        # Keys from the best evidence entry are merged last, as in the report.
        collected.append(
            {"fixture": name, "verdict": "pair_evidence_passed", **winner}
        )
    return collected
525
+
526
+
527
def check_min_pair_evidence(frontier_json: pathlib.Path, minimum: int) -> int:
    """Check that the frontier report has at least ``minimum`` pair-evidence rows.

    Returns 0 when the summary's ``pair_evidence_count`` is a strict integer,
    is not below ``minimum`` and equals the number of complete evidence rows;
    otherwise writes the reason to stderr and returns 1.
    """
    count = load_summary(frontier_json, ["pair_evidence_count"]).get(
        "pair_evidence_count"
    )
    if not is_strict_int(count):
        failure = "pair evidence count missing or malformed from frontier report"
    elif count < minimum:
        failure = f"pair evidence count {count} below required minimum {minimum}"
    else:
        # Rows are only loaded once the summary count itself is acceptable.
        rows = load_pair_evidence_rows(frontier_json)
        if len(rows) == count:
            return 0
        failure = f"pair evidence rows {len(rows)} do not match summary count {count}"
    print(failure, file=sys.stderr)
    return 1
547
+
548
+
549
def check_pair_evidence_quality(
    frontier_json: pathlib.Path,
    *,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float,
) -> int:
    """Check margin and wall-ratio thresholds for every pair-evidence row.

    Fails (returns 1 after writing to stderr) when there are no rows, when
    any row's ``pair_margin`` is below ``min_pair_margin``, when any row's
    ``pair_solo_wall_ratio`` exceeds ``max_pair_solo_wall_ratio``, or when
    the summary's ``pair_margin_min`` / ``pair_solo_wall_ratio_max`` disagree
    with the values recomputed from the rows. Returns 0 otherwise.
    """

    def _reject(message: str) -> int:
        print(message, file=sys.stderr)
        return 1

    evidence = load_pair_evidence_rows(frontier_json)
    if not evidence:
        return _reject("pair evidence quality check has no complete rows")

    weak = [
        item["fixture"]
        for item in evidence
        if item["pair_margin"] < min_pair_margin
    ]
    if weak:
        return _reject(
            "pair evidence margin below minimum for fixture(s): "
            + ", ".join(weak)
        )

    slow = [
        item["fixture"]
        for item in evidence
        if item["pair_solo_wall_ratio"] > max_pair_solo_wall_ratio
    ]
    if slow:
        return _reject(
            "pair evidence wall ratio above maximum for fixture(s): "
            + ", ".join(slow)
        )

    reported = load_summary(
        frontier_json,
        ["pair_margin_min", "pair_solo_wall_ratio_max"],
    )
    smallest_margin = min(item["pair_margin"] for item in evidence)
    # The recomputed maximum is rounded to two decimals before comparison.
    largest_wall = round(
        max(item["pair_solo_wall_ratio"] for item in evidence), 2
    )
    if reported.get("pair_margin_min") != smallest_margin:
        return _reject("frontier pair_margin_min does not match pair evidence rows")
    if reported.get("pair_solo_wall_ratio_max") != largest_wall:
        return _reject(
            "frontier pair_solo_wall_ratio_max does not match pair evidence rows"
        )
    return 0
599
+
600
+
601
def print_pair_evidence_quality(
    frontier_json: pathlib.Path,
    *,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float,
    status: int,
) -> None:
    """Print the observed versus required pair-evidence quality thresholds.

    The actual minimum margin and maximum wall ratio are taken from rows
    whose values pass the strict int / strict number checks; each is passed
    as ``None`` to its formatter when no such row exists. ``status`` 0 is
    rendered as ``PASS``, anything else as ``FAIL``.
    """
    evidence = load_pair_evidence_rows(frontier_json)
    margins = []
    walls = []
    for item in evidence:
        if is_strict_int(item.get("pair_margin")):
            margins.append(item["pair_margin"])
        if is_strict_number(item.get("pair_solo_wall_ratio")):
            walls.append(item["pair_solo_wall_ratio"])
    lowest_margin = min(margins) if margins else None
    highest_wall = max(walls) if walls else None

    template = (
        "pair_evidence_quality={status} min_pair_margin_actual={actual_margin} "
        "min_pair_margin_required={required_margin} max_wall_actual={actual_wall} "
        "max_wall_allowed={allowed_wall}"
    )
    print(
        template.format(
            status="PASS" if status == 0 else "FAIL",
            actual_margin=format_margin(lowest_margin),
            required_margin=format_margin(min_pair_margin),
            actual_wall=format_wall_ratio(highest_wall),
            allowed_wall=format_wall_ratio(max_pair_solo_wall_ratio),
        ),
        flush=True,
    )
631
+
632
+
633
def check_pair_trigger_reasons(frontier_json: pathlib.Path) -> int:
    """Check that every pair-evidence row exposes canonical trigger reasons.

    Returns 1 (after writing to stderr) when the summary count is not a
    strict integer, when it differs from the number of evidence rows, when a
    row has no ``pair_trigger_reasons`` list, or when a row with such a list
    does not have ``pair_trigger_has_canonical_reason`` set to ``True``.
    Returns 0 otherwise.
    """
    rows = load_pair_evidence_rows(frontier_json)
    count = load_summary(frontier_json, ["pair_evidence_count"]).get(
        "pair_evidence_count"
    )
    if not is_strict_int(count):
        print("pair trigger reason count missing or malformed from frontier report", file=sys.stderr)
        return 1
    if len(rows) != count:
        print(
            f"pair trigger reason rows {len(rows)} do not match summary count {count}",
            file=sys.stderr,
        )
        return 1

    # A row lands in exactly one bucket: no reasons list, or list without
    # the canonical-reason flag.
    missing = []
    malformed = []
    for row in rows:
        if not isinstance(row.get("pair_trigger_reasons"), list):
            missing.append(row["fixture"])
        elif row.get("pair_trigger_has_canonical_reason") is not True:
            malformed.append(row["fixture"])

    if missing:
        problem = "pair trigger reasons missing for fixture(s): " + ", ".join(missing)
    elif malformed:
        problem = (
            "pair trigger reasons missing canonical trigger for fixture(s): "
            + ", ".join(malformed)
        )
    else:
        return 0
    print(problem, file=sys.stderr)
    return 1
672
+
673
+
674
def print_pair_trigger_reasons_summary(
    frontier_json: pathlib.Path,
    *,
    status: int,
) -> None:
    """Print counts describing the trigger reasons of the pair-evidence rows.

    The summary line reports how many rows expose a reasons list, how many of
    those are flagged canonical, how many contain a historical alias, the
    total row count, the summary's count and whether the two agree. When
    historical aliases exist, a second line lists them per fixture.
    """
    rows = load_pair_evidence_rows(frontier_json)
    count = load_summary(frontier_json, ["pair_evidence_count"]).get(
        "pair_evidence_count"
    )
    rows_match = is_strict_int(count) and len(rows) == count

    exposed = []
    canonical = []
    historical_alias = []
    for item in rows:
        if not isinstance(item.get("pair_trigger_reasons"), list):
            continue
        exposed.append(item)
        if item.get("pair_trigger_has_canonical_reason") is True:
            canonical.append(item)
        if has_historical_pair_trigger_reason(item["pair_trigger_reasons"]):
            historical_alias.append(item)
    historical_alias_details = pair_trigger_historical_alias_details(historical_alias)

    template = (
        "pair_trigger_reasons={status} canonical={canonical} historical_alias={historical_alias} "
        "exposed={exposed} total={total} summary={summary} rows_match={rows_match}"
    )
    print(
        template.format(
            status="PASS" if status == 0 else "FAIL",
            canonical=len(canonical),
            historical_alias=len(historical_alias),
            exposed=len(exposed),
            total=len(rows),
            summary=format_count(count),
            rows_match=str(rows_match).lower(),
        ),
        flush=True,
    )
    if not historical_alias_details:
        return
    details = []
    for row in historical_alias_details:
        details.append(f"{row['fixture']}={','.join(row['aliases'])}")
    print(
        "pair_trigger_historical_aliases=" + ";".join(details),
        flush=True,
    )
721
+
722
+
723
def fixture_has_actionable_hypothesis(fixtures_root: pathlib.Path, fixture: str) -> bool:
    """Report whether the hypothesis checker script accepts ``fixture``.

    Runs ``solo-headroom-hypothesis.py`` from ``SCRIPT_DIR`` with the current
    interpreter, passing the fixture's ``expected.json`` via
    ``--expected-json`` and its ``spec.md`` as the positional argument. The
    result is ``True`` exactly when the process exits with status 0; its
    captured output is discarded.
    """
    target = fixtures_root / fixture
    command = [
        sys.executable,
        str(SCRIPT_DIR / "solo-headroom-hypothesis.py"),
        "--expected-json",
        str(target / "expected.json"),
        str(target / "spec.md"),
    ]
    completed = subprocess.run(command, text=True, capture_output=True)
    return completed.returncode == 0
738
+
739
+
740
def pair_evidence_hypothesis_rows(
    frontier_json: pathlib.Path,
    fixtures_root: pathlib.Path,
) -> list[dict[str, object]]:
    """Pair each evidence fixture with its actionable-hypothesis check result.

    Returns one dict per pair-evidence row holding the ``fixture`` and the
    boolean ``has_actionable_hypothesis`` computed for that fixture.
    """
    described: list[dict[str, object]] = []
    for evidence in load_pair_evidence_rows(frontier_json):
        name = evidence["fixture"]
        described.append(
            {
                "fixture": name,
                "has_actionable_hypothesis": fixture_has_actionable_hypothesis(
                    fixtures_root, str(name)
                ),
            }
        )
    return described
755
+
756
+
757
def pair_evidence_hypothesis_trigger_rows(
    frontier_json: pathlib.Path,
    fixtures_root: pathlib.Path,
) -> list[dict[str, object]]:
    """Describe, per evidence fixture, its hypothesis and hypothesis trigger.

    Each returned dict holds the ``fixture``, whether it has an actionable
    hypothesis, whether ``"spec.solo_headroom_hypothesis"`` appears in its
    ``pair_trigger_reasons`` list, and that list (an empty list when the row
    does not carry a list).
    """

    def _describe(evidence: dict[str, object]) -> dict[str, object]:
        reasons = evidence.get("pair_trigger_reasons")
        actionable = fixture_has_actionable_hypothesis(
            fixtures_root, str(evidence["fixture"])
        )
        reasons_listed = isinstance(reasons, list)
        return {
            "fixture": evidence["fixture"],
            "has_actionable_hypothesis": actionable,
            "has_hypothesis_trigger": (
                reasons_listed and "spec.solo_headroom_hypothesis" in reasons
            ),
            "pair_trigger_reasons": reasons if reasons_listed else [],
        }

    return [_describe(item) for item in load_pair_evidence_rows(frontier_json)]
782
+
783
+
784
def check_pair_evidence_hypotheses(
    frontier_json: pathlib.Path,
    fixtures_root: pathlib.Path,
) -> int:
    """Require an actionable hypothesis for every pair-evidence fixture.

    Returns 0 when every fixture passes; otherwise names the fixtures
    without one on stderr and returns 1.
    """
    undocumented = []
    for entry in pair_evidence_hypothesis_rows(frontier_json, fixtures_root):
        if entry.get("has_actionable_hypothesis") is not True:
            undocumented.append(str(entry["fixture"]))
    if not undocumented:
        return 0
    print(
        "pair evidence hypotheses missing for fixture(s): "
        + ", ".join(undocumented),
        file=sys.stderr,
    )
    return 1
802
+
803
+
804
def check_pair_evidence_hypothesis_triggers(
    frontier_json: pathlib.Path,
    fixtures_root: pathlib.Path,
    *,
    required: bool,
) -> int:
    """Check that documented hypotheses are reflected in trigger reasons.

    Returns 1, after naming the affected fixtures on stderr, only when
    ``required`` is true and at least one gap exists. Returns 0 otherwise,
    including when gaps exist but the check is not required.
    """
    rows = pair_evidence_hypothesis_trigger_rows(frontier_json, fixtures_root)
    gaps = pair_evidence_hypothesis_trigger_gap_details(rows)
    if not (required and gaps):
        return 0
    names = ", ".join(str(row["fixture"]) for row in gaps)
    print(
        "pair evidence hypothesis triggers missing for fixture(s): " + names,
        file=sys.stderr,
    )
    return 1
820
+
821
+
822
def print_pair_evidence_hypotheses_summary(
    frontier_json: pathlib.Path,
    fixtures_root: pathlib.Path,
    *,
    status: int,
) -> None:
    """Print how many pair-evidence fixtures have an actionable hypothesis.

    ``status`` 0 is rendered as ``PASS``, anything else as ``FAIL``.
    """
    rows = pair_evidence_hypothesis_rows(frontier_json, fixtures_root)
    documented_total = sum(
        1 for entry in rows if entry.get("has_actionable_hypothesis") is True
    )
    template = "pair_evidence_hypotheses={status} documented={documented} total={total}"
    print(
        template.format(
            status="PASS" if status == 0 else "FAIL",
            documented=documented_total,
            total=len(rows),
        ),
        flush=True,
    )
840
+
841
+
842
def print_pair_evidence_hypothesis_triggers_summary(
    frontier_json: pathlib.Path,
    fixtures_root: pathlib.Path,
    *,
    required: bool,
) -> None:
    """Print how many documented hypotheses have a matching trigger reason.

    The status is ``PASS`` when every documented fixture is matched;
    otherwise it is ``FAIL`` if ``required`` is true and ``WARN`` if not.
    When gaps exist, a second line lists each gap fixture with its trigger
    reasons.
    """
    rows = pair_evidence_hypothesis_trigger_rows(frontier_json, fixtures_root)
    documented = [
        entry for entry in rows if entry.get("has_actionable_hypothesis") is True
    ]
    matched = [
        entry for entry in documented if entry.get("has_hypothesis_trigger") is True
    ]
    if len(matched) == len(documented):
        status = "PASS"
    elif required:
        status = "FAIL"
    else:
        status = "WARN"

    template = (
        "pair_evidence_hypothesis_triggers={status} matched={matched} "
        "documented={documented} total={total}"
    )
    print(
        template.format(
            status=status,
            matched=len(matched),
            documented=len(documented),
            total=len(rows),
        ),
        flush=True,
    )

    gap_details = pair_evidence_hypothesis_trigger_gap_details(rows)
    if not gap_details:
        return
    details = []
    for row in gap_details:
        details.append(f"{row['fixture']}={','.join(row['pair_trigger_reasons'])}")
    print(
        "pair_evidence_hypothesis_trigger_gaps=" + ";".join(details),
        flush=True,
    )
878
+
879
+
880
def check_frontier_report(frontier_json: pathlib.Path) -> int:
    """Validate the frontier report's verdict and unmeasured count.

    Returns 0 only when the verdict is ``"PASS"`` and ``unmeasured_count`` is
    a strict integer equal to zero; otherwise writes the first failing
    condition to stderr and returns 1.
    """
    summary = load_summary(frontier_json, ["verdict", "unmeasured_count"])
    verdict = summary.get("verdict")
    unmeasured_count = summary.get("unmeasured_count")

    if verdict != "PASS":
        problem = f"frontier verdict {verdict!r} is not PASS"
    elif not is_strict_int(unmeasured_count):
        problem = "frontier unmeasured count missing or malformed"
    elif unmeasured_count != 0:
        problem = f"frontier has {unmeasured_count} unmeasured candidate fixture(s)"
    else:
        return 0
    print(problem, file=sys.stderr)
    return 1
897
+
898
+
899
def check_frontier_stdout(frontier_json: pathlib.Path, frontier_stdout: pathlib.Path) -> int:
    """Verify that the captured frontier stdout agrees with the JSON report.

    The checks run in order and the first failure is written to stderr with
    a return value of 1: the stdout artifact is readable; the summary fields
    exist and the counts are strict integers; the summary line appears
    exactly once; when pair evidence exists, the aggregate fields are
    well-formed and the aggregate line appears exactly once; the number of
    ``verdict=pair_evidence_passed`` lines equals the number of evidence
    rows; every expected score row occurs in stdout; the verdict is ``PASS``
    or ``FAIL`` and its final line appears exactly once. Returns 0 when all
    checks pass.
    """

    def _fail(message: str) -> int:
        print(message, file=sys.stderr)
        return 1

    try:
        stdout = frontier_stdout.read_text(encoding="utf8")
    except OSError:
        return _fail("frontier stdout artifact missing")
    stdout_lines = stdout.splitlines()

    summary = load_summary(
        frontier_json,
        [
            "verdict",
            "fixtures_total",
            "rejected_count",
            "candidate_count",
            "pair_evidence_count",
            "unmeasured_count",
            "pair_margin_avg",
            "pair_margin_min",
            "pair_solo_wall_ratio_avg",
            "pair_solo_wall_ratio_max",
        ],
    )
    count_keys = {
        "fixtures_total",
        "rejected_count",
        "candidate_count",
        "pair_evidence_count",
        "unmeasured_count",
    }
    required_keys = count_keys | {"verdict"}
    if not required_keys <= set(summary):
        return _fail("frontier stdout check missing summary fields")
    if not all(is_strict_int(summary.get(key)) for key in count_keys):
        return _fail("frontier stdout summary counts malformed")

    required_summary = (
        "fixtures={fixtures_total} rejected={rejected_count} candidates={candidate_count} "
        "pair_evidence={pair_evidence_count} unmeasured={unmeasured_count} verdict={verdict}"
    ).format(**summary)
    if stdout_lines.count(required_summary) != 1:
        return _fail("frontier stdout summary score row count is not exactly 1")

    # The aggregate line is only expected when there is pair evidence.
    if summary.get("pair_evidence_count") > 0:
        aggregate_keys = {
            "pair_margin_avg",
            "pair_margin_min",
            "pair_solo_wall_ratio_avg",
            "pair_solo_wall_ratio_max",
        }
        if not aggregate_keys <= set(summary):
            return _fail("frontier stdout check missing aggregate fields")
        aggregates_well_formed = (
            is_strict_number(summary.get("pair_margin_avg"))
            and is_strict_int(summary.get("pair_margin_min"))
            and is_strict_number(summary.get("pair_solo_wall_ratio_avg"))
            and is_strict_number(summary.get("pair_solo_wall_ratio_max"))
        )
        if not aggregates_well_formed:
            return _fail("frontier stdout aggregate fields malformed")
        required_aggregate = (
            "pair_margin_avg={avg} pair_margin_min={min_margin} "
            "wall_avg={wall_avg} wall_max={wall_max}"
        ).format(
            avg=format_decimal_margin(summary.get("pair_margin_avg")),
            min_margin=format_margin(summary.get("pair_margin_min")),
            wall_avg=format_wall_ratio(summary.get("pair_solo_wall_ratio_avg")),
            wall_max=format_wall_ratio(summary.get("pair_solo_wall_ratio_max")),
        )
        if stdout_lines.count(required_aggregate) != 1:
            return _fail("frontier stdout aggregate score row count is not exactly 1")

    expected_rows = load_pair_evidence_rows(frontier_json)
    stdout_score_rows = [
        line for line in stdout_lines if "verdict=pair_evidence_passed" in line
    ]
    if len(stdout_score_rows) != len(expected_rows):
        return _fail(
            f"frontier stdout score row count {len(stdout_score_rows)} "
            f"does not match frontier evidence row count {len(expected_rows)}"
        )

    row_template = (
        "{fixture}: bare={bare_score} solo_claude={solo_score} pair={pair_score} "
        "arm={pair_arm} margin={pair_margin:+d} wall={wall} run={run_id} "
        "verdict=pair_evidence_passed triggers={triggers} "
        "hypothesis_trigger={hypothesis_trigger}"
    )
    for row in expected_rows:
        required_row = row_template.format(
            **row,
            wall=format_wall_ratio(row.get("pair_solo_wall_ratio")),
            triggers=format_trigger_reasons(row.get("pair_trigger_reasons")),
            hypothesis_trigger=format_bool(row.get("pair_trigger_has_hypothesis_reason")),
        )
        # Score rows are matched as substrings of the whole stdout text.
        if required_row not in stdout:
            return _fail(f"frontier stdout missing score row for {row['fixture']}")

    if summary["verdict"] not in {"PASS", "FAIL"}:
        return _fail("frontier stdout verdict malformed")
    final_verdict_count = stdout_lines.count(
        f"{summary['verdict']} pair-candidate-frontier"
    )
    if final_verdict_count != 1:
        return _fail("frontier stdout final verdict row count is not exactly 1")
    return 0
1019
+
1020
+
1021
def format_wall_ratio(value: object) -> str:
    """Render a wall-time ratio with two decimals and an ``x`` suffix.

    Returns an empty string when ``value`` fails the strict-number check.
    """
    if not is_strict_number(value):
        return ""
    return f"{value:.2f}x"
1023
+
1024
+
1025
def format_decimal_margin(value: object) -> str:
    """Render a margin with an explicit sign and two decimals.

    Returns an empty string when ``value`` fails the strict-number check.
    """
    if not is_strict_number(value):
        return ""
    return f"{value:+.2f}"
1027
+
1028
+
1029
def format_margin(value: object) -> str:
    """Render an integer margin with an explicit sign.

    Returns an empty string when ``value`` fails the strict-int check.
    """
    if not is_strict_int(value):
        return ""
    return f"{value:+d}"
1031
+
1032
+
1033
def format_count(value: object) -> str:
    """Render a count, or ``MISSING`` when it fails the strict-int check."""
    if is_strict_int(value):
        return str(value)
    return "MISSING"
1035
+
1036
+
1037
def format_trigger_reasons(value: object) -> str:
    """Join trigger reasons with commas.

    Returns an empty string unless ``value`` is a list made up only of
    strings.
    """
    if isinstance(value, list) and all(isinstance(item, str) for item in value):
        return ",".join(value)
    return ""
1041
+
1042
+
1043
def format_bool(value: object) -> str:
    """Render a bool as lowercase ``true``/``false``; anything else as ``""``."""
    if not isinstance(value, bool):
        return ""
    return str(value).lower()
1045
+
1046
+
1047
def main() -> int:
    """Run the pair-evidence audit and return the process exit status.

    Parses the command line, runs the frontier and headroom-rejection
    scripts through ``run_check``, applies every report check, prints the
    summaries, and (when ``--out-dir`` is given) writes the audit report.

    Returns:
        2 for invalid threshold arguments, 1 when any check fails, 0 when
        all checks pass.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--fixtures-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/fixtures"),
    )
    parser.add_argument(
        "--registry",
        type=pathlib.Path,
        default=SCRIPT_DIR / "pair-rejected-fixtures.sh",
    )
    parser.add_argument(
        "--results-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/results"),
    )
    parser.add_argument(
        "--out-dir",
        type=pathlib.Path,
        help="optional directory for audit.json, frontier.json, and headroom-audit.json",
    )
    parser.add_argument(
        "--min-pair-evidence",
        type=int,
        default=4,
        help="minimum active fixtures with passing pair evidence required for PASS",
    )
    parser.add_argument(
        "--min-pair-margin",
        type=int,
        default=5,
        help="minimum pair-over-solo margin required to count passing pair evidence",
    )
    parser.add_argument(
        "--max-pair-solo-wall-ratio",
        type=float,
        default=3.0,
        help="maximum pair/solo wall-time ratio allowed to count passing pair evidence",
    )
    parser.add_argument(
        "--require-hypothesis-trigger",
        action="store_true",
        help=(
            "fail if pair-evidence fixtures with actionable solo-headroom hypotheses "
            "do not expose spec.solo_headroom_hypothesis in trigger reasons"
        ),
    )
    args = parser.parse_args()
    if args.min_pair_evidence < 1:
        print("error: --min-pair-evidence must be >= 1", file=sys.stderr)
        return 2
    if args.min_pair_margin < 1:
        print("error: --min-pair-margin must be >= 1", file=sys.stderr)
        return 2
    if not math.isfinite(args.max_pair_solo_wall_ratio) or args.max_pair_solo_wall_ratio <= 0:
        print("error: --max-pair-solo-wall-ratio must be finite and > 0", file=sys.stderr)
        return 2

    # Without --out-dir the intermediate reports live in a temporary
    # directory that is removed before the final verdict is printed.
    out_dir = args.out_dir
    temp_dir = None
    if out_dir is None:
        temp_dir = tempfile.TemporaryDirectory()
        report_dir = pathlib.Path(temp_dir.name)
    else:
        out_dir.mkdir(parents=True, exist_ok=True)
        report_dir = out_dir

    try:
        frontier_json = report_dir / "frontier.json"
        headroom_json = report_dir / "headroom-audit.json"

        common = [
            "--fixtures-root",
            str(args.fixtures_root),
            "--registry",
            str(args.registry),
            "--results-root",
            str(args.results_root),
        ]
        frontier_cmd = [
            sys.executable,
            str(SCRIPT_DIR / "pair-candidate-frontier.py"),
            *common,
            "--fail-on-unmeasured",
            "--min-pair-margin",
            str(args.min_pair_margin),
            "--max-pair-solo-wall-ratio",
            str(args.max_pair_solo_wall_ratio),
            "--out-json",
            str(frontier_json),
        ]
        headroom_cmd = [
            sys.executable,
            str(SCRIPT_DIR / "audit-headroom-rejections.py"),
            *common,
            "--min-pair-margin",
            str(args.min_pair_margin),
            "--max-pair-solo-wall-ratio",
            str(args.max_pair_solo_wall_ratio),
            "--out-json",
            str(headroom_json),
        ]

        frontier_stdout_path = report_dir / "frontier.stdout"
        frontier_stderr_path = report_dir / "frontier.stderr"
        headroom_stdout_path = report_dir / "headroom-rejections.stdout"
        headroom_stderr_path = report_dir / "headroom-rejections.stderr"
        frontier_status = run_check(
            "frontier",
            frontier_cmd,
            stdout_path=frontier_stdout_path,
            stderr_path=frontier_stderr_path,
        )
        headroom_status = run_check(
            "headroom-rejections",
            headroom_cmd,
            stdout_path=headroom_stdout_path,
            stderr_path=headroom_stderr_path,
        )
        frontier_report_status = check_frontier_report(frontier_json)
        frontier_stdout_status = check_frontier_stdout(
            frontier_json,
            frontier_stdout_path,
        )
        headroom_report_status = check_headroom_audit_report(headroom_json)
        print_headroom_rejections_summary(
            headroom_json,
            status=headroom_report_status,
        )
        pair_evidence_status = check_min_pair_evidence(
            frontier_json,
            args.min_pair_evidence,
        )
        pair_evidence_quality_status = check_pair_evidence_quality(
            frontier_json,
            min_pair_margin=args.min_pair_margin,
            max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
        )
        print_pair_evidence_quality(
            frontier_json,
            min_pair_margin=args.min_pair_margin,
            max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
            status=pair_evidence_quality_status,
        )
        pair_trigger_reason_status = check_pair_trigger_reasons(frontier_json)
        print_pair_trigger_reasons_summary(
            frontier_json,
            status=pair_trigger_reason_status,
        )
        pair_evidence_hypothesis_status = check_pair_evidence_hypotheses(
            frontier_json,
            args.fixtures_root,
        )
        print_pair_evidence_hypotheses_summary(
            frontier_json,
            args.fixtures_root,
            status=pair_evidence_hypothesis_status,
        )
        pair_evidence_hypothesis_trigger_status = check_pair_evidence_hypothesis_triggers(
            frontier_json,
            args.fixtures_root,
            required=args.require_hypothesis_trigger,
        )
        print_pair_evidence_hypothesis_triggers_summary(
            frontier_json,
            args.fixtures_root,
            required=args.require_hypothesis_trigger,
        )
        if out_dir is not None:
            write_audit_report(
                out_dir=out_dir,
                frontier_status=frontier_status,
                headroom_status=headroom_status,
                min_pair_evidence=args.min_pair_evidence,
                min_pair_margin=args.min_pair_margin,
                max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
                frontier_report_status=frontier_report_status,
                frontier_stdout_status=frontier_stdout_status,
                headroom_report_status=headroom_report_status,
                pair_evidence_status=pair_evidence_status,
                pair_evidence_quality_status=pair_evidence_quality_status,
                pair_trigger_reason_status=pair_trigger_reason_status,
                pair_evidence_hypothesis_status=pair_evidence_hypothesis_status,
                pair_evidence_hypothesis_trigger_status=pair_evidence_hypothesis_trigger_status,
                require_hypothesis_trigger=args.require_hypothesis_trigger,
                fixtures_root=args.fixtures_root,
            )
    finally:
        # Remove the temporary report directory even when a check raises.
        if temp_dir is not None:
            temp_dir.cleanup()

    statuses = (
        frontier_status,
        headroom_status,
        frontier_report_status,
        frontier_stdout_status,
        headroom_report_status,
        pair_evidence_status,
        pair_evidence_quality_status,
        pair_trigger_reason_status,
        pair_evidence_hypothesis_status,
        pair_evidence_hypothesis_trigger_status,
    )
    if any(status != 0 for status in statuses):
        print("FAIL audit-pair-evidence", file=sys.stderr, flush=True)
        return 1
    print("PASS audit-pair-evidence")
    return 0
1253
+
1254
+
1255
# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())