devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -0,0 +1,1672 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for audit-pair-evidence.py.
3
+
4
+ set -euo pipefail
5
+
6
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
7
+ SCRIPT="$SCRIPT_DIR/audit-pair-evidence.py"
8
+ TMP_DIR="$(mktemp -d /tmp/audit-pair-evidence-test.XXXXXX)"
9
+ trap 'rm -rf "$TMP_DIR"' EXIT
10
+
11
+ expect_fail_contains() {  # usage: expect_fail_contains LABEL NEEDLE CMD [ARGS...] -- CMD must exit non-zero and its output must contain NEEDLE
12
+ local label="$1"  # used in diagnostics and as the basename of the capture file
13
+ local needle="$2"  # literal text that must appear in the captured output
14
+ shift 2  # drop LABEL and NEEDLE so "$@" is exactly the command under test
15
+ local out="$TMP_DIR/$label.out"  # combined stdout+stderr of the command is captured here
16
+ if "$@" > "$out" 2>&1; then  # success of the command is itself the test failure
17
+ echo "expected failure for $label" >&2
18
+ cat "$out" >&2  # show what the command printed to aid debugging
19
+ exit 1
20
+ fi
21
+ if ! grep -Fq -- "$needle" "$out"; then  # -F: fixed-string match; `--` lets needles that begin with `-` (e.g. "--min-pair-evidence ...") be passed safely
22
+ echo "missing expected text for $label: $needle" >&2
23
+ cat "$out" >&2
24
+ exit 1
25
+ fi
26
+ }
27
+
28
+ fixtures="$TMP_DIR/fixtures"  # passed to the audit script as --fixtures-root
29
+ results="$TMP_DIR/results"  # passed to the audit script as --results-root
30
+ registry="$TMP_DIR/pair-rejected-fixtures.sh"  # passed to the audit script as --registry; its content is written right after this setup
31
+ mkdir -p "$fixtures/F16-cli-quote-tax-rules" \
32
+ "$fixtures/F21-cli-scheduler-priority" \
33
+ "$fixtures/F34-cli-rejected-candidate" \
34
+ "$results/pair-pass" \
35
+ "$results/pair-pass-2" \
36
+ "$results/rejected-headroom"  # three fixture directories plus three result-run directories, all under $TMP_DIR (removed by the EXIT trap)
37
+
38
+ cat > "$registry" <<'SH'
39
+ rejected_pair_fixture_reason() {
40
+ local fid="$1"
41
+ case "$fid" in
42
+ F34-*|F34)
43
+ echo "measured solo ceiling"
44
+ ;;
45
+ *)
46
+ return 1
47
+ ;;
48
+ esac
49
+ }
50
+ SH
51
+
52
+ cat > "$results/pair-pass/full-pipeline-pair-gate.json" <<'JSON'
53
+ {
54
+ "run_id": "pair-pass",
55
+ "verdict": "PASS",
56
+ "pair_arm": "l2_risk_probes",
57
+ "rows": [
58
+ {
59
+ "fixture": "F16-cli-quote-tax-rules",
60
+ "status": "PASS",
61
+ "bare_score": 50,
62
+ "solo_score": 75,
63
+ "pair_score": 96,
64
+ "pair_margin": 21,
65
+ "pair_mode": true,
66
+ "pair_trigger_eligible": true,
67
+ "pair_trigger_reasons": ["complexity.high"],
68
+ "pair_trigger_has_canonical_reason": true,
69
+ "pair_solo_wall_ratio": 1.28
70
+ }
71
+ ]
72
+ }
73
+ JSON
74
+
75
+ cat > "$results/rejected-headroom/headroom-gate.json" <<'JSON'
76
+ {
77
+ "run_id": "rejected-headroom",
78
+ "verdict": "FAIL",
79
+ "rows": [
80
+ {
81
+ "fixture": "F34-cli-rejected-candidate",
82
+ "status": "FAIL",
83
+ "bare_score": 33,
84
+ "solo_score": 98,
85
+ "reason": "solo_claude score 98 > 80"
86
+ }
87
+ ]
88
+ }
89
+ JSON
90
+
91
+ expect_fail_contains unmeasured "unmeasured candidate fixture(s): F21-cli-scheduler-priority" \
92
+ python3 "$SCRIPT" \
93
+ --fixtures-root "$fixtures" \
94
+ --registry "$registry" \
95
+ --results-root "$results" \
96
+ --out-dir "$TMP_DIR/out-fail"
97
+
98
+ expect_fail_contains bad-min-pair-evidence "--min-pair-evidence must be >= 1" \
99
+ python3 "$SCRIPT" \
100
+ --fixtures-root "$fixtures" \
101
+ --registry "$registry" \
102
+ --results-root "$results" \
103
+ --min-pair-evidence 0
104
+
105
+ expect_fail_contains bad-min-pair-margin "--min-pair-margin must be >= 1" \
106
+ python3 "$SCRIPT" \
107
+ --fixtures-root "$fixtures" \
108
+ --registry "$registry" \
109
+ --results-root "$results" \
110
+ --min-pair-margin 0
111
+
112
+ expect_fail_contains bad-max-wall-ratio "--max-pair-solo-wall-ratio must be finite and > 0" \
113
+ python3 "$SCRIPT" \
114
+ --fixtures-root "$fixtures" \
115
+ --registry "$registry" \
116
+ --results-root "$results" \
117
+ --max-pair-solo-wall-ratio 0
118
+
119
+ expect_fail_contains nan-max-wall-ratio "--max-pair-solo-wall-ratio must be finite and > 0" \
120
+ python3 "$SCRIPT" \
121
+ --fixtures-root "$fixtures" \
122
+ --registry "$registry" \
123
+ --results-root "$results" \
124
+ --max-pair-solo-wall-ratio NaN
125
+
126
+ cat > "$results/pair-pass-2/full-pipeline-pair-gate.json" <<'JSON'
127
+ {
128
+ "run_id": "pair-pass-2",
129
+ "verdict": "PASS",
130
+ "pair_arm": "l2_risk_probes",
131
+ "rows": [
132
+ {
133
+ "fixture": "F21-cli-scheduler-priority",
134
+ "status": "PASS",
135
+ "bare_score": 33,
136
+ "solo_score": 66,
137
+ "pair_score": 99,
138
+ "pair_margin": 33,
139
+ "pair_mode": true,
140
+ "pair_trigger_eligible": true,
141
+ "pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
142
+ "pair_trigger_has_canonical_reason": true,
143
+ "pair_solo_wall_ratio": 1.47
144
+ }
145
+ ]
146
+ }
147
+ JSON
148
+
149
+ expect_fail_contains pair-evidence-hypotheses "pair evidence hypotheses missing for fixture(s): F16-cli-quote-tax-rules, F21-cli-scheduler-priority" \
150
+ python3 "$SCRIPT" \
151
+ --fixtures-root "$fixtures" \
152
+ --registry "$registry" \
153
+ --results-root "$results" \
154
+ --min-pair-evidence 2 \
155
+ --out-dir "$TMP_DIR/out-hypothesis-fail"
156
+
157
+ for fixture in F16-cli-quote-tax-rules F21-cli-scheduler-priority; do
158
+ cat > "$fixtures/$fixture/spec.md" <<'EOF'
159
+ # Spec
160
+
161
+ ## Verification
162
+
163
+ - Visible pair-evidence fixture.
164
+
165
+ ## Solo-headroom hypothesis
166
+
167
+ A capable solo_claude baseline is expected to miss the ordering interaction;
168
+ observable command `node "$BENCH_FIXTURE_DIR/verifiers/visible.js"` exposes the miss.
169
+ EOF
170
+ cat > "$fixtures/$fixture/NOTES.md" <<'EOF'
171
+ # Notes
172
+ EOF
173
+ cat > "$fixtures/$fixture/expected.json" <<'EOF'
174
+ {
175
+ "verification_commands": [
176
+ {
177
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/visible.js\"",
178
+ "exit_code": 0
179
+ }
180
+ ]
181
+ }
182
+ EOF
183
+ done
184
+
185
+ python3 "$SCRIPT" \
186
+ --fixtures-root "$fixtures" \
187
+ --registry "$registry" \
188
+ --results-root "$results" \
189
+ --min-pair-evidence 2 \
190
+ --out-dir "$TMP_DIR/out-pass" \
191
+ > "$TMP_DIR/pass.out"
192
+ grep -Fq 'PASS audit-pair-evidence' "$TMP_DIR/pass.out"
193
+ test -f "$TMP_DIR/out-pass/frontier.json"
194
+ test -f "$TMP_DIR/out-pass/frontier.stdout"
195
+ test -f "$TMP_DIR/out-pass/frontier.stderr"
196
+ test -f "$TMP_DIR/out-pass/headroom-audit.json"
197
+ test -f "$TMP_DIR/out-pass/headroom-rejections.stdout"
198
+ test -f "$TMP_DIR/out-pass/headroom-rejections.stderr"
199
+ test -f "$TMP_DIR/out-pass/audit.json"
200
+ grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' "$TMP_DIR/out-pass/frontier.stdout"
201
+ grep -Fq 'pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x' "$TMP_DIR/out-pass/frontier.stdout"
202
+ grep -Fq 'verdict=pair_evidence_passed' "$TMP_DIR/out-pass/frontier.stdout"
203
+ grep -Fq 'PASS pair-candidate-frontier' "$TMP_DIR/out-pass/frontier.stdout"
204
+ grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$TMP_DIR/pass.out"
205
+ grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=1.47x max_wall_allowed=3.00x' "$TMP_DIR/pass.out"
206
+ grep -Fq 'pair_trigger_reasons=PASS canonical=2 historical_alias=1 exposed=2 total=2 summary=2 rows_match=true' "$TMP_DIR/pass.out"
207
+ grep -Fq 'pair_trigger_historical_aliases=F21-cli-scheduler-priority=risk_profile.high_risk' "$TMP_DIR/pass.out"
208
+ grep -Fq 'pair_evidence_hypotheses=PASS documented=2 total=2' "$TMP_DIR/pass.out"
209
+ grep -Fq 'pair_evidence_hypothesis_triggers=WARN matched=0 documented=2 total=2' "$TMP_DIR/pass.out"
210
+ grep -Fq 'pair_evidence_hypothesis_trigger_gaps=F16-cli-quote-tax-rules=complexity.high;F21-cli-scheduler-priority=complexity.high,risk_profile.high_risk' "$TMP_DIR/pass.out"
211
+ python3 - "$TMP_DIR/out-pass/audit.json" <<'PY'
212
+ import json
213
+ import sys
214
+
215
+ report = json.load(open(sys.argv[1], encoding="utf8"))
216
+ assert report["verdict"] == "PASS"
217
+ assert report["min_pair_evidence"] == 2
218
+ assert report["min_pair_margin"] == 5
219
+ assert report["max_pair_solo_wall_ratio"] == 3.0
220
+ assert report["frontier_summary"]["min_pair_margin"] == 5
221
+ assert report["frontier_summary"]["max_pair_solo_wall_ratio"] == 3.0
222
+ assert report["frontier_summary"]["fixtures_total"] == 3
223
+ assert report["frontier_summary"]["candidate_count"] == 2
224
+ assert report["frontier_summary"]["pair_evidence_count"] == 2
225
+ assert report["frontier_summary"]["unmeasured_count"] == 0
226
+ assert report["frontier_summary"]["pair_margin_avg"] == 27
227
+ assert report["frontier_summary"]["pair_margin_min"] == 21
228
+ assert report["frontier_summary"]["pair_solo_wall_ratio_avg"] == 1.38
229
+ assert report["frontier_summary"]["pair_solo_wall_ratio_max"] == 1.47
230
+ assert report["pair_evidence_rows"] == [
231
+ {
232
+ "fixture": "F16-cli-quote-tax-rules",
233
+ "verdict": "pair_evidence_passed",
234
+ "run_id": "pair-pass",
235
+ "pair_arm": "l2_risk_probes",
236
+ "bare_score": 50,
237
+ "solo_score": 75,
238
+ "pair_score": 96,
239
+ "pair_margin": 21,
240
+ "pair_mode": True,
241
+ "pair_trigger_eligible": True,
242
+ "pair_trigger_reasons": ["complexity.high"],
243
+ "pair_trigger_has_canonical_reason": True,
244
+ "pair_trigger_has_hypothesis_reason": False,
245
+ "pair_solo_wall_ratio": 1.28,
246
+ },
247
+ {
248
+ "fixture": "F21-cli-scheduler-priority",
249
+ "verdict": "pair_evidence_passed",
250
+ "run_id": "pair-pass-2",
251
+ "pair_arm": "l2_risk_probes",
252
+ "bare_score": 33,
253
+ "solo_score": 66,
254
+ "pair_score": 99,
255
+ "pair_margin": 33,
256
+ "pair_mode": True,
257
+ "pair_trigger_eligible": True,
258
+ "pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
259
+ "pair_trigger_has_canonical_reason": True,
260
+ "pair_trigger_has_hypothesis_reason": False,
261
+ "pair_solo_wall_ratio": 1.47,
262
+ },
263
+ ]
264
+ assert report["checks"]["frontier"]["status"] == "PASS"
265
+ assert report["checks"]["headroom_rejections"]["status"] == "PASS"
266
+ assert report["checks"]["headroom_rejections"]["exit_code"] == 0
267
+ assert report["checks"]["headroom_rejections"]["report_check_exit_code"] == 0
268
+ assert report["checks"]["headroom_rejections"]["verdict"] == "PASS"
269
+ assert report["checks"]["headroom_rejections"]["unrecorded_failure_count"] == 0
270
+ assert report["checks"]["headroom_rejections"]["unsupported_registry_rejection_count"] == 0
271
+ assert report["checks"]["frontier_report"]["status"] == "PASS"
272
+ assert report["checks"]["frontier_report"]["verdict"] == "PASS"
273
+ assert report["checks"]["frontier_report"]["unmeasured_count"] == 0
274
+ assert report["checks"]["frontier_stdout"]["status"] == "PASS"
275
+ assert report["checks"]["frontier_stdout"]["summary_rows"] == 1
276
+ assert report["checks"]["frontier_stdout"]["aggregate_rows"] == 1
277
+ assert report["checks"]["frontier_stdout"]["final_verdict_rows"] == 1
278
+ assert report["checks"]["frontier_stdout"]["expected_rows"] == 2
279
+ assert report["checks"]["frontier_stdout"]["stdout_rows"] == 2
280
+ assert report["checks"]["frontier_stdout"]["trigger_rows"] == 2
281
+ assert report["checks"]["frontier_stdout"]["hypothesis_trigger_rows"] == 2
282
+ assert report["checks"]["frontier_stdout"]["rows_match_count"] is True
283
+ assert report["checks"]["frontier_stdout"]["trigger_rows_match_count"] is True
284
+ assert report["checks"]["frontier_stdout"]["hypothesis_trigger_rows_match_count"] is True
285
+ assert report["checks"]["min_pair_evidence"]["status"] == "PASS"
286
+ assert report["checks"]["min_pair_evidence"]["required"] == 2
287
+ assert report["checks"]["min_pair_evidence"]["actual"] == 2
288
+ assert report["checks"]["min_pair_evidence"]["actual_rows"] == 2
289
+ assert report["checks"]["min_pair_evidence"]["rows_match_count"] is True
290
+ assert report["checks"]["pair_evidence_quality"]["status"] == "PASS"
291
+ assert report["checks"]["pair_evidence_quality"]["min_pair_margin_required"] == 5
292
+ assert report["checks"]["pair_evidence_quality"]["min_pair_margin_actual"] == 21
293
+ assert report["checks"]["pair_evidence_quality"]["max_pair_solo_wall_ratio_allowed"] == 3.0
294
+ assert report["checks"]["pair_evidence_quality"]["max_pair_solo_wall_ratio_actual"] == 1.47
295
+ assert report["checks"]["pair_evidence_quality"]["summary_min_pair_margin"] == 21
296
+ assert report["checks"]["pair_evidence_quality"]["summary_max_pair_solo_wall_ratio"] == 1.47
297
+ assert report["checks"]["pair_trigger_reasons"]["status"] == "PASS"
298
+ assert report["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == 2
299
+ assert report["checks"]["pair_trigger_reasons"]["canonical_rows"] == 2
300
+ assert report["checks"]["pair_trigger_reasons"]["historical_alias_rows"] == 1
301
+ assert report["checks"]["pair_trigger_reasons"]["historical_alias_details"] == [
302
+ {"fixture": "F21-cli-scheduler-priority", "aliases": ["risk_profile.high_risk"]}
303
+ ]
304
+ assert report["checks"]["pair_trigger_reasons"]["exposed_rows"] == 2
305
+ assert report["checks"]["pair_trigger_reasons"]["total_rows"] == 2
306
+ assert report["checks"]["pair_trigger_reasons"]["rows_match_count"] is True
307
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "WARN"
308
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 0
309
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["required"] is False
310
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == 0
311
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["documented_rows"] == 2
312
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["total_rows"] == 2
313
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == [
314
+ {
315
+ "fixture": "F16-cli-quote-tax-rules",
316
+ "pair_trigger_reasons": ["complexity.high"],
317
+ },
318
+ {
319
+ "fixture": "F21-cli-scheduler-priority",
320
+ "pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
321
+ },
322
+ ]
323
+ assert report["artifacts"] == {
324
+ "frontier_json": "frontier.json",
325
+ "frontier_stdout": "frontier.stdout",
326
+ "frontier_stderr": "frontier.stderr",
327
+ "headroom_audit_json": "headroom-audit.json",
328
+ "headroom_rejections_stdout": "headroom-rejections.stdout",
329
+ "headroom_rejections_stderr": "headroom-rejections.stderr",
330
+ "audit_json": "audit.json",
331
+ }
332
+ PY
333
+
334
+ if python3 "$SCRIPT" \
335
+ --fixtures-root "$fixtures" \
336
+ --registry "$registry" \
337
+ --results-root "$results" \
338
+ --min-pair-evidence 2 \
339
+ --require-hypothesis-trigger \
340
+ --out-dir "$TMP_DIR/out-strict-trigger" \
341
+ > "$TMP_DIR/strict-trigger.out" 2>&1; then
342
+ echo "audit must fail when --require-hypothesis-trigger sees trigger gaps" >&2
343
+ cat "$TMP_DIR/strict-trigger.out" >&2
344
+ exit 1
345
+ fi
346
+ grep -Fq 'pair evidence hypothesis triggers missing for fixture(s): F16-cli-quote-tax-rules, F21-cli-scheduler-priority' "$TMP_DIR/strict-trigger.out"
347
+ grep -Fq 'pair_evidence_hypothesis_triggers=FAIL matched=0 documented=2 total=2' "$TMP_DIR/strict-trigger.out"
348
+ grep -Fq 'pair_evidence_hypothesis_trigger_gaps=F16-cli-quote-tax-rules=complexity.high;F21-cli-scheduler-priority=complexity.high,risk_profile.high_risk' "$TMP_DIR/strict-trigger.out"
349
+ grep -Fq 'FAIL audit-pair-evidence' "$TMP_DIR/strict-trigger.out"
350
+ python3 - "$TMP_DIR/out-strict-trigger/audit.json" <<'PY'
351
+ import json
352
+ import sys
353
+
354
+ report = json.load(open(sys.argv[1], encoding="utf8"))
355
+ assert report["verdict"] == "FAIL"
356
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "FAIL"
357
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 1
358
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["required"] is True
359
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == [
360
+ {
361
+ "fixture": "F16-cli-quote-tax-rules",
362
+ "pair_trigger_reasons": ["complexity.high"],
363
+ },
364
+ {
365
+ "fixture": "F21-cli-scheduler-priority",
366
+ "pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
367
+ },
368
+ ]
369
+ PY
370
+
371
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/out-pass/frontier.stdout" <<'PY'
372
+ import importlib.util
373
+ import pathlib
374
+ import sys
375
+
376
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
377
+ module = importlib.util.module_from_spec(spec)
378
+ assert spec.loader is not None
379
+ spec.loader.exec_module(module)
380
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
381
+ PY
382
+
383
+ cat > "$TMP_DIR/missing-trigger-reasons.json" <<'JSON'
384
+ {
385
+ "pair_evidence_count": 1,
386
+ "rows": [
387
+ {
388
+ "fixture": "F16-cli-quote-tax-rules",
389
+ "status": "pair_evidence_passed",
390
+ "passing_pair_evidence": [
391
+ {
392
+ "run_id": "pair-pass",
393
+ "pair_arm": "l2_risk_probes",
394
+ "bare_score": 50,
395
+ "solo_score": 75,
396
+ "pair_score": 96,
397
+ "pair_margin": 21,
398
+ "pair_mode": true,
399
+ "pair_trigger_eligible": true,
400
+ "pair_solo_wall_ratio": 1.28
401
+ }
402
+ ]
403
+ }
404
+ ]
405
+ }
406
+ JSON
407
+ expect_fail_contains missing-trigger-reasons "pair trigger reason rows 0 do not match summary count 1" \
408
+ python3 - "$SCRIPT" "$TMP_DIR/missing-trigger-reasons.json" <<'PY'
409
+ import importlib.util
410
+ import pathlib
411
+ import sys
412
+
413
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
414
+ module = importlib.util.module_from_spec(spec)
415
+ assert spec.loader is not None
416
+ spec.loader.exec_module(module)
417
+ sys.exit(module.check_pair_trigger_reasons(pathlib.Path(sys.argv[2])))
418
+ PY
419
+
420
+ cat > "$TMP_DIR/malformed-trigger-reason-rows.json" <<'JSON'
421
+ {
422
+ "pair_evidence_count": 1,
423
+ "rows": [
424
+ {
425
+ "fixture": "F16-cli-quote-tax-rules",
426
+ "status": "pair_evidence_passed",
427
+ "passing_pair_evidence": [
428
+ {
429
+ "run_id": "pair-pass",
430
+ "pair_arm": "l2_risk_probes",
431
+ "bare_score": 50,
432
+ "solo_score": 75,
433
+ "pair_score": 96,
434
+ "pair_margin": 21,
435
+ "pair_mode": true,
436
+ "pair_trigger_eligible": true,
437
+ "pair_trigger_reasons": [],
438
+ "pair_trigger_has_canonical_reason": true,
439
+ "pair_solo_wall_ratio": 1.28
440
+ }
441
+ ]
442
+ }
443
+ ]
444
+ }
445
+ JSON
446
+ expect_fail_contains malformed-trigger-reason-rows "pair trigger reason rows 0 do not match summary count 1" \
447
+ python3 - "$SCRIPT" "$TMP_DIR/malformed-trigger-reason-rows.json" <<'PY'
448
+ import importlib.util
449
+ import pathlib
450
+ import sys
451
+
452
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
453
+ module = importlib.util.module_from_spec(spec)
454
+ assert spec.loader is not None
455
+ spec.loader.exec_module(module)
456
+ sys.exit(module.check_pair_trigger_reasons(pathlib.Path(sys.argv[2])))
457
+ PY
458
+
459
+ cat > "$TMP_DIR/mixed-unknown-trigger-reason-rows.json" <<'JSON'
460
+ {
461
+ "pair_evidence_count": 1,
462
+ "rows": [
463
+ {
464
+ "fixture": "F16-cli-quote-tax-rules",
465
+ "status": "pair_evidence_passed",
466
+ "passing_pair_evidence": [
467
+ {
468
+ "run_id": "pair-pass",
469
+ "pair_arm": "l2_risk_probes",
470
+ "bare_score": 50,
471
+ "solo_score": 75,
472
+ "pair_score": 96,
473
+ "pair_margin": 21,
474
+ "pair_mode": true,
475
+ "pair_trigger_eligible": true,
476
+ "pair_trigger_reasons": ["complexity.high", "looks-hard"],
477
+ "pair_trigger_has_canonical_reason": true,
478
+ "pair_solo_wall_ratio": 1.28
479
+ }
480
+ ]
481
+ }
482
+ ]
483
+ }
484
+ JSON
485
+ expect_fail_contains mixed-unknown-trigger-reason-rows "pair trigger reason rows 0 do not match summary count 1" \
486
+ python3 - "$SCRIPT" "$TMP_DIR/mixed-unknown-trigger-reason-rows.json" <<'PY'
487
+ import importlib.util
488
+ import pathlib
489
+ import sys
490
+
491
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
492
+ module = importlib.util.module_from_spec(spec)
493
+ assert spec.loader is not None
494
+ spec.loader.exec_module(module)
495
+ sys.exit(module.check_pair_trigger_reasons(pathlib.Path(sys.argv[2])))
496
+ PY
497
+
498
+ cat > "$TMP_DIR/normalized-canonical-trigger-reason-rows.json" <<'JSON'
499
+ {
500
+ "pair_evidence_count": 1,
501
+ "rows": [
502
+ {
503
+ "fixture": "F16-cli-quote-tax-rules",
504
+ "status": "pair_evidence_passed",
505
+ "passing_pair_evidence": [
506
+ {
507
+ "run_id": "pair-pass",
508
+ "pair_arm": "l2_risk_probes",
509
+ "bare_score": 50,
510
+ "solo_score": 75,
511
+ "pair_score": 96,
512
+ "pair_margin": 21,
513
+ "pair_mode": true,
514
+ "pair_trigger_eligible": true,
515
+ "pair_trigger_reasons": ["risk high"],
516
+ "pair_trigger_has_canonical_reason": true,
517
+ "pair_solo_wall_ratio": 1.28
518
+ }
519
+ ]
520
+ }
521
+ ]
522
+ }
523
+ JSON
524
+ expect_fail_contains normalized-canonical-trigger-reason-rows "pair trigger reason rows 0 do not match summary count 1" \
525
+ python3 - "$SCRIPT" "$TMP_DIR/normalized-canonical-trigger-reason-rows.json" <<'PY'
526
+ import importlib.util
527
+ import pathlib
528
+ import sys
529
+
530
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
531
+ module = importlib.util.module_from_spec(spec)
532
+ assert spec.loader is not None
533
+ spec.loader.exec_module(module)
534
+ sys.exit(module.check_pair_trigger_reasons(pathlib.Path(sys.argv[2])))
535
+ PY
536
+
537
+ grep -Fv 'PASS pair-candidate-frontier' "$TMP_DIR/out-pass/frontier.stdout" \
538
+ > "$TMP_DIR/missing-final-verdict-frontier.stdout"
539
+ expect_fail_contains missing-final-frontier-verdict "frontier stdout final verdict row count is not exactly 1" \
540
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/missing-final-verdict-frontier.stdout" <<'PY'
541
+ import importlib.util
542
+ import pathlib
543
+ import sys
544
+
545
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
546
+ module = importlib.util.module_from_spec(spec)
547
+ assert spec.loader is not None
548
+ spec.loader.exec_module(module)
549
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
550
+ PY
551
+
552
+ cp "$TMP_DIR/out-pass/frontier.stdout" "$TMP_DIR/duplicate-final-verdict-frontier.stdout"
553
+ printf 'PASS pair-candidate-frontier\n' >> "$TMP_DIR/duplicate-final-verdict-frontier.stdout"
554
+ expect_fail_contains duplicate-final-frontier-verdict "frontier stdout final verdict row count is not exactly 1" \
555
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/duplicate-final-verdict-frontier.stdout" <<'PY'
556
+ import importlib.util
557
+ import pathlib
558
+ import sys
559
+
560
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
561
+ module = importlib.util.module_from_spec(spec)
562
+ assert spec.loader is not None
563
+ spec.loader.exec_module(module)
564
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
565
+ PY
566
+
567
+ printf 'fixtures=3 rejected=1 candidates=2 pair_evidence=2 unmeasured=0 verdict=PASS\n' \
568
+ > "$TMP_DIR/bad-frontier.stdout"
569
+ printf 'pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x\n' \
570
+ >> "$TMP_DIR/bad-frontier.stdout"
571
+ printf 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=95 arm=l2_risk_probes margin=+20 wall=1.28x run=pair-pass verdict=pair_evidence_passed\n' \
572
+ >> "$TMP_DIR/bad-frontier.stdout"
573
+ printf 'F21-cli-scheduler-priority: bare=33 solo_claude=66 pair=99 arm=l2_risk_probes margin=+33 wall=1.47x run=pair-pass-2 verdict=pair_evidence_passed\n' \
574
+ >> "$TMP_DIR/bad-frontier.stdout"
575
+ expect_fail_contains missing-frontier-score-row "frontier stdout missing score row for F16-cli-quote-tax-rules" \
576
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/bad-frontier.stdout" <<'PY'
577
+ import importlib.util
578
+ import pathlib
579
+ import sys
580
+
581
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
582
+ module = importlib.util.module_from_spec(spec)
583
+ assert spec.loader is not None
584
+ spec.loader.exec_module(module)
585
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
586
+ PY
587
+
588
+ sed -E 's/ triggers=[^[:space:]]+//' "$TMP_DIR/out-pass/frontier.stdout" \
589
+ > "$TMP_DIR/no-trigger-frontier.stdout"
590
+ expect_fail_contains missing-frontier-triggers "frontier stdout missing score row for F16-cli-quote-tax-rules" \
591
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/no-trigger-frontier.stdout" <<'PY'
592
+ import importlib.util
593
+ import pathlib
594
+ import sys
595
+
596
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
597
+ module = importlib.util.module_from_spec(spec)
598
+ assert spec.loader is not None
599
+ spec.loader.exec_module(module)
600
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
601
+ PY
602
+
603
+ cat > "$TMP_DIR/no-aggregate-frontier.stdout" <<'OUT'
604
+ fixtures=3 rejected=1 candidates=2 pair_evidence=2 unmeasured=0 verdict=PASS
605
+ F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21 wall=1.28x run=pair-pass verdict=pair_evidence_passed
606
+ F21-cli-scheduler-priority: bare=33 solo_claude=66 pair=99 arm=l2_risk_probes margin=+33 wall=1.47x run=pair-pass-2 verdict=pair_evidence_passed
607
+ OUT
608
+ expect_fail_contains missing-frontier-aggregate-row "frontier stdout aggregate score row count is not exactly 1" \
609
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/no-aggregate-frontier.stdout" <<'PY'
610
+ import importlib.util
611
+ import pathlib
612
+ import sys
613
+
614
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
615
+ module = importlib.util.module_from_spec(spec)
616
+ assert spec.loader is not None
617
+ spec.loader.exec_module(module)
618
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
619
+ PY
620
+
621
+ cp "$TMP_DIR/out-pass/frontier.stdout" "$TMP_DIR/duplicate-summary-frontier.stdout"
622
+ printf 'fixtures=3 rejected=1 candidates=2 pair_evidence=2 unmeasured=0 verdict=PASS\n' \
623
+ >> "$TMP_DIR/duplicate-summary-frontier.stdout"
624
+ expect_fail_contains duplicate-frontier-summary-row "frontier stdout summary score row count is not exactly 1" \
625
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/duplicate-summary-frontier.stdout" <<'PY'
626
+ import importlib.util
627
+ import pathlib
628
+ import sys
629
+
630
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
631
+ module = importlib.util.module_from_spec(spec)
632
+ assert spec.loader is not None
633
+ spec.loader.exec_module(module)
634
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
635
+ PY
636
+
637
+ cp "$TMP_DIR/out-pass/frontier.stdout" "$TMP_DIR/duplicate-aggregate-frontier.stdout"
638
+ printf 'pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x\n' \
639
+ >> "$TMP_DIR/duplicate-aggregate-frontier.stdout"
640
+ expect_fail_contains duplicate-frontier-aggregate-row "frontier stdout aggregate score row count is not exactly 1" \
641
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/duplicate-aggregate-frontier.stdout" <<'PY'
642
+ import importlib.util
643
+ import pathlib
644
+ import sys
645
+
646
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
647
+ module = importlib.util.module_from_spec(spec)
648
+ assert spec.loader is not None
649
+ spec.loader.exec_module(module)
650
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
651
+ PY
652
+
653
+ cat > "$TMP_DIR/partial-frontier.stdout" <<'OUT'
654
+ fixtures=3 rejected=1 candidates=2 pair_evidence=2 unmeasured=0 verdict=PASS
655
+ pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x
656
+ F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21 verdict=pair_evidence_passed
657
+ F21-cli-scheduler-priority: bare=33 solo_claude=66 pair=99 arm=l2_risk_probes margin=+33 wall=1.47x run=pair-pass-2 verdict=pair_evidence_passed
658
+ OUT
659
+ expect_fail_contains partial-frontier-score-row "frontier stdout missing score row for F16-cli-quote-tax-rules" \
660
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/partial-frontier.stdout" <<'PY'
661
+ import importlib.util
662
+ import pathlib
663
+ import sys
664
+
665
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
666
+ module = importlib.util.module_from_spec(spec)
667
+ assert spec.loader is not None
668
+ spec.loader.exec_module(module)
669
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
670
+ PY
671
+
672
+ cp "$TMP_DIR/out-pass/frontier.stdout" "$TMP_DIR/extra-frontier.stdout"
673
+ printf 'F99-stale-fixture: bare=1 solo_claude=2 pair=3 arm=l2_risk_probes margin=+1 wall=1.00x run=stale verdict=pair_evidence_passed\n' \
674
+ >> "$TMP_DIR/extra-frontier.stdout"
675
+ expect_fail_contains extra-frontier-score-row "frontier stdout score row count 3 does not match frontier evidence row count 2" \
676
+ python3 - "$SCRIPT" "$TMP_DIR/out-pass/frontier.json" "$TMP_DIR/extra-frontier.stdout" <<'PY'
677
+ import importlib.util
678
+ import pathlib
679
+ import sys
680
+
681
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
682
+ module = importlib.util.module_from_spec(spec)
683
+ assert spec.loader is not None
684
+ spec.loader.exec_module(module)
685
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
686
+ PY
687
+
688
+ cat > "$TMP_DIR/malformed-frontier-summary.json" <<'JSON'
689
+ {
690
+ "verdict": "PASS"
691
+ }
692
+ JSON
693
+ expect_fail_contains malformed-frontier-stdout-summary "frontier stdout check missing summary fields" \
694
+ python3 - "$SCRIPT" "$TMP_DIR/malformed-frontier-summary.json" "$TMP_DIR/bad-frontier.stdout" <<'PY'
695
+ import importlib.util
696
+ import pathlib
697
+ import sys
698
+
699
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
700
+ module = importlib.util.module_from_spec(spec)
701
+ assert spec.loader is not None
702
+ spec.loader.exec_module(module)
703
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
704
+ PY
705
+
706
+ cat > "$TMP_DIR/malformed-frontier-count.json" <<'JSON'
707
+ {
708
+ "verdict": "PASS",
709
+ "fixtures_total": 3,
710
+ "rejected_count": 1,
711
+ "candidate_count": 2,
712
+ "pair_evidence_count": "2",
713
+ "unmeasured_count": 0
714
+ }
715
+ JSON
716
+ expect_fail_contains malformed-frontier-stdout-counts "frontier stdout summary counts malformed" \
717
+ python3 - "$SCRIPT" "$TMP_DIR/malformed-frontier-count.json" "$TMP_DIR/bad-frontier.stdout" <<'PY'
718
+ import importlib.util
719
+ import pathlib
720
+ import sys
721
+
722
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
723
+ module = importlib.util.module_from_spec(spec)
724
+ assert spec.loader is not None
725
+ spec.loader.exec_module(module)
726
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
727
+ PY
728
+
729
+ cat > "$TMP_DIR/malformed-frontier-aggregate.json" <<'JSON'
730
+ {
731
+ "verdict": "PASS",
732
+ "fixtures_total": 3,
733
+ "rejected_count": 1,
734
+ "candidate_count": 2,
735
+ "pair_evidence_count": 2,
736
+ "unmeasured_count": 0,
737
+ "pair_margin_avg": "27",
738
+ "pair_margin_min": 21,
739
+ "pair_solo_wall_ratio_avg": 1.38,
740
+ "pair_solo_wall_ratio_max": 1.47,
741
+ "rows": []
742
+ }
743
+ JSON
744
+ expect_fail_contains malformed-frontier-stdout-aggregate "frontier stdout aggregate fields malformed" \
745
+ python3 - "$SCRIPT" "$TMP_DIR/malformed-frontier-aggregate.json" "$TMP_DIR/bad-frontier.stdout" <<'PY'
746
+ import importlib.util
747
+ import pathlib
748
+ import sys
749
+
750
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
751
+ module = importlib.util.module_from_spec(spec)
752
+ assert spec.loader is not None
753
+ spec.loader.exec_module(module)
754
+ sys.exit(module.check_frontier_stdout(pathlib.Path(sys.argv[2]), pathlib.Path(sys.argv[3])))
755
+ PY
756
+
757
+ cat > "$TMP_DIR/frontier-fail-verdict.json" <<'JSON'
758
+ {
759
+ "verdict": "FAIL",
760
+ "unmeasured_count": 1,
761
+ "pair_evidence_count": 1,
762
+ "rows": []
763
+ }
764
+ JSON
765
+ expect_fail_contains frontier-fail-verdict "frontier verdict 'FAIL' is not PASS" \
766
+ python3 - "$SCRIPT" "$TMP_DIR/frontier-fail-verdict.json" <<'PY'
767
+ import importlib.util
768
+ import pathlib
769
+ import sys
770
+
771
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
772
+ module = importlib.util.module_from_spec(spec)
773
+ assert spec.loader is not None
774
+ spec.loader.exec_module(module)
775
+ sys.exit(module.check_frontier_report(pathlib.Path(sys.argv[2])))
776
+ PY
777
+
778
+ cat > "$TMP_DIR/frontier-unmeasured.json" <<'JSON'
779
+ {
780
+ "verdict": "PASS",
781
+ "unmeasured_count": 1,
782
+ "pair_evidence_count": 1,
783
+ "rows": []
784
+ }
785
+ JSON
786
+ expect_fail_contains frontier-unmeasured "frontier has 1 unmeasured candidate fixture(s)" \
787
+ python3 - "$SCRIPT" "$TMP_DIR/frontier-unmeasured.json" <<'PY'
788
+ import importlib.util
789
+ import pathlib
790
+ import sys
791
+
792
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
793
+ module = importlib.util.module_from_spec(spec)
794
+ assert spec.loader is not None
795
+ spec.loader.exec_module(module)
796
+ sys.exit(module.check_frontier_report(pathlib.Path(sys.argv[2])))
797
+ PY
798
+
799
+ cat > "$TMP_DIR/frontier-malformed-unmeasured.json" <<'JSON'
800
+ {
801
+ "verdict": "PASS",
802
+ "unmeasured_count": true,
803
+ "pair_evidence_count": 1,
804
+ "rows": []
805
+ }
806
+ JSON
807
+ expect_fail_contains frontier-malformed-unmeasured "frontier unmeasured count missing or malformed" \
808
+ python3 - "$SCRIPT" "$TMP_DIR/frontier-malformed-unmeasured.json" <<'PY'
809
+ import importlib.util
810
+ import pathlib
811
+ import sys
812
+
813
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
814
+ module = importlib.util.module_from_spec(spec)
815
+ assert spec.loader is not None
816
+ spec.loader.exec_module(module)
817
+ sys.exit(module.check_frontier_report(pathlib.Path(sys.argv[2])))
818
+ PY
819
+
820
+ cat > "$TMP_DIR/headroom-fail-verdict.json" <<'JSON'
821
+ {
822
+ "verdict": "FAIL",
823
+ "unrecorded_failures": [],
824
+ "unsupported_registry_rejections": []
825
+ }
826
+ JSON
827
+ expect_fail_contains headroom-fail-verdict "headroom audit verdict 'FAIL' is not PASS" \
828
+ python3 - "$SCRIPT" "$TMP_DIR/headroom-fail-verdict.json" <<'PY'
829
+ import importlib.util
830
+ import pathlib
831
+ import sys
832
+
833
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
834
+ module = importlib.util.module_from_spec(spec)
835
+ assert spec.loader is not None
836
+ spec.loader.exec_module(module)
837
+ sys.exit(module.check_headroom_audit_report(pathlib.Path(sys.argv[2])))
838
+ PY
839
+
840
+ cat > "$TMP_DIR/headroom-missing-unsupported.json" <<'JSON'
841
+ {
842
+ "verdict": "PASS",
843
+ "unrecorded_failures": []
844
+ }
845
+ JSON
846
+ expect_fail_contains headroom-missing-unsupported "headroom audit unsupported registry rejection count missing or malformed" \
847
+ python3 - "$SCRIPT" "$TMP_DIR/headroom-missing-unsupported.json" <<'PY'
848
+ import importlib.util
849
+ import pathlib
850
+ import sys
851
+
852
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
853
+ module = importlib.util.module_from_spec(spec)
854
+ assert spec.loader is not None
855
+ spec.loader.exec_module(module)
856
+ sys.exit(module.check_headroom_audit_report(pathlib.Path(sys.argv[2])))
857
+ PY
858
+
859
+ cat > "$TMP_DIR/headroom-unsupported.json" <<'JSON'
860
+ {
861
+ "verdict": "PASS",
862
+ "unrecorded_failures": [],
863
+ "unsupported_registry_rejections": [{"fixture": "F36-unsupported-rejection"}]
864
+ }
865
+ JSON
866
+ expect_fail_contains headroom-unsupported "headroom audit has 1 unsupported registry rejection(s)" \
867
+ python3 - "$SCRIPT" "$TMP_DIR/headroom-unsupported.json" <<'PY'
868
+ import importlib.util
869
+ import pathlib
870
+ import sys
871
+
872
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
873
+ module = importlib.util.module_from_spec(spec)
874
+ assert spec.loader is not None
875
+ spec.loader.exec_module(module)
876
+ sys.exit(module.check_headroom_audit_report(pathlib.Path(sys.argv[2])))
877
+ PY
878
+
879
+ python3 - "$SCRIPT" "$TMP_DIR/headroom-unsupported.json" > "$TMP_DIR/headroom-summary.out" <<'PY'
880
+ import importlib.util
881
+ import pathlib
882
+ import sys
883
+
884
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
885
+ module = importlib.util.module_from_spec(spec)
886
+ assert spec.loader is not None
887
+ spec.loader.exec_module(module)
888
+ module.print_headroom_rejections_summary(pathlib.Path(sys.argv[2]), status=1)
889
+ PY
890
+ grep -Fq 'headroom_rejections=FAIL verdict=PASS unrecorded=0 unsupported=1' "$TMP_DIR/headroom-summary.out"
891
+
892
+ cat > "$TMP_DIR/frontier-incomplete-best.json" <<'JSON'
893
+ {
894
+ "pair_evidence_count": 1,
895
+ "rows": [
896
+ {
897
+ "fixture": "F16-cli-quote-tax-rules",
898
+ "status": "pair_evidence_passed",
899
+ "passing_pair_evidence": [
900
+ {
901
+ "run_id": "higher-incomplete",
902
+ "bare_score": 50,
903
+ "solo_score": 75,
904
+ "pair_score": 98,
905
+ "pair_margin": 23,
906
+ "pair_mode": true,
907
+ "pair_trigger_eligible": true,
908
+ "pair_solo_wall_ratio": 1.32
909
+ },
910
+ {
911
+ "run_id": "lower-complete",
912
+ "pair_arm": "l2_risk_probes",
913
+ "bare_score": 50,
914
+ "solo_score": 75,
915
+ "pair_score": 96,
916
+ "pair_margin": 21,
917
+ "pair_mode": true,
918
+ "pair_trigger_eligible": true,
919
+ "pair_trigger_reasons": ["complexity.high"],
920
+ "pair_trigger_has_canonical_reason": true,
921
+ "pair_solo_wall_ratio": 1.28
922
+ }
923
+ ]
924
+ }
925
+ ]
926
+ }
927
+ JSON
928
+ python3 - "$SCRIPT" "$TMP_DIR/frontier-incomplete-best.json" <<'PY'
929
+ import importlib.util
930
+ import pathlib
931
+ import sys
932
+
933
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
934
+ module = importlib.util.module_from_spec(spec)
935
+ assert spec.loader is not None
936
+ spec.loader.exec_module(module)
937
+ rows = module.load_pair_evidence_rows(pathlib.Path(sys.argv[2]))
938
+ assert rows == [
939
+ {
940
+ "fixture": "F16-cli-quote-tax-rules",
941
+ "verdict": "pair_evidence_passed",
942
+ "run_id": "lower-complete",
943
+ "pair_arm": "l2_risk_probes",
944
+ "bare_score": 50,
945
+ "solo_score": 75,
946
+ "pair_score": 96,
947
+ "pair_margin": 21,
948
+ "pair_mode": True,
949
+ "pair_trigger_eligible": True,
950
+ "pair_trigger_reasons": ["complexity.high"],
951
+ "pair_trigger_has_canonical_reason": True,
952
+ "pair_trigger_has_hypothesis_reason": False,
953
+ "pair_solo_wall_ratio": 1.28,
954
+ }
955
+ ]
956
+ PY
957
+
958
+ cat > "$TMP_DIR/bad-frontier-rows.json" <<'JSON'
959
+ {
960
+ "pair_evidence_count": 2,
961
+ "rows": [
962
+ {
963
+ "fixture": "F16-cli-quote-tax-rules",
964
+ "status": "pair_evidence_passed",
965
+ "passing_pair_evidence": []
966
+ },
967
+ {
968
+ "fixture": "F21-cli-scheduler-priority",
969
+ "status": "pair_evidence_passed",
970
+ "passing_pair_evidence": "malformed"
971
+ }
972
+ ]
973
+ }
974
+ JSON
975
+ expect_fail_contains missing-pair-evidence-rows "pair evidence rows 0 do not match summary count 2" \
976
+ python3 - "$SCRIPT" "$TMP_DIR/bad-frontier-rows.json" <<'PY'
977
+ import importlib.util
978
+ import pathlib
979
+ import sys
980
+
981
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
982
+ module = importlib.util.module_from_spec(spec)
983
+ assert spec.loader is not None
984
+ spec.loader.exec_module(module)
985
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 2))
986
+ PY
987
+
988
+ cat > "$TMP_DIR/bad-frontier-row-fields.json" <<'JSON'
989
+ {
990
+ "pair_evidence_count": 2,
991
+ "rows": [
992
+ {
993
+ "fixture": "F16-cli-quote-tax-rules",
994
+ "status": "pair_evidence_passed",
995
+ "passing_pair_evidence": [
996
+ {
997
+ "run_id": "pair-pass",
998
+ "pair_arm": "l2_risk_probes",
999
+ "bare_score": null,
1000
+ "solo_score": 75,
1001
+ "pair_score": 96,
1002
+ "pair_margin": 21,
1003
+ "pair_mode": true,
1004
+ "pair_trigger_eligible": true,
1005
+ "pair_solo_wall_ratio": 1.28
1006
+ }
1007
+ ]
1008
+ },
1009
+ {
1010
+ "fixture": "F21-cli-scheduler-priority",
1011
+ "status": "pair_evidence_passed",
1012
+ "passing_pair_evidence": [
1013
+ {
1014
+ "run_id": "pair-pass-2",
1015
+ "pair_arm": "l2_risk_probes",
1016
+ "bare_score": 33,
1017
+ "solo_score": 66,
1018
+ "pair_score": 99,
1019
+ "pair_margin": 33,
1020
+ "pair_mode": true,
1021
+ "pair_trigger_eligible": true,
1022
+ "pair_solo_wall_ratio": true
1023
+ }
1024
+ ]
1025
+ }
1026
+ ]
1027
+ }
1028
+ JSON
1029
+ expect_fail_contains malformed-pair-evidence-row-fields "pair evidence rows 0 do not match summary count 2" \
1030
+ python3 - "$SCRIPT" "$TMP_DIR/bad-frontier-row-fields.json" <<'PY'
1031
+ import importlib.util
1032
+ import pathlib
1033
+ import sys
1034
+
1035
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1036
+ module = importlib.util.module_from_spec(spec)
1037
+ assert spec.loader is not None
1038
+ spec.loader.exec_module(module)
1039
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 2))
1040
+ PY
1041
+
1042
+ cat > "$TMP_DIR/nan-frontier-row-fields.json" <<'JSON'
1043
+ {
1044
+ "pair_evidence_count": 1,
1045
+ "rows": [
1046
+ {
1047
+ "fixture": "F16-cli-quote-tax-rules",
1048
+ "status": "pair_evidence_passed",
1049
+ "passing_pair_evidence": [
1050
+ {
1051
+ "run_id": "nan-wall-run",
1052
+ "pair_arm": "l2_risk_probes",
1053
+ "bare_score": 50,
1054
+ "solo_score": 75,
1055
+ "pair_score": 96,
1056
+ "pair_margin": 21,
1057
+ "pair_mode": true,
1058
+ "pair_trigger_eligible": true,
1059
+ "pair_solo_wall_ratio": NaN
1060
+ }
1061
+ ]
1062
+ }
1063
+ ]
1064
+ }
1065
+ JSON
1066
+ expect_fail_contains nan-pair-evidence-row-fields "pair evidence count missing or malformed from frontier report" \
1067
+ python3 - "$SCRIPT" "$TMP_DIR/nan-frontier-row-fields.json" <<'PY'
1068
+ import importlib.util
1069
+ import pathlib
1070
+ import sys
1071
+
1072
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1073
+ module = importlib.util.module_from_spec(spec)
1074
+ assert spec.loader is not None
1075
+ spec.loader.exec_module(module)
1076
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1077
+ PY
1078
+
1079
+ cat > "$TMP_DIR/mismatched-margin-row-fields.json" <<'JSON'
1080
+ {
1081
+ "pair_evidence_count": 1,
1082
+ "rows": [
1083
+ {
1084
+ "fixture": "F16-cli-quote-tax-rules",
1085
+ "status": "pair_evidence_passed",
1086
+ "passing_pair_evidence": [
1087
+ {
1088
+ "run_id": "inflated-margin-run",
1089
+ "pair_arm": "l2_risk_probes",
1090
+ "bare_score": 50,
1091
+ "solo_score": 75,
1092
+ "pair_score": 76,
1093
+ "pair_margin": 21,
1094
+ "pair_mode": true,
1095
+ "pair_trigger_eligible": true,
1096
+ "pair_solo_wall_ratio": 1.28
1097
+ }
1098
+ ]
1099
+ }
1100
+ ]
1101
+ }
1102
+ JSON
1103
+ expect_fail_contains mismatched-margin-row-fields "pair evidence rows 0 do not match summary count 1" \
1104
+ python3 - "$SCRIPT" "$TMP_DIR/mismatched-margin-row-fields.json" <<'PY'
1105
+ import importlib.util
1106
+ import pathlib
1107
+ import sys
1108
+
1109
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1110
+ module = importlib.util.module_from_spec(spec)
1111
+ assert spec.loader is not None
1112
+ spec.loader.exec_module(module)
1113
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1114
+ PY
1115
+
1116
+ cat > "$TMP_DIR/overrange-score-row-fields.json" <<'JSON'
1117
+ {
1118
+ "pair_evidence_count": 1,
1119
+ "rows": [
1120
+ {
1121
+ "fixture": "F16-cli-quote-tax-rules",
1122
+ "status": "pair_evidence_passed",
1123
+ "passing_pair_evidence": [
1124
+ {
1125
+ "run_id": "overrange-score-run",
1126
+ "pair_arm": "l2_risk_probes",
1127
+ "bare_score": 50,
1128
+ "solo_score": 75,
1129
+ "pair_score": 101,
1130
+ "pair_margin": 26,
1131
+ "pair_mode": true,
1132
+ "pair_trigger_eligible": true,
1133
+ "pair_solo_wall_ratio": 1.28
1134
+ }
1135
+ ]
1136
+ }
1137
+ ]
1138
+ }
1139
+ JSON
1140
+ expect_fail_contains overrange-score-row-fields "pair evidence rows 0 do not match summary count 1" \
1141
+ python3 - "$SCRIPT" "$TMP_DIR/overrange-score-row-fields.json" <<'PY'
1142
+ import importlib.util
1143
+ import pathlib
1144
+ import sys
1145
+
1146
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1147
+ module = importlib.util.module_from_spec(spec)
1148
+ assert spec.loader is not None
1149
+ spec.loader.exec_module(module)
1150
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1151
+ PY
1152
+
1153
+ cat > "$TMP_DIR/invalid-pair-arm-row-fields.json" <<'JSON'
1154
+ {
1155
+ "pair_evidence_count": 1,
1156
+ "rows": [
1157
+ {
1158
+ "fixture": "F16-cli-quote-tax-rules",
1159
+ "status": "pair_evidence_passed",
1160
+ "passing_pair_evidence": [
1161
+ {
1162
+ "run_id": "invalid-arm-run",
1163
+ "pair_arm": "bare",
1164
+ "bare_score": 50,
1165
+ "solo_score": 75,
1166
+ "pair_score": 96,
1167
+ "pair_margin": 21,
1168
+ "pair_mode": true,
1169
+ "pair_trigger_eligible": true,
1170
+ "pair_solo_wall_ratio": 1.28
1171
+ }
1172
+ ]
1173
+ }
1174
+ ]
1175
+ }
1176
+ JSON
1177
+ expect_fail_contains invalid-pair-arm-row-fields "pair evidence rows 0 do not match summary count 1" \
1178
+ python3 - "$SCRIPT" "$TMP_DIR/invalid-pair-arm-row-fields.json" <<'PY'
1179
+ import importlib.util
1180
+ import pathlib
1181
+ import sys
1182
+
1183
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1184
+ module = importlib.util.module_from_spec(spec)
1185
+ assert spec.loader is not None
1186
+ spec.loader.exec_module(module)
1187
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1188
+ PY
1189
+
1190
+ cat > "$TMP_DIR/false-pair-mode-row-fields.json" <<'JSON'
1191
+ {
1192
+ "pair_evidence_count": 1,
1193
+ "rows": [
1194
+ {
1195
+ "fixture": "F16-cli-quote-tax-rules",
1196
+ "status": "pair_evidence_passed",
1197
+ "passing_pair_evidence": [
1198
+ {
1199
+ "run_id": "false-pair-mode-run",
1200
+ "pair_arm": "l2_risk_probes",
1201
+ "bare_score": 50,
1202
+ "solo_score": 75,
1203
+ "pair_score": 96,
1204
+ "pair_margin": 21,
1205
+ "pair_mode": false,
1206
+ "pair_solo_wall_ratio": 1.28
1207
+ }
1208
+ ]
1209
+ }
1210
+ ]
1211
+ }
1212
+ JSON
1213
+ expect_fail_contains false-pair-mode-row-fields "pair evidence rows 0 do not match summary count 1" \
1214
+ python3 - "$SCRIPT" "$TMP_DIR/false-pair-mode-row-fields.json" <<'PY'
1215
+ import importlib.util
1216
+ import pathlib
1217
+ import sys
1218
+
1219
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1220
+ module = importlib.util.module_from_spec(spec)
1221
+ assert spec.loader is not None
1222
+ spec.loader.exec_module(module)
1223
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1224
+ PY
1225
+
1226
+ cat > "$TMP_DIR/missing-pair-trigger-row-fields.json" <<'JSON'
1227
+ {
1228
+ "pair_evidence_count": 1,
1229
+ "rows": [
1230
+ {
1231
+ "fixture": "F16-cli-quote-tax-rules",
1232
+ "status": "pair_evidence_passed",
1233
+ "passing_pair_evidence": [
1234
+ {
1235
+ "run_id": "stale-gate-run",
1236
+ "pair_arm": "l2_risk_probes",
1237
+ "bare_score": 50,
1238
+ "solo_score": 75,
1239
+ "pair_score": 96,
1240
+ "pair_margin": 21,
1241
+ "pair_mode": true,
1242
+ "pair_solo_wall_ratio": 1.28
1243
+ }
1244
+ ]
1245
+ }
1246
+ ]
1247
+ }
1248
+ JSON
1249
+ expect_fail_contains missing-pair-trigger-row-fields "pair evidence rows 0 do not match summary count 1" \
1250
+ python3 - "$SCRIPT" "$TMP_DIR/missing-pair-trigger-row-fields.json" <<'PY'
1251
+ import importlib.util
1252
+ import pathlib
1253
+ import sys
1254
+
1255
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1256
+ module = importlib.util.module_from_spec(spec)
1257
+ assert spec.loader is not None
1258
+ spec.loader.exec_module(module)
1259
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1260
+ PY
1261
+
1262
+ cat > "$TMP_DIR/zero-wall-row-fields.json" <<'JSON'
1263
+ {
1264
+ "pair_evidence_count": 1,
1265
+ "rows": [
1266
+ {
1267
+ "fixture": "F16-cli-quote-tax-rules",
1268
+ "status": "pair_evidence_passed",
1269
+ "passing_pair_evidence": [
1270
+ {
1271
+ "run_id": "zero-wall-run",
1272
+ "pair_arm": "l2_risk_probes",
1273
+ "bare_score": 50,
1274
+ "solo_score": 75,
1275
+ "pair_score": 96,
1276
+ "pair_margin": 21,
1277
+ "pair_mode": true,
1278
+ "pair_trigger_eligible": true,
1279
+ "pair_solo_wall_ratio": 0
1280
+ }
1281
+ ]
1282
+ }
1283
+ ]
1284
+ }
1285
+ JSON
1286
+ expect_fail_contains zero-wall-row-fields "pair evidence rows 0 do not match summary count 1" \
1287
+ python3 - "$SCRIPT" "$TMP_DIR/zero-wall-row-fields.json" <<'PY'
1288
+ import importlib.util
1289
+ import pathlib
1290
+ import sys
1291
+
1292
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1293
+ module = importlib.util.module_from_spec(spec)
1294
+ assert spec.loader is not None
1295
+ spec.loader.exec_module(module)
1296
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1297
+ PY
1298
+
1299
+ cat > "$TMP_DIR/bool-frontier-count.json" <<'JSON'
1300
+ {
1301
+ "pair_evidence_count": true,
1302
+ "rows": [
1303
+ {
1304
+ "fixture": "F16-cli-quote-tax-rules",
1305
+ "status": "pair_evidence_passed",
1306
+ "passing_pair_evidence": [
1307
+ {
1308
+ "run_id": "pair-pass",
1309
+ "pair_arm": "l2_risk_probes",
1310
+ "bare_score": 50,
1311
+ "solo_score": 75,
1312
+ "pair_score": 96,
1313
+ "pair_margin": 21,
1314
+ "pair_mode": true,
1315
+ "pair_trigger_eligible": true,
1316
+ "pair_trigger_reasons": ["complexity.high"],
1317
+ "pair_trigger_has_canonical_reason": true,
1318
+ "pair_solo_wall_ratio": 1.28
1319
+ }
1320
+ ]
1321
+ }
1322
+ ]
1323
+ }
1324
+ JSON
1325
+ expect_fail_contains malformed-pair-evidence-count "pair evidence count missing or malformed from frontier report" \
1326
+ python3 - "$SCRIPT" "$TMP_DIR/bool-frontier-count.json" <<'PY'
1327
+ import importlib.util
1328
+ import pathlib
1329
+ import sys
1330
+
1331
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1332
+ module = importlib.util.module_from_spec(spec)
1333
+ assert spec.loader is not None
1334
+ spec.loader.exec_module(module)
1335
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1336
+ PY
1337
+
1338
+ cat > "$TMP_DIR/mismatched-frontier-rows.json" <<'JSON'
1339
+ {
1340
+ "pair_evidence_count": 2,
1341
+ "rows": [
1342
+ {
1343
+ "fixture": "F16-cli-quote-tax-rules",
1344
+ "status": "pair_evidence_passed",
1345
+ "passing_pair_evidence": [
1346
+ {
1347
+ "run_id": "pair-pass",
1348
+ "pair_arm": "l2_risk_probes",
1349
+ "bare_score": 50,
1350
+ "solo_score": 75,
1351
+ "pair_score": 96,
1352
+ "pair_margin": 21,
1353
+ "pair_mode": true,
1354
+ "pair_trigger_eligible": true,
1355
+ "pair_trigger_reasons": ["complexity.high"],
1356
+ "pair_trigger_has_canonical_reason": true,
1357
+ "pair_solo_wall_ratio": 1.28
1358
+ }
1359
+ ]
1360
+ },
1361
+ {
1362
+ "fixture": "F21-cli-scheduler-priority",
1363
+ "status": "pair_evidence_passed",
1364
+ "passing_pair_evidence": [
1365
+ {
1366
+ "run_id": "incomplete-row",
1367
+ "bare_score": 33,
1368
+ "solo_score": 66,
1369
+ "pair_score": 99,
1370
+ "pair_margin": 33,
1371
+ "pair_mode": true,
1372
+ "pair_trigger_eligible": true,
1373
+ "pair_solo_wall_ratio": 1.47
1374
+ }
1375
+ ]
1376
+ }
1377
+ ]
1378
+ }
1379
+ JSON
1380
+ expect_fail_contains mismatched-pair-evidence-rows "pair evidence rows 1 do not match summary count 2" \
1381
+ python3 - "$SCRIPT" "$TMP_DIR/mismatched-frontier-rows.json" <<'PY'
1382
+ import importlib.util
1383
+ import pathlib
1384
+ import sys
1385
+
1386
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1387
+ module = importlib.util.module_from_spec(spec)
1388
+ assert spec.loader is not None
1389
+ spec.loader.exec_module(module)
1390
+ sys.exit(module.check_min_pair_evidence(pathlib.Path(sys.argv[2]), 1))
1391
+ PY
1392
+
1393
+ expect_fail_contains min-pair-evidence "pair evidence count 2 below required minimum 4" \
1394
+ python3 "$SCRIPT" \
1395
+ --fixtures-root "$fixtures" \
1396
+ --registry "$registry" \
1397
+ --results-root "$results" \
1398
+ --out-dir "$TMP_DIR/out-low-evidence"
1399
+ grep -Fq 'FAIL audit-pair-evidence' "$TMP_DIR/min-pair-evidence.out"
1400
+ grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$TMP_DIR/min-pair-evidence.out"
1401
+ grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=1.47x max_wall_allowed=3.00x' "$TMP_DIR/min-pair-evidence.out"
1402
+ grep -Fq 'pair_trigger_reasons=PASS canonical=2 historical_alias=1 exposed=2 total=2 summary=2 rows_match=true' "$TMP_DIR/min-pair-evidence.out"
1403
+ grep -Fq 'pair_trigger_historical_aliases=F21-cli-scheduler-priority=risk_profile.high_risk' "$TMP_DIR/min-pair-evidence.out"
1404
+ grep -Fq 'pair_evidence_hypothesis_triggers=WARN matched=0 documented=2 total=2' "$TMP_DIR/min-pair-evidence.out"
1405
+ grep -Fq 'pair_evidence_hypothesis_trigger_gaps=F16-cli-quote-tax-rules=complexity.high;F21-cli-scheduler-priority=complexity.high,risk_profile.high_risk' "$TMP_DIR/min-pair-evidence.out"
1406
+ python3 - "$TMP_DIR/out-low-evidence/audit.json" <<'PY'
1407
+ import json
1408
+ import sys
1409
+
1410
+ report = json.load(open(sys.argv[1], encoding="utf8"))
1411
+ assert report["verdict"] == "FAIL"
1412
+ assert report["checks"]["frontier"]["status"] == "PASS"
1413
+ assert report["checks"]["headroom_rejections"]["status"] == "PASS"
1414
+ assert report["checks"]["headroom_rejections"]["report_check_exit_code"] == 0
1415
+ assert report["checks"]["headroom_rejections"]["verdict"] == "PASS"
1416
+ assert report["checks"]["headroom_rejections"]["unrecorded_failure_count"] == 0
1417
+ assert report["checks"]["headroom_rejections"]["unsupported_registry_rejection_count"] == 0
1418
+ assert report["checks"]["min_pair_evidence"]["status"] == "FAIL"
1419
+ assert report["checks"]["min_pair_evidence"]["required"] == 4
1420
+ assert report["checks"]["min_pair_evidence"]["actual_rows"] == 2
1421
+ assert report["checks"]["pair_evidence_quality"]["status"] == "PASS"
1422
+ assert report["checks"]["pair_evidence_quality"]["min_pair_margin_actual"] == 21
1423
+ assert report["checks"]["pair_evidence_quality"]["max_pair_solo_wall_ratio_actual"] == 1.47
1424
+ assert report["checks"]["pair_trigger_reasons"]["status"] == "PASS"
1425
+ assert report["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == 2
1426
+ assert report["checks"]["pair_trigger_reasons"]["canonical_rows"] == 2
1427
+ assert report["checks"]["pair_trigger_reasons"]["historical_alias_rows"] == 1
1428
+ assert report["checks"]["pair_trigger_reasons"]["historical_alias_details"] == [
1429
+ {"fixture": "F21-cli-scheduler-priority", "aliases": ["risk_profile.high_risk"]}
1430
+ ]
1431
+ assert report["checks"]["pair_trigger_reasons"]["exposed_rows"] == 2
1432
+ assert report["checks"]["pair_trigger_reasons"]["total_rows"] == 2
1433
+ assert report["checks"]["pair_trigger_reasons"]["rows_match_count"] is True
1434
+ assert report["checks"]["pair_evidence_hypotheses"]["status"] == "PASS"
1435
+ assert report["checks"]["pair_evidence_hypotheses"]["documented_rows"] == 2
1436
+ assert report["checks"]["pair_evidence_hypotheses"]["total_rows"] == 2
1437
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "WARN"
1438
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 0
1439
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["required"] is False
1440
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == 0
1441
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["documented_rows"] == 2
1442
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["total_rows"] == 2
1443
+ assert report["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == [
1444
+ {
1445
+ "fixture": "F16-cli-quote-tax-rules",
1446
+ "pair_trigger_reasons": ["complexity.high"],
1447
+ },
1448
+ {
1449
+ "fixture": "F21-cli-scheduler-priority",
1450
+ "pair_trigger_reasons": ["complexity.high", "risk_profile.high_risk"],
1451
+ },
1452
+ ]
1453
+ PY
1454
+
1455
+ cat > "$TMP_DIR/low-quality-frontier.json" <<'JSON'
1456
+ {
1457
+ "pair_margin_min": 4,
1458
+ "pair_solo_wall_ratio_max": 1.2,
1459
+ "rows": [
1460
+ {
1461
+ "fixture": "F16-cli-quote-tax-rules",
1462
+ "status": "pair_evidence_passed",
1463
+ "passing_pair_evidence": [
1464
+ {
1465
+ "run_id": "low-quality-run",
1466
+ "pair_arm": "l2_risk_probes",
1467
+ "bare_score": 50,
1468
+ "solo_score": 75,
1469
+ "pair_score": 79,
1470
+ "pair_margin": 4,
1471
+ "pair_mode": true,
1472
+ "pair_trigger_eligible": true,
1473
+ "pair_trigger_reasons": ["complexity.high"],
1474
+ "pair_trigger_has_canonical_reason": true,
1475
+ "pair_solo_wall_ratio": 1.2
1476
+ }
1477
+ ]
1478
+ }
1479
+ ]
1480
+ }
1481
+ JSON
1482
+ expect_fail_contains low-quality-pair-evidence "pair evidence margin below minimum for fixture(s): F16-cli-quote-tax-rules" \
1483
+ python3 - "$SCRIPT" "$TMP_DIR/low-quality-frontier.json" <<'PY'
1484
+ import importlib.util
1485
+ import pathlib
1486
+ import sys
1487
+
1488
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1489
+ module = importlib.util.module_from_spec(spec)
1490
+ assert spec.loader is not None
1491
+ spec.loader.exec_module(module)
1492
+ sys.exit(module.check_pair_evidence_quality(
1493
+ pathlib.Path(sys.argv[2]),
1494
+ min_pair_margin=5,
1495
+ max_pair_solo_wall_ratio=3.0,
1496
+ ))
1497
+ PY
1498
+ python3 - "$SCRIPT" "$TMP_DIR/low-quality-frontier.json" > "$TMP_DIR/low-quality-quality-row.out" <<'PY'
1499
+ import importlib.util
1500
+ import pathlib
1501
+ import sys
1502
+
1503
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1504
+ module = importlib.util.module_from_spec(spec)
1505
+ assert spec.loader is not None
1506
+ spec.loader.exec_module(module)
1507
+ module.print_pair_evidence_quality(
1508
+ pathlib.Path(sys.argv[2]),
1509
+ min_pair_margin=5,
1510
+ max_pair_solo_wall_ratio=3.0,
1511
+ status=1,
1512
+ )
1513
+ PY
1514
+ grep -Fq 'pair_evidence_quality=FAIL min_pair_margin_actual=+4 min_pair_margin_required=+5 max_wall_actual=1.20x max_wall_allowed=3.00x' "$TMP_DIR/low-quality-quality-row.out"
1515
+
1516
+ cat > "$TMP_DIR/no-quality-rows-frontier.json" <<'JSON'
1517
+ {
1518
+ "pair_margin_min": 21,
1519
+ "pair_solo_wall_ratio_max": 1.2,
1520
+ "rows": []
1521
+ }
1522
+ JSON
1523
+ expect_fail_contains no-quality-rows "pair evidence quality check has no complete rows" \
1524
+ python3 - "$SCRIPT" "$TMP_DIR/no-quality-rows-frontier.json" <<'PY'
1525
+ import importlib.util
1526
+ import pathlib
1527
+ import sys
1528
+
1529
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1530
+ module = importlib.util.module_from_spec(spec)
1531
+ assert spec.loader is not None
1532
+ spec.loader.exec_module(module)
1533
+ sys.exit(module.check_pair_evidence_quality(
1534
+ pathlib.Path(sys.argv[2]),
1535
+ min_pair_margin=5,
1536
+ max_pair_solo_wall_ratio=3.0,
1537
+ ))
1538
+ PY
1539
+
1540
+ cat > "$TMP_DIR/high-wall-frontier.json" <<'JSON'
1541
+ {
1542
+ "pair_margin_min": 21,
1543
+ "pair_solo_wall_ratio_max": 3.5,
1544
+ "rows": [
1545
+ {
1546
+ "fixture": "F16-cli-quote-tax-rules",
1547
+ "status": "pair_evidence_passed",
1548
+ "passing_pair_evidence": [
1549
+ {
1550
+ "run_id": "high-wall-run",
1551
+ "pair_arm": "l2_risk_probes",
1552
+ "bare_score": 50,
1553
+ "solo_score": 75,
1554
+ "pair_score": 96,
1555
+ "pair_margin": 21,
1556
+ "pair_mode": true,
1557
+ "pair_trigger_eligible": true,
1558
+ "pair_trigger_reasons": ["complexity.high"],
1559
+ "pair_trigger_has_canonical_reason": true,
1560
+ "pair_solo_wall_ratio": 3.5
1561
+ }
1562
+ ]
1563
+ }
1564
+ ]
1565
+ }
1566
+ JSON
1567
+ expect_fail_contains high-wall-pair-evidence "pair evidence wall ratio above maximum for fixture(s): F16-cli-quote-tax-rules" \
1568
+ python3 - "$SCRIPT" "$TMP_DIR/high-wall-frontier.json" <<'PY'
1569
+ import importlib.util
1570
+ import pathlib
1571
+ import sys
1572
+
1573
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1574
+ module = importlib.util.module_from_spec(spec)
1575
+ assert spec.loader is not None
1576
+ spec.loader.exec_module(module)
1577
+ sys.exit(module.check_pair_evidence_quality(
1578
+ pathlib.Path(sys.argv[2]),
1579
+ min_pair_margin=5,
1580
+ max_pair_solo_wall_ratio=3.0,
1581
+ ))
1582
+ PY
1583
+
1584
+ cat > "$TMP_DIR/summary-mismatch-frontier.json" <<'JSON'
1585
+ {
1586
+ "pair_margin_min": 22,
1587
+ "pair_solo_wall_ratio_max": 1.2,
1588
+ "rows": [
1589
+ {
1590
+ "fixture": "F16-cli-quote-tax-rules",
1591
+ "status": "pair_evidence_passed",
1592
+ "passing_pair_evidence": [
1593
+ {
1594
+ "run_id": "summary-mismatch-run",
1595
+ "pair_arm": "l2_risk_probes",
1596
+ "bare_score": 50,
1597
+ "solo_score": 75,
1598
+ "pair_score": 96,
1599
+ "pair_margin": 21,
1600
+ "pair_mode": true,
1601
+ "pair_trigger_eligible": true,
1602
+ "pair_trigger_reasons": ["complexity.high"],
1603
+ "pair_trigger_has_canonical_reason": true,
1604
+ "pair_solo_wall_ratio": 1.2
1605
+ }
1606
+ ]
1607
+ }
1608
+ ]
1609
+ }
1610
+ JSON
1611
+ expect_fail_contains summary-margin-mismatch "frontier pair_margin_min does not match pair evidence rows" \
1612
+ python3 - "$SCRIPT" "$TMP_DIR/summary-mismatch-frontier.json" <<'PY'
1613
+ import importlib.util
1614
+ import pathlib
1615
+ import sys
1616
+
1617
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1618
+ module = importlib.util.module_from_spec(spec)
1619
+ assert spec.loader is not None
1620
+ spec.loader.exec_module(module)
1621
+ sys.exit(module.check_pair_evidence_quality(
1622
+ pathlib.Path(sys.argv[2]),
1623
+ min_pair_margin=5,
1624
+ max_pair_solo_wall_ratio=3.0,
1625
+ ))
1626
+ PY
1627
+
1628
+ cat > "$TMP_DIR/summary-wall-mismatch-frontier.json" <<'JSON'
1629
+ {
1630
+ "pair_margin_min": 21,
1631
+ "pair_solo_wall_ratio_max": 1.3,
1632
+ "rows": [
1633
+ {
1634
+ "fixture": "F16-cli-quote-tax-rules",
1635
+ "status": "pair_evidence_passed",
1636
+ "passing_pair_evidence": [
1637
+ {
1638
+ "run_id": "summary-wall-mismatch-run",
1639
+ "pair_arm": "l2_risk_probes",
1640
+ "bare_score": 50,
1641
+ "solo_score": 75,
1642
+ "pair_score": 96,
1643
+ "pair_margin": 21,
1644
+ "pair_mode": true,
1645
+ "pair_trigger_eligible": true,
1646
+ "pair_trigger_reasons": ["complexity.high"],
1647
+ "pair_trigger_has_canonical_reason": true,
1648
+ "pair_solo_wall_ratio": 1.2
1649
+ }
1650
+ ]
1651
+ }
1652
+ ]
1653
+ }
1654
+ JSON
1655
+ expect_fail_contains summary-wall-mismatch "frontier pair_solo_wall_ratio_max does not match pair evidence rows" \
1656
+ python3 - "$SCRIPT" "$TMP_DIR/summary-wall-mismatch-frontier.json" <<'PY'
1657
+ import importlib.util
1658
+ import pathlib
1659
+ import sys
1660
+
1661
+ spec = importlib.util.spec_from_file_location("audit_pair_evidence", sys.argv[1])
1662
+ module = importlib.util.module_from_spec(spec)
1663
+ assert spec.loader is not None
1664
+ spec.loader.exec_module(module)
1665
+ sys.exit(module.check_pair_evidence_quality(
1666
+ pathlib.Path(sys.argv[2]),
1667
+ min_pair_margin=5,
1668
+ max_pair_solo_wall_ratio=3.0,
1669
+ ))
1670
+ PY
1671
+
1672
+ echo "PASS test-audit-pair-evidence"