devlyn-cli 2.3.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +80 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +210 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -0,0 +1,591 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for pair-candidate-frontier.py.
3
+
4
+ set -euo pipefail
5
+
6
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"  # absolute path of the directory containing this test script
7
+ SCRIPT="$SCRIPT_DIR/pair-candidate-frontier.py"  # script under test, expected to sit next to this file
8
+ TMP_DIR="$(mktemp -d /tmp/pair-candidate-frontier-test.XXXXXX)"  # fresh scratch directory under /tmp for every generated input and output
9
+ trap 'rm -rf "$TMP_DIR"' EXIT  # delete the scratch directory whenever the shell exits
10
+
11
+ expect_fail_contains() {  # usage: expect_fail_contains LABEL NEEDLE CMD [ARGS...] -- CMD must exit non-zero and its output must contain NEEDLE
12
+ local label="$1"  # names the captured-output file and appears in failure messages
13
+ local needle="$2"  # literal text that must occur in the command's combined output
14
+ shift 2  # the remaining arguments are the command line to run
15
+ local out="$TMP_DIR/$label.out"
16
+ if "$@" > "$out" 2>&1; then  # capture stdout and stderr together; a zero exit status is the failure case here
17
+ echo "expected failure for $label" >&2
18
+ cat "$out" >&2  # replay the captured output to help diagnose the unexpected success
19
+ exit 1
20
+ fi
21
+ if ! grep -Fq -- "$needle" "$out"; then  # -F fixed-string match, -q quiet, -- keeps a needle starting with a dash from being parsed as an option
22
+ echo "missing expected text for $label: $needle" >&2
23
+ cat "$out" >&2
24
+ exit 1  # called outside a subshell, so this aborts the whole test script
25
+ fi
26
+ }
27
+
28
+ fixtures="$TMP_DIR/fixtures"  # synthetic fixtures root, later passed as --fixtures-root
29
+ results="$TMP_DIR/results"  # synthetic results root, later passed as --results-root
30
+ mkdir -p "$fixtures/F2-cli-medium-subcommand" \
31
+ "$fixtures/F16-cli-quote-tax-rules" \
32
+ "$fixtures/F21-cli-scheduler-priority" \
33
+ "$fixtures/F22-cli-low-margin" \
34
+ "$fixtures/F23-cli-high-wall" \
35
+ "$fixtures/retired/F99-retired"  # five active fixture dirs plus one under retired/; the report check later asserts fixtures_total == 5, so retired/ is presumably not counted (confirm in the script under test)
36
+
37
+ cat > "$TMP_DIR/pair-rejected-fixtures.sh" <<'SH'
38
+ rejected_pair_fixture_reason() {
39
+ local fid="$1"
40
+ case "$fid" in
41
+ F2-*|F2)
42
+ echo "measured ceiling"
43
+ ;;
44
+ *)
45
+ return 1
46
+ ;;
47
+ esac
48
+ }
49
+ SH
50
+
51
+ mkdir -p "$results/pass-run"
52
+ cat > "$results/pass-run/full-pipeline-pair-gate.json" <<'JSON'
53
+ {
54
+ "run_id": "pass-run",
55
+ "verdict": "PASS",
56
+ "pair_arm": "l2_risk_probes",
57
+ "rows": [
58
+ {
59
+ "fixture": "F16-cli-quote-tax-rules",
60
+ "status": "PASS",
61
+ "bare_score": 50,
62
+ "solo_score": 75,
63
+ "pair_score": 96,
64
+ "pair_margin": 21,
65
+ "pair_mode": true,
66
+ "pair_trigger_eligible": true,
67
+ "pair_trigger_reasons": ["complexity.high"],
68
+ "pair_trigger_has_canonical_reason": true,
69
+ "pair_solo_wall_ratio": 1.28
70
+ }
71
+ ]
72
+ }
73
+ JSON
74
+
75
+ mkdir -p "$results/incomplete-high-run"
76
+ cat > "$results/incomplete-high-run/full-pipeline-pair-gate.json" <<'JSON'
77
+ {
78
+ "run_id": "incomplete-high-run",
79
+ "verdict": "PASS",
80
+ "rows": [
81
+ {
82
+ "fixture": "F16-cli-quote-tax-rules",
83
+ "status": "PASS",
84
+ "bare_score": 50,
85
+ "solo_score": 75,
86
+ "pair_score": 98,
87
+ "pair_margin": 23,
88
+ "pair_mode": true,
89
+ "pair_trigger_eligible": true,
90
+ "pair_solo_wall_ratio": 1.32
91
+ }
92
+ ]
93
+ }
94
+ JSON
95
+
96
+ mkdir -p "$results/low-margin-run"
97
+ cat > "$results/low-margin-run/full-pipeline-pair-gate.json" <<'JSON'
98
+ {
99
+ "run_id": "low-margin-run",
100
+ "verdict": "PASS",
101
+ "pair_arm": "l2_risk_probes",
102
+ "rows": [
103
+ {
104
+ "fixture": "F22-cli-low-margin",
105
+ "status": "PASS",
106
+ "bare_score": 50,
107
+ "solo_score": 80,
108
+ "pair_score": 84,
109
+ "pair_margin": 4,
110
+ "pair_mode": true,
111
+ "pair_trigger_eligible": true,
112
+ "pair_trigger_reasons": ["complexity.high"],
113
+ "pair_trigger_has_canonical_reason": true,
114
+ "pair_solo_wall_ratio": 1.1
115
+ }
116
+ ]
117
+ }
118
+ JSON
119
+
120
+ mkdir -p "$results/high-wall-run"
121
+ cat > "$results/high-wall-run/full-pipeline-pair-gate.json" <<'JSON'
122
+ {
123
+ "run_id": "high-wall-run",
124
+ "verdict": "PASS",
125
+ "pair_arm": "l2_risk_probes",
126
+ "rows": [
127
+ {
128
+ "fixture": "F23-cli-high-wall",
129
+ "status": "PASS",
130
+ "bare_score": 45,
131
+ "solo_score": 70,
132
+ "pair_score": 91,
133
+ "pair_margin": 21,
134
+ "pair_mode": true,
135
+ "pair_trigger_eligible": true,
136
+ "pair_trigger_reasons": ["complexity.high"],
137
+ "pair_trigger_has_canonical_reason": true,
138
+ "pair_solo_wall_ratio": 3.5
139
+ }
140
+ ]
141
+ }
142
+ JSON
143
+
144
+ mkdir -p "$results/nan-wall-run"
145
+ cat > "$results/nan-wall-run/full-pipeline-pair-gate.json" <<'JSON'
146
+ {
147
+ "run_id": "nan-wall-run",
148
+ "verdict": "PASS",
149
+ "pair_arm": "l2_risk_probes",
150
+ "rows": [
151
+ {
152
+ "fixture": "F21-cli-scheduler-priority",
153
+ "status": "PASS",
154
+ "bare_score": 45,
155
+ "solo_score": 70,
156
+ "pair_score": 91,
157
+ "pair_margin": 21,
158
+ "pair_mode": true,
159
+ "pair_trigger_eligible": true,
160
+ "pair_solo_wall_ratio": NaN
161
+ }
162
+ ]
163
+ }
164
+ JSON
165
+ expect_fail_contains nan-json-constant "pair evidence artifact malformed" \
166
+ python3 "$SCRIPT" \
167
+ --fixtures-root "$fixtures" \
168
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
169
+ --results-root "$results"
170
+ rm -rf "$results/nan-wall-run"
171
+
172
+ mkdir -p "$results/inflated-margin-run"
173
+ cat > "$results/inflated-margin-run/full-pipeline-pair-gate.json" <<'JSON'
174
+ {
175
+ "run_id": "inflated-margin-run",
176
+ "verdict": "PASS",
177
+ "pair_arm": "l2_risk_probes",
178
+ "rows": [
179
+ {
180
+ "fixture": "F21-cli-scheduler-priority",
181
+ "status": "PASS",
182
+ "bare_score": 45,
183
+ "solo_score": 70,
184
+ "pair_score": 71,
185
+ "pair_margin": 21,
186
+ "pair_mode": true,
187
+ "pair_trigger_eligible": true,
188
+ "pair_solo_wall_ratio": 1.2
189
+ }
190
+ ]
191
+ }
192
+ JSON
193
+
194
+ mkdir -p "$results/overrange-score-run"
195
+ cat > "$results/overrange-score-run/full-pipeline-pair-gate.json" <<'JSON'
196
+ {
197
+ "run_id": "overrange-score-run",
198
+ "verdict": "PASS",
199
+ "pair_arm": "l2_risk_probes",
200
+ "rows": [
201
+ {
202
+ "fixture": "F21-cli-scheduler-priority",
203
+ "status": "PASS",
204
+ "bare_score": 45,
205
+ "solo_score": 70,
206
+ "pair_score": 101,
207
+ "pair_margin": 31,
208
+ "pair_mode": true,
209
+ "pair_trigger_eligible": true,
210
+ "pair_solo_wall_ratio": 1.2
211
+ }
212
+ ]
213
+ }
214
+ JSON
215
+
216
+ mkdir -p "$results/invalid-arm-run"
217
+ cat > "$results/invalid-arm-run/full-pipeline-pair-gate.json" <<'JSON'
218
+ {
219
+ "run_id": "invalid-arm-run",
220
+ "verdict": "PASS",
221
+ "pair_arm": "bare",
222
+ "rows": [
223
+ {
224
+ "fixture": "F21-cli-scheduler-priority",
225
+ "status": "PASS",
226
+ "bare_score": 45,
227
+ "solo_score": 70,
228
+ "pair_score": 91,
229
+ "pair_margin": 21,
230
+ "pair_mode": true,
231
+ "pair_trigger_eligible": true,
232
+ "pair_solo_wall_ratio": 1.2
233
+ }
234
+ ]
235
+ }
236
+ JSON
237
+
238
+ mkdir -p "$results/false-pair-mode-run"
239
+ cat > "$results/false-pair-mode-run/full-pipeline-pair-gate.json" <<'JSON'
240
+ {
241
+ "run_id": "false-pair-mode-run",
242
+ "verdict": "PASS",
243
+ "pair_arm": "l2_risk_probes",
244
+ "rows": [
245
+ {
246
+ "fixture": "F21-cli-scheduler-priority",
247
+ "status": "PASS",
248
+ "bare_score": 45,
249
+ "solo_score": 70,
250
+ "pair_score": 91,
251
+ "pair_margin": 21,
252
+ "pair_mode": false,
253
+ "pair_solo_wall_ratio": 1.2
254
+ }
255
+ ]
256
+ }
257
+ JSON
258
+
259
+ mkdir -p "$results/zero-wall-run"
260
+ cat > "$results/zero-wall-run/full-pipeline-pair-gate.json" <<'JSON'
261
+ {
262
+ "run_id": "zero-wall-run",
263
+ "verdict": "PASS",
264
+ "pair_arm": "l2_risk_probes",
265
+ "rows": [
266
+ {
267
+ "fixture": "F21-cli-scheduler-priority",
268
+ "status": "PASS",
269
+ "bare_score": 45,
270
+ "solo_score": 70,
271
+ "pair_score": 91,
272
+ "pair_margin": 21,
273
+ "pair_mode": true,
274
+ "pair_trigger_eligible": true,
275
+ "pair_solo_wall_ratio": 0
276
+ }
277
+ ]
278
+ }
279
+ JSON
280
+
281
+ expect_fail_contains missing-registry "rejected fixture registry missing" \
282
+ python3 "$SCRIPT" \
283
+ --fixtures-root "$fixtures" \
284
+ --registry "$TMP_DIR/missing.sh" \
285
+ --results-root "$results"  # missing.sh is not created anywhere in the visible script, so this run must fail with the registry-missing message
286
+
287
+ empty_registry="$TMP_DIR/empty-registry.sh"
288
+ : > "$empty_registry"  # create a zero-byte registry file
289
+ expect_fail_contains empty-registry "rejected fixture registry has no fixture entries" \
290
+ python3 "$SCRIPT" \
291
+ --fixtures-root "$fixtures" \
292
+ --registry "$empty_registry" \
293
+ --results-root "$results"  # a registry that exists but is empty must fail with the no-fixture-entries message
294
+
295
+ s_only_registry="$TMP_DIR/s-only-registry.sh"
296
+ cat > "$s_only_registry" <<'SH'
297
+ rejected_pair_fixture_reason() {
298
+ local fid="$1"
299
+ case "$fid" in
300
+ S3-*|S3)
301
+ echo "shadow solo ceiling"
302
+ ;;
303
+ *)
304
+ return 1
305
+ ;;
306
+ esac
307
+ }
308
+ SH
309
+ python3 - "$SCRIPT" "$s_only_registry" <<'PY'
310
+ import importlib.util
311
+ import pathlib
312
+ import sys
313
+
314
+ spec = importlib.util.spec_from_file_location("pair_candidate_frontier", sys.argv[1])
315
+ module = importlib.util.module_from_spec(spec)
316
+ assert spec.loader is not None
317
+ spec.loader.exec_module(module)
318
+ assert module.registry_short_ids(pathlib.Path(sys.argv[2])) == {"S3"}
319
+ PY
320
+
321
+ mkdir -p "$results/bad-json-run"
322
+ printf '{not-json\n' > "$results/bad-json-run/full-pipeline-pair-gate.json"
323
+ expect_fail_contains bad-pair-evidence-json "pair evidence artifact malformed" \
324
+ python3 "$SCRIPT" \
325
+ --fixtures-root "$fixtures" \
326
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
327
+ --results-root "$results"
328
+ rm -rf "$results/bad-json-run"
329
+
330
+ mkdir -p "$results/bad-rows-run"
331
+ cat > "$results/bad-rows-run/full-pipeline-pair-gate.json" <<'JSON'
332
+ {
333
+ "run_id": "bad-rows-run",
334
+ "verdict": "PASS",
335
+ "pair_arm": "l2_risk_probes",
336
+ "rows": []
337
+ }
338
+ JSON
339
+ expect_fail_contains bad-pair-evidence-rows "pair evidence artifact rows malformed" \
340
+ python3 "$SCRIPT" \
341
+ --fixtures-root "$fixtures" \
342
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
343
+ --results-root "$results"
344
+ rm -rf "$results/bad-rows-run"
345
+
346
+ mkdir -p "$results/direct-trigger-run/F16-cli-quote-tax-rules/l2_risk_probes"
347
+ cat > "$results/direct-trigger-run/F16-cli-quote-tax-rules/l2_risk_probes/result.json" <<'JSON'
348
+ {
349
+ "pair_trigger": {
350
+ "eligible": true,
351
+ "reasons": ["complexity.high", "looks-hard"],
352
+ "skipped_reason": null
353
+ }
354
+ }
355
+ JSON
356
+ python3 - "$SCRIPT" "$results" <<'PY'
357
+ import importlib.util
358
+ import pathlib
359
+ import sys
360
+
361
+ spec = importlib.util.spec_from_file_location("pair_candidate_frontier", sys.argv[1])
362
+ module = importlib.util.module_from_spec(spec)
363
+ assert spec.loader is not None
364
+ spec.loader.exec_module(module)
365
+ results_root = pathlib.Path(sys.argv[2])
366
+ kwargs = {
367
+ "results_root": results_root,
368
+ "run_id": "direct-trigger-run",
369
+ "fixture": "F16-cli-quote-tax-rules",
370
+ "pair_arm": "l2_risk_probes",
371
+ }
372
+ assert module.pair_result_trigger_reasons(**kwargs) == []
373
+ path = (
374
+ results_root
375
+ / "direct-trigger-run"
376
+ / "F16-cli-quote-tax-rules"
377
+ / "l2_risk_probes"
378
+ / "result.json"
379
+ )
380
+ path.write_text(
381
+ '{"pair_trigger":{"eligible":true,"reasons":["complexity.high","risk_profile.high_risk"],"skipped_reason":null}}\n',
382
+ encoding="utf8",
383
+ )
384
+ assert module.pair_result_trigger_reasons(**kwargs) == [
385
+ "complexity.high",
386
+ "risk_profile.high_risk",
387
+ ]
388
+ path.write_text(
389
+ '{"pair_trigger":{"eligible":true,"reasons":["risk high"],"skipped_reason":null}}\n',
390
+ encoding="utf8",
391
+ )
392
+ assert module.pair_result_trigger_reasons(**kwargs) == []
393
+ PY
394
+
395
+ python3 "$SCRIPT" \
396
+ --fixtures-root "$fixtures" \
397
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
398
+ --results-root "$results" \
399
+ --out-json "$TMP_DIR/frontier.json" \
400
+ --out-md "$TMP_DIR/frontier.md" \
401
+ > "$TMP_DIR/frontier.stdout"  # stdout is kept for the grep checks below; under set -e a non-zero exit here would abort the test, so the script evidently exits 0 even though the report verdict asserted below is FAIL
402
+
403
+ python3 - "$TMP_DIR/frontier.json" <<'PY'
404
+ import json
405
+ import sys
406
+
407
+ report = json.load(open(sys.argv[1], encoding="utf8"))
408
+ assert report["verdict"] == "FAIL"
409
+ assert report["min_pair_margin"] == 5
410
+ assert report["max_pair_solo_wall_ratio"] == 3.0
411
+ assert report["fixtures_total"] == 5
412
+ assert report["rejected_total"] == 1
413
+ assert report["candidate_total"] == 4
414
+ assert report["pair_evidence_total"] == 1
415
+ assert report["unmeasured_candidate_total"] == 3
416
+ assert report["rejected_count"] == 1
417
+ assert report["candidate_count"] == 4
418
+ assert report["pair_evidence_count"] == 1
419
+ assert report["unmeasured_count"] == 3
420
+ assert report["pair_margin_avg"] == 21
421
+ assert report["pair_margin_min"] == 21
422
+ assert report["pair_solo_wall_ratio_avg"] == 1.28
423
+ assert report["pair_solo_wall_ratio_max"] == 1.28
424
+ rows = {row["fixture"]: row for row in report["rows"]}
425
+ assert rows["F2-cli-medium-subcommand"]["status"] == "rejected"
426
+ assert rows["F2-cli-medium-subcommand"]["rejected_reason"] == "measured ceiling"
427
+ assert rows["F16-cli-quote-tax-rules"]["status"] == "pair_evidence_passed"
428
+ assert len(rows["F16-cli-quote-tax-rules"]["passing_pair_evidence"]) == 1
429
+ assert rows["F16-cli-quote-tax-rules"]["passing_pair_evidence"][0]["run_id"] == "pass-run"
430
+ assert rows["F21-cli-scheduler-priority"]["status"] == "candidate_unmeasured"
431
+ assert rows["F22-cli-low-margin"]["status"] == "candidate_unmeasured"
432
+ assert rows["F23-cli-high-wall"]["status"] == "candidate_unmeasured"
433
+ PY
434
+
435
+ grep -Fq 'fixtures=5 rejected=1 candidates=4 pair_evidence=1 unmeasured=3 verdict=FAIL' "$TMP_DIR/frontier.stdout"
436
+ grep -Fq 'pair_margin_avg=+21.00 pair_margin_min=+21 wall_avg=1.28x wall_max=1.28x' "$TMP_DIR/frontier.stdout"
437
+ grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21 wall=1.28x run=pass-run verdict=pair_evidence_passed triggers=complexity.high hypothesis_trigger=false' "$TMP_DIR/frontier.stdout"
438
+ grep -Fq 'FAIL pair-candidate-frontier' "$TMP_DIR/frontier.stdout"
439
+
440
+ grep -Fq 'Average pair margin: +21.00' "$TMP_DIR/frontier.md"
441
+ grep -Fq 'Verdict: FAIL' "$TMP_DIR/frontier.md"
442
+ grep -Fq 'Minimum pair margin required: +5' "$TMP_DIR/frontier.md"
443
+ grep -Fq 'Maximum pair/solo wall ratio allowed: 3.00x' "$TMP_DIR/frontier.md"
444
+ grep -Fq 'Maximum pair/solo wall ratio: 1.28x' "$TMP_DIR/frontier.md"
445
+ grep -Fq '| Fixture | Status | Verdict | Evidence | Pair arm | Triggers | Hypothesis trigger | Bare | Solo_claude | Pair | Margin | Wall ratio | Rejected reason |' "$TMP_DIR/frontier.md"
446
+ grep -Fq '| F2-cli-medium-subcommand | rejected | rejected | | | | | | | | | | measured ceiling |' "$TMP_DIR/frontier.md"
447
+ grep -Fq '| F16-cli-quote-tax-rules | pair_evidence_passed | pair_evidence_passed | pass-run | l2_risk_probes | complexity.high | false | 50 | 75 | 96 | +21 | 1.28x | |' "$TMP_DIR/frontier.md"
448
+ grep -Fq '| F21-cli-scheduler-priority | candidate_unmeasured | candidate_unmeasured | | | | | | | | | | |' "$TMP_DIR/frontier.md"
449
+
450
+ expect_fail_contains fail-on-unmeasured "unmeasured candidate fixture(s): F21-cli-scheduler-priority" \
451
+ python3 "$SCRIPT" \
452
+ --fixtures-root "$fixtures" \
453
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
454
+ --results-root "$results" \
455
+ --fail-on-unmeasured
456
+ grep -Fq 'FAIL pair-candidate-frontier' "$TMP_DIR/fail-on-unmeasured.out"
457
+
458
+ set +e
459
+ python3 "$SCRIPT" \
460
+ --fixtures-root "$fixtures" \
461
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
462
+ --results-root "$results" \
463
+ --fail-on-unmeasured \
464
+ > "$TMP_DIR/fail-on-unmeasured.json" \
465
+ 2> "$TMP_DIR/fail-on-unmeasured.stderr"
466
+ fail_on_unmeasured_status=$?
467
+ set -e
468
+ if [ "$fail_on_unmeasured_status" -eq 0 ]; then
469
+ echo "expected pure JSON fail-on-unmeasured path to fail" >&2
470
+ exit 1
471
+ fi
472
+ grep -Fq 'unmeasured candidate fixture(s): F21-cli-scheduler-priority' "$TMP_DIR/fail-on-unmeasured.stderr"
473
+ grep -Fq 'FAIL pair-candidate-frontier' "$TMP_DIR/fail-on-unmeasured.stderr"
474
+ if grep -Fq 'FAIL pair-candidate-frontier' "$TMP_DIR/fail-on-unmeasured.json"; then
475
+ echo "pure JSON stdout must not include final text verdict" >&2
476
+ cat "$TMP_DIR/fail-on-unmeasured.json" >&2
477
+ exit 1
478
+ fi
479
+ python3 - "$TMP_DIR/fail-on-unmeasured.json" <<'PY'
480
+ import json
481
+ import sys
482
+
483
+ report = json.load(open(sys.argv[1], encoding="utf8"))
484
+ assert report["verdict"] == "FAIL"
485
+ assert report["unmeasured_candidate_total"] == 3
486
+ PY
487
+
488
+ python3 "$SCRIPT" \
489
+ --fixtures-root "$fixtures" \
490
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
491
+ --results-root "$results" \
492
+ --min-pair-margin 4 \
493
+ --out-json "$TMP_DIR/frontier-margin4.json" \
494
+ > "$TMP_DIR/frontier-margin4.stdout"
495
+ python3 - "$TMP_DIR/frontier-margin4.json" <<'PY'
496
+ import json
497
+ import sys
498
+
499
+ report = json.load(open(sys.argv[1], encoding="utf8"))
500
+ assert report["min_pair_margin"] == 4
501
+ rows = {row["fixture"]: row for row in report["rows"]}
502
+ assert rows["F22-cli-low-margin"]["status"] == "pair_evidence_passed"
503
+ PY
504
+
505
+ python3 "$SCRIPT" \
506
+ --fixtures-root "$fixtures" \
507
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
508
+ --results-root "$results" \
509
+ --max-pair-solo-wall-ratio 4 \
510
+ --out-json "$TMP_DIR/frontier-wall4.json" \
511
+ > "$TMP_DIR/frontier-wall4.stdout"
512
+ python3 - "$TMP_DIR/frontier-wall4.json" <<'PY'
513
+ import json
514
+ import sys
515
+
516
+ report = json.load(open(sys.argv[1], encoding="utf8"))
517
+ assert report["max_pair_solo_wall_ratio"] == 4.0
518
+ rows = {row["fixture"]: row for row in report["rows"]}
519
+ assert rows["F23-cli-high-wall"]["status"] == "pair_evidence_passed"
520
+ PY
521
+
522
+ expect_fail_contains bad-min-pair-margin "--min-pair-margin must be >= 1" \
523
+ python3 "$SCRIPT" \
524
+ --fixtures-root "$fixtures" \
525
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
526
+ --results-root "$results" \
527
+ --min-pair-margin 0
528
+
529
+ expect_fail_contains bad-max-wall-ratio "--max-pair-solo-wall-ratio must be finite and > 0" \
530
+ python3 "$SCRIPT" \
531
+ --fixtures-root "$fixtures" \
532
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
533
+ --results-root "$results" \
534
+ --max-pair-solo-wall-ratio 0
535
+
536
+ expect_fail_contains nan-max-wall-ratio "--max-pair-solo-wall-ratio must be finite and > 0" \
537
+ python3 "$SCRIPT" \
538
+ --fixtures-root "$fixtures" \
539
+ --registry "$TMP_DIR/pair-rejected-fixtures.sh" \
540
+ --results-root "$results" \
541
+ --max-pair-solo-wall-ratio NaN
542
+
543
+ package_fixtures="$TMP_DIR/package-fixtures"
544
+ package_results="$TMP_DIR/package-results"
545
+ mkdir -p "$package_fixtures/F16-cli-quote-tax-rules" \
546
+ "$package_fixtures/F21-cli-scheduler-priority" \
547
+ "$package_fixtures/F23-cli-fulfillment-wave" \
548
+ "$package_fixtures/F25-cli-cart-promotion-rules" \
549
+ "$package_results/20260510-f16-f23-f25-combined-proof" \
550
+ "$package_results/20260511-f21-current-riskprobes-v1"
551
+ cp "$SCRIPT_DIR/../results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json" \
552
+ "$package_results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json"
553
+ cp "$SCRIPT_DIR/../results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json" \
554
+ "$package_results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json"
555
+ for fixture in F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules; do
556
+ mkdir -p "$package_results/20260510-f16-f23-f25-combined-proof/$fixture/l2_risk_probes"
557
+ cp "$SCRIPT_DIR/../results/20260510-f16-f23-f25-combined-proof/$fixture/l2_risk_probes/result.json" \
558
+ "$package_results/20260510-f16-f23-f25-combined-proof/$fixture/l2_risk_probes/result.json"
559
+ done
560
+ mkdir -p "$package_results/20260511-f21-current-riskprobes-v1/F21-cli-scheduler-priority/l2_risk_probes"
561
+ cp "$SCRIPT_DIR/../results/20260511-f21-current-riskprobes-v1/F21-cli-scheduler-priority/l2_risk_probes/result.json" \
562
+ "$package_results/20260511-f21-current-riskprobes-v1/F21-cli-scheduler-priority/l2_risk_probes/result.json"
563
+ python3 "$SCRIPT" \
564
+ --fixtures-root "$package_fixtures" \
565
+ --registry "$SCRIPT_DIR/pair-rejected-fixtures.sh" \
566
+ --results-root "$package_results" \
567
+ --fail-on-unmeasured \
568
+ --out-json "$TMP_DIR/package-frontier.json" \
569
+ > "$TMP_DIR/package-frontier.out"
570
+ grep -Fq 'fixtures=4 rejected=0 candidates=4 pair_evidence=4 unmeasured=0' "$TMP_DIR/package-frontier.out"
571
+ grep -Fq 'PASS pair-candidate-frontier' "$TMP_DIR/package-frontier.out"
572
+ python3 - "$TMP_DIR/package-frontier.json" <<'PY'
573
+ import json
574
+ import sys
575
+
576
+ report = json.load(open(sys.argv[1], encoding="utf8"))
577
+ assert report["verdict"] == "PASS"
578
+ assert report["min_pair_margin"] == 5
579
+ assert report["max_pair_solo_wall_ratio"] == 3.0
580
+ assert report["fixtures_total"] == 4
581
+ assert report["pair_evidence_total"] == 4
582
+ assert report["unmeasured_candidate_total"] == 0
583
+ assert report["pair_evidence_count"] == 4
584
+ assert report["unmeasured_count"] == 0
585
+ assert report["pair_margin_avg"] is not None
586
+ assert report["pair_margin_min"] is not None
587
+ assert report["pair_solo_wall_ratio_avg"] is not None
588
+ assert report["pair_solo_wall_ratio_max"] is not None
589
+ PY
590
+
591
+ echo "PASS test-pair-candidate-frontier"