devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -28,18 +28,27 @@ RESUME=0
28
28
  LIMIT=""
29
29
  INSTANCE_IDS=()
30
30
 
31
# Abort the script unless the option named by $1 was followed by a usable value.
#   $1 - the option flag as typed on the command line (echoed in the error)
#   $2 - the candidate value; an empty string or another long option (--*)
#        counts as "missing"
# Prints "<flag> requires a value" to stderr and exits 1 on rejection.
require_value() {
  local opt_name="$1"
  local candidate="${2:-}"
  case "$candidate" in
    ""|--*)
      echo "$opt_name requires a value" >&2
      exit 1
      ;;
  esac
}
39
+
31
40
  while [ $# -gt 0 ]; do
32
41
  case "$1" in
33
- --instances-jsonl) INSTANCES_JSONL="$2"; shift 2;;
34
- --predictions-out) PREDICTIONS_OUT="$2"; shift 2;;
35
- --model-name) MODEL_NAME="$2"; shift 2;;
36
- --repos-root) REPOS_ROOT="$2"; shift 2;;
37
- --worktrees-root) WORKTREES_ROOT="$2"; shift 2;;
38
- --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
42
+ --instances-jsonl) require_value "$1" "${2:-}"; INSTANCES_JSONL="$2"; shift 2;;
43
+ --predictions-out) require_value "$1" "${2:-}"; PREDICTIONS_OUT="$2"; shift 2;;
44
+ --model-name) require_value "$1" "${2:-}"; MODEL_NAME="$2"; shift 2;;
45
+ --repos-root) require_value "$1" "${2:-}"; REPOS_ROOT="$2"; shift 2;;
46
+ --worktrees-root) require_value "$1" "${2:-}"; WORKTREES_ROOT="$2"; shift 2;;
47
+ --timeout-seconds) require_value "$1" "${2:-}"; TIMEOUT_SECONDS="$2"; shift 2;;
39
48
  --copy-devlyn-context) COPY_DEVLYN_CONTEXT=1; shift;;
40
49
  --resume) RESUME=1; shift;;
41
- --limit) LIMIT="$2"; shift 2;;
42
- --instance-id) INSTANCE_IDS+=("$2"); shift 2;;
50
+ --limit) require_value "$1" "${2:-}"; LIMIT="$2"; shift 2;;
51
+ --instance-id) require_value "$1" "${2:-}"; INSTANCE_IDS+=("$2"); shift 2;;
43
52
  -h|--help) usage 0;;
44
53
  *) echo "unknown arg: $1" >&2; usage 1;;
45
54
  esac
@@ -62,22 +71,31 @@ TMP_IDS="$(mktemp)"
62
71
  TMP_SELECTED_INSTANCES="$(mktemp)"
63
72
  trap 'rm -f "$TMP_IDS" "$TMP_SELECTED_INSTANCES"' EXIT
64
73
 
65
- python3 - "$INSTANCES_JSONL" "$TMP_SELECTED_INSTANCES" "$LIMIT" "${INSTANCE_IDS[@]}" > "$TMP_IDS" <<'PY'
74
+ selection_args=("$INSTANCES_JSONL" "$TMP_SELECTED_INSTANCES" "$LIMIT")
75
+ if [ "${#INSTANCE_IDS[@]}" -gt 0 ]; then
76
+ selection_args+=("${INSTANCE_IDS[@]}")
77
+ fi
78
+
79
+ python3 - "$SCRIPT_DIR" "${selection_args[@]}" > "$TMP_IDS" <<'PY'
66
80
  import json
67
81
  import sys
68
82
  from pathlib import Path
83
+ sys.path.insert(0, sys.argv[1])
84
+ from pair_evidence_contract import reject_json_constant
69
85
 
70
- instances_path = Path(sys.argv[1])
71
- selected_path = Path(sys.argv[2])
72
- limit = int(sys.argv[3]) if sys.argv[3] else None
73
- requested = sys.argv[4:]
86
+ instances_path = Path(sys.argv[2])
87
+ selected_path = Path(sys.argv[3])
88
+ limit = int(sys.argv[4]) if sys.argv[4] else None
89
+ requested = sys.argv[5:]
74
90
  requested_set = set(requested)
75
91
  rows = []
76
92
  with instances_path.open(encoding="utf8") as f:
77
93
  for line_no, line in enumerate(f, start=1):
78
94
  if not line.strip():
79
95
  continue
80
- row = json.loads(line)
96
+ row = json.loads(line, parse_constant=reject_json_constant)
97
+ if not isinstance(row, dict):
98
+ raise SystemExit(f"{instances_path}:{line_no}: expected JSON object")
81
99
  instance_id = row.get("instance_id")
82
100
  if not isinstance(instance_id, str) or not instance_id:
83
101
  raise SystemExit(f"{instances_path}:{line_no}: missing instance_id")
@@ -11,6 +11,83 @@ Exits 0 on PASS, 1 on FAIL.
11
11
  from __future__ import annotations
12
12
  import argparse, json, pathlib, sys, shutil, datetime
13
13
 
14
+ SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
15
+ if str(SCRIPT_DIR) not in sys.path:
16
+ sys.path.insert(0, str(SCRIPT_DIR))
17
+
18
+ from pair_evidence_contract import reject_json_constant
19
+
20
+
21
def load_dict_json(path: pathlib.Path) -> tuple[dict | None, str | None]:
    """Read *path* as JSON and require the top-level value to be an object.

    Returns ``(data, None)`` on success and ``(None, reason)`` on failure so
    the caller can report a malformed artifact instead of crashing. ``reason``
    is one of ``"unreadable file"``, ``"invalid JSON"`` or ``"expected object"``.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Undecodable bytes cannot be valid JSON text.
        return None, "invalid JSON"
    except OSError:
        # Permission problems, a directory in place of the file, or a file
        # removed after the caller's exists() check must not raise a traceback.
        return None, "unreadable file"
    try:
        # json.JSONDecodeError subclasses ValueError, so one clause covers both.
        # NOTE(review): reject_json_constant comes from pair_evidence_contract
        # and is presumably raising ValueError for NaN/Infinity — confirm there.
        data = json.loads(text, parse_constant=reject_json_constant)
    except ValueError:
        return None, "invalid JSON"
    if not isinstance(data, dict):
        return None, "expected object"
    return data, None
29
+
30
+
31
def object_or_empty(value) -> dict:
    """Return *value* itself when it is a dict; otherwise return a new empty dict."""
    if isinstance(value, dict):
        return value
    return {}
33
+
34
+
35
def rows_from_summary(summary: dict, failures: list[str]) -> list[dict]:
    """Extract the per-fixture row objects from ``summary["rows"]``.

    A missing or non-list ``rows`` value records a failure and yields an
    empty list. Entries that are not dicts are dropped, and a single failure
    is recorded if any entry was dropped. ``failures`` is mutated in place.
    """
    candidates = summary.get("rows")
    if not isinstance(candidates, list):
        failures.append("summary rows missing or malformed — measurement invalid")
        return []

    kept = []
    saw_non_object = False
    for entry in candidates:
        if isinstance(entry, dict):
            kept.append(entry)
        else:
            saw_non_object = True

    if saw_non_object:
        failures.append("summary rows contain non-object entries — measurement invalid")
    return kept
44
+
45
+
46
def int_or_none(value) -> int | None:
    """Return *value* when it is an int (bools excluded); otherwise ``None``."""
    # bool is a subclass of int, so it has to be ruled out explicitly.
    if isinstance(value, bool) or not isinstance(value, int):
        return None
    return value
48
+
49
+
50
def number_or_none(value) -> int | float | None:
    """Return *value* when it is a finite int or float; otherwise ``None``.

    Bools are rejected even though ``bool`` subclasses ``int``. Non-finite
    floats (``inf``, ``-inf``, ``nan``) are rejected as well: an overflowing
    JSON literal such as ``1e999`` parses to ``inf`` without going through
    ``parse_constant``, and ``inf``/``nan`` would otherwise pass threshold
    comparisons such as ``margin < 5`` as if they were healthy values.
    """
    import math  # local import: the module header does not import math

    if isinstance(value, bool):
        return None
    if isinstance(value, float):
        return value if math.isfinite(value) else None
    # Ints are always finite; they are not passed to math.isfinite because
    # it raises OverflowError for ints too large to convert to float.
    return value if isinstance(value, int) else None
54
+
55
+
56
def bool_or_none(value) -> bool | None:
    """Return *value* when it is exactly a bool; otherwise ``None``."""
    if not isinstance(value, bool):
        return None
    return value
58
+
59
+
60
def axis_invalid_count(rows: list[dict], arm: str, failures: list[str]) -> int:
    """Count rows whose *arm* payload reports out-of-range axis cells.

    For each row, reads ``row["arms"][arm]["_axis_validation_out_of_range_count"]``
    (defaulting to 0 when the key is absent). A value that is not a number
    appends a failure message to ``failures``; a positive value adds one to
    the returned count.
    """
    flagged = 0
    for entry in rows:
        arm_payload = object_or_empty(object_or_empty(entry.get("arms")).get(arm))
        cells = number_or_none(arm_payload.get("_axis_validation_out_of_range_count", 0))
        if cells is None:
            failures.append(f"{arm} axis count malformed — measurement invalid")
            continue
        if cells > 0:
            flagged += 1
    return flagged
72
+
73
+
74
def unmapped_axis_invalid_count(rows: list[dict], failures: list[str]) -> int:
    """Count rows reporting out-of-range axis cells not mapped to any arm.

    Reads ``row["_axis_validation_unmapped_out_of_range_count"]`` (defaulting
    to 0 when absent). A non-numeric value appends a failure message to
    ``failures``; a positive value adds one to the returned count.
    """
    flagged = 0
    for entry in rows:
        cells = number_or_none(entry.get("_axis_validation_unmapped_out_of_range_count", 0))
        if cells is None:
            failures.append("unmapped axis count malformed — measurement invalid")
            continue
        if cells > 0:
            flagged += 1
    return flagged
84
+
85
+
86
def is_known_limit(row: dict) -> bool:
    """Report whether the row's ``category`` is "edge" or "known-limit".

    The comparison is case-insensitive; a missing or non-string category
    is never a known limit.
    """
    label = row.get("category")
    if not isinstance(label, str):
        return False
    return label.lower() in ("edge", "known-limit")
90
+
14
91
 
15
92
  def main() -> int:
16
93
  p = argparse.ArgumentParser()
@@ -25,68 +102,134 @@ def main() -> int:
25
102
  summary_p = root / "results" / args.run_id / "summary.json"
26
103
  if not summary_p.exists():
27
104
  print(f"no summary at {summary_p}", file=sys.stderr); return 1
28
- summary = json.loads(summary_p.read_text())
105
+ summary, summary_error = load_dict_json(summary_p)
106
+ if summary is None:
107
+ print(f"measurement invalid: malformed summary.json ({summary_error})", file=sys.stderr)
108
+ return 1
29
109
 
30
110
  baseline_p = root / "history" / "baselines" / "shipped.json"
31
111
  baseline = None
32
112
  if baseline_p.exists():
33
- try:
34
- baseline = json.loads(baseline_p.read_text())
35
- except Exception:
113
+ baseline, _ = load_dict_json(baseline_p)
114
+ if baseline is None:
36
115
  baseline = None
37
116
 
38
117
  failures: list[str] = []
39
118
  warnings: list[str] = []
119
+ rows = rows_from_summary(summary, failures)
40
120
 
41
121
  # Hard floor 1: no disqualifier in variant
42
- if summary["hard_floor_violations"] > 0:
43
- failures.append(f"{summary['hard_floor_violations']} variant disqualifier(s) — see report")
122
+ hard_floor_violations = int_or_none(summary.get("hard_floor_violations"))
123
+ if hard_floor_violations is None:
124
+ failures.append("summary hard_floor_violations missing or malformed — measurement invalid")
125
+ elif hard_floor_violations > 0:
126
+ failures.append(f"{hard_floor_violations} variant disqualifier(s) — see report")
127
+ variant_axis_invalid = axis_invalid_count(rows, "variant", failures)
128
+ if variant_axis_invalid > 0:
129
+ failures.append(
130
+ f"variant axis-invalid: {variant_axis_invalid} fixture(s) have out-of-range axis cells — "
131
+ "re-judge before trusting L2 margins"
132
+ )
133
+ bare_axis_invalid = axis_invalid_count(rows, "bare", failures)
134
+ if bare_axis_invalid > 0:
135
+ failures.append(
136
+ f"bare axis-invalid: {bare_axis_invalid} fixture(s) have out-of-range axis cells — "
137
+ "re-judge before trusting margins"
138
+ )
139
+ unmapped_axis_invalid = unmapped_axis_invalid_count(rows, failures)
140
+ if unmapped_axis_invalid > 0:
141
+ failures.append(
142
+ f"judge axis-invalid unmapped: {unmapped_axis_invalid} fixture(s) have out-of-range axis cells "
143
+ "that could not be mapped to an arm — re-judge before trusting margins"
144
+ )
44
145
 
45
146
  # Hard floor 2: F9 must pass (skipped during bootstrap via --accept-missing)
46
147
  # Variant arm legacy gate kept for L2 baseline comparability.
47
148
  # iter-0033a (2026-04-30): renamed F9 dir from -to-preflight to -to-resolve to
48
149
  # match the shipped 2-skill contract (no preflight). The OLD pre-rename id
49
150
  # is preserved in fixtures/retired/ for replay.
50
- f9_row = next((r for r in summary["rows"] if r.get("fixture") == "F9-e2e-ideate-to-resolve"), None)
151
+ f9_row = next((r for r in rows if r.get("fixture") == "F9-e2e-ideate-to-resolve"), None)
51
152
  if f9_row is None:
52
153
  if not args.accept_missing:
53
154
  failures.append("F9 (E2E novice flow) missing — add fixture or run with --accept-missing")
54
155
  else:
55
- if (f9_row.get("margin") or -999) < 5:
156
+ f9_margin = number_or_none(f9_row.get("margin"))
157
+ if f9_margin is None:
158
+ failures.append("F9 (E2E novice flow) margin missing or malformed — measurement invalid")
159
+ elif f9_margin < 5:
56
160
  failures.append("F9 (E2E novice flow) must have variant margin ≥ +5")
57
161
 
58
- # Hard floor 3: ≥ 7 of 9 gated fixtures with margin ≥ +5
162
+ for row in rows:
163
+ if not is_known_limit(row):
164
+ continue
165
+ margin = number_or_none(row.get("margin"))
166
+ if margin is not None and (margin < -3 or margin > 3):
167
+ warnings.append(
168
+ f"{row.get('fixture')} known-limit margin {margin:+g} outside expected [-3,+3] range"
169
+ )
170
+
171
+ # Hard floor 3: at least 7 gated fixtures with margin ≥ +5
59
172
  # (skipped during bootstrap via --accept-missing)
60
- if summary["gated_fixtures"] > 0 and summary["margin_ge_5_count"] < 7:
173
+ gated_fixtures = int_or_none(summary.get("gated_fixtures"))
174
+ margin_ge_5_count = int_or_none(summary.get("margin_ge_5_count"))
175
+ if gated_fixtures is None or margin_ge_5_count is None:
176
+ failures.append("summary gated fixture counts missing or malformed — measurement invalid")
177
+ elif gated_fixtures > 0 and margin_ge_5_count < 7:
61
178
  if not args.accept_missing:
62
179
  failures.append(
63
- f"only {summary['margin_ge_5_count']} of {summary['gated_fixtures']} "
180
+ f"only {margin_ge_5_count} of {gated_fixtures} "
64
181
  f"gated fixtures have variant margin ≥ +5 (need ≥ 7)"
65
182
  )
66
183
 
67
184
  # iter-0023 — L1 (solo_claude) gates per NORTH-STAR.md ops test #1.
68
185
  # Codex R1 (this iter) caught that ship-gate enforced only legacy L2
69
186
  # `variant` margin and never read `solo_over_bare`. Now NORTH-STAR's
70
- # documented L1 floor (≥ +5, 7/9 fixtures, F9 ≥ +5, no L1
187
+ # documented L1 floor (≥ +5 on at least 7 gated fixtures, F9 ≥ +5, no L1
71
188
  # disqualifier) is mechanically enforced.
72
- arms_present = summary.get("arms_present", {})
73
- margins_avg = summary.get("margins_avg", {})
74
- if arms_present.get("solo_claude"):
189
+ raw_arms_present = summary.get("arms_present")
190
+ if raw_arms_present is not None and not isinstance(raw_arms_present, dict):
191
+ failures.append("summary arms_present malformed — measurement invalid")
192
+ arms_present = object_or_empty(raw_arms_present)
193
+ raw_margins_avg = summary.get("margins_avg")
194
+ margins_avg = object_or_empty(raw_margins_avg)
195
+ raw_solo_present = arms_present.get("solo_claude")
196
+ solo_present = bool_or_none(raw_solo_present)
197
+ if raw_solo_present is not None and solo_present is None:
198
+ failures.append("summary arms_present.solo_claude malformed — measurement invalid")
199
+ if solo_present is True:
200
+ if raw_margins_avg is not None and not isinstance(raw_margins_avg, dict):
201
+ failures.append("summary margins_avg malformed — measurement invalid")
202
+ l1_dq_by_fixture: dict[str, bool] = {}
203
+ for r in rows:
204
+ fixture = str(r.get("fixture"))
205
+ l1 = object_or_empty(object_or_empty(r.get("arms")).get("solo_claude"))
206
+ raw_l1_dq = l1.get("disqualifier")
207
+ parsed_l1_dq = bool_or_none(raw_l1_dq)
208
+ if raw_l1_dq is not None and parsed_l1_dq is None:
209
+ failures.append(f"{fixture} L1 disqualifier malformed — measurement invalid")
210
+ l1_dq_by_fixture[fixture] = True
211
+ else:
212
+ l1_dq_by_fixture[fixture] = parsed_l1_dq is True
213
+
75
214
  l1_avg = margins_avg.get("solo_over_bare")
76
- if l1_avg is not None and l1_avg < 5:
215
+ if l1_avg is not None and number_or_none(l1_avg) is None:
216
+ failures.append("L1 (solo_over_bare) suite avg malformed — measurement invalid")
217
+ elif l1_avg is not None and l1_avg < 5:
77
218
  warnings.append(
78
219
  f"L1 (solo_over_bare) suite avg {l1_avg:+.1f} below NORTH-STAR floor +5 "
79
220
  "(reporting only — per-fixture L1 gates below are decisive)"
80
221
  )
81
222
  # F9 L1 floor
82
223
  if f9_row is not None:
83
- f9_l1 = (f9_row.get("margins") or {}).get("solo_over_bare")
224
+ f9_l1 = object_or_empty(f9_row.get("margins")).get("solo_over_bare")
84
225
  if f9_l1 is None:
85
226
  if not args.accept_missing:
86
227
  failures.append("F9 L1 (solo_over_bare) margin missing — measurement invalid")
228
+ elif number_or_none(f9_l1) is None:
229
+ failures.append("F9 L1 (solo_over_bare) margin malformed — measurement invalid")
87
230
  elif f9_l1 < 5:
88
- failures.append(f"F9 L1 (solo_over_bare) margin {f9_l1:+d} < +5 floor")
89
- # 7-of-9 L1 floor — headroom-aware (added 2026-05-02 per iter-0033 R4
231
+ failures.append(f"F9 L1 (solo_over_bare) margin {f9_l1:+g} < +5 floor")
232
+ # 7-fixture L1 floor — headroom-aware (added 2026-05-02 per iter-0033 R4
90
233
  # Codex collab + NORTH-STAR amendment + RUBRIC hard-floor 3 update).
91
234
  # A fixture is excluded from the denominator when 100 - L0_score < 5
92
235
  # AND L1_score >= 95 AND the L1 arm has no disqualifier / CRITICAL-HIGH
@@ -96,25 +239,26 @@ def main() -> int:
96
239
  l1_ge_5 = 0
97
240
  l1_gated = 0
98
241
  l1_excluded_headroom = []
99
- for r in summary.get("rows", []):
100
- if (r.get("category") or "").lower() == "known-limit":
242
+ for r in rows:
243
+ if is_known_limit(r):
101
244
  continue
102
- arms = r.get("arms") or {}
103
- l0 = arms.get("bare") or {}
104
- l1 = arms.get("solo_claude") or {}
105
- l0_score = l0.get("score")
106
- l1_score = l1.get("score")
107
- m = (r.get("margins") or {}).get("solo_over_bare")
245
+ arms = object_or_empty(r.get("arms"))
246
+ l0 = object_or_empty(arms.get("bare"))
247
+ l1 = object_or_empty(arms.get("solo_claude"))
248
+ l0_score = number_or_none(l0.get("score"))
249
+ l1_score = number_or_none(l1.get("score"))
250
+ m = number_or_none(object_or_empty(r.get("margins")).get("solo_over_bare"))
108
251
  if m is None:
109
252
  continue
110
253
  # Headroom carve-out — must satisfy ALL conditions:
111
254
  # (a) bare ceiling-near (100 - L0 < 5)
112
255
  # (b) L1 also ceiling-near (>=95)
113
256
  # (c) L1 arm clean (no disqualifier, no axis-invalid, fix-loop didn't fail)
114
- l1_dq_here = bool(l1.get("disqualifier"))
115
- l1_axis_inv = (l1.get("_axis_validation_out_of_range_count") or 0) > 0
257
+ l1_dq_here = l1_dq_by_fixture.get(str(r.get("fixture")), False)
258
+ l1_axis_count = number_or_none(l1.get("_axis_validation_out_of_range_count", 0))
259
+ l1_axis_inv = bool(l1_axis_count is not None and l1_axis_count > 0)
116
260
  if (
117
- isinstance(l0_score, (int, float)) and isinstance(l1_score, (int, float))
261
+ l0_score is not None and l1_score is not None
118
262
  and (100 - l0_score) < 5 and l1_score >= 95
119
263
  and not l1_dq_here and not l1_axis_inv
120
264
  ):
@@ -136,14 +280,14 @@ def main() -> int:
136
280
  warnings.append(
137
281
  "L1 headroom-excluded (saturation candidates per RUBRIC two-shipped-version rule): "
138
282
  + ", ".join(
139
- f"{x['fixture']} (L0={x['l0_score']} L1={x['l1_score']} margin={x['margin']:+d})"
283
+ f"{x['fixture']} (L0={x['l0_score']} L1={x['l1_score']} margin={x['margin']:+g})"
140
284
  for x in l1_excluded_headroom
141
285
  )
142
286
  )
143
287
  # L1 disqualifier floor
144
288
  l1_dq = sum(
145
- 1 for r in summary.get("rows", [])
146
- if ((r.get("arms") or {}).get("solo_claude") or {}).get("disqualifier")
289
+ 1 for r in rows
290
+ if l1_dq_by_fixture.get(str(r.get("fixture")), False)
147
291
  )
148
292
  if l1_dq > 0:
149
293
  failures.append(f"L1 disqualifier(s): {l1_dq} solo_claude arm(s) hit a disqualifier")
@@ -151,10 +295,13 @@ def main() -> int:
151
295
  # `_axis_validation` per fixture). If any L1 row has invalid axis data,
152
296
  # the L1 score for that row is not trustworthy.
153
297
  l1_axis_invalid = 0
154
- for r in summary.get("rows", []):
155
- av = (r.get("arms") or {}).get("solo_claude") or {}
298
+ for r in rows:
299
+ av = object_or_empty(object_or_empty(r.get("arms")).get("solo_claude"))
156
300
  inv = av.get("_axis_validation_out_of_range_count")
157
- if inv is not None and inv > 0:
301
+ count = number_or_none(inv)
302
+ if inv is not None and count is None:
303
+ failures.append("L1 axis count malformed — measurement invalid")
304
+ elif count is not None and count > 0:
158
305
  l1_axis_invalid += 1
159
306
  if l1_axis_invalid > 0:
160
307
  failures.append(
@@ -164,31 +311,53 @@ def main() -> int:
164
311
 
165
312
  # Hard floor 4: no per-fixture regression worse than −5 vs shipped baseline
166
313
  if baseline:
167
- prev_rows = {r["fixture"]: r for r in baseline.get("rows", [])}
168
- for r in summary["rows"]:
314
+ prev_rows = {
315
+ r["fixture"]: r for r in baseline.get("rows", [])
316
+ if isinstance(r, dict) and isinstance(r.get("fixture"), str)
317
+ }
318
+ for r in rows:
319
+ if is_known_limit(r):
320
+ continue
169
321
  fid = r.get("fixture")
170
322
  prev = prev_rows.get(fid)
171
- if prev and r.get("variant_score") is not None and prev.get("variant_score") is not None:
172
- delta = r["variant_score"] - prev["variant_score"]
323
+ current_score = number_or_none(r.get("variant_score"))
324
+ previous_score = number_or_none(prev.get("variant_score")) if prev else None
325
+ if prev and current_score is not None and previous_score is not None:
326
+ delta = current_score - previous_score
173
327
  if delta < -5:
174
- failures.append(f"{fid} regressed {delta:+d} vs shipped (floor: −5)")
328
+ failures.append(f"{fid} regressed {delta:+g} vs shipped (floor: −5)")
175
329
 
176
330
  # Soft gate: suite average margin drop > 3
177
331
  if baseline:
178
- margin_delta = summary["margin_avg"] - baseline.get("margin_avg", 0)
179
- if margin_delta < -3:
180
- warnings.append(f"suite margin dropped {margin_delta:+.1f} vs shipped (soft gate: > −3)")
332
+ current_margin_avg = number_or_none(summary.get("margin_avg"))
333
+ baseline_margin_avg = number_or_none(baseline.get("margin_avg"))
334
+ if current_margin_avg is None:
335
+ failures.append("suite margin missing — measurement invalid")
336
+ elif baseline_margin_avg is None:
337
+ warnings.append("shipped baseline margin malformed; skipping suite margin delta")
338
+ else:
339
+ margin_delta = current_margin_avg - baseline_margin_avg
340
+ if margin_delta < -3:
341
+ warnings.append(f"suite margin dropped {margin_delta:+.1f} vs shipped (soft gate: > −3)")
181
342
 
182
343
  # Soft gate: any fixture that was > +5 before is now ≤ 0
183
344
  if baseline:
184
- prev_rows = {r["fixture"]: r for r in baseline.get("rows", [])}
185
- for r in summary["rows"]:
345
+ prev_rows = {
346
+ r["fixture"]: r for r in baseline.get("rows", [])
347
+ if isinstance(r, dict) and isinstance(r.get("fixture"), str)
348
+ }
349
+ for r in rows:
186
350
  fid = r.get("fixture")
187
351
  prev = prev_rows.get(fid)
188
- if prev and (prev.get("margin") or 0) > 5 and (r.get("margin") or 0) <= 0:
189
- warnings.append(
190
- f"{fid} lost its margin: was {prev['margin']:+d}, now {r['margin']:+d}"
191
- )
352
+ prev_margin = number_or_none(prev.get("margin")) if prev else None
353
+ current_margin = number_or_none(r.get("margin"))
354
+ if prev and prev_margin is not None and prev_margin > 5:
355
+ if current_margin is None:
356
+ warnings.append(f"{fid} margin missing; was {prev_margin:+g}")
357
+ elif current_margin <= 0:
358
+ warnings.append(
359
+ f"{fid} lost its margin: was {prev_margin:+g}, now {current_margin:+g}"
360
+ )
192
361
 
193
362
  verdict = "PASS" if not failures else "FAIL"
194
363
  print(f"\n═══ SHIP-GATE VERDICT: {verdict} ═══\n")
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env python3
2
+ """Validate a shadow fixture solo ceiling avoidance note."""
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import pathlib
7
+ import re
8
+ import sys
9
+
10
+
11
# Captures the body of the "## Solo ceiling avoidance" section: everything after
# the heading line up to the next level-2 heading or the end of the text.
SECTION_RE = re.compile(r"(?ms)^##[ \t]+Solo ceiling avoidance\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)")

# Matches a reference to a comparison control: S2..S6, the literal range
# "S2-S6", "solo-saturated", or "rejected control(s)" (case-insensitive).
CONTROL_RE = re.compile(r"\bS[2-6]\b|S2-S6|solo-saturated|rejected controls?", re.IGNORECASE)

# Matches wording that signals reasoning: differ/differs/different/difference,
# unlike, because, preserve, or headroom (case-insensitive).
REASON_RE = re.compile(r"\bdiffer(?:s|ent|ence)?\b|\bunlike\b|\bbecause\b|\bpreserve\b|\bheadroom\b", re.IGNORECASE)
14
+
15
+
16
def read_text(path: pathlib.Path) -> str:
    """Return the UTF-8 decoded contents of *path*.

    On a decode failure or any other OS-level read error, a diagnostic
    prefixed with the path is written to stderr and the process exits
    with status 2.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except UnicodeDecodeError as exc:
        complaint = f"{path}: expected UTF-8 text ({exc})"
    except OSError as exc:
        complaint = f"{path}: unable to read ({exc})"
    else:
        return content

    # Both failure modes share one exit path: report, then stop with code 2.
    print(complaint, file=sys.stderr)
    raise SystemExit(2) from None
25
+
26
+
27
def solo_ceiling_avoidance_error(text: str) -> str | None:
    """Return the first validation problem found in *text*, or None if it passes.

    The text must contain a section matched by ``SECTION_RE``. That section's
    body must then, in order: contain the literal ``solo_claude``, match
    ``CONTROL_RE``, and match ``REASON_RE``. The message for the first failed
    requirement is returned.
    """
    found = SECTION_RE.search(text)
    if found is None:
        return "missing ## Solo ceiling avoidance section"

    body = found.group(1)
    # Ordered (passed, message) pairs; the first failing entry wins.
    requirements = (
        (
            "solo_claude" in body,
            "solo ceiling avoidance must mention solo_claude",
        ),
        (
            CONTROL_RE.search(body) is not None,
            "solo ceiling avoidance must compare against rejected or solo-saturated controls such as S2-S6",
        ),
        (
            REASON_RE.search(body) is not None,
            "solo ceiling avoidance must state difference/headroom reasoning",
        ),
    )
    for satisfied, message in requirements:
        if not satisfied:
            return message
    return None
39
+
40
+
41
def main(argv: list[str]) -> int:
    """Validate the note at the path given in *argv*.

    Returns 0 when ``solo_ceiling_avoidance_error`` reports no problem;
    otherwise prints ``<path>: <problem>`` to stderr and returns 1.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("path", type=pathlib.Path)
    note_path = cli.parse_args(argv).path

    problem = solo_ceiling_avoidance_error(read_text(note_path))
    if not problem:
        return 0
    print(f"{note_path}: {problem}", file=sys.stderr)
    return 1
50
+
51
+
52
+ if __name__ == "__main__":
53
+ raise SystemExit(main(sys.argv[1:]))
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env python3
2
+ """Validate that a pair-candidate fixture states an actionable solo-headroom hypothesis."""
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import pathlib
8
+ import sys
9
+
10
+
11
+ from pair_evidence_contract import (
12
+ actionable_observable_commands,
13
+ has_actionable_solo_headroom_hypothesis_text,
14
+ )
15
+
16
+
17
def combined_text(paths: list[pathlib.Path]) -> str:
    """Return the UTF-8 contents of every regular file in *paths*, newline-joined.

    Paths that are not regular files (including missing paths) are skipped
    silently. Files are concatenated in the order given.

    Raises:
        SystemExit: with status 2, after printing a diagnostic to stderr, when
            a file is not valid UTF-8 or cannot be read at all.
    """
    chunks: list[str] = []
    for path in paths:
        if not path.is_file():
            continue
        try:
            chunks.append(path.read_text(encoding="utf-8"))
        except UnicodeDecodeError as exc:
            print(f"{path}: expected UTF-8 text ({exc})", file=sys.stderr)
            raise SystemExit(2) from None
        except OSError as exc:
            # is_file() can succeed and the read still fail (permissions, or
            # the file vanishing in between); report it like a decode error
            # instead of leaking a traceback.
            print(f"{path}: unable to read ({exc})", file=sys.stderr)
            raise SystemExit(2) from None
    return "\n".join(chunks)
28
+
29
+
30
def has_actionable_hypothesis(text: str) -> bool:
    """Report whether *text* passes the shared solo-headroom hypothesis check.

    Delegates to ``has_actionable_solo_headroom_hypothesis_text`` from
    ``pair_evidence_contract`` and returns its result unchanged.
    """
    verdict = has_actionable_solo_headroom_hypothesis_text(text)
    return verdict
32
+
33
+
34
def expected_commands(path: pathlib.Path) -> set[str]:
    """Return the set of ``cmd`` strings listed under ``verification_commands``.

    *path* must be a UTF-8 JSON file whose top-level value is an object with a
    ``verification_commands`` list; each entry must be an object with a string
    ``cmd``. Duplicate commands collapse into one set member.

    Raises:
        SystemExit: with status 2, after printing a diagnostic to stderr, when
            the file cannot be read, is not UTF-8, is not valid JSON, or does
            not have the shape described above.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except UnicodeDecodeError as exc:
        print(f"{path}: expected UTF-8 JSON ({exc})", file=sys.stderr)
        raise SystemExit(2) from None
    except json.JSONDecodeError as exc:
        print(f"{path}: invalid JSON ({exc})", file=sys.stderr)
        raise SystemExit(2) from None
    except OSError as exc:
        # A missing or unreadable file is a usage error, not a crash.
        print(f"{path}: unable to read ({exc})", file=sys.stderr)
        raise SystemExit(2) from None

    # Valid JSON may still be a list, string or number; only an object has
    # the key looked up below.
    if not isinstance(data, dict):
        print(f"{path}: top-level JSON value must be an object", file=sys.stderr)
        raise SystemExit(2)

    commands = data.get("verification_commands")
    if not isinstance(commands, list):
        print(f"{path}: verification_commands must be a list", file=sys.stderr)
        raise SystemExit(2)

    result: set[str] = set()
    for index, command in enumerate(commands):
        if not isinstance(command, dict) or not isinstance(command.get("cmd"), str):
            print(f"{path}: verification_commands[{index}].cmd must be a string", file=sys.stderr)
            raise SystemExit(2)
        result.add(command["cmd"])
    return result
56
+
57
+
58
def main(argv: list[str]) -> int:
    """Check the given files for an actionable solo-headroom hypothesis.

    Returns 1 when the combined text of ``paths`` fails
    ``has_actionable_hypothesis``. Without ``--expected-json`` a passing text
    returns 0. With it, returns 0 only if at least one command yielded by
    ``actionable_observable_commands`` is among the expected commands, else 1.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--expected-json",
        type=pathlib.Path,
        help="Require the observable hypothesis command to match expected.json verification_commands[].cmd.",
    )
    cli.add_argument("paths", nargs="+", type=pathlib.Path)
    options = cli.parse_args(argv)

    text = combined_text(options.paths)
    if not has_actionable_hypothesis(text):
        return 1
    if options.expected_json is None:
        return 0

    # Load the expected commands first, then look for any overlap.
    allowed = expected_commands(options.expected_json)
    for candidate in actionable_observable_commands(text):
        if candidate in allowed:
            return 0
    return 1
74
+
75
+
76
+ if __name__ == "__main__":
77
+ raise SystemExit(main(sys.argv[1:]))