devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env bash
2
+ # Shared rejected/ceiling pair-candidate fixture registry.
3
+ # Includes active fixtures and calibrated shadow controls that should not spend
4
+ # pair-candidate runs unless explicitly requested for diagnostics.
5
+
6
# Print the recorded reason a fixture is listed in this rejected/ceiling
# pair-candidate registry.
#
# Arguments:
#   $1 - fixture id: either the bare id (e.g. "F2") or the id followed by a
#        "-suffix" (e.g. "F2-cli-medium-subcommand").
# Output:
#   On a match, one line on stdout describing why the fixture is listed.
# Returns:
#   The status of echo (normally 0) on a match; 1, printing nothing, when the
#   id is not in the registry.
rejected_pair_fixture_reason() {
  local fid="$1"
  case "$fid" in
    # Each pattern is "<id>-*|<id>": the dash is required before any suffix,
    # so "F1-*" does not capture F10, F11, F12, ...
    F1-*|F1)
      echo "trivial calibration fixture; every arm is expected to one-shot it"
      ;;
    F2-*|F2)
      echo "bare 83 / solo_claude 95 in 20260512-f2-medium-headroom"
      ;;
    F3-*|F3)
      echo "bare 97 / solo_claude 99 in 20260511-f3-http-error-headroom"
      ;;
    F4-*|F4)
      echo "bare 70 / solo_claude 92 with bare disqualifier in 20260512-f4-web-headroom"
      ;;
    F5-*|F5)
      echo "bare 99 / solo_claude 99 in 20260512-f5-fixloop-headroom"
      ;;
    F6-*|F6)
      echo "bare 97 / solo_claude 96 in 20260512-f6-checksum-headroom"
      ;;
    F7-*|F7)
      echo "bare 99 / solo_claude 100 in 20260512-f7-scope-headroom"
      ;;
    F8-*|F8)
      echo "known-limit ambiguity fixture; expected margin is [-3,+3], not pair-lift evidence"
      ;;
    F9-*|F9)
      echo "bare 60 / solo_claude 90 with bare headroom 0 and bare judge disqualifier in 20260512-f9-e2e-headroom"
      ;;
    F10-*|F10)
      echo "bare 75 / solo_claude 94 in 20260507-f10-f11-tier1-full-pipeline"
      ;;
    F11-*|F11)
      echo "bare 98 / solo_claude 97 in 20260507-f10-f11-tier1-full-pipeline"
      ;;
    F12-*|F12)
      echo "bare 85 / solo_claude 99 in 20260511-f12-webhook-headroom"
      ;;
    F15-*|F15)
      echo "bare 99 / solo_claude 94 in 20260511-f15-concurrency-headroom"
      ;;
    F22-*|F22)
      echo "bare 94 / solo_claude 98 in 20260508-f22-exact-error-headroom"
      ;;
    F26-*|F26)
      echo "solo_claude scored 98 in 20260508-f26-headroom"
      ;;
    F27-*|F27)
      echo "solo_claude scored 94 in 20260511-f27-headroom-smoke-061401"
      ;;
    F28-*|F28)
      echo "corrected-oracle reverify scored solo_claude 98 in 20260511-f28-policy-oraclefix-reverified-pair"
      ;;
    F29-*|F29)
      echo "corrected headroom scored solo_claude 92 in 20260510-f29-headroom-v2"
      ;;
    F30-*|F30)
      echo "solo_claude scored 98 in 20260511-f30-headroom-v1"
      ;;
    F31-*|F31)
      echo "solo_claude scored 98 with bare disqualifiers in 20260512-f31-seat-rebalance-headroom"
      ;;
    F32-*|F32)
      echo "bare 33 / solo_claude 98 in 20260512-f32-subscription-renewal-headroom"
      ;;
    # Shadow-fixture ids (S-prefixed) follow the same "<id>-*|<id>" shape.
    S2-*|S2)
      echo "bare 33 / solo_claude 99 with solo timeout in 20260513-s2-inventory-headroom"
      ;;
    S3-*|S3)
      echo "bare 33 / solo_claude 99 with solo timeout in 20260513-s3-ticket-headroom"
      ;;
    S4-*|S4)
      echo "bare 33 / solo_claude 98 with solo timeout in 20260513-s4-return-headroom"
      ;;
    S5-*|S5)
      echo "bare 33 / solo_claude 98 with solo timeout in 20260513-s5-credit-headroom"
      ;;
    S6-*|S6)
      echo "bare 33 / solo_claude 98 with solo timeout in 20260514-s6-refund-headroom-v1"
      ;;
    # Anything else is not in the registry: signal that with a non-zero status.
    *)
      return 1
      ;;
  esac
}
@@ -0,0 +1,269 @@
1
+ """Shared pair-evidence contract for benchmark audits."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import math
6
+ import re
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
# Pair arm names accepted by normalize_pair_evidence_row; rows naming any
# other arm are rejected.
ALLOWED_PAIR_ARMS = {"l2_risk_probes", "l2_gated"}
# Trigger reasons matched by exact string; normalize_pair_evidence_row requires
# at least one of these in every accepted row.
CANONICAL_PAIR_TRIGGER_REASONS = {
    "mode.verify-only",
    "mode.pair-verify",
    "complexity.high",
    "complexity.large",
    "spec.complexity.high",
    "spec.complexity.large",
    "spec.solo_headroom_hypothesis",
    "risk.high",
    "risk_probes.enabled",
    "risk_probes.present",
    "coverage.failed",
    "mechanical.warning",
    "judge.warning",
}
# Historical spellings matched by exact string.
HISTORICAL_PAIR_TRIGGER_REASON_ALIASES = {
    "risk_profile.high_risk",
    "risk_probes_enabled",
}
# Historical spellings compared against the output of
# normalized_pair_trigger_reason (lower-cased, non-alphanumeric runs collapsed
# to "."), not against the raw reason string.
HISTORICAL_NORMALIZED_PAIR_TRIGGER_REASON_ALIASES = {
    "complexity.high.spec.frontmatter",
    "frontmatter.complexity.high",
    "high.complexity.spec",
    "high.risk.profile",
    "spec.frontmatter.complexity.high",
    "state.complexity.high",
}
# Benchmark readers accept historical aliases only for archived artifacts.
# Runtime /devlyn:resolve state must continue to emit canonical reasons.
# NOTE: this union holds only the exact-match sets; the normalized aliases
# above are not members, although is_known_pair_trigger_reason accepts them.
KNOWN_PAIR_TRIGGER_REASONS = (
    CANONICAL_PAIR_TRIGGER_REASONS | HISTORICAL_PAIR_TRIGGER_REASON_ALIASES
)
# A line must contain one of these substrings (plus "miss") before its
# backticked spans are considered by actionable_observable_commands.
OBSERVABLE_COMMAND_MARKERS = ("command", "observable", "expose")
# One backtick-delimited span that does not cross a line break.
BACKTICKED_TEXT_RE = re.compile(r"`[^`\n]+`")
# Backticked phrases that is_command_like_backtick never counts as commands.
RESERVED_BACKTICK_TERMS = {"solo-headroom hypothesis", "solo_claude", "miss"}
# First words that make a backticked span count as a command in
# is_command_like_backtick.
COMMAND_PREFIXES = {
    "bash",
    "bun",
    "cargo",
    "git",
    "go",
    "jest",
    "make",
    "node",
    "npm",
    "pnpm",
    "printf",
    "pytest",
    "python",
    "python3",
    "ruff",
    "sh",
    "uv",
    "vitest",
    "yarn",
}
68
+
69
+
70
def reject_json_constant(token: str) -> None:
    """Always raise ValueError naming the offending JSON constant *token*.

    Shaped to be passed as ``parse_constant`` to ``json.loads`` so that the
    non-standard constants the decoder would otherwise accept are refused.
    """
    message = f"invalid JSON numeric constant: {token}"
    raise ValueError(message)
72
+
73
+
74
def loads_strict_json_object(text: str) -> dict[str, Any]:
    """Parse *text* as JSON and return it only if the top level is an object.

    Raises:
        ValueError: when the text contains a constant refused by
            reject_json_constant, when it is not valid JSON (json.loads raises
            a ValueError subclass), or when the top-level value is not a dict.
    """
    parsed = json.loads(text, parse_constant=reject_json_constant)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError("top-level JSON value must be an object")
79
+
80
+
81
def normalized_pair_trigger_reason(reason: str) -> str:
    """Return *reason* lower-cased with its ``[a-z0-9]`` runs joined by dots.

    Every run of other characters acts as a single separator, and leading or
    trailing separators are dropped, so ``"Risk_Profile.High-Risk"`` becomes
    ``"risk.profile.high.risk"``.
    """
    alnum_runs = re.findall(r"[a-z0-9]+", reason.lower())
    return ".".join(alnum_runs)
83
+
84
+
85
def is_known_pair_trigger_reason(reason: str) -> bool:
    """Return True when *reason* is canonical or any accepted historical alias.

    Canonical reasons and plain historical aliases are matched on the raw
    string; the normalized historical aliases are matched on the normalized
    form of *reason*.
    """
    # Normalize first, as the raw-string checks below never need it but the
    # final comparison does.
    alias_form = normalized_pair_trigger_reason(reason)
    if reason in CANONICAL_PAIR_TRIGGER_REASONS:
        return True
    if reason in HISTORICAL_PAIR_TRIGGER_REASON_ALIASES:
        return True
    return alias_form in HISTORICAL_NORMALIZED_PAIR_TRIGGER_REASON_ALIASES
92
+
93
+
94
def is_canonical_pair_trigger_reason(reason: str) -> bool:
    """Return True only for an exact member of CANONICAL_PAIR_TRIGGER_REASONS."""
    canonical = CANONICAL_PAIR_TRIGGER_REASONS
    return reason in canonical
96
+
97
+
98
def is_historical_pair_trigger_reason(reason: str) -> bool:
    """Return True when *reason* matches a historical alias.

    The raw string is checked against the plain alias set; its normalized
    form is checked against the normalized alias set.
    """
    alias_form = normalized_pair_trigger_reason(reason)
    if reason in HISTORICAL_PAIR_TRIGGER_REASON_ALIASES:
        return True
    return alias_form in HISTORICAL_NORMALIZED_PAIR_TRIGGER_REASON_ALIASES
104
+
105
+
106
def has_known_pair_trigger_reason(reasons: list[str]) -> bool:
    """Return True when at least one entry of *reasons* is a known reason."""
    for candidate in reasons:
        if is_known_pair_trigger_reason(candidate):
            return True
    return False
108
+
109
+
110
def all_known_pair_trigger_reasons(reasons: list[str]) -> bool:
    """Return True when every entry of *reasons* is a known reason.

    An empty list yields True, since there is no unknown entry in it.
    """
    for candidate in reasons:
        if not is_known_pair_trigger_reason(candidate):
            return False
    return True
112
+
113
+
114
def has_canonical_pair_trigger_reason(reasons: list[str]) -> bool:
    """Return True when at least one entry of *reasons* is canonical."""
    return any(map(is_canonical_pair_trigger_reason, reasons))
116
+
117
+
118
def has_historical_pair_trigger_reason(reasons: list[str]) -> bool:
    """Return True when at least one entry of *reasons* is a historical alias."""
    for candidate in reasons:
        if is_historical_pair_trigger_reason(candidate):
            return True
    return False
120
+
121
+
122
def is_command_like_backtick(value: str) -> bool:
    """Decide whether the text of a backticked span looks like a command.

    Blank text and the reserved terms are never commands.  Otherwise the text
    counts as a command when its first word is a known command prefix, when it
    contains a path/shell character (``/ $ = | && ;``), or when it ends with a
    ``.js``, ``.py`` or ``.sh`` suffix.
    """
    candidate = value.strip()
    if not candidate:
        return False
    lowered = candidate.lower()
    if lowered in RESERVED_BACKTICK_TERMS:
        return False

    leading_word = lowered.split(maxsplit=1)[0]
    if leading_word in COMMAND_PREFIXES:
        return True
    # Shell/path markers are looked for in the original-case text.
    for shell_marker in ("/", "$", "=", "|", "&&", ";"):
        if shell_marker in candidate:
            return True
    return candidate.endswith((".js", ".py", ".sh"))
133
+
134
+
135
def actionable_observable_commands(text: str) -> list[str]:
    """Collect command-like backticked spans from qualifying lines of *text*.

    A line qualifies when, case-insensitively, it contains "miss" and at least
    one of OBSERVABLE_COMMAND_MARKERS.  Spans are returned without their
    backticks, in document order, duplicates included.
    """
    found: list[str] = []
    for raw_line in text.splitlines():
        folded = raw_line.lower()
        if "miss" not in folded:
            continue
        if not any(marker in folded for marker in OBSERVABLE_COMMAND_MARKERS):
            continue
        spans = (
            hit.group(0).strip("`") for hit in BACKTICKED_TEXT_RE.finditer(raw_line)
        )
        found.extend(span for span in spans if is_command_like_backtick(span))
    return found
146
+
147
+
148
def has_actionable_solo_headroom_hypothesis_text(text: str) -> bool:
    """Return True when *text* states an actionable solo-headroom hypothesis.

    The lower-cased text must mention "solo-headroom hypothesis",
    "solo_claude" and "miss", and actionable_observable_commands must find at
    least one command in it.
    """
    folded = text.lower()
    for required_phrase in ("solo-headroom hypothesis", "solo_claude", "miss"):
        if required_phrase not in folded:
            return False
    # Command extraction runs only once all three phrases are present.
    return bool(actionable_observable_commands(text))
156
+
157
+
158
def path_has_actionable_solo_headroom_hypothesis(path: Path) -> bool:
    """Return True when the file at *path* holds an actionable hypothesis.

    The file is read as UTF-8 and handed to
    has_actionable_solo_headroom_hypothesis_text.  A file that cannot be read
    (missing, unreadable, or not valid UTF-8) yields False instead of raising.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError derives from ValueError, not OSError, so it must
        # be named explicitly for undecodable files to be treated like
        # unreadable ones.
        return False
    return has_actionable_solo_headroom_hypothesis_text(text)
164
+
165
+
166
def normalize_pair_evidence_row(
    *,
    fixture: str,
    run_id: str,
    pair_arm: object,
    row: dict[str, Any],
) -> dict[str, Any] | None:
    """Validate one pair-evidence row and return its normalized form.

    Returns None unless all of the following hold:
      * *fixture* and *run_id* are non-empty (``fixture`` is only checked,
        it is not copied into the result);
      * *pair_arm* is a string listed in ALLOWED_PAIR_ARMS;
      * ``bare_score``, ``solo_score`` and ``pair_score`` are valid scores;
      * ``pair_margin`` is a strict int equal to ``pair_score - solo_score``;
      * ``pair_mode`` and ``pair_trigger_eligible`` are exactly ``True``;
      * ``pair_trigger_reasons`` is a non-empty list of known reason strings
        containing at least one canonical reason;
      * ``pair_solo_wall_ratio`` passes is_strict_number.

    The returned dict reuses the row's own ``pair_trigger_reasons`` list
    object rather than copying it.
    """
    # Read every field up front, before any validation.
    bare, solo, pair = (
        row.get(field) for field in ("bare_score", "solo_score", "pair_score")
    )
    margin = row.get("pair_margin")
    mode = row.get("pair_mode")
    eligible = row.get("pair_trigger_eligible")
    reasons = row.get("pair_trigger_reasons")
    ratio = row.get("pair_solo_wall_ratio")

    if not (fixture and run_id):
        return None
    if not (isinstance(pair_arm, str) and pair_arm in ALLOWED_PAIR_ARMS):
        return None
    for score in (bare, solo, pair):
        if not is_score(score):
            return None
    # The subtraction is safe here: both scores were just verified as ints.
    if not is_strict_int(margin) or margin != pair - solo:
        return None
    if mode is not True or eligible is not True:
        return None
    if not isinstance(reasons, list) or not reasons:
        return None
    if not all(isinstance(reason, str) for reason in reasons):
        return None
    if not all_known_pair_trigger_reasons(reasons):
        return None
    if not has_canonical_pair_trigger_reason(reasons):
        return None
    if not is_strict_number(ratio):
        return None

    return {
        "run_id": run_id,
        "pair_arm": pair_arm,
        "bare_score": bare,
        "solo_score": solo,
        "pair_score": pair,
        "pair_margin": margin,
        "pair_mode": mode,
        "pair_trigger_eligible": eligible,
        "pair_trigger_reasons": reasons,
        "pair_trigger_has_canonical_reason": True,
        "pair_trigger_has_hypothesis_reason": (
            "spec.solo_headroom_hypothesis" in reasons
        ),
        "pair_solo_wall_ratio": ratio,
    }
222
+
223
+
224
def best_pair_evidence(evidence: list[object]) -> dict[str, Any] | None:
    """Return the strongest valid normalized row in *evidence*, or None.

    Items that are not dicts, lack a string ``run_id``, or fail
    normalize_pair_evidence_row are ignored.  The surviving rows are ranked by
    ``pair_margin``, then ``pair_score``, then ``run_id``; the highest wins.
    """
    valid_rows: list[dict[str, Any]] = []
    for entry in evidence:
        if not isinstance(entry, dict):
            continue
        if not isinstance(entry.get("run_id"), str):
            continue
        row = normalize_pair_evidence_row(
            fixture="_",  # placeholder: only non-emptiness is required
            run_id=entry["run_id"],
            pair_arm=entry.get("pair_arm"),
            row=entry,
        )
        if row is not None:
            valid_rows.append(row)
    if not valid_rows:
        return None

    def rank(row: dict[str, Any]) -> tuple[int, int, str]:
        margin = row.get("pair_margin")
        score = row.get("pair_score")
        if not isinstance(margin, int):
            margin = -10_000
        if not isinstance(score, int):
            score = -10_000
        return (margin, score, str(row.get("run_id") or ""))

    return max(valid_rows, key=rank)
253
+
254
+
255
def is_strict_int(value: object) -> bool:
    """Return True for ints, excluding bools (which subclass int)."""
    if isinstance(value, bool):
        return False
    return isinstance(value, int)
257
+
258
+
259
def is_score(value: object) -> bool:
    """Return True when *value* is a strict int in the range 0..100 inclusive."""
    if not is_strict_int(value):
        return False
    return 0 <= value <= 100
261
+
262
+
263
def is_strict_number(value: object) -> bool:
    """Return True when *value* is a finite, strictly positive int or float.

    Booleans are rejected even though ``bool`` subclasses ``int``.  Zero,
    negative numbers, NaN and infinities are rejected.  Arbitrarily large
    ints are accepted rather than raising.
    """
    if isinstance(value, bool):
        return False
    if isinstance(value, int):
        # Ints are always finite.  Skipping math.isfinite here avoids the
        # OverflowError it raises for ints too large to convert to float.
        return value > 0
    if isinstance(value, float):
        return math.isfinite(value) and value > 0
    return False
@@ -17,8 +17,12 @@ import subprocess
17
17
  from pathlib import Path
18
18
  from typing import Any
19
19
 
20
+ from pair_evidence_contract import loads_strict_json_object
21
+
20
22
 
21
23
  SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
24
+ SAFE_REPO = re.compile(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$")
25
+ SAFE_COMMIT = re.compile(r"^[0-9a-fA-F]{7,40}$")
22
26
 
23
27
 
24
28
  def run(cmd: list[str], cwd: Path | None = None) -> None:
@@ -26,11 +30,12 @@ def run(cmd: list[str], cwd: Path | None = None) -> None:
26
30
 
27
31
 
28
32
  def read_json(path: Path) -> dict[str, Any]:
29
- with path.open(encoding="utf8") as f:
30
- data = json.load(f)
31
- if not isinstance(data, dict):
32
- raise ValueError(f"expected JSON object: {path}")
33
- return data
33
+ try:
34
+ return loads_strict_json_object(path.read_text(encoding="utf8"))
35
+ except ValueError as exc:
36
+ if str(exc) == "top-level JSON value must be an object":
37
+ raise ValueError(f"expected JSON object: {path}") from exc
38
+ raise
34
39
 
35
40
 
36
41
  def require_text(instance: dict[str, Any], key: str) -> str:
@@ -40,14 +45,38 @@ def require_text(instance: dict[str, Any], key: str) -> str:
40
45
  return value.strip()
41
46
 
42
47
 
48
def positive_int(value: str) -> int:
    """Argparse ``type=`` converter accepting only integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not parseable by ``int()``
            or the parsed number is zero or negative.
    """
    try:
        number = int(value)
    except ValueError as exc:
        raise argparse.ArgumentTypeError("must be an integer") from exc
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be > 0")
56
+
57
+
58
def require_safe_repo(instance: dict[str, Any]) -> str:
    """Return the instance's ``repo`` field after checking it against ``SAFE_REPO``.

    ``SAFE_REPO`` accepts exactly two ``/``-separated segments made of ASCII
    letters, digits, ``_``, ``.`` and ``-``.

    Raises:
        ValueError: if the repo text does not match ``SAFE_REPO``.  The field
            is fetched via ``require_text``, which presumably raises when it
            is missing or empty.
    """
    repo = require_text(instance, "repo")
    if SAFE_REPO.match(repo):
        return repo
    raise ValueError(f"unsafe SWE-bench repo: {repo!r}")
63
+
64
+
65
def require_safe_base_commit(instance: dict[str, Any]) -> str:
    """Return the instance's ``base_commit`` after checking it against ``SAFE_COMMIT``.

    ``SAFE_COMMIT`` accepts 7 to 40 hexadecimal characters.

    Raises:
        ValueError: if the commit text does not match ``SAFE_COMMIT``.  The
            field is fetched via ``require_text``, which presumably raises
            when it is missing or empty.
    """
    base_commit = require_text(instance, "base_commit")
    if SAFE_COMMIT.match(base_commit):
        return base_commit
    raise ValueError(f"unsafe SWE-bench base_commit: {base_commit!r}")
70
+
71
+
43
72
  def repo_cache_name(repo: str, base_commit: str) -> str:
44
73
  safe_repo = repo.replace("/", "__")
45
74
  return f"{safe_repo}-{base_commit[:12]}"
46
75
 
47
76
 
48
77
  def prepare_repo(instance: dict[str, Any], repo_dir: Path | None, repos_root: Path) -> Path:
49
- repo = require_text(instance, "repo")
50
- base_commit = require_text(instance, "base_commit")
78
+ repo = require_safe_repo(instance)
79
+ base_commit = require_safe_base_commit(instance)
51
80
  repos_root.mkdir(parents=True, exist_ok=True)
52
81
  dest = repos_root / repo_cache_name(repo, base_commit)
53
82
 
@@ -72,8 +101,8 @@ def write_case_files(
72
101
  timeout_seconds: int,
73
102
  ) -> None:
74
103
  instance_id = require_text(instance, "instance_id")
75
- repo = require_text(instance, "repo")
76
- base_commit = require_text(instance, "base_commit")
104
+ repo = require_safe_repo(instance)
105
+ base_commit = require_safe_base_commit(instance)
77
106
  problem = require_text(instance, "problem_statement")
78
107
  case_dir.mkdir(parents=True, exist_ok=True)
79
108
 
@@ -196,7 +225,7 @@ def main() -> int:
196
225
  type=Path,
197
226
  help="Local clone/source repo to copy instead of cloning GitHub; useful for tests and cached runs.",
198
227
  )
199
- parser.add_argument("--timeout-seconds", type=int, default=2400)
228
+ parser.add_argument("--timeout-seconds", type=positive_int, default=2400)
200
229
  args = parser.parse_args()
201
230
 
202
231
  instance = read_json(args.instance_json)
@@ -10,6 +10,8 @@ import tempfile
10
10
  from pathlib import Path
11
11
  from typing import Any
12
12
 
13
+ from pair_evidence_contract import loads_strict_json_object, reject_json_constant
14
+
13
15
 
14
16
  def read_jsonl(path: Path) -> list[dict[str, Any]]:
15
17
  rows: list[dict[str, Any]] = []
@@ -17,7 +19,7 @@ def read_jsonl(path: Path) -> list[dict[str, Any]]:
17
19
  for line_no, line in enumerate(f, start=1):
18
20
  if not line.strip():
19
21
  continue
20
- value = json.loads(line)
22
+ value = json.loads(line, parse_constant=reject_json_constant)
21
23
  if not isinstance(value, dict):
22
24
  raise ValueError(f"{path}:{line_no}: expected JSON object")
23
25
  rows.append(value)
@@ -31,6 +33,32 @@ def require_text(row: dict[str, Any], key: str, source: str) -> str:
31
33
  return value.strip()
32
34
 
33
35
 
36
def positive_int(value: str) -> int:
    """Convert a command-line string to an int and require it to be > 0.

    Intended for use as an argparse ``type=`` callable.

    Raises:
        argparse.ArgumentTypeError: with "must be an integer" when ``int()``
            rejects the text, or "must be > 0" for zero and negative values.
    """
    try:
        result = int(value)
    except ValueError as exc:
        raise argparse.ArgumentTypeError("must be an integer") from exc
    if not result > 0:
        raise argparse.ArgumentTypeError("must be > 0")
    return result
44
+
45
+
46
def parse_prepared_case(stdout: str, source: str) -> dict[str, Any]:
    """Parse and validate the JSON object printed by the case-preparation script.

    Args:
        stdout: Raw text to parse with ``loads_strict_json_object``.
        source: Label prefixed to every error message raised here.

    Returns:
        The parsed object, once ``instance_id``, ``case_dir`` and ``repo_dir``
        are confirmed to be non-blank strings and ``run_command`` a non-empty
        list.

    Raises:
        ValueError: if the top-level value is not an object, if a required
            key is missing or empty, or for any other parser ``ValueError``
            (propagated unchanged).
    """
    try:
        case = loads_strict_json_object(stdout)
    except ValueError as exc:
        if str(exc) != "top-level JSON value must be an object":
            raise
        raise ValueError(f"{source}: expected JSON object") from exc

    # String fields are checked first, in this order, then run_command.
    for field in ("instance_id", "case_dir", "repo_dir"):
        text = case.get(field)
        if not isinstance(text, str) or not text.strip():
            raise ValueError(f"{source}: missing non-empty {field!r}")

    command = case.get("run_command")
    if not isinstance(command, list) or not command:
        raise ValueError(f"{source}: missing non-empty {'run_command'!r}")
    return case
60
+
61
+
34
62
  def main() -> int:
35
63
  parser = argparse.ArgumentParser()
36
64
  parser.add_argument("--instances-jsonl", required=True, type=Path)
@@ -47,8 +75,8 @@ def main() -> int:
47
75
  )
48
76
  parser.add_argument("--repo-dir", type=Path, help="Use one local repo clone for every selected instance.")
49
77
  parser.add_argument("--instance-id", action="append", help="Prepare only these instance ids.")
50
- parser.add_argument("--limit", type=int, help="Prepare at most N matched instances after filtering.")
51
- parser.add_argument("--timeout-seconds", type=int, default=2400)
78
+ parser.add_argument("--limit", type=positive_int, help="Prepare at most N matched instances after filtering.")
79
+ parser.add_argument("--timeout-seconds", type=positive_int, default=2400)
52
80
  parser.add_argument("--out-manifest", type=Path)
53
81
  args = parser.parse_args()
54
82
 
@@ -61,6 +89,8 @@ def main() -> int:
61
89
  predictions[instance_id] = row
62
90
 
63
91
  selected_ids = args.instance_id or list(predictions)
92
+ if not selected_ids:
93
+ raise ValueError("no prediction instances selected")
64
94
  script = Path(__file__).with_name("prepare-swebench-frozen-case.py")
65
95
  prepared: list[dict[str, Any]] = []
66
96
  with tempfile.TemporaryDirectory() as tmp:
@@ -98,7 +128,7 @@ def main() -> int:
98
128
  if args.repo_dir is not None:
99
129
  cmd.extend(["--repo-dir", str(args.repo_dir)])
100
130
  completed = subprocess.run(cmd, check=True, text=True, capture_output=True)
101
- prepared.append(json.loads(completed.stdout))
131
+ prepared.append(parse_prepared_case(completed.stdout, f"prepared case {instance_id}"))
102
132
 
103
133
  manifest = {
104
134
  "instances_jsonl": str(args.instances_jsonl),
@@ -11,8 +11,12 @@ import subprocess
11
11
  from pathlib import Path
12
12
  from typing import Any
13
13
 
14
+ from pair_evidence_contract import reject_json_constant
15
+
14
16
 
15
17
  SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
18
+ SAFE_REPO = re.compile(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$")
19
+ SAFE_COMMIT = re.compile(r"^[0-9a-fA-F]{7,40}$")
16
20
 
17
21
 
18
22
  def run(cmd: list[str], cwd: Path | None = None) -> None:
@@ -25,7 +29,7 @@ def read_instances(path: Path) -> list[dict[str, Any]]:
25
29
  for line_no, line in enumerate(f, start=1):
26
30
  if not line.strip():
27
31
  continue
28
- value = json.loads(line)
32
+ value = json.loads(line, parse_constant=reject_json_constant)
29
33
  if not isinstance(value, dict):
30
34
  raise ValueError(f"{path}:{line_no}: expected JSON object")
31
35
  rows.append(value)
@@ -39,6 +43,20 @@ def require_text(instance: dict[str, Any], key: str) -> str:
39
43
  return value.strip()
40
44
 
41
45
 
46
def require_safe_repo(instance: dict[str, Any]) -> str:
    """Fetch ``repo`` from ``instance`` and reject values outside ``SAFE_REPO``.

    ``SAFE_REPO`` permits two ``/``-separated segments built from ASCII
    letters, digits, ``_``, ``.`` and ``-``.

    Raises:
        ValueError: when the repo text fails the ``SAFE_REPO`` check.
    """
    repo = require_text(instance, "repo")
    is_safe = SAFE_REPO.match(repo) is not None
    if is_safe:
        return repo
    raise ValueError(f"unsafe SWE-bench repo: {repo!r}")
51
+
52
+
53
def require_safe_base_commit(instance: dict[str, Any]) -> str:
    """Fetch ``base_commit`` from ``instance`` and reject values outside ``SAFE_COMMIT``.

    ``SAFE_COMMIT`` permits 7 to 40 hexadecimal characters.

    Raises:
        ValueError: when the commit text fails the ``SAFE_COMMIT`` check.
    """
    base_commit = require_text(instance, "base_commit")
    is_safe = SAFE_COMMIT.match(base_commit) is not None
    if is_safe:
        return base_commit
    raise ValueError(f"unsafe SWE-bench base_commit: {base_commit!r}")
58
+
59
+
42
60
  def pick_instance(path: Path, instance_id: str) -> dict[str, Any]:
43
61
  matches = [row for row in read_instances(path) if row.get("instance_id") == instance_id]
44
62
  if len(matches) != 1:
@@ -51,8 +69,8 @@ def repo_cache_name(repo: str, base_commit: str) -> str:
51
69
 
52
70
 
53
71
  def prepare_repo(instance: dict[str, Any], repos_root: Path) -> Path:
54
- repo = require_text(instance, "repo")
55
- base_commit = require_text(instance, "base_commit")
72
+ repo = require_safe_repo(instance)
73
+ base_commit = require_safe_base_commit(instance)
56
74
  repos_root.mkdir(parents=True, exist_ok=True)
57
75
  dest = repos_root / repo_cache_name(repo, base_commit)
58
76
 
@@ -77,8 +95,8 @@ def copy_worktree(repo_path: Path, worktree: Path) -> None:
77
95
 
78
96
  def write_spec(instance: dict[str, Any], worktree: Path) -> Path:
79
97
  instance_id = require_text(instance, "instance_id")
80
- repo = require_text(instance, "repo")
81
- base_commit = require_text(instance, "base_commit")
98
+ repo = require_safe_repo(instance)
99
+ base_commit = require_safe_base_commit(instance)
82
100
  problem = require_text(instance, "problem_statement")
83
101
  spec_path = worktree / "docs" / "roadmap" / "phase-1" / f"{instance_id}.md"
84
102
  spec_path.parent.mkdir(parents=True, exist_ok=True)