devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -25,6 +25,24 @@ EOF
25
25
  exit "${1:-1}"
26
26
  }
27
27
 
28
+ require_value() {
29
+ local flag="$1"
30
+ local value="${2:-}"
31
+ if [ -z "$value" ] || [[ "$value" == --* ]]; then
32
+ echo "$flag requires a value" >&2
33
+ exit 1
34
+ fi
35
+ }
36
+
37
+ require_safe_id() {
38
+ local label="$1"
39
+ local value="$2"
40
+ if [[ ! "$value" =~ ^[A-Za-z0-9_.-]+$ ]]; then
41
+ echo "$label must match [A-Za-z0-9_.-]+: $value" >&2
42
+ exit 1
43
+ fi
44
+ }
45
+
28
46
  FIXTURE=""
29
47
  DIFF_PATH=""
30
48
  RUN_ID=""
@@ -36,13 +54,13 @@ TIMEOUT_OVERRIDE=""
36
54
  RESUME_COMPLETED_ARMS=0
37
55
  while [ $# -gt 0 ]; do
38
56
  case "$1" in
39
- --fixture) FIXTURE="$2"; shift 2;;
40
- --diff) DIFF_PATH="$2"; shift 2;;
41
- --run-id) RUN_ID="$2"; shift 2;;
42
- --pair-mode) PAIR_MODE="$2"; shift 2;;
43
- --fixtures-root) FIXTURES_ROOT="$2"; shift 2;;
44
- --base-repo) BASE_REPO="$2"; shift 2;;
45
- --timeout-seconds) TIMEOUT_OVERRIDE="$2"; shift 2;;
57
+ --fixture) require_value "$1" "${2:-}"; FIXTURE="$2"; shift 2;;
58
+ --diff) require_value "$1" "${2:-}"; DIFF_PATH="$2"; shift 2;;
59
+ --run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
60
+ --pair-mode) require_value "$1" "${2:-}"; PAIR_MODE="$2"; shift 2;;
61
+ --fixtures-root) require_value "$1" "${2:-}"; FIXTURES_ROOT="$2"; shift 2;;
62
+ --base-repo) require_value "$1" "${2:-}"; BASE_REPO="$2"; shift 2;;
63
+ --timeout-seconds) require_value "$1" "${2:-}"; TIMEOUT_OVERRIDE="$2"; shift 2;;
46
64
  --prepare-only) PREPARE_ONLY=1; shift;;
47
65
  --resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift;;
48
66
  -h|--help) usage 0;;
@@ -51,6 +69,7 @@ while [ $# -gt 0 ]; do
51
69
  done
52
70
 
53
71
  [ -n "$FIXTURE" ] && [ -n "$DIFF_PATH" ] || usage 1
72
+ require_safe_id "--fixture" "$FIXTURE"
54
73
  [ -f "$DIFF_PATH" ] || { echo "diff not found: $DIFF_PATH" >&2; exit 1; }
55
74
  [ -s "$DIFF_PATH" ] || { echo "diff is empty: $DIFF_PATH" >&2; exit 1; }
56
75
  [ "$PAIR_MODE" = "forced" ] || [ "$PAIR_MODE" = "gated" ] || { echo "--pair-mode must be forced|gated (got '$PAIR_MODE')" >&2; exit 1; }
@@ -74,7 +93,20 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK" "$SETUP"; do
74
93
  [ -f "$f" ] || { echo "fixture missing required file: $f" >&2; exit 1; }
75
94
  done
76
95
 
77
- TIMEOUT=$(python3 -c "import json; print(json.load(open('$META'))['timeout_seconds'])")
96
+ TIMEOUT=$(python3 - "$META" "$BENCH_ROOT/scripts" <<'PY'
97
+ import pathlib
98
+ import sys
99
+
100
+ sys.path.insert(0, sys.argv[2])
101
+ from pair_evidence_contract import loads_strict_json_object
102
+
103
+ metadata = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
104
+ timeout = metadata.get("timeout_seconds")
105
+ if not isinstance(timeout, int) or isinstance(timeout, bool) or timeout <= 0:
106
+ raise SystemExit("metadata timeout_seconds must be a positive integer")
107
+ print(timeout)
108
+ PY
109
+ )
78
110
  if [ -n "$TIMEOUT_OVERRIDE" ]; then
79
111
  case "$TIMEOUT_OVERRIDE" in ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;; esac
80
112
  [ "$TIMEOUT_OVERRIDE" -gt 0 ] || { echo "--timeout-seconds must be > 0" >&2; exit 1; }
@@ -85,10 +117,28 @@ if [ -z "$RUN_ID" ]; then
85
117
  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
86
118
  RUN_ID="${TS}-${SHA}-frozen-verify"
87
119
  fi
120
+ require_safe_id "--run-id" "$RUN_ID"
88
121
 
89
122
  RESULT_ROOT="$BENCH_ROOT/results/$RUN_ID"
90
123
  mkdir -p "$RESULT_ROOT"
91
124
 
125
+ print_command() {
126
+ local cmd=(bash "$0"
127
+ --fixture "$FIXTURE"
128
+ --fixtures-root "$FIXTURES_ROOT"
129
+ --base-repo "$BASE_REPO"
130
+ --diff "$DIFF_PATH"
131
+ --run-id "$RUN_ID"
132
+ --pair-mode "$PAIR_MODE"
133
+ --timeout-seconds "$TIMEOUT"
134
+ )
135
+ [ "$PREPARE_ONLY" -eq 0 ] || cmd+=(--prepare-only)
136
+ [ "$RESUME_COMPLETED_ARMS" -eq 0 ] || cmd+=(--resume-completed-arms)
137
+ printf 'Command: '
138
+ printf '%q ' "${cmd[@]}"
139
+ printf '\n'
140
+ }
141
+
92
142
  echo ""
93
143
  echo "═══ Frozen Verify Pair Run ═══"
94
144
  echo "Run-id: $RUN_ID"
@@ -99,6 +149,7 @@ echo "Diff: $DIFF_PATH"
99
149
  echo "Pair: $PAIR_MODE"
100
150
  echo "Timeout: ${TIMEOUT}s per arm"
101
151
  [ "$PREPARE_ONLY" -eq 0 ] || echo "Mode: prepare-only"
152
+ print_command
102
153
  echo ""
103
154
 
104
155
  mirror_skills() {
@@ -195,17 +246,36 @@ summarize_arm() {
195
246
  local result_dir="$1"
196
247
  local elapsed="$2"
197
248
  local invoke_exit="$3"
198
- python3 - "$result_dir" "$elapsed" "$invoke_exit" <<'PY'
249
+ python3 - "$result_dir" "$elapsed" "$invoke_exit" "$BENCH_ROOT/scripts" <<'PY'
199
250
  import json, pathlib, sys
251
+ sys.path.insert(0, sys.argv[4])
252
+ from pair_evidence_contract import loads_strict_json_object, reject_json_constant
253
+
200
254
  result_dir = pathlib.Path(sys.argv[1])
201
255
  elapsed = int(sys.argv[2])
202
256
  invoke_exit = int(sys.argv[3])
203
257
  archive = result_dir / "run-archive"
204
258
  state_path = archive / "pipeline.state.json"
205
- state = json.loads(state_path.read_text()) if state_path.is_file() else {}
206
- verify = ((state.get("phases") or {}).get("verify") or {})
259
+ def as_dict(value):
260
+ return value if isinstance(value, dict) else {}
261
+
262
+ def strict_nonnegative_int(value):
263
+ return isinstance(value, int) and not isinstance(value, bool) and value >= 0
264
+
265
+ state = as_dict(loads_strict_json_object(state_path.read_text())) if state_path.is_file() else {}
266
+ phases = as_dict(state.get("phases"))
267
+ verify = as_dict(phases.get("verify"))
268
+ legacy_verify = as_dict(state.get("verify"))
207
269
  sub_verdicts = verify.get("sub_verdicts")
208
- pair_trigger = verify.get("pair_trigger") or ((state.get("verify") or {}).get("pair_trigger"))
270
+ pair_trigger = verify.get("pair_trigger") or legacy_verify.get("pair_trigger")
271
+ PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}
272
+
273
+ def has_pair_judge_verdict(sub_verdicts):
274
+ return isinstance(sub_verdicts, dict) and (
275
+ sub_verdicts.get("judge_codex") in PAIR_VERDICTS
276
+ or sub_verdicts.get("pair_judge") in PAIR_VERDICTS
277
+ )
278
+
209
279
  findings = []
210
280
  finding_paths = []
211
281
  merged_path = archive / "verify-merged.findings.jsonl"
@@ -231,7 +301,7 @@ for findings_path in finding_paths:
231
301
  for line in findings_path.read_text().splitlines():
232
302
  if line.strip():
233
303
  try:
234
- parsed = json.loads(line)
304
+ parsed = json.loads(line, parse_constant=reject_json_constant)
235
305
  except json.JSONDecodeError:
236
306
  continue
237
307
  if not isinstance(parsed, dict):
@@ -240,9 +310,10 @@ for findings_path in finding_paths:
240
310
  if sev not in finding_severities:
241
311
  continue
242
312
  findings.append(parsed)
243
- merged = verify.get("merged") if isinstance(verify.get("merged"), dict) else {}
313
+ merged = as_dict(verify.get("merged"))
244
314
  merged_findings_count = sum(
245
- int(merged.get(k) or 0) for k in ("critical", "high", "medium", "low")
315
+ merged.get(k) if strict_nonnegative_int(merged.get(k)) else 0
316
+ for k in ("critical", "high", "medium", "low")
246
317
  )
247
318
  findings_count = len(findings) if findings else merged_findings_count
248
319
  severity_counts = {}
@@ -262,15 +333,12 @@ summary = {
262
333
  "invoke_exit": invoke_exit,
263
334
  "timed_out": invoke_exit == 124,
264
335
  "invoke_failure_reason": invoke_failure_reason,
265
- "terminal_verdict": ((state.get("phases") or {}).get("final_report") or {}).get("verdict"),
336
+ "terminal_verdict": as_dict(phases.get("final_report")).get("verdict"),
266
337
  "verify_verdict": verify.get("verdict"),
267
338
  "sub_verdicts": sub_verdicts,
268
339
  "pair_trigger": pair_trigger,
269
- "pair_mode": bool(isinstance(sub_verdicts, dict) and (
270
- sub_verdicts.get("judge_codex") is not None
271
- or sub_verdicts.get("pair_judge") is not None
272
- ))
273
- or bool(verify.get("pair_mode")),
340
+ "pair_mode": has_pair_judge_verdict(sub_verdicts)
341
+ or verify.get("pair_mode") is True,
274
342
  "verify_findings_count": findings_count,
275
343
  "verify_findings_source": findings_source if finding_paths else (
276
344
  "state.merged" if merged_findings_count else "missing"
@@ -302,11 +370,13 @@ run_arm() {
302
370
  local result_dir="$RESULT_ROOT/$arm"
303
371
  local work_dir="/tmp/bench-${RUN_ID}-${FIXTURE}-${arm}"
304
372
  if [ "$RESUME_COMPLETED_ARMS" -eq 1 ] && [ "$PREPARE_ONLY" -eq 0 ] && [ -f "$result_dir/summary.json" ]; then
305
- if python3 - "$result_dir/summary.json" <<'PY'
306
- import json
373
+ if python3 - "$result_dir/summary.json" "$BENCH_ROOT/scripts" <<'PY'
374
+ import pathlib
307
375
  import sys
376
+ sys.path.insert(0, sys.argv[2])
377
+ from pair_evidence_contract import loads_strict_json_object
308
378
 
309
- summary = json.load(open(sys.argv[1]))
379
+ summary = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
310
380
  raise SystemExit(0 if summary.get("invoke_exit") == 0 else 1)
311
381
  PY
312
382
  then
@@ -330,9 +400,12 @@ PY
330
400
  mkdir -p "$work_dir/docs/roadmap/phase-1" "$work_dir/.devlyn"
331
401
  cp "$SPEC" "$work_dir/docs/roadmap/phase-1/$FIXTURE.md"
332
402
  cp "$DIFF_PATH" "$work_dir/.devlyn/external-diff.patch"
333
- python3 - "$EXPECTED" "$work_dir/.devlyn/spec-verify.json" <<'PY'
334
- import json, os, sys
335
- expected = json.load(open(sys.argv[1]))
403
+ python3 - "$EXPECTED" "$work_dir/.devlyn/spec-verify.json" "$BENCH_ROOT/scripts" <<'PY'
404
+ import json, os, pathlib, sys
405
+ sys.path.insert(0, sys.argv[3])
406
+ from pair_evidence_contract import loads_strict_json_object
407
+
408
+ expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
336
409
  out_path = sys.argv[2]
337
410
  commands = expected.get("verification_commands", [])
338
411
  if not commands:
@@ -462,14 +535,17 @@ else
462
535
  run_arm pair ""
463
536
  fi
464
537
 
465
- python3 - "$RESULT_ROOT" "$PAIR_MODE" <<'PY'
466
- import json, pathlib, sys
538
+ python3 - "$RESULT_ROOT" "$PAIR_MODE" "$BENCH_ROOT/scripts" <<'PY'
539
+ import json, math, pathlib, sys
540
+ sys.path.insert(0, sys.argv[3])
541
+ from pair_evidence_contract import loads_strict_json_object
542
+
467
543
  root = pathlib.Path(sys.argv[1])
468
544
  pair_mode_requested = sys.argv[2]
469
545
  out = {}
470
546
  for arm in ("solo", "pair"):
471
547
  path = root / arm / "summary.json"
472
- out[arm] = json.loads(path.read_text()) if path.is_file() else {"missing": True}
548
+ out[arm] = loads_strict_json_object(path.read_text()) if path.is_file() else {"missing": True}
473
549
  solo = out.get("solo", {})
474
550
  pair = out.get("pair", {})
475
551
  rank = {
@@ -481,31 +557,127 @@ rank = {
481
557
  }
482
558
  solo_rank = rank.get(solo.get("verify_verdict"), 0)
483
559
  pair_rank = rank.get(pair.get("verify_verdict"), 0)
484
- pair_sub = pair.get("sub_verdicts") or {}
560
+ raw_pair_sub = pair.get("sub_verdicts")
561
+ pair_sub = raw_pair_sub if isinstance(raw_pair_sub, dict) else {}
485
562
  pair_primary_verdict = pair_sub.get("judge")
486
563
  pair_judge_verdict = pair_sub.get("pair_judge")
487
564
  pair_primary_rank = rank.get(pair_primary_verdict, 0)
488
565
  pair_judge_rank = rank.get(pair_judge_verdict, 0)
566
+ def strict_positive_number(value):
567
+ return (
568
+ isinstance(value, (int, float))
569
+ and not isinstance(value, bool)
570
+ and math.isfinite(value)
571
+ and value > 0
572
+ )
573
+
574
+ def elapsed_ratio(pair_elapsed, solo_elapsed):
575
+ if not strict_positive_number(pair_elapsed) or not strict_positive_number(solo_elapsed):
576
+ return None
577
+ return pair_elapsed / solo_elapsed
578
+
579
+ def strict_nonnegative_int(value):
580
+ return isinstance(value, int) and not isinstance(value, bool) and value >= 0
581
+
582
+ def summary_findings_count(data):
583
+ value = data.get("verify_findings_count")
584
+ return value if strict_nonnegative_int(value) else None
585
+
586
+ def severity_count_sum(data):
587
+ raw_counts = data.get("severity_counts")
588
+ if not isinstance(raw_counts, dict):
589
+ return None
590
+ total = 0
591
+ for key in ("LOW", "MEDIUM", "HIGH", "CRITICAL"):
592
+ value = raw_counts.get(key, 0)
593
+ if not strict_nonnegative_int(value):
594
+ return None
595
+ total += value
596
+ return total
597
+
598
+ def strict_greater(left, right):
599
+ return left is not None and right is not None and left > right
600
+
601
+ wall_ratio = elapsed_ratio(pair.get("elapsed_seconds"), solo.get("elapsed_seconds"))
602
+ pair_mode_true = pair.get("pair_mode") is True
603
+ raw_pair_trigger = pair.get("pair_trigger")
604
+ pair_trigger = raw_pair_trigger if isinstance(raw_pair_trigger, dict) else {}
605
+ pair_findings_count = summary_findings_count(pair)
606
+ solo_findings_count = summary_findings_count(solo)
607
+ pair_low_or_worse = severity_count_sum(pair)
608
+ solo_low_or_worse = severity_count_sum(solo)
489
609
  out["comparison"] = {
490
610
  "pair_mode_requested": pair_mode_requested,
491
611
  "pair_trigger_missed": bool(
492
612
  pair_mode_requested == "gated"
493
- and (pair.get("pair_trigger") or {}).get("eligible") is True
494
- and (pair.get("pair_trigger") or {}).get("reasons")
495
- and not pair.get("pair_mode")
613
+ and pair_trigger.get("eligible") is True
614
+ and pair_trigger.get("reasons")
615
+ and not pair_mode_true
496
616
  ),
497
- "pair_found_more_findings": (pair.get("verify_findings_count") or 0) > (solo.get("verify_findings_count") or 0),
498
- "pair_found_more_low_or_worse": sum((pair.get("severity_counts") or {}).get(k, 0) for k in ("LOW", "MEDIUM", "HIGH", "CRITICAL"))
499
- > sum((solo.get("severity_counts") or {}).get(k, 0) for k in ("LOW", "MEDIUM", "HIGH", "CRITICAL")),
500
- "pair_verdict_lift": bool(pair.get("pair_mode")) and pair_rank > solo_rank and pair_rank >= rank["NEEDS_WORK"],
501
- "pair_internal_verdict_lift": bool(pair.get("pair_mode"))
617
+ "pair_found_more_findings": strict_greater(pair_findings_count, solo_findings_count),
618
+ "pair_found_more_low_or_worse": strict_greater(pair_low_or_worse, solo_low_or_worse),
619
+ "pair_verdict_lift": pair_mode_true and pair_rank > solo_rank and pair_rank >= rank["NEEDS_WORK"],
620
+ "pair_internal_verdict_lift": pair_mode_true
502
621
  and pair_judge_rank > pair_primary_rank
503
622
  and pair_rank >= rank["NEEDS_WORK"],
504
623
  "solo_verdict": solo.get("verify_verdict"),
505
624
  "pair_verdict": pair.get("verify_verdict"),
506
625
  "pair_primary_verdict": pair_primary_verdict,
507
626
  "pair_judge_verdict": pair_judge_verdict,
627
+ "pair_solo_wall_ratio": wall_ratio,
508
628
  }
509
629
  (root / "compare.json").write_text(json.dumps(out, indent=2) + "\n")
510
630
  print(json.dumps(out, indent=2))
631
+
632
+ def fmt_bool(value):
633
+ return str(value is True).lower()
634
+
635
+ def fmt_ratio(value):
636
+ return f"{value:.2f}x" if strict_positive_number(value) else "n/a"
637
+
638
+ def fmt_seconds(value):
639
+ return f"{value}s" if strict_positive_number(value) else "n/a"
640
+
641
+ def fmt_trigger_reasons(value):
642
+ if not isinstance(value, dict):
643
+ return ""
644
+ reasons = value.get("reasons")
645
+ if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
646
+ return ""
647
+ return ",".join(reasons)
648
+
649
+ def arm_row(name, data):
650
+ return (
651
+ f"| {name} | {data.get('verify_verdict') or 'n/a'} | "
652
+ f"{fmt_bool(data.get('pair_mode'))} | "
653
+ f"{fmt_trigger_reasons(data.get('pair_trigger'))} | "
654
+ f"{data.get('verify_findings_count', 'n/a')} | "
655
+ f"{fmt_seconds(data.get('elapsed_seconds'))} | {data.get('invoke_exit', 'n/a')} | "
656
+ f"{data.get('invoke_failure_reason') or 'n/a'} |"
657
+ )
658
+
659
+ summary_lines = [
660
+ "# Frozen VERIFY Pair Summary",
661
+ "",
662
+ f"Run id: `{root.name}`",
663
+ f"Pair mode requested: `{pair_mode_requested}`",
664
+ "",
665
+ "| Arm | Verdict | Pair mode | Triggers | Findings | Elapsed | Invoke exit | Failure |",
666
+ "|---|---|---:|---|---:|---:|---:|---|",
667
+ arm_row("solo", solo),
668
+ arm_row("pair", pair),
669
+ "",
670
+ "| Wall ratio | External lift | Internal lift | Trigger missed |",
671
+ "|---:|---:|---:|---:|",
672
+ (
673
+ f"| {fmt_ratio(wall_ratio)} | "
674
+ f"{fmt_bool(out['comparison']['pair_verdict_lift'])} | "
675
+ f"{fmt_bool(out['comparison']['pair_internal_verdict_lift'])} | "
676
+ f"{fmt_bool(out['comparison']['pair_trigger_missed'])} |"
677
+ ),
678
+ "",
679
+ ]
680
+ summary_text = "\n".join(summary_lines)
681
+ (root / "compare.md").write_text(summary_text, encoding="utf8")
682
+ print(summary_text)
511
683
  PY