devlyn-cli 2.3.0 → 2.3.2

This diff shows the changes between two publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (219)
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -14,10 +14,19 @@ from __future__ import annotations
14
14
 
15
15
  import argparse
16
16
  import json
17
+ import math
17
18
  import re
18
19
  from pathlib import Path
19
20
  from typing import Any
20
21
 
22
+ from pair_evidence_contract import (
23
+ all_known_pair_trigger_reasons,
24
+ has_canonical_pair_trigger_reason,
25
+ has_known_pair_trigger_reason,
26
+ path_has_actionable_solo_headroom_hypothesis,
27
+ reject_json_constant,
28
+ )
29
+
21
30
 
22
31
  VERDICT_RANK = {
23
32
  "PASS": 0,
@@ -31,8 +40,35 @@ def load_compare(results_root: Path, run_id: str) -> dict[str, Any]:
31
40
  compare_path = results_root / run_id / "compare.json"
32
41
  if not compare_path.exists():
33
42
  raise FileNotFoundError(f"missing compare.json for {run_id}: {compare_path}")
34
- with compare_path.open() as f:
35
- return json.load(f)
43
+ try:
44
+ data = json.loads(
45
+ compare_path.read_text(encoding="utf8"),
46
+ parse_constant=reject_json_constant,
47
+ )
48
+ except (json.JSONDecodeError, ValueError) as exc:
49
+ raise ValueError(f"malformed compare.json for {run_id}: invalid JSON") from exc
50
+ if not isinstance(data, dict):
51
+ raise ValueError(f"malformed compare.json for {run_id}: expected object")
52
+ return data
53
+
54
+
55
+ def object_field(payload: dict[str, Any], key: str) -> dict[str, Any]:
56
+ value = payload.get(key)
57
+ return value if isinstance(value, dict) else {}
58
+
59
+
60
+ def verdict_field(payload: dict[str, Any], key: str) -> str | None:
61
+ value = payload.get(key)
62
+ return value if isinstance(value, str) else None
63
+
64
+
65
+ def number_field(payload: dict[str, Any], key: str) -> int | float | None:
66
+ value = payload.get(key)
67
+ if isinstance(value, bool):
68
+ return None
69
+ if not isinstance(value, (int, float)) or not math.isfinite(value):
70
+ return None
71
+ return value
36
72
 
37
73
 
38
74
  def rank(verdict: str | None) -> int:
@@ -42,11 +78,49 @@ def rank(verdict: str | None) -> int:
42
78
  def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
43
79
  if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
44
80
  return None
45
- if solo_elapsed <= 0:
81
+ if pair_elapsed <= 0 or solo_elapsed <= 0:
46
82
  return None
47
83
  return pair_elapsed / solo_elapsed
48
84
 
49
85
 
86
+ def pair_trigger_failures(pair: dict[str, Any]) -> list[str]:
87
+ trigger = pair.get("pair_trigger")
88
+ if not isinstance(trigger, dict):
89
+ return ["pair_trigger missing or malformed"]
90
+ eligible = trigger.get("eligible")
91
+ reasons = trigger.get("reasons")
92
+ skipped_reason = trigger.get("skipped_reason")
93
+ if not isinstance(eligible, bool):
94
+ return ["pair_trigger.eligible malformed"]
95
+ if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
96
+ return ["pair_trigger.reasons malformed"]
97
+ if skipped_reason is not None and not isinstance(skipped_reason, str):
98
+ return ["pair_trigger.skipped_reason malformed"]
99
+ if eligible is not True:
100
+ return ["pair_trigger not eligible"]
101
+ if not reasons:
102
+ return ["pair_trigger eligible with empty reasons"]
103
+ if not has_known_pair_trigger_reason(reasons):
104
+ return ["pair_trigger reasons missing known trigger reason"]
105
+ if not all_known_pair_trigger_reasons(reasons):
106
+ return ["pair_trigger reasons contain unknown trigger reason"]
107
+ if not has_canonical_pair_trigger_reason(reasons):
108
+ return ["pair_trigger reasons missing canonical trigger reason"]
109
+ if skipped_reason is not None:
110
+ return ["pair_trigger eligible with skipped_reason"]
111
+ return []
112
+
113
+
114
+ def pair_trigger_reasons(pair: dict[str, Any]) -> list[str]:
115
+ trigger = pair.get("pair_trigger")
116
+ if not isinstance(trigger, dict):
117
+ return []
118
+ reasons = trigger.get("reasons")
119
+ if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
120
+ return []
121
+ return reasons
122
+
123
+
50
124
  def infer_fixture_id(results_root: Path, run_id: str) -> str | None:
51
125
  run_root = results_root / run_id
52
126
  for arm in ("pair", "solo"):
@@ -74,10 +148,11 @@ def evaluate_run(
74
148
  fixtures_root: Path,
75
149
  run_id: str,
76
150
  max_pair_solo_wall_ratio: float | None,
151
+ require_hypothesis_trigger: bool,
77
152
  ) -> dict[str, Any]:
78
153
  try:
79
154
  compare = load_compare(results_root, run_id)
80
- except FileNotFoundError as exc:
155
+ except (FileNotFoundError, ValueError) as exc:
81
156
  fixture_id = infer_fixture_id(results_root, run_id)
82
157
  return {
83
158
  "run_id": run_id,
@@ -97,9 +172,9 @@ def evaluate_run(
97
172
  "pair_solo_wall_ratio": None,
98
173
  "pair_severity_counts": {},
99
174
  }
100
- solo = compare.get("solo") or {}
101
- pair = compare.get("pair") or {}
102
- comparison = compare.get("comparison") or {}
175
+ solo = object_field(compare, "solo")
176
+ pair = object_field(compare, "pair")
177
+ comparison = object_field(compare, "comparison")
103
178
  solo_failure_reason = solo.get("invoke_failure_reason") or transcript_failure_reason(
104
179
  results_root, run_id, "solo"
105
180
  )
@@ -112,6 +187,20 @@ def evaluate_run(
112
187
  failures.append("solo timed out")
113
188
  if pair.get("timed_out"):
114
189
  failures.append("pair timed out")
190
+ if solo.get("invoke_failure"):
191
+ reason = solo.get("invoke_failure_reason")
192
+ failures.append(f"solo invoke failure ({reason})" if reason else "solo invoke failure")
193
+ if pair.get("invoke_failure"):
194
+ reason = pair.get("invoke_failure_reason")
195
+ failures.append(f"pair invoke failure ({reason})" if reason else "pair invoke failure")
196
+ if solo.get("environment_contamination"):
197
+ failures.append("solo environment contamination")
198
+ if pair.get("environment_contamination"):
199
+ failures.append("pair environment contamination")
200
+ if solo.get("disqualifier"):
201
+ failures.append("solo disqualifier")
202
+ if pair.get("disqualifier"):
203
+ failures.append("pair disqualifier")
115
204
  if solo_failure_reason == "provider_limit":
116
205
  failures.append("solo provider limit")
117
206
  if pair_failure_reason == "provider_limit":
@@ -120,27 +209,39 @@ def evaluate_run(
120
209
  failures.append(f"solo invoke_exit={solo.get('invoke_exit')}")
121
210
  if pair.get("invoke_exit") != 0:
122
211
  failures.append(f"pair invoke_exit={pair.get('invoke_exit')}")
123
- if not pair.get("pair_mode"):
212
+ pair_mode = pair.get("pair_mode") is True
213
+ if not pair_mode:
124
214
  failures.append("pair_mode false")
125
- if comparison.get("pair_trigger_missed"):
215
+ failures.extend(pair_trigger_failures(pair))
216
+ trigger_reasons = pair_trigger_reasons(pair)
217
+ pair_trigger_missed = comparison.get("pair_trigger_missed") is True
218
+ if pair_trigger_missed:
126
219
  failures.append("pair trigger missed")
127
- external_lift = bool(comparison.get("pair_verdict_lift"))
128
- internal_lift = bool(comparison.get("pair_internal_verdict_lift"))
220
+ external_lift = comparison.get("pair_verdict_lift") is True
221
+ internal_lift = comparison.get("pair_internal_verdict_lift") is True
129
222
  if not (external_lift or internal_lift):
130
223
  failures.append("pair verdict lift false")
131
224
 
132
225
  solo_verdict = (
133
- comparison.get("solo_verdict")
134
- or solo.get("verify_verdict")
135
- or solo.get("terminal_verdict")
226
+ verdict_field(comparison, "solo_verdict")
227
+ or verdict_field(solo, "verify_verdict")
228
+ or verdict_field(solo, "terminal_verdict")
136
229
  )
137
230
  pair_verdict = (
138
- comparison.get("pair_verdict")
139
- or pair.get("verify_verdict")
140
- or pair.get("terminal_verdict")
231
+ verdict_field(comparison, "pair_verdict")
232
+ or verdict_field(pair, "verify_verdict")
233
+ or verdict_field(pair, "terminal_verdict")
141
234
  )
142
- pair_primary_verdict = comparison.get("pair_primary_verdict")
143
- pair_judge_verdict = comparison.get("pair_judge_verdict")
235
+ pair_primary_verdict = verdict_field(comparison, "pair_primary_verdict")
236
+ pair_judge_verdict = verdict_field(comparison, "pair_judge_verdict")
237
+ if solo_verdict is None:
238
+ failures.append("solo verdict missing or malformed")
239
+ if pair_verdict is None:
240
+ failures.append("pair verdict missing or malformed")
241
+ if internal_lift and pair_primary_verdict is None:
242
+ failures.append("pair primary verdict missing or malformed")
243
+ if internal_lift and pair_judge_verdict is None:
244
+ failures.append("pair judge verdict missing or malformed")
144
245
  if external_lift and rank(pair_verdict) <= rank(solo_verdict):
145
246
  failures.append(f"pair verdict {pair_verdict} not stricter than solo {solo_verdict}")
146
247
  if internal_lift and rank(pair_judge_verdict) <= rank(pair_primary_verdict):
@@ -149,8 +250,8 @@ def evaluate_run(
149
250
  )
150
251
  if rank(pair_verdict) < VERDICT_RANK["NEEDS_WORK"]:
151
252
  failures.append(f"pair verdict {pair_verdict} is not verdict-binding")
152
- pair_elapsed = pair.get("elapsed_seconds")
153
- solo_elapsed = solo.get("elapsed_seconds")
253
+ pair_elapsed = number_field(pair, "elapsed_seconds")
254
+ solo_elapsed = number_field(solo, "elapsed_seconds")
154
255
  wall_ratio = elapsed_ratio(pair_elapsed, solo_elapsed)
155
256
  if max_pair_solo_wall_ratio is not None:
156
257
  if wall_ratio is None:
@@ -164,6 +265,12 @@ def evaluate_run(
164
265
  failures.append("fixture_id missing")
165
266
  elif not (fixtures_root / fixture_id).is_dir():
166
267
  failures.append(f"fixture_id not found: {fixture_id}")
268
+ elif (
269
+ require_hypothesis_trigger
270
+ and path_has_actionable_solo_headroom_hypothesis(fixtures_root / fixture_id / "spec.md")
271
+ and "spec.solo_headroom_hypothesis" not in trigger_reasons
272
+ ):
273
+ failures.append("pair_trigger missing spec.solo_headroom_hypothesis")
167
274
 
168
275
  return {
169
276
  "run_id": run_id,
@@ -172,8 +279,10 @@ def evaluate_run(
172
279
  "failures": failures,
173
280
  "solo_verdict": solo_verdict,
174
281
  "pair_verdict": pair_verdict,
175
- "pair_mode": bool(pair.get("pair_mode")),
176
- "pair_trigger_missed": bool(comparison.get("pair_trigger_missed")),
282
+ "pair_mode": pair_mode,
283
+ "pair_trigger_reasons": trigger_reasons,
284
+ "pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(trigger_reasons),
285
+ "pair_trigger_missed": pair_trigger_missed,
177
286
  "pair_verdict_lift": external_lift,
178
287
  "pair_internal_verdict_lift": internal_lift,
179
288
  "pair_primary_verdict": pair_primary_verdict,
@@ -183,7 +292,7 @@ def evaluate_run(
183
292
  "pair_solo_wall_ratio": wall_ratio,
184
293
  "solo_failure_reason": solo_failure_reason,
185
294
  "pair_failure_reason": pair_failure_reason,
186
- "pair_severity_counts": pair.get("severity_counts") or {},
295
+ "pair_severity_counts": object_field(pair, "severity_counts"),
187
296
  }
188
297
 
189
298
 
@@ -193,6 +302,12 @@ def format_ratio(value: Any) -> str:
193
302
  return "n/a"
194
303
 
195
304
 
305
+ def format_trigger_reasons(value: Any) -> str:
306
+ if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
307
+ return ""
308
+ return ",".join(value)
309
+
310
+
196
311
  def write_markdown(path: Path, report: dict[str, Any]) -> None:
197
312
  lines = [
198
313
  f"# Frozen VERIFY Gate — {report['run_ids_label']}",
@@ -208,15 +323,17 @@ def write_markdown(path: Path, report: dict[str, Any]) -> None:
208
323
  f"Max pair/solo wall ratio: {format_ratio(report.get('max_pair_solo_wall_ratio'))}",
209
324
  f"Average pair/solo wall ratio: {format_ratio(report.get('avg_pair_solo_wall_ratio'))}",
210
325
  "",
211
- "| Run | Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Status | Reason |",
212
- "|---|---|---|---|---|---|---|---|---|---|",
326
+ "| Run | Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Triggers | Wall ratio | External lift | Internal lift | Status | Reason |",
327
+ "|---|---|---|---|---|---|---|---|---|---|---|",
213
328
  ]
214
329
  for row in report["rows"]:
215
330
  reason = "; ".join(row["failures"]) if row["failures"] else "ok"
216
331
  lines.append(
217
332
  f"| {row['run_id']} | {row.get('fixture_id') or 'unknown'} | "
218
333
  f"{row['solo_verdict']} | {row['pair_verdict']} | "
219
- f"{str(row['pair_mode']).lower()} | {format_ratio(row.get('pair_solo_wall_ratio'))} | "
334
+ f"{str(row['pair_mode']).lower()} | "
335
+ f"{format_trigger_reasons(row.get('pair_trigger_reasons'))} | "
336
+ f"{format_ratio(row.get('pair_solo_wall_ratio'))} | "
220
337
  f"{str(row['pair_verdict_lift']).lower()} | "
221
338
  f"{str(row['pair_internal_verdict_lift']).lower()} | "
222
339
  f"{row['status']} | {reason} |"
@@ -225,17 +342,36 @@ def write_markdown(path: Path, report: dict[str, Any]) -> None:
225
342
  path.write_text("\n".join(lines), encoding="utf8")
226
343
 
227
344
 
345
+ def positive_int(value: str) -> int:
346
+ parsed = int(value)
347
+ if parsed <= 0:
348
+ raise argparse.ArgumentTypeError("value must be > 0")
349
+ return parsed
350
+
351
+
352
+ def positive_float(value: str) -> float:
353
+ parsed = float(value)
354
+ if parsed <= 0:
355
+ raise argparse.ArgumentTypeError("value must be > 0")
356
+ return parsed
357
+
358
+
228
359
  def main() -> int:
229
360
  parser = argparse.ArgumentParser()
230
361
  parser.add_argument("--results-root", default="benchmark/auto-resolve/results")
231
362
  parser.add_argument("--fixtures-root", default="benchmark/auto-resolve/fixtures")
232
363
  parser.add_argument("--run-id", action="append", required=True)
233
- parser.add_argument("--min-runs", type=int, default=2)
364
+ parser.add_argument("--min-runs", type=positive_int, default=2)
234
365
  parser.add_argument(
235
366
  "--max-pair-solo-wall-ratio",
236
- type=float,
367
+ type=positive_float,
237
368
  help="Optional efficiency cap. When set, every run must include elapsed_seconds and pair/solo wall ratio must not exceed this value.",
238
369
  )
370
+ parser.add_argument(
371
+ "--require-hypothesis-trigger",
372
+ action="store_true",
373
+ help="require fixtures with actionable solo-headroom hypotheses to expose spec.solo_headroom_hypothesis in pair_trigger.reasons",
374
+ )
239
375
  parser.add_argument("--out-json")
240
376
  parser.add_argument("--out-md")
241
377
  args = parser.parse_args()
@@ -243,7 +379,13 @@ def main() -> int:
243
379
  results_root = Path(args.results_root)
244
380
  fixtures_root = Path(args.fixtures_root)
245
381
  rows = [
246
- evaluate_run(results_root, fixtures_root, run_id, args.max_pair_solo_wall_ratio)
382
+ evaluate_run(
383
+ results_root,
384
+ fixtures_root,
385
+ run_id,
386
+ args.max_pair_solo_wall_ratio,
387
+ args.require_hypothesis_trigger,
388
+ )
247
389
  for run_id in args.run_id
248
390
  ]
249
391
  fixture_counts: dict[str, int] = {}
@@ -262,6 +404,9 @@ def main() -> int:
262
404
  row["pair_solo_wall_ratio"]
263
405
  for row in rows
264
406
  if isinstance(row.get("pair_solo_wall_ratio"), (int, float))
407
+ and not isinstance(row.get("pair_solo_wall_ratio"), bool)
408
+ and math.isfinite(row["pair_solo_wall_ratio"])
409
+ and row["pair_solo_wall_ratio"] > 0
265
410
  ]
266
411
 
267
412
  report = {