devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -15,6 +15,50 @@ The report is the output of `npx devlyn-cli benchmark`. Ship-gate.py consumes su
15
15
  from __future__ import annotations
16
16
  import argparse, json, pathlib, sys, subprocess, datetime
17
17
 
18
+ SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
19
+ if str(SCRIPT_DIR) not in sys.path:
20
+ sys.path.insert(0, str(SCRIPT_DIR))
21
+
22
+ from pair_evidence_contract import is_score, is_strict_number, loads_strict_json_object
23
+
24
+ KNOWN_ARMS = {"variant", "solo_claude", "bare"}
25
+ PASS_VERDICTS = {"PASS", "PASS_WITH_ISSUES"}
26
+
27
+
28
+ def verify_score_clean(value) -> bool:
29
+ return is_strict_number(value) and value >= 1.0
30
+
31
+
32
+ def exact_bool(value):
33
+ if value is True or value is False:
34
+ return value
35
+ if value is None:
36
+ return False
37
+ return None
38
+
39
+
40
+ def skill_verdict_clean(result: dict, arm: str) -> bool:
41
+ if arm == "bare":
42
+ return True
43
+ return (
44
+ result.get("terminal_verdict") in PASS_VERDICTS
45
+ and result.get("verify_verdict") in PASS_VERDICTS
46
+ )
47
+
48
+
49
+ def utc_now_iso() -> str:
50
+ return datetime.datetime.now(datetime.UTC).isoformat(timespec="seconds").replace("+00:00", "Z")
51
+
52
+
53
+ def load_dict_json(path: pathlib.Path) -> dict:
54
+ if not path.exists():
55
+ return {}
56
+ try:
57
+ data = loads_strict_json_object(path.read_text())
58
+ except (ValueError, json.JSONDecodeError):
59
+ return {}
60
+ return data
61
+
18
62
 
19
63
  def git_sha() -> str:
20
64
  try:
@@ -30,6 +74,58 @@ def git_branch() -> str:
30
74
  return "unknown"
31
75
 
32
76
 
77
+ def axis_validation_breakdown(judge: dict):
78
+ raw_validation = judge.get("_axis_validation")
79
+ validation = raw_validation if isinstance(raw_validation, dict) else {}
80
+ cells = validation.get("out_of_range_cells") or []
81
+ declared_count = validation.get("out_of_range_count")
82
+ total_invalid = max(
83
+ declared_count if isinstance(declared_count, int) else 0,
84
+ len(cells) if isinstance(cells, list) else 0,
85
+ )
86
+ raw_blind_mapping = judge.get("_blind_mapping")
87
+ blind_mapping = raw_blind_mapping if isinstance(raw_blind_mapping, dict) else {}
88
+ breakdown_to_letter = {
89
+ "a_breakdown": "A",
90
+ "b_breakdown": "B",
91
+ "c_breakdown": "C",
92
+ }
93
+ by_arm = {}
94
+ mapped_count = 0
95
+ unmapped_cells = []
96
+ if not isinstance(cells, list):
97
+ return by_arm, total_invalid, [{"reason": "out_of_range_cells is not a list"}]
98
+ for cell in cells:
99
+ if not isinstance(cell, dict):
100
+ unmapped_cells.append(cell)
101
+ continue
102
+ letter = breakdown_to_letter.get(cell.get("breakdown"))
103
+ arm = blind_mapping.get(letter) if letter else None
104
+ if arm in KNOWN_ARMS:
105
+ by_arm.setdefault(arm, []).append(cell)
106
+ mapped_count += 1
107
+ else:
108
+ unmapped_cells.append(cell)
109
+ unmapped_count = max(0, total_invalid - mapped_count)
110
+ if unmapped_count > len(unmapped_cells):
111
+ unmapped_cells.extend(
112
+ {"reason": "out_of_range_count exceeds mapped cells"}
113
+ for _ in range(unmapped_count - len(unmapped_cells))
114
+ )
115
+ return by_arm, unmapped_count, unmapped_cells
116
+
117
+
118
+ def blind_mapped_arms(judge: dict) -> set[str]:
119
+ mapping = judge.get("_blind_mapping")
120
+ if not isinstance(mapping, dict):
121
+ return set()
122
+ return {arm for key, arm in mapping.items() if key in {"A", "B", "C"}}
123
+
124
+
125
+ def strict_number(value):
126
+ return value if is_strict_number(value) else None
127
+
128
+
33
129
  def main() -> int:
34
130
  p = argparse.ArgumentParser()
35
131
  p.add_argument("--run-id", required=True)
@@ -49,26 +145,34 @@ def main() -> int:
49
145
  if not judge_path.exists():
50
146
  rows.append({"fixture": fid, "status": "NO_JUDGE", "reason": "judge.json missing"})
51
147
  continue
52
- judge = json.loads(judge_path.read_text())
148
+ judge = load_dict_json(judge_path)
53
149
  # iter-0019: 3-arm aware. judge.json now carries scores_by_arm /
54
150
  # findings_by_arm / disqualifiers_by_arm / margins. Older judge.json
55
- # (pre-iter-0019, only variant_score + bare_score) is handled by
56
- # falling back to legacy fields.
57
- scores_by_arm = judge.get("scores_by_arm") or {}
151
+ # can populate legacy fields, but any score still requires a matching
152
+ # _blind_mapping arm before downstream consumers may trust it.
153
+ raw_scores_by_arm = judge.get("scores_by_arm")
154
+ scores_by_arm = raw_scores_by_arm if isinstance(raw_scores_by_arm, dict) else {}
58
155
  if not scores_by_arm:
59
- if "variant_score" in judge:
156
+ if is_score(judge.get("variant_score")):
60
157
  scores_by_arm["variant"] = judge["variant_score"]
61
- if "bare_score" in judge:
158
+ if is_score(judge.get("bare_score")):
62
159
  scores_by_arm["bare"] = judge["bare_score"]
63
160
 
64
- findings_by_arm = judge.get("findings_by_arm") or {}
65
- dq_by_arm = judge.get("disqualifiers_by_arm") or {}
66
- margins = judge.get("margins") or {}
161
+ raw_findings_by_arm = judge.get("findings_by_arm")
162
+ findings_by_arm = raw_findings_by_arm if isinstance(raw_findings_by_arm, dict) else {}
163
+ raw_dq_by_arm = judge.get("disqualifiers_by_arm")
164
+ dq_by_arm = raw_dq_by_arm if isinstance(raw_dq_by_arm, dict) else {}
165
+ axis_invalid_by_arm, axis_unmapped_count, axis_unmapped_cells = axis_validation_breakdown(judge)
166
+ mapped_arms = blind_mapped_arms(judge)
167
+ trusted_scores_by_arm = {
168
+ arm: score for arm, score in scores_by_arm.items()
169
+ if arm in mapped_arms and is_score(score)
170
+ }
67
171
 
68
172
  arm_results = {}
69
173
  for arm in ("variant", "solo_claude", "bare"):
70
174
  res_p = fdir / arm / "result.json"
71
- arm_results[arm] = json.loads(res_p.read_text()) if res_p.exists() else {}
175
+ arm_results[arm] = load_dict_json(res_p)
72
176
  var_res = arm_results["variant"]
73
177
  solo_res = arm_results["solo_claude"]
74
178
  bare_res = arm_results["bare"]
@@ -77,12 +181,12 @@ def main() -> int:
77
181
  category = "unknown"
78
182
  if meta_p.exists():
79
183
  try:
80
- category = json.loads(meta_p.read_text()).get("category", "unknown")
184
+ category = load_dict_json(meta_p).get("category", "unknown")
81
185
  except Exception:
82
186
  pass
83
187
 
84
188
  def wall_ratio(numer, denom):
85
- if numer and denom:
189
+ if is_strict_number(numer) and is_strict_number(denom):
86
190
  return round(numer / denom, 2)
87
191
  return None
88
192
 
@@ -91,44 +195,93 @@ def main() -> int:
91
195
  # A/B-letter shape if present).
92
196
  def arm_dq_judge(arm: str):
93
197
  if arm in dq_by_arm:
94
- return bool(dq_by_arm[arm].get("disqualifier", False))
95
- mapping = judge.get("_blind_mapping", {}) or {}
198
+ entry = dq_by_arm[arm]
199
+ value = entry.get("disqualifier") if isinstance(entry, dict) else entry
200
+ parsed = exact_bool(value)
201
+ return (parsed is True or parsed is None, parsed is None)
202
+ raw_mapping = judge.get("_blind_mapping")
203
+ mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
96
204
  for letter in ("A", "B", "C"):
97
205
  if mapping.get(letter) == arm:
98
- return bool((judge.get("disqualifiers", {}) or {}).get(letter, False))
99
- return False
206
+ raw_dqs = judge.get("disqualifiers")
207
+ dqs = raw_dqs if isinstance(raw_dqs, dict) else {}
208
+ parsed = exact_bool(dqs.get(letter))
209
+ return (parsed is True or parsed is None, parsed is None)
210
+ return False, False
211
+
212
+ def critical_findings_for(arm: str):
213
+ entry = findings_by_arm.get(arm)
214
+ if isinstance(entry, list):
215
+ return entry
216
+ if entry:
217
+ return [entry]
218
+ return []
100
219
 
101
220
  # Per-arm payload — arm absent = scores_by_arm key absent, downstream
102
221
  # consumers null-check.
103
222
  arms_block = {}
104
223
  for arm in ("variant", "solo_claude", "bare"):
105
224
  r = arm_results.get(arm) or {}
106
- score = scores_by_arm.get(arm)
107
- judge_dq = arm_dq_judge(arm)
108
- det_dq = bool(r.get("disqualifier", False))
225
+ raw_score = scores_by_arm.get(arm)
226
+ score = trusted_scores_by_arm.get(arm)
227
+ blind_mapping_arm_missing = raw_score is not None and arm not in mapped_arms
228
+ judge_dq, judge_dq_malformed = arm_dq_judge(arm)
229
+ result_bool_values = {
230
+ field: exact_bool(r.get(field))
231
+ for field in ("disqualifier", "timed_out", "invoke_failure", "environment_contamination")
232
+ }
233
+ malformed_boolean_fields = [
234
+ field for field, value in result_bool_values.items() if value is None
235
+ ]
236
+ det_dq = bool(
237
+ result_bool_values["disqualifier"] is True
238
+ or result_bool_values["timed_out"] is True
239
+ or result_bool_values["invoke_failure"] is True
240
+ or result_bool_values["environment_contamination"] is True
241
+ or bool(malformed_boolean_fields)
242
+ or not verify_score_clean(r.get("verify_score"))
243
+ or not skill_verdict_clean(r, arm)
244
+ or blind_mapping_arm_missing
245
+ )
109
246
  arms_block[arm] = {
110
247
  "score": score,
111
- "wall_s": r.get("elapsed_seconds"),
112
- "verify_score": r.get("verify_score"),
248
+ "wall_s": strict_number(r.get("elapsed_seconds")),
249
+ "verify_score": strict_number(r.get("verify_score")),
113
250
  "files_changed": r.get("files_changed"),
114
- "timed_out": bool(r.get("timed_out", False)),
251
+ "timed_out": result_bool_values["timed_out"] is True,
252
+ "invoke_failure": result_bool_values["invoke_failure"] is True,
253
+ "invoke_failure_reason": r.get("invoke_failure_reason"),
254
+ "environment_contamination": result_bool_values["environment_contamination"] is True,
115
255
  "disqualifier": judge_dq or det_dq,
116
256
  "dq_judge": judge_dq,
257
+ "dq_judge_malformed": judge_dq_malformed,
117
258
  "dq_deterministic": det_dq,
118
- "critical_findings": findings_by_arm.get(arm, []) if findings_by_arm else [],
259
+ "malformed_boolean_fields": malformed_boolean_fields,
260
+ "blind_mapping_arm_missing": blind_mapping_arm_missing,
261
+ "critical_findings": critical_findings_for(arm),
262
+ "_axis_validation_out_of_range_count": len(axis_invalid_by_arm.get(arm, [])),
263
+ "_axis_validation_out_of_range_cells": axis_invalid_by_arm.get(arm, []),
119
264
  }
120
265
 
121
- # Pairwise margins. Prefer judge-side margins (single calibrated
122
- # scoring) over arithmetic differences, but fall through to compute
123
- # from scores_by_arm if the judge didn't emit margins.
266
+ # Pairwise margins are derived from trusted mapped scores only. Cached
267
+ # judge-side margins are redundant and can be stale if a partial artifact
268
+ # is reused.
124
269
  def m(left, right, key):
125
- if margins.get(key) is not None:
126
- return margins[key]
127
- l = scores_by_arm.get(left); r2 = scores_by_arm.get(right)
270
+ if left not in mapped_arms or right not in mapped_arms:
271
+ return None
272
+ l = trusted_scores_by_arm.get(left); r2 = trusted_scores_by_arm.get(right)
128
273
  if l is None or r2 is None:
129
274
  return None
130
275
  return l - r2
131
276
 
277
+ def trusted_winner():
278
+ winner = judge.get("winner_arm")
279
+ if winner == "tie":
280
+ return winner
281
+ if winner in trusted_scores_by_arm:
282
+ return winner
283
+ return None
284
+
132
285
  row = {
133
286
  "fixture": fid,
134
287
  "category": category,
@@ -146,7 +299,9 @@ def main() -> int:
146
299
  "solo_over_bare": wall_ratio(arms_block["solo_claude"]["wall_s"], arms_block["bare"]["wall_s"]),
147
300
  "variant_over_solo": wall_ratio(arms_block["variant"]["wall_s"], arms_block["solo_claude"]["wall_s"]),
148
301
  },
149
- "winner": judge.get("winner_arm"),
302
+ "winner": trusted_winner(),
303
+ "_axis_validation_unmapped_out_of_range_count": axis_unmapped_count,
304
+ "_axis_validation_unmapped_out_of_range_cells": axis_unmapped_cells,
150
305
  # Legacy fields preserved so older summary readers still parse.
151
306
  "variant_score": arms_block["variant"]["score"],
152
307
  "bare_score": arms_block["bare"]["score"],
@@ -204,18 +359,23 @@ def main() -> int:
204
359
  # because the legacy ship-gate.py reads that. Pair-aware gates get
205
360
  # added in iter-0021 / 0022 once the data shape stabilizes.
206
361
  margin_ge_5 = sum(1 for r in gated_rows if (r.get("margin") or 0) >= 5)
207
- disqualifier_count = sum(1 for r in scored if r.get("variant_disqualifier"))
362
+ disqualifier_count = sum(1 for r in rows if r.get("variant_disqualifier"))
208
363
 
209
364
  # arm-presence flags so consumers know whether the iter is 2-arm legacy
210
365
  # or 3-arm post-iter-0019.
211
- has_solo = any((r.get("arms", {}).get("solo_claude") or {}).get("score") is not None for r in scored)
366
+ has_solo = any(
367
+ (arm := (r.get("arms", {}).get("solo_claude") or {})).get("score") is not None
368
+ or arm.get("wall_s") is not None
369
+ or bool(arm.get("disqualifier"))
370
+ for r in rows
371
+ )
212
372
 
213
373
  summary = {
214
374
  "run_id": args.run_id,
215
375
  "label": args.label,
216
376
  "git_sha": git_sha(),
217
377
  "branch": git_branch(),
218
- "completed_at": datetime.datetime.utcnow().isoformat(timespec="seconds") + "Z",
378
+ "completed_at": utc_now_iso(),
219
379
  "fixtures_total": len(rows),
220
380
  "fixtures_scored": len(scored),
221
381
  # Legacy 2-arm fields kept for ship-gate.py + history readers.
@@ -245,12 +405,12 @@ def main() -> int:
245
405
  f"Branch: `{summary['branch']}`",
246
406
  f"Git SHA: `{summary['git_sha'][:12]}`",
247
407
  "",
248
- "| Fixture | Category | L2 (variant) | L1 (solo_claude) | L0 (bare) | L2-L0 | L1-L0 | L2-L1 | Winner | Wall L2/L1/L0 | Wall L2/L0 |",
249
- "|---------|----------|--------------|------------------|-----------|-------|-------|-------|--------|---------------|-----------|",
408
+ "| Fixture | Category | variant (L2) | solo_claude (L1) | bare (L0) | variant-bare | solo_claude-bare | variant-solo_claude | Winner | Wall variant/solo_claude/bare | Wall variant/solo_claude | Wall variant/bare |",
409
+ "|---------|----------|--------------|------------------|-----------|--------------|-------------------|----------------------|--------|--------------------------------|--------------------------|-------------------|",
250
410
  ]
251
411
  for r in rows:
252
412
  if r.get("variant_score") is None:
253
- lines.append(f"| {r['fixture']} | — | — | — | — | — | — | — | NO_JUDGE | — | — |")
413
+ lines.append(f"| {r['fixture']} | — | — | — | — | — | — | — | NO_JUDGE | — | — | — |")
254
414
  continue
255
415
  arms = r.get("arms", {}) or {}
256
416
  v = arms.get("variant", {}) or {}
@@ -267,11 +427,13 @@ def main() -> int:
267
427
  def fmt_wall(arm):
268
428
  return f"{arm['wall_s']}s" if arm.get("wall_s") else "?"
269
429
  l2_l0_wall = f"{wallr.get('variant_over_bare'):.1f}x" if wallr.get("variant_over_bare") else "—"
430
+ l2_l1_wall = f"{wallr.get('variant_over_solo'):.1f}x" if wallr.get("variant_over_solo") else "—"
270
431
  wall_triplet = f"{fmt_wall(v)}/{fmt_wall(s)}/{fmt_wall(b)}"
271
432
  lines.append(
272
433
  f"| {r['fixture']} | {r['category']} | {fmt_score(v)} | {fmt_score(s)} | {fmt_score(b)} | "
273
434
  f"{fmt_margin(margins.get('variant_over_bare'))} | {fmt_margin(margins.get('solo_over_bare'))} | "
274
- f"{fmt_margin(margins.get('variant_over_solo'))} | {r.get('winner') or '—'} | {wall_triplet} | {l2_l0_wall} |"
435
+ f"{fmt_margin(margins.get('variant_over_solo'))} | {r.get('winner') or '—'} | "
436
+ f"{wall_triplet} | {l2_l1_wall} | {l2_l0_wall} |"
275
437
  )
276
438
  def fmt_avg(v): return f"{v:.1f}" if isinstance(v, (int, float)) else "n/a"
277
439
  def fmt_signed(v): return f"{v:+.1f}" if isinstance(v, (int, float)) else "n/a"
@@ -289,23 +451,23 @@ def main() -> int:
289
451
  lines += [
290
452
  f"**Suite average bare (L0) score:** {fmt_avg(summary['bare_avg'])}",
291
453
  "",
292
- f"**L2 vs L0 margin avg:** {margin_avg_str} (ship floor: +5, NORTH-STAR preferred: +8)",
454
+ f"**variant (L2) vs bare (L0) margin avg:** {margin_avg_str} (ship floor: +5, NORTH-STAR preferred: +8)",
293
455
  ]
294
456
  if summary.get("arms_present", {}).get("solo_claude"):
295
457
  ms = summary.get("margins_avg", {}) or {}
296
458
  ws = summary.get("wall_ratio_avg_by_pair", {}) or {}
297
459
  lines += [
298
- f"**L1 vs L0 margin avg:** {fmt_signed(ms.get('solo_over_bare'))} (NORTH-STAR L1 contract: ≥+5)",
299
- f"**L2 vs L1 margin avg:** {fmt_signed(ms.get('variant_over_solo'))} (NORTH-STAR L2 contract: ≥+5 on pair-eligible)",
300
- f"**Wall ratio L2/L0:** {fmt_ratio(ws.get('variant_over_bare'))}",
301
- f"**Wall ratio L1/L0:** {fmt_ratio(ws.get('solo_over_bare'))}",
302
- f"**Wall ratio L2/L1:** {fmt_ratio(ws.get('variant_over_solo'))}",
460
+ f"**solo_claude (L1) vs bare (L0) margin avg:** {fmt_signed(ms.get('solo_over_bare'))} (NORTH-STAR L1 contract: ≥+5)",
461
+ f"**variant (L2) vs solo_claude (L1) margin avg:** {fmt_signed(ms.get('variant_over_solo'))} (NORTH-STAR L2 contract: ≥+5 on pair-eligible)",
462
+ f"**Wall ratio variant (L2) / bare (L0):** {fmt_ratio(ws.get('variant_over_bare'))}",
463
+ f"**Wall ratio solo_claude (L1) / bare (L0):** {fmt_ratio(ws.get('solo_over_bare'))}",
464
+ f"**Wall ratio variant (L2) / solo_claude (L1):** {fmt_ratio(ws.get('variant_over_solo'))}",
303
465
  ]
304
466
  else:
305
- lines.append(f"**Wall ratio variant/bare (mean):** {wall_ratio_str} (no solo_claude arm in this run)")
467
+ lines.append(f"**Wall ratio variant (L2) / bare (L0) mean:** {wall_ratio_str} (no solo_claude arm in this run)")
306
468
  lines += [
307
469
  f"**Hard-floor violations:** {summary['hard_floor_violations']}",
308
- f"**Fixtures with margin ≥ +5:** {summary['margin_ge_5_count']} / {summary['gated_fixtures']} (gate: ≥ 7 of 9)",
470
+ f"**Fixtures with margin ≥ +5:** {summary['margin_ge_5_count']} / {summary['gated_fixtures']} (gate: ≥ 7)",
309
471
  ]
310
472
  # Critical findings digest — per-arm sections.
311
473
  def has_findings(arm):
@@ -315,7 +477,7 @@ def main() -> int:
315
477
  lines += ["", "## Critical Findings", ""]
316
478
  for r in cf_rows:
317
479
  lines.append(f"### {r['fixture']}")
318
- for arm_label, arm_key in [("Variant (L2)", "variant"), ("Solo Claude (L1)", "solo_claude"), ("Bare (L0)", "bare")]:
480
+ for arm_label, arm_key in [("variant (L2)", "variant"), ("solo_claude (L1)", "solo_claude"), ("bare (L0)", "bare")]:
319
481
  arm = (r.get("arms") or {}).get(arm_key) or {}
320
482
  if has_findings(arm):
321
483
  lines.append(f"**{arm_label}:**")
@@ -11,6 +11,8 @@ import urllib.request
11
11
  from pathlib import Path
12
12
  from typing import Any
13
13
 
14
+ from pair_evidence_contract import reject_json_constant
15
+
14
16
 
15
17
  DATASETS = {
16
18
  "lite": "princeton-nlp/SWE-bench_Lite",
@@ -31,7 +33,17 @@ def fetch_rows(dataset: str, split: str, offset: int, length: int) -> dict[str,
31
33
  )
32
34
  url = f"https://datasets-server.huggingface.co/rows?{params}"
33
35
  with urllib.request.urlopen(url, timeout=60) as response:
34
- return json.load(response)
36
+ return json.load(response, parse_constant=reject_json_constant)
37
+
38
+
39
+ def positive_int(value: str) -> int:
40
+ try:
41
+ parsed = int(value)
42
+ except ValueError as exc:
43
+ raise argparse.ArgumentTypeError("must be an integer") from exc
44
+ if parsed <= 0:
45
+ raise argparse.ArgumentTypeError("must be > 0")
46
+ return parsed
35
47
 
36
48
 
37
49
  def main() -> int:
@@ -39,7 +51,7 @@ def main() -> int:
39
51
  parser.add_argument("--dataset", choices=sorted(DATASETS), default="lite")
40
52
  parser.add_argument("--dataset-id", help="Override the Hugging Face dataset id.")
41
53
  parser.add_argument("--split", default="test")
42
- parser.add_argument("--limit", type=int, help="Fetch at most N rows.")
54
+ parser.add_argument("--limit", type=positive_int, help="Fetch at most N rows.")
43
55
  parser.add_argument("--page-size", type=int, default=100)
44
56
  parser.add_argument("--instance-id", action="append", help="Keep only these instance ids.")
45
57
  parser.add_argument("--out", required=True, type=Path)
@@ -63,13 +75,19 @@ def main() -> int:
63
75
  if total is None:
64
76
  total = int(page.get("num_rows_total") or 0)
65
77
  page_rows = page.get("rows") or []
78
+ if not isinstance(page_rows, list):
79
+ raise ValueError("fetched page malformed: rows must be a list")
66
80
  if not page_rows:
67
81
  break
68
- for wrapper in page_rows:
82
+ for row_index, wrapper in enumerate(page_rows, start=1):
83
+ if not isinstance(wrapper, dict):
84
+ raise ValueError(f"malformed fetched row {row_index}: wrapper must be object")
69
85
  row = wrapper.get("row")
70
86
  if not isinstance(row, dict):
71
- continue
87
+ raise ValueError(f"malformed fetched row {row_index}: row must be object")
72
88
  instance_id = row.get("instance_id")
89
+ if not isinstance(instance_id, str) or not instance_id:
90
+ raise ValueError(f"malformed fetched row {row_index}: instance_id must be a non-empty string")
73
91
  if keep and instance_id not in keep:
74
92
  continue
75
93
  rows.append(row)