devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -0,0 +1,232 @@
1
+ #!/usr/bin/env python3
2
+ """Print a compact, wrap-safe benchmark snapshot from local artifacts."""
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import importlib.util
7
+ import json
8
+ import pathlib
9
+ import sys
10
+ import textwrap
11
+ from typing import Any
12
+
13
+ SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
14
+ FRONTIER_PATH = SCRIPT_DIR / "pair-candidate-frontier.py"
15
+
16
+
17
def load_frontier_module() -> Any:
    """Load the script at ``FRONTIER_PATH`` and return it as a module object.

    The script is loaded by file location under the module name
    ``pair_candidate_frontier``.

    Returns:
        The executed module.

    Raises:
        RuntimeError: if no import spec or loader can be built for
            ``FRONTIER_PATH``.
        Exception: anything raised while executing the script propagates
            unchanged.
    """
    module_name = "pair_candidate_frontier"
    spec = importlib.util.spec_from_file_location(module_name, FRONTIER_PATH)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"cannot load frontier module: {FRONTIER_PATH}")
    module = importlib.util.module_from_spec(spec)

    # Register the module before executing it, as the importlib recipe for
    # importing a source file directly does. Code that resolves its own module
    # through sys.modules while it is being executed (for example dataclasses
    # with string annotations) fails when the entry is missing.
    missing = object()
    previous = sys.modules.get(module_name, missing)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Never leave a half-initialised module registered.
        if previous is missing:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
24
+
25
+
26
+ FRONTIER = load_frontier_module()
27
+
28
+
29
def best_rows(report: dict[str, Any]) -> list[dict[str, Any]]:
    """Collect one summary entry per fixture whose pair evidence passed.

    Only rows whose ``status`` equals ``"pair_evidence_passed"`` are
    considered. For each of them ``FRONTIER.best_pair_evidence`` is asked to
    pick an entry from ``passing_pair_evidence``; rows for which it returns
    ``None`` are dropped.

    Returns:
        A list of dicts, each holding the row's ``fixture`` followed by the
        keys of the selected evidence entry.
    """
    selected: list[dict[str, Any]] = []
    for entry in report.get("rows", []):
        if entry.get("status") == "pair_evidence_passed":
            candidates = entry.get("passing_pair_evidence", [])
            winner = FRONTIER.best_pair_evidence(candidates)
            if winner is not None:
                # Keys from the evidence entry win over "fixture" on collision.
                selected.append({"fixture": entry["fixture"], **winner})
    return selected
39
+
40
+
41
def display_fixture(fixture: str) -> str:
    """Turn a fixture id such as ``F1-some-name`` into ``F1 some name``.

    The text before the first hyphen is kept as-is; every hyphen after it
    becomes a space. Ids with nothing after the first hyphen (or no hyphen at
    all) are returned unchanged.
    """
    head, _sep, tail = fixture.partition("-")
    if not tail:
        return fixture
    spaced_tail = tail.replace("-", " ")
    return f"{head} {spaced_tail}"
44
+
45
+
46
def fmt_margin(value: Any) -> str:
    """Format an integer margin with an explicit sign, else ``"n/a"``.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.
    """
    if isinstance(value, bool) or not isinstance(value, int):
        return "n/a"
    return format(value, "+d")
48
+
49
+
50
def fmt_decimal_margin(value: Any) -> str:
    """Format a numeric margin as a signed two-decimal string, else ``"n/a"``.

    Accepts ints and floats; booleans and every other type yield ``"n/a"``.
    """
    if isinstance(value, bool):
        return "n/a"
    if isinstance(value, (int, float)):
        return format(value, "+.2f")
    return "n/a"
52
+
53
+
54
def fmt_wall(value: Any) -> str:
    """Format a wall-time ratio as ``<two decimals>x``, else ``"n/a"``.

    Accepts ints and floats; booleans and every other type yield ``"n/a"``.
    """
    if isinstance(value, bool):
        return "n/a"
    if isinstance(value, (int, float)):
        return format(value, ".2f") + "x"
    return "n/a"
56
+
57
+
58
def fmt_score(value: Any) -> str:
    """Render an integer score as text, else ``"n/a"``.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.
    """
    if isinstance(value, bool) or not isinstance(value, int):
        return "n/a"
    return str(value)
60
+
61
+
62
def wrap_item(prefix: str, text: str, *, width: int) -> list[str]:
    """Wrap ``text`` behind ``prefix`` with a hanging indent.

    The first line starts with ``prefix``; continuation lines are indented by
    as many spaces as ``prefix`` is long. Words are never split, neither on
    hyphens nor when a single word is longer than ``width``, so such a line
    may exceed ``width``.

    Returns:
        The wrapped lines, or a single line holding the right-stripped prefix
        when ``text`` produces no output.
    """
    wrapper = textwrap.TextWrapper(
        width=width,
        initial_indent=prefix,
        subsequent_indent=" " * len(prefix),
        break_long_words=False,
        break_on_hyphens=False,
    )
    wrapped = wrapper.wrap(text)
    if wrapped:
        return wrapped
    # Empty or whitespace-only text still yields the label line.
    return [prefix.rstrip()]
71
+
72
+
73
def render_text(report: dict[str, Any], *, width: int) -> str:
    """Render the benchmark report as plain text.

    The output has three sections: "Status" and "Pair Lift" (read straight
    from top-level report keys) and "Pair Evidence" (one entry per row
    returned by ``best_rows``). Only the ``run`` and ``triggers`` lines are
    wrapped to ``width``.

    Returns:
        The rendered snapshot, terminated by a single newline.
    """
    evidence = best_rows(report)
    get = report.get

    out: list[str] = [
        "Recent Benchmark Snapshot",
        "=========================",
        "",
    ]
    out += [
        "Status",
        f" Verdict: {get('verdict', 'n/a')}",
        f" Active fixtures: {get('fixtures_total', 'n/a')}",
        f" Rejected controls: {get('rejected_total', 'n/a')}",
        f" Pair evidence rows: {get('pair_evidence_total', 'n/a')}",
        f" Unmeasured candidates: {get('unmeasured_candidate_total', 'n/a')}",
        "",
    ]
    out += [
        "Pair Lift",
        f" Average margin: {fmt_decimal_margin(get('pair_margin_avg'))}",
        f" Minimum margin: {fmt_margin(get('pair_margin_min'))}",
        f" Average wall ratio: {fmt_wall(get('pair_solo_wall_ratio_avg'))}",
        f" Maximum wall ratio: {fmt_wall(get('pair_solo_wall_ratio_max'))}",
        f" Gate: margin >= {fmt_margin(get('min_pair_margin'))}; wall <= {fmt_wall(get('max_pair_solo_wall_ratio'))}",
        "",
    ]
    out.append("Pair Evidence")

    if not evidence:
        out.append(" No passing pair evidence rows found.")

    for entry in evidence:
        bare = fmt_score(entry.get("bare_score"))
        solo = fmt_score(entry.get("solo_score"))
        pair = fmt_score(entry.get("pair_score"))
        margin = fmt_margin(entry.get("pair_margin"))
        wall = fmt_wall(entry.get("pair_solo_wall_ratio"))
        arm = entry.get("pair_arm") or "n/a"
        run_id = str(entry.get("run_id") or "n/a")
        triggers = ", ".join(entry.get("pair_trigger_reasons") or [])

        out.append(f" {display_fixture(entry['fixture'])}")
        out.append(f" scores: bare {bare} | solo_claude {solo} | pair {pair}")
        out.append(f" lift: {margin} | wall {wall} | arm {arm}")
        out.extend(wrap_item(" run: ", run_id, width=width))
        out.extend(wrap_item(" triggers: ", triggers or "n/a", width=width))

    return "\n".join(out) + "\n"
119
+
120
+
121
def render_markdown(report: dict[str, Any], *, width: int) -> str:
    """Render the benchmark report as a Markdown document.

    Mirrors the plain-text snapshot: a "Status" list and a "Pair Lift" list
    read from top-level report keys, then one ``###`` subsection per row
    returned by ``best_rows``. Only the "Triggers" bullet is wrapped to
    ``width``.

    Returns:
        The Markdown text, terminated by a single newline.
    """
    evidence = best_rows(report)
    get = report.get

    doc: list[str] = ["# Recent Benchmark Snapshot", ""]
    doc += [
        "## Status",
        "",
        f"- Verdict: **{get('verdict', 'n/a')}**",
        f"- Active fixtures: {get('fixtures_total', 'n/a')}",
        f"- Rejected controls: {get('rejected_total', 'n/a')}",
        f"- Pair evidence rows: {get('pair_evidence_total', 'n/a')}",
        f"- Unmeasured candidates: {get('unmeasured_candidate_total', 'n/a')}",
        "",
    ]
    doc += [
        "## Pair Lift",
        "",
        f"- Average margin: **{fmt_decimal_margin(get('pair_margin_avg'))}**",
        f"- Minimum margin: **{fmt_margin(get('pair_margin_min'))}**",
        f"- Average wall ratio: {fmt_wall(get('pair_solo_wall_ratio_avg'))}",
        f"- Maximum wall ratio: {fmt_wall(get('pair_solo_wall_ratio_max'))}",
        f"- Gate: margin >= {fmt_margin(get('min_pair_margin'))}; wall <= {fmt_wall(get('max_pair_solo_wall_ratio'))}",
        "",
    ]
    doc += ["## Pair Evidence", ""]

    if not evidence:
        doc.append("No passing pair evidence rows found.")
        return "\n".join(doc) + "\n"

    for entry in evidence:
        bare = fmt_score(entry.get("bare_score"))
        solo = fmt_score(entry.get("solo_score"))
        pair = fmt_score(entry.get("pair_score"))
        margin = fmt_margin(entry.get("pair_margin"))
        wall = fmt_wall(entry.get("pair_solo_wall_ratio"))
        arm = entry.get("pair_arm") or "n/a"
        run_id = entry.get("run_id") or "n/a"
        triggers = ", ".join(entry.get("pair_trigger_reasons") or [])

        doc.append(f"### {display_fixture(entry['fixture'])}")
        doc.append("")
        doc.append(f"- Scores: bare {bare}, solo_claude {solo}, pair {pair}.")
        doc.append(f"- Lift: {margin}; wall {wall}; arm `{arm}`.")
        doc.append(f"- Run: `{run_id}`.")
        doc.extend(wrap_item("- Triggers: ", triggers or "n/a", width=width))
        doc.append("")

    # Each subsection ends with a blank line; trim so the file ends with
    # exactly one newline.
    return "\n".join(doc).rstrip() + "\n"
164
+
165
+
166
def main() -> int:
    """Parse the command line, build the report, and emit the snapshot.

    The plain-text snapshot is always printed to stdout. ``--out-json`` and
    ``--out-md`` additionally write the raw report and the Markdown rendering
    to the given paths, creating parent directories as needed.

    Returns:
        ``0`` on success; ``2`` when ``--max-width`` is below 60 or when
        ``FRONTIER.build_report`` raises ``ValueError``.
    """
    cli = argparse.ArgumentParser()

    # Input locations (argument order is kept so --help output is unchanged).
    path_defaults = (
        ("--fixtures-root", pathlib.Path("benchmark/auto-resolve/fixtures")),
        ("--registry", SCRIPT_DIR / "pair-rejected-fixtures.sh"),
        ("--results-root", pathlib.Path("benchmark/auto-resolve/results")),
    )
    for flag, fallback in path_defaults:
        cli.add_argument(flag, type=pathlib.Path, default=fallback)

    # Optional output files.
    for flag in ("--out-json", "--out-md"):
        cli.add_argument(flag, type=pathlib.Path)

    cli.add_argument(
        "--max-width",
        type=int,
        default=92,
        help="target maximum line width for text and markdown output",
    )
    cli.add_argument(
        "--min-pair-margin",
        type=int,
        default=5,
        help="minimum pair-over-solo margin required to count passing pair evidence",
    )
    cli.add_argument(
        "--max-pair-solo-wall-ratio",
        type=float,
        default=3.0,
        help="maximum pair/solo wall-time ratio allowed to count passing pair evidence",
    )
    opts = cli.parse_args()

    if opts.max_width < 60:
        print("error: --max-width must be >= 60", file=sys.stderr)
        return 2

    try:
        report = FRONTIER.build_report(
            fixtures_root=opts.fixtures_root,
            registry=opts.registry,
            results_root=opts.results_root,
            min_pair_margin=opts.min_pair_margin,
            max_pair_solo_wall_ratio=opts.max_pair_solo_wall_ratio,
        )
    except ValueError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 2

    json_target = opts.out_json
    if json_target is not None:
        json_target.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(report, indent=2) + "\n"
        json_target.write_text(payload, encoding="utf8")

    md_target = opts.out_md
    if md_target is not None:
        md_target.parent.mkdir(parents=True, exist_ok=True)
        markdown = render_markdown(report, width=opts.max_width)
        md_target.write_text(markdown, encoding="utf8")

    print(render_text(report, width=opts.max_width), end="")
    return 0
229
+
230
+
231
+ if __name__ == "__main__":
232
+ sys.exit(main())
@@ -5,8 +5,8 @@
5
5
  # subprocess (isolated session), then captures artifacts + runs verification.
6
6
  #
7
7
  # Usage:
8
- # run-fixture.sh --fixture <FID> --arm <variant|bare> --run-id <ID>
9
- # run-fixture.sh --fixture <FID> --arm <variant|bare> --run-id <ID> --dry-run
8
+ # run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID>
9
+ # run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID> --dry-run
10
10
  #
11
11
  # Outputs to benchmark/auto-resolve/results/<run-id>/<fixture>/<arm>/:
12
12
  # input.md, transcript.txt, diff.patch, changed-files.txt, verify.json,
@@ -19,6 +19,15 @@ usage() {
19
19
  exit 1
20
20
  }
21
21
 
22
# require_value FLAG [VALUE]
# Exit with status 1, printing "<FLAG> requires a value" to stderr, when VALUE
# is missing, empty, or begins with "--" (i.e. looks like another long option).
require_value() {
  local opt_name="$1"
  local opt_value="${2:-}"
  case "$opt_value" in
    ''|--*)
      echo "$opt_name requires a value" >&2
      exit 1
      ;;
  esac
}
30
+
22
31
  kill_worktree_processes() {
23
32
  local work_dir="$1"
24
33
  local signal="$2"
@@ -40,16 +49,16 @@ FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
40
49
  RESOLVE_SKILL="new"
41
50
  while [ $# -gt 0 ]; do
42
51
  case "$1" in
43
- --fixture) FIXTURE="$2"; shift 2;;
44
- --arm) ARM="$2"; shift 2;;
45
- --run-id) RUN_ID="$2"; shift 2;;
46
- --resolve-skill) RESOLVE_SKILL="$2"; shift 2;;
52
+ --fixture) require_value "$1" "${2:-}"; FIXTURE="$2"; shift 2;;
53
+ --arm) require_value "$1" "${2:-}"; ARM="$2"; shift 2;;
54
+ --run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
55
+ --resolve-skill) require_value "$1" "${2:-}"; RESOLVE_SKILL="$2"; shift 2;;
47
56
  --dry-run) DRY_RUN=1; shift;;
48
57
  *) usage;;
49
58
  esac
50
59
  done
51
60
  [ -n "$FIXTURE" ] && [ -n "$ARM" ] && [ -n "$RUN_ID" ] || usage
52
- # iter-0019: original 3 arms — variant (L2-old: Claude orchestrator + Codex BUILD pair via --engine auto),
61
+ # iter-0019/0037: 3 smoke arms — variant (L2: Claude orchestrator + risk-probes pair path),
53
62
  # solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
54
63
  # bare (L0: direct claude -p, no skill, no codex).
55
64
  # iter-0033c (Codex R0-infra adoption, 2026-05-02): two L2 diagnostic arms for /devlyn:resolve —
@@ -99,8 +108,21 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
99
108
  [ -f "$f" ] || { echo "fixture missing required file: $f (see SCHEMA.md)"; exit 1; }
100
109
  done
101
110
 
102
- TIMEOUT=$(python3 -c "import json; print(json.load(open('$META'))['timeout_seconds'])")
103
- if [ "$ARM" = "l2_risk_probes" ]; then
111
+ TIMEOUT=$(python3 - "$META" "$BENCH_ROOT/scripts" <<'PY'
112
+ import pathlib
113
+ import sys
114
+
115
+ sys.path.insert(0, sys.argv[2])
116
+ from pair_evidence_contract import loads_strict_json_object
117
+
118
+ metadata = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
119
+ timeout = metadata.get("timeout_seconds")
120
+ if not isinstance(timeout, int) or isinstance(timeout, bool) or timeout <= 0:
121
+ raise SystemExit("metadata timeout_seconds must be a positive integer")
122
+ print(timeout)
123
+ PY
124
+ )
125
+ if [ "$ARM" = "variant" ] || [ "$ARM" = "l2_risk_probes" ]; then
104
126
  # This arm adds a bounded Codex probe-derive phase before IMPLEMENT and a
105
127
  # bounded Codex pair-JUDGE during VERIFY. The full-pipeline gate still
106
128
  # enforces wall-time efficiency by pair/solo ratio; this budget prevents a
@@ -119,19 +141,18 @@ WORK_DIR="/tmp/bench-${RUN_ID}-${FIXTURE}-${ARM}"
119
141
  rm -rf "$WORK_DIR"
120
142
  cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
121
143
 
122
- # All skill-driven arms (variant / solo_claude / l2_gated / l2_forced) get
123
- # devlyn skills + project CLAUDE.md pre-baseline + codex shim + monitored
124
- # wrapper. Bare gets nothing (no skill, no shim, no env).
144
+ # All skill-driven arms (variant / solo_claude / l2_gated / l2_risk_probes /
145
+ # l2_forced) get devlyn skills + project CLAUDE.md pre-baseline + codex shim
146
+ # + monitored wrapper. Bare gets nothing (no skill, no shim, no env).
125
147
  #
126
148
  # iter-0019: solo_claude (L1) shares variant-arm staging because the L1 arm
127
149
  # runs the same orchestrator on the same skills — only difference is codex
128
150
  # is blocked. Shim catches PATH resolution; wrapper catches direct-path
129
151
  # invocations.
130
- # iter-0033c (Codex R0-infra Q6): l2_gated/l2_forced share variant staging
131
- # (codex unblocked, shim+wrapper routing). Difference vs variant is the
132
- # ENGINE_CLAUSE branch below l2_* run --engine claude (Claude IMPLEMENT)
133
- # while variant uses --engine auto (Codex IMPLEMENT). Pair-mode in
134
- # /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
152
+ # iter-0033c/0037 (Codex R0-infra Q6 + risk probes): pair arms share variant
153
+ # staging (codex unblocked, shim+wrapper routing). The smoke `variant` arm now
154
+ # follows the current measured risk-probes path rather than an older
155
+ # auto-engine implement route.
135
156
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
136
157
  || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
137
158
  mkdir -p "$WORK_DIR/.claude"
@@ -183,7 +204,7 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
183
204
  # the bypass weapon. Across iter-0019 paid 5-fixture run the bypass
184
205
  # was OPEN but never exercised; this closes the surface preemptively
185
206
  # before iter-0020's 9-fixture L0/L1/L2 run.
186
- # iter-0033c (Codex R0-infra Q5): l2_gated/l2_forced are codex-UNBLOCKED
207
+ # iter-0033c/0037 (Codex R0-infra Q5 + risk probes): l2_* arms are codex-UNBLOCKED
187
208
  # (codex must be reachable for VERIFY pair-JUDGE). They take the variant
188
209
  # path: ARM_CODEX_BLOCKED=0 → python writer omits CODEX_BLOCKED from env
189
210
  # entirely (the shim refuses on any non-empty value, so 0 ≠ unset).
@@ -209,11 +230,12 @@ if codex_blocked == "1":
209
230
  # CODEX_BLOCKED enforcement gap.
210
231
  env["CODEX_BLOCKED"] = "1"
211
232
  else:
212
- # variant arm (L2) — codex routes through wrapper as part of pair-mode
213
- # BUILD; both vars are required by the shim/wrapper handshake.
233
+ # variant / pair arms — codex routes through wrapper for risk-probe
234
+ # derivation and VERIFY pair-JUDGE; both vars are required by the
235
+ # shim/wrapper handshake.
214
236
  env["CODEX_REAL_BIN"] = real_bin
215
237
  env["CODEX_MONITORED_PATH"] = monitored
216
- if arm == "l2_risk_probes":
238
+ if arm in ("variant", "l2_risk_probes"):
217
239
  # Risk-probe derivation is a bounded contract-conversion step. A long
218
240
  # Codex run is a harness failure, not useful extra quality signal.
219
241
  env["CODEX_MONITORED_TIMEOUT_SEC"] = "300"
@@ -273,9 +295,12 @@ fi
273
295
  # files. Those commands still run in the post-run verifier below.
274
296
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
275
297
  || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
276
- python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
277
- import json, os, sys
278
- expected = json.load(open(sys.argv[1]))
298
+ python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" "$BENCH_ROOT/scripts" <<'PY'
299
+ import json, os, pathlib, sys
300
+ sys.path.insert(0, sys.argv[3])
301
+ from pair_evidence_contract import loads_strict_json_object
302
+
303
+ expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
279
304
  out_path = sys.argv[2]
280
305
  visible_commands = [
281
306
  cmd for cmd in expected.get("verification_commands", [])
@@ -301,11 +326,11 @@ fi
301
326
  # 2. Spec-mode `/devlyn:resolve --spec <path>` for the rest (post iter-0034
302
327
  # Phase 4 cutover the OLD `/devlyn:auto-resolve` route was deleted).
303
328
  PROMPT_FILE="$RESULT_DIR/input.md"
304
- # Variant uses --engine auto (experimental dual-engine: codex BUILD + claude
305
- # critique pair); solo_claude uses --engine claude explicitly so the orchestrator
306
- # routes every phase to Claude and never tries to invoke codex. The CODEX_BLOCKED
307
- # shim enforces this at the binary layer if the orchestrator misroutes. Both
308
- # arms pass the engine flag explicitly so they survive future runtime-default
329
+ # Variant uses the current measured risk-probes pair path; solo_claude uses
330
+ # --engine claude explicitly so the orchestrator routes every implementation
331
+ # phase to Claude and never tries to invoke codex. The CODEX_BLOCKED shim
332
+ # enforces this at the binary layer if the orchestrator misroutes. Both arms
333
+ # pass the engine flag explicitly so they survive future runtime-default
309
334
  # changes (post iter-0020 close-out: default flipped to claude).
310
335
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
311
336
  || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
@@ -315,8 +340,8 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
315
340
  ENGINE_PROMPT_HINT="Run with \`--engine claude\` for every phase. Codex must not be invoked — the harness has blocked it at the binary layer for this run."
316
341
  ;;
317
342
  variant)
318
- ENGINE_CLAUSE="--engine auto"
319
- ENGINE_PROMPT_HINT="Run with \`--engine auto\` so the experimental dual-engine routing fires (Codex BUILD/FIX, Claude EVAL/CRITIC) do not override it."
343
+ ENGINE_CLAUSE="--engine claude --risk-probes"
344
+ ENGINE_PROMPT_HINT="Run with \`--engine claude --risk-probes\` so the smoke L2 arm uses the current measured pair path: Claude implements, Codex derives bounded visible-verification probes and can act as VERIFY pair-JUDGE."
320
345
  ;;
321
346
  l2_gated)
322
347
  # NEW L2 with natural pair-mode triggers. Claude does IMPLEMENT;
@@ -484,7 +509,7 @@ else
484
509
  # iter-0009 + iter-0019: prepend codex shim PATH for any arm that staged
485
510
  # one. variant routes through codex-monitored.sh; solo_claude refuses on
486
511
  # CODEX_BLOCKED=1; bare has no shim.
487
- # iter-0033c (Codex R0-infra Q6): l2_gated/l2_forced ALSO need the shim
512
+ # iter-0033c/0037 (Codex R0-infra Q6 + risk probes): l2_* arms ALSO need the shim
488
513
  # PATH — they route Claude IMPLEMENT but Codex pair-JUDGE in VERIFY hits
489
514
  # `codex exec` through the wrapper for starvation safety.
490
515
  if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
@@ -652,10 +677,13 @@ fi
652
677
  # Run verification commands + forbidden pattern scan + deps check. Uses
653
678
  # the operator's real HOME (same as the arm saw). Fixtures that need HOME
654
679
  # isolation override it inline per verification command.
655
- python3 - "$EXPECTED" "$RESULT_DIR" "$WORK_DIR" <<'PY'
656
- import json, os, re, subprocess, sys
680
+ python3 - "$EXPECTED" "$RESULT_DIR" "$WORK_DIR" "$BENCH_ROOT/scripts" <<'PY'
681
+ import json, os, pathlib, re, subprocess, sys
682
+
683
+ sys.path.insert(0, sys.argv[4])
684
+ from pair_evidence_contract import loads_strict_json_object
657
685
 
658
- expected = json.load(open(sys.argv[1]))
686
+ expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
659
687
  result_dir = sys.argv[2]
660
688
  work = sys.argv[3]
661
689
 
@@ -771,12 +799,39 @@ for oracle_file in (
771
799
  "oracle-scope-tier-b.json",
772
800
  "oracle-test-fidelity.json",
773
801
  ):
802
+ oracle_path = os.path.join(result_dir, oracle_file)
774
803
  try:
775
- data = json.load(open(os.path.join(result_dir, oracle_file)))
776
- except Exception:
804
+ raw_oracle = loads_strict_json_object(pathlib.Path(oracle_path).read_text())
805
+ except (OSError, ValueError) as exc:
806
+ oracle_name = oracle_file.removesuffix(".json")
807
+ verify["oracle_findings"].append({
808
+ "oracle": oracle_name,
809
+ "type": "oracle-error",
810
+ "severity": "hard",
811
+ "verdict": "Deterministic oracle failed or emitted an invalid artifact",
812
+ "error": f"oracle artifact malformed or unreadable: {exc}",
813
+ })
814
+ verify["oracle_disqualifier"] = True
777
815
  continue
816
+ data = raw_oracle
778
817
  oracle_name = data.get("oracle") or oracle_file.removesuffix(".json")
779
- for finding in data.get("findings", []) or []:
818
+ if not isinstance(oracle_name, str) or not oracle_name:
819
+ oracle_name = oracle_file.removesuffix(".json")
820
+ oracle_error = data.get("error")
821
+ if isinstance(oracle_error, str) and oracle_error:
822
+ verify["oracle_findings"].append({
823
+ "oracle": oracle_name,
824
+ "type": "oracle-error",
825
+ "severity": "hard",
826
+ "verdict": "Deterministic oracle failed or emitted an invalid artifact",
827
+ "error": oracle_error,
828
+ })
829
+ verify["oracle_disqualifier"] = True
830
+ raw_findings = data.get("findings")
831
+ findings = raw_findings if isinstance(raw_findings, list) else []
832
+ for finding in findings:
833
+ if not isinstance(finding, dict):
834
+ continue
780
835
  item = dict(finding)
781
836
  item["oracle"] = oracle_name
782
837
  verify["oracle_findings"].append(item)
@@ -796,12 +851,15 @@ PY
796
851
 
797
852
  # Timing + aggregate
798
853
  export INVOKE_EXIT WATCHDOG_FIRED
799
- python3 - "$RESULT_DIR" "$FIXTURE" "$ARM" "$RUN_ID" "$T_END" "$ELAPSED" "$TIMEOUT" <<'PY'
800
- import json, os, sys
854
+ python3 - "$RESULT_DIR" "$FIXTURE" "$ARM" "$RUN_ID" "$T_END" "$ELAPSED" "$TIMEOUT" "$BENCH_ROOT/scripts" <<'PY'
855
+ import json, os, pathlib, sys
801
856
  result_dir, fixture, arm, run_id = sys.argv[1:5]
802
857
  t_end, elapsed, timeout = int(sys.argv[5]), int(sys.argv[6]), int(sys.argv[7])
803
858
 
804
- timing = json.load(open(os.path.join(result_dir, "timing.json")))
859
+ sys.path.insert(0, sys.argv[8])
860
+ from pair_evidence_contract import loads_strict_json_object
861
+
862
+ timing = loads_strict_json_object(pathlib.Path(result_dir, "timing.json").read_text())
805
863
  timing["end_epoch"] = t_end
806
864
  timing["elapsed_seconds"] = elapsed
807
865
  timing["timeout_seconds"] = timeout
@@ -812,7 +870,10 @@ timing["timeout_seconds"] = timeout
812
870
  timing["timed_out"] = os.environ.get("WATCHDOG_FIRED", "0") == "1"
813
871
  json.dump(timing, open(os.path.join(result_dir, "timing.json"), "w"), indent=2)
814
872
 
815
- verify = json.load(open(os.path.join(result_dir, "verify.json")))
873
+ def as_dict(value):
874
+ return value if isinstance(value, dict) else {}
875
+
876
+ verify = as_dict(loads_strict_json_object(pathlib.Path(result_dir, "verify.json").read_text()))
816
877
  try:
817
878
  with open(os.path.join(result_dir, "diff.patch")) as f: diff_size = len(f.read())
818
879
  except Exception: diff_size = 0
@@ -825,15 +886,21 @@ except Exception:
825
886
  state = {}
826
887
  state_path = os.path.join(result_dir, "run-archive", "pipeline.state.json")
827
888
  if os.path.isfile(state_path):
828
- with open(state_path) as f:
829
- state = json.load(f)
830
- verify_phase = (state.get("phases") or {}).get("verify") or {}
889
+ state = as_dict(loads_strict_json_object(pathlib.Path(state_path).read_text()))
890
+ phases = as_dict(state.get("phases"))
891
+ verify_phase = as_dict(phases.get("verify"))
892
+ legacy_verify = as_dict(state.get("verify"))
831
893
  sub_verdicts = verify_phase.get("sub_verdicts")
832
- pair_trigger = verify_phase.get("pair_trigger") or ((state.get("verify") or {}).get("pair_trigger"))
833
- pair_mode = bool(
834
- isinstance(sub_verdicts, dict)
835
- and (sub_verdicts.get("judge_codex") is not None or sub_verdicts.get("pair_judge") is not None)
836
- ) or bool(verify_phase.get("pair_mode"))
894
+ pair_trigger = verify_phase.get("pair_trigger") or legacy_verify.get("pair_trigger")
895
+ PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}
896
+
897
+ def has_pair_judge_verdict(sub_verdicts):
898
+ return isinstance(sub_verdicts, dict) and (
899
+ sub_verdicts.get("judge_codex") in PAIR_VERDICTS
900
+ or sub_verdicts.get("pair_judge") in PAIR_VERDICTS
901
+ )
902
+
903
+ pair_mode = has_pair_judge_verdict(sub_verdicts) or verify_phase.get("pair_mode") is True
837
904
 
838
905
  invoke_exit = int(os.environ.get("INVOKE_EXIT", "0"))
839
906
  plugin_contamination = False
@@ -893,7 +960,7 @@ result = {
893
960
  "invoke_exit": invoke_exit,
894
961
  "invoke_failure": invoke_failure,
895
962
  "invoke_failure_reason": invoke_failure_reason,
896
- "terminal_verdict": ((state.get("phases") or {}).get("final_report") or {}).get("verdict"),
963
+ "terminal_verdict": as_dict(phases.get("final_report")).get("verdict"),
897
964
  "verify_verdict": verify_phase.get("verdict"),
898
965
  "pair_trigger": pair_trigger,
899
966
  "pair_mode": pair_mode,