devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -14,7 +14,7 @@ anchoring is left-only, which is what we want. Per-oracle convention
14
14
  documented here; step 1's content oracle uses regex instead.
15
15
 
16
16
  Fixtures can waive any Tier A pattern via `expected.json::tier_a_waivers`
17
- (list of fnmatch globs). Load-bearing case: F9 e2e-ideate-to-preflight
17
+ (list of fnmatch globs). Load-bearing case: F9 e2e-ideate-to-resolve
18
18
  legitimately creates docs/VISION.md, docs/ROADMAP.md, docs/roadmap/**.
19
19
 
20
20
  Step 2 scope: findings only. Scoring integration is a later step.
@@ -27,6 +27,8 @@ import pathlib
27
27
  import subprocess
28
28
  import sys
29
29
 
30
+ from pair_evidence_contract import loads_strict_json_object
31
+
30
32
  ORACLE_NAME = "scope-tier-a"
31
33
 
32
34
  # iter-0022: stable category enumeration. See header comment in
@@ -222,27 +224,32 @@ def main():
222
224
 
223
225
  waivers = []
224
226
  fixture_id = None
227
+ expected_error = None
225
228
  if args.expected:
226
229
  exp_path = pathlib.Path(args.expected)
227
230
  # fixture_id = parent directory name of expected.json
228
231
  fixture_id = exp_path.parent.name
229
232
  try:
230
- expected = json.loads(exp_path.read_text())
233
+ expected = loads_strict_json_object(exp_path.read_text())
231
234
  raw = expected.get("tier_a_waivers", [])
232
235
  if isinstance(raw, list):
233
236
  waivers = [w for w in raw if isinstance(w, str)]
234
- except (OSError, json.JSONDecodeError) as e:
237
+ except (OSError, json.JSONDecodeError, ValueError) as e:
238
+ expected_error = f"expected.json unreadable: {e}"
235
239
  sys.stderr.write(
236
240
  f"[oracle-scope-tier-a] could not read waivers from {args.expected}: {e}\n"
237
241
  )
238
242
 
239
243
  findings = analyze(args.work, args.scaffold, waivers, fixture_id=fixture_id)
240
- print(json.dumps({
244
+ report = {
241
245
  "oracle": "scope-tier-a",
242
246
  "waivers": waivers,
243
247
  "fixture_id": fixture_id,
244
248
  "findings": findings,
245
- }, indent=2))
249
+ }
250
+ if expected_error:
251
+ report["error"] = expected_error
252
+ print(json.dumps(report, indent=2))
246
253
 
247
254
 
248
255
  if __name__ == "__main__":
@@ -34,6 +34,8 @@ import re
34
34
  import subprocess
35
35
  import sys
36
36
 
37
+ from pair_evidence_contract import loads_strict_json_object
38
+
37
39
  ORACLE_NAME = "scope-tier-b"
38
40
 
39
41
  # iter-0022: stable category enumeration. tier-b-reachable is `info` severity
@@ -221,8 +223,8 @@ def main():
221
223
  ap.error("--work, --scaffold, and --expected are required unless --list-categories is set")
222
224
 
223
225
  try:
224
- expected = json.loads(pathlib.Path(args.expected).read_text())
225
- except (OSError, json.JSONDecodeError) as e:
226
+ expected = loads_strict_json_object(pathlib.Path(args.expected).read_text())
227
+ except (OSError, json.JSONDecodeError, ValueError) as e:
226
228
  sys.stderr.write(f"[oracle-scope-tier-b] cannot read expected: {e}\n")
227
229
  print(json.dumps({
228
230
  "oracle": "scope-tier-b",
@@ -238,6 +240,27 @@ def main():
238
240
  # fixture_id = parent directory name of expected.json
239
241
  fixture_id = pathlib.Path(args.expected).parent.name
240
242
 
243
+ if not isinstance(tier_c, list) or not all(isinstance(item, str) for item in tier_c):
244
+ print(json.dumps({
245
+ "oracle": "scope-tier-b",
246
+ "trace_method": TRACE_METHOD,
247
+ "tier_c_seeds_matched": [],
248
+ "fixture_id": fixture_id,
249
+ "findings": [],
250
+ "error": "expected.json malformed: spec_output_files must be a string array",
251
+ }, indent=2))
252
+ return
253
+ if not isinstance(waivers, list) or not all(isinstance(item, str) for item in waivers):
254
+ print(json.dumps({
255
+ "oracle": "scope-tier-b",
256
+ "trace_method": TRACE_METHOD,
257
+ "tier_c_seeds_matched": [],
258
+ "fixture_id": fixture_id,
259
+ "findings": [],
260
+ "error": "expected.json malformed: tier_a_waivers must be a string array",
261
+ }, indent=2))
262
+ return
263
+
241
264
  if not tier_c:
242
265
  print(json.dumps({
243
266
  "oracle": "scope-tier-b",
@@ -0,0 +1,469 @@
1
+ #!/usr/bin/env python3
2
+ """Report active pair-candidate fixture frontier.
3
+
4
+ This is a spending guard for solo<pair work. It answers three questions before
5
+ new provider calls:
6
+ - which active fixtures are already rejected by measured headroom/design,
7
+ - which active fixtures remain pair-candidate eligible,
8
+ - which eligible fixtures already have passing full-pipeline pair evidence.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import math
15
+ import pathlib
16
+ import re
17
+ import subprocess
18
+ import sys
19
+ from typing import Any
20
+
21
+ SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
22
+ if str(SCRIPT_DIR) not in sys.path:
23
+ sys.path.insert(0, str(SCRIPT_DIR))
24
+
25
+ from pair_evidence_contract import (
26
+ all_known_pair_trigger_reasons,
27
+ best_pair_evidence,
28
+ has_canonical_pair_trigger_reason,
29
+ has_known_pair_trigger_reason,
30
+ is_strict_number,
31
+ loads_strict_json_object,
32
+ normalize_pair_evidence_row,
33
+ )
34
+
35
+
36
def fixture_short(name: str) -> str:
    """Return the short fixture id: the text before the first ``-``.

    A name that contains no ``-`` is returned unchanged.
    """
    # partition() yields the whole string as its head when the separator is
    # absent, so no separate "no dash" branch is needed.
    return name.partition("-")[0]
38
+
39
+
40
def sort_fixture_key(name: str) -> tuple[int, str]:
    """Build a sort key that orders fixtures numerically by short id.

    The key is ``(number, name)`` where ``number`` comes from a short id of
    the exact form ``F<digits>``. Any other short id gets the rank 10_000.
    The full name is the tie-breaker.
    """
    match = re.fullmatch(r"F(\d+)", fixture_short(name))
    rank = int(match.group(1)) if match else 10_000
    return (rank, name)
44
+
45
+
46
def active_fixtures(fixtures_root: pathlib.Path) -> list[str]:
    """List the active fixture directory names under *fixtures_root*.

    Only immediate subdirectories whose name fully matches ``F<digits>-<rest>``
    are returned, ordered with ``sort_fixture_key``.

    Raises:
        ValueError: if *fixtures_root* is not an existing directory.
    """
    if not fixtures_root.is_dir():
        raise ValueError(f"fixtures root missing: {fixtures_root}")
    names = [
        path.name
        for path in fixtures_root.iterdir()
        if path.is_dir() and re.fullmatch(r"F\d+-.+", path.name)
    ]
    return sorted(names, key=sort_fixture_key)
57
+
58
+
59
def registry_short_ids(registry: pathlib.Path) -> set[str]:
    """Collect the short fixture ids listed in the rejected-fixture registry.

    A line contributes an id when, after optional leading whitespace, it
    starts with a pattern of the form ``F12-*|F12)`` (or the ``S`` equivalent)
    and both halves name the same id.

    Raises:
        ValueError: if *registry* is not a file, or no line yields an id.
    """
    if not registry.is_file():
        raise ValueError(f"rejected fixture registry missing: {registry}")
    # Compiled once; the same pattern is applied to every line.
    entry = re.compile(r"\s*([FS]\d+)-\*\|([FS]\d+)\)")
    rejected: set[str] = set()
    for line in registry.read_text().splitlines():
        match = entry.match(line)
        if match and match.group(1) == match.group(2):
            rejected.add(match.group(1))
    if not rejected:
        raise ValueError(f"rejected fixture registry has no fixture entries: {registry}")
    return rejected
70
+
71
+
72
+ def rejected_reason(registry: pathlib.Path, fixture: str) -> str | None:
73
+ proc = subprocess.run(
74
+ [
75
+ "bash",
76
+ "-c",
77
+ 'source "$1"; rejected_pair_fixture_reason "$2"',
78
+ "bash",
79
+ str(registry),
80
+ fixture,
81
+ ],
82
+ text=True,
83
+ stdout=subprocess.PIPE,
84
+ stderr=subprocess.PIPE,
85
+ check=False,
86
+ )
87
+ if proc.returncode == 0:
88
+ return proc.stdout.strip()
89
+ return None
90
+
91
+
92
def load_json_object(path: pathlib.Path) -> dict[str, Any]:
    """Read *path* and parse its text with ``loads_strict_json_object``.

    Raises:
        ValueError: ``pair evidence artifact malformed: <path>`` when the file
            cannot be read or the parser raises ``ValueError``. The original
            exception is deliberately suppressed (``from None``).
    """
    try:
        text = path.read_text()
        data = loads_strict_json_object(text)
    except (OSError, ValueError):
        # json.JSONDecodeError is a ValueError subclass, so it is covered here.
        raise ValueError(f"pair evidence artifact malformed: {path}") from None
    return data
98
+
99
+
100
def pair_gate_rows(path: pathlib.Path, gate: dict[str, Any]) -> list[dict[str, Any]]:
    """Return ``gate["rows"]`` after checking it is a non-empty list of dicts.

    Args:
        path: Location of the gate artifact; used only in the error message.
        gate: Parsed gate artifact.

    Raises:
        ValueError: if ``rows`` is missing, not a list, empty, or contains a
            non-dict element.
    """
    rows = gate.get("rows")
    if (
        not isinstance(rows, list)
        or not rows
        or not all(isinstance(row, dict) for row in rows)
    ):
        raise ValueError(f"pair evidence artifact rows malformed: {path}")
    return rows
107
+
108
+
109
def pair_result_trigger_reasons(
    results_root: pathlib.Path,
    *,
    run_id: str,
    fixture: str,
    pair_arm: str,
) -> list[str]:
    """Read pair-trigger reasons from a pair arm's ``result.json``.

    The file is looked up at
    ``<results_root>/<run_id>/<fixture>/<pair_arm>/result.json``.

    Returns:
        ``pair_trigger.reasons`` when it is a non-empty list of strings that
        passes ``has_known_pair_trigger_reason``,
        ``all_known_pair_trigger_reasons`` and
        ``has_canonical_pair_trigger_reason``. Otherwise an empty list,
        including when the file is unreadable or does not parse.
    """
    path = results_root / run_id / fixture / pair_arm / "result.json"
    try:
        text = path.read_text()
        result = loads_strict_json_object(text)
    except (OSError, ValueError):
        # Best-effort lookup: a missing or malformed result means "no reasons".
        # json.JSONDecodeError is a ValueError subclass, so it is covered here.
        return []

    trigger = result.get("pair_trigger")
    if not isinstance(trigger, dict):
        return []

    reasons = trigger.get("reasons")
    if not isinstance(reasons, list) or not reasons:
        return []
    if not all(isinstance(reason, str) for reason in reasons):
        return []
    if not (
        has_known_pair_trigger_reason(reasons)
        and all_known_pair_trigger_reasons(reasons)
        and has_canonical_pair_trigger_reason(reasons)
    ):
        return []
    return reasons
135
+
136
+
137
def passing_pair_evidence(
    results_root: pathlib.Path,
    *,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float,
) -> dict[str, list[dict[str, Any]]]:
    """Collect passing full-pipeline pair evidence, grouped by fixture name.

    Scans ``<results_root>/*/full-pipeline-pair-gate.json`` in sorted order.
    Only gates whose ``verdict`` is ``"PASS"`` are used, and within them only
    rows whose ``status`` is ``"PASS"`` and whose ``fixture`` is a string.

    Args:
        results_root: Directory holding one subdirectory per run.
        min_pair_margin: Rows with a smaller ``pair_margin`` are dropped.
        max_pair_solo_wall_ratio: Rows with a larger ``pair_solo_wall_ratio``
            are dropped.

    Returns:
        Mapping of fixture name to its normalized evidence rows. Empty when
        *results_root* is not a directory.

    Raises:
        ValueError: propagated from ``load_json_object`` / ``pair_gate_rows``
            when a gate artifact is malformed.
    """
    evidence: dict[str, list[dict[str, Any]]] = {}
    if not results_root.is_dir():
        return evidence

    for gate_path in sorted(results_root.glob("*/full-pipeline-pair-gate.json")):
        gate = load_json_object(gate_path)
        if gate.get("verdict") != "PASS":
            continue
        # Fall back to the run directory name when the gate carries no run_id.
        run_id = str(gate.get("run_id") or gate_path.parent.name)
        pair_arm = gate.get("pair_arm")

        for row in pair_gate_rows(gate_path, gate):
            if row.get("status") != "PASS":
                continue
            fixture = row.get("fixture")
            if not isinstance(fixture, str):
                continue

            candidate_row = row
            if row.get("pair_trigger_reasons") is None and isinstance(pair_arm, str):
                # The gate row has no trigger reasons; try to recover them
                # from the pair arm's own result.json.
                reasons = pair_result_trigger_reasons(
                    results_root,
                    run_id=run_id,
                    fixture=fixture,
                    pair_arm=pair_arm,
                )
                if reasons:
                    # Copy so the parsed gate row itself is left untouched.
                    candidate_row = dict(row)
                    candidate_row["pair_trigger_reasons"] = reasons
                    candidate_row["pair_trigger_has_canonical_reason"] = True

            evidence_row = normalize_pair_evidence_row(
                fixture=fixture,
                run_id=run_id,
                pair_arm=pair_arm,
                row=candidate_row,
            )
            if evidence_row is None:
                continue

            pair_margin = evidence_row["pair_margin"]
            wall_ratio = evidence_row["pair_solo_wall_ratio"]
            if pair_margin < min_pair_margin or wall_ratio > max_pair_solo_wall_ratio:
                continue
            evidence.setdefault(fixture, []).append(evidence_row)
    return evidence
184
+
185
+
186
def build_report(
    *,
    fixtures_root: pathlib.Path,
    registry: pathlib.Path,
    results_root: pathlib.Path,
    min_pair_margin: int = 5,
    max_pair_solo_wall_ratio: float = 3.0,
) -> dict[str, Any]:
    """Build the pair-candidate frontier report.

    Every active fixture gets one row with a status:

    - ``"rejected"``: its short id is in the registry and the registry script
      returns a non-empty reason;
    - ``"pair_evidence_passed"``: not rejected and at least one passing
      evidence row exists for it;
    - ``"candidate_unmeasured"``: neither of the above.

    Args:
        fixtures_root: Directory containing the active fixture directories.
        registry: Rejected-fixture registry shell script.
        results_root: Directory containing benchmark run results.
        min_pair_margin: Minimum pair margin for evidence to count.
        max_pair_solo_wall_ratio: Maximum pair/solo wall ratio for evidence
            to count.

    Returns:
        A dict with the overall ``verdict`` (``"PASS"`` only when no candidate
        is unmeasured), the thresholds, the per-status totals, aggregate
        margin and wall-ratio statistics over each fixture's best evidence,
        and the per-fixture ``rows``.

    Raises:
        ValueError: propagated from the fixture, registry, and evidence
            loaders when their inputs are missing or malformed.
    """
    fixtures = active_fixtures(fixtures_root)
    rejected_short = registry_short_ids(registry)
    evidence_by_fixture = passing_pair_evidence(
        results_root,
        min_pair_margin=min_pair_margin,
        max_pair_solo_wall_ratio=max_pair_solo_wall_ratio,
    )

    rows: list[dict[str, Any]] = []
    for fixture in fixtures:
        short_id = fixture_short(fixture)
        # The shell lookup is only made for ids the registry file mentions.
        reason = rejected_reason(registry, fixture) if short_id in rejected_short else None
        evidence = evidence_by_fixture.get(fixture, [])
        if reason:
            status = "rejected"
        elif evidence:
            status = "pair_evidence_passed"
        else:
            status = "candidate_unmeasured"
        rows.append(
            {
                "fixture": fixture,
                "short_id": short_id,
                "status": status,
                "rejected_reason": reason,
                "passing_pair_evidence": evidence,
            }
        )

    rejected_total = sum(1 for row in rows if row["status"] == "rejected")
    candidate_total = sum(1 for row in rows if row["status"] != "rejected")
    pair_evidence_total = sum(
        1 for row in rows if row["status"] == "pair_evidence_passed"
    )
    unmeasured_candidate_total = sum(
        1 for row in rows if row["status"] == "candidate_unmeasured"
    )

    # One "best" evidence row per fixture that has passing evidence.
    best_pairs: list[dict[str, Any]] = []
    for row in rows:
        if row["status"] != "pair_evidence_passed":
            continue
        best = best_pair_evidence(row["passing_pair_evidence"])
        if best is not None:
            best_pairs.append(best)

    pair_margins = [
        item["pair_margin"]
        for item in best_pairs
        if isinstance(item.get("pair_margin"), int)
    ]
    wall_ratios = [
        item["pair_solo_wall_ratio"]
        for item in best_pairs
        if is_strict_number(item.get("pair_solo_wall_ratio"))
    ]

    return {
        "verdict": "PASS" if unmeasured_candidate_total == 0 else "FAIL",
        "min_pair_margin": min_pair_margin,
        "max_pair_solo_wall_ratio": max_pair_solo_wall_ratio,
        "fixtures_total": len(rows),
        "rejected_total": rejected_total,
        "candidate_total": candidate_total,
        "pair_evidence_total": pair_evidence_total,
        "unmeasured_candidate_total": unmeasured_candidate_total,
        # The *_count keys repeat the *_total values under a second name.
        # NOTE(review): presumably kept for older consumers; confirm.
        "rejected_count": rejected_total,
        "candidate_count": candidate_total,
        "pair_evidence_count": pair_evidence_total,
        "unmeasured_count": unmeasured_candidate_total,
        "pair_margin_avg": average(pair_margins),
        "pair_margin_min": min(pair_margins) if pair_margins else None,
        "pair_solo_wall_ratio_avg": average(wall_ratios),
        "pair_solo_wall_ratio_max": round(max(wall_ratios), 2) if wall_ratios else None,
        "rows": rows,
    }
267
+
268
+
269
def write_markdown(path: pathlib.Path, report: dict[str, Any]) -> None:
    """Write ``report`` to ``path`` as a Markdown summary followed by a table.

    The summary lines read the aggregate keys of ``report``. One table row is
    produced per entry of ``report["rows"]``, filled from the entry that
    ``best_pair_evidence`` selects for that row; cells are left blank when no
    such entry exists.
    """
    summary = [
        "# Pair Candidate Frontier",
        "",
        f"Active fixtures: {report['fixtures_total']}",
        f"Verdict: {report['verdict']}",
        f"Rejected fixtures: {report['rejected_total']}",
        f"Candidate fixtures: {report['candidate_total']}",
        f"Candidates with passing pair evidence: {report['pair_evidence_total']}",
        f"Unmeasured candidates: {report['unmeasured_candidate_total']}",
        f"Minimum pair margin required: {format_margin(report.get('min_pair_margin'))}",
        f"Maximum pair/solo wall ratio allowed: {format_wall_ratio(report.get('max_pair_solo_wall_ratio'))}",
        f"Average pair margin: {format_decimal_margin(report.get('pair_margin_avg'))}",
        f"Minimum pair margin: {format_margin(report.get('pair_margin_min'))}",
        f"Average pair/solo wall ratio: {format_wall_ratio(report.get('pair_solo_wall_ratio_avg'))}",
        f"Maximum pair/solo wall ratio: {format_wall_ratio(report.get('pair_solo_wall_ratio_max'))}",
        "",
        "| Fixture | Status | Verdict | Evidence | Pair arm | Triggers | Hypothesis trigger | Bare | Solo_claude | Pair | Margin | Wall ratio | Rejected reason |",
        "|---|---|---|---|---|---|---|---:|---:|---:|---:|---:|---|",
    ]

    table = []
    for entry in report["rows"]:
        best = best_pair_evidence(entry["passing_pair_evidence"])
        # An absent (or empty) best entry behaves like a dict with no keys.
        fields = best if best else {}
        cells = [
            entry["fixture"],
            entry["status"],
            entry["status"],  # the Verdict column repeats the row status
            fields.get("run_id", ""),
            fields.get("pair_arm", ""),
            format_trigger_reasons(best.get("pair_trigger_reasons")) if best else "",
            format_bool(fields.get("pair_trigger_has_hypothesis_reason")),
            format_number(fields.get("bare_score")),
            format_number(fields.get("solo_score")),
            format_number(fields.get("pair_score")),
            format_margin(fields.get("pair_margin")),
            format_wall_ratio(fields.get("pair_solo_wall_ratio")),
            entry.get("rejected_reason") or "",
        ]
        table.append("| " + " | ".join(f"{cell}" for cell in cells) + " |")

    document = "\n".join(summary + table) + "\n"
    path.write_text(document, encoding="utf8")
306
+
307
+
308
+ def average(values: list[int | float]) -> float | None:
309
+ return round(sum(values) / len(values), 2) if values else None
310
+
311
+
312
def format_number(value: Any) -> str:
    """Return ``value`` as text when it is a plain integer, else an empty string.

    Booleans are rejected explicitly: ``bool`` is a subclass of ``int``, so a
    bare ``isinstance(value, int)`` check would render ``True`` as "True" in a
    numeric column.
    """
    if isinstance(value, bool) or not isinstance(value, int):
        return ""
    return str(value)
314
+
315
+
316
def format_decimal_margin(value: Any) -> str:
    """Return ``value`` as a signed two-decimal string, or "" for non-numbers.

    Booleans are rejected explicitly: ``bool`` is a subclass of ``int``, so
    without the guard ``True`` would be rendered as "+1.00".
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return ""
    return f"{value:+.2f}"
318
+
319
+
320
def format_margin(value: Any) -> str:
    """Return ``value`` as an explicitly signed integer string, or "" otherwise.

    Booleans are rejected explicitly: ``bool`` is a subclass of ``int``, so
    without the guard ``True`` would be rendered as "+1".
    """
    if isinstance(value, bool) or not isinstance(value, int):
        return ""
    return f"{value:+d}"
322
+
323
+
324
def format_wall_ratio(value: Any) -> str:
    """Return ``value`` as a two-decimal ratio with an ``x`` suffix.

    Values that ``is_strict_number`` does not accept produce an empty string.
    """
    if not is_strict_number(value):
        return ""
    return f"{value:.2f}x"
326
+
327
+
328
def format_trigger_reasons(value: Any) -> str:
    """Join a list of string reasons with commas.

    Anything that is not a list made up entirely of strings yields "".
    """
    if not isinstance(value, list):
        return ""
    for reason in value:
        if not isinstance(reason, str):
            return ""
    return ",".join(value)
332
+
333
+
334
def format_bool(value: Any) -> str:
    """Return "true" or "false" for a real boolean, and "" for anything else."""
    if value is True:
        return "true"
    if value is False:
        return "false"
    return ""
336
+
337
+
338
def print_summary(report: dict[str, Any]) -> None:
    """Print a compact text summary of ``report`` to stdout.

    Always prints the totals line. When ``pair_evidence_total`` is truthy it
    also prints the aggregate margin/wall line and one line per row whose
    status is ``pair_evidence_passed`` and for which ``best_pair_evidence``
    returns a truthy entry.
    """
    print(
        f"fixtures={report['fixtures_total']} rejected={report['rejected_total']} "
        f"candidates={report['candidate_total']} pair_evidence={report['pair_evidence_total']} "
        f"unmeasured={report['unmeasured_candidate_total']} verdict={report['verdict']}"
    )
    if not report.get("pair_evidence_total"):
        return

    margin_avg = format_decimal_margin(report.get("pair_margin_avg"))
    margin_min = format_margin(report.get("pair_margin_min"))
    wall_avg = format_wall_ratio(report.get("pair_solo_wall_ratio_avg"))
    wall_max = format_wall_ratio(report.get("pair_solo_wall_ratio_max"))
    print(
        f"pair_margin_avg={margin_avg} pair_margin_min={margin_min} "
        f"wall_avg={wall_avg} wall_max={wall_max}"
    )

    for entry in report["rows"]:
        # Only rows that passed are looked up; other rows are skipped untouched.
        best = (
            best_pair_evidence(entry["passing_pair_evidence"])
            if entry["status"] == "pair_evidence_passed"
            else None
        )
        if not best:
            continue
        bare = format_number(best.get("bare_score"))
        solo = format_number(best.get("solo_score"))
        pair = format_number(best.get("pair_score"))
        arm = best.get("pair_arm") or ""
        margin = format_margin(best.get("pair_margin"))
        wall = format_wall_ratio(best.get("pair_solo_wall_ratio"))
        run = best.get("run_id") or ""
        triggers = format_trigger_reasons(best.get("pair_trigger_reasons"))
        hypothesis = format_bool(best.get("pair_trigger_has_hypothesis_reason"))
        print(
            f"{entry['fixture']}: bare={bare} solo_claude={solo} pair={pair} arm={arm} margin={margin} "
            f"wall={wall} run={run} verdict=pair_evidence_passed triggers={triggers} "
            f"hypothesis_trigger={hypothesis}"
        )
376
+
377
+
378
def print_final_verdict(report: dict[str, Any]) -> None:
    """Print the final PASS/FAIL line for the frontier check to stdout.

    Anything other than a ``verdict`` of exactly "PASS" is reported as FAIL.
    """
    passed = report.get("verdict") == "PASS"
    message = "PASS pair-candidate-frontier" if passed else "FAIL pair-candidate-frontier"
    print(message, flush=True)
383
+
384
+
385
def main() -> int:
    """Parse CLI arguments, build the frontier report, emit it, and return an exit code.

    Returns:
        2 when a threshold argument is invalid or ``build_report`` raises
        ``ValueError``; 1 when ``--fail-on-unmeasured`` is set and unmeasured
        candidates remain; 0 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--fixtures-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/fixtures"),
    )
    parser.add_argument(
        "--registry",
        type=pathlib.Path,
        default=pathlib.Path(__file__).with_name("pair-rejected-fixtures.sh"),
    )
    parser.add_argument(
        "--results-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/results"),
    )
    parser.add_argument("--out-json", type=pathlib.Path)
    parser.add_argument("--out-md", type=pathlib.Path)
    parser.add_argument(
        "--fail-on-unmeasured",
        action="store_true",
        help="exit 1 when active candidate_unmeasured fixtures remain",
    )
    parser.add_argument(
        "--min-pair-margin",
        type=int,
        default=5,
        help="minimum pair-over-solo margin required to count passing pair evidence",
    )
    parser.add_argument(
        "--max-pair-solo-wall-ratio",
        type=float,
        default=3.0,
        help="maximum pair/solo wall-time ratio allowed to count passing pair evidence",
    )
    args = parser.parse_args()

    # Validate thresholds before doing any work.
    margin_floor = args.min_pair_margin
    if margin_floor < 1:
        print("error: --min-pair-margin must be >= 1", file=sys.stderr)
        return 2
    wall_cap = args.max_pair_solo_wall_ratio
    if wall_cap <= 0 or not math.isfinite(wall_cap):
        print("error: --max-pair-solo-wall-ratio must be finite and > 0", file=sys.stderr)
        return 2

    try:
        report = build_report(
            fixtures_root=args.fixtures_root,
            registry=args.registry,
            results_root=args.results_root,
            min_pair_margin=margin_floor,
            max_pair_solo_wall_ratio=wall_cap,
        )
    except ValueError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 2

    json_target = args.out_json
    md_target = args.out_md
    # With no output file requested, the full JSON report goes to stdout.
    stdout_only = not (json_target or md_target)

    if json_target:
        json_target.parent.mkdir(parents=True, exist_ok=True)
        json_target.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
    if md_target:
        md_target.parent.mkdir(parents=True, exist_ok=True)
        write_markdown(md_target, report)

    if stdout_only:
        print(json.dumps(report, indent=2))
    else:
        print_summary(report)
    print_final_verdict(report)

    if not (args.fail_on_unmeasured and report["unmeasured_candidate_total"] > 0):
        return 0

    pending = [
        entry["fixture"]
        for entry in report["rows"]
        if entry["status"] == "candidate_unmeasured"
    ]
    print(
        "unmeasured candidate fixture(s): " + ", ".join(pending),
        file=sys.stderr,
    )
    if stdout_only:
        print("FAIL pair-candidate-frontier", file=sys.stderr, flush=True)
    return 1
466
+
467
+
468
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -31,6 +31,8 @@ import re
31
31
  import subprocess
32
32
  import sys
33
33
 
34
+ from pair_evidence_contract import loads_strict_json_object
35
+
34
36
  ORACLE_SCRIPTS = {
35
37
  "test-fidelity": "oracle-test-fidelity.py",
36
38
  "scope-tier-a": "oracle-scope-tier-a.py",
@@ -157,7 +159,7 @@ def list_oracle_categories(scripts_dir, oracle_name):
157
159
  text=True,
158
160
  check=True,
159
161
  )
160
- payload = json.loads(r.stdout)
162
+ payload = loads_strict_json_object(r.stdout)
161
163
  if payload.get("oracle") != oracle_name:
162
164
  raise ValueError(
163
165
  f"oracle name mismatch: expected {oracle_name}, got {payload.get('oracle')}"
@@ -173,10 +175,8 @@ def build_registry(fixture_dir, scripts_dir, generated_at, repo_root):
173
175
  expected_path = fixture_dir / "expected.json"
174
176
  metadata_path = fixture_dir / "metadata.json"
175
177
 
176
- with open(expected_path, "r", encoding="utf-8") as f:
177
- expected = json.load(f)
178
- with open(metadata_path, "r", encoding="utf-8") as f:
179
- metadata = json.load(f)
178
+ expected = loads_strict_json_object(expected_path.read_text(encoding="utf-8"))
179
+ metadata = loads_strict_json_object(metadata_path.read_text(encoding="utf-8"))
180
180
 
181
181
  fixture_id = metadata.get("id") or fixture_dir.name
182
182
 
@@ -25,6 +25,8 @@ import json
25
25
  import pathlib
26
26
  import sys
27
27
 
28
+ from pair_evidence_contract import reject_json_constant
29
+
28
30
  SCHEMA_VERSION = "1"
29
31
  AUTHORITY_ORDER_CANONICAL = [
30
32
  "spec.md",
@@ -67,7 +69,11 @@ def _strict_pairs(pairs):
67
69
 
68
70
  def load_strict_json(path):
69
71
  with open(path, "r", encoding="utf-8") as f:
70
- return json.load(f, object_pairs_hook=_strict_pairs)
72
+ return json.load(
73
+ f,
74
+ object_pairs_hook=_strict_pairs,
75
+ parse_constant=reject_json_constant,
76
+ )
71
77
 
72
78
 
73
79
  # ---------------------------------------------------------------------------
@@ -396,7 +402,8 @@ def lint(plan_path, registry_override=None):
396
402
  return {"ok": False, "errors": [{"code": "plan_invalid_json",
397
403
  "message": f"plan parse error: {e}"}]}
398
404
  except ValueError as e:
399
- return {"ok": False, "errors": [{"code": "plan_duplicate_keys",
405
+ code = "plan_duplicate_keys" if "duplicate key" in str(e) else "plan_invalid_json"
406
+ return {"ok": False, "errors": [{"code": code,
400
407
  "message": str(e)}]}
401
408
  except FileNotFoundError:
402
409
  return {"ok": False, "errors": [{"code": "plan_not_found",