devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -27,12 +27,12 @@ Gates per iter-0033c §"Acceptance gate":
27
27
  6 trigger discipline (fixture-level): for each pair-eligible fixture, if
28
28
  l2_forced lifts ≥ +5 OR catches categorical rescue, AND forced is not
29
29
  impl-confounded, AND forced.pair_judge present → l2_gated MUST also have
30
- pair_judge present on that fixture.
30
+ recognized pair_judge verdict on that fixture.
31
31
  7 attribution (4-class, data-only): per-fixture classify into
32
32
  {no_material_lift, implementation_confounded, tool_or_trigger_lift,
33
33
  deliberation_lift}. Reporting only; not pass/fail.
34
- 8 artifact contract: pair_judge non-null for every fixture where pair fired;
35
- pair findings distinguishable from solo judge findings.
34
+ 8 artifact contract: recognized pair_judge verdict for every fixture where
35
+ pair fired; pair findings distinguishable from solo judge findings.
36
36
 
37
37
  Ship-blockers: 1a, 1b, 1c, 2, 3, 4, 6.
38
38
  Quality gates: 5, 8 (failure → root-cause iter; Phase 4 holds).
@@ -43,19 +43,55 @@ import json
43
43
  import sys
44
44
  from pathlib import Path
45
45
 
46
+ SCRIPT_DIR = Path(__file__).resolve().parent
47
+ if str(SCRIPT_DIR) not in sys.path:
48
+ sys.path.insert(0, str(SCRIPT_DIR))
49
+
50
+ from pair_evidence_contract import (
51
+ is_score,
52
+ is_strict_number,
53
+ loads_strict_json_object,
54
+ reject_json_constant,
55
+ )
56
+
57
+ PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}
58
+
59
+
60
def exact_bool(value: object) -> bool | None:
    """Return ``value`` unchanged when it is a genuine ``bool``, else ``None``.

    No coercion is performed: stand-ins such as ``1``, ``0`` or ``"true"``
    are not booleans and therefore produce ``None``.
    """
    if isinstance(value, bool):
        return value
    return None
62
+
63
+
64
def bool_flag(value: object, *, default: bool = False) -> bool:
    """Interpret a loosely typed flag value as a strict boolean.

    * ``None`` (flag absent) yields ``default``.
    * A genuine ``bool`` is returned as-is.
    * Any other value, whatever its truthiness, is reported as ``True``.
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    # Not None and not a real bool: treat the flag as set.
    return True
69
+
70
+
71
def is_pair_judge_verdict(value: object) -> bool:
    """Return True only when ``value`` is one of the strings in ``PAIR_VERDICTS``.

    Callers may pass values taken directly from parsed JSON, so ``value`` can
    be of any JSON type. Non-strings are rejected before the set lookup: a
    list or dict is unhashable, and a bare ``value in PAIR_VERDICTS`` would
    raise ``TypeError`` for it instead of answering False.
    """
    return isinstance(value, str) and value in PAIR_VERDICTS
73
+
46
74
 
47
75
  def load_judge(results_dir: Path, fixture: str) -> dict | None:
48
76
  p = results_dir / fixture / "judge.json"
49
77
  if not p.is_file():
50
78
  return None
51
- return json.loads(p.read_text())
79
+ try:
80
+ data = loads_strict_json_object(p.read_text())
81
+ except (ValueError, json.JSONDecodeError):
82
+ return None
83
+ return data
52
84
 
53
85
 
54
86
  def load_result(results_dir: Path, fixture: str, arm: str) -> dict | None:
55
87
  p = results_dir / fixture / arm / "result.json"
56
88
  if not p.is_file():
57
89
  return None
58
- return json.loads(p.read_text())
90
+ try:
91
+ data = loads_strict_json_object(p.read_text())
92
+ except (ValueError, json.JSONDecodeError):
93
+ return None
94
+ return data
59
95
 
60
96
 
61
97
  def load_state(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> dict | None:
@@ -67,7 +103,11 @@ def load_state(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> dict
67
103
  candidates = sorted(runs.glob("*/pipeline.state.json"))
68
104
  if not candidates:
69
105
  return None
70
- return json.loads(candidates[-1].read_text())
106
+ try:
107
+ data = loads_strict_json_object(candidates[-1].read_text())
108
+ except (ValueError, json.JSONDecodeError):
109
+ return None
110
+ return data
71
111
 
72
112
 
73
113
  def archive_run_dir(work_dir_root: Path, run_id: str, fixture: str, arm: str) -> Path | None:
@@ -154,19 +194,24 @@ def find_results_dir_fixtures(results_dir: Path) -> list[str]:
154
194
 
155
195
 
156
196
  def get_score(judge: dict, arm: str) -> int | None:
157
- """Score for a given arm. Prefer judge.json's `scores_by_arm` (already
158
- arm-keyed); fall back to blind A/B/C lookup with case-correct `<letter>_score`
159
- field (judge.sh writes a_score/b_score lowercase, not A_score)."""
197
+ """Score for a given arm, only when the arm is present in `_blind_mapping`.
198
+
199
+ `scores_by_arm` is accepted only as a decoded view of the blind A/B/C slots;
200
+ a score for an arm absent from the blind mapping is not score evidence.
201
+ """
160
202
  if not judge:
161
203
  return None
162
- sba = judge.get("scores_by_arm") or {}
163
- if arm in sba:
164
- return sba[arm]
165
- mapping = judge.get("_blind_mapping") or {}
204
+ raw_mapping = judge.get("_blind_mapping")
205
+ mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
166
206
  letter = next((k for k, v in mapping.items() if v == arm), None)
167
207
  if not letter:
168
208
  return None
169
- return judge.get(f"{letter.lower()}_score")
209
+ raw_scores = judge.get("scores_by_arm")
210
+ sba = raw_scores if isinstance(raw_scores, dict) else {}
211
+ if is_score(sba.get(arm)):
212
+ return sba[arm]
213
+ legacy = judge.get(f"{letter.lower()}_score")
214
+ return legacy if is_score(legacy) else None
170
215
 
171
216
 
172
217
  def get_disqualifier(judge: dict, arm: str) -> bool:
@@ -174,15 +219,31 @@ def get_disqualifier(judge: dict, arm: str) -> bool:
174
219
  line 314-323; fall back to blind A/B/C with case-correct letter."""
175
220
  if not judge:
176
221
  return False
177
- dba = judge.get("disqualifiers_by_arm") or {}
178
- if arm in dba:
179
- return bool(dba[arm].get("disqualifier", False))
180
- dqs = judge.get("disqualifiers") or {}
181
- mapping = judge.get("_blind_mapping") or {}
222
+ raw_mapping = judge.get("_blind_mapping")
223
+ mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
182
224
  letter = next((k for k, v in mapping.items() if v == arm), None)
183
225
  if not letter:
184
- return False
185
- return bool(dqs.get(letter, False))
226
+ raw_scores = judge.get("scores_by_arm")
227
+ sba = raw_scores if isinstance(raw_scores, dict) else {}
228
+ raw_dba = judge.get("disqualifiers_by_arm")
229
+ if raw_dba is not None and not isinstance(raw_dba, dict):
230
+ return True
231
+ dba = raw_dba if isinstance(raw_dba, dict) else {}
232
+ return arm in sba or arm in dba
233
+ raw_dba = judge.get("disqualifiers_by_arm")
234
+ if raw_dba is not None and not isinstance(raw_dba, dict):
235
+ return True
236
+ dba = raw_dba if isinstance(raw_dba, dict) else {}
237
+ if arm in dba:
238
+ entry = dba[arm]
239
+ return bool_flag(
240
+ entry.get("disqualifier") if isinstance(entry, dict) else entry
241
+ )
242
+ raw_dqs = judge.get("disqualifiers")
243
+ if raw_dqs is not None and not isinstance(raw_dqs, dict):
244
+ return True
245
+ dqs = raw_dqs if isinstance(raw_dqs, dict) else {}
246
+ return bool_flag(dqs.get(letter))
186
247
 
187
248
 
188
249
  def gate_2_no_regression(rows: list[dict]) -> dict:
@@ -268,9 +329,11 @@ def load_mechanical_findings(work_dir_root: Path, run_id: str, fixture: str, arm
268
329
  if not ln:
269
330
  continue
270
331
  try:
271
- out.append(json.loads(ln))
332
+ parsed = json.loads(ln, parse_constant=reject_json_constant)
272
333
  except json.JSONDecodeError:
273
334
  continue
335
+ if isinstance(parsed, dict):
336
+ out.append(parsed)
274
337
  return out
275
338
 
276
339
 
@@ -401,7 +464,7 @@ def gate_8_artifact_contract(rows: list[dict]) -> dict:
401
464
  return {
402
465
  "gate": "8-artifact-contract",
403
466
  "status": "PASS" if not failures else "FAIL",
404
- "rule": "pair_judge non-null when fired; pair findings distinguishable from solo",
467
+ "rule": "recognized pair_judge verdict when fired; pair findings distinguishable from solo",
405
468
  "failures": failures,
406
469
  }
407
470
 
@@ -418,12 +481,18 @@ def build_rows(results_dir: Path, work_dir_root: Path, run_id: str) -> list[dict
418
481
  forced_state = load_state(work_dir_root, run_id, fx, "l2_forced")
419
482
 
420
483
  def pair_judge_present(state: dict | None) -> bool:
421
- if not state:
484
+ if not isinstance(state, dict):
485
+ return False
486
+ phases = state.get("phases")
487
+ if not isinstance(phases, dict):
488
+ return False
489
+ verify = phases.get("verify")
490
+ if not isinstance(verify, dict):
491
+ return False
492
+ sub = verify.get("sub_verdicts")
493
+ if not isinstance(sub, dict):
422
494
  return False
423
- phases = state.get("phases") or {}
424
- verify = phases.get("verify") or {}
425
- sub = verify.get("sub_verdicts") or {}
426
- return sub.get("pair_judge") is not None
495
+ return is_pair_judge_verdict(sub.get("pair_judge"))
427
496
 
428
497
  # Pair findings distinguishability — checked from archive of whichever
429
498
  # arm fired pair-mode. l2_forced always fires (when present); l2_gated
@@ -444,10 +513,10 @@ def build_rows(results_dir: Path, work_dir_root: Path, run_id: str) -> list[dict
444
513
  "solo_dq": get_disqualifier(judge, "solo_claude"),
445
514
  "l2_gated_dq": get_disqualifier(judge, "l2_gated"),
446
515
  "l2_forced_dq": get_disqualifier(judge, "l2_forced"),
447
- "solo_wall": (solo_r or {}).get("elapsed_seconds"),
448
- "l2_gated_wall": (gated_r or {}).get("elapsed_seconds"),
449
- "solo_timeout": bool((solo_r or {}).get("timed_out")),
450
- "l2_gated_timeout": bool((gated_r or {}).get("timed_out")),
516
+ "solo_wall": strict_elapsed_seconds(solo_r),
517
+ "l2_gated_wall": strict_elapsed_seconds(gated_r),
518
+ "solo_timeout": timeout_flag(solo_r),
519
+ "l2_gated_timeout": timeout_flag(gated_r),
451
520
  "l2_gated_pair_judge_present": pair_judge_present(gated_state),
452
521
  "l2_forced_pair_judge_present": pair_judge_present(forced_state),
453
522
  "pair_fired": pair_judge_present(gated_state) or pair_judge_present(forced_state),
@@ -466,6 +535,57 @@ def build_rows(results_dir: Path, work_dir_root: Path, run_id: str) -> list[dict
466
535
  return rows
467
536
 
468
537
 
538
def strict_elapsed_seconds(result: dict | None) -> float | int | None:
    """Return ``result["elapsed_seconds"]`` when ``is_strict_number`` accepts it.

    A missing or empty ``result``, an absent key, or a value rejected by
    ``is_strict_number`` all produce ``None``.
    """
    if result:
        seconds = result.get("elapsed_seconds")
        if is_strict_number(seconds):
            return seconds
    return None
543
+
544
+
545
def timeout_flag(result: dict | None) -> bool:
    """Report the ``timed_out`` flag of a result record.

    A missing or empty ``result`` counts as "not timed out"; otherwise the
    ``timed_out`` entry is interpreted by ``bool_flag``.
    """
    return bool_flag(result.get("timed_out")) if result else False
549
+
550
+
551
def validate_manifest(manifest: object) -> tuple[dict | None, str | None]:
    """Check the shape of a decoded pair-eligible manifest.

    Args:
        manifest: The parsed manifest document. Any type is accepted so that
            a non-object document is reported instead of raising.

    Returns:
        ``(manifest, None)`` when every check passes; otherwise
        ``(None, message)`` where ``message`` begins with
        ``"manifest malformed: "`` and describes the first failed check.
    """
    if not isinstance(manifest, dict):
        return None, "manifest malformed: expected object"

    eligible = manifest.get("fixtures_pair_eligible")
    if not _is_string_list(eligible):
        return None, "manifest malformed: fixtures_pair_eligible must be a string array"
    if not eligible:
        return None, "manifest malformed: fixtures_pair_eligible must not be empty"

    threshold = manifest.get("gate3_threshold_count")
    total = manifest.get("gate3_total")
    if not _is_positive_int(threshold):
        return None, "manifest malformed: gate3_threshold_count must be a positive integer"
    if not _is_positive_int(total):
        return None, "manifest malformed: gate3_total must be a positive integer"
    if total != len(eligible):
        return None, "manifest malformed: gate3_total must equal fixtures_pair_eligible length"
    if threshold > total:
        return None, "manifest malformed: gate3_threshold_count must be <= gate3_total"

    rule_error = _selection_rule_error(manifest.get("selection_rule"))
    if rule_error is not None:
        return None, rule_error
    return manifest, None


def _is_string_list(value: object) -> bool:
    """Return True when ``value`` is a list whose items are all strings."""
    return isinstance(value, list) and all(isinstance(item, str) for item in value)


def _is_positive_int(value: object) -> bool:
    """Return True for an ``int`` greater than zero (``bool`` is excluded)."""
    # bool subclasses int, so it has to be ruled out explicitly.
    return isinstance(value, int) and not isinstance(value, bool) and value > 0


def _selection_rule_error(rule: object) -> str | None:
    """Return the first problem found in the optional ``selection_rule``, or None."""
    if rule is None:
        return None
    if not isinstance(rule, dict):
        return "manifest malformed: selection_rule must be an object"

    rejected = rule.get("rejected_excluded")
    reasons = rule.get("rejected_excluded_reasons")
    if rejected is not None and not _is_string_list(rejected):
        return "manifest malformed: selection_rule.rejected_excluded must be a string array"
    if reasons is None:
        return None

    well_formed = isinstance(reasons, dict) and all(
        isinstance(fx, str) and isinstance(reason, str) and reason
        for fx, reason in reasons.items()
    )
    if not well_formed:
        return "manifest malformed: selection_rule.rejected_excluded_reasons must map fixture ids to non-empty strings"
    # Keys are only cross-checked when both fields are supplied.
    if rejected is not None and set(reasons) != set(rejected):
        return "manifest malformed: selection_rule.rejected_excluded_reasons keys must match rejected_excluded"
    return None
587
+
588
+
469
589
  def render_markdown(gates: list[dict], rows: list[dict]) -> str:
470
590
  lines = ["# iter-0033c gate table\n"]
471
591
  lines.append("| fixture | solo | l2_gated | Δ | l2_forced | l2g pair? | l2f pair? | wall_ratio |")
@@ -511,7 +631,18 @@ def main() -> int:
511
631
  ap.add_argument("--out-md", required=True)
512
632
  args = ap.parse_args()
513
633
 
514
- manifest = json.loads(Path(args.manifest).read_text())
634
+ try:
635
+ raw_manifest = json.loads(
636
+ Path(args.manifest).read_text(),
637
+ parse_constant=reject_json_constant,
638
+ )
639
+ except (ValueError, json.JSONDecodeError) as exc:
640
+ print(f"error: manifest malformed: invalid JSON: {exc}", file=sys.stderr)
641
+ return 2
642
+ manifest, manifest_error = validate_manifest(raw_manifest)
643
+ if manifest is None:
644
+ print(f"error: {manifest_error}", file=sys.stderr)
645
+ return 2
515
646
  rows = build_rows(Path(args.results_dir), Path(args.work_dir_root), args.run_id)
516
647
 
517
648
  gates = [
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env python3
2
+ """Build iter-0033c L1 rerun summary from per-fixture judge/result artifacts."""
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ SCRIPT_DIR = Path(__file__).resolve().parent
12
+ if str(SCRIPT_DIR) not in sys.path:
13
+ sys.path.insert(0, str(SCRIPT_DIR))
14
+
15
+ from pair_evidence_contract import is_score, is_strict_number, loads_strict_json_object
16
+
17
+
18
+ SCORE_ARMS = ("solo_claude", "l2_gated", "l2_forced", "bare")
19
+
20
+
21
def load_json(path: Path) -> dict[str, Any]:
    """Load *path* through ``loads_strict_json_object``, or return ``{}``.

    The file is read as UTF-8 text and handed to the shared strict loader.
    A ``ValueError`` raised while decoding or parsing is swallowed and an
    empty dict is returned instead, so callers can treat an unreadable
    artifact as "no data".
    """
    try:
        return loads_strict_json_object(path.read_text(encoding="utf8"))
    except ValueError:
        # json.JSONDecodeError subclasses ValueError, so this single clause
        # covers both exception types.
        return {}
27
+
28
+
29
def score_for(judge: dict[str, Any], arm: str, mapping: dict[str, Any]) -> int | None:
    """Return the judge score for *arm*, or ``None`` if none can be found.

    *mapping* is scanned for the first slot named ``"A"``, ``"B"`` or ``"C"``
    whose value equals *arm*; with no such slot the result is ``None``.
    Otherwise ``judge["scores_by_arm"][arm]`` is preferred when ``is_score``
    accepts it, and the legacy ``"<slot>_score"`` key (slot lower-cased) is
    used as a fallback under the same ``is_score`` check.
    """
    slot_letter = None
    for slot, mapped in mapping.items():
        if slot in {"A", "B", "C"} and mapped == arm:
            slot_letter = slot
            break
    if slot_letter is None:
        return None

    by_arm = judge.get("scores_by_arm")
    # A non-dict "scores_by_arm" is treated as having no entry for the arm.
    candidate = by_arm.get(arm) if isinstance(by_arm, dict) else None
    if is_score(candidate):
        return candidate

    fallback = judge.get(f"{slot_letter.lower()}_score")
    if is_score(fallback):
        return fallback
    return None
43
+
44
+
45
def strict_number(value: object) -> object:
    """Return *value* unchanged if ``is_strict_number`` accepts it, else ``None``."""
    if is_strict_number(value):
        return value
    return None
47
+
48
+
49
def build_summary(results_dir: Path, run_id: str, git_sha: str) -> dict[str, Any]:
    """Collect per-fixture arm results under *results_dir* into one summary.

    Each immediate subdirectory of *results_dir* that contains a
    ``judge.json`` file contributes one row; subdirectories are visited in
    sorted order. For every arm in ``SCORE_ARMS`` the row records the judge
    score plus selected fields from ``<fixture>/<arm>/result.json`` (an
    absent file is treated as an empty result). An arm is left out of a row
    only when it has no score and does not appear among the values of the
    judge's ``_blind_mapping``.

    Args:
        results_dir: Directory holding one subdirectory per fixture.
        run_id: Identifier copied verbatim into the summary.
        git_sha: Commit hash copied verbatim into the summary.

    Returns:
        A dict with ``run_id``, ``git_sha``, ``fixtures_total`` (number of
        rows) and ``rows`` (a list of ``{"fixture": name, "arms": {...}}``).
    """
    rows = []
    for fx_dir in sorted(p for p in results_dir.iterdir() if p.is_dir()):
        judge_path = fx_dir / "judge.json"
        if not judge_path.is_file():
            continue
        judge = load_json(judge_path)
        raw_mapping = judge.get("_blind_mapping")
        mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
        # Snapshot the mapped arms once per fixture. A list is used instead
        # of a set on purpose: membership then relies on equality only, so a
        # malformed mapping holding unhashable JSON values (lists/objects)
        # cannot raise TypeError and abort the whole summary.
        mapped_arms = list(mapping.values())
        arms = {}
        for arm_name in SCORE_ARMS:
            score = score_for(judge, arm_name, mapping)
            if score is None and arm_name not in mapped_arms:
                continue
            result_path = fx_dir / arm_name / "result.json"
            result = load_json(result_path) if result_path.is_file() else {}
            arms[arm_name] = {
                "score": score,
                "wall_s": strict_number(result.get("elapsed_seconds")),
                "verify_score": strict_number(result.get("verify_score")),
                "files_changed": result.get("files_changed"),
                "timed_out": result.get("timed_out"),
                "disqualifier": result.get("disqualifier"),
            }
        rows.append({"fixture": fx_dir.name, "arms": arms})
    return {
        "run_id": run_id,
        "git_sha": git_sha,
        "fixtures_total": len(rows),
        "rows": rows,
    }
80
+
81
+
82
def main() -> int:
    """Parse CLI options, build the summary, and write it as JSON.

    Requires ``--results-dir``, ``--out``, ``--run-id`` and ``--git-sha``.
    The summary is serialized with two-space indentation plus a trailing
    newline and written to ``--out`` as UTF-8; a one-line confirmation is
    printed to stdout. Returns ``0``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--results-dir", required=True, type=Path)
    cli.add_argument("--out", required=True, type=Path)
    cli.add_argument("--run-id", required=True)
    cli.add_argument("--git-sha", required=True)
    args = cli.parse_args()

    summary = build_summary(args.results_dir, args.run_id, args.git_sha)
    payload = json.dumps(summary, indent=2) + "\n"
    args.out.write_text(payload, encoding="utf8")
    print(f"[l1-rerun-summary] wrote {args.out} (fixtures={summary['fixtures_total']})")
    return 0
94
+
95
+
96
+ if __name__ == "__main__":
97
+ raise SystemExit(main())