devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -14,7 +14,7 @@ ensure_ascii=False, allow_nan=False`, then sha256 the bytes.
14
14
  Inputs (all required):
15
15
  --c1-summary <path> iter-0033 (C1) summary.json (selection grounds; never a comparison baseline)
16
16
  --f9-judge <path> iter-0033a F9 judge.json (F9 inclusion proof)
17
- --l1-rerun-summary <path> L1 rerun summary at iter-0033c HEAD (fresh baseline)
17
+ --l1-rerun-summary <path> L1 rerun summary archived for provenance, not selection
18
18
  --output <path> destination .devlyn/manifests/iter-0033c-pair-eligible.json
19
19
 
20
20
  Selection rule (frozen pre-registration, iter-0033c §"Pair-eligible fixture set"):
@@ -25,18 +25,38 @@ Selection rule (frozen pre-registration, iter-0033c §"Pair-eligible fixture set
25
25
  pair_eligible = high_value ∪ promoted_by_l1_le_l0 ∪ {F9 if iter-0033a passed}
26
26
  − reporting_only
27
27
  − conditional_excluded that did not get promoted
28
+ − current rejected/ceiling registry
28
29
  """
29
30
  import argparse
30
31
  import copy
31
32
  import hashlib
32
33
  import json
34
+ import re
33
35
  import subprocess
34
36
  import sys
35
37
  from pathlib import Path
36
38
 
39
+ SCRIPT_DIR = Path(__file__).resolve().parent
40
+ if str(SCRIPT_DIR) not in sys.path:
41
+ sys.path.insert(0, str(SCRIPT_DIR))
42
+
43
+ from pair_evidence_contract import is_score, reject_json_constant
44
+
37
45
  HIGH_VALUE = ["F2", "F3", "F4", "F6", "F7"]
38
46
  CONDITIONAL = ["F1", "F5"]
39
47
  REPORTING_ONLY = ["F8"]
48
+ REJECTED_REGISTRY = Path(__file__).with_name("pair-rejected-fixtures.sh")
49
+
50
+
51
+ def exact_bool(value: object) -> bool | None:
52
+ return value if isinstance(value, bool) else None
53
+
54
+
55
+ def disqualifier_flag(value: object, *, default: bool = False) -> bool:
56
+ if value is None:
57
+ return default
58
+ parsed = exact_bool(value)
59
+ return parsed if parsed is not None else True
40
60
 
41
61
 
42
62
  def file_sha256(path: Path) -> str:
@@ -62,28 +82,111 @@ def fixture_short_id(full: str) -> str:
62
82
  return full.split("-", 1)[0] if "-" in full else full
63
83
 
64
84
 
85
+ def load_rejected_fixture_reasons(path: Path) -> dict[str, str]:
86
+ if not path.is_file():
87
+ raise ValueError(f"rejected fixture registry not found: {path}")
88
+ rejected: dict[str, str] = {}
89
+ current: str | None = None
90
+ for line in path.read_text().splitlines():
91
+ match = re.match(r"\s*([FS]\d+)-\*\|([FS]\d+)\)", line)
92
+ if match and match.group(1) == match.group(2):
93
+ current = match.group(1)
94
+ continue
95
+ reason = re.match(r'\s*echo "([^"]+)"', line)
96
+ if current and reason:
97
+ rejected[current] = reason.group(1)
98
+ current = None
99
+ return dict(sorted(rejected.items(), key=lambda item: (item[0][0], int(item[0][1:]))))
100
+
101
+
102
+ def load_rejected_short_ids(path: Path) -> list[str]:
103
+ return list(load_rejected_fixture_reasons(path))
104
+
105
+
106
+ def load_json_object(path: Path, label: str) -> dict:
107
+ try:
108
+ data = json.loads(path.read_text(), parse_constant=reject_json_constant)
109
+ except (ValueError, json.JSONDecodeError) as exc:
110
+ raise ValueError(f"{label} malformed: invalid JSON") from exc
111
+ if not isinstance(data, dict):
112
+ raise ValueError(f"{label} malformed: expected object")
113
+ return data
114
+
115
+
65
116
  def compute_promoted_l1_le_l0(c1_rows: list) -> list:
66
117
  """Return short fixture IDs (e.g. 'F3') where solo_claude.score ≤ bare.score in C1."""
67
118
  promoted = []
68
119
  for row in c1_rows:
69
- arms = row.get("arms", {})
70
- solo = arms.get("solo_claude", {}).get("score")
71
- bare = arms.get("bare", {}).get("score")
72
- if solo is None or bare is None:
120
+ if not isinstance(row, dict):
121
+ continue
122
+ raw_arms = row.get("arms")
123
+ arms = raw_arms if isinstance(raw_arms, dict) else {}
124
+ raw_solo = arms.get("solo_claude")
125
+ raw_bare = arms.get("bare")
126
+ solo_arm = raw_solo if isinstance(raw_solo, dict) else {}
127
+ bare_arm = raw_bare if isinstance(raw_bare, dict) else {}
128
+ if (
129
+ disqualifier_flag(solo_arm.get("disqualifier"))
130
+ or disqualifier_flag(bare_arm.get("disqualifier"))
131
+ ):
132
+ continue
133
+ solo = solo_arm.get("score")
134
+ bare = bare_arm.get("score")
135
+ if not is_score(solo) or not is_score(bare):
73
136
  continue
74
137
  if solo <= bare:
75
- promoted.append(fixture_short_id(row["fixture"]))
138
+ fixture = row.get("fixture")
139
+ if isinstance(fixture, str):
140
+ promoted.append(fixture_short_id(fixture))
76
141
  return promoted
77
142
 
78
143
 
144
+ def mapped_score(judge: dict, arm: str) -> int | None:
145
+ mapping = judge.get("_blind_mapping")
146
+ if not isinstance(mapping, dict):
147
+ return None
148
+ letter = next((slot for slot, mapped in mapping.items() if mapped == arm), None)
149
+ if letter is None:
150
+ return None
151
+ raw_scores = judge.get("scores_by_arm")
152
+ scores = raw_scores if isinstance(raw_scores, dict) else {}
153
+ score = scores.get(arm)
154
+ if is_score(score):
155
+ return score
156
+ legacy = judge.get(f"{letter.lower()}_score")
157
+ return legacy if is_score(legacy) else None
158
+
159
+
160
+ def mapped_disqualifier(judge: dict, arm: str) -> bool:
161
+ mapping = judge.get("_blind_mapping")
162
+ if not isinstance(mapping, dict):
163
+ return True
164
+ letter = next((slot for slot, mapped in mapping.items() if mapped == arm), None)
165
+ if letter is None:
166
+ return True
167
+ raw_by_arm = judge.get("disqualifiers_by_arm")
168
+ if raw_by_arm is not None and not isinstance(raw_by_arm, dict):
169
+ return True
170
+ by_arm = raw_by_arm if isinstance(raw_by_arm, dict) else {}
171
+ if arm in by_arm:
172
+ entry = by_arm.get(arm)
173
+ return disqualifier_flag(
174
+ entry.get("disqualifier") if isinstance(entry, dict) else entry
175
+ )
176
+ raw_legacy = judge.get("disqualifiers")
177
+ if raw_legacy is not None and not isinstance(raw_legacy, dict):
178
+ return True
179
+ legacy = raw_legacy if isinstance(raw_legacy, dict) else {}
180
+ return disqualifier_flag(legacy.get(letter))
181
+
182
+
79
183
  def f9_passed(f9_judge: dict) -> bool:
80
- """iter-0033a passed iff A score > B score AND A is not disqualified."""
81
- a = f9_judge.get("a_score")
82
- b = f9_judge.get("b_score")
83
- dqs = f9_judge.get("disqualifiers") or {}
84
- if a is None or b is None:
184
+ """iter-0033a passed iff solo_claude beats bare and solo is not disqualified."""
185
+ solo = mapped_score(f9_judge, "solo_claude")
186
+ bare = mapped_score(f9_judge, "bare")
187
+ if solo is None or bare is None:
85
188
  return False
86
- return a > b and not bool(dqs.get("A", False))
189
+ return solo > bare and not mapped_disqualifier(f9_judge, "solo_claude")
87
190
 
88
191
 
89
192
  def head_sha() -> str:
@@ -114,10 +217,21 @@ def main() -> int:
114
217
  print(f"error: {label} not found: {p}", file=sys.stderr)
115
218
  return 2
116
219
 
117
- c1 = json.loads(c1_path.read_text())
118
- f9 = json.loads(f9_path.read_text())
220
+ try:
221
+ c1 = load_json_object(c1_path, "c1-summary")
222
+ f9 = load_json_object(f9_path, "f9-judge")
223
+ rejected_reasons = load_rejected_fixture_reasons(REJECTED_REGISTRY)
224
+ rejected_short_ids = list(rejected_reasons)
225
+ except ValueError as exc:
226
+ print(f"error: {exc}", file=sys.stderr)
227
+ return 2
228
+
229
+ c1_rows = c1.get("rows")
230
+ if not isinstance(c1_rows, list):
231
+ print("error: c1-summary malformed: rows must be an array", file=sys.stderr)
232
+ return 2
119
233
 
120
- promoted = compute_promoted_l1_le_l0(c1.get("rows", []))
234
+ promoted = compute_promoted_l1_le_l0(c1_rows)
121
235
  f9_in = f9_passed(f9)
122
236
 
123
237
  pair_eligible = list(HIGH_VALUE) # frozen high-value list, ordered
@@ -127,10 +241,23 @@ def main() -> int:
127
241
  if f9_in and "F9" not in pair_eligible:
128
242
  pair_eligible.append("F9")
129
243
  pair_eligible = [fx for fx in pair_eligible if fx not in REPORTING_ONLY]
244
+ rejected_excluded = sorted(
245
+ {fx for fx in pair_eligible if fx in rejected_short_ids},
246
+ key=lambda s: (s[0], int(s[1:])),
247
+ )
248
+ pair_eligible = [fx for fx in pair_eligible if fx not in rejected_short_ids]
130
249
 
131
250
  conditional_promoted = [fx for fx in CONDITIONAL if fx in promoted]
132
251
  conditional_excluded = [fx for fx in CONDITIONAL if fx not in promoted]
133
252
  pair_eligible_sorted = sorted(pair_eligible, key=lambda s: (s[0], int(s[1:])))
253
+ if not pair_eligible_sorted:
254
+ rejected_text = ", ".join(rejected_excluded) if rejected_excluded else "none"
255
+ print(
256
+ "error: no pair-eligible fixtures remain after rejected-registry filtering "
257
+ f"(rejected_excluded={rejected_text})",
258
+ file=sys.stderr,
259
+ )
260
+ return 1
134
261
 
135
262
  gate3_total = len(pair_eligible_sorted)
136
263
  gate3_threshold = (gate3_total + 1) // 2 # ≥50% — ceil(gate3_total / 2)
@@ -152,6 +279,11 @@ def main() -> int:
152
279
  "reporting_only": REPORTING_ONLY,
153
280
  "conditional_excluded": conditional_excluded,
154
281
  "conditional_promoted": conditional_promoted,
282
+ "rejected_registry": str(REJECTED_REGISTRY),
283
+ "rejected_excluded": rejected_excluded,
284
+ "rejected_excluded_reasons": {
285
+ fixture: rejected_reasons[fixture] for fixture in rejected_excluded
286
+ },
155
287
  },
156
288
  "fixtures_pair_eligible": pair_eligible_sorted,
157
289
  "gate3_threshold_count": gate3_threshold,
@@ -1,9 +1,9 @@
1
1
  #!/usr/bin/env python3
2
- """F9 variant/solo arm artifact + transcript fingerprint check.
2
+ """F9 skill-driven arm artifact + transcript fingerprint check.
3
3
 
4
4
  Out-of-band per Codex R0.5 §B (iter-0033a): expected.json.verification_commands
5
5
  apply to ALL arms (run-fixture.sh:472), so a `docs/specs/**` check there would
6
- punish bare. This script runs AFTER run-fixture.sh and asserts variant/solo
6
+ punish bare. This script runs AFTER run-fixture.sh and asserts skill-driven
7
7
  arms produced the artifacts the 2-skill ideate→resolve chain should emit.
8
8
 
9
9
  Bare arm is exempt by construction.
@@ -13,7 +13,7 @@ Usage:
13
13
 
14
14
  Exits:
15
15
  0 — all checks pass (or bare arm — exempt).
16
- 1 — variant/solo arm but artifact contract violated.
16
+ 1 — skill-driven arm but artifact contract violated.
17
17
  2 — invalid invocation (missing args, missing dir).
18
18
 
19
19
  Emits a small JSON report at <result-dir>/check-f9-artifacts.json.
@@ -25,8 +25,10 @@ import re
25
25
  import sys
26
26
  from pathlib import Path
27
27
 
28
+ from pair_evidence_contract import loads_strict_json_object
28
29
 
29
- VARIANT_ARMS = {"variant", "solo_claude", "l2_gated", "l2_forced"}
30
+
31
+ SKILL_DRIVEN_ARMS = {"variant", "solo_claude", "l2_gated", "l2_risk_probes", "l2_forced"}
30
32
  EXEMPT_ARMS = {"bare"}
31
33
 
32
34
  SPEC_DIR_GLOB = "docs/specs/*/spec.md"
@@ -39,6 +41,18 @@ RE_AUTO_RESOLVE = re.compile(r"/devlyn:auto-resolve\b")
39
41
  RE_PREFLIGHT = re.compile(r"/devlyn:preflight\b")
40
42
 
41
43
 
44
+ def _load_json_object(path: Path) -> tuple[dict | None, str | None]:
45
+ try:
46
+ data = loads_strict_json_object(path.read_text())
47
+ except json.JSONDecodeError as exc:
48
+ return None, f"{exc.__class__.__name__}: {exc}"
49
+ except ValueError as exc:
50
+ if str(exc) == "top-level JSON value must be an object":
51
+ return None, "expected JSON object"
52
+ return None, f"{exc.__class__.__name__}: {exc}"
53
+ return data, None
54
+
55
+
42
56
  def main() -> int:
43
57
  p = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
44
58
  p.add_argument("--result-dir", required=True,
@@ -71,8 +85,8 @@ def main() -> int:
71
85
  _write_report(result_dir, report)
72
86
  return 0
73
87
 
74
- if arm not in VARIANT_ARMS:
75
- print(f"error: unknown arm '{arm}' (expected one of {VARIANT_ARMS | EXEMPT_ARMS})",
88
+ if arm not in SKILL_DRIVEN_ARMS:
89
+ print(f"error: unknown arm '{arm}' (expected one of {SKILL_DRIVEN_ARMS | EXEMPT_ARMS})",
76
90
  file=sys.stderr)
77
91
  return 2
78
92
 
@@ -81,13 +95,13 @@ def main() -> int:
81
95
  timing_path = result_dir / "timing.json"
82
96
  work_dir: Path
83
97
  if timing_path.is_file():
84
- try:
85
- timing = json.loads(timing_path.read_text())
98
+ timing, _timing_error = _load_json_object(timing_path)
99
+ if timing is not None:
86
100
  work_dir = Path(timing.get("work_dir", ""))
87
- except Exception:
88
- work_dir = Path("")
101
+ else:
102
+ work_dir = Path("__invalid_timing_work_dir__")
89
103
  else:
90
- work_dir = Path("")
104
+ work_dir = Path("__missing_timing_work_dir__")
91
105
 
92
106
  if not work_dir.is_dir():
93
107
  report["checks"].append({
@@ -163,16 +177,14 @@ def main() -> int:
163
177
  else:
164
178
  # Read the most recent run.
165
179
  state_path = sorted(state_paths)[-1]
166
- try:
167
- state = json.loads(state_path.read_text())
168
- except Exception as exc:
180
+ state, state_error = _load_json_object(state_path)
181
+ if state is None:
169
182
  report["checks"].append({
170
183
  "name": "pipeline.state.json-parses",
171
184
  "pass": False,
172
- "reason": f"{exc.__class__.__name__}: {exc}",
185
+ "reason": state_error,
173
186
  })
174
187
  report["pass"] = False
175
- state = None
176
188
 
177
189
  if state is not None:
178
190
  archived = "/runs/" in str(state_path)
@@ -8,6 +8,8 @@ import json
8
8
  from pathlib import Path
9
9
  from typing import Any
10
10
 
11
+ from pair_evidence_contract import reject_json_constant
12
+
11
13
 
12
14
  def read_jsonl(path: Path) -> list[dict[str, Any]]:
13
15
  rows: list[dict[str, Any]] = []
@@ -15,7 +17,7 @@ def read_jsonl(path: Path) -> list[dict[str, Any]]:
15
17
  for line_no, line in enumerate(f, start=1):
16
18
  if not line.strip():
17
19
  continue
18
- value = json.loads(line)
20
+ value = json.loads(line, parse_constant=reject_json_constant)
19
21
  if not isinstance(value, dict):
20
22
  raise ValueError(f"{path}:{line_no}: expected JSON object")
21
23
  rows.append(value)
@@ -36,11 +38,17 @@ def instance_ids_from_jsonl(path: Path | None) -> set[str] | None:
36
38
 
37
39
  def collect_from_root(root: Path, patch_name: str, keep: set[str] | None) -> list[tuple[str, Path]]:
38
40
  patches: list[tuple[str, Path]] = []
41
+ seen: set[str] = set()
39
42
  for patch_path in sorted(root.glob(f"*/{patch_name}")):
40
43
  instance_id = patch_path.parent.name
41
44
  if keep is not None and instance_id not in keep:
42
45
  continue
46
+ seen.add(instance_id)
43
47
  patches.append((instance_id, patch_path))
48
+ if keep is not None:
49
+ missing = sorted(keep - seen)
50
+ if missing:
51
+ raise ValueError(f"missing {patch_name} for instance ids: {', '.join(missing)}")
44
52
  return patches
45
53
 
46
54
 
@@ -81,6 +89,8 @@ def main() -> int:
81
89
  + "\n"
82
90
  )
83
91
  written += 1
92
+ if written == 0:
93
+ raise ValueError("no non-empty patches collected")
84
94
 
85
95
  report = {
86
96
  "patch_root": str(args.patch_root),