devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -0,0 +1,491 @@
1
#!/usr/bin/env bash
# Regression tests for build-pair-eligible-manifest.py score-source handling.
#
# Runs the sibling build-pair-eligible-manifest.py with python3 against
# fixture files written into a private scratch directory, and aborts on the
# first failed expectation.

set -euo pipefail

# Directory containing this script; the builder under test lives next to it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
BUILD="$SCRIPT_DIR/build-pair-eligible-manifest.py"

# Scratch directory for fixtures, manifests and captured command output.
# Honor TMPDIR (falling back to /tmp) so the suite also runs where /tmp is not
# writable; a trailing slash is stripped so the template has no "//".
TMP_ROOT="${TMPDIR:-/tmp}"
TMP_DIR="$(mktemp -d "${TMP_ROOT%/}/build-pair-eligible-manifest-test.XXXXXX")"

# Remove the scratch directory on every exit path, pass or fail.
trap 'rm -rf -- "$TMP_DIR"' EXIT
10
+
11
#######################################
# Run a command that is expected to fail and check its diagnostics.
# Globals:
#   TMP_DIR (read) - scratch directory; the command's combined stdout and
#                    stderr are captured in "$TMP_DIR/<label>.out".
# Arguments:
#   $1  - label naming the case (used for the capture file and in messages)
#   $2  - literal text that must appear in the captured output
#   $3+ - the command to run, with its arguments
# Outputs:
#   On a failed expectation, writes an explanation followed by the captured
#   output to stderr.
# Returns:
#   0 when the command exits non-zero and its output contains $2; otherwise
#   exits the calling shell with status 1.
#######################################
expect_fail_contains() {
  local label="$1"
  local needle="$2"
  shift 2
  local out="$TMP_DIR/$label.out"

  # The command must fail; a zero exit status is itself a test failure.
  if "$@" > "$out" 2>&1; then
    printf 'expected failure for %s\n' "$label" >&2
    cat "$out" >&2
    exit 1
  fi

  # -F matches the needle literally; "--" keeps a needle that begins with "-"
  # (for example an option name quoted in an error message) from being parsed
  # as a grep option.
  if ! grep -Fq -- "$needle" "$out"; then
    printf 'missing expected text for %s: %s\n' "$label" "$needle" >&2
    cat "$out" >&2
    exit 1
  fi
}
27
+
28
# Shared C1 summary fixture with four rows: F1-cli-example and
# F16-cli-current-proof are clean (solo_claude 40 / bare 50), F5-cli-dirty has
# the solo_claude disqualifier set to true, and F17-cli-overrange carries
# scores above 100 (101 / 102).
+ cat > "$TMP_DIR/c1.json" <<'JSON'
29
+ {
30
+ "rows": [
31
+ {
32
+ "fixture": "F1-cli-example",
33
+ "arms": {
34
+ "solo_claude": {"score": 40, "disqualifier": false},
35
+ "bare": {"score": 50, "disqualifier": false}
36
+ }
37
+ },
38
+ {
39
+ "fixture": "F5-cli-dirty",
40
+ "arms": {
41
+ "solo_claude": {"score": 30, "disqualifier": true},
42
+ "bare": {"score": 50, "disqualifier": false}
43
+ }
44
+ },
45
+ {
46
+ "fixture": "F16-cli-current-proof",
47
+ "arms": {
48
+ "solo_claude": {"score": 40, "disqualifier": false},
49
+ "bare": {"score": 50, "disqualifier": false}
50
+ }
51
+ },
52
+ {
53
+ "fixture": "F17-cli-overrange",
54
+ "arms": {
55
+ "solo_claude": {"score": 101, "disqualifier": false},
56
+ "bare": {"score": 102, "disqualifier": false}
57
+ }
58
+ }
59
+ ]
60
+ }
61
+ JSON
62
+
63
# C1 summary fixture whose only row is F1-cli-example (clean, solo_claude 40 /
# bare 50). It is the --c1-summary input of the "all-rejected-empty" case,
# which expects the build to fail.
+ cat > "$TMP_DIR/c1-all-rejected.json" <<'JSON'
64
+ {
65
+ "rows": [
66
+ {
67
+ "fixture": "F1-cli-example",
68
+ "arms": {
69
+ "solo_claude": {"score": 40, "disqualifier": false},
70
+ "bare": {"score": 50, "disqualifier": false}
71
+ }
72
+ }
73
+ ]
74
+ }
75
+ JSON
76
+
77
# L1 rerun summary fixture, passed as --l1-rerun-summary throughout. Its single
# F5-cli-dirty row has the solo_claude disqualifier set to false, unlike the
# F5 row in c1.json where it is true.
+ cat > "$TMP_DIR/l1.json" <<'JSON'
78
+ {
79
+ "rows": [
80
+ {
81
+ "fixture": "F5-cli-dirty",
82
+ "arms": {
83
+ "solo_claude": {"score": 30, "disqualifier": false},
84
+ "bare": {"score": 50, "disqualifier": false}
85
+ }
86
+ }
87
+ ]
88
+ }
89
+ JSON
90
+
91
#######################################
# Write a synthetic F9 judge result file.
#
# The file always carries a_score 91, b_score 76, scores_by_arm
# {solo_claude: 91, bare: 76}, blind slot B mapped to "bare" with seed 1, and
# a non-disqualified "bare" arm. The two varying inputs control which arm
# blind slot A maps to and whether solo_claude / slot A is disqualified.
# Arguments:
#   $1 - path of the JSON file to create (overwritten if present)
#   $2 - arm name recorded for blind slot A in _blind_mapping
#   $3 - JSON boolean literal, "true" or "false", used for both
#        disqualifiers.A and disqualifiers_by_arm.solo_claude.disqualifier
# Outputs:
#   Writes the JSON document to $1; on invalid arguments writes a message to
#   stderr and creates no file.
# Returns:
#   0 on success, 2 when $2 or $3 cannot be spliced into valid JSON.
#######################################
write_f9() {
  local path="$1"
  local mapping_a="$2"
  local solo_dq="$3"

  # Both values are spliced into the document verbatim, so refuse anything
  # that would make the fixture invalid JSON by accident; fixtures that are
  # malformed on purpose are written explicitly, not through this helper.
  case "$solo_dq" in
    true | false) ;;
    *)
      printf 'write_f9: solo_dq must be true or false, got: %s\n' "$solo_dq" >&2
      return 2
      ;;
  esac
  case "$mapping_a" in
    *'"'* | *\\*)
      printf 'write_f9: mapping_a must not contain quotes or backslashes: %s\n' \
        "$mapping_a" >&2
      return 2
      ;;
  esac

  # Unquoted delimiter: $mapping_a and $solo_dq are expanded inside the body.
  cat > "$path" <<JSON
{
  "a_score": 91,
  "b_score": 76,
  "_blind_mapping": {"A": "$mapping_a", "B": "bare", "seed": 1},
  "scores_by_arm": {"solo_claude": 91, "bare": 76},
  "disqualifiers": {"A": $solo_dq, "B": false},
  "disqualifiers_by_arm": {
    "solo_claude": {"disqualifier": $solo_dq},
    "bare": {"disqualifier": false}
  }
}
JSON
}
109
+
110
# Create the baseline F9 judge file (blind slot A mapped to solo_claude,
# disqualifier false) that later cases pass as --f9-judge, then check that a
# C1 summary whose top level is a JSON array makes the build fail with
# "c1-summary malformed: expected object" in its output.
+ write_f9 "$TMP_DIR/f9-pass.json" "solo_claude" false
111
+ printf '["not", "a", "dict"]\n' > "$TMP_DIR/c1-malformed-top.json"
112
+ expect_fail_contains c1-malformed-top "c1-summary malformed: expected object" \
113
+ python3 "$BUILD" \
114
+ --c1-summary "$TMP_DIR/c1-malformed-top.json" \
115
+ --f9-judge "$TMP_DIR/f9-pass.json" \
116
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
117
+ --output "$TMP_DIR/c1-malformed-top-manifest.json"
118
+
119
# Case: the C1 summary contains a bare NaN token as a score. The build must
# fail and report "c1-summary malformed: invalid JSON".
+ cat > "$TMP_DIR/c1-nan-score.json" <<'JSON'
120
+ {"rows":[{"fixture":"F1-synthetic","arms":{"solo_claude":{"score":NaN},"bare":{"score":50}}}]}
121
+ JSON
122
+ expect_fail_contains c1-nan-score "c1-summary malformed: invalid JSON" \
123
+ python3 "$BUILD" \
124
+ --c1-summary "$TMP_DIR/c1-nan-score.json" \
125
+ --f9-judge "$TMP_DIR/f9-pass.json" \
126
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
127
+ --output "$TMP_DIR/c1-nan-score-manifest.json"
128
+
129
# Case: "rows" in the C1 summary is an object instead of an array. The build
# must fail and report "c1-summary malformed: rows must be an array".
+ cat > "$TMP_DIR/c1-malformed-rows.json" <<'JSON'
130
+ {"rows": {"not": "a-list"}}
131
+ JSON
132
+ expect_fail_contains c1-malformed-rows "c1-summary malformed: rows must be an array" \
133
+ python3 "$BUILD" \
134
+ --c1-summary "$TMP_DIR/c1-malformed-rows.json" \
135
+ --f9-judge "$TMP_DIR/f9-pass.json" \
136
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
137
+ --output "$TMP_DIR/c1-malformed-rows-manifest.json"
138
+
139
# Case: the F9 judge file's top level is a JSON array. With the regular C1
# summary, the build must fail and report "f9-judge malformed: expected object".
+ printf '["not", "a", "dict"]\n' > "$TMP_DIR/f9-malformed-top.json"
140
+ expect_fail_contains f9-malformed-top "f9-judge malformed: expected object" \
141
+ python3 "$BUILD" \
142
+ --c1-summary "$TMP_DIR/c1.json" \
143
+ --f9-judge "$TMP_DIR/f9-malformed-top.json" \
144
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
145
+ --output "$TMP_DIR/f9-malformed-top-manifest.json"
146
+
147
# Case: C1 rows with malformed fields next to one well-formed row. The fixture
# holds a bare string row, an F1 row whose solo_claude score is the boolean
# true, a row whose fixture id is the number 123, a clean F16 row, and an F18
# row whose solo_claude disqualifier is the string "false".
# The build is run directly (stdout discarded), so it must exit zero; the
# Python check then requires F1, F17 and F18 to be absent from
# selection_rule.promoted_by_l1_le_l0 and F16 to be in fixtures_pair_eligible.
# NOTE(review): this fixture has no F17 row (the over-range row lives in
# c1.json), so the F17 assertion presumably cannot fail here - confirm where
# the over-range C1 case was meant to be asserted.
+ cat > "$TMP_DIR/c1-malformed-row-fields.json" <<'JSON'
148
+ {
149
+ "rows": [
150
+ "not-a-row",
151
+ {
152
+ "fixture": "F1-cli-example",
153
+ "arms": {
154
+ "solo_claude": {"score": true, "disqualifier": false},
155
+ "bare": {"score": 50, "disqualifier": false}
156
+ }
157
+ },
158
+ {
159
+ "fixture": 123,
160
+ "arms": {
161
+ "solo_claude": {"score": 40, "disqualifier": false},
162
+ "bare": {"score": 50, "disqualifier": false}
163
+ }
164
+ },
165
+ {
166
+ "fixture": "F16-cli-current-proof",
167
+ "arms": {
168
+ "solo_claude": {"score": 40, "disqualifier": false},
169
+ "bare": {"score": 50, "disqualifier": false}
170
+ }
171
+ },
172
+ {
173
+ "fixture": "F18-cli-string-disqualifier",
174
+ "arms": {
175
+ "solo_claude": {"score": 40, "disqualifier": "false"},
176
+ "bare": {"score": 50, "disqualifier": false}
177
+ }
178
+ }
179
+ ]
180
+ }
181
+ JSON
182
+ python3 "$BUILD" \
183
+ --c1-summary "$TMP_DIR/c1-malformed-row-fields.json" \
184
+ --f9-judge "$TMP_DIR/f9-pass.json" \
185
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
186
+ --output "$TMP_DIR/c1-malformed-row-fields-manifest.json" >/dev/null
187
+ python3 - "$TMP_DIR/c1-malformed-row-fields-manifest.json" <<'PY'
188
+ import json
189
+ import sys
190
+
191
+ manifest = json.load(open(sys.argv[1], encoding="utf8"))
192
+ promoted = manifest["selection_rule"]["promoted_by_l1_le_l0"]
193
+ if "F1" in promoted:
194
+ raise SystemExit("malformed C1 row fields must not promote F1")
195
+ if "F16" not in manifest["fixtures_pair_eligible"]:
196
+ raise SystemExit("expected non-rejected F16 row to keep manifest non-empty")
197
+ if "F17" in promoted:
198
+ raise SystemExit("overrange C1 row fields must not promote F17")
199
+ if "F18" in promoted:
200
+ raise SystemExit("string C1 disqualifier must not promote F18")
201
+ PY
202
+
203
# Case: the C1 summary holds only the F1 row. The build must fail and report
# "no pair-eligible fixtures remain after rejected-registry filtering".
+ expect_fail_contains all-rejected-empty \
204
+ "no pair-eligible fixtures remain after rejected-registry filtering" \
205
+ python3 "$BUILD" \
206
+ --c1-summary "$TMP_DIR/c1-all-rejected.json" \
207
+ --f9-judge "$TMP_DIR/f9-pass.json" \
208
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
209
+ --output "$TMP_DIR/all-rejected-manifest.json"
210
+
211
# Case: baseline inputs (c1.json, f9-pass.json, l1.json). The build must exit
# zero and write pass-manifest.json, which two Python checks then inspect.
# First check: F9, F1 and F5 are absent from fixtures_pair_eligible and F16 is
# present; selection_rule.f9_included is True; F1, F2, F3, F4, F6, F7 and F9
# all appear in selection_rule.rejected_excluded; rejected_excluded_reasons
# has the exact F2 reason string and an F9 reason containing the run id
# "20260512-f9-e2e-headroom".
# Second check: F5 is absent from fixtures_pair_eligible, i.e. the
# non-disqualified F5 row in l1.json does not override the disqualified F5 row
# in c1.json.
# NOTE(review): the F2 reason and F9 run id are literal strings, presumably
# mirroring the rejected-fixture registry - confirm they are kept in sync.
+ python3 "$BUILD" \
212
+ --c1-summary "$TMP_DIR/c1.json" \
213
+ --f9-judge "$TMP_DIR/f9-pass.json" \
214
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
215
+ --output "$TMP_DIR/pass-manifest.json" >/dev/null
216
+ python3 - "$TMP_DIR/pass-manifest.json" <<'PY'
217
+ import json
218
+ import sys
219
+
220
+ manifest = json.load(open(sys.argv[1], encoding="utf8"))
221
+ eligible = manifest["fixtures_pair_eligible"]
222
+ if "F9" in eligible:
223
+ raise SystemExit("F9 is currently rejected by the shared pair registry")
224
+ if "F1" in eligible:
225
+ raise SystemExit("F1 is currently rejected by the shared pair registry")
226
+ if "F5" in eligible:
227
+ raise SystemExit("dirty F5 L1<=L0 row must not be promoted")
228
+ if "F16" not in eligible:
229
+ raise SystemExit("expected non-rejected F16 L1<=L0 promotion")
230
+ rule = manifest["selection_rule"]
231
+ if rule["f9_included"] is not True:
232
+ raise SystemExit("expected selection rule to record F9 pre-reg inclusion before rejected-registry filtering")
233
+ for fixture in ["F1", "F2", "F3", "F4", "F6", "F7", "F9"]:
234
+ if fixture not in rule["rejected_excluded"]:
235
+ raise SystemExit(f"expected {fixture} to be excluded by rejected registry")
236
+ reasons = rule["rejected_excluded_reasons"]
237
+ if reasons["F2"] != "bare 83 / solo_claude 95 in 20260512-f2-medium-headroom":
238
+ raise SystemExit("expected rejected_excluded_reasons to preserve the F2 registry reason")
239
+ if "20260512-f9-e2e-headroom" not in reasons["F9"]:
240
+ raise SystemExit("expected rejected_excluded_reasons to preserve the F9 registry run id")
241
+ PY
242
+ python3 - "$TMP_DIR/pass-manifest.json" <<'PY'
243
+ import json
244
+ import sys
245
+
246
+ manifest = json.load(open(sys.argv[1], encoding="utf8"))
247
+ if "F5" in manifest["fixtures_pair_eligible"]:
248
+ raise SystemExit("l1-rerun-summary must not override pre-registered C1 selection grounds")
249
+ PY
250
+
251
+ write_f9 "$TMP_DIR/f9-wrong-mapping.json" "variant" false
252
+ python3 "$BUILD" \
253
+ --c1-summary "$TMP_DIR/c1.json" \
254
+ --f9-judge "$TMP_DIR/f9-wrong-mapping.json" \
255
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
256
+ --output "$TMP_DIR/wrong-mapping-manifest.json" >/dev/null
257
+ python3 - "$TMP_DIR/wrong-mapping-manifest.json" <<'PY'
258
+ import json
259
+ import sys
260
+
261
+ manifest = json.load(open(sys.argv[1], encoding="utf8"))
262
+ if "F9" in manifest["fixtures_pair_eligible"]:
263
+ raise SystemExit("F9 must not be included when solo_claude is absent from _blind_mapping")
264
+ if manifest["selection_rule"]["f9_included"] is not False:
265
+ raise SystemExit("expected f9_included false for wrong mapping")
266
+ PY
267
+
268
# Case: the F9 judge file marks solo_claude (blind slot A) as disqualified.
# The build must exit zero and the manifest must not list F9 in
# fixtures_pair_eligible.
+ write_f9 "$TMP_DIR/f9-dq.json" "solo_claude" true
269
+ python3 "$BUILD" \
270
+ --c1-summary "$TMP_DIR/c1.json" \
271
+ --f9-judge "$TMP_DIR/f9-dq.json" \
272
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
273
+ --output "$TMP_DIR/dq-manifest.json" >/dev/null
274
+ python3 - "$TMP_DIR/dq-manifest.json" <<'PY'
275
+ import json
276
+ import sys
277
+
278
+ manifest = json.load(open(sys.argv[1], encoding="utf8"))
279
+ if "F9" in manifest["fixtures_pair_eligible"]:
280
+ raise SystemExit("F9 must not be included when solo_claude is disqualified")
281
+ PY
282
+
283
# Case: _blind_mapping in the F9 judge file is a string instead of an object;
# every other field matches the baseline judge file. The build must exit zero;
# the manifest must not list F9 in fixtures_pair_eligible and must record
# selection_rule.f9_included as False.
+ cat > "$TMP_DIR/f9-malformed-mapping.json" <<'JSON'
284
+ {
285
+ "a_score": 91,
286
+ "b_score": 76,
287
+ "_blind_mapping": "not-a-dict",
288
+ "scores_by_arm": {"solo_claude": 91, "bare": 76},
289
+ "disqualifiers": {"A": false, "B": false},
290
+ "disqualifiers_by_arm": {
291
+ "solo_claude": {"disqualifier": false},
292
+ "bare": {"disqualifier": false}
293
+ }
294
+ }
295
+ JSON
296
+ python3 "$BUILD" \
297
+ --c1-summary "$TMP_DIR/c1.json" \
298
+ --f9-judge "$TMP_DIR/f9-malformed-mapping.json" \
299
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
300
+ --output "$TMP_DIR/malformed-mapping-manifest.json" >/dev/null
301
+ python3 - "$TMP_DIR/malformed-mapping-manifest.json" <<'PY'
302
+ import json
303
+ import sys
304
+
305
+ manifest = json.load(open(sys.argv[1], encoding="utf8"))
306
+ if "F9" in manifest["fixtures_pair_eligible"]:
307
+ raise SystemExit("F9 must not be included when _blind_mapping is malformed")
308
+ if manifest["selection_rule"]["f9_included"] is not False:
309
+ raise SystemExit("expected f9_included false for malformed mapping")
310
+ PY
311
+
312
# Case: scores_by_arm in the F9 judge file is an array, and the file has no
# a_score / b_score keys to fall back on. The build must exit zero; the
# manifest must not list F9 in fixtures_pair_eligible and must record
# selection_rule.f9_included as False.
+ cat > "$TMP_DIR/f9-malformed-scores.json" <<'JSON'
313
+ {
314
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "seed": 1},
315
+ "scores_by_arm": ["not", "a", "dict"],
316
+ "disqualifiers": {"A": false, "B": false},
317
+ "disqualifiers_by_arm": {
318
+ "solo_claude": {"disqualifier": false},
319
+ "bare": {"disqualifier": false}
320
+ }
321
+ }
322
+ JSON
323
+ python3 "$BUILD" \
324
+ --c1-summary "$TMP_DIR/c1.json" \
325
+ --f9-judge "$TMP_DIR/f9-malformed-scores.json" \
326
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
327
+ --output "$TMP_DIR/malformed-scores-manifest.json" >/dev/null
328
+ python3 - "$TMP_DIR/malformed-scores-manifest.json" <<'PY'
329
+ import json
330
+ import sys
331
+
332
+ manifest = json.load(open(sys.argv[1], encoding="utf8"))
333
+ if "F9" in manifest["fixtures_pair_eligible"]:
334
+ raise SystemExit("F9 must not be included when scores_by_arm is malformed and no legacy scores exist")
335
+ if manifest["selection_rule"]["f9_included"] is not False:
336
+ raise SystemExit("expected f9_included false for malformed scores")
337
+ PY
338
+
339
# Case: the solo_claude score in the F9 judge file is 101, in both
# scores_by_arm and a_score. The build must exit zero and the manifest must
# record selection_rule.f9_included as False.
+ cat > "$TMP_DIR/f9-overrange-scores.json" <<'JSON'
340
+ {
341
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "seed": 1},
342
+ "scores_by_arm": {"solo_claude": 101, "bare": 76},
343
+ "a_score": 101,
344
+ "b_score": 76,
345
+ "disqualifiers": {"A": false, "B": false},
346
+ "disqualifiers_by_arm": {
347
+ "solo_claude": {"disqualifier": false},
348
+ "bare": {"disqualifier": false}
349
+ }
350
+ }
351
+ JSON
352
+ python3 "$BUILD" \
353
+ --c1-summary "$TMP_DIR/c1.json" \
354
+ --f9-judge "$TMP_DIR/f9-overrange-scores.json" \
355
+ --l1-rerun-summary "$TMP_DIR/l1.json" \
356
+ --output "$TMP_DIR/overrange-scores-manifest.json" >/dev/null
357
+ python3 - "$TMP_DIR/overrange-scores-manifest.json" <<'PY'
358
+ import json
359
+ import sys
360
+
361
+ manifest = json.load(open(sys.argv[1], encoding="utf8"))
362
+ if manifest["selection_rule"]["f9_included"] is not False:
363
+ raise SystemExit("F9 must not be included when mapped scores are out of range")
364
+ PY
365
+
366
# Case: the solo_claude score is the JSON boolean `true` (in `scores_by_arm`
# and `a_score`) rather than a number. f9_included must be false.
boolean_scores_judge="$TMP_DIR/f9-boolean-scores.json"
boolean_scores_manifest="$TMP_DIR/boolean-scores-manifest.json"
cat <<'JSON' > "$boolean_scores_judge"
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "seed": 1},
"scores_by_arm": {"solo_claude": true, "bare": 0},
"a_score": true,
"b_score": 0,
"disqualifiers": {"A": false, "B": false},
"disqualifiers_by_arm": {
"solo_claude": {"disqualifier": false},
"bare": {"disqualifier": false}
}
}
JSON
build_args=(
  --c1-summary "$TMP_DIR/c1.json"
  --f9-judge "$boolean_scores_judge"
  --l1-rerun-summary "$TMP_DIR/l1.json"
  --output "$boolean_scores_manifest"
)
python3 "$BUILD" "${build_args[@]}" >/dev/null
python3 - "$boolean_scores_manifest" <<'PY'
import json
import sys

with open(sys.argv[1], encoding="utf8") as handle:
    manifest = json.load(handle)

if manifest["selection_rule"]["f9_included"] is not False:
    sys.exit("F9 must not be included when mapped scores are booleans")
PY
392
+
393
# Case: both `disqualifiers` and `disqualifiers_by_arm` are JSON arrays
# instead of objects, while the scores themselves are well-formed. F9 must be
# absent from the eligible list and f9_included must be false.
malformed_dq_judge="$TMP_DIR/f9-malformed-dq.json"
malformed_dq_manifest="$TMP_DIR/malformed-dq-manifest.json"
cat <<'JSON' > "$malformed_dq_judge"
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "seed": 1},
"scores_by_arm": {"solo_claude": 91, "bare": 76},
"disqualifiers": ["not", "a", "dict"],
"disqualifiers_by_arm": ["not", "a", "dict"]
}
JSON
build_args=(
  --c1-summary "$TMP_DIR/c1.json"
  --f9-judge "$malformed_dq_judge"
  --l1-rerun-summary "$TMP_DIR/l1.json"
  --output "$malformed_dq_manifest"
)
python3 "$BUILD" "${build_args[@]}" >/dev/null
python3 - "$malformed_dq_manifest" <<'PY'
import json
import sys

with open(sys.argv[1], encoding="utf8") as handle:
    manifest = json.load(handle)

if "F9" in manifest["fixtures_pair_eligible"]:
    sys.exit("F9 must not be included when disqualifier maps are malformed")
if manifest["selection_rule"]["f9_included"] is not False:
    sys.exit("malformed disqualifier maps must fail closed before registry filtering")
PY
416
+
417
# Case: the `disqualifiers_by_arm` entry for solo_claude is a bare `true`
# rather than an object with a "disqualifier" key. Only the eligible-fixture
# list is checked here: F9 must not appear in it.
malformed_dq_entry_judge="$TMP_DIR/f9-malformed-dq-entry.json"
malformed_dq_entry_manifest="$TMP_DIR/malformed-dq-entry-manifest.json"
cat <<'JSON' > "$malformed_dq_entry_judge"
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "seed": 1},
"scores_by_arm": {"solo_claude": 91, "bare": 76},
"disqualifiers": {"A": false, "B": false},
"disqualifiers_by_arm": {"solo_claude": true}
}
JSON
build_args=(
  --c1-summary "$TMP_DIR/c1.json"
  --f9-judge "$malformed_dq_entry_judge"
  --l1-rerun-summary "$TMP_DIR/l1.json"
  --output "$malformed_dq_entry_manifest"
)
python3 "$BUILD" "${build_args[@]}" >/dev/null
python3 - "$malformed_dq_entry_manifest" <<'PY'
import json
import sys

with open(sys.argv[1], encoding="utf8") as handle:
    manifest = json.load(handle)

if "F9" in manifest["fixtures_pair_eligible"]:
    sys.exit("truthy malformed disqualifier entry must exclude F9")
PY
438
+
439
# Case: the solo_claude "disqualifier" value is the string "false" rather than
# a JSON boolean. f9_included must be false.
string_dq_entry_judge="$TMP_DIR/f9-string-dq-entry.json"
string_dq_entry_manifest="$TMP_DIR/string-dq-entry-manifest.json"
cat <<'JSON' > "$string_dq_entry_judge"
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "seed": 1},
"scores_by_arm": {"solo_claude": 91, "bare": 76},
"disqualifiers": {"A": false, "B": false},
"disqualifiers_by_arm": {"solo_claude": {"disqualifier": "false"}}
}
JSON
build_args=(
  --c1-summary "$TMP_DIR/c1.json"
  --f9-judge "$string_dq_entry_judge"
  --l1-rerun-summary "$TMP_DIR/l1.json"
  --output "$string_dq_entry_manifest"
)
python3 "$BUILD" "${build_args[@]}" >/dev/null
python3 - "$string_dq_entry_manifest" <<'PY'
import json
import sys

with open(sys.argv[1], encoding="utf8") as handle:
    manifest = json.load(handle)

if manifest["selection_rule"]["f9_included"] is not False:
    sys.exit("string disqualifier entry must fail closed")
PY
460
+
461
# Load the builder script as a Python module and test
# load_rejected_fixture_reasons() directly: first against the module's own
# REJECTED_REGISTRY (three pinned reasons), then against a one-entry stub
# registry written to a temporary directory.
python3 - "$BUILD" <<'PY'
import importlib.util
import pathlib
import sys
import tempfile

build_spec = importlib.util.spec_from_file_location("build_pair_eligible_manifest", sys.argv[1])
builder = importlib.util.module_from_spec(build_spec)
assert build_spec.loader is not None
build_spec.loader.exec_module(builder)

# Reasons expected from the registry the module points at by default.
pinned_reasons = {
    "F31": "solo_claude scored 98 with bare disqualifiers in 20260512-f31-seat-rebalance-headroom",
    "F32": "bare 33 / solo_claude 98 in 20260512-f32-subscription-renewal-headroom",
    "S3": "bare 33 / solo_claude 99 with solo timeout in 20260513-s3-ticket-headroom",
}
reasons = builder.load_rejected_fixture_reasons(builder.REJECTED_REGISTRY)
for fixture_id, expected_reason in pinned_reasons.items():
    assert reasons[fixture_id] == expected_reason

# Stub registry: a single case arm whose pattern is `S3-*|S3`.
stub_registry_lines = [
    'rejected_pair_fixture_reason() {\n',
    ' case "$1" in\n',
    ' S3-*|S3)\n',
    ' echo "shadow solo ceiling"\n',
    ' ;;\n',
    ' *) return 1 ;;\n',
    ' esac\n',
    '}\n',
]
with tempfile.TemporaryDirectory() as tmp:
    registry = pathlib.Path(tmp) / "pair-rejected-fixtures.sh"
    registry.write_text("".join(stub_registry_lines), encoding="utf8")
    assert builder.load_rejected_fixture_reasons(registry) == {"S3": "shadow solo ceiling"}
PY

printf '%s\n' "PASS test-build-pair-eligible-manifest"
@@ -0,0 +1,91 @@
1
#!/usr/bin/env bash
# Regression tests for check-f9-artifacts.py.
#
# The checker is expected to sit next to this script. All fixtures are written
# under a fresh temporary directory that is removed when the script exits.

set -euo pipefail

script_parent="$(dirname "$0")"
SCRIPT_DIR="$(cd "$script_parent" && pwd)"
CHECK="${SCRIPT_DIR}/check-f9-artifacts.py"
TMP_DIR="$(mktemp -d /tmp/check-f9-artifacts-test.XXXXXX)"

# Remove the scratch directory on every exit path.
cleanup() {
  rm -rf "$TMP_DIR"
}
trap cleanup EXIT
10
+
11
#######################################
# Create a well-formed fake F9 run for one arm and print its result directory.
# Globals:   TMP_DIR (read) - root under which both trees are created
# Arguments: $1 - arm name, used as the leaf directory name
# Outputs:   the result directory path on stdout
# Files written:
#   <result_dir>/timing.json      - JSON object whose "work_dir" is the work tree
#   <result_dir>/transcript.txt   - one "spec ready" line naming the spec path
#   <work_dir>/docs/specs/F9-e2e-ideate-to-resolve/spec.md
#   <work_dir>/docs/specs/F9-e2e-ideate-to-resolve/spec.expected.json
#   <work_dir>/.devlyn/pipeline.state.json
#######################################
write_result() {
  local arm="$1"
  local fixture="F9-e2e-ideate-to-resolve"
  local result_dir="$TMP_DIR/results/run/$fixture/$arm"
  local work_dir="$TMP_DIR/work/$arm"
  local spec_dir="$work_dir/docs/specs/$fixture"

  mkdir -p "$result_dir" "$spec_dir" "$work_dir/.devlyn"

  printf '{"work_dir": "%s"}\n' "$work_dir" > "$result_dir/timing.json"
  printf '%s\n' '# F9' > "$spec_dir/spec.md"
  printf '%s\n' '{"verification_commands": []}' > "$spec_dir/spec.expected.json"
  printf '%s\n' \
    '{' \
    '"mode": "spec",' \
    '"source": {' \
    '"type": "spec",' \
    '"spec_path": "docs/specs/F9-e2e-ideate-to-resolve/spec.md"' \
    '}' \
    '}' \
    > "$work_dir/.devlyn/pipeline.state.json"
  printf '%s\n' \
    'spec ready - /devlyn:resolve --spec docs/specs/F9-e2e-ideate-to-resolve/spec.md' \
    > "$result_dir/transcript.txt"

  printf '%s\n' "$result_dir"
}
39
+
40
# --- helpers ----------------------------------------------------------------

# Print the "work_dir" value recorded in <result-dir>/timing.json.
# Arguments: $1 - result directory created by write_result
recorded_work_dir() {
  python3 -c '
import json
import sys

with open(sys.argv[1], encoding="utf8") as handle:
    print(json.load(handle)["work_dir"])
' "$1/timing.json"
}

# Run the checker on a result directory that it must reject.
# Arguments: $1 - result directory
#            $2 - short description of the broken input, used in the message
expect_check_failure() {
  local result_dir="$1"
  local what="$2"
  if python3 "$CHECK" --result-dir "$result_dir"; then
    echo "expected $what to fail" >&2
    exit 1
  fi
}

# Assert that <result-dir>/check-f9-artifacts.json contains every given fixed
# string. Without this wrapper a failing `grep -q` would abort the script via
# `set -e` with no message about which expectation was missed.
# Arguments: $1 - result directory; $2.. - fixed strings to look for
assert_report_contains() {
  local report="$1/check-f9-artifacts.json"
  shift
  local needle
  for needle in "$@"; do
    if ! grep -Fq -- "$needle" "$report"; then
      printf 'expected %s to contain: %s\n' "$report" "$needle" >&2
      exit 1
    fi
  done
}

# --- cases ------------------------------------------------------------------

# Untouched write_result fixture: the checker succeeds and the report names
# the arm and contains a passing entry.
risk_result="$(write_result l2_risk_probes)"
python3 "$CHECK" --result-dir "$risk_result"
assert_report_contains "$risk_result" \
  '"arm": "l2_risk_probes"' \
  '"pass": true'

# timing.json replaced by a JSON array: the checker must exit non-zero and the
# report must mention the work-dir-resolvable check and a failing entry.
malformed_timing_result="$(write_result solo_claude)"
printf '%s\n' '["not", "a", "dict"]' > "$malformed_timing_result/timing.json"
expect_check_failure "$malformed_timing_result" "malformed timing.json"
assert_report_contains "$malformed_timing_result" \
  '"name": "work-dir-resolvable"' \
  '"pass": false'

# pipeline.state.json replaced by a JSON array: rejected with the
# "expected JSON object" reason.
malformed_state_result="$(write_result l2_gated)"
work_dir="$(recorded_work_dir "$malformed_state_result")"
printf '%s\n' '["not", "a", "dict"]' > "$work_dir/.devlyn/pipeline.state.json"
expect_check_failure "$malformed_state_result" "malformed pipeline.state.json"
assert_report_contains "$malformed_state_result" \
  '"name": "pipeline.state.json-parses"' \
  '"reason": "expected JSON object"'

# pipeline.state.json containing the non-standard constant NaN: rejected with
# a reason that names the constant.
nan_state_result="$(write_result l2_forced)"
nan_work_dir="$(recorded_work_dir "$nan_state_result")"
cat > "$nan_work_dir/.devlyn/pipeline.state.json" <<'EOF'
{"mode": NaN, "source": {"type": "spec", "spec_path": "docs/specs/F9-e2e-ideate-to-resolve/spec.md"}}
EOF
expect_check_failure "$nan_state_result" "NaN pipeline.state.json"
assert_report_contains "$nan_state_result" \
  '"name": "pipeline.state.json-parses"' \
  'invalid JSON numeric constant: NaN'

# bare arm with an empty result directory: the checker succeeds and marks the
# report as exempt.
bare_result="$TMP_DIR/results/run/F9-e2e-ideate-to-resolve/bare"
mkdir -p "$bare_result"
python3 "$CHECK" --result-dir "$bare_result"
assert_report_contains "$bare_result" '"exempt": true'

echo "PASS test-check-f9-artifacts"