devlyn-cli 2.3.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219):
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +80 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +210 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -0,0 +1,525 @@
#!/usr/bin/env bash
# Regression tests for iter-0033c-compare.py score-source handling.
#
# Globals defined here and used by the rest of the script:
#   SCRIPT_DIR - absolute directory containing this script
#   COMPARE    - path of the comparator under test, resolved next to this script
#   TMP_DIR    - per-run scratch directory, removed when the script exits

set -euo pipefail

# `--` keeps a script path that begins with '-' from being parsed as an option.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)"
COMPARE="$SCRIPT_DIR/iter-0033c-compare.py"
# Honour TMPDIR (sandboxes/CI often relocate scratch space); fall back to /tmp.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/iter-0033c-compare-test.XXXXXX")"
# Clean up on every exit path, including failures under `set -e`.
trap 'rm -rf -- "$TMP_DIR"' EXIT
10
+
# Writes the baseline synthetic manifest (one pair-eligible fixture "F1",
# threshold 1 of 1) to the given path, overwriting any existing file.
# Arguments: $1 - destination file path
write_manifest() {
  local manifest_path="$1"
  printf '%s\n' \
    '{' \
    '"manifest_sha256": "synthetic",' \
    '"fixtures_pair_eligible": ["F1"],' \
    '"gate3_threshold_count": 1,' \
    '"gate3_total": 1' \
    '}' > "$manifest_path"
}
22
+
# Writes a synthetic manifest whose variable fields are supplied by the caller.
# The values are spliced in verbatim, so each must already be a JSON fragment.
# Arguments:
#   $1 - destination file path
#   $2 - value for "fixtures_pair_eligible" (e.g. '["F1"]')
#   $3 - value for "gate3_threshold_count"
#   $4 - value for "gate3_total"
write_manifest_with_values() {
  local manifest_path="$1"
  local eligible_json="$2"
  local threshold_json="$3"
  local total_json="$4"
  printf '%s\n' \
    '{' \
    '"manifest_sha256": "synthetic",' \
    "\"fixtures_pair_eligible\": $eligible_json," \
    "\"gate3_threshold_count\": $threshold_json," \
    "\"gate3_total\": $total_json" \
    '}' > "$manifest_path"
}
37
+
# Private helper shared by every write_fixture* function below.
# Creates <fixture>/solo_claude and <fixture>/l2_gated and writes one
# result.json into each.
# Arguments:
#   $1 - fixture directory
#   $2 - optional solo_claude result.json content (default: 100s, not timed out)
#   $3 - optional l2_gated result.json content (default: 150s, not timed out)
_seed_fixture_arms() {
  local fixture="$1"
  local solo_result='{"elapsed_seconds": 100, "timed_out": false}'
  local gated_result='{"elapsed_seconds": 150, "timed_out": false}'
  if [[ $# -ge 2 ]]; then
    solo_result="$2"
  fi
  if [[ $# -ge 3 ]]; then
    gated_result="$3"
  fi
  mkdir -p "$fixture"/{solo_claude,l2_gated}
  printf '%s\n' "$solo_result" > "$fixture/solo_claude/result.json"
  printf '%s\n' "$gated_result" > "$fixture/l2_gated/result.json"
}

# Writes a well-formed F1-synthetic fixture under the results directory.
# Arguments:
#   $1 - results directory
#   $2 - arm name recorded for blind slot "C" in judge.json
write_fixture() {
  local run_dir="$1"
  local mapping_c="$2"
  local fixture="$run_dir/F1-synthetic"
  _seed_fixture_arms "$fixture"
  # Unquoted delimiter on purpose: $mapping_c must expand inside the JSON.
  cat > "$fixture/judge.json" <<JSON
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "$mapping_c", "seed": 1},
"scores_by_arm": {"solo_claude": 60, "bare": 50, "l2_gated": 70},
"disqualifiers_by_arm": {
"solo_claude": {"disqualifier": false},
"l2_gated": {"disqualifier": false}
}
}
JSON
}

# Fixture whose judge.json has "_blind_mapping" as a string instead of an object.
# Arguments: $1 - results directory
write_fixture_with_malformed_mapping() {
  local run_dir="$1"
  local fixture="$run_dir/F1-synthetic"
  _seed_fixture_arms "$fixture"
  cat > "$fixture/judge.json" <<'JSON'
{
"_blind_mapping": "not-a-dict",
"scores_by_arm": {"solo_claude": 60, "bare": 50, "l2_gated": 70},
"disqualifiers_by_arm": {
"solo_claude": {"disqualifier": false},
"l2_gated": {"disqualifier": false}
}
}
JSON
}

# Fixture whose judge.json has "scores_by_arm" as an array instead of an object.
# Arguments: $1 - results directory
write_fixture_with_malformed_scores() {
  local run_dir="$1"
  local fixture="$run_dir/F1-synthetic"
  _seed_fixture_arms "$fixture"
  cat > "$fixture/judge.json" <<'JSON'
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "l2_gated", "seed": 1},
"scores_by_arm": ["not", "a", "dict"],
"disqualifiers_by_arm": {
"solo_claude": {"disqualifier": false},
"l2_gated": {"disqualifier": false}
}
}
JSON
}

# Fixture whose "disqualifiers_by_arm" entry for l2_gated is a bare boolean
# rather than an object.
# Arguments: $1 - results directory
write_fixture_with_malformed_dq_entry() {
  local run_dir="$1"
  local fixture="$run_dir/F1-synthetic"
  _seed_fixture_arms "$fixture"
  cat > "$fixture/judge.json" <<'JSON'
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "l2_gated", "seed": 1},
"scores_by_arm": {"solo_claude": 60, "bare": 50, "l2_gated": 70},
"disqualifiers_by_arm": {"l2_gated": true}
}
JSON
}

# Fixture that carries only a "disqualifiers" key, and that key holds an array.
# Arguments: $1 - results directory
write_fixture_with_malformed_legacy_dq() {
  local run_dir="$1"
  local fixture="$run_dir/F1-synthetic"
  _seed_fixture_arms "$fixture"
  cat > "$fixture/judge.json" <<'JSON'
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "l2_gated", "seed": 1},
"scores_by_arm": {"solo_claude": 60, "bare": 50, "l2_gated": 70},
"disqualifiers": ["not", "a", "dict"]
}
JSON
}

# Fixture whose l2_gated "disqualifier" flag is the string "false" rather than
# a JSON boolean.
# Arguments: $1 - results directory
write_fixture_with_string_dq_entry() {
  local run_dir="$1"
  local fixture="$run_dir/F1-synthetic"
  _seed_fixture_arms "$fixture"
  cat > "$fixture/judge.json" <<'JSON'
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "l2_gated", "seed": 1},
"scores_by_arm": {"solo_claude": 60, "bare": 50, "l2_gated": 70},
"disqualifiers_by_arm": {
"solo_claude": {"disqualifier": false},
"l2_gated": {"disqualifier": "false"}
}
}
JSON
}

# Fixture whose l2_gated result.json stores "timed_out" as the string "false"
# rather than a JSON boolean; judge.json is well-formed.
# Arguments: $1 - results directory
write_fixture_with_string_timeout() {
  local run_dir="$1"
  local fixture="$run_dir/F1-synthetic"
  _seed_fixture_arms "$fixture" \
    '{"elapsed_seconds": 100, "timed_out": false}' \
    '{"elapsed_seconds": 150, "timed_out": "false"}'
  cat > "$fixture/judge.json" <<'JSON'
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "l2_gated", "seed": 1},
"scores_by_arm": {"solo_claude": 60, "bare": 50, "l2_gated": 70},
"disqualifiers_by_arm": {
"solo_claude": {"disqualifier": false},
"l2_gated": {"disqualifier": false}
}
}
JSON
}

# Fixture whose solo_claude result.json is a JSON array instead of an object;
# judge.json is well-formed.
# Arguments: $1 - results directory
write_fixture_with_malformed_result() {
  local run_dir="$1"
  local fixture="$run_dir/F1-synthetic"
  _seed_fixture_arms "$fixture" '["not", "a", "dict"]'
  cat > "$fixture/judge.json" <<'JSON'
{
"_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "l2_gated", "seed": 1},
"scores_by_arm": {"solo_claude": 60, "bare": 50, "l2_gated": 70},
"disqualifiers_by_arm": {
"solo_claude": {"disqualifier": false},
"l2_gated": {"disqualifier": false}
}
}
JSON
}
206
+
# Writes a pipeline.state.json for one arm of the synthetic F1 run.
# The file lands in
#   $TMP_DIR/bench-synthetic-F1-synthetic-<arm>/.devlyn/runs/001/
# and records verify sub-verdicts: "judge" fixed at PASS_WITH_ISSUES and
# "pair_judge" set to the caller-supplied value.
# Globals:   TMP_DIR (read)
# Arguments:
#   $1 - arm name (e.g. l2_gated)
#   $2 - raw JSON fragment spliced in verbatim as the "pair_judge" value
write_state_pair_judge() {
  local arm="$1"
  local pair_judge_json="$2"
  local state_dir="$TMP_DIR/bench-synthetic-F1-synthetic-$arm/.devlyn/runs/001"
  local state_file="$state_dir/pipeline.state.json"
  mkdir -p "$state_dir"
  printf '%s\n' \
    '{' \
    '"phases": {' \
    '"verify": {' \
    '"sub_verdicts": {' \
    '"judge": "PASS_WITH_ISSUES",' \
    "\"pair_judge\": $pair_judge_json" \
    '}' \
    '}' \
    '}' \
    '}' > "$state_file"
}
225
+
# Asserts that a command fails AND that its combined stdout/stderr contains a
# literal string. On a violated expectation the captured output is echoed to
# stderr and the whole script exits with status 1.
# Globals:   TMP_DIR (read) - captured output is stored at $TMP_DIR/<label>.out
# Arguments:
#   $1   - label naming this check (also used as the capture file's basename)
#   $2   - literal text (not a regex) that must appear in the output
#   $3.. - command and arguments to run
expect_fail_contains() {
  local label="$1"
  local needle="$2"
  shift 2
  local out="$TMP_DIR/$label.out"
  # Running the command inside `if` keeps `set -e` from aborting on its failure.
  if "$@" > "$out" 2>&1; then
    echo "expected failure for $label" >&2
    cat "$out" >&2
    exit 1
  fi
  # `--` ends option parsing so a needle beginning with '-' (for example a CLI
  # flag name) is matched as text rather than rejected as a grep option.
  if ! grep -Fq -- "$needle" "$out"; then
    echo "missing expected text for $label: $needle" >&2
    cat "$out" >&2
    exit 1
  fi
}
242
+
243
+ MANIFEST="$TMP_DIR/manifest.json"
244
+ write_manifest "$MANIFEST"
245
+
246
+ cat > "$TMP_DIR/nan-threshold-manifest.json" <<'JSON'
247
+ {
248
+ "fixtures_pair_eligible": ["F1"],
249
+ "gate3_threshold_count": NaN,
250
+ "gate3_total": 1,
251
+ "manifest_sha256": "synthetic"
252
+ }
253
+ JSON
254
+ expect_fail_contains nan-threshold-manifest "manifest malformed: invalid JSON" \
255
+ python3 "$COMPARE" \
256
+ --manifest "$TMP_DIR/nan-threshold-manifest.json" \
257
+ --results-dir "$TMP_DIR" \
258
+ --work-dir-root "$TMP_DIR" \
259
+ --run-id synthetic \
260
+ --out-json "$TMP_DIR/nan-threshold.json" \
261
+ --out-md "$TMP_DIR/nan-threshold.md"
262
+
263
+ write_manifest_with_values "$TMP_DIR/empty-manifest.json" '[]' 0 0
264
+ expect_fail_contains empty-manifest "fixtures_pair_eligible must not be empty" \
265
+ python3 "$COMPARE" \
266
+ --manifest "$TMP_DIR/empty-manifest.json" \
267
+ --results-dir "$TMP_DIR" \
268
+ --work-dir-root "$TMP_DIR" \
269
+ --run-id synthetic \
270
+ --out-json "$TMP_DIR/empty.json" \
271
+ --out-md "$TMP_DIR/empty.md"
272
+
273
+ write_manifest_with_values "$TMP_DIR/zero-threshold-manifest.json" '["F1"]' 0 1
274
+ expect_fail_contains zero-threshold-manifest "gate3_threshold_count must be a positive integer" \
275
+ python3 "$COMPARE" \
276
+ --manifest "$TMP_DIR/zero-threshold-manifest.json" \
277
+ --results-dir "$TMP_DIR" \
278
+ --work-dir-root "$TMP_DIR" \
279
+ --run-id synthetic \
280
+ --out-json "$TMP_DIR/zero-threshold.json" \
281
+ --out-md "$TMP_DIR/zero-threshold.md"
282
+
283
+ write_manifest_with_values "$TMP_DIR/total-mismatch-manifest.json" '["F1"]' 1 2
284
+ expect_fail_contains total-mismatch-manifest "gate3_total must equal fixtures_pair_eligible length" \
285
+ python3 "$COMPARE" \
286
+ --manifest "$TMP_DIR/total-mismatch-manifest.json" \
287
+ --results-dir "$TMP_DIR" \
288
+ --work-dir-root "$TMP_DIR" \
289
+ --run-id synthetic \
290
+ --out-json "$TMP_DIR/total-mismatch.json" \
291
+ --out-md "$TMP_DIR/total-mismatch.md"
292
+
293
+ cat > "$TMP_DIR/bad-rejected-reasons-manifest.json" <<'JSON'
294
+ {
295
+ "manifest_sha256": "synthetic",
296
+ "fixtures_pair_eligible": ["F1"],
297
+ "gate3_threshold_count": 1,
298
+ "gate3_total": 1,
299
+ "selection_rule": {
300
+ "rejected_excluded": ["F2"],
301
+ "rejected_excluded_reasons": {"F3": "wrong fixture"}
302
+ }
303
+ }
304
+ JSON
305
+ expect_fail_contains bad-rejected-reasons-manifest "selection_rule.rejected_excluded_reasons keys must match rejected_excluded" \
306
+ python3 "$COMPARE" \
307
+ --manifest "$TMP_DIR/bad-rejected-reasons-manifest.json" \
308
+ --results-dir "$TMP_DIR" \
309
+ --work-dir-root "$TMP_DIR" \
310
+ --run-id synthetic \
311
+ --out-json "$TMP_DIR/bad-rejected-reasons.json" \
312
+ --out-md "$TMP_DIR/bad-rejected-reasons.md"
313
+
314
+ PASS_DIR="$TMP_DIR/pass-results"
315
+ mkdir -p "$PASS_DIR"
316
+ write_fixture "$PASS_DIR" "l2_gated"
317
+ python3 "$COMPARE" \
318
+ --manifest "$MANIFEST" \
319
+ --results-dir "$PASS_DIR" \
320
+ --work-dir-root "$TMP_DIR" \
321
+ --run-id synthetic \
322
+ --out-json "$TMP_DIR/pass.json" \
323
+ --out-md "$TMP_DIR/pass.md"
324
+ grep -Fq '"ship_blockers_failed": []' "$TMP_DIR/pass.json"
325
+ grep -Fq '"l2_gated_score": 70' "$TMP_DIR/pass.json"
326
+
327
+ MALFORMED_PAIR_STATE_DIR="$TMP_DIR/malformed-pair-state-results"
328
+ mkdir -p "$MALFORMED_PAIR_STATE_DIR"
329
+ write_fixture "$MALFORMED_PAIR_STATE_DIR" "l2_gated"
330
+ write_state_pair_judge l2_gated '""'
331
+ python3 "$COMPARE" \
332
+ --manifest "$MANIFEST" \
333
+ --results-dir "$MALFORMED_PAIR_STATE_DIR" \
334
+ --work-dir-root "$TMP_DIR" \
335
+ --run-id synthetic \
336
+ --out-json "$TMP_DIR/malformed-pair-state.json" \
337
+ --out-md "$TMP_DIR/malformed-pair-state.md"
338
+ grep -Fq '"l2_gated_pair_judge_present": false' "$TMP_DIR/malformed-pair-state.json"
339
+ grep -Fq '"pair_fired": false' "$TMP_DIR/malformed-pair-state.json"
340
+ grep -Fq '"ship_blockers_failed": []' "$TMP_DIR/malformed-pair-state.json"
341
+
342
+ BAD_DIR="$TMP_DIR/bad-results"
343
+ mkdir -p "$BAD_DIR"
344
+ write_fixture "$BAD_DIR" "l2_forced"
345
+ expect_fail_contains bad-mapping "SHIP-BLOCKER FAIL" \
346
+ python3 "$COMPARE" \
347
+ --manifest "$MANIFEST" \
348
+ --results-dir "$BAD_DIR" \
349
+ --work-dir-root "$TMP_DIR" \
350
+ --run-id synthetic \
351
+ --out-json "$TMP_DIR/bad.json" \
352
+ --out-md "$TMP_DIR/bad.md"
353
+ grep -Fq '"l2_gated_score": null' "$TMP_DIR/bad.json"
354
+ grep -Fq '"3-lift-on-pair-eligible"' "$TMP_DIR/bad.json"
355
+
356
+ MALFORMED_DIR="$TMP_DIR/malformed-results"
357
+ mkdir -p "$MALFORMED_DIR"
358
+ write_fixture_with_malformed_mapping "$MALFORMED_DIR"
359
+ expect_fail_contains malformed-mapping "SHIP-BLOCKER FAIL" \
360
+ python3 "$COMPARE" \
361
+ --manifest "$MANIFEST" \
362
+ --results-dir "$MALFORMED_DIR" \
363
+ --work-dir-root "$TMP_DIR" \
364
+ --run-id synthetic \
365
+ --out-json "$TMP_DIR/malformed.json" \
366
+ --out-md "$TMP_DIR/malformed.md"
367
+ grep -Fq '"solo_score": null' "$TMP_DIR/malformed.json"
368
+ grep -Fq '"l2_gated_score": null' "$TMP_DIR/malformed.json"
369
+ grep -Fq '"solo_dq": true' "$TMP_DIR/malformed.json"
370
+ grep -Fq '"l2_gated_dq": true' "$TMP_DIR/malformed.json"
371
+
372
+ MALFORMED_SCORES_DIR="$TMP_DIR/malformed-scores-results"
373
+ mkdir -p "$MALFORMED_SCORES_DIR"
374
+ write_fixture_with_malformed_scores "$MALFORMED_SCORES_DIR"
375
+ expect_fail_contains malformed-scores "SHIP-BLOCKER FAIL" \
376
+ python3 "$COMPARE" \
377
+ --manifest "$MANIFEST" \
378
+ --results-dir "$MALFORMED_SCORES_DIR" \
379
+ --work-dir-root "$TMP_DIR" \
380
+ --run-id synthetic \
381
+ --out-json "$TMP_DIR/malformed-scores.json" \
382
+ --out-md "$TMP_DIR/malformed-scores.md"
383
+ grep -Fq '"solo_score": null' "$TMP_DIR/malformed-scores.json"
384
+ grep -Fq '"l2_gated_score": null' "$TMP_DIR/malformed-scores.json"
385
+
386
+ OVERRANGE_SCORES_DIR="$TMP_DIR/overrange-scores-results"
387
+ mkdir -p "$OVERRANGE_SCORES_DIR"
388
+ write_fixture "$OVERRANGE_SCORES_DIR" "l2_gated"
389
+ python3 - "$OVERRANGE_SCORES_DIR/F1-synthetic/judge.json" <<'PY'
390
+ import json, sys
391
+ path = sys.argv[1]
392
+ data = json.load(open(path))
393
+ data["scores_by_arm"]["l2_gated"] = 101
394
+ data["c_score"] = 101
395
+ json.dump(data, open(path, "w"), indent=2)
396
+ PY
397
+ expect_fail_contains overrange-scores "SHIP-BLOCKER FAIL" \
398
+ python3 "$COMPARE" \
399
+ --manifest "$MANIFEST" \
400
+ --results-dir "$OVERRANGE_SCORES_DIR" \
401
+ --work-dir-root "$TMP_DIR" \
402
+ --run-id synthetic \
403
+ --out-json "$TMP_DIR/overrange-scores.json" \
404
+ --out-md "$TMP_DIR/overrange-scores.md"
405
+ grep -Fq '"l2_gated_score": null' "$TMP_DIR/overrange-scores.json"
406
+
407
+ BOOLEAN_SCORES_DIR="$TMP_DIR/boolean-scores-results"
408
+ mkdir -p "$BOOLEAN_SCORES_DIR"
409
+ write_fixture "$BOOLEAN_SCORES_DIR" "l2_gated"
410
+ python3 - "$BOOLEAN_SCORES_DIR/F1-synthetic/judge.json" <<'PY'
411
+ import json, sys
412
+ path = sys.argv[1]
413
+ data = json.load(open(path))
414
+ data["scores_by_arm"]["solo_claude"] = True
415
+ data["a_score"] = True
416
+ json.dump(data, open(path, "w"), indent=2)
417
+ PY
418
+ expect_fail_contains boolean-scores "SHIP-BLOCKER FAIL" \
419
+ python3 "$COMPARE" \
420
+ --manifest "$MANIFEST" \
421
+ --results-dir "$BOOLEAN_SCORES_DIR" \
422
+ --work-dir-root "$TMP_DIR" \
423
+ --run-id synthetic \
424
+ --out-json "$TMP_DIR/boolean-scores.json" \
425
+ --out-md "$TMP_DIR/boolean-scores.md"
426
+ grep -Fq '"solo_score": null' "$TMP_DIR/boolean-scores.json"
427
+
428
+ BOOLEAN_WALL_DIR="$TMP_DIR/boolean-wall-results"
429
+ mkdir -p "$BOOLEAN_WALL_DIR"
430
+ write_fixture "$BOOLEAN_WALL_DIR" "l2_gated"
431
+ python3 - "$BOOLEAN_WALL_DIR/F1-synthetic/l2_gated/result.json" <<'PY'
432
+ import json, sys
433
+ path = sys.argv[1]
434
+ data = json.load(open(path))
435
+ data["elapsed_seconds"] = True
436
+ json.dump(data, open(path, "w"), indent=2)
437
+ PY
438
+ python3 "$COMPARE" \
439
+ --manifest "$MANIFEST" \
440
+ --results-dir "$BOOLEAN_WALL_DIR" \
441
+ --work-dir-root "$TMP_DIR" \
442
+ --run-id synthetic \
443
+ --out-json "$TMP_DIR/boolean-wall.json" \
444
+ --out-md "$TMP_DIR/boolean-wall.md" >/dev/null
445
+ grep -Fq '"l2_gated_wall": null' "$TMP_DIR/boolean-wall.json"
446
+
447
+ NAN_WALL_DIR="$TMP_DIR/nan-wall-results"
448
+ mkdir -p "$NAN_WALL_DIR"
449
+ write_fixture "$NAN_WALL_DIR" "l2_gated"
450
+ cat > "$NAN_WALL_DIR/F1-synthetic/l2_gated/result.json" <<'JSON'
451
+ {"elapsed_seconds": NaN, "timed_out": false}
452
+ JSON
453
+ python3 "$COMPARE" \
454
+ --manifest "$MANIFEST" \
455
+ --results-dir "$NAN_WALL_DIR" \
456
+ --work-dir-root "$TMP_DIR" \
457
+ --run-id synthetic \
458
+ --out-json "$TMP_DIR/nan-wall.json" \
459
+ --out-md "$TMP_DIR/nan-wall.md" >/dev/null
460
+ grep -Fq '"l2_gated_wall": null' "$TMP_DIR/nan-wall.json"
461
+
462
+ STRING_TIMEOUT_DIR="$TMP_DIR/string-timeout-results"
463
+ mkdir -p "$STRING_TIMEOUT_DIR"
464
+ write_fixture_with_string_timeout "$STRING_TIMEOUT_DIR"
465
+ expect_fail_contains string-timeout "SHIP-BLOCKER FAIL" \
466
+ python3 "$COMPARE" \
467
+ --manifest "$MANIFEST" \
468
+ --results-dir "$STRING_TIMEOUT_DIR" \
469
+ --work-dir-root "$TMP_DIR" \
470
+ --run-id synthetic \
471
+ --out-json "$TMP_DIR/string-timeout.json" \
472
+ --out-md "$TMP_DIR/string-timeout.md"
473
+ grep -Fq '"l2_gated_timeout": true' "$TMP_DIR/string-timeout.json"
474
+
475
+ MALFORMED_DQ_ENTRY_DIR="$TMP_DIR/malformed-dq-entry-results"
476
+ mkdir -p "$MALFORMED_DQ_ENTRY_DIR"
477
+ write_fixture_with_malformed_dq_entry "$MALFORMED_DQ_ENTRY_DIR"
478
+ python3 "$COMPARE" \
479
+ --manifest "$MANIFEST" \
480
+ --results-dir "$MALFORMED_DQ_ENTRY_DIR" \
481
+ --work-dir-root "$TMP_DIR" \
482
+ --run-id synthetic \
483
+ --out-json "$TMP_DIR/malformed-dq-entry.json" \
484
+ --out-md "$TMP_DIR/malformed-dq-entry.md" >/dev/null
485
+ grep -Fq '"l2_gated_dq": true' "$TMP_DIR/malformed-dq-entry.json"
486
+
487
+ STRING_DQ_ENTRY_DIR="$TMP_DIR/string-dq-entry-results"
488
+ mkdir -p "$STRING_DQ_ENTRY_DIR"
489
+ write_fixture_with_string_dq_entry "$STRING_DQ_ENTRY_DIR"
490
+ python3 "$COMPARE" \
491
+ --manifest "$MANIFEST" \
492
+ --results-dir "$STRING_DQ_ENTRY_DIR" \
493
+ --work-dir-root "$TMP_DIR" \
494
+ --run-id synthetic \
495
+ --out-json "$TMP_DIR/string-dq-entry.json" \
496
+ --out-md "$TMP_DIR/string-dq-entry.md" >/dev/null
497
+ grep -Fq '"l2_gated_dq": true' "$TMP_DIR/string-dq-entry.json"
498
+
499
+ MALFORMED_LEGACY_DQ_DIR="$TMP_DIR/malformed-legacy-dq-results"
500
+ mkdir -p "$MALFORMED_LEGACY_DQ_DIR"
501
+ write_fixture_with_malformed_legacy_dq "$MALFORMED_LEGACY_DQ_DIR"
502
+ python3 "$COMPARE" \
503
+ --manifest "$MANIFEST" \
504
+ --results-dir "$MALFORMED_LEGACY_DQ_DIR" \
505
+ --work-dir-root "$TMP_DIR" \
506
+ --run-id synthetic \
507
+ --out-json "$TMP_DIR/malformed-legacy-dq.json" \
508
+ --out-md "$TMP_DIR/malformed-legacy-dq.md" >/dev/null
509
+ grep -Fq '"ship_blockers_failed": []' "$TMP_DIR/malformed-legacy-dq.json"
510
+ grep -Fq '"solo_dq": true' "$TMP_DIR/malformed-legacy-dq.json"
511
+ grep -Fq '"l2_gated_dq": true' "$TMP_DIR/malformed-legacy-dq.json"
512
+
513
+ MALFORMED_RESULT_DIR="$TMP_DIR/malformed-result-results"
514
+ mkdir -p "$MALFORMED_RESULT_DIR"
515
+ write_fixture_with_malformed_result "$MALFORMED_RESULT_DIR"
516
+ python3 "$COMPARE" \
517
+ --manifest "$MANIFEST" \
518
+ --results-dir "$MALFORMED_RESULT_DIR" \
519
+ --work-dir-root "$TMP_DIR" \
520
+ --run-id synthetic \
521
+ --out-json "$TMP_DIR/malformed-result.json" \
522
+ --out-md "$TMP_DIR/malformed-result.md" >/dev/null
523
+ grep -Fq '"solo_wall": null' "$TMP_DIR/malformed-result.json"
524
+
525
+ echo "PASS test-iter-0033c-compare"