devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +318 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  12. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  18. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  25. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  31. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  40. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  48. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  56. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  64. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  73. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  82. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  91. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  100. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  101. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  102. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  103. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  104. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  105. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  106. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  107. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  110. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  111. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  112. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  113. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  114. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  116. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  117. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  118. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  119. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  120. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  121. package/config/skills/_shared/archive_run.py +3 -0
  122. package/config/skills/_shared/codex-config.md +2 -2
  123. package/config/skills/_shared/codex-monitored.sh +72 -7
  124. package/config/skills/_shared/collect-codex-findings.py +125 -0
  125. package/config/skills/_shared/engine-preflight.md +1 -1
  126. package/config/skills/_shared/expected.schema.json +18 -0
  127. package/config/skills/_shared/spec-verify-check.py +312 -10
  128. package/config/skills/_shared/verify-merge-findings.py +327 -0
  129. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  130. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  131. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  132. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  133. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  134. package/package.json +1 -1
  135. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,511 @@
1
+ #!/usr/bin/env bash
2
+ # run-frozen-verify-pair.sh — compare solo VERIFY vs pair VERIFY on one frozen diff.
3
+ #
4
+ # This isolates VERIFY/JUDGE from IMPLEMENT: the implementation diff is applied
5
+ # before /devlyn:resolve starts, then both arms run verify-only against the same
6
+ # committed code and `.devlyn/external-diff.patch`.
7
+
8
+ set -euo pipefail
9
+
10
# usage [EXIT_CODE]
#
# Print the command-line help and exit with EXIT_CODE (default 1).
# Help that was explicitly requested (EXIT_CODE 0) goes to stdout so it can be
# piped or paged; help printed because of a usage error goes to stderr.
usage() {
  local code="${1:-1}"
  local out_fd=2
  if [ "$code" = "0" ]; then
    out_fd=1
  fi
  cat >&"$out_fd" <<EOF
usage: $0 --fixture <FID> --diff <path> [--run-id ID] [--pair-mode forced|gated]
       [--fixtures-root <path>] [--base-repo <path>]
       [--timeout-seconds N] [--prepare-only] [--resume-completed-arms]

Runs two verify-only arms:
  solo = /devlyn:resolve --verify-only ... --engine claude
  pair = forced: /devlyn:resolve --verify-only ... --engine claude --pair-verify
         gated: /devlyn:resolve --verify-only ... --engine claude

By default fixtures come from benchmark/auto-resolve/fixtures and the base repo
is fixtures/test-repo. External corpora such as SWE-bench can pass their own
case root and checked-out base repo.
EOF
  exit "$code"
}
27
+
28
# Option defaults; parse_args overwrites them from the command line.
FIXTURE=""
DIFF_PATH=""
RUN_ID=""
PAIR_MODE="forced"
FIXTURES_ROOT=""
BASE_REPO=""
PREPARE_ONLY=0
TIMEOUT_OVERRIDE=""
RESUME_COMPLETED_ARMS=0

# parse_args ARG...
#
# Fill the option globals above from the given arguments.
# Options that take a value are checked for that value first: with `set -u`
# active, reading a missing "$2" would otherwise abort the script with a bare
# "unbound variable" error instead of the usage text.
# Unknown arguments and missing values print a diagnostic and call `usage 1`.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --fixture|--diff|--run-id|--pair-mode|--fixtures-root|--base-repo|--timeout-seconds)
        if [ $# -lt 2 ]; then
          echo "option $1 requires a value" >&2
          usage 1
        fi
        case "$1" in
          --fixture) FIXTURE="$2" ;;
          --diff) DIFF_PATH="$2" ;;
          --run-id) RUN_ID="$2" ;;
          --pair-mode) PAIR_MODE="$2" ;;
          --fixtures-root) FIXTURES_ROOT="$2" ;;
          --base-repo) BASE_REPO="$2" ;;
          --timeout-seconds) TIMEOUT_OVERRIDE="$2" ;;
        esac
        shift 2
        ;;
      --prepare-only) PREPARE_ONLY=1; shift ;;
      --resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift ;;
      -h|--help) usage 0 ;;
      *) echo "unknown arg: $1" >&2; usage 1 ;;
    esac
  done
}

parse_args "$@"
52
+
53
# Validate the required options and the diff file before touching anything.
if [ -z "$FIXTURE" ] || [ -z "$DIFF_PATH" ]; then
  usage 1
fi
[ -f "$DIFF_PATH" ] || { echo "diff not found: $DIFF_PATH" >&2; exit 1; }
[ -s "$DIFF_PATH" ] || { echo "diff is empty: $DIFF_PATH" >&2; exit 1; }
[ "$PAIR_MODE" = "forced" ] || [ "$PAIR_MODE" = "gated" ] || { echo "--pair-mode must be forced|gated (got '$PAIR_MODE')" >&2; exit 1; }

# Resolve the benchmark root (parent of this script's directory) and the
# repository root two levels above it.
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"
[ -n "$FIXTURES_ROOT" ] || FIXTURES_ROOT="$BENCH_ROOT/fixtures"
[ -n "$BASE_REPO" ] || BASE_REPO="$BENCH_ROOT/fixtures/test-repo"

# Check that both directories exist BEFORE normalising them with cd/pwd: under
# `set -e` a failing `cd` inside the command substitution aborts the script
# immediately, so a check placed after the normalisation can never report a
# missing directory.
[ -d "$FIXTURES_ROOT" ] || { echo "fixtures root not found: $FIXTURES_ROOT" >&2; exit 1; }
[ -d "$BASE_REPO" ] || { echo "base repo not found: $BASE_REPO" >&2; exit 1; }
FIXTURES_ROOT="$(cd "$FIXTURES_ROOT" && pwd)"
BASE_REPO="$(cd "$BASE_REPO" && pwd)"
FIX_DIR="$FIXTURES_ROOT/$FIXTURE"
[ -d "$FIX_DIR" ] || { echo "fixture not found: $FIXTURE" >&2; exit 1; }

# Every fixture must ship these five files.
META="$FIX_DIR/metadata.json"
EXPECTED="$FIX_DIR/expected.json"
SPEC="$FIX_DIR/spec.md"
TASK="$FIX_DIR/task.txt"
SETUP="$FIX_DIR/setup.sh"
for f in "$META" "$EXPECTED" "$SPEC" "$TASK" "$SETUP"; do
  [ -f "$f" ] || { echo "fixture missing required file: $f" >&2; exit 1; }
done
76
+
77
# Per-arm timeout: read `timeout_seconds` from the fixture metadata. The path is
# passed as an argument rather than spliced into the Python source, so a path
# containing a quote or backslash cannot break (or inject into) the program.
TIMEOUT=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1]))["timeout_seconds"])' "$META")
# The value is later used in shell arithmetic, so reject anything that is not a
# plain positive integer up front instead of failing inside the watchdog.
case "$TIMEOUT" in
  ''|*[!0-9]*) echo "metadata timeout_seconds must be an integer (got '$TIMEOUT'): $META" >&2; exit 1 ;;
esac
[ "$TIMEOUT" -gt 0 ] || { echo "metadata timeout_seconds must be > 0: $META" >&2; exit 1; }
if [ -n "$TIMEOUT_OVERRIDE" ]; then
  case "$TIMEOUT_OVERRIDE" in ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;; esac
  [ "$TIMEOUT_OVERRIDE" -gt 0 ] || { echo "--timeout-seconds must be > 0" >&2; exit 1; }
  TIMEOUT="$TIMEOUT_OVERRIDE"
fi

# Default run id: UTC timestamp + short commit of the repo (or "nogit").
if [ -z "$RUN_ID" ]; then
  TS=$(date -u +%Y%m%dT%H%M%SZ)
  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
  RUN_ID="${TS}-${SHA}-frozen-verify"
fi

RESULT_ROOT="$BENCH_ROOT/results/$RUN_ID"
mkdir -p "$RESULT_ROOT"

echo ""
echo "═══ Frozen Verify Pair Run ═══"
echo "Run-id: $RUN_ID"
echo "Fixture: $FIXTURE"
echo "Cases: $FIXTURES_ROOT"
echo "Base: $BASE_REPO"
echo "Diff: $DIFF_PATH"
echo "Pair: $PAIR_MODE"
echo "Timeout: ${TIMEOUT}s per arm"
[ "$PREPARE_ONLY" -eq 0 ] || echo "Mode: prepare-only"
echo ""
103
+
104
# mirror_skills
#
# Copy every skill directory under $REPO_ROOT/config/skills into
# $REPO_ROOT/.claude/skills, skipping the *-workspace directories listed below.
# Each skill is copied to a hidden ".<name>.staging" sibling first and then
# moved into place, replacing any previous copy. Prints how many were mirrored.
mirror_skills() {
  local source_root="$REPO_ROOT/config/skills"
  local target_root="$REPO_ROOT/.claude/skills"
  local count=0
  local skill_dir skill_name tmp_copy

  mkdir -p "$target_root"
  for skill_dir in "$source_root"/*/; do
    # An unmatched glob stays literal; skip it.
    if [ ! -d "$skill_dir" ]; then
      continue
    fi
    skill_name=$(basename "$skill_dir")
    case "$skill_name" in
      devlyn:auto-resolve-workspace) continue ;;
      devlyn:ideate-workspace) continue ;;
      preflight-workspace) continue ;;
      roadmap-archival-workspace) continue ;;
    esac
    tmp_copy="$target_root/.${skill_name}.staging"
    rm -rf "$tmp_copy"
    cp -R "$skill_dir" "$tmp_copy"
    rm -rf "$target_root/$skill_name"
    mv "$tmp_copy" "$target_root/$skill_name"
    count=$((count + 1))
  done
  echo "[frozen-verify] mirrored $count committed skill(s): config/skills/ -> .claude/skills/"
}
126
+
127
# stage_codex_env WORK_DIR ARM
#
# Prepare WORK_DIR for one benchmark arm:
#   * copy the mirrored skills ($REPO_ROOT/.claude/skills) and, if present,
#     $REPO_ROOT/CLAUDE.md into WORK_DIR;
#   * if `codex` is on PATH, install the codex shim as WORK_DIR/.devlyn-bin/codex
#     and write WORK_DIR/.claude/settings.json with an `env` block. For the
#     "solo" arm the env sets CODEX_BLOCKED=1; for any other arm it records the
#     real codex binary and the monitored-wrapper path instead.
# If `codex` is not installed only a warning is printed and the function
# returns after the copy step. Missing shim/wrapper sources exit the script.
stage_codex_env() {
  local work_dir="$1"
  local arm="$2"
  mkdir -p "$work_dir/.claude"
  # `cp -R src dst` copies INTO dst when dst already exists, which would leave
  # the skills at .claude/skills/skills if the base repo ships its own
  # .claude/skills. Remove the target first so the copy always replaces it.
  rm -rf "$work_dir/.claude/skills"
  cp -R "$REPO_ROOT/.claude/skills" "$work_dir/.claude/skills"
  if [ -f "$REPO_ROOT/CLAUDE.md" ]; then
    cp "$REPO_ROOT/CLAUDE.md" "$work_dir/CLAUDE.md"
  fi

  if ! command -v codex >/dev/null 2>&1; then
    echo "warning: codex not on PATH — pair arm cannot exercise Codex pair-JUDGE" >&2
    return
  fi
  local real_bin shim_src monitored_src monitored_path snapshot_path injected_path blocked
  real_bin="$(command -v codex)"
  shim_src="$REPO_ROOT/scripts/codex-shim/codex"
  monitored_src="$REPO_ROOT/config/skills/_shared/codex-monitored.sh"
  [ -x "$shim_src" ] || { echo "missing codex shim: $shim_src" >&2; exit 1; }
  [ -r "$monitored_src" ] || { echo "missing codex wrapper: $monitored_src" >&2; exit 1; }
  mkdir -p "$work_dir/.devlyn-bin"
  cp "$shim_src" "$work_dir/.devlyn-bin/codex"
  chmod +x "$work_dir/.devlyn-bin/codex"
  monitored_path="$work_dir/.claude/skills/_shared/codex-monitored.sh"
  # Take the PATH from the first zsh shell snapshot that exports one (the sed
  # also strips grep's "file:" prefix); fall back to the current PATH.
  snapshot_path=$(grep -m1 '^export PATH=' "$HOME/.claude/shell-snapshots/snapshot-zsh-"*.sh 2>/dev/null | head -1 | sed 's/^[^=]*=//' | tr -d '"' || true)
  [ -n "$snapshot_path" ] || snapshot_path="$PATH"
  # The shim directory goes first so `codex` resolves to the shim.
  injected_path="$work_dir/.devlyn-bin:$snapshot_path"
  blocked=0
  if [ "$arm" = "solo" ]; then
    blocked=1
  fi
  python3 - "$work_dir/.claude/settings.json" "$injected_path" "$real_bin" "$monitored_path" "$blocked" <<'PY'
import json, sys
out_path, path_val, real_bin, monitored, blocked = sys.argv[1:6]
env = {"CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1", "PATH": path_val}
if blocked == "1":
    env["CODEX_BLOCKED"] = "1"
else:
    env["CODEX_REAL_BIN"] = real_bin
    env["CODEX_MONITORED_PATH"] = monitored
with open(out_path, "w") as f:
    json.dump({"env": env}, f, indent=2)
    f.write("\n")
PY
}
167
+
168
# cleanup_workdir_processes WORK_DIR SIGNAL
#
# Send SIGNAL to every process group that has a process whose command line
# mentions WORK_DIR (either as given or as its physical, symlink-resolved
# path). This shell's own PID and its process group are excluded. Failures to
# signal a group are ignored.
cleanup_workdir_processes() {
  local work_dir="$1"
  local signal="$2"
  local resolved_dir own_pgid target_pgid

  # Fall back to the literal path when the directory cannot be entered.
  resolved_dir="$(cd "$work_dir" 2>/dev/null && pwd -P || printf '%s' "$work_dir")"
  own_pgid="$(ps -o pgid= -p "$$" | tr -d ' ')"

  ps -axo pid=,pgid=,command= \
    | awk -v given="$work_dir" -v physical="$resolved_dir" -v self="$$" -v own="$own_pgid" '
        $1 == self || $2 == own { next }
        index($0, given) || index($0, physical) { print $2 }
      ' \
    | sort -u \
    | while IFS= read -r target_pgid; do
        if [ -z "$target_pgid" ]; then
          continue
        fi
        # A negative PID addresses the whole process group.
        kill "-$signal" -- "-$target_pgid" 2>/dev/null || true
      done
}
184
+
185
# archive_ready WORK_DIR
#
# Succeed (exit status 0) when WORK_DIR/.devlyn/runs is a directory that has at
# least one "<entry>/pipeline.state.json" beneath it; fail (status 1) otherwise.
archive_ready() {
  local work_dir="$1"
  python3 - "$work_dir" <<'PY'
import sys
from pathlib import Path

runs_dir = Path(sys.argv[1]) / ".devlyn" / "runs"
has_state = runs_dir.is_dir() and any(runs_dir.glob("*/pipeline.state.json"))
sys.exit(0 if has_state else 1)
PY
}
193
+
194
# summarize_arm RESULT_DIR ELAPSED_SECONDS INVOKE_EXIT
#
# Build RESULT_DIR/summary.json for one arm (and echo it to stdout) from:
#   * RESULT_DIR/run-archive/pipeline.state.json  (verdicts, pair trigger)
#   * the verify findings JSONL files in the archive: the merged file when it
#     exists, otherwise every verify*.findings*.jsonl except the mechanical one
#   * RESULT_DIR/transcript.txt                    (provider-limit detection)
# Only findings whose severity/level is CRITICAL/HIGH/MEDIUM/LOW/INFO count.
# An INVOKE_EXIT of 124 is reported as a timeout.
#
# The state file is read defensively: it may be truncated or malformed when the
# arm was stopped while writing it, and a crash here would abort the whole run
# under `set -e`. An unreadable state is treated as empty with a warning.
summarize_arm() {
  local result_dir="$1"
  local elapsed="$2"
  local invoke_exit="$3"
  python3 - "$result_dir" "$elapsed" "$invoke_exit" <<'PY'
import json, pathlib, sys


def as_dict(value):
    """Return value when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def as_int(value):
    """Best-effort integer conversion; anything unusable counts as 0."""
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


result_dir = pathlib.Path(sys.argv[1])
elapsed = int(sys.argv[2])
invoke_exit = int(sys.argv[3])
archive = result_dir / "run-archive"
state_path = archive / "pipeline.state.json"

state = {}
if state_path.is_file():
    try:
        state = as_dict(json.loads(state_path.read_text(errors="replace")))
    except (OSError, ValueError) as exc:
        print(f"warning: ignoring unreadable {state_path}: {exc}", file=sys.stderr)
phases = as_dict(state.get("phases"))
verify = as_dict(phases.get("verify"))
sub_verdicts = verify.get("sub_verdicts")
pair_trigger = verify.get("pair_trigger") or as_dict(state.get("verify")).get("pair_trigger")

# Pick the findings files: the merged file wins; otherwise collect the
# per-judge files, de-duplicated, never the mechanical findings.
findings = []
finding_paths = []
merged_path = archive / "verify-merged.findings.jsonl"
if merged_path.is_file():
    finding_paths.append(merged_path)
else:
    candidates = []
    for name in ("verify.findings.jsonl", "verify.pair-judge.findings.jsonl"):
        candidates.append(archive / name)
    candidates.extend(sorted(archive.glob("verify.findings*.jsonl")))
    candidates.extend(sorted(archive.glob("verify.*findings*.jsonl")))
    seen = set()
    for candidate_path in candidates:
        if candidate_path.name == "verify-mechanical.findings.jsonl":
            continue
        if candidate_path in seen or not candidate_path.is_file():
            continue
        seen.add(candidate_path)
        finding_paths.append(candidate_path)
findings_source = "+".join(path.name for path in finding_paths) if finding_paths else "missing"
finding_severities = {"CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"}
for findings_path in finding_paths:
    for line in findings_path.read_text(errors="replace").splitlines():
        if line.strip():
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(parsed, dict):
                continue
            sev = str(parsed.get("severity") or parsed.get("level") or "").upper()
            if sev not in finding_severities:
                continue
            findings.append(parsed)

merged = as_dict(verify.get("merged"))
merged_findings_count = sum(
    as_int(merged.get(k)) for k in ("critical", "high", "medium", "low")
)
# Prefer the parsed findings; fall back to the counts recorded in the state.
findings_count = len(findings) if findings else merged_findings_count
severity_counts = {}
for finding in findings:
    sev = str(finding.get("severity") or finding.get("level") or "unknown").upper()
    severity_counts[sev] = severity_counts.get(sev, 0) + 1

transcript_path = result_dir / "transcript.txt"
transcript = transcript_path.read_text(errors="replace") if transcript_path.is_file() else ""
invoke_failure_reason = None
if invoke_exit == 124:
    invoke_failure_reason = "timeout"
elif "You've hit your limit" in transcript:
    invoke_failure_reason = "provider_limit"

summary = {
    "elapsed_seconds": elapsed,
    "invoke_exit": invoke_exit,
    "timed_out": invoke_exit == 124,
    "invoke_failure_reason": invoke_failure_reason,
    "terminal_verdict": as_dict(phases.get("final_report")).get("verdict"),
    "verify_verdict": verify.get("verdict"),
    "sub_verdicts": sub_verdicts,
    "pair_trigger": pair_trigger,
    "pair_mode": bool(isinstance(sub_verdicts, dict) and (
        sub_verdicts.get("judge_codex") is not None
        or sub_verdicts.get("pair_judge") is not None
    ))
    or bool(verify.get("pair_mode")),
    "verify_findings_count": findings_count,
    "verify_findings_source": findings_source if finding_paths else (
        "state.merged" if merged_findings_count else "missing"
    ),
    "merged_findings_counts": merged,
    "severity_counts": severity_counts,
    # Same severity/level fallback as the filter and the counts above, so a
    # finding that only carries "level" is not reported as null here.
    "verify_findings_severities": [
        f.get("severity") or f.get("level") for f in findings
    ],
}
(result_dir / "summary.json").write_text(json.dumps(summary, indent=2) + "\n")
print(json.dumps(summary, indent=2))
PY
}
286
+
287
# copy_base_repo WORK_DIR
#
# Recreate WORK_DIR from scratch as a copy of $BASE_REPO without git metadata.
# When $BASE_REPO has a .git directory, only the files committed at HEAD are
# exported (via `git archive`); otherwise the directory is copied as-is and any
# copied .git entry is removed.
copy_base_repo() {
  local dest="$1"
  rm -rf "$dest"
  mkdir -p "$dest"
  if [ ! -d "$BASE_REPO/.git" ]; then
    cp -R "$BASE_REPO"/. "$dest"/
    rm -rf "$dest/.git"
    return
  fi
  git -C "$BASE_REPO" archive --format=tar HEAD | LC_ALL=C tar -xf - -C "$dest"
}
298
+
299
# run_arm ARM PAIR_FLAG
#
# Run one verify-only arm ("solo" or "pair") end to end:
#   1. optionally reuse a previous summary whose invoke_exit was 0
#      (--resume-completed-arms);
#   2. build a throw-away work tree under /tmp: base repo, staged skills/codex
#      env, fixture setup script, spec, frozen diff, spec-verify.json, each
#      stage committed so the diff lands as its own commit;
#   3. write the prompt to RESULT_DIR/input.md (stop here for --prepare-only);
#   4. launch `claude` in the background with a watchdog that stops it either
#      as soon as a run archive with pipeline.state.json appears or when the
#      timeout elapses;
#   5. copy the run archive and findings into RESULT_DIR/run-archive and call
#      summarize_arm.
# PAIR_FLAG is appended verbatim to the /devlyn:resolve invocation in the prompt.
# Returns 1 when the diff does not apply.
run_arm() {
  local arm="$1"
  local pair_flag="$2"
  local result_dir="$RESULT_ROOT/$arm"
  local work_dir="/tmp/bench-${RUN_ID}-${FIXTURE}-${arm}"
  # Fixed throw-away identity for the commits made inside the work tree.
  local -a commit_as=(-c user.email=b@b -c user.name=b)

  if [ "$RESUME_COMPLETED_ARMS" -eq 1 ] && [ "$PREPARE_ONLY" -eq 0 ] && [ -f "$result_dir/summary.json" ]; then
    if python3 - "$result_dir/summary.json" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    previous = json.load(fh)
sys.exit(0 if previous.get("invoke_exit") == 0 else 1)
PY
    then
      echo "[frozen-verify] $arm: reuse completed summary"
      return 0
    fi
  fi

  mkdir -p "$result_dir"
  copy_base_repo "$work_dir"

  stage_codex_env "$work_dir" "$arm"

  # Commit 1: base repo plus staged environment.
  git -C "$work_dir" init -q
  git -C "$work_dir" add -A
  git -C "$work_dir" "${commit_as[@]}" commit -q -m baseline

  # Commit 2: whatever the fixture's setup script changes (may be nothing).
  if [ -s "$SETUP" ]; then
    chmod +x "$SETUP"
    (cd "$work_dir" && "$SETUP") > "$result_dir/setup.log" 2>&1
    git -C "$work_dir" add -A
    git -C "$work_dir" "${commit_as[@]}" commit -q --allow-empty -m fixture-setup
  fi

  mkdir -p "$work_dir/docs/roadmap/phase-1" "$work_dir/.devlyn"
  cp "$SPEC" "$work_dir/docs/roadmap/phase-1/$FIXTURE.md"
  cp "$DIFF_PATH" "$work_dir/.devlyn/external-diff.patch"
  # Write .devlyn/spec-verify.json only when the fixture lists commands.
  python3 - "$EXPECTED" "$work_dir/.devlyn/spec-verify.json" <<'PY'
import json, os, sys

with open(sys.argv[1]) as fh:
    expected = json.load(fh)
target = sys.argv[2]
commands = expected.get("verification_commands", [])
if not commands:
    sys.exit(0)
os.makedirs(os.path.dirname(target), exist_ok=True)
with open(target, "w") as fh:
    json.dump({"verification_commands": commands}, fh, indent=2)
    fh.write("\n")
PY

  # Commit 3: the frozen implementation diff (plus the files staged above).
  if ! git -C "$work_dir" apply .devlyn/external-diff.patch; then
    echo "[frozen-verify] $arm: diff failed to apply" >&2
    return 1
  fi
  git -C "$work_dir" add -A
  git -C "$work_dir" "${commit_as[@]}" commit -q -m external-implementation

  cat > "$result_dir/input.md" <<EOF
Use the \`/devlyn:resolve --verify-only .devlyn/external-diff.patch --spec docs/roadmap/phase-1/$FIXTURE.md --engine claude ${pair_flag}\` skill to run VERIFY-ONLY mode.

The diff at .devlyn/external-diff.patch represents an external implementation already applied to the work tree. Run PHASE 5 (VERIFY) only — skip PLAN, IMPLEMENT, BUILD_GATE, CLEANUP per the skill's verify-only mode contract.

Important: \`--engine claude\` selects the primary VERIFY judge only. It must not suppress gated VERIFY pair-mode. If the spec/phase trigger makes pair-mode eligible with non-empty reasons, the skill must spawn the OTHER-engine judge unless Codex is blocked/unavailable at the invocation layer.

Report the terminal verdict, list of files in the diff, and any findings.
EOF

  if [ "$PREPARE_ONLY" -eq 1 ]; then
    echo "[frozen-verify] $arm prepared at $work_dir"
    return 0
  fi

  local started finished elapsed invoke_exit watchdog_pid
  local timed_out_marker completed_marker
  started=$(date +%s)
  timed_out_marker="$result_dir/.timed_out"
  completed_marker="$result_dir/.completed"
  rm -f "$timed_out_marker" "$completed_marker"

  # errexit is off from here until the exit status has been collected.
  # Job control is switched on only around the launch; the arm is signalled
  # below through its negative PID, i.e. as a process group.
  set +e
  set -m
  (
    cd "$work_dir"
    export PATH="$work_dir/.devlyn-bin:$PATH"
    if [ "$arm" = "solo" ]; then
      export CODEX_BLOCKED=1
    fi
    export BENCH_WORKDIR="$work_dir"
    export BENCH_FIXTURE_DIR="$FIX_DIR"
    exec claude \
      -p "$(cat "$result_dir/input.md")" \
      --dangerously-skip-permissions \
      --effort xhigh \
      --strict-mcp-config \
      --mcp-config '{"mcpServers":{}}' \
      --debug-file "$result_dir/claude-debug.log" \
      </dev/null
  ) > "$result_dir/transcript.txt" 2>&1 &
  local arm_pid=$!
  set +m

  # Watchdog: polls every 5 seconds while the arm is alive.
  (
    local deadline

    # stop_arm GRACE_SECONDS: TERM the arm's group and work-dir processes,
    # wait GRACE_SECONDS, then KILL whatever is left.
    stop_arm() {
      kill -TERM -- "-$arm_pid" 2>/dev/null
      cleanup_workdir_processes "$work_dir" TERM
      sleep "$1"
      kill -KILL -- "-$arm_pid" 2>/dev/null
      cleanup_workdir_processes "$work_dir" KILL
    }

    deadline=$(($(date +%s) + TIMEOUT))
    while kill -0 "$arm_pid" 2>/dev/null; do
      if archive_ready "$work_dir"; then
        # The run archive exists: record completion and stop the arm.
        : > "$completed_marker"
        stop_arm 2
        exit 0
      fi
      if [ "$(date +%s)" -ge "$deadline" ]; then
        break
      fi
      sleep 5
    done
    if kill -0 "$arm_pid" 2>/dev/null; then
      : > "$timed_out_marker"
      stop_arm 5
    fi
  ) &
  watchdog_pid=$!

  wait "$arm_pid"
  invoke_exit=$?
  kill -TERM "$watchdog_pid" 2>/dev/null || true
  wait "$watchdog_pid" 2>/dev/null || true
  # The markers override the raw status: timeout -> 124, archive seen -> 0.
  if [ -f "$timed_out_marker" ]; then
    invoke_exit=124
    rm -f "$timed_out_marker"
  elif [ -f "$completed_marker" ]; then
    invoke_exit=0
    rm -f "$completed_marker"
  fi
  set -e
  finished=$(date +%s)
  elapsed=$((finished - started))

  # Archive the last run directory in sort order, or the loose .devlyn files
  # when no run directory exists.
  local latest_run
  local archive_dir="$result_dir/run-archive"
  latest_run=$(find "$work_dir/.devlyn/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -1 || true)
  if [ -n "$latest_run" ]; then
    rm -rf "$archive_dir"
    cp -R "$latest_run" "$archive_dir"
    if [ ! -f "$archive_dir/pipeline.state.json" ] && [ -f "$work_dir/.devlyn/pipeline.state.json" ]; then
      cp "$work_dir/.devlyn/pipeline.state.json" "$archive_dir/pipeline.state.json"
    fi
  elif [ -d "$work_dir/.devlyn" ]; then
    rm -rf "$archive_dir"
    mkdir -p "$archive_dir"
    find "$work_dir/.devlyn" -maxdepth 1 -type f -exec cp {} "$archive_dir/" \;
  fi
  # Also pick up findings files left at the top of .devlyn (never the
  # mechanical findings).
  if [ -d "$work_dir/.devlyn" ] && [ -d "$archive_dir" ]; then
    find "$work_dir/.devlyn" -maxdepth 1 -type f \
      \( -name 'verify.findings*.jsonl' -o -name 'verify.*findings*.jsonl' -o -name 'verify-merged.findings.jsonl' \) \
      ! -name 'verify-mechanical.findings.jsonl' \
      -exec cp {} "$archive_dir/" \;
  fi
  summarize_arm "$result_dir" "$elapsed" "$invoke_exit"
}
454
+
455
# Refresh .claude/skills, then run the solo arm followed by the pair arm.
# Only "forced" pair mode passes --pair-verify; "gated" leaves the decision to
# the skill's own trigger.
mirror_skills
echo "[frozen-verify] ► solo"
run_arm solo ""
echo "[frozen-verify] ► pair"
case "$PAIR_MODE" in
  forced) run_arm pair "--pair-verify" ;;
  *) run_arm pair "" ;;
esac
464
+
465
# write_comparison RESULT_ROOT PAIR_MODE
#
# Combine RESULT_ROOT/solo/summary.json and RESULT_ROOT/pair/summary.json into
# RESULT_ROOT/compare.json (also echoed to stdout). A missing summary is
# recorded as {"missing": true}. Verdicts are ranked
# PASS < PASS_WITH_ISSUES < NEEDS_WORK < BLOCKED; unknown or absent verdicts
# rank as 0.
#
# The second judge's verdict is read from sub_verdicts.pair_judge and, when
# that is absent, from sub_verdicts.judge_codex — the same two keys the arm
# summary uses to decide that pair mode ran — so the internal lift is not
# missed when the state records the Codex judge under the latter name.
write_comparison() {
  local result_root="$1"
  local pair_mode="$2"
  python3 - "$result_root" "$pair_mode" <<'PY'
import json, pathlib, sys
root = pathlib.Path(sys.argv[1])
pair_mode_requested = sys.argv[2]
out = {}
for arm in ("solo", "pair"):
    path = root / arm / "summary.json"
    out[arm] = json.loads(path.read_text()) if path.is_file() else {"missing": True}
solo = out.get("solo", {})
pair = out.get("pair", {})
rank = {
    None: 0,
    "PASS": 0,
    "PASS_WITH_ISSUES": 1,
    "NEEDS_WORK": 2,
    "BLOCKED": 3,
}
solo_rank = rank.get(solo.get("verify_verdict"), 0)
pair_rank = rank.get(pair.get("verify_verdict"), 0)
pair_sub = pair.get("sub_verdicts") or {}
pair_primary_verdict = pair_sub.get("judge")
pair_judge_verdict = pair_sub.get("pair_judge")
if pair_judge_verdict is None:
    pair_judge_verdict = pair_sub.get("judge_codex")
pair_primary_rank = rank.get(pair_primary_verdict, 0)
pair_judge_rank = rank.get(pair_judge_verdict, 0)
severities = ("LOW", "MEDIUM", "HIGH", "CRITICAL")
out["comparison"] = {
    "pair_mode_requested": pair_mode_requested,
    # Gated mode only: the trigger said "eligible" with reasons, yet pair
    # mode did not run.
    "pair_trigger_missed": bool(
        pair_mode_requested == "gated"
        and (pair.get("pair_trigger") or {}).get("eligible") is True
        and (pair.get("pair_trigger") or {}).get("reasons")
        and not pair.get("pair_mode")
    ),
    "pair_found_more_findings": (pair.get("verify_findings_count") or 0) > (solo.get("verify_findings_count") or 0),
    "pair_found_more_low_or_worse": sum((pair.get("severity_counts") or {}).get(k, 0) for k in severities)
    > sum((solo.get("severity_counts") or {}).get(k, 0) for k in severities),
    # Lift = pair mode ran, ended stricter than the reference, and reached at
    # least NEEDS_WORK.
    "pair_verdict_lift": bool(pair.get("pair_mode")) and pair_rank > solo_rank and pair_rank >= rank["NEEDS_WORK"],
    "pair_internal_verdict_lift": bool(pair.get("pair_mode"))
    and pair_judge_rank > pair_primary_rank
    and pair_rank >= rank["NEEDS_WORK"],
    "solo_verdict": solo.get("verify_verdict"),
    "pair_verdict": pair.get("verify_verdict"),
    "pair_primary_verdict": pair_primary_verdict,
    "pair_judge_verdict": pair_judge_verdict,
}
(root / "compare.json").write_text(json.dumps(out, indent=2) + "\n")
print(json.dumps(out, indent=2))
PY
}

write_comparison "$RESULT_ROOT" "$PAIR_MODE"
@@ -0,0 +1,162 @@
1
+ #!/usr/bin/env bash
2
+ # run-full-pipeline-pair-candidate.sh — measure full-pipeline L2/pair candidates.
3
+ #
4
+ # Runs bare + solo_claude first and applies headroom-gate.py. Only if the set
5
+ # leaves room for L2 does it run l2_gated, rejudge, and apply
6
+ # full-pipeline-pair-gate.py.
7
+
8
+ set -euo pipefail
9
+
10
# Print the command-line synopsis and terminate the script.
# Arguments: $1 - exit status to terminate with (defaults to 1).
# Outputs:   the synopsis goes to stdout when the status is 0 (an explicit
#            -h/--help request, so it can be piped to grep or a pager) and to
#            stderr for every other status, keeping error output off stdout.
usage() {
  local code="${1:-1}"
  local fd=2
  if [ "$code" = "0" ]; then
    fd=1
  fi
  cat >&"$fd" <<'EOF'
usage: run-full-pipeline-pair-candidate.sh [options] <fixture> [<fixture> ...]

Options:
--run-id ID
--bare-max N
--solo-max N
--min-fixtures N
--min-pair-margin N
--max-pair-solo-wall-ratio N
--pair-arm ARM
--reuse-calibrated-from RUN_ID
EOF
  exit "$code"
}
27
+
28
# Defaults; each can be overridden by the matching option parsed below.
RUN_ID=""
BARE_MAX=60
SOLO_MAX=80
MIN_FIXTURES=2
MIN_PAIR_MARGIN=5
MAX_PAIR_SOLO_WALL_RATIO=""
PAIR_ARM="l2_gated"
REUSE_CALIBRATED_FROM=""
FIXTURES=()

# Abort with a clear message when a value-taking option has nothing after it.
# Arguments: $1 - the option being parsed
#            $2 - number of arguments still unparsed, the option included
# Without this check a trailing option dies under `set -u` with an opaque
# "$2: unbound variable" instead of pointing at the offending option.
require_value() {
  if [ "$2" -lt 2 ]; then
    echo "missing value for $1" >&2
    usage
  fi
}

# Walk the command line: options consume their value, anything shaped like
# F<digit>... is collected as a fixture id, everything else is rejected.
while [ $# -gt 0 ]; do
  case "$1" in
    --run-id) require_value "$1" $#; RUN_ID="$2"; shift 2 ;;
    --bare-max) require_value "$1" $#; BARE_MAX="$2"; shift 2 ;;
    --solo-max) require_value "$1" $#; SOLO_MAX="$2"; shift 2 ;;
    --min-fixtures) require_value "$1" $#; MIN_FIXTURES="$2"; shift 2 ;;
    --min-pair-margin) require_value "$1" $#; MIN_PAIR_MARGIN="$2"; shift 2 ;;
    --max-pair-solo-wall-ratio) require_value "$1" $#; MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2 ;;
    --pair-arm) require_value "$1" $#; PAIR_ARM="$2"; shift 2 ;;
    --reuse-calibrated-from) require_value "$1" $#; REUSE_CALIBRATED_FROM="$2"; shift 2 ;;
    -h|--help) usage 0 ;;
    F[0-9]*) FIXTURES+=("$1"); shift ;;
    *) echo "unknown arg: $1" >&2; usage ;;
  esac
done

# At least one fixture is mandatory.
[ ${#FIXTURES[@]} -gt 0 ] || usage
53
+
54
# Locate the benchmark root (parent of this script's directory) and the
# repository root two levels above it.
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"

# No --run-id given: derive one from the UTC timestamp and the short HEAD sha
# (or "nogit" when git cannot provide one).
[ -n "$RUN_ID" ] || {
  TS=$(date -u +%Y%m%dT%H%M%SZ)
  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
  RUN_ID="${TS}-${SHA}-full-pipeline-pair"
}

# Run banner.
printf '\n'
printf '%s\n' "═══ Full-Pipeline Pair Candidate Run ═══"
printf 'Run-id: %s\n' "$RUN_ID"
printf 'Fixtures: %s\n' "${FIXTURES[*]}"
printf 'Arms: bare solo_claude %s\n' "$PAIR_ARM"
if [ -n "$REUSE_CALIBRATED_FROM" ]; then
  printf 'Reuse: bare+solo from %s\n' "$REUSE_CALIBRATED_FROM"
fi
printf '\n'
70
+
71
# Mirror every skill directory from config/skills/ into .claude/skills/,
# leaving out the *-workspace directories listed in the case below.
SRC_SKILLS="$REPO_ROOT/config/skills"
DST_SKILLS="$REPO_ROOT/.claude/skills"
mkdir -p "$DST_SKILLS"
mirrored=0
for src_dir in "$SRC_SKILLS"/*/; do
  # An unmatched glob stays literal; skip anything that is not a directory.
  if [ ! -d "$src_dir" ]; then
    continue
  fi
  name=$(basename "$src_dir")
  case "$name" in
    devlyn:auto-resolve-workspace|devlyn:ideate-workspace|preflight-workspace|roadmap-archival-workspace)
      # Workspace directories are not mirrored.
      ;;
    *)
      # Copy into a hidden staging directory first, then swap it into place.
      staging="$DST_SKILLS/.${name}.staging"
      rm -rf "$staging"
      cp -R "$src_dir" "$staging"
      rm -rf "$DST_SKILLS/$name"
      mv "$staging" "$DST_SKILLS/$name"
      mirrored=$((mirrored + 1))
      ;;
  esac
done
echo "[full-pipeline-pair] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
90
+
91
# Reuse an arm result from an earlier run instead of re-running the arm.
# Copies results/$REUSE_CALIBRATED_FROM/<fixture>/<arm> to
# results/$RUN_ID/<fixture>/<arm>.
# Globals:   BENCH_ROOT, REUSE_CALIBRATED_FROM, RUN_ID (all read)
# Arguments: $1 - fixture id
#            $2 - arm name
# Outputs:   one progress line on stdout; error messages on stderr
# Returns:   0 after a copy or a skip; exits the script with status 1 when the
#            source arm directory or its result.json is missing
copy_calibrated_arm() {
  local fid="$1"
  local arm="$2"
  local src="$BENCH_ROOT/results/$REUSE_CALIBRATED_FROM/$fid/$arm"
  local dst="$BENCH_ROOT/results/$RUN_ID/$fid/$arm"
  # A destination that already has a result.json is kept untouched.
  if [ -f "$dst/result.json" ]; then
    echo "[full-pipeline-pair] reuse skip: $fid / $arm already exists in $RUN_ID"
    return 0
  fi
  [ -d "$src" ] || { echo "reuse source missing: $src" >&2; exit 1; }
  [ -f "$src/result.json" ] || { echo "reuse source missing result.json: $src" >&2; exit 1; }
  mkdir -p "$(dirname "$dst")"
  # A destination directory left behind without result.json (e.g. by an
  # interrupted earlier run) must go first: with an existing target directory
  # `cp -R src dst` would create dst/<arm>/ instead of populating dst itself.
  rm -rf -- "${dst:?}"
  cp -R "$src" "$dst"
  echo "[full-pipeline-pair] reused $fid / $arm from $REUSE_CALIBRATED_FROM"
}
106
+
107
# Phase 1: produce (or reuse) the bare and solo_claude arms for every fixture,
# judge each fixture, then apply the headroom gate to the whole run.
for fid in "${FIXTURES[@]}"; do
  for arm in bare solo_claude; do
    if [ -z "$REUSE_CALIBRATED_FROM" ]; then
      echo "[full-pipeline-pair] ► $fid / $arm"
      # A failing arm is reported but does not stop the run.
      if ! bash "$BENCH_ROOT/scripts/run-fixture.sh" \
        --fixture "$fid" --arm "$arm" --run-id "$RUN_ID"; then
        echo "[full-pipeline-pair] ✗ $fid / $arm (arm failure tolerated; gate will fail if dirty)"
      fi
    else
      copy_calibrated_arm "$fid" "$arm"
    fi
  done

  echo "[full-pipeline-pair] ► headroom judge $fid"
  if ! bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID"; then
    echo "[full-pipeline-pair] ✗ headroom judge failed for $fid"
  fi
done

# Gate arguments; the gate's JSON and Markdown reports land in the run's
# results directory. Under `set -e` a non-zero gate exit ends the script here.
headroom_args=(--run-id "$RUN_ID")
headroom_args+=(--bare-max "$BARE_MAX")
headroom_args+=(--solo-max "$SOLO_MAX")
headroom_args+=(--min-fixtures "$MIN_FIXTURES")
headroom_args+=(--out-json "$BENCH_ROOT/results/$RUN_ID/headroom-gate.json")
headroom_args+=(--out-md "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md")
python3 "$BENCH_ROOT/scripts/headroom-gate.py" "${headroom_args[@]}"
137
+
138
# Phase 2: run the pair arm for every fixture, judge again, then apply the
# full-pipeline pair gate and print its Markdown report.
for fid in "${FIXTURES[@]}"; do
  echo "[full-pipeline-pair] ► $fid / $PAIR_ARM"
  # A failing arm is reported but does not stop the run.
  if ! bash "$BENCH_ROOT/scripts/run-fixture.sh" \
    --fixture "$fid" --arm "$PAIR_ARM" --run-id "$RUN_ID"; then
    echo "[full-pipeline-pair] ✗ $fid / $PAIR_ARM (arm failure tolerated; gate will fail if dirty)"
  fi

  echo "[full-pipeline-pair] ► final judge $fid"
  if ! bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID"; then
    echo "[full-pipeline-pair] ✗ final judge failed for $fid"
  fi
done

pair_args=(--run-id "$RUN_ID")
pair_args+=(--bare-max "$BARE_MAX")
pair_args+=(--solo-max "$SOLO_MAX")
pair_args+=(--min-fixtures "$MIN_FIXTURES")
pair_args+=(--min-pair-margin "$MIN_PAIR_MARGIN")
pair_args+=(--pair-arm "$PAIR_ARM")
pair_args+=(--out-json "$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.json")
pair_args+=(--out-md "$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.md")
# The wall-ratio limit is forwarded only when it was given on the command line.
if [ -n "$MAX_PAIR_SOLO_WALL_RATIO" ]; then
  pair_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
fi

python3 "$BENCH_ROOT/scripts/full-pipeline-pair-gate.py" "${pair_args[@]}"
cat "$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.md"