devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +318 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  12. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  18. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  25. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  31. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  40. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  48. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  56. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  64. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  73. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  82. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  91. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  100. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  101. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  102. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  103. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  104. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  105. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  106. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  107. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  110. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  111. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  112. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  113. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  114. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  116. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  117. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  118. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  119. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  120. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  121. package/config/skills/_shared/archive_run.py +3 -0
  122. package/config/skills/_shared/codex-config.md +2 -2
  123. package/config/skills/_shared/codex-monitored.sh +72 -7
  124. package/config/skills/_shared/collect-codex-findings.py +125 -0
  125. package/config/skills/_shared/engine-preflight.md +1 -1
  126. package/config/skills/_shared/expected.schema.json +18 -0
  127. package/config/skills/_shared/spec-verify-check.py +312 -10
  128. package/config/skills/_shared/verify-merge-findings.py +327 -0
  129. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  130. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  131. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  132. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  133. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  134. package/package.json +1 -1
  135. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,302 @@
1
+ #!/usr/bin/env bash
2
+ # Regression test for the SWE-bench frozen VERIFY case importer.
3
+ set -euo pipefail
4
+
5
+ ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
6
+ TMP="$(mktemp -d)"
7
+ trap 'rm -rf "$TMP"' EXIT
8
+
9
+ REPO="$TMP/repo"
10
+ mkdir -p "$REPO"
11
+ git -C "$REPO" init -q
12
+ git -C "$REPO" config user.email bench@example.com
13
+ git -C "$REPO" config user.name bench
14
+ printf 'hello\n' > "$REPO/app.txt"
15
+ git -C "$REPO" add app.txt
16
+ git -C "$REPO" commit -q -m base
17
+ BASE_SHA="$(git -C "$REPO" rev-parse HEAD)"
18
+
19
+ printf 'goodbye\n' > "$REPO/app.txt"
20
+ git -C "$REPO" diff > "$TMP/model.patch"
21
+ git -C "$REPO" checkout -q -- app.txt
22
+
23
+ cat > "$TMP/instance.json" <<JSON
24
+ {
25
+ "instance_id": "local__repo-1",
26
+ "repo": "local/repo",
27
+ "base_commit": "$BASE_SHA",
28
+ "problem_statement": "Change app.txt so it says goodbye instead of hello.",
29
+ "version": "test",
30
+ "issue_url": "https://example.test/issue",
31
+ "pr_url": "https://example.test/pr"
32
+ }
33
+ JSON
34
+
35
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py" \
36
+ --instance-json "$TMP/instance.json" \
37
+ --model-patch "$TMP/model.patch" \
38
+ --cases-root "$TMP/cases" \
39
+ --repos-root "$TMP/repos" \
40
+ --repo-dir "$REPO" \
41
+ --timeout-seconds 60 > "$TMP/prepare.json"
42
+
43
+ CASE_DIR="$TMP/cases/local__repo-1"
44
+ BASE_REPO="$TMP/repos/local__repo-${BASE_SHA:0:12}"
45
+ test -f "$CASE_DIR/spec.md"
46
+ test -f "$CASE_DIR/model.patch"
47
+ test -x "$CASE_DIR/setup.sh"
48
+ grep -q 'SWE-bench local__repo-1' "$CASE_DIR/spec.md"
49
+ grep -q -- '--pair-mode gated' "$CASE_DIR/run-command.txt"
50
+
51
+ python3 "$ROOT/benchmark/auto-resolve/scripts/fetch-swebench-instances.py" \
52
+ --dataset lite \
53
+ --limit 1 \
54
+ --out "$TMP/fetched-lite.jsonl" > "$TMP/fetch.json"
55
+ grep -q '"rows_written": 1' "$TMP/fetch.json"
56
+ python3 - "$TMP/fetched-lite.jsonl" <<'PY'
57
+ import json, pathlib, sys
58
+ row = json.loads(pathlib.Path(sys.argv[1]).read_text().splitlines()[0])
59
+ for key in ("instance_id", "repo", "base_commit", "problem_statement"):
60
+ assert row.get(key), key
61
+ PY
62
+
63
+ python3 - "$TMP/instance.json" "$TMP/instances.jsonl" "$TMP/model.patch" "$TMP/predictions.jsonl" <<'PY'
64
+ import json, pathlib, sys
65
+ instance = json.loads(pathlib.Path(sys.argv[1]).read_text())
66
+ pathlib.Path(sys.argv[2]).write_text(json.dumps(instance) + "\n")
67
+ patch = pathlib.Path(sys.argv[3]).read_text()
68
+ pathlib.Path(sys.argv[4]).write_text(json.dumps({
69
+ "instance_id": "local__repo-1",
70
+ "model_name_or_path": "local-test",
71
+ "model_patch": patch,
72
+ }) + "\n")
73
+ PY
74
+
75
+ mkdir -p "$TMP/patch-root/local__repo-1"
76
+ cp "$TMP/model.patch" "$TMP/patch-root/local__repo-1/patch.diff"
77
+ python3 "$ROOT/benchmark/auto-resolve/scripts/collect-swebench-predictions.py" \
78
+ --patch-root "$TMP/patch-root" \
79
+ --instances-jsonl "$TMP/instances.jsonl" \
80
+ --model-name local-patch-root \
81
+ --out "$TMP/collected-predictions.jsonl" > "$TMP/collect.json"
82
+ grep -q '"predictions_written": 1' "$TMP/collect.json"
83
+ python3 - "$TMP/collected-predictions.jsonl" <<'PY'
84
+ import json, pathlib, sys
85
+ row = json.loads(pathlib.Path(sys.argv[1]).read_text())
86
+ assert row["instance_id"] == "local__repo-1"
87
+ assert row["model_name_or_path"] == "local-patch-root"
88
+ assert row["model_patch"].endswith("\n")
89
+ PY
90
+
91
+ rm -rf "$TMP/cases-batch" "$TMP/repos-batch"
92
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" \
93
+ --instances-jsonl "$TMP/instances.jsonl" \
94
+ --predictions-jsonl "$TMP/predictions.jsonl" \
95
+ --cases-root "$TMP/cases-batch" \
96
+ --repos-root "$TMP/repos-batch" \
97
+ --repo-dir "$REPO" \
98
+ --out-manifest "$TMP/manifest.json" > "$TMP/batch.json"
99
+ grep -q '"prepared_count": 1' "$TMP/manifest.json"
100
+ test -f "$TMP/cases-batch/local__repo-1/model.patch"
101
+
102
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
103
+ --fixture local__repo-1 \
104
+ --fixtures-root "$TMP/cases" \
105
+ --base-repo "$BASE_REPO" \
106
+ --diff "$CASE_DIR/model.patch" \
107
+ --run-id swebench-frozen-case-test \
108
+ --pair-mode gated \
109
+ --timeout-seconds 7 \
110
+ --prepare-only > "$TMP/runner.log"
111
+
112
+ grep -q 'Timeout: 7s per arm' "$TMP/runner.log"
113
+ grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/app.txt
114
+ grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/app.txt
115
+ test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/.devlyn/spec-verify.json
116
+ test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/.devlyn/spec-verify.json
117
+
118
+ RESULTS_DIR="$ROOT/benchmark/auto-resolve/results"
119
+ RESUME_RUN_ID="swebench-resume-arm-test-local__repo-1"
120
+ mkdir -p "$RESULTS_DIR/$RESUME_RUN_ID/solo" "$TMP/fakebin"
121
+ cat > "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json" <<'EOF'
122
+ {
123
+ "elapsed_seconds": 1,
124
+ "invoke_exit": 0,
125
+ "timed_out": false,
126
+ "verify_verdict": "PASS",
127
+ "terminal_verdict": "PASS"
128
+ }
129
+ EOF
130
+ cat > "$TMP/fakebin/claude" <<'EOF'
131
+ #!/usr/bin/env bash
132
+ echo "fake claude invoked"
133
+ exit 1
134
+ EOF
135
+ chmod +x "$TMP/fakebin/claude"
136
+ PATH="$TMP/fakebin:$PATH" bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
137
+ --fixture local__repo-1 \
138
+ --fixtures-root "$TMP/cases" \
139
+ --base-repo "$BASE_REPO" \
140
+ --diff "$CASE_DIR/model.patch" \
141
+ --run-id "$RESUME_RUN_ID" \
142
+ --pair-mode gated \
143
+ --timeout-seconds 3 \
144
+ --resume-completed-arms > "$TMP/resume-arm.log" 2>&1
145
+ grep -Fq '[frozen-verify] solo: reuse completed summary' "$TMP/resume-arm.log"
146
+ grep -Fq 'fake claude invoked' "$RESULTS_DIR/$RESUME_RUN_ID/pair/transcript.txt"
147
+ grep -q '"invoke_exit": 0' "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json"
148
+
149
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
150
+ --manifest "$TMP/manifest.json" \
151
+ --run-prefix swebench-frozen-corpus-test \
152
+ --timeout-seconds 7 \
153
+ --run-ids-out "$TMP/prepare-run-ids.txt" \
154
+ --out-json "$TMP/gate.json" \
155
+ --out-md "$TMP/gate.md" \
156
+ --prepare-only > "$TMP/corpus-runner.log"
157
+ grep -q 'prepare-only complete; gate skipped' "$TMP/corpus-runner.log"
158
+ grep -q 'Timeout: 7s per arm' "$TMP/corpus-runner.log"
159
+ grep -q '^swebench-frozen-corpus-test-1-local__repo-1$' "$TMP/prepare-run-ids.txt"
160
+ grep -q '^goodbye$' /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo/app.txt
161
+ grep -q '^goodbye$' /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair/app.txt
162
+ test ! -e /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo/.devlyn/spec-verify.json
163
+ test ! -e /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair/.devlyn/spec-verify.json
164
+ test ! -e "$TMP/gate.json"
165
+ test ! -e "$TMP/gate.md"
166
+
167
+ python3 - "$TMP/manifest.json" "$TMP/manifest-bad-diff.json" <<'PY'
168
+ import json, pathlib, sys
169
+ manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
170
+ manifest["prepared"][0]["case_dir"] = str(pathlib.Path(manifest["prepared"][0]["case_dir"]).parent / "missing-case")
171
+ pathlib.Path(sys.argv[2]).write_text(json.dumps(manifest, indent=2) + "\n")
172
+ PY
173
+ set +e
174
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
175
+ --manifest "$TMP/manifest-bad-diff.json" \
176
+ --run-prefix swebench-frozen-corpus-fail-test \
177
+ --run-ids-out "$TMP/fail-run-ids.txt" \
178
+ --prepare-only > "$TMP/corpus-fail.log" 2>&1
179
+ fail_status=$?
180
+ set -e
181
+ [ "$fail_status" -ne 0 ]
182
+ grep -q 'row failed: swebench-frozen-corpus-fail-test-1-local__repo-1' "$TMP/corpus-fail.log"
183
+ grep -q '^swebench-frozen-corpus-fail-test-1-local__repo-1$' "$TMP/fail-run-ids.txt"
184
+ test -f "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-fail-test-1-local__repo-1/compare.json"
185
+
186
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
187
+ --title "Local SWE-bench Failed Matrix" \
188
+ --verdict FAIL \
189
+ --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
190
+ --out-json "$TMP/fail-matrix.json" \
191
+ --out-md "$TMP/fail-matrix.md" > "$TMP/fail-matrix.log"
192
+ grep -q '"classification": "failed attempt: row runner exit=1"' "$TMP/fail-matrix.json"
193
+ grep -q '"trailing_non_gate_rows": 1' "$TMP/fail-matrix.json"
194
+ grep -q '"failed attempt: row runner exit=1": 1' "$TMP/fail-matrix.json"
195
+ grep -Fq 'failed attempt: row runner exit=1' "$TMP/fail-matrix.md"
196
+ grep -Fq 'Trailing non-gate rows: 1' "$TMP/fail-matrix.md"
197
+ set +e
198
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
199
+ --title "Local SWE-bench Failed Matrix" \
200
+ --verdict FAIL \
201
+ --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
202
+ --max-trailing-non-gate 0 \
203
+ --out-json "$TMP/fail-yield-matrix.json" \
204
+ --out-md "$TMP/fail-yield-matrix.md" > "$TMP/fail-yield-matrix.log"
205
+ yield_status=$?
206
+ set -e
207
+ [ "$yield_status" -eq 2 ]
208
+ grep -q '"yield_verdict": "FAIL"' "$TMP/fail-yield-matrix.json"
209
+ grep -q '"trailing non-gate rows 1 > maximum 0"' "$TMP/fail-yield-matrix.json"
210
+ grep -Fq 'Yield verdict: **FAIL**' "$TMP/fail-yield-matrix.md"
211
+
212
+ PROVIDER_LIMIT_RUN_ID="swebench-provider-limit-test-local__repo-1"
213
+ mkdir -p "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo" "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair"
214
+ cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo/input.md" <<'EOF'
215
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
216
+ EOF
217
+ cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair/transcript.txt" <<'EOF'
218
+ You've hit your limit · resets 3am (Asia/Seoul)
219
+ EOF
220
+ cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/compare.json" <<'EOF'
221
+ {
222
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS", "elapsed_seconds": 1},
223
+ "pair": {"invoke_exit": 1, "timed_out": false, "verify_verdict": null, "elapsed_seconds": 1},
224
+ "comparison": {
225
+ "pair_trigger_missed": false,
226
+ "pair_verdict_lift": false,
227
+ "solo_verdict": "PASS",
228
+ "pair_verdict": null
229
+ }
230
+ }
231
+ EOF
232
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
233
+ --title "Local SWE-bench Provider Limit Matrix" \
234
+ --verdict FAIL \
235
+ --run-id "$PROVIDER_LIMIT_RUN_ID" \
236
+ --out-json "$TMP/provider-limit-matrix.json" \
237
+ --out-md "$TMP/provider-limit-matrix.md" > "$TMP/provider-limit-matrix.log"
238
+ grep -q '"classification": "failed attempt: provider limit"' "$TMP/provider-limit-matrix.json"
239
+ grep -Fq 'failed attempt: provider limit' "$TMP/provider-limit-matrix.md"
240
+
241
+ RUN_ID="swebench-gate-only-test-local__repo-1"
242
+ mkdir -p "$RESULTS_DIR/$RUN_ID/pair"
243
+ cat > "$RESULTS_DIR/$RUN_ID/pair/input.md" <<'EOF'
244
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
245
+ EOF
246
+ cat > "$RESULTS_DIR/$RUN_ID/compare.json" <<'EOF'
247
+ {
248
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
249
+ "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200},
250
+ "comparison": {
251
+ "pair_trigger_missed": false,
252
+ "pair_verdict_lift": true,
253
+ "solo_verdict": "PASS_WITH_ISSUES",
254
+ "pair_verdict": "NEEDS_WORK"
255
+ }
256
+ }
257
+ EOF
258
+ printf '%s\n' "$RUN_ID" > "$TMP/run-ids.txt"
259
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
260
+ --manifest "$TMP/manifest.json" \
261
+ --gate-only-run-ids "$TMP/run-ids.txt" \
262
+ --min-runs 1 \
263
+ --max-pair-solo-wall-ratio 3 \
264
+ --run-ids-out "$TMP/gate-run-ids.txt" \
265
+ --out-json "$TMP/gate.json" \
266
+ --out-md "$TMP/gate.md" > "$TMP/gate-only.log"
267
+ grep -q '"verdict": "PASS"' "$TMP/gate.json"
268
+ grep -q '"avg_pair_solo_wall_ratio": 2.0' "$TMP/gate.json"
269
+ grep -Fq 'Verdict: **PASS**' "$TMP/gate.md"
270
+ grep -Fq 'Max pair/solo wall ratio: 3.00x' "$TMP/gate.md"
271
+ cmp "$TMP/run-ids.txt" "$TMP/gate-run-ids.txt"
272
+
273
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
274
+ --title "Local SWE-bench Matrix" \
275
+ --verdict PASS \
276
+ --gate-json "$TMP/gate.json" \
277
+ --run-id "$RUN_ID" \
278
+ --min-gate-rate 1 \
279
+ --max-trailing-non-gate 0 \
280
+ --out-json "$TMP/matrix.json" \
281
+ --out-md "$TMP/matrix.md" > "$TMP/matrix.log"
282
+ grep -q '"runs_total": 1' "$TMP/matrix.json"
283
+ grep -q '"gate_rows": 1' "$TMP/matrix.json"
284
+ grep -q '"gate_rate": 1.0' "$TMP/matrix.json"
285
+ grep -q '"trailing_non_gate_rows": 0' "$TMP/matrix.json"
286
+ grep -q '"yield_verdict": "PASS"' "$TMP/matrix.json"
287
+ grep -Fq 'Local SWE-bench Matrix' "$TMP/matrix.md"
288
+ grep -Fq 'Gate rate: 1.000' "$TMP/matrix.md"
289
+ grep -Fq 'Yield verdict: **PASS**' "$TMP/matrix.md"
290
+
291
+ rm -rf /tmp/bench-swebench-frozen-case-test-local__repo-1-solo
292
+ rm -rf /tmp/bench-swebench-frozen-case-test-local__repo-1-pair
293
+ rm -rf /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo
294
+ rm -rf /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair
295
+ rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-case-test"
296
+ rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-test-1-local__repo-1"
297
+ rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-fail-test-1-local__repo-1"
298
+ rm -rf "$RESULTS_DIR/$RESUME_RUN_ID"
299
+ rm -rf "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID"
300
+ rm -rf "$RESULTS_DIR/$RUN_ID"
301
+
302
+ echo "PASS test-swebench-frozen-case"
@@ -26,6 +26,7 @@ PER_RUN_PATTERNS = (
26
26
  "*.log.md",
27
27
  "fix-batch.round-*.json",
28
28
  "criteria.generated.md",
29
+ "risk-probes.jsonl",
29
30
  # iter-0019.8: spec-verify carrier artifacts get archived alongside
30
31
  # other per-run state. Killed mid-run cleanup is enforced separately
31
32
  # by spec-verify-check.py main() — when source markdown has no json
@@ -35,6 +36,7 @@ PER_RUN_PATTERNS = (
35
36
  "spec-verify.json",
36
37
  "spec-verify.results.json",
37
38
  "spec-verify-findings.jsonl",
39
+ "verify-merge.summary.json",
38
40
  # iter-0033a/2026-04-30 archive-fix iter: NEW /devlyn:resolve emits
39
41
  # plan.md (PLAN output) + final-report.md (PHASE 6 render) +
40
42
  # cumulative.patch (cumulative diff). Smoke 2's archive listing
@@ -52,6 +54,7 @@ PER_RUN_PATTERNS = (
52
54
  # ("pair_judge findings archive distinguishable") would false-fail on
53
55
  # every paired fixture without this glob.
54
56
  "verify-judge-*.md",
57
+ "codex-judge.*",
55
58
  )
56
59
 
57
60
 
@@ -6,7 +6,7 @@ Single source of truth for how every skill calls Codex. **MCP is not used.** Ski
6
6
 
7
7
  All long-running Codex calls go through `codex-monitored.sh` — a thin wrapper that closes stdin (codex 0.124.0 hangs when both stdin is open and a prompt arg is given), streams Codex stdout fully (no `tail -n` truncation), and prints a `[codex-monitored] heartbeat` line every 30s so the outer `claude -p` byte-watchdog stays fed during long reasoning gaps. The wrapper passes its arguments through verbatim to the underlying CLI, so the canonical flag set is unchanged from a raw call — only the launcher differs.
8
8
 
9
- **Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex.
9
+ **Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex. Read-only critique returns findings on stdout; the orchestrator writes any files.
10
10
 
11
11
  ```bash
12
12
  bash .claude/skills/_shared/codex-monitored.sh \
@@ -51,4 +51,4 @@ The local Codex CLI (fronted by `codex-monitored.sh`) is the primary (and only)
51
51
 
52
52
  Skills write the invocation as a Bash command the runtime executes. Example shape from `/devlyn:resolve` PHASE 2 IMPLEMENT when routed to Codex:
53
53
 
54
- > Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout.
54
+ > Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout. Do not pipe the wrapper; direct capture or file redirection preserves streaming and avoids the pipe-refusal exit.
@@ -41,7 +41,10 @@
41
41
  #
42
42
  # ENV OVERRIDES:
43
43
  # CODEX_MONITORED_HEARTBEAT — heartbeat interval seconds (default 30).
44
- # CODEX_BIN real codex binary path. Default: `codex`.
44
+ # CODEX_MONITORED_TIMEOUT_SEC optional hard timeout. When >0, kill the
45
+ # codex process group and exit 124.
46
+ # CODEX_BIN — real codex binary path. Default:
47
+ # CODEX_REAL_BIN when set, else `codex`.
45
48
  # Set this when the shim has put us first
46
49
  # on PATH.
47
50
  # CODEX_MONITORED_ALLOW_PIPED — set non-empty to skip the pipe-stdout
@@ -63,8 +66,10 @@ if [ -n "${CODEX_BLOCKED:-}" ]; then
63
66
  fi
64
67
 
65
68
  HEARTBEAT_SEC="${CODEX_MONITORED_HEARTBEAT:-30}"
66
- CODEX_BIN="${CODEX_BIN:-codex}"
69
+ TIMEOUT_SEC="${CODEX_MONITORED_TIMEOUT_SEC:-0}"
70
+ CODEX_BIN="${CODEX_BIN:-${CODEX_REAL_BIN:-codex}}"
67
71
  START=$(date +%s)
72
+ TIMEOUT_FLAG=""
68
73
 
69
74
  # --- Pipe-stdout refusal (iter-0009 R2 finding #1) -------------------------
70
75
  # `[ -p /dev/stdout ]` is the POSIX test for "is fd 1 a FIFO/pipe". Verified
@@ -106,35 +111,95 @@ heartbeat_loop() {
106
111
  done
107
112
  }
108
113
 
114
# timeout_loop PID SECONDS FLAG
#
# Hard-timeout watchdog. After SECONDS, if PID is still alive: create FLAG,
# log a timeout line on stderr, send TERM to the process group -PID (falling
# back to the single PID), wait 5 seconds, then send KILL the same way.
# Returns immediately when SECONDS is not greater than 0.
#
# Must be launched in the background (`timeout_loop ... &`): it installs its
# own TERM trap, which would replace the caller's trap if run in the main
# shell.
timeout_loop() {
  local pid="$1"
  local seconds="$2"
  local flag="$3"
  local sleeper=""
  [ "$seconds" -gt 0 ] || return 0

  # Sleep in the background and wait on it. A foreground `sleep` would be
  # orphaned when the parent TERMs this watchdog, and the orphan would keep
  # the inherited stdout/stderr open until the full timeout elapsed.
  trap '[ -z "$sleeper" ] || kill -TERM "$sleeper" 2>/dev/null; exit 0' TERM

  sleep "$seconds" &
  sleeper=$!
  wait "$sleeper"
  sleeper=""

  if kill -0 "$pid" 2>/dev/null; then
    # The flag file is how the parent learns that the timeout fired.
    : > "$flag"
    printf '[codex-monitored] timeout: elapsed=%ds limit=%ds\n' \
      "$(( $(date +%s) - START ))" "$seconds" >&2
    kill -TERM -- "-$pid" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
    # Grace period before the hard kill; same background-sleep pattern so a
    # TERM from the parent does not leave a stray sleep behind.
    sleep 5 &
    sleeper=$!
    wait "$sleeper"
    sleeper=""
    kill -KILL -- "-$pid" 2>/dev/null || kill -KILL "$pid" 2>/dev/null || true
  fi
}
129
+
130
# terminate_process_group PGID REASON
#
# Reap whatever is still running in process group PGID. Does nothing when the
# group no longer accepts signals. Otherwise logs a reap line (with REASON)
# on stderr, sends TERM to the group, polls once per second for up to five
# seconds, and sends KILL if the group is still present after that.
terminate_process_group() {
  local pgid="$1" reason="$2"
  local attempt=0

  # Nothing left in the group: nothing to reap.
  kill -0 -- "-$pgid" 2>/dev/null || return 0

  printf '[codex-monitored] reap: reason=%s pgid=%s\n' "$reason" "$pgid" >&2
  kill -TERM -- "-$pgid" 2>/dev/null || true

  # Up to five one-second polls for the group to disappear.
  while [ "$attempt" -lt 5 ]; do
    attempt=$((attempt + 1))
    sleep 1
    kill -0 -- "-$pgid" 2>/dev/null || return 0
  done

  kill -KILL -- "-$pgid" 2>/dev/null || true
}
147
+
109
148
  forward_signal() {
110
149
  local sig="$1"
111
150
  if [ -n "${CODEX_PID:-}" ] && kill -0 "$CODEX_PID" 2>/dev/null; then
112
- kill -"$sig" "$CODEX_PID" 2>/dev/null || true
151
+ kill -"$sig" -- "-$CODEX_PID" 2>/dev/null || kill -"$sig" "$CODEX_PID" 2>/dev/null || true
113
152
  fi
114
153
  if [ -n "${HB_PID:-}" ] && kill -0 "$HB_PID" 2>/dev/null; then
115
154
  kill -TERM "$HB_PID" 2>/dev/null || true
116
155
  fi
156
+ if [ -n "${WATCHDOG_PID:-}" ] && kill -0 "$WATCHDOG_PID" 2>/dev/null; then
157
+ kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
158
+ fi
159
+ }
160
+
161
+ cleanup() {
162
+ forward_signal TERM
163
+ [ -z "$TIMEOUT_FLAG" ] || rm -f "$TIMEOUT_FLAG"
117
164
  }
118
165
 
119
- trap 'forward_signal TERM' TERM
120
- trap 'forward_signal INT' INT
166
+ trap 'forward_signal TERM; exit 143' TERM
167
+ trap 'forward_signal INT; exit 130' INT
168
+ trap cleanup EXIT
121
169
 
122
- printf '[codex-monitored] start: ts=%s heartbeat=%ds bin=%s\n' \
123
- "$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$CODEX_BIN" >&2
170
+ printf '[codex-monitored] start: ts=%s heartbeat=%ds timeout=%ss bin=%s\n' \
171
+ "$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$TIMEOUT_SEC" "$CODEX_BIN" >&2
124
172
 
125
173
  # Launch codex with stdin closed; output streams directly to OUR stdout/stderr.
174
+ set -m
126
175
  "$CODEX_BIN" exec "$@" < /dev/null &
127
176
  CODEX_PID=$!
177
+ set +m
128
178
  printf '[codex-monitored] codex pid=%d\n' "$CODEX_PID" >&2
129
179
 
130
180
  heartbeat_loop "$CODEX_PID" &
131
181
  HB_PID=$!
132
182
 
183
+ if [ "$TIMEOUT_SEC" -gt 0 ]; then
184
+ TIMEOUT_FLAG=$(mktemp "${TMPDIR:-/tmp}/codex-monitored-timeout.XXXXXX")
185
+ rm -f "$TIMEOUT_FLAG"
186
+ timeout_loop "$CODEX_PID" "$TIMEOUT_SEC" "$TIMEOUT_FLAG" &
187
+ WATCHDOG_PID=$!
188
+ fi
189
+
133
190
  wait "$CODEX_PID"
134
191
  EXIT=$?
192
+ terminate_process_group "$CODEX_PID" "post-exit-descendants"
135
193
 
136
194
  kill -TERM "$HB_PID" 2>/dev/null || true
137
195
  wait "$HB_PID" 2>/dev/null || true
196
+ if [ -n "${WATCHDOG_PID:-}" ]; then
197
+ kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
198
+ wait "$WATCHDOG_PID" 2>/dev/null || true
199
+ fi
200
+ if [ -n "$TIMEOUT_FLAG" ] && [ -f "$TIMEOUT_FLAG" ]; then
201
+ EXIT=124
202
+ fi
138
203
 
139
204
  printf '[codex-monitored] codex exited: code=%d elapsed=%ds\n' \
140
205
  "$EXIT" $(( $(date +%s) - START )) >&2
@@ -0,0 +1,125 @@
1
+ #!/usr/bin/env python3
2
+ """Normalize raw Codex pair-JUDGE stdout into canonical VERIFY JSONL."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import pathlib
9
+ import sys
10
+ import tempfile
11
+ from typing import Any
12
+
13
+
14
+ FINDING_SEVERITIES = {"CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"}
15
+
16
+
17
def atomic_write(path: pathlib.Path, text: str) -> None:
    """Write ``text`` to ``path`` via a temporary file and a rename.

    The parent directory is created if needed. The text is written as UTF-8
    to a temporary file in that same directory, which is then renamed over
    ``path``. If writing or renaming fails, the temporary file is removed
    before the exception propagates, so failed runs leave no stray files
    next to the target.

    Args:
        path: Destination file.
        text: Full content to write.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written or renamed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            "w", encoding="utf-8", dir=path.parent, delete=False
        ) as handle:
            # Record the name before writing so a failed write is cleaned up too.
            tmp_path = pathlib.Path(handle.name)
            handle.write(text)
        tmp_path.replace(path)
    except BaseException:
        # delete=False means nothing else removes the temp file on failure.
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)
        raise
25
+
26
+
27
def collect(stdout_path: pathlib.Path) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
    """Parse raw pair-JUDGE stdout into findings and an optional summary.

    Blank lines are skipped. A line starting with ``# SUMMARY `` carries a
    JSON object that becomes the summary (a later one replaces an earlier
    one). Any other line starting with ``#`` is ignored. Every remaining
    line must be a JSON object whose ``severity``, upper-cased, is one of
    ``FINDING_SEVERITIES``.

    Returns:
        ``(findings, summary)``; ``summary`` is ``None`` when no SUMMARY line
        was present.

    Raises:
        SystemExit: On malformed JSON, a non-object value, a missing or
            unknown severity, input with neither findings nor a summary, or
            a NEEDS_WORK/FAIL/BLOCKED summary with no findings.
    """
    findings: list[dict[str, Any]] = []
    summary: dict[str, Any] | None = None

    with stdout_path.open(encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, 1):
            raw = line.strip()
            if not raw:
                continue

            if raw.startswith("#"):
                # Only "# SUMMARY <json>" comment lines carry data.
                if not raw.startswith("# SUMMARY "):
                    continue
                body = raw.removeprefix("# SUMMARY ").strip()
                try:
                    parsed = json.loads(body)
                except json.JSONDecodeError as exc:
                    raise SystemExit(f"error: invalid SUMMARY JSON at {stdout_path}:{line_no}: {exc}")
                if not isinstance(parsed, dict):
                    raise SystemExit(f"error: SUMMARY is not an object at {stdout_path}:{line_no}")
                summary = parsed
                continue

            try:
                record = json.loads(raw)
            except json.JSONDecodeError as exc:
                raise SystemExit(f"error: invalid JSONL at {stdout_path}:{line_no}: {exc}")
            if not isinstance(record, dict):
                raise SystemExit(f"error: JSONL item is not an object at {stdout_path}:{line_no}")
            level = str(record.get("severity") or "").upper()
            if level not in FINDING_SEVERITIES:
                raise SystemExit(f"error: finding missing valid severity at {stdout_path}:{line_no}")
            findings.append(record)

    # Empty output must never be mistaken for a clean verdict.
    if summary is None and not findings:
        raise SystemExit("error: Codex pair-JUDGE stdout contained no JSONL findings or PASS line")

    non_pass = bool(summary) and summary.get("verdict") in {"NEEDS_WORK", "FAIL", "BLOCKED"}
    if non_pass and not findings:
        raise SystemExit("error: non-PASS SUMMARY without JSONL findings")

    return findings, summary
61
+
62
+
63
def self_test() -> int:
    """Run a built-in smoke test in a temporary directory.

    First checks that one HIGH finding plus a NEEDS_WORK summary produce a
    one-line findings file and a summary file with that verdict. Then checks
    that empty stdout is rejected instead of being normalized to a pass.

    Returns:
        0 when every check holds; a failed check raises ``AssertionError``.
    """
    with tempfile.TemporaryDirectory() as workdir:
        base = pathlib.Path(workdir)
        stdout_path = base / "codex-judge.stdout"
        out_path = base / "verify.pair.findings.jsonl"
        summary_path = base / "codex-judge.summary.json"

        # Case 1: a single finding followed by a non-PASS summary.
        finding_line = json.dumps({"id": "a", "severity": "HIGH"}) + "\n"
        summary_line = '# SUMMARY {"verdict":"NEEDS_WORK"}\n'
        stdout_path.write_text(finding_line + summary_line, encoding="utf-8")
        findings, summary = collect(stdout_path)
        write_outputs(findings, summary, out_path, summary_path)
        assert out_path.read_text(encoding="utf-8").count("\n") == 1
        assert json.loads(summary_path.read_text(encoding="utf-8"))["verdict"] == "NEEDS_WORK"

        # Case 2: empty stdout has to be refused.
        stdout_path.write_text("", encoding="utf-8")
        rejected = False
        try:
            collect(stdout_path)
        except SystemExit as exc:
            assert "no JSONL findings" in str(exc)
            rejected = True
        if not rejected:
            raise AssertionError("empty Codex stdout must not normalize to PASS")
    return 0
86
+
87
+
88
def write_outputs(
    findings: list[dict[str, Any]],
    summary: dict[str, Any] | None,
    out_path: pathlib.Path,
    summary_path: pathlib.Path,
) -> None:
    """Persist the findings as JSONL and, if present, the summary as JSON.

    Each finding becomes one compact line with sorted keys in ``out_path``;
    the file is written even when there are no findings (it is then empty).
    ``summary_path`` is written only when ``summary`` is not ``None``, as
    indented JSON with sorted keys and a trailing newline. Both files go
    through ``atomic_write``.
    """
    jsonl_lines = [
        json.dumps(item, sort_keys=True, separators=(",", ":")) for item in findings
    ]
    atomic_write(out_path, "".join(entry + "\n" for entry in jsonl_lines))

    if summary is None:
        return
    rendered = json.dumps(summary, indent=2, sort_keys=True)
    atomic_write(summary_path, rendered + "\n")
100
+
101
+
102
def main() -> int:
    """Command-line entry point.

    With ``--self-test`` the built-in smoke test runs and its result is
    returned. Otherwise the stdout file inside the devlyn directory is
    parsed, the findings and summary files are written into that same
    directory, and a one-line JSON report is printed on stdout.

    Returns:
        0 on success, 1 when the stdout file does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    path_options = (
        ("--devlyn-dir", ".devlyn"),
        ("--stdout-file", "codex-judge.stdout"),
        ("--out", "verify.pair.findings.jsonl"),
        ("--summary-out", "codex-judge.summary.json"),
    )
    for flag, default in path_options:
        parser.add_argument(flag, default=default)
    parser.add_argument("--self-test", action="store_true")
    options = parser.parse_args()

    if options.self_test:
        return self_test()

    devlyn_dir = pathlib.Path(options.devlyn_dir)
    stdout_path = devlyn_dir / options.stdout_file
    if stdout_path.is_file():
        findings, summary = collect(stdout_path)
        write_outputs(
            findings,
            summary,
            devlyn_dir / options.out,
            devlyn_dir / options.summary_out,
        )
        report = {"findings_count": len(findings), "summary": summary}
        print(json.dumps(report, sort_keys=True))
        return 0

    sys.stderr.write(f"error: {stdout_path} not found\n")
    return 1
122
+
123
+
124
+ if __name__ == "__main__":
125
+ raise SystemExit(main())
@@ -14,7 +14,7 @@ When the resolved engine is `auto` or `codex`, on entry (before spawning any pha
14
14
 
15
15
  Never prompt the user. Never abort the run on missing CLI.
16
16
 
17
- Per-skill defaults: `/devlyn:resolve` defaults to `claude` (post iter-0020 close-out — Codex BUILD/IMPLEMENT below quality floor; iter-0033g + iter-0034 close-out — PLAN-pair research-only until container/sandbox infra justifies a measurement); `/devlyn:ideate` defaults to `auto` for the CHALLENGE phase's cross-model GAN-critic dynamic. Each skill's SKILL.md flag block is the source of truth for that skill's default.
17
+ Per-skill defaults: `/devlyn:resolve` defaults to `claude` for PLAN/IMPLEMENT (post iter-0020 close-out — Codex BUILD/IMPLEMENT below quality floor; iter-0033g + iter-0034 close-out — PLAN-pair research-only until container/sandbox infra justifies a measurement). `/devlyn:resolve` VERIFY is the exception: gated pair-JUDGE may invoke the OTHER engine when its SKILL.md trigger policy fires. `/devlyn:ideate` defaults to `auto` for the CHALLENGE phase's cross-model GAN-critic dynamic. Each skill's SKILL.md flag block is the source of truth for that skill's default.
18
18
 
19
19
  ## Why this is the one permitted silent fallback
20
20
 
@@ -35,6 +35,12 @@
35
35
  "description": "None of these substrings may appear in (stdout + stderr) for pass.",
36
36
  "items": { "type": "string", "minLength": 1 },
37
37
  "default": []
38
+ },
39
+ "contract_refs": {
40
+ "type": "array",
41
+ "description": "For hidden BENCH_FIXTURE_DIR commands, exact substrings from spec.md that this oracle verifies. Hidden oracles may test only visible spec clauses.",
42
+ "items": { "type": "string", "minLength": 1 },
43
+ "default": []
38
44
  }
39
45
  }
40
46
  }
@@ -83,6 +89,18 @@
83
89
  "items": { "type": "string", "minLength": 1 },
84
90
  "default": []
85
91
  },
92
+ "tier_a_waivers": {
93
+ "type": "array",
94
+ "description": "Optional fnmatch globs exempted from Tier A scope-oracle path checks when the spec explicitly authorizes those files.",
95
+ "items": { "type": "string", "minLength": 1 },
96
+ "default": []
97
+ },
98
+ "spec_output_files": {
99
+ "type": "array",
100
+ "description": "Files or globs that define the spec-authorized output surface for scope oracles. Touched files outside this set must be reachable from it via static imports or separately waived.",
101
+ "items": { "type": "string", "minLength": 1 },
102
+ "default": []
103
+ },
86
104
  "max_deps_added": {
87
105
  "type": "integer",
88
106
  "description": "Hard cap on new entries under dependencies/devDependencies in package.json. Exceeds → DQ.",