devlyn-cli 2.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. package/CLAUDE.md +1 -1
  2. package/README.md +1 -1
  3. package/benchmark/auto-resolve/README.md +318 -2
  4. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  91. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  100. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  101. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  102. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  103. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  104. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  105. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  106. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  107. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  110. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  111. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  112. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  113. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  114. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  116. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  117. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  118. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  119. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  120. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  121. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  122. package/bin/devlyn.js +56 -10
  123. package/config/skills/_shared/archive_run.py +3 -0
  124. package/config/skills/_shared/codex-config.md +2 -2
  125. package/config/skills/_shared/codex-monitored.sh +72 -7
  126. package/config/skills/_shared/collect-codex-findings.py +125 -0
  127. package/config/skills/_shared/engine-preflight.md +1 -1
  128. package/config/skills/_shared/expected.schema.json +18 -0
  129. package/config/skills/_shared/spec-verify-check.py +312 -10
  130. package/config/skills/_shared/verify-merge-findings.py +327 -0
  131. package/config/skills/devlyn:ideate/SKILL.md +1 -1
  132. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  133. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  134. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  135. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  136. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  137. package/package.json +1 -1
  138. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,302 @@
1
#!/usr/bin/env bash
# Regression test for the SWE-bench frozen VERIFY case importer.
#
# Builds a one-commit throwaway git repo plus a model patch, then drives the
# prepare / fetch / collect / corpus / runner / matrix scripts under
# benchmark/auto-resolve/scripts and asserts on the files and log lines they
# produce. Any failing command aborts the test (set -e).
#
# Side effects outside $TMP: the runner scripts create fixed-name work trees
# under /tmp/bench-* and run directories under benchmark/auto-resolve/results.
# Every such path is listed in ARTIFACTS so it is purged both before the run
# (stale state from an earlier failed run must not satisfy an assertion) and
# on every exit path, not only on success.
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
RESULTS_DIR="$ROOT/benchmark/auto-resolve/results"

# Run ids used by the hand-seeded result directories further down.
RESUME_RUN_ID="swebench-resume-arm-test-local__repo-1"
PROVIDER_LIMIT_RUN_ID="swebench-provider-limit-test-local__repo-1"
RUN_ID="swebench-gate-only-test-local__repo-1"

# Everything this test creates outside of $TMP.
ARTIFACTS=(
  /tmp/bench-swebench-frozen-case-test-local__repo-1-solo
  /tmp/bench-swebench-frozen-case-test-local__repo-1-pair
  /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo
  /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair
  "$RESULTS_DIR/swebench-frozen-case-test"
  "$RESULTS_DIR/swebench-frozen-corpus-test-1-local__repo-1"
  "$RESULTS_DIR/swebench-frozen-corpus-fail-test-1-local__repo-1"
  "$RESULTS_DIR/$RESUME_RUN_ID"
  "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID"
  "$RESULTS_DIR/$RUN_ID"
)

# Remove the fixed-name work trees and result directories owned by this test.
purge_artifacts() {
  rm -rf -- "${ARTIFACTS[@]}"
}

# EXIT handler: drop the external artifacts and the private temp dir.
cleanup() {
  purge_artifacts
  rm -rf -- "$TMP"
}

TMP="$(mktemp -d)"
trap cleanup EXIT

# Start from a clean slate so leftovers cannot produce a false pass.
purge_artifacts

# --- Fixture repo: one base commit, plus a patch turning hello into goodbye ---
REPO="$TMP/repo"
mkdir -p "$REPO"
git -C "$REPO" init -q
git -C "$REPO" config user.email bench@example.com
git -C "$REPO" config user.name bench
# A global commit.gpgsign=true would otherwise make the fixture commit fail.
git -C "$REPO" config commit.gpgsign false
printf 'hello\n' > "$REPO/app.txt"
git -C "$REPO" add app.txt
git -C "$REPO" commit -q -m base
BASE_SHA="$(git -C "$REPO" rev-parse HEAD)"

printf 'goodbye\n' > "$REPO/app.txt"
git -C "$REPO" diff > "$TMP/model.patch"
git -C "$REPO" checkout -q -- app.txt

# Unquoted delimiter on purpose: $BASE_SHA must expand inside the JSON.
cat > "$TMP/instance.json" <<JSON
{
  "instance_id": "local__repo-1",
  "repo": "local/repo",
  "base_commit": "$BASE_SHA",
  "problem_statement": "Change app.txt so it says goodbye instead of hello.",
  "version": "test",
  "issue_url": "https://example.test/issue",
  "pr_url": "https://example.test/pr"
}
JSON

# --- Single-case importer ---
python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py" \
  --instance-json "$TMP/instance.json" \
  --model-patch "$TMP/model.patch" \
  --cases-root "$TMP/cases" \
  --repos-root "$TMP/repos" \
  --repo-dir "$REPO" \
  --timeout-seconds 60 > "$TMP/prepare.json"

CASE_DIR="$TMP/cases/local__repo-1"
BASE_REPO="$TMP/repos/local__repo-${BASE_SHA:0:12}"
test -f "$CASE_DIR/spec.md"
test -f "$CASE_DIR/model.patch"
test -x "$CASE_DIR/setup.sh"
grep -q 'SWE-bench local__repo-1' "$CASE_DIR/spec.md"
grep -q -- '--pair-mode gated' "$CASE_DIR/run-command.txt"

# --- Instance fetcher ---
# NOTE(review): presumably this reaches the upstream SWE-bench Lite dataset, so
# the test would need network access here; confirm against the fetch script.
python3 "$ROOT/benchmark/auto-resolve/scripts/fetch-swebench-instances.py" \
  --dataset lite \
  --limit 1 \
  --out "$TMP/fetched-lite.jsonl" > "$TMP/fetch.json"
grep -q '"rows_written": 1' "$TMP/fetch.json"
python3 - "$TMP/fetched-lite.jsonl" <<'PY'
import json, pathlib, sys
row = json.loads(pathlib.Path(sys.argv[1]).read_text().splitlines()[0])
for key in ("instance_id", "repo", "base_commit", "problem_statement"):
    assert row.get(key), key
PY

# --- Build one-row instances / predictions JSONL files from the fixture ---
python3 - "$TMP/instance.json" "$TMP/instances.jsonl" "$TMP/model.patch" "$TMP/predictions.jsonl" <<'PY'
import json, pathlib, sys
instance = json.loads(pathlib.Path(sys.argv[1]).read_text())
pathlib.Path(sys.argv[2]).write_text(json.dumps(instance) + "\n")
patch = pathlib.Path(sys.argv[3]).read_text()
pathlib.Path(sys.argv[4]).write_text(json.dumps({
    "instance_id": "local__repo-1",
    "model_name_or_path": "local-test",
    "model_patch": patch,
}) + "\n")
PY

# --- Prediction collector (patch-root layout: <instance_id>/patch.diff) ---
mkdir -p "$TMP/patch-root/local__repo-1"
cp "$TMP/model.patch" "$TMP/patch-root/local__repo-1/patch.diff"
python3 "$ROOT/benchmark/auto-resolve/scripts/collect-swebench-predictions.py" \
  --patch-root "$TMP/patch-root" \
  --instances-jsonl "$TMP/instances.jsonl" \
  --model-name local-patch-root \
  --out "$TMP/collected-predictions.jsonl" > "$TMP/collect.json"
grep -q '"predictions_written": 1' "$TMP/collect.json"
python3 - "$TMP/collected-predictions.jsonl" <<'PY'
import json, pathlib, sys
row = json.loads(pathlib.Path(sys.argv[1]).read_text())
assert row["instance_id"] == "local__repo-1"
assert row["model_name_or_path"] == "local-patch-root"
assert row["model_patch"].endswith("\n")
PY

# --- Corpus importer ---
rm -rf "$TMP/cases-batch" "$TMP/repos-batch"
python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" \
  --instances-jsonl "$TMP/instances.jsonl" \
  --predictions-jsonl "$TMP/predictions.jsonl" \
  --cases-root "$TMP/cases-batch" \
  --repos-root "$TMP/repos-batch" \
  --repo-dir "$REPO" \
  --out-manifest "$TMP/manifest.json" > "$TMP/batch.json"
grep -q '"prepared_count": 1' "$TMP/manifest.json"
test -f "$TMP/cases-batch/local__repo-1/model.patch"

# --- Pair runner, prepare-only: both arms get the patch, no spec-verify file ---
bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
  --fixture local__repo-1 \
  --fixtures-root "$TMP/cases" \
  --base-repo "$BASE_REPO" \
  --diff "$CASE_DIR/model.patch" \
  --run-id swebench-frozen-case-test \
  --pair-mode gated \
  --timeout-seconds 7 \
  --prepare-only > "$TMP/runner.log"

grep -q 'Timeout: 7s per arm' "$TMP/runner.log"
grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/app.txt
grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/app.txt
test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/.devlyn/spec-verify.json
test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/.devlyn/spec-verify.json

# --- Pair runner, resume: seeded solo summary is reused, pair arm still runs ---
mkdir -p "$RESULTS_DIR/$RESUME_RUN_ID/solo" "$TMP/fakebin"
cat > "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json" <<'EOF'
{
  "elapsed_seconds": 1,
  "invoke_exit": 0,
  "timed_out": false,
  "verify_verdict": "PASS",
  "terminal_verdict": "PASS"
}
EOF
# Stub `claude` placed first on PATH; its output must land in the pair
# transcript, proving the pair arm was invoked rather than skipped.
cat > "$TMP/fakebin/claude" <<'EOF'
#!/usr/bin/env bash
echo "fake claude invoked"
exit 1
EOF
chmod +x "$TMP/fakebin/claude"
PATH="$TMP/fakebin:$PATH" bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
  --fixture local__repo-1 \
  --fixtures-root "$TMP/cases" \
  --base-repo "$BASE_REPO" \
  --diff "$CASE_DIR/model.patch" \
  --run-id "$RESUME_RUN_ID" \
  --pair-mode gated \
  --timeout-seconds 3 \
  --resume-completed-arms > "$TMP/resume-arm.log" 2>&1
grep -Fq '[frozen-verify] solo: reuse completed summary' "$TMP/resume-arm.log"
grep -Fq 'fake claude invoked' "$RESULTS_DIR/$RESUME_RUN_ID/pair/transcript.txt"
grep -q '"invoke_exit": 0' "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json"

# --- Corpus runner, prepare-only: run ids recorded, gate outputs not written ---
bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest.json" \
  --run-prefix swebench-frozen-corpus-test \
  --timeout-seconds 7 \
  --run-ids-out "$TMP/prepare-run-ids.txt" \
  --out-json "$TMP/gate.json" \
  --out-md "$TMP/gate.md" \
  --prepare-only > "$TMP/corpus-runner.log"
grep -q 'prepare-only complete; gate skipped' "$TMP/corpus-runner.log"
grep -q 'Timeout: 7s per arm' "$TMP/corpus-runner.log"
grep -q '^swebench-frozen-corpus-test-1-local__repo-1$' "$TMP/prepare-run-ids.txt"
grep -q '^goodbye$' /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo/app.txt
grep -q '^goodbye$' /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair/app.txt
test ! -e /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo/.devlyn/spec-verify.json
test ! -e /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair/.devlyn/spec-verify.json
test ! -e "$TMP/gate.json"
test ! -e "$TMP/gate.md"

# --- Corpus runner, failing row: manifest points at a missing case dir ---
python3 - "$TMP/manifest.json" "$TMP/manifest-bad-diff.json" <<'PY'
import json, pathlib, sys
manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
manifest["prepared"][0]["case_dir"] = str(pathlib.Path(manifest["prepared"][0]["case_dir"]).parent / "missing-case")
pathlib.Path(sys.argv[2]).write_text(json.dumps(manifest, indent=2) + "\n")
PY
# Non-zero exit is the expected outcome here, so suspend errexit to capture it.
set +e
bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest-bad-diff.json" \
  --run-prefix swebench-frozen-corpus-fail-test \
  --run-ids-out "$TMP/fail-run-ids.txt" \
  --prepare-only > "$TMP/corpus-fail.log" 2>&1
fail_status=$?
set -e
[ "$fail_status" -ne 0 ]
grep -q 'row failed: swebench-frozen-corpus-fail-test-1-local__repo-1' "$TMP/corpus-fail.log"
grep -q '^swebench-frozen-corpus-fail-test-1-local__repo-1$' "$TMP/fail-run-ids.txt"
test -f "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-fail-test-1-local__repo-1/compare.json"

# --- Matrix over the failed row: classified as a failed attempt ---
python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Failed Matrix" \
  --verdict FAIL \
  --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
  --out-json "$TMP/fail-matrix.json" \
  --out-md "$TMP/fail-matrix.md" > "$TMP/fail-matrix.log"
grep -q '"classification": "failed attempt: row runner exit=1"' "$TMP/fail-matrix.json"
grep -q '"trailing_non_gate_rows": 1' "$TMP/fail-matrix.json"
grep -q '"failed attempt: row runner exit=1": 1' "$TMP/fail-matrix.json"
grep -Fq 'failed attempt: row runner exit=1' "$TMP/fail-matrix.md"
grep -Fq 'Trailing non-gate rows: 1' "$TMP/fail-matrix.md"
# Same matrix with a yield limit of 0 trailing non-gate rows must exit 2.
set +e
python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Failed Matrix" \
  --verdict FAIL \
  --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
  --max-trailing-non-gate 0 \
  --out-json "$TMP/fail-yield-matrix.json" \
  --out-md "$TMP/fail-yield-matrix.md" > "$TMP/fail-yield-matrix.log"
yield_status=$?
set -e
[ "$yield_status" -eq 2 ]
grep -q '"yield_verdict": "FAIL"' "$TMP/fail-yield-matrix.json"
grep -q '"trailing non-gate rows 1 > maximum 0"' "$TMP/fail-yield-matrix.json"
grep -Fq 'Yield verdict: **FAIL**' "$TMP/fail-yield-matrix.md"

# --- Matrix over a seeded run whose pair transcript shows a provider limit ---
mkdir -p "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo" "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair"
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo/input.md" <<'EOF'
Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
EOF
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair/transcript.txt" <<'EOF'
You've hit your limit · resets 3am (Asia/Seoul)
EOF
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/compare.json" <<'EOF'
{
  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS", "elapsed_seconds": 1},
  "pair": {"invoke_exit": 1, "timed_out": false, "verify_verdict": null, "elapsed_seconds": 1},
  "comparison": {
    "pair_trigger_missed": false,
    "pair_verdict_lift": false,
    "solo_verdict": "PASS",
    "pair_verdict": null
  }
}
EOF
python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Provider Limit Matrix" \
  --verdict FAIL \
  --run-id "$PROVIDER_LIMIT_RUN_ID" \
  --out-json "$TMP/provider-limit-matrix.json" \
  --out-md "$TMP/provider-limit-matrix.md" > "$TMP/provider-limit-matrix.log"
grep -q '"classification": "failed attempt: provider limit"' "$TMP/provider-limit-matrix.json"
grep -Fq 'failed attempt: provider limit' "$TMP/provider-limit-matrix.md"

# --- Gate-only corpus run over a seeded compare.json (pair 200s vs solo 100s) ---
mkdir -p "$RESULTS_DIR/$RUN_ID/pair"
cat > "$RESULTS_DIR/$RUN_ID/pair/input.md" <<'EOF'
Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
EOF
cat > "$RESULTS_DIR/$RUN_ID/compare.json" <<'EOF'
{
  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
  "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200},
  "comparison": {
    "pair_trigger_missed": false,
    "pair_verdict_lift": true,
    "solo_verdict": "PASS_WITH_ISSUES",
    "pair_verdict": "NEEDS_WORK"
  }
}
EOF
printf '%s\n' "$RUN_ID" > "$TMP/run-ids.txt"
bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest.json" \
  --gate-only-run-ids "$TMP/run-ids.txt" \
  --min-runs 1 \
  --max-pair-solo-wall-ratio 3 \
  --run-ids-out "$TMP/gate-run-ids.txt" \
  --out-json "$TMP/gate.json" \
  --out-md "$TMP/gate.md" > "$TMP/gate-only.log"
grep -q '"verdict": "PASS"' "$TMP/gate.json"
grep -q '"avg_pair_solo_wall_ratio": 2.0' "$TMP/gate.json"
grep -Fq 'Verdict: **PASS**' "$TMP/gate.md"
grep -Fq 'Max pair/solo wall ratio: 3.00x' "$TMP/gate.md"
cmp "$TMP/run-ids.txt" "$TMP/gate-run-ids.txt"

# --- Matrix over the passing gate run ---
python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Matrix" \
  --verdict PASS \
  --gate-json "$TMP/gate.json" \
  --run-id "$RUN_ID" \
  --min-gate-rate 1 \
  --max-trailing-non-gate 0 \
  --out-json "$TMP/matrix.json" \
  --out-md "$TMP/matrix.md" > "$TMP/matrix.log"
grep -q '"runs_total": 1' "$TMP/matrix.json"
grep -q '"gate_rows": 1' "$TMP/matrix.json"
grep -q '"gate_rate": 1.0' "$TMP/matrix.json"
grep -q '"trailing_non_gate_rows": 0' "$TMP/matrix.json"
grep -q '"yield_verdict": "PASS"' "$TMP/matrix.json"
grep -Fq 'Local SWE-bench Matrix' "$TMP/matrix.md"
grep -Fq 'Gate rate: 1.000' "$TMP/matrix.md"
grep -Fq 'Yield verdict: **PASS**' "$TMP/matrix.md"

# The /tmp work trees and results directories are removed by the EXIT trap.
echo "PASS test-swebench-frozen-case"
package/bin/devlyn.js CHANGED
@@ -19,6 +19,10 @@ const CLI_TARGETS = {
19
19
  instructionsFile: 'AGENTS.md',
20
20
  baseInstructionsFile: 'AGENTS.md',
21
21
  configDir: null, // Codex uses AGENTS.md at project root
22
+ // Codex auto-loads skills from ~/.codex/skills/ (user-global). Same
23
+ // SKILL.md format as Claude Code; descriptions must stay ≤1024 chars.
24
+ skillsDir: path.join(os.homedir(), '.codex', 'skills'),
25
+ skillsToInstall: ['devlyn:resolve', 'devlyn:ideate', '_shared'],
22
26
  detect: () => fs.existsSync(path.join(process.cwd(), 'AGENTS.md')) || fs.existsSync(path.join(process.cwd(), '.codex')),
23
27
  },
24
28
  gemini: {
@@ -509,6 +513,37 @@ function detectOtherCLIs() {
509
513
  return detected;
510
514
  }
511
515
 
516
// Mirror the skills named in a CLI target's `skillsToInstall` list from the
// packaged `skills` directory into that target's `skillsDir` (for Codex this
// is ~/.codex/skills/). Targets without `skillsDir` or `skillsToInstall`, and
// installs where the packaged skills directory is missing, are a no-op.
// Returns the number of skill directories copied.
function installSkillsForCLI(cliKey) {
  const target = CLI_TARGETS[cliKey];
  if (!(target && target.skillsDir && target.skillsToInstall)) return 0;

  const packagedSkills = path.join(CONFIG_SOURCE, 'skills');
  if (!fs.existsSync(packagedSkills)) return 0;

  const installRoot = target.skillsDir;
  if (!fs.existsSync(installRoot)) {
    fs.mkdirSync(installRoot, { recursive: true });
  }

  let installedCount = 0;
  for (const skill of target.skillsToInstall) {
    const from = path.join(packagedSkills, skill);
    const to = path.join(installRoot, skill);

    // Skills that are not shipped in this package are skipped, not counted.
    if (fs.existsSync(from)) {
      // Replace the installed copy wholesale so files dropped from the
      // package do not linger in the installed mirror.
      if (fs.existsSync(to)) {
        fs.rmSync(to, { recursive: true, force: true });
      }
      copyRecursive(from, to, installRoot);
      installedCount += 1;
      log(` → ${installRoot.replace(os.homedir(), '~')}/${skill}`, 'dim');
    }
  }
  return installedCount;
}
546
+
512
547
  function installAgentsForCLI(cliKey) {
513
548
  const cli = CLI_TARGETS[cliKey];
514
549
  if (!cli) return false;
@@ -561,6 +596,14 @@ function installAgentsForCLI(cliKey) {
561
596
  log(` → ${cli.instructionsFile} (agent instructions appended)`, 'dim');
562
597
  }
563
598
 
599
+ // If this CLI also supports a global skill-loader (currently Codex), install
600
+ // /devlyn:resolve + /devlyn:ideate + _shared so the same slash commands work
601
+ // there. Skipped for CLIs without a skillsDir entry.
602
+ const skillsCopied = installSkillsForCLI(cliKey);
603
+ if (skillsCopied > 0) {
604
+ log(` → ${skillsCopied} skill${skillsCopied > 1 ? 's' : ''} installed (devlyn:resolve / devlyn:ideate / _shared)`, 'dim');
605
+ }
606
+
564
607
  return true;
565
608
  }
566
609
 
@@ -695,7 +738,7 @@ async function init(skipPrompts = false) {
695
738
  // Skip prompts if -y flag or non-interactive
696
739
  if (skipPrompts || !process.stdin.isTTY) {
697
740
  log('\n💡 Add optional addons later: run `npx devlyn-cli` without -y', 'dim');
698
- log(' Add Codex instructions later: run `npx devlyn-cli agents codex`', 'dim');
741
+ log(' Add Codex instructions + skills later: run `npx devlyn-cli agents codex`', 'dim');
699
742
  log(`\n${COLORS.dim} Enjoying devlyn? Star it on GitHub — it helps others find it:${COLORS.reset}`);
700
743
  log(` ${COLORS.purple}→ https://github.com/fysoul17/devlyn-cli${COLORS.reset}\n`);
701
744
  return;
@@ -703,14 +746,17 @@ async function init(skipPrompts = false) {
703
746
 
704
747
  // Ask which non-Claude CLIs should receive instruction files.
705
748
  log('\n🤖 Optional AI CLI instructions:\n', 'blue');
706
- const cliOptions = Object.entries(CLI_TARGETS).map(([key, cli]) => ({
707
- key,
708
- name: cli.name,
709
- desc: cli.configDir
710
- ? `Install agents into ${cli.configDir}/`
711
- : `Install ${cli.instructionsFile}`,
712
- type: 'cli',
713
- }));
749
+ const cliOptions = Object.entries(CLI_TARGETS).map(([key, cli]) => {
750
+ let desc;
751
+ if (cli.configDir) {
752
+ desc = `Install agents into ${cli.configDir}/`;
753
+ } else if (cli.skillsDir) {
754
+ desc = `Install ${cli.instructionsFile} + /devlyn:resolve + /devlyn:ideate skills (~/.codex/skills/)`;
755
+ } else {
756
+ desc = `Install ${cli.instructionsFile}`;
757
+ }
758
+ return { key, name: cli.name, desc, type: 'cli' };
759
+ });
714
760
  const selectedClis = await multiSelect(cliOptions);
715
761
  if (selectedClis.length > 0) {
716
762
  let agentsInstalled = 0;
@@ -720,7 +766,7 @@ async function init(skipPrompts = false) {
720
766
  log(` ✅ Agent instructions installed for ${agentsInstalled} CLI${agentsInstalled !== 1 ? 's' : ''}`, 'green');
721
767
  } else {
722
768
  log('💡 No additional CLI instructions selected', 'dim');
723
- log(' Run `npx devlyn-cli agents codex` later to install Codex AGENTS.md', 'dim');
769
+ log(' Run `npx devlyn-cli agents codex` later to install Codex AGENTS.md + /devlyn skills', 'dim');
724
770
  }
725
771
 
726
772
  // Ask about optional addons (local skills + external packs)
@@ -26,6 +26,7 @@ PER_RUN_PATTERNS = (
26
26
  "*.log.md",
27
27
  "fix-batch.round-*.json",
28
28
  "criteria.generated.md",
29
+ "risk-probes.jsonl",
29
30
  # iter-0019.8: spec-verify carrier artifacts get archived alongside
30
31
  # other per-run state. Killed mid-run cleanup is enforced separately
31
32
  # by spec-verify-check.py main() — when source markdown has no json
@@ -35,6 +36,7 @@ PER_RUN_PATTERNS = (
35
36
  "spec-verify.json",
36
37
  "spec-verify.results.json",
37
38
  "spec-verify-findings.jsonl",
39
+ "verify-merge.summary.json",
38
40
  # iter-0033a/2026-04-30 archive-fix iter: NEW /devlyn:resolve emits
39
41
  # plan.md (PLAN output) + final-report.md (PHASE 6 render) +
40
42
  # cumulative.patch (cumulative diff). Smoke 2's archive listing
@@ -52,6 +54,7 @@ PER_RUN_PATTERNS = (
52
54
  # ("pair_judge findings archive distinguishable") would false-fail on
53
55
  # every paired fixture without this glob.
54
56
  "verify-judge-*.md",
57
+ "codex-judge.*",
55
58
  )
56
59
 
57
60
 
@@ -6,7 +6,7 @@ Single source of truth for how every skill calls Codex. **MCP is not used.** Ski
6
6
 
7
7
  All long-running Codex calls go through `codex-monitored.sh` — a thin wrapper that closes stdin (codex 0.124.0 hangs when both stdin is open and a prompt arg is given), streams Codex stdout fully (no `tail -n` truncation), and prints a `[codex-monitored] heartbeat` line every 30s so the outer `claude -p` byte-watchdog stays fed during long reasoning gaps. The wrapper passes its arguments through verbatim to the underlying CLI, so the canonical flag set is unchanged from a raw call — only the launcher differs.
8
8
 
9
- **Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex.
9
+ **Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex. Read-only critique returns findings on stdout; the orchestrator writes any files.
10
10
 
11
11
  ```bash
12
12
  bash .claude/skills/_shared/codex-monitored.sh \
@@ -51,4 +51,4 @@ The local Codex CLI (fronted by `codex-monitored.sh`) is the primary (and only)
51
51
 
52
52
  Skills write the invocation as a Bash command the runtime executes. Example shape from `/devlyn:resolve` PHASE 2 IMPLEMENT when routed to Codex:
53
53
 
54
- > Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout.
54
+ > Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout. Do not pipe the wrapper; direct capture or file redirection preserves streaming and avoids the pipe-refusal exit.
@@ -41,7 +41,10 @@
41
41
  #
42
42
  # ENV OVERRIDES:
43
43
  # CODEX_MONITORED_HEARTBEAT — heartbeat interval seconds (default 30).
44
- # CODEX_BIN — real codex binary path. Default: `codex`.
44
+ # CODEX_MONITORED_TIMEOUT_SEC — optional hard timeout. When >0, kill the
45
+ # codex process group and exit 124.
46
+ # CODEX_BIN — real codex binary path. Default:
47
+ # CODEX_REAL_BIN when set, else `codex`.
45
48
  # Set this when the shim has put us first
46
49
  # on PATH.
47
50
  # CODEX_MONITORED_ALLOW_PIPED — set non-empty to skip the pipe-stdout
@@ -63,8 +66,10 @@ if [ -n "${CODEX_BLOCKED:-}" ]; then
63
66
  fi
64
67
 
65
68
  HEARTBEAT_SEC="${CODEX_MONITORED_HEARTBEAT:-30}"
66
- CODEX_BIN="${CODEX_BIN:-codex}"
69
+ TIMEOUT_SEC="${CODEX_MONITORED_TIMEOUT_SEC:-0}"
70
+ CODEX_BIN="${CODEX_BIN:-${CODEX_REAL_BIN:-codex}}"
67
71
  START=$(date +%s)
72
+ TIMEOUT_FLAG=""
68
73
 
69
74
  # --- Pipe-stdout refusal (iter-0009 R2 finding #1) -------------------------
70
75
  # `[ -p /dev/stdout ]` is the POSIX test for "is fd 1 a FIFO/pipe". Verified
@@ -106,35 +111,95 @@ heartbeat_loop() {
106
111
  done
107
112
  }
108
113
 
114
+ timeout_loop() {
115
+ local pid="$1"
116
+ local seconds="$2"
117
+ local flag="$3"
118
+ [ "$seconds" -gt 0 ] || return 0
119
+ sleep "$seconds"
120
+ if kill -0 "$pid" 2>/dev/null; then
121
+ : > "$flag"
122
+ printf '[codex-monitored] timeout: elapsed=%ds limit=%ds\n' \
123
+ "$(( $(date +%s) - START ))" "$seconds" >&2
124
+ kill -TERM -- "-$pid" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
125
+ sleep 5
126
+ kill -KILL -- "-$pid" 2>/dev/null || kill -KILL "$pid" 2>/dev/null || true
127
+ fi
128
+ }
129
+
130
+ terminate_process_group() {
131
+ local pgid="$1"
132
+ local reason="$2"
133
+ if ! kill -0 -- "-$pgid" 2>/dev/null; then
134
+ return 0
135
+ fi
136
+ printf '[codex-monitored] reap: reason=%s pgid=%s\n' "$reason" "$pgid" >&2
137
+ kill -TERM -- "-$pgid" 2>/dev/null || true
138
+ local i
139
+ for i in 1 2 3 4 5; do
140
+ sleep 1
141
+ if ! kill -0 -- "-$pgid" 2>/dev/null; then
142
+ return 0
143
+ fi
144
+ done
145
+ kill -KILL -- "-$pgid" 2>/dev/null || true
146
+ }
147
+
109
148
  forward_signal() {
110
149
  local sig="$1"
111
150
  if [ -n "${CODEX_PID:-}" ] && kill -0 "$CODEX_PID" 2>/dev/null; then
112
- kill -"$sig" "$CODEX_PID" 2>/dev/null || true
151
+ kill -"$sig" -- "-$CODEX_PID" 2>/dev/null || kill -"$sig" "$CODEX_PID" 2>/dev/null || true
113
152
  fi
114
153
  if [ -n "${HB_PID:-}" ] && kill -0 "$HB_PID" 2>/dev/null; then
115
154
  kill -TERM "$HB_PID" 2>/dev/null || true
116
155
  fi
156
+ if [ -n "${WATCHDOG_PID:-}" ] && kill -0 "$WATCHDOG_PID" 2>/dev/null; then
157
+ kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
158
+ fi
159
+ }
160
+
161
+ cleanup() {
162
+ forward_signal TERM
163
+ [ -z "$TIMEOUT_FLAG" ] || rm -f "$TIMEOUT_FLAG"
117
164
  }
118
165
 
119
- trap 'forward_signal TERM' TERM
120
- trap 'forward_signal INT' INT
166
+ trap 'forward_signal TERM; exit 143' TERM
167
+ trap 'forward_signal INT; exit 130' INT
168
+ trap cleanup EXIT
121
169
 
122
- printf '[codex-monitored] start: ts=%s heartbeat=%ds bin=%s\n' \
123
- "$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$CODEX_BIN" >&2
170
+ printf '[codex-monitored] start: ts=%s heartbeat=%ds timeout=%ss bin=%s\n' \
171
+ "$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$TIMEOUT_SEC" "$CODEX_BIN" >&2
124
172
 
125
173
  # Launch codex with stdin closed; output streams directly to OUR stdout/stderr.
174
+ set -m
126
175
  "$CODEX_BIN" exec "$@" < /dev/null &
127
176
  CODEX_PID=$!
177
+ set +m
128
178
  printf '[codex-monitored] codex pid=%d\n' "$CODEX_PID" >&2
129
179
 
130
180
  heartbeat_loop "$CODEX_PID" &
131
181
  HB_PID=$!
132
182
 
183
+ if [ "$TIMEOUT_SEC" -gt 0 ]; then
184
+ TIMEOUT_FLAG=$(mktemp "${TMPDIR:-/tmp}/codex-monitored-timeout.XXXXXX")
185
+ rm -f "$TIMEOUT_FLAG"
186
+ timeout_loop "$CODEX_PID" "$TIMEOUT_SEC" "$TIMEOUT_FLAG" &
187
+ WATCHDOG_PID=$!
188
+ fi
189
+
133
190
  wait "$CODEX_PID"
134
191
  EXIT=$?
192
+ terminate_process_group "$CODEX_PID" "post-exit-descendants"
135
193
 
136
194
  kill -TERM "$HB_PID" 2>/dev/null || true
137
195
  wait "$HB_PID" 2>/dev/null || true
196
+ if [ -n "${WATCHDOG_PID:-}" ]; then
197
+ kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
198
+ wait "$WATCHDOG_PID" 2>/dev/null || true
199
+ fi
200
+ if [ -n "$TIMEOUT_FLAG" ] && [ -f "$TIMEOUT_FLAG" ]; then
201
+ EXIT=124
202
+ fi
138
203
 
139
204
  printf '[codex-monitored] codex exited: code=%d elapsed=%ds\n' \
140
205
  "$EXIT" $(( $(date +%s) - START )) >&2