agent-control-plane 0.4.9 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/README.md +109 -13
  2. package/npm/bin/agent-control-plane.js +1 -1
  3. package/package.json +39 -33
  4. package/tools/bin/debug-session.sh +106 -0
  5. package/tools/bin/flow-config-lib.sh +13 -3508
  6. package/tools/bin/flow-execution-lib.sh +243 -0
  7. package/tools/bin/flow-forge-lib.sh +1770 -0
  8. package/tools/bin/flow-profile-lib.sh +335 -0
  9. package/tools/bin/flow-provider-lib.sh +981 -0
  10. package/tools/bin/flow-runtime-doctor-linux.sh +136 -0
  11. package/tools/bin/flow-runtime-doctor.sh +5 -1
  12. package/tools/bin/flow-session-lib.sh +317 -0
  13. package/tools/bin/install-project-systemd.sh +255 -0
  14. package/tools/bin/project-runtimectl.sh +45 -0
  15. package/tools/bin/project-systemd-bootstrap.sh +74 -0
  16. package/tools/bin/uninstall-project-systemd.sh +87 -0
  17. package/tools/dashboard/app.js +238 -8
  18. package/tools/dashboard/issue_queue_state.py +101 -0
  19. package/tools/dashboard/requirements.txt +3 -0
  20. package/tools/dashboard/server.py +250 -30
  21. package/tools/dashboard/styles.css +526 -455
  22. package/tools/bin/agent-cleanup-worktree +0 -247
  23. package/tools/bin/agent-github-update-labels +0 -105
  24. package/tools/bin/agent-init-worktree +0 -216
  25. package/tools/bin/agent-project-archive-run +0 -52
  26. package/tools/bin/agent-project-capture-worker +0 -46
  27. package/tools/bin/agent-project-catch-up-issue-pr-links +0 -118
  28. package/tools/bin/agent-project-catch-up-merged-prs +0 -195
  29. package/tools/bin/agent-project-catch-up-scheduled-issue-retries +0 -123
  30. package/tools/bin/agent-project-cleanup-session +0 -513
  31. package/tools/bin/agent-project-detached-launch +0 -127
  32. package/tools/bin/agent-project-heartbeat-loop +0 -1029
  33. package/tools/bin/agent-project-open-issue-worktree +0 -89
  34. package/tools/bin/agent-project-open-pr-worktree +0 -80
  35. package/tools/bin/agent-project-publish-issue-pr +0 -468
  36. package/tools/bin/agent-project-reconcile-issue-session +0 -1409
  37. package/tools/bin/agent-project-reconcile-pr-session +0 -1288
  38. package/tools/bin/agent-project-retry-state +0 -158
  39. package/tools/bin/agent-project-run-claude-session +0 -805
  40. package/tools/bin/agent-project-run-codex-resilient +0 -963
  41. package/tools/bin/agent-project-run-codex-session +0 -435
  42. package/tools/bin/agent-project-run-kilo-session +0 -369
  43. package/tools/bin/agent-project-run-ollama-session +0 -658
  44. package/tools/bin/agent-project-run-openclaw-session +0 -1309
  45. package/tools/bin/agent-project-run-opencode-session +0 -377
  46. package/tools/bin/agent-project-run-pi-session +0 -479
  47. package/tools/bin/agent-project-sync-anchor-repo +0 -139
  48. package/tools/bin/agent-project-sync-source-repo-main +0 -163
  49. package/tools/bin/agent-project-worker-status +0 -188
  50. package/tools/bin/branch-verification-guard.sh +0 -364
  51. package/tools/bin/capture-worker.sh +0 -18
  52. package/tools/bin/cleanup-worktree.sh +0 -52
  53. package/tools/bin/codex-quota +0 -31
  54. package/tools/bin/create-follow-up-issue.sh +0 -114
  55. package/tools/bin/dashboard-launchd-bootstrap.sh +0 -50
  56. package/tools/bin/issue-publish-localization-guard.sh +0 -142
  57. package/tools/bin/issue-publish-scope-guard.sh +0 -242
  58. package/tools/bin/issue-requires-local-workspace-install.sh +0 -31
  59. package/tools/bin/issue-resource-class.sh +0 -12
  60. package/tools/bin/kick-scheduler.sh +0 -75
  61. package/tools/bin/label-follow-up-issues.sh +0 -14
  62. package/tools/bin/new-pr-worktree.sh +0 -50
  63. package/tools/bin/new-worktree.sh +0 -49
  64. package/tools/bin/pr-risk.sh +0 -12
  65. package/tools/bin/prepare-worktree.sh +0 -142
  66. package/tools/bin/provider-cooldown-state.sh +0 -204
  67. package/tools/bin/publish-issue-worker.sh +0 -31
  68. package/tools/bin/reconcile-bootstrap-lib.sh +0 -113
  69. package/tools/bin/reconcile-issue-worker.sh +0 -34
  70. package/tools/bin/reconcile-pr-worker.sh +0 -34
  71. package/tools/bin/record-verification.sh +0 -71
  72. package/tools/bin/render-flow-config.sh +0 -98
  73. package/tools/bin/resident-issue-controller-lib.sh +0 -448
  74. package/tools/bin/retry-state.sh +0 -31
  75. package/tools/bin/reuse-issue-worktree.sh +0 -121
  76. package/tools/bin/run-codex-bypass.sh +0 -3
  77. package/tools/bin/run-codex-safe.sh +0 -3
  78. package/tools/bin/run-codex-task.sh +0 -280
  79. package/tools/bin/serve-dashboard.sh +0 -5
  80. package/tools/bin/start-issue-worker.sh +0 -943
  81. package/tools/bin/start-pr-fix-worker.sh +0 -528
  82. package/tools/bin/start-pr-merge-repair-worker.sh +0 -8
  83. package/tools/bin/start-pr-review-worker.sh +0 -261
  84. package/tools/bin/start-resident-issue-loop.sh +0 -499
  85. package/tools/bin/update-github-labels.sh +0 -14
  86. package/tools/bin/worker-status.sh +0 -19
  87. package/tools/bin/workflow-catalog.sh +0 -77
@@ -1,963 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5
- # shellcheck source=/dev/null
6
- source "${SCRIPT_DIR}/flow-config-lib.sh"
7
-
8
# Print the command synopsis and the option summary on stdout.
# Callers redirect to stderr themselves when reporting a usage error.
usage() {
  cat <<'USAGE_TEXT'
Usage:
agent-project-run-codex-resilient --mode safe|bypass --worktree <path> --prompt-file <path> --output-file <path> --host-run-dir <path> --sandbox-run-dir <path> --codex-bin <path> [options]

Run Codex with persisted thread recovery for quota/auth interruptions.

Options:
--safe-profile <name> Codex profile for safe mode
--bypass-profile <name> Codex profile for bypass mode
--max-resume-attempts <count> Maximum resume attempts after interruption
--auth-refresh-timeout-seconds <secs> How long to wait for refreshed auth before failing
--auth-refresh-poll-seconds <secs> Poll interval while waiting for refreshed auth
--stall-seconds <secs> Fail if Codex stops producing output for too long
--help Show this help
USAGE_TEXT
}
25
-
26
# Required command-line inputs; filled in by the option parser.
mode="" worktree="" prompt_file="" output_file=""
host_run_dir="" sandbox_run_dir=""

# Codex profile used for each execution mode, and the Codex binary path.
safe_profile="default"
bypass_profile="default"
codex_bin=""

# Tunables: the ACP_* variable wins, then the legacy F_LOSNING_* name,
# then the built-in default. Empty values count as unset.
max_resume_attempts="${ACP_CODEX_MAX_RESUME_ATTEMPTS:-}"
: "${max_resume_attempts:=${F_LOSNING_CODEX_MAX_RESUME_ATTEMPTS:-6}}"

auth_refresh_timeout_seconds="${ACP_CODEX_AUTH_REFRESH_TIMEOUT_SECONDS:-}"
: "${auth_refresh_timeout_seconds:=${F_LOSNING_CODEX_AUTH_REFRESH_TIMEOUT_SECONDS:-900}}"

auth_refresh_poll_seconds="${ACP_CODEX_AUTH_REFRESH_POLL_SECONDS:-}"
: "${auth_refresh_poll_seconds:=${F_LOSNING_CODEX_AUTH_REFRESH_POLL_SECONDS:-10}}"

max_quota_autoswitch_attempts="${ACP_CODEX_MAX_AUTOSWITCH_ATTEMPTS:-}"
: "${max_quota_autoswitch_attempts:=${F_LOSNING_CODEX_MAX_AUTOSWITCH_ATTEMPTS:-1}}"

codex_progress_heartbeat_seconds="${ACP_CODEX_PROGRESS_HEARTBEAT_SECONDS:-}"
: "${codex_progress_heartbeat_seconds:=${F_LOSNING_CODEX_PROGRESS_HEARTBEAT_SECONDS:-30}}"

codex_stall_seconds="${ACP_CODEX_STALL_SECONDS:-}"
: "${codex_stall_seconds:=${F_LOSNING_CODEX_STALL_SECONDS:-300}}"

# Resolved later, once arguments have been validated.
python_bin=""
42
-
43
# Print a Python interpreter to use, trying in order: python3 on PATH,
# the Homebrew python3 at /opt/homebrew/bin, then python on PATH.
# Returns 1 (printing nothing) when none of them is available.
resolve_python_bin() {
  local found=""

  if found="$(command -v python3 2>/dev/null)"; then
    printf '%s\n' "$found"
    return 0
  fi

  if [[ -x /opt/homebrew/bin/python3 ]]; then
    printf '%s\n' "/opt/homebrew/bin/python3"
    return 0
  fi

  if found="$(command -v python 2>/dev/null)"; then
    printf '%s\n' "$found"
    return 0
  fi

  return 1
}
58
-
59
# Parse command-line options into the global setting variables.
#
# Arguments: the script's positional parameters.
# Globals written: mode, worktree, prompt_file, output_file, host_run_dir,
#   sandbox_run_dir, safe_profile, bypass_profile, codex_bin,
#   max_resume_attempts, auth_refresh_timeout_seconds,
#   auth_refresh_poll_seconds, codex_stall_seconds.
# Exits 0 after printing usage for --help/-h; exits 1 (with a message and the
# usage text on stderr) for an unknown option or an option missing its value.
parse_cli_args() {
  while (( $# > 0 )); do
    case "$1" in
      --help|-h)
        usage
        exit 0
        ;;
      --mode|--worktree|--prompt-file|--output-file|--host-run-dir|--sandbox-run-dir|--safe-profile|--bypass-profile|--codex-bin|--max-resume-attempts|--auth-refresh-timeout-seconds|--auth-refresh-poll-seconds|--stall-seconds)
        # Without this guard a trailing option makes `shift 2` fail and the
        # script dies under `set -e` without any diagnostic.
        if (( $# < 2 )); then
          echo "Missing value for $1" >&2
          usage >&2
          exit 1
        fi
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 1
        ;;
    esac

    case "$1" in
      --mode) mode="$2" ;;
      --worktree) worktree="$2" ;;
      --prompt-file) prompt_file="$2" ;;
      --output-file) output_file="$2" ;;
      --host-run-dir) host_run_dir="$2" ;;
      --sandbox-run-dir) sandbox_run_dir="$2" ;;
      --safe-profile) safe_profile="$2" ;;
      --bypass-profile) bypass_profile="$2" ;;
      --codex-bin) codex_bin="$2" ;;
      --max-resume-attempts) max_resume_attempts="$2" ;;
      --auth-refresh-timeout-seconds) auth_refresh_timeout_seconds="$2" ;;
      --auth-refresh-poll-seconds) auth_refresh_poll_seconds="$2" ;;
      --stall-seconds) codex_stall_seconds="$2" ;;
    esac
    shift 2
  done
}

parse_cli_args "$@"
78
-
79
# Every one of these settings must have been supplied on the command line;
# otherwise print the usage text on stderr and fail.
for required_setting in mode worktree prompt_file output_file host_run_dir sandbox_run_dir codex_bin; do
  if [[ -z "${!required_setting}" ]]; then
    usage >&2
    exit 1
  fi
done
unset required_setting

# Only the two documented execution modes are accepted.
if [[ "$mode" != "safe" && "$mode" != "bypass" ]]; then
  echo "--mode must be safe or bypass" >&2
  exit 1
fi
91
-
92
# require_uint <label> <value>
# Exit 1 with "<label> must be numeric" on stderr unless <value> is a
# non-empty string of decimal digits.
require_uint() {
  case "${2-}" in
    ''|*[!0-9]*)
      echo "$1 must be numeric" >&2
      exit 1
      ;;
  esac
}

require_uint "--max-resume-attempts" "$max_resume_attempts"
require_uint "--auth-refresh-timeout-seconds" "$auth_refresh_timeout_seconds"
require_uint "--auth-refresh-poll-seconds" "$auth_refresh_poll_seconds"
require_uint "ACP_CODEX_MAX_AUTOSWITCH_ATTEMPTS" "$max_quota_autoswitch_attempts"
require_uint "ACP_CODEX_PROGRESS_HEARTBEAT_SECONDS" "$codex_progress_heartbeat_seconds"
require_uint "ACP_CODEX_STALL_SECONDS" "$codex_stall_seconds"

# Force base-10 so that zero-padded input such as "08" (an invalid octal
# literal in shell arithmetic) or "010" (octal 8) behaves as the decimal
# number the user typed when these values are used in $(( )) / (( )).
max_resume_attempts=$((10#$max_resume_attempts))
auth_refresh_timeout_seconds=$((10#$auth_refresh_timeout_seconds))
auth_refresh_poll_seconds=$((10#$auth_refresh_poll_seconds))
max_quota_autoswitch_attempts=$((10#$max_quota_autoswitch_attempts))
codex_progress_heartbeat_seconds=$((10#$codex_progress_heartbeat_seconds))
codex_stall_seconds=$((10#$codex_stall_seconds))

# The heartbeat interval is used as a sleep duration, so zero (including
# spellings like "00") is rejected.
if (( codex_progress_heartbeat_seconds == 0 )); then
  echo "ACP_CODEX_PROGRESS_HEARTBEAT_SECONDS must be greater than zero" >&2
  exit 1
fi
111
-
112
# Locate the interpreter used by the embedded Python helpers; a failed lookup
# leaves python_bin empty and is reported below.
if ! python_bin="$(resolve_python_bin)"; then
  python_bin=""
fi

if [[ -z "$python_bin" ]] || [[ ! -x "$python_bin" ]]; then
  echo "unable to resolve a runnable python interpreter for codex supervision" >&2
  exit 1
fi
117
-
118
# Paths derived from the skill directory, the run directory and $HOME.
FLOW_SKILL_DIR="$(resolve_flow_skill_dir "${BASH_SOURCE[0]}")"
state_file="${host_run_dir}/runner.env"
auth_file="${HOME}/.codex/auth.json"

# SHARED_AGENT_HOME from the environment wins; otherwise ask the config library.
shared_agent_home="${SHARED_AGENT_HOME:-}"
if [[ -z "$shared_agent_home" ]]; then
  shared_agent_home="$(resolve_shared_agent_home "${FLOW_SKILL_DIR}")"
fi

quota_tool_bin="$(flow_resolve_codex_quota_bin "${FLOW_SKILL_DIR}")"
quota_manager_script="$(flow_resolve_codex_quota_manager_script "${FLOW_SKILL_DIR}")"

# Quota tunables: ACP_* first, then the legacy F_LOSNING_* name, then the default.
quota_autoswitch_enabled="${ACP_CODEX_QUOTA_AUTOSWITCH_ENABLED:-}"
: "${quota_autoswitch_enabled:=${F_LOSNING_CODEX_QUOTA_AUTOSWITCH_ENABLED:-1}}"

quota_threshold="${ACP_CODEX_QUOTA_THRESHOLD:-}"
: "${quota_threshold:=${F_LOSNING_CODEX_QUOTA_THRESHOLD:-70}}"

quota_weekly_threshold="${ACP_CODEX_QUOTA_WEEKLY_THRESHOLD:-}"
: "${quota_weekly_threshold:=${F_LOSNING_CODEX_QUOTA_WEEKLY_THRESHOLD:-90}}"

quota_soft_threshold="${ACP_CODEX_QUOTA_SOFT_THRESHOLD:-}"
: "${quota_soft_threshold:=${F_LOSNING_CODEX_QUOTA_SOFT_THRESHOLD:-55}}"

quota_soft_worker_threshold="${ACP_CODEX_QUOTA_SOFT_WORKER_THRESHOLD:-}"
: "${quota_soft_worker_threshold:=${F_LOSNING_CODEX_QUOTA_SOFT_WORKER_THRESHOLD:-8}}"

quota_emergency_threshold="${ACP_CODEX_QUOTA_EMERGENCY_THRESHOLD:-}"
: "${quota_emergency_threshold:=${F_LOSNING_CODEX_QUOTA_EMERGENCY_THRESHOLD:-65}}"

quota_emergency_worker_threshold="${ACP_CODEX_QUOTA_EMERGENCY_WORKER_THRESHOLD:-}"
: "${quota_emergency_worker_threshold:=${F_LOSNING_CODEX_QUOTA_EMERGENCY_WORKER_THRESHOLD:-12}}"

quota_switch_cooldown_seconds="${ACP_CODEX_QUOTA_SWITCH_COOLDOWN_SECONDS:-}"
: "${quota_switch_cooldown_seconds:=${F_LOSNING_CODEX_QUOTA_SWITCH_COOLDOWN_SECONDS:-600}}"

quota_timeout_seconds="${ACP_CODEX_QUOTA_TIMEOUT_SECONDS:-}"
: "${quota_timeout_seconds:=${F_LOSNING_CODEX_QUOTA_TIMEOUT_SECONDS:-45}}"

# No built-in default: an empty preferred label means "no preference".
quota_prefer_label="${ACP_CODEX_QUOTA_PREFER_LABEL:-}"
: "${quota_prefer_label:=${F_LOSNING_CODEX_QUOTA_PREFER_LABEL:-}}"

# Explicit override first; otherwise a file under the XDG cache directory
# (falling back to ~/.cache).
quota_switch_state_file="${CODEX_QUOTA_MANAGER_SWITCH_STATE_FILE:-}"
if [[ -z "$quota_switch_state_file" ]]; then
  quota_switch_state_file="${XDG_CACHE_HOME:-$HOME/.cache}/codex-quota-manager/last-switch.env"
fi

# Session-name prefixes read from the flow configuration.
config_yaml="$(resolve_flow_config_yaml "${BASH_SOURCE[0]}")"
issue_session_prefix="$(flow_resolve_issue_session_prefix "${config_yaml}")"
pr_session_prefix="$(flow_resolve_pr_session_prefix "${config_yaml}")"
138
-
139
# Point npm at a dedicated cache so verification steps are not affected by a
# broken user-global cache. An npm cache already configured in the environment
# is respected; otherwise ACP_NPM_CACHE_DIR, then F_LOSNING_NPM_CACHE_DIR, then
# ~/.agent-runtime/npm-cache is used.
npm_cache_dir="${NPM_CONFIG_CACHE:-}"
: "${npm_cache_dir:=${npm_config_cache:-}}"
: "${npm_cache_dir:=${ACP_NPM_CACHE_DIR:-}}"
: "${npm_cache_dir:=${F_LOSNING_NPM_CACHE_DIR:-${HOME}/.agent-runtime/npm-cache}}"

# Export both spellings of the variable.
export NPM_CONFIG_CACHE="${npm_cache_dir}" npm_config_cache="${npm_cache_dir}"

# Best effort: a cache directory that cannot be created must not abort the run.
mkdir -p "${npm_cache_dir}" 2>/dev/null || true
147
-
148
# Mutable per-run bookkeeping shared by the functions below.
thread_id=""
attempt=0
resume_count=0

# Outcome of the most recent Codex attempt.
last_exit_code="" last_failure_reason="" last_trigger_reason=""

# Auth-refresh wait tracking.
auth_wait_started_at="" last_auth_fingerprint=""

# Snapshot taken when the most recent attempt started.
last_attempt_start_size=0
last_attempt_start_quota_label=""
last_attempt_started_epoch=0

# Result of the most recent quota auto-switch.
last_quota_switch_status="" last_quota_next_retry_at="" last_quota_selected_label=""
quota_autoswitch_attempt_count=0

# Make sure the run directory and the output log exist before anything logs.
mkdir -p "$host_run_dir"
touch "$output_file"
166
-
167
# Write one UTC-timestamped line to stdout and append it to $output_file.
# Arguments: $1 - message text; an empty or missing message is a silent no-op.
log_runner() {
  local message="${1:-}"
  local stamp

  if [[ -z "$message" ]]; then
    return 0
  fi

  stamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  printf '[%s] %s\n' "$stamp" "$message" | tee -a "$output_file"
}
172
-
173
# Persist the runner state to $state_file as shell-quoted KEY=value lines.
#
# Arguments:
#   $1 - runner state label (required)
#   $2 - failure reason; when empty or omitted, $last_failure_reason is written
#        NOTE(review): callers pass "" here, which therefore does NOT clear the
#        stored reason — confirm that is the intended meaning.
# Globals read: state_file, thread_id, attempt, resume_count, last_exit_code,
#   last_failure_reason, last_trigger_reason, auth_wait_started_at,
#   last_auth_fingerprint.
# Returns: non-zero if the temporary file cannot be created, written or renamed.
#
# The file is written to a uniquely named temporary sibling and renamed into
# place. A `$$`-based name is not unique here: `$$` keeps the parent's PID in
# subshells, so the background heartbeat and the main shell would share it.
write_state() {
  local runner_state="${1:?runner state required}"
  local failure_reason="${2:-$last_failure_reason}"
  local updated_at
  local tmp_file

  updated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  tmp_file="$(mktemp "${state_file}.tmp.XXXXXX")" || return 1

  if ! {
    printf 'RUNNER_STATE=%q\n' "$runner_state"
    printf 'THREAD_ID=%q\n' "$thread_id"
    printf 'ATTEMPT=%s\n' "$attempt"
    printf 'RESUME_COUNT=%s\n' "$resume_count"
    printf 'LAST_EXIT_CODE=%q\n' "$last_exit_code"
    printf 'LAST_FAILURE_REASON=%q\n' "$failure_reason"
    printf 'LAST_TRIGGER_REASON=%q\n' "$last_trigger_reason"
    printf 'AUTH_WAIT_STARTED_AT=%q\n' "$auth_wait_started_at"
    printf 'LAST_AUTH_FINGERPRINT=%q\n' "$last_auth_fingerprint"
    printf 'UPDATED_AT=%q\n' "$updated_at"
  } >"$tmp_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi

  if ! mv -- "$tmp_file" "$state_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi
}
194
-
195
# Invoke $codex_bin with the given arguments, with CODEX_THREAD_ID removed
# from its environment: a nested worker must not inherit the parent's thread
# id (the wrapper persists the child thread id itself).
run_codex_command() {
  local -a codex_argv=("$codex_bin" "$@")
  env -u CODEX_THREAD_ID "${codex_argv[@]}"
}
199
-
200
# Print (without a trailing newline) what a recovery would resume:
# "thread <id>" when a thread id is known, else "initial Codex exec".
codex_recovery_target() {
  if [[ -z "$thread_id" ]]; then
    printf 'initial Codex exec'
    return 0
  fi
  printf 'thread %s' "$thread_id"
}
207
-
208
# Run a command under a wall-clock limit using the resolved Python interpreter.
#
# Arguments: $1 - timeout in seconds (parsed as a float); $2.. - command argv.
# Behaviour: the command runs in its own session with stdout/stderr captured
#   and replayed once it finishes. On timeout its process group receives
#   SIGTERM, then SIGKILL if it has not exited within 2 seconds.
# Exit status: the command's own status, 124 on timeout, 64 if no command
#   was given.
run_with_timeout() {
  local timeout_seconds="${1:?timeout seconds required}"
  shift

  "$python_bin" - "$timeout_seconds" "$@" <<'PY'
import os
import signal
import subprocess
import sys


def replay(out, err):
    """Forward the captured child output to our own stdout/stderr."""
    if out:
        sys.stdout.buffer.write(out)
    if err:
        sys.stderr.buffer.write(err)


def signal_group(pgid, signum):
    """Signal the child's process group, ignoring an already-gone group."""
    try:
        os.killpg(pgid, signum)
    except ProcessLookupError:
        pass


limit = float(sys.argv[1])
command = sys.argv[2:]

if not command:
    sys.exit(64)

child = subprocess.Popen(
    command,
    start_new_session=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

try:
    out, err = child.communicate(timeout=limit)
except subprocess.TimeoutExpired:
    signal_group(child.pid, signal.SIGTERM)
    try:
        out, err = child.communicate(timeout=2)
    except subprocess.TimeoutExpired:
        signal_group(child.pid, signal.SIGKILL)
        out, err = child.communicate()
    replay(out, err)
    sys.exit(124)

replay(out, err)
sys.exit(child.returncode)
PY
}
254
-
255
# Print the size of a file in bytes.
# Arguments: $1 - path.
# Tries the BSD/macOS `stat -f %z` form, then the GNU `stat -c %s` form, and
# finally asks Python, which prints 0 when the file cannot be inspected.
stat_file_size() {
  local path="${1:?path required}"
  local probe=""
  local spec

  # Each spec is "<stat flag> <format>".
  for spec in 'f %z' 'c %s'; do
    probe="$(stat "-${spec%% *}" "${spec#* }" "$path" 2>/dev/null || true)"
    if [[ "$probe" =~ ^[0-9]+$ ]]; then
      printf '%s\n' "$probe"
      return 0
    fi
  done

  "$python_bin" - "$path" <<'PY'
import os
import sys

try:
    print(os.path.getsize(sys.argv[1]))
except OSError:
    print("0")
PY
}
281
-
282
# Print a file's modification time as whole seconds since the epoch.
# Arguments: $1 - path.
# Tries the BSD/macOS `stat -f %m` form, then the GNU `stat -c %Y` form, and
# finally asks Python, which prints 0 when the file cannot be inspected.
stat_file_mtime() {
  local path="${1:?path required}"
  local probe=""
  local spec

  # Each spec is "<stat flag> <format>".
  for spec in 'f %m' 'c %Y'; do
    probe="$(stat "-${spec%% *}" "${spec#* }" "$path" 2>/dev/null || true)"
    if [[ "$probe" =~ ^[0-9]+$ ]]; then
      printf '%s\n' "$probe"
      return 0
    fi
  done

  "$python_bin" - "$path" <<'PY'
import os
import sys

try:
    print(int(os.path.getmtime(sys.argv[1])))
except OSError:
    print("0")
PY
}
308
-
309
- auth_fingerprint() {
310
- if [[ ! -f "$auth_file" ]]; then
311
- printf 'missing\n'
312
- return 0
313
- fi
314
-
315
- local mtime size sha
316
- mtime="$(stat_file_mtime "$auth_file" 2>/dev/null || printf '0')"
317
- size="$(stat_file_size "$auth_file" 2>/dev/null || printf '0')"
318
- sha="$(shasum -a 256 "$auth_file" | awk '{print $1}')"
319
- printf '%s:%s:%s\n' "$mtime" "$size" "$sha"
320
- }
321
-
322
# Print the label of the currently active Codex quota account, as reported by
# `$quota_tool_bin codex list --json`. Prints an empty line when the tool is
# not executable, jq is missing, or the query fails.
quota_active_label() {
  local label_filter='
.activeInfo.trackedLabel
// .activeInfo.activeLabel
// ([.accounts[]? | select(.isActive == true or .isNativeActive == true)][0].label)
// empty
'

  if [[ -x "$quota_tool_bin" ]] && command -v jq >/dev/null 2>&1; then
    "$quota_tool_bin" codex list --json 2>/dev/null \
      | jq -r "$label_filter" 2>/dev/null \
      || printf '\n'
    return
  fi

  printf '\n'
  return 0
}
337
-
338
- quota_switch_signature() {
339
- if [[ ! -f "$quota_switch_state_file" ]]; then
340
- printf 'missing\n'
341
- return 0
342
- fi
343
-
344
- local mtime size sha
345
- mtime="$(stat_file_mtime "$quota_switch_state_file" 2>/dev/null || printf '0')"
346
- size="$(stat_file_size "$quota_switch_state_file" 2>/dev/null || printf '0')"
347
- sha="$(shasum -a 256 "$quota_switch_state_file" | awk '{print $1}')"
348
- printf '%s:%s:%s\n' "$mtime" "$size" "$sha"
349
- }
350
-
351
# Print how many tmux sessions have a name starting with the issue or PR
# session prefix. Prints 0 when tmux is not installed or lists no sessions.
running_workers() {
  if command -v tmux >/dev/null 2>&1; then
    { tmux list-sessions -F '#S' 2>/dev/null || true; } \
      | awk -v issue_prefix="$issue_session_prefix" -v pr_prefix="$pr_session_prefix" '
index($0, issue_prefix) == 1 || index($0, pr_prefix) == 1 { count++ }
END { print count + 0 }
'
    return
  fi

  printf '0\n'
  return 0
}
363
-
364
# Print the value of the last "KEY=value" line in a multi-line payload.
#
# Arguments: $1 - key (matched literally at the start of a line);
#            $2 - payload text.
# Output: the value followed by a newline, or nothing when the key is absent.
#
# The key is compared as a literal prefix rather than being spliced into a
# regular expression, so characters such as "." or "[" in a key cannot match
# unrelated lines.
extract_kv_value() {
  local key="${1:?key required}"
  local payload="${2:-}"
  local prefix="${key}="
  local line=""
  local value=""
  local found=0

  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" == "$prefix"* ]]; then
      value="${line#"$prefix"}"
      found=1
    fi
  done <<<"$payload"

  if (( found )); then
    printf '%s\n' "$value"
  fi
}
369
-
370
# Ask the quota manager script to switch to another Codex account after a
# failure.
#
# Globals read: quota_autoswitch_enabled, quota_manager_script, quota_tool_bin,
#   quota_threshold, quota_weekly_threshold, quota_prefer_label,
#   quota_timeout_seconds, last_failure_reason, last_attempt_start_quota_label,
#   output_file.
# Globals written: last_quota_switch_status, last_quota_next_retry_at,
#   last_quota_selected_label, quota_autoswitch_attempt_count.
# Returns: 0 when the manager reports "switched" or "current-ok"; 10 when the
#   switch is deferred; 1 when auto-switch is disabled, unavailable, or the
#   manager succeeded with an unrecognised decision; otherwise the manager's
#   own non-zero status (124 means it timed out).
run_quota_autoswitch() {
  local quota_output=""
  local quota_status=0
  local -a quota_cmd
  local worker_count
  local unavailable_reason=""

  # Print why auto-switch cannot run, or "ok" when every prerequisite is met.
  quota_autoswitch_unavailable_reason() {
    local verdict="ok"
    if [[ "$quota_autoswitch_enabled" == "0" ]]; then
      verdict="disabled"
    elif [[ ! -x "$quota_manager_script" ]]; then
      verdict="missing-script"
    elif [[ ! -x "$quota_tool_bin" ]]; then
      verdict="missing-codex-quota"
    elif ! command -v jq >/dev/null 2>&1; then
      verdict="missing-jq"
    fi
    printf '%s\n' "$verdict"
  }

  last_quota_next_retry_at=""
  last_quota_selected_label=""

  if [[ "$quota_autoswitch_enabled" == "0" ]]; then
    log_runner "quota auto-switch disabled; waiting for external Codex auth refresh"
    printf 'CODEX_QUOTA_AUTOSWITCH_ENABLED=0\n' | tee -a "$output_file"
    last_quota_switch_status="disabled"
    return 1
  fi

  unavailable_reason="$(quota_autoswitch_unavailable_reason)"
  if [[ "$unavailable_reason" != "ok" ]]; then
    log_runner "quota auto-switch unavailable (${unavailable_reason}); waiting for external Codex auth refresh"
    printf 'CODEX_QUOTA_MANAGER_UNAVAILABLE=yes\n' | tee -a "$output_file"
    printf 'CODEX_QUOTA_MANAGER_REASON=%s\n' "$unavailable_reason" | tee -a "$output_file"
    last_quota_switch_status="unavailable"
    return 1
  fi

  quota_autoswitch_attempt_count=$((quota_autoswitch_attempt_count + 1))
  worker_count="$(running_workers)"

  quota_cmd=(
    env
    "CODEX_QUOTA_BIN=${quota_tool_bin}"
    bash "$quota_manager_script"
    --trigger-reason "$last_failure_reason"
    --current-label "${last_attempt_start_quota_label}"
    --five-hour-threshold "$quota_threshold"
    --weekly-threshold "$quota_weekly_threshold"
    --running-workers "$worker_count"
  )
  if [[ -n "$quota_prefer_label" ]]; then
    quota_cmd+=(--prefer-label "$quota_prefer_label")
  fi

  log_runner "${last_failure_reason} detected; attempting failure-driven Codex account switch"

  # Capture the manager's status without letting errexit end the script.
  quota_output="$(run_with_timeout "$quota_timeout_seconds" "${quota_cmd[@]}" 2>&1)" || quota_status=$?

  if [[ "$quota_status" == "0" ]]; then
    last_quota_switch_status="$(extract_kv_value "SWITCH_DECISION" "$quota_output")"
    last_quota_next_retry_at="$(extract_kv_value "NEXT_RETRY_AT" "$quota_output")"
    last_quota_selected_label="$(extract_kv_value "SELECTED_LABEL" "$quota_output")"
    if [[ -n "$quota_output" ]]; then
      printf '%s\n' "$quota_output" | tee -a "$output_file"
    fi

    if [[ "$last_quota_switch_status" == "switched" || "$last_quota_switch_status" == "current-ok" ]]; then
      return 0
    fi
    if [[ "$last_quota_switch_status" == "deferred" ]]; then
      return 10
    fi
    last_quota_switch_status="failed"
    return 1
  fi

  last_quota_next_retry_at="$(extract_kv_value "NEXT_RETRY_AT" "${quota_output:-}")"
  last_quota_selected_label="$(extract_kv_value "SELECTED_LABEL" "${quota_output:-}")"
  if [[ -n "${quota_output:-}" ]]; then
    printf '%s\n' "$quota_output" | tee -a "$output_file"
  fi

  if [[ "$quota_status" == "10" ]]; then
    last_quota_switch_status="deferred"
    log_runner "no eligible Codex account is ready yet; waiting for the next reset window"
    return 10
  fi

  last_quota_switch_status="failed"
  if [[ "$quota_status" == "124" ]]; then
    log_runner "quota auto-switch timed out after ${quota_timeout_seconds}s"
  else
    log_runner "quota auto-switch exited with status ${quota_status}"
  fi
  return "$quota_status"
}
475
-
476
# Print the bytes of $output_file that lie beyond a given byte offset.
# Arguments: $1 - offset (the file size recorded when the attempt started).
# Prints nothing when the file has not grown past that offset.
new_output_since() {
  local start_size="${1:?start size required}"
  local current_size

  current_size="$(stat_file_size "$output_file" 2>/dev/null || printf '0')"
  if (( current_size > start_size )); then
    tail -c "+$((start_size + 1))" "$output_file"
  fi
}
485
-
486
# Refresh the global thread_id from output written after a byte offset.
# Arguments: $1 - offset into $output_file where the attempt started.
# thread_id is left untouched when no thread id is found in the new output.
update_thread_id_from_output() {
  local start_size="${1:?start size required}"
  local discovered_id

  discovered_id="$(new_output_since "$start_size" | extract_thread_id || true)"
  if [[ -z "$discovered_id" ]]; then
    return 0
  fi
  thread_id="$discovered_id"
}
495
-
496
# Print the thread id carried by a single `thread.started` JSON event line.
#
# Arguments: $1 - one line of Codex JSON output.
# Output: the thread id followed by a newline, or nothing when the line is not
#   a thread.started event or carries no non-empty thread_id string.
#
# The "type" and "thread_id" members are matched independently, so the result
# does not depend on the order in which the producer serialises them.
# This is a lightweight per-line check, not a JSON parser.
extract_thread_id_from_line() {
  local line="${1:-}"
  local type_re='"type"[[:space:]]*:[[:space:]]*"thread\.started"'
  local id_re='"thread_id"[[:space:]]*:[[:space:]]*"([^"]+)"'

  if [[ ! "$line" =~ $type_re ]]; then
    return 0
  fi
  if [[ "$line" =~ $id_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  fi
}
500
-
501
# Adopt and persist a thread id announced on one line of Codex output.
# Arguments: $1 - the output line.
# When the line yields a thread id that differs from the current one, the
# global thread_id is updated and the state file is rewritten as "running".
persist_thread_id_from_line() {
  local line="${1:-}"
  local candidate=""

  candidate="$(extract_thread_id_from_line "$line")"
  if [[ -z "$candidate" || "$candidate" == "$thread_id" ]]; then
    return 0
  fi

  thread_id="$candidate"
  write_state "running" ""
}
511
-
512
# Stop a producer process and its direct children.
#
# Arguments: $1 - PID of the producer.
# Behaviour: does nothing if the PID is not alive. Otherwise sends SIGTERM to
#   the children and the producer, polls for up to about 2 seconds, then sends
#   SIGKILL to whatever is still alive.
#
# The child PIDs are captured before any signal is sent. Once the producer
# exits its children are re-parented, so a later `pkill -P <pid>` can no
# longer reach a child that ignored SIGTERM; the snapshot lets the SIGKILL
# escalation still find it.
terminate_codex_producer_tree() {
  local pid="${1:?pid required}"
  local deadline=""
  local children=""
  local candidate=""
  local survivors=""

  if ! kill -0 "$pid" 2>/dev/null; then
    return 0
  fi

  children="$(pgrep -P "$pid" 2>/dev/null || true)"

  pkill -TERM -P "$pid" 2>/dev/null || true
  kill "$pid" 2>/dev/null || true

  deadline=$(( $(date +%s) + 2 ))
  while :; do
    survivors=""
    # Word splitting is intentional: $children is a whitespace-separated PID list.
    for candidate in $pid $children; do
      if kill -0 "$candidate" 2>/dev/null; then
        survivors="${survivors} ${candidate}"
      fi
    done
    if [[ -z "$survivors" ]] || (( $(date +%s) >= deadline )); then
      break
    fi
    sleep 0.1
  done

  if [[ -n "$survivors" ]]; then
    # Also catch children that appeared after the snapshot was taken.
    pkill -KILL -P "$pid" 2>/dev/null || true
    for candidate in $survivors; do
      kill -9 "$candidate" 2>/dev/null || true
    done
  fi
}
536
-
537
# Run one Codex exec attempt, streaming its output into the log while a
# background heartbeat watches for stalls.
#
# Arguments: $1 - phase: "initial" (prompt read from $prompt_file) or
#   "resume" (resume_prompt piped into `exec resume` for $thread_id).
# Globals read: mode, safe_profile, bypass_profile, prompt_file, output_file,
#   host_run_dir, codex_progress_heartbeat_seconds, codex_stall_seconds.
# Globals written: last_attempt_start_size, last_attempt_started_epoch,
#   last_exit_code, thread_id (via the persist/update helpers).
# Exits 1 on an unknown phase.
#
# Flow: Codex runs in the background writing stdout+stderr into a FIFO; the
# foreground loop copies each line to stdout and $output_file, touches a
# progress marker and persists any announced thread id. The heartbeat subshell
# wakes every $codex_progress_heartbeat_seconds and, when
# $codex_stall_seconds > 0, terminates the producer if no output has arrived
# (or none recently) within that many seconds.
stream_codex_exec() {
  local phase="${1:?phase required}"
  local stream_dir=""
  local stream_fifo=""
  local producer_pid=""
  local heartbeat_pid=""
  local progress_file=""
  local line=""

  last_attempt_start_size="$(stat_file_size "$output_file" 2>/dev/null || printf '0')"
  last_attempt_started_epoch="$(date +%s)"
  progress_file="${host_run_dir}/.codex-progress.$$"
  rm -f "$progress_file"

  # Create the FIFO inside a freshly created private directory. Generating a
  # name with `mktemp -u` and creating it afterwards leaves a window in which
  # another process can claim the path.
  stream_dir="$(mktemp -d "${TMPDIR:-/tmp}/codex-stream.XXXXXX")"
  stream_fifo="${stream_dir}/stream.fifo"
  mkfifo "$stream_fifo"

  case "$phase" in
    initial)
      (
        case "$mode" in
          safe)
            run_codex_command exec --json --profile "$safe_profile" --full-auto <"$prompt_file"
            ;;
          bypass)
            run_codex_command exec --json --profile "$bypass_profile" --dangerously-bypass-approvals-and-sandbox <"$prompt_file"
            ;;
        esac
      ) >"$stream_fifo" 2>&1 &
      ;;
    resume)
      (
        case "$mode" in
          safe)
            resume_prompt | run_codex_command exec resume --json --full-auto "$thread_id" -
            ;;
          bypass)
            resume_prompt | run_codex_command exec resume --json --dangerously-bypass-approvals-and-sandbox "$thread_id" -
            ;;
        esac
      ) >"$stream_fifo" 2>&1 &
      ;;
    *)
      rm -rf -- "${stream_dir:?}"
      echo "unknown codex exec phase: $phase" >&2
      exit 1
      ;;
  esac

  producer_pid="$!"
  (
    local now elapsed last_progress_epoch idle_for
    while kill -0 "$producer_pid" 2>/dev/null; do
      sleep "$codex_progress_heartbeat_seconds"
      if ! kill -0 "$producer_pid" 2>/dev/null; then
        break
      fi
      # This subshell only has the thread_id value from fork time. Re-read it
      # from the output written so far so the write_state calls below do not
      # overwrite a thread id that the foreground loop has already persisted.
      update_thread_id_from_output "$last_attempt_start_size"
      now="$(date +%s)"
      elapsed=$((now - last_attempt_started_epoch))
      if (( codex_stall_seconds > 0 )); then
        if [[ ! -f "$progress_file" ]]; then
          # No output line has been seen at all during this attempt.
          if (( elapsed >= codex_stall_seconds )); then
            write_state "running" ""
            log_runner "stale-run no-codex-output-before-stall-threshold elapsed=${elapsed}s"
            terminate_codex_producer_tree "$producer_pid"
            break
          fi
        else
          # Output was seen; measure idle time from the marker's mtime.
          last_progress_epoch="$(stat_file_mtime "$progress_file" 2>/dev/null || printf '0')"
          if [[ -n "$last_progress_epoch" && "$last_progress_epoch" != "0" ]]; then
            idle_for=$((now - last_progress_epoch))
            if (( idle_for >= codex_stall_seconds )); then
              write_state "running" ""
              log_runner "stale-run no-codex-progress-before-stall-threshold elapsed=${elapsed}s idle=${idle_for}s"
              terminate_codex_producer_tree "$producer_pid"
              break
            fi
          fi
        fi
      fi
      write_state "running" ""
      log_runner "heartbeat waiting-for-codex-output elapsed=${elapsed}s"
    done
  ) &
  heartbeat_pid="$!"

  # Reading ends when every writer has closed the FIFO.
  while IFS= read -r line || [[ -n "$line" ]]; do
    printf '%s\n' "$line" | tee -a "$output_file"
    touch "$progress_file" 2>/dev/null || true
    persist_thread_id_from_line "$line"
  done <"$stream_fifo"

  if [[ -n "$heartbeat_pid" ]] && kill -0 "$heartbeat_pid" 2>/dev/null; then
    kill "$heartbeat_pid" 2>/dev/null || true
    wait "$heartbeat_pid" 2>/dev/null || true
  fi

  rm -rf -- "${stream_dir:?}"
  rm -f "$progress_file"

  # Collect the producer's status without letting errexit end the script.
  set +e
  wait "$producer_pid" 2>/dev/null
  last_exit_code="$?"
  set -e

  update_thread_id_from_output "$last_attempt_start_size"
}
642
-
643
# Read Codex JSON-lines output on stdin and print (without a trailing newline)
# the thread_id of the last `thread.started` event. Lines that do not start
# with "{" or are not valid JSON are skipped. Prints nothing when no such
# event is present.
extract_thread_id() {
  "$python_bin" -c '
import json
import sys

latest = ""
for raw in sys.stdin:
    text = raw.strip()
    if not text.startswith("{"):
        continue
    try:
        event = json.loads(text)
    except Exception:
        continue
    if event.get("type") != "thread.started":
        continue
    if event.get("thread_id"):
        latest = str(event["thread_id"])

if latest:
    sys.stdout.write(latest)
'
}
664
-
665
# Classify why a Codex attempt failed from the tail of its output.
#
# Arguments: $1 - output text of the attempt; only the last 120 lines are
#   examined.
# Globals read: last_exit_code.
# Output: one label on stdout, chosen in this order:
#   no-codex-output-before-stall-threshold,
#   no-codex-progress-before-stall-threshold, usage-limit, auth-401,
#   account-banned, auth-failure, or worker-exit-failed when nothing matched
#   but last_exit_code is non-zero. Prints nothing when nothing applies.
#
# The 401 test requires a non-digit (or line start) before the number, so
# digits inside a larger number such as "1401" or "4401" are not mistaken
# for an HTTP 401.
classify_failure_reason() {
  local chunk="${1:-}"
  local recent_chunk

  recent_chunk="$(tail -n 120 <<<"$chunk")"

  if grep -Eiq 'stale-run no-codex-output-before-stall-threshold|no-codex-output-before-stall-threshold' <<<"$recent_chunk"; then
    printf 'no-codex-output-before-stall-threshold\n'
    return 0
  fi

  if grep -Eiq 'stale-run no-codex-progress-before-stall-threshold|no-codex-progress-before-stall-threshold' <<<"$recent_chunk"; then
    printf 'no-codex-progress-before-stall-threshold\n'
    return 0
  fi

  if grep -Eiq "You've hit your usage limit|You have reached your Codex usage limits|visit https://chatgpt.com/codex/settings/usage|Upgrade to Pro|rate limit exceeded|quota exceeded|usage cap (reached|exceeded)|usage quota (reached|exceeded)" <<<"$recent_chunk"; then
    printf 'usage-limit\n'
    return 0
  fi

  if grep -Eiq "(^|[^0-9])401([^0-9]|$)|unauthorized|invalid credentials|invalid api key|authentication failed with status 401|received 401" <<<"$recent_chunk"; then
    printf 'auth-401\n'
    return 0
  fi

  if grep -Eiq "account (is )?(banned|suspended|disabled)|access revoked|account revoked|forbidden due to policy|account blocked|policy violation" <<<"$recent_chunk"; then
    printf 'account-banned\n'
    return 0
  fi

  if grep -Eiq "Authentication required|Please log in|Please login|Please authenticate|login required|run codex login|codex login required|logged out|not logged in|expired session|session expired|token expired|reauthenticate|unauthenticated|auth(entication)? failed|credentials expired" <<<"$recent_chunk"; then
    printf 'auth-failure\n'
    return 0
  fi

  if [[ -n "$last_exit_code" && "$last_exit_code" != "0" ]]; then
    printf 'worker-exit-failed\n'
  fi
}
705
-
706
# Succeed (return 0) when the last 120 lines of the given output show a thread
# and a turn starting but no item.started/item.completed and no
# turn.completed event. Returns 1 otherwise.
# Arguments: $1 - output text of the attempt.
failure_chunk_indicates_startup_stall() {
  local chunk="${1:-}"
  local tail_window

  tail_window="$(tail -n 120 <<<"$chunk")"

  # Both start events must be present.
  if ! grep -q '"type":"thread.started"' <<<"$tail_window"; then
    return 1
  fi
  if ! grep -q '"type":"turn.started"' <<<"$tail_window"; then
    return 1
  fi

  # Any item activity or a completed turn means the run got past startup.
  if grep -Eq '"type":"item\.(started|completed)"' <<<"$tail_window" \
    || grep -q '"type":"turn.completed"' <<<"$tail_window"; then
    return 1
  fi

  return 0
}
721
-
722
# Print the prompt sent to Codex when resuming an interrupted thread. The two
# run-directory paths are interpolated from $host_run_dir and $sandbox_run_dir.
resume_prompt() {
  printf '%s\n' \
    "The previous Codex exec turn in this same thread was interrupted because the host refreshed Codex authentication after a quota or auth failure." \
    "" \
    "Continue the same task from the next unfinished step only. Do not restart completed work unless you need to verify or repair it." \
    "" \
    "If you need to reorient, inspect the current git status plus the existing run artifacts in:" \
    "- Host run dir: ${host_run_dir}" \
    "- Sandbox run dir: ${sandbox_run_dir}"
}
733
-
734
# Report whether `codex login status` succeeds. All output is discarded and
# the command's exit status is returned unchanged.
codex_login_healthy() {
  local status=0
  run_codex_command login status >/dev/null 2>&1 || status=$?
  return "$status"
}
737
-
738
# Poll until Codex authentication looks usable again, or give up once
# auth_refresh_timeout_seconds have passed.
#
# For quota-style triggers (usage-limit, auth-401, account-banned) a healthy
# login is not enough on its own: the auth fingerprint, the active quota label
# or the quota switch signature must differ from the baseline, or the last
# quota switch must have reported "switched". For every other trigger a
# healthy login is sufficient.
#
# Arguments:
#   $1 - auth fingerprint captured before waiting (required)
#   $2 - failure reason that triggered the wait (required)
#   $3 - quota account label captured before waiting (optional)
#   $4 - quota switch signature captured before waiting (optional)
# Globals:
#   read:    auth_refresh_timeout_seconds, auth_refresh_poll_seconds,
#            last_quota_switch_status
#   written: auth_wait_started_at, last_trigger_reason, last_auth_fingerprint,
#            last_failure_reason (only on timeout)
# Returns: 0 when the caller may resume, 1 on timeout
wait_for_auth_refresh() {
  local baseline_fingerprint="${1:?baseline fingerprint required}"
  local trigger_reason="${2:?trigger reason required}"
  local baseline_quota_label="${3:-}"
  local baseline_switch_signature="${4:-}"
  local deadline now current_fingerprint current_quota_label current_switch_signature
  local sleep_seconds seconds_left
  local recovery_target
  local resume_note

  recovery_target="$(codex_recovery_target)"
  auth_wait_started_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  last_trigger_reason="$trigger_reason"
  write_state "waiting-auth-refresh" "$trigger_reason"

  deadline=$(( $(date +%s) + auth_refresh_timeout_seconds ))
  while true; do
    # A non-empty note means "resume now"; it is also the line that gets logged.
    resume_note=""
    current_fingerprint="$(auth_fingerprint)"
    last_auth_fingerprint="$current_fingerprint"

    if [[ "$trigger_reason" == "usage-limit" || "$trigger_reason" == "auth-401" || "$trigger_reason" == "account-banned" ]]; then
      current_quota_label="$(quota_active_label)"
      current_switch_signature="$(quota_switch_signature)"
      if codex_login_healthy; then
        if [[ "$current_fingerprint" != "$baseline_fingerprint" ]]; then
          resume_note="detected refreshed Codex auth after quota interruption; resuming ${recovery_target}"
        elif [[ -n "$baseline_quota_label" && -n "$current_quota_label" && "$current_quota_label" != "$baseline_quota_label" ]]; then
          resume_note="detected rotated Codex quota account (${baseline_quota_label} -> ${current_quota_label}); resuming ${recovery_target}"
        elif [[ -n "$baseline_switch_signature" && -n "$current_switch_signature" && "$current_switch_signature" != "$baseline_switch_signature" ]]; then
          resume_note="detected quota switch state refresh; resuming ${recovery_target}"
        elif [[ "$last_quota_switch_status" == "switched" && -n "$current_quota_label" ]]; then
          resume_note="quota manager reports healthy Codex account ${current_quota_label}; resuming ${recovery_target}"
        fi
      fi
    elif codex_login_healthy; then
      if [[ "$current_fingerprint" != "$baseline_fingerprint" ]]; then
        resume_note="detected refreshed Codex auth; resuming ${recovery_target}"
      else
        resume_note="Codex auth is healthy again; resuming ${recovery_target}"
      fi
    fi

    if [[ -n "$resume_note" ]]; then
      log_runner "$resume_note"
      auth_wait_started_at=""
      write_state "running" ""
      return 0
    fi

    now="$(date +%s)"
    if (( now >= deadline )); then
      last_failure_reason="auth-refresh-timeout"
      write_state "failed" "$last_failure_reason"
      return 1
    fi

    # Sleep for the poll interval, capped at the time left and floored at 1s.
    seconds_left=$(( deadline - now ))
    sleep_seconds="$auth_refresh_poll_seconds"
    if (( sleep_seconds > seconds_left )); then
      sleep_seconds="$seconds_left"
    fi
    if (( sleep_seconds < 1 )); then
      sleep_seconds=1
    fi
    sleep "$sleep_seconds"
  done
}
823
-
824
# Start a Codex exec by invoking stream_codex_exec in "initial" mode.
run_initial_exec() {
  stream_codex_exec 'initial'
}
827
-
828
# Continue a Codex exec by invoking stream_codex_exec in "resume" mode.
run_resume_exec() {
  stream_codex_exec 'resume'
}
831
-
832
# Run one Codex exec attempt (initial or resume) and decide what happens next.
#
# Globals (read):    thread_id, max_resume_attempts, max_quota_autoswitch_attempts,
#                    quota_autoswitch_attempt_count, last_exit_code,
#                    last_attempt_start_size, last_quota_next_retry_at
# Globals (written): attempt, last_quota_switch_status,
#                    last_attempt_start_quota_label, last_failure_reason,
#                    last_auth_fingerprint, resume_count, thread_id,
#                    auth_wait_started_at
# Returns:
#   0 - the exec exited with status 0
#   2 - the caller should invoke attempt_run again
#   1 - the attempt failed; last_failure_reason holds the reason
attempt_run() {
  local reason failure_chunk startup_stall
  local auth_before_switch quota_label_before_switch quota_switch_signature_before_switch
  local quota_switch_result shell_flags_before_quota_switch

  attempt=$((attempt + 1))
  last_quota_switch_status=""
  last_attempt_start_quota_label="$(quota_active_label)"
  write_state "running" ""

  # An existing thread id means there is a thread to resume.
  if [[ -n "$thread_id" ]]; then
    log_runner "resuming Codex thread ${thread_id} (resume ${resume_count}/${max_resume_attempts})"
    run_resume_exec
  else
    log_runner "starting Codex exec attempt ${attempt}"
    run_initial_exec
  fi

  if [[ "${last_exit_code}" == "0" ]]; then
    last_failure_reason=""
    write_state "succeeded" ""
    return 0
  fi

  # Classify the failure from the output produced by this attempt.
  failure_chunk="$(new_output_since "$last_attempt_start_size")"
  reason="$(classify_failure_reason "$failure_chunk")"
  last_failure_reason="${reason:-worker-exit-failed}"

  startup_stall="no"
  case "$last_failure_reason" in
    no-codex-output-before-stall-threshold|no-codex-progress-before-stall-threshold)
      if failure_chunk_indicates_startup_stall "$failure_chunk"; then
        startup_stall="yes"
      fi
      ;;
  esac

  case "$last_failure_reason" in
    usage-limit|auth-failure|auth-401|account-banned)
      if (( resume_count >= max_resume_attempts )); then
        last_failure_reason="resume-attempts-exhausted"
        write_state "failed" "$last_failure_reason"
        return 1
      fi

      # Capture the pre-switch baseline that wait_for_auth_refresh compares against.
      auth_before_switch="$(auth_fingerprint)"
      quota_label_before_switch="$last_attempt_start_quota_label"
      quota_switch_signature_before_switch="$(quota_switch_signature)"
      last_auth_fingerprint="$auth_before_switch"

      # Within this arm, auth-failure is the only reason that skips rotation.
      if [[ "$last_failure_reason" != "auth-failure" ]]; then
        if (( quota_autoswitch_attempt_count >= max_quota_autoswitch_attempts )); then
          log_runner "automatic Codex quota switching already ran ${quota_autoswitch_attempt_count} time(s) in this worker; refusing another rotation"
          last_failure_reason="quota-switch-attempt-limit"
          write_state "failed" "$last_failure_reason"
          return 1
        fi

        write_state "switching-account" "$last_failure_reason"
        # Run the switch with errexit off so its status can be read, then
        # put errexit back the way it was.
        shell_flags_before_quota_switch="$-"
        set +e
        run_quota_autoswitch
        quota_switch_result=$?
        if [[ "$shell_flags_before_quota_switch" == *e* ]]; then
          set -e
        else
          set +e
        fi

        if (( quota_switch_result == 10 )); then
          log_runner "quota manager deferred rotation until ${last_quota_next_retry_at:-unknown}; automatic timed re-tries are disabled for safety"
          last_failure_reason="quota-switch-deferred"
          write_state "failed" "$last_failure_reason"
          return 1
        fi
      fi

      wait_for_auth_refresh "$auth_before_switch" "$last_failure_reason" "$quota_label_before_switch" "$quota_switch_signature_before_switch" || return 1

      resume_count=$((resume_count + 1))
      return 2
      ;;
    no-codex-output-before-stall-threshold|no-codex-progress-before-stall-threshold)
      if [[ "$startup_stall" == "yes" ]] && [[ "$quota_autoswitch_attempt_count" -lt "$max_quota_autoswitch_attempts" ]]; then
        auth_before_switch="$(auth_fingerprint)"
        quota_label_before_switch="$last_attempt_start_quota_label"
        quota_switch_signature_before_switch="$(quota_switch_signature)"
        last_auth_fingerprint="$auth_before_switch"
        write_state "switching-account" "$last_failure_reason"
        log_runner "startup-stall detected before first Codex tool activity; attempting Codex account rotation"

        shell_flags_before_quota_switch="$-"
        set +e
        run_quota_autoswitch
        quota_switch_result=$?
        if [[ "$shell_flags_before_quota_switch" == *e* ]]; then
          set -e
        else
          set +e
        fi

        case "$quota_switch_result" in
          0)
            # Clearing thread_id makes the next attempt take the initial-exec path.
            thread_id=""
            auth_wait_started_at=""
            write_state "running" ""
            return 2
            ;;
          10)
            log_runner "startup-stall rotation deferred until ${last_quota_next_retry_at:-unknown}"
            last_failure_reason="quota-switch-deferred"
            write_state "failed" "$last_failure_reason"
            return 1
            ;;
        esac
      fi
      ;;
  esac

  # Every path that reaches this point is a plain failure.
  write_state "failed" "$last_failure_reason"
  return 1
}
945
-
946
# Driver: record the running state, then keep calling attempt_run until it
# either succeeds or reports a failure that should not be retried.
write_state "running" ""

while true; do
  # attempt_run reports its outcome through its return status, so errexit is
  # switched off around the call and back on afterwards.
  set +e
  attempt_run
  attempt_status=$?
  set -e

  case "$attempt_status" in
    0) exit 0 ;;
    2) continue ;;
  esac

  # Any other status is a failure: exit with last_exit_code, or 1 if it is
  # unset or empty.
  exit "${last_exit_code:-1}"
done