agent-control-plane 0.4.9 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +72 -9
- package/npm/bin/agent-control-plane.js +1 -1
- package/package.json +39 -33
- package/tools/bin/debug-session.sh +106 -0
- package/tools/bin/flow-runtime-doctor-linux.sh +136 -0
- package/tools/bin/flow-runtime-doctor.sh +5 -1
- package/tools/bin/install-project-systemd.sh +255 -0
- package/tools/bin/project-runtimectl.sh +45 -0
- package/tools/bin/project-systemd-bootstrap.sh +74 -0
- package/tools/bin/uninstall-project-systemd.sh +87 -0
- package/tools/dashboard/app.js +198 -5
- package/tools/dashboard/issue_queue_state.py +101 -0
- package/tools/dashboard/server.py +123 -1
- package/tools/dashboard/styles.css +526 -455
- package/tools/bin/agent-cleanup-worktree +0 -247
- package/tools/bin/agent-github-update-labels +0 -105
- package/tools/bin/agent-init-worktree +0 -216
- package/tools/bin/agent-project-archive-run +0 -52
- package/tools/bin/agent-project-capture-worker +0 -46
- package/tools/bin/agent-project-catch-up-issue-pr-links +0 -118
- package/tools/bin/agent-project-catch-up-merged-prs +0 -195
- package/tools/bin/agent-project-catch-up-scheduled-issue-retries +0 -123
- package/tools/bin/agent-project-cleanup-session +0 -513
- package/tools/bin/agent-project-detached-launch +0 -127
- package/tools/bin/agent-project-heartbeat-loop +0 -1029
- package/tools/bin/agent-project-open-issue-worktree +0 -89
- package/tools/bin/agent-project-open-pr-worktree +0 -80
- package/tools/bin/agent-project-publish-issue-pr +0 -468
- package/tools/bin/agent-project-reconcile-issue-session +0 -1409
- package/tools/bin/agent-project-reconcile-pr-session +0 -1288
- package/tools/bin/agent-project-retry-state +0 -158
- package/tools/bin/agent-project-run-claude-session +0 -805
- package/tools/bin/agent-project-run-codex-resilient +0 -963
- package/tools/bin/agent-project-run-codex-session +0 -435
- package/tools/bin/agent-project-run-kilo-session +0 -369
- package/tools/bin/agent-project-run-ollama-session +0 -658
- package/tools/bin/agent-project-run-openclaw-session +0 -1309
- package/tools/bin/agent-project-run-opencode-session +0 -377
- package/tools/bin/agent-project-run-pi-session +0 -479
- package/tools/bin/agent-project-sync-anchor-repo +0 -139
- package/tools/bin/agent-project-sync-source-repo-main +0 -163
- package/tools/bin/agent-project-worker-status +0 -188
- package/tools/bin/branch-verification-guard.sh +0 -364
- package/tools/bin/capture-worker.sh +0 -18
- package/tools/bin/cleanup-worktree.sh +0 -52
- package/tools/bin/codex-quota +0 -31
- package/tools/bin/create-follow-up-issue.sh +0 -114
- package/tools/bin/dashboard-launchd-bootstrap.sh +0 -50
- package/tools/bin/issue-publish-localization-guard.sh +0 -142
- package/tools/bin/issue-publish-scope-guard.sh +0 -242
- package/tools/bin/issue-requires-local-workspace-install.sh +0 -31
- package/tools/bin/issue-resource-class.sh +0 -12
- package/tools/bin/kick-scheduler.sh +0 -75
- package/tools/bin/label-follow-up-issues.sh +0 -14
- package/tools/bin/new-pr-worktree.sh +0 -50
- package/tools/bin/new-worktree.sh +0 -49
- package/tools/bin/pr-risk.sh +0 -12
- package/tools/bin/prepare-worktree.sh +0 -142
- package/tools/bin/provider-cooldown-state.sh +0 -204
- package/tools/bin/publish-issue-worker.sh +0 -31
- package/tools/bin/reconcile-bootstrap-lib.sh +0 -113
- package/tools/bin/reconcile-issue-worker.sh +0 -34
- package/tools/bin/reconcile-pr-worker.sh +0 -34
- package/tools/bin/record-verification.sh +0 -71
- package/tools/bin/render-flow-config.sh +0 -98
- package/tools/bin/resident-issue-controller-lib.sh +0 -448
- package/tools/bin/retry-state.sh +0 -31
- package/tools/bin/reuse-issue-worktree.sh +0 -121
- package/tools/bin/run-codex-bypass.sh +0 -3
- package/tools/bin/run-codex-safe.sh +0 -3
- package/tools/bin/run-codex-task.sh +0 -280
- package/tools/bin/serve-dashboard.sh +0 -5
- package/tools/bin/start-issue-worker.sh +0 -943
- package/tools/bin/start-pr-fix-worker.sh +0 -528
- package/tools/bin/start-pr-merge-repair-worker.sh +0 -8
- package/tools/bin/start-pr-review-worker.sh +0 -261
- package/tools/bin/start-resident-issue-loop.sh +0 -499
- package/tools/bin/update-github-labels.sh +0 -14
- package/tools/bin/worker-status.sh +0 -19
- package/tools/bin/workflow-catalog.sh +0 -77
|
@@ -1,963 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
set -euo pipefail
|
|
3
|
-
|
|
4
|
-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
5
|
-
# shellcheck source=/dev/null
|
|
6
|
-
source "${SCRIPT_DIR}/flow-config-lib.sh"
|
|
7
|
-
|
|
8
|
-
# Print the command synopsis and the option summary on stdout.
# Callers that report a usage error redirect this output to stderr themselves.
usage() {
  printf '%s\n' \
    'Usage:' \
    'agent-project-run-codex-resilient --mode safe|bypass --worktree <path> --prompt-file <path> --output-file <path> --host-run-dir <path> --sandbox-run-dir <path> --codex-bin <path> [options]' \
    '' \
    'Run Codex with persisted thread recovery for quota/auth interruptions.' \
    '' \
    'Options:' \
    '--safe-profile <name> Codex profile for safe mode' \
    '--bypass-profile <name> Codex profile for bypass mode' \
    '--max-resume-attempts <count> Maximum resume attempts after interruption' \
    '--auth-refresh-timeout-seconds <secs> How long to wait for refreshed auth before failing' \
    '--auth-refresh-poll-seconds <secs> Poll interval while waiting for refreshed auth' \
    '--stall-seconds <secs> Fail if Codex stops producing output for too long' \
    '--help Show this help'
}
|
|
25
|
-
|
|
26
|
-
# Required command-line options; they start empty and are filled in by the
# argument parser further down.
mode=
worktree=
prompt_file=
output_file=
host_run_dir=
sandbox_run_dir=
codex_bin=

# Codex profile names used for the two run modes.
safe_profile=default
bypass_profile=default

# Tunables. For each one the ACP_* environment variable takes precedence over
# the F_LOSNING_* variable, which takes precedence over the built-in default.
max_resume_attempts="${F_LOSNING_CODEX_MAX_RESUME_ATTEMPTS:-6}"
max_resume_attempts="${ACP_CODEX_MAX_RESUME_ATTEMPTS:-${max_resume_attempts}}"

auth_refresh_timeout_seconds="${F_LOSNING_CODEX_AUTH_REFRESH_TIMEOUT_SECONDS:-900}"
auth_refresh_timeout_seconds="${ACP_CODEX_AUTH_REFRESH_TIMEOUT_SECONDS:-${auth_refresh_timeout_seconds}}"

auth_refresh_poll_seconds="${F_LOSNING_CODEX_AUTH_REFRESH_POLL_SECONDS:-10}"
auth_refresh_poll_seconds="${ACP_CODEX_AUTH_REFRESH_POLL_SECONDS:-${auth_refresh_poll_seconds}}"

max_quota_autoswitch_attempts="${F_LOSNING_CODEX_MAX_AUTOSWITCH_ATTEMPTS:-1}"
max_quota_autoswitch_attempts="${ACP_CODEX_MAX_AUTOSWITCH_ATTEMPTS:-${max_quota_autoswitch_attempts}}"

codex_progress_heartbeat_seconds="${F_LOSNING_CODEX_PROGRESS_HEARTBEAT_SECONDS:-30}"
codex_progress_heartbeat_seconds="${ACP_CODEX_PROGRESS_HEARTBEAT_SECONDS:-${codex_progress_heartbeat_seconds}}"

codex_stall_seconds="${F_LOSNING_CODEX_STALL_SECONDS:-300}"
codex_stall_seconds="${ACP_CODEX_STALL_SECONDS:-${codex_stall_seconds}}"

# Resolved after argument validation.
python_bin=
|
|
42
|
-
|
|
43
|
-
# Print the path of a Python interpreter on stdout.
# Search order: `python3` on PATH, /opt/homebrew/bin/python3, `python` on PATH.
# Returns 1 and prints nothing when none of them is available.
resolve_python_bin() {
  local interpreter=""

  if interpreter="$(command -v python3 2>/dev/null)"; then
    :
  elif [[ -x /opt/homebrew/bin/python3 ]]; then
    interpreter="/opt/homebrew/bin/python3"
  elif interpreter="$(command -v python 2>/dev/null)"; then
    :
  else
    return 1
  fi

  printf '%s\n' "$interpreter"
}
|
|
58
|
-
|
|
59
|
-
# Parse the command line into the script-level settings.
#
# Arguments: the script's positional parameters ("$@").
# Globals written: mode, worktree, prompt_file, output_file, host_run_dir,
#   sandbox_run_dir, safe_profile, bypass_profile, codex_bin,
#   max_resume_attempts, auth_refresh_timeout_seconds,
#   auth_refresh_poll_seconds, codex_stall_seconds.
# Exits 0 after printing usage for --help/-h. Exits 1 (with a message and the
# usage text on stderr) for an unknown argument or an option whose value is
# missing.
parse_cli_args() {
  local cli_option=""

  while (( $# > 0 )); do
    cli_option="$1"

    case "$cli_option" in
      --help|-h)
        usage
        exit 0
        ;;
      --mode|--worktree|--prompt-file|--output-file|--host-run-dir|--sandbox-run-dir|--safe-profile|--bypass-profile|--codex-bin|--max-resume-attempts|--auth-refresh-timeout-seconds|--auth-refresh-poll-seconds|--stall-seconds)
        # Every one of these consumes a value. Without this guard a trailing
        # option makes `shift 2` fail and, under `set -e`, the script dies
        # without telling the user why.
        if (( $# < 2 )); then
          echo "Option ${cli_option} requires a value" >&2
          usage >&2
          exit 1
        fi
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 1
        ;;
    esac

    case "$cli_option" in
      --mode) mode="${2:-}" ;;
      --worktree) worktree="${2:-}" ;;
      --prompt-file) prompt_file="${2:-}" ;;
      --output-file) output_file="${2:-}" ;;
      --host-run-dir) host_run_dir="${2:-}" ;;
      --sandbox-run-dir) sandbox_run_dir="${2:-}" ;;
      --safe-profile) safe_profile="${2:-}" ;;
      --bypass-profile) bypass_profile="${2:-}" ;;
      --codex-bin) codex_bin="${2:-}" ;;
      --max-resume-attempts) max_resume_attempts="${2:-}" ;;
      --auth-refresh-timeout-seconds) auth_refresh_timeout_seconds="${2:-}" ;;
      --auth-refresh-poll-seconds) auth_refresh_poll_seconds="${2:-}" ;;
      --stall-seconds) codex_stall_seconds="${2:-}" ;;
    esac
    shift 2
  done
}

parse_cli_args "$@"
|
|
78
|
-
|
|
79
|
-
# All of these options are mandatory; print usage on stderr and exit 1 when
# any of them is missing.
if [[ -z "$mode" || -z "$worktree" || -z "$prompt_file" || -z "$output_file" || -z "$host_run_dir" || -z "$sandbox_run_dir" || -z "$codex_bin" ]]; then
  usage >&2
  exit 1
fi

case "$mode" in
  safe|bypass) ;;
  *)
    echo "--mode must be safe or bypass" >&2
    exit 1
    ;;
esac

# require_unsigned_integer <variable-name> <label>
#
# Verify that the named variable holds only decimal digits; otherwise print
# "<label> must be numeric" on stderr and exit 1. On success the variable is
# rewritten in canonical base-10 form, because a value such as "08" or "010"
# passes the digit check but would be read as octal (or rejected) by the
# `(( ... ))` arithmetic that consumes these settings later.
require_unsigned_integer() {
  local setting_name="${1:?variable name required}"
  local setting_label="${2:?label required}"
  local raw_value="${!setting_name:-}"

  case "$raw_value" in
    ''|*[!0-9]*)
      echo "${setting_label} must be numeric" >&2
      exit 1
      ;;
  esac

  printf -v "$setting_name" '%d' "$((10#${raw_value}))"
}

require_unsigned_integer max_resume_attempts "--max-resume-attempts"
require_unsigned_integer auth_refresh_timeout_seconds "--auth-refresh-timeout-seconds"
require_unsigned_integer auth_refresh_poll_seconds "--auth-refresh-poll-seconds"
require_unsigned_integer max_quota_autoswitch_attempts "ACP_CODEX_MAX_AUTOSWITCH_ATTEMPTS"
require_unsigned_integer codex_progress_heartbeat_seconds "ACP_CODEX_PROGRESS_HEARTBEAT_SECONDS"
# Checked after normalisation so that "00" is rejected as well as "0".
if (( codex_progress_heartbeat_seconds == 0 )); then
  echo "ACP_CODEX_PROGRESS_HEARTBEAT_SECONDS must be greater than zero" >&2
  exit 1
fi
require_unsigned_integer codex_stall_seconds "ACP_CODEX_STALL_SECONDS"
|
|
111
|
-
|
|
112
|
-
# The embedded Python helpers need a runnable interpreter; give up early
# when none can be found.
python_bin="$(resolve_python_bin || true)"
if ! [[ -n "$python_bin" && -x "$python_bin" ]]; then
  echo "unable to resolve a runnable python interpreter for codex supervision" >&2
  exit 1
fi

# Locations derived from the flow skill directory and the run directory.
FLOW_SKILL_DIR="$(resolve_flow_skill_dir "${BASH_SOURCE[0]}")"
state_file="${host_run_dir}/runner.env"
auth_file="${HOME}/.codex/auth.json"
# The resolver only runs when SHARED_AGENT_HOME is unset or empty.
shared_agent_home="${SHARED_AGENT_HOME:-$(resolve_shared_agent_home "${FLOW_SKILL_DIR}")}"
quota_tool_bin="$(flow_resolve_codex_quota_bin "${FLOW_SKILL_DIR}")"
quota_manager_script="$(flow_resolve_codex_quota_manager_script "${FLOW_SKILL_DIR}")"

# Quota auto-switch settings: ACP_* overrides F_LOSNING_*, which overrides
# the built-in default.
quota_autoswitch_enabled="${F_LOSNING_CODEX_QUOTA_AUTOSWITCH_ENABLED:-1}"
quota_autoswitch_enabled="${ACP_CODEX_QUOTA_AUTOSWITCH_ENABLED:-${quota_autoswitch_enabled}}"

quota_threshold="${F_LOSNING_CODEX_QUOTA_THRESHOLD:-70}"
quota_threshold="${ACP_CODEX_QUOTA_THRESHOLD:-${quota_threshold}}"

quota_weekly_threshold="${F_LOSNING_CODEX_QUOTA_WEEKLY_THRESHOLD:-90}"
quota_weekly_threshold="${ACP_CODEX_QUOTA_WEEKLY_THRESHOLD:-${quota_weekly_threshold}}"

quota_soft_threshold="${F_LOSNING_CODEX_QUOTA_SOFT_THRESHOLD:-55}"
quota_soft_threshold="${ACP_CODEX_QUOTA_SOFT_THRESHOLD:-${quota_soft_threshold}}"

quota_soft_worker_threshold="${F_LOSNING_CODEX_QUOTA_SOFT_WORKER_THRESHOLD:-8}"
quota_soft_worker_threshold="${ACP_CODEX_QUOTA_SOFT_WORKER_THRESHOLD:-${quota_soft_worker_threshold}}"

quota_emergency_threshold="${F_LOSNING_CODEX_QUOTA_EMERGENCY_THRESHOLD:-65}"
quota_emergency_threshold="${ACP_CODEX_QUOTA_EMERGENCY_THRESHOLD:-${quota_emergency_threshold}}"

quota_emergency_worker_threshold="${F_LOSNING_CODEX_QUOTA_EMERGENCY_WORKER_THRESHOLD:-12}"
quota_emergency_worker_threshold="${ACP_CODEX_QUOTA_EMERGENCY_WORKER_THRESHOLD:-${quota_emergency_worker_threshold}}"

quota_switch_cooldown_seconds="${F_LOSNING_CODEX_QUOTA_SWITCH_COOLDOWN_SECONDS:-600}"
quota_switch_cooldown_seconds="${ACP_CODEX_QUOTA_SWITCH_COOLDOWN_SECONDS:-${quota_switch_cooldown_seconds}}"

quota_timeout_seconds="${F_LOSNING_CODEX_QUOTA_TIMEOUT_SECONDS:-45}"
quota_timeout_seconds="${ACP_CODEX_QUOTA_TIMEOUT_SECONDS:-${quota_timeout_seconds}}"

quota_prefer_label="${F_LOSNING_CODEX_QUOTA_PREFER_LABEL:-}"
quota_prefer_label="${ACP_CODEX_QUOTA_PREFER_LABEL:-${quota_prefer_label}}"

# File the quota manager uses to record its last switch; defaults to a
# location under the XDG cache directory (or ~/.cache).
quota_switch_state_file="${CODEX_QUOTA_MANAGER_SWITCH_STATE_FILE:-${XDG_CACHE_HOME:-$HOME/.cache}/codex-quota-manager/last-switch.env}"

# tmux session-name prefixes for issue and PR workers, read from the flow config.
config_yaml="$(resolve_flow_config_yaml "${BASH_SOURCE[0]}")"
issue_session_prefix="$(flow_resolve_issue_session_prefix "${config_yaml}")"
pr_session_prefix="$(flow_resolve_pr_session_prefix "${config_yaml}")"
|
|
138
|
-
|
|
139
|
-
# Keep npm-backed verification steps isolated from any broken user-global cache state.
# The first non-empty value wins: NPM_CONFIG_CACHE, npm_config_cache,
# ACP_NPM_CACHE_DIR, F_LOSNING_NPM_CACHE_DIR, then ~/.agent-runtime/npm-cache.
npm_cache_dir="${NPM_CONFIG_CACHE:-}"
[[ -n "${npm_cache_dir}" ]] || npm_cache_dir="${npm_config_cache:-}"
[[ -n "${npm_cache_dir}" ]] || npm_cache_dir="${ACP_NPM_CACHE_DIR:-}"
[[ -n "${npm_cache_dir}" ]] || npm_cache_dir="${F_LOSNING_NPM_CACHE_DIR:-}"
[[ -n "${npm_cache_dir}" ]] || npm_cache_dir="${HOME}/.agent-runtime/npm-cache"

# Export both spellings so every child process sees the same cache directory.
export NPM_CONFIG_CACHE="${npm_cache_dir}" npm_config_cache="${npm_cache_dir}"

# Best effort: a cache directory that cannot be created is not fatal here.
mkdir -p "${npm_cache_dir}" 2>/dev/null || true
|
|
147
|
-
|
|
148
|
-
# Mutable runner state shared by the functions below.

# Codex thread bookkeeping and attempt counters.
thread_id=
attempt=0 resume_count=0 quota_autoswitch_attempt_count=0

# Outcome of the most recent attempt.
last_exit_code=
last_failure_reason=
last_trigger_reason=

# Auth-refresh wait bookkeeping.
auth_wait_started_at=
last_auth_fingerprint=

# Snapshot taken when an attempt starts.
last_attempt_start_size=0 last_attempt_started_epoch=0
last_attempt_start_quota_label=

# Result of the most recent quota auto-switch.
last_quota_switch_status=
last_quota_next_retry_at=
last_quota_selected_label=

# Make sure the run directory and the log file exist before anything writes to them.
mkdir -p "$host_run_dir"
touch "$output_file"
|
|
166
|
-
|
|
167
|
-
# Write one UTC-timestamped line to stdout and append it to $output_file.
# An empty or missing message is ignored (returns 0 without writing anything).
log_runner() {
  local text="${1:-}"

  if [[ -z "$text" ]]; then
    return 0
  fi

  {
    printf '[%s] %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$text"
  } | tee -a "$output_file"
}
|
|
172
|
-
|
|
173
|
-
# Persist the runner state to $state_file as shell-sourceable KEY=value lines.
#
# Arguments:
#   $1 - runner state label (required)
#   $2 - failure reason; when omitted OR empty, $last_failure_reason is used
# Globals read: state_file, thread_id, attempt, resume_count, last_exit_code,
#   last_failure_reason, last_trigger_reason, auth_wait_started_at,
#   last_auth_fingerprint.
# Returns non-zero when the file cannot be written.
#
# The file is written to a temporary sibling and then renamed into place, so
# a reader never sees a half-written file. The temporary name comes from
# mktemp rather than "$$": "$$" is the same in the main shell and in its
# background subshells, so two concurrent writers (the stream reader and the
# heartbeat) would otherwise share, and corrupt, one temporary file.
write_state() {
  local runner_state="${1:?runner state required}"
  local failure_reason="${2:-$last_failure_reason}"
  local updated_at=""
  local tmp_file=""

  updated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  tmp_file="$(mktemp "${state_file}.tmp.XXXXXX")"
  # mktemp creates the file with mode 0600; restore the umask-derived mode a
  # plain redirection would have produced so readers keep their access.
  chmod "$(printf '%o' "$(( 0666 & ~$(umask) ))")" "$tmp_file" 2>/dev/null || true

  if ! {
    printf 'RUNNER_STATE=%q\n' "$runner_state"
    printf 'THREAD_ID=%q\n' "$thread_id"
    printf 'ATTEMPT=%s\n' "$attempt"
    printf 'RESUME_COUNT=%s\n' "$resume_count"
    printf 'LAST_EXIT_CODE=%q\n' "$last_exit_code"
    printf 'LAST_FAILURE_REASON=%q\n' "$failure_reason"
    printf 'LAST_TRIGGER_REASON=%q\n' "$last_trigger_reason"
    printf 'AUTH_WAIT_STARTED_AT=%q\n' "$auth_wait_started_at"
    printf 'LAST_AUTH_FINGERPRINT=%q\n' "$last_auth_fingerprint"
    printf 'UPDATED_AT=%q\n' "$updated_at"
  } >"$tmp_file"; then
    rm -f "$tmp_file"
    return 1
  fi

  if ! mv "$tmp_file" "$state_file"; then
    rm -f "$tmp_file"
    return 1
  fi
}
|
|
194
|
-
|
|
195
|
-
# Run $codex_bin with the given arguments and CODEX_THREAD_ID removed from
# its environment.
# Nested workers must not inherit a parent thread id; the wrapper persists
# the child thread explicitly.
run_codex_command() {
  local -a launcher=(env -u CODEX_THREAD_ID "$codex_bin")

  "${launcher[@]}" "$@"
}
|
|
199
|
-
|
|
200
|
-
# Describe what a recovery would act on, without a trailing newline:
# "thread <id>" once a thread id is known, "initial Codex exec" otherwise.
codex_recovery_target() {
  if [[ -z "$thread_id" ]]; then
    printf 'initial Codex exec'
  else
    printf 'thread %s' "$thread_id"
  fi
}
|
|
207
|
-
|
|
208
|
-
# Run a command under a wall-clock limit, using the interpreter in $python_bin.
#
# Arguments:
#   $1   - timeout in seconds (parsed as a float)
#   $2.. - command and its arguments
# The command runs in its own session. Its stdout and stderr are buffered
# and replayed on this function's stdout/stderr once it finishes.
# Exit status:
#   the command's own status when it finishes in time
#   64   no command was given
#   124  the limit was hit (the process group gets SIGTERM, then SIGKILL
#        two seconds later if it is still running)
#   126  the command exists but could not be executed
#   127  the command was not found
# The last two replace the Python traceback (exit 1) that a missing command
# used to produce.
run_with_timeout() {
  local timeout_seconds="${1:?timeout seconds required}"
  shift

  "$python_bin" - "$timeout_seconds" "$@" <<'PY'
import os
import signal
import subprocess
import sys

timeout_seconds = float(sys.argv[1])
argv = sys.argv[2:]

if not argv:
    sys.exit(64)


def replay(stdout, stderr):
    """Forward the captured child output unchanged."""
    if stdout:
        sys.stdout.buffer.write(stdout)
    if stderr:
        sys.stderr.buffer.write(stderr)


def signal_group(proc, signum):
    """Signal the child's whole process group; ignore an already-gone group."""
    try:
        os.killpg(proc.pid, signum)
    except ProcessLookupError:
        pass


try:
    proc = subprocess.Popen(argv, start_new_session=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except FileNotFoundError:
    sys.stderr.write("run_with_timeout: command not found: %s\n" % argv[0])
    sys.exit(127)
except PermissionError:
    sys.stderr.write("run_with_timeout: cannot execute: %s\n" % argv[0])
    sys.exit(126)

try:
    stdout, stderr = proc.communicate(timeout=timeout_seconds)
except subprocess.TimeoutExpired:
    signal_group(proc, signal.SIGTERM)
    try:
        stdout, stderr = proc.communicate(timeout=2)
    except subprocess.TimeoutExpired:
        signal_group(proc, signal.SIGKILL)
        stdout, stderr = proc.communicate()
    replay(stdout, stderr)
    sys.exit(124)

replay(stdout, stderr)
sys.exit(proc.returncode)
PY
}
|
|
254
|
-
|
|
255
|
-
# Print the size of a file in bytes.
#
# Arguments: $1 - path (required)
# Tries `stat -f %z` and then `stat -c %s`, accepting only an all-digit
# answer. If neither yields one, asks $python_bin, which prints 0 when the
# path cannot be inspected.
stat_file_size() {
  local target="${1:?path required}"
  local variant=""
  local reported=""

  for variant in 'f %z' 'c %s'; do
    # "${variant%% *}" is the option letter, "${variant#* }" the format.
    reported="$(stat "-${variant%% *}" "${variant#* }" "$target" 2>/dev/null || true)"
    if [[ "$reported" =~ ^[0-9]+$ ]]; then
      printf '%s\n' "$reported"
      return 0
    fi
  done

  "$python_bin" - "$target" <<'PY'
import os
import sys

try:
    size = os.path.getsize(sys.argv[1])
except OSError:
    size = 0
print(size)
PY
}
|
|
281
|
-
|
|
282
|
-
# Print the modification time of a file as whole seconds since the epoch.
#
# Arguments: $1 - path (required)
# Tries `stat -f %m` and then `stat -c %Y`, accepting only an all-digit
# answer. If neither yields one, asks $python_bin, which prints 0 when the
# path cannot be inspected.
stat_file_mtime() {
  local target="${1:?path required}"
  local variant=""
  local reported=""

  for variant in 'f %m' 'c %Y'; do
    # "${variant%% *}" is the option letter, "${variant#* }" the format.
    reported="$(stat "-${variant%% *}" "${variant#* }" "$target" 2>/dev/null || true)"
    if [[ "$reported" =~ ^[0-9]+$ ]]; then
      printf '%s\n' "$reported"
      return 0
    fi
  done

  "$python_bin" - "$target" <<'PY'
import os
import sys

try:
    mtime = int(os.path.getmtime(sys.argv[1]))
except OSError:
    mtime = 0
print(mtime)
PY
}
|
|
308
|
-
|
|
309
|
-
auth_fingerprint() {
|
|
310
|
-
if [[ ! -f "$auth_file" ]]; then
|
|
311
|
-
printf 'missing\n'
|
|
312
|
-
return 0
|
|
313
|
-
fi
|
|
314
|
-
|
|
315
|
-
local mtime size sha
|
|
316
|
-
mtime="$(stat_file_mtime "$auth_file" 2>/dev/null || printf '0')"
|
|
317
|
-
size="$(stat_file_size "$auth_file" 2>/dev/null || printf '0')"
|
|
318
|
-
sha="$(shasum -a 256 "$auth_file" | awk '{print $1}')"
|
|
319
|
-
printf '%s:%s:%s\n' "$mtime" "$size" "$sha"
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
# Print the label of the currently active Codex account as reported by
# `$quota_tool_bin codex list --json`.
# Prints an empty line when the quota tool is not executable, jq is not
# installed, or the lookup pipeline fails.
quota_active_label() {
  # First non-null of: tracked label, active label, label of the first
  # account flagged active; otherwise no output at all.
  local label_filter='
    .activeInfo.trackedLabel
    // .activeInfo.activeLabel
    // ([.accounts[]? | select(.isActive == true or .isNativeActive == true)][0].label)
    // empty
  '

  if [[ -x "$quota_tool_bin" ]] && command -v jq >/dev/null 2>&1; then
    "$quota_tool_bin" codex list --json 2>/dev/null \
      | jq -r "$label_filter" 2>/dev/null \
      || printf '\n'
  else
    printf '\n'
  fi
}
|
|
337
|
-
|
|
338
|
-
quota_switch_signature() {
|
|
339
|
-
if [[ ! -f "$quota_switch_state_file" ]]; then
|
|
340
|
-
printf 'missing\n'
|
|
341
|
-
return 0
|
|
342
|
-
fi
|
|
343
|
-
|
|
344
|
-
local mtime size sha
|
|
345
|
-
mtime="$(stat_file_mtime "$quota_switch_state_file" 2>/dev/null || printf '0')"
|
|
346
|
-
size="$(stat_file_size "$quota_switch_state_file" 2>/dev/null || printf '0')"
|
|
347
|
-
sha="$(shasum -a 256 "$quota_switch_state_file" | awk '{print $1}')"
|
|
348
|
-
printf '%s:%s:%s\n' "$mtime" "$size" "$sha"
|
|
349
|
-
}
|
|
350
|
-
|
|
351
|
-
# Print how many tmux sessions have a name starting with
# $issue_session_prefix or $pr_session_prefix.
# Prints 0 when tmux is not installed or lists no sessions.
running_workers() {
  command -v tmux >/dev/null 2>&1 || {
    printf '0\n'
    return 0
  }

  {
    # A failing tmux (for example, no server running) counts as "no sessions".
    tmux list-sessions -F '#S' 2>/dev/null || true
  } | awk -v issue_prefix="$issue_session_prefix" -v pr_prefix="$pr_session_prefix" '
    BEGIN { count = 0 }
    index($0, issue_prefix) == 1 || index($0, pr_prefix) == 1 { count += 1 }
    END { print count }
  '
}
|
|
363
|
-
|
|
364
|
-
# Print the value of the LAST "KEY=value" line in a multi-line payload.
#
# Arguments:
#   $1 - key (required); it is placed into an extended regular expression
#        as-is, so it is expected to be a plain identifier
#   $2 - payload text (may be empty)
# Prints nothing when no line starts with "KEY=". Always returns 0.
extract_kv_value() {
  local key="${1:?key required}"
  local payload="${2:-}"
  local matcher="^${key}=(.*)\$"
  local row=""
  local latest=""
  local seen=0

  while IFS= read -r row || [[ -n "$row" ]]; do
    if [[ "$row" =~ $matcher ]]; then
      latest="${BASH_REMATCH[1]}"
      seen=1
    fi
  done <<<"$payload"

  if (( seen )); then
    printf '%s\n' "$latest"
  fi
}
|
|
369
|
-
|
|
370
|
-
# Ask the quota manager script to switch to another Codex account after a
# failure, and record the outcome.
#
# Globals written: last_quota_switch_status, last_quota_next_retry_at,
#   last_quota_selected_label, quota_autoswitch_attempt_count.
# Output: progress lines and the manager's output go to stdout and are
#   appended to $output_file.
# Returns:
#   0   the manager reported SWITCH_DECISION "switched" or "current-ok"
#   10  the switch was deferred (reported either through SWITCH_DECISION or
#       through the manager's exit status)
#   1   auto-switch is disabled or unavailable, or the manager exited 0 with
#       an unrecognised decision
#   otherwise the manager's own non-zero exit status (124 on timeout)
run_quota_autoswitch() {
  local switch_output switch_rc errexit_snapshot
  local -a switch_cmd
  local active_workers
  local availability=""

  # Prints "ok" or the reason auto-switch cannot run. Defined here and
  # therefore (re)declared on every call.
  quota_autoswitch_unavailable_reason() {
    if [[ "$quota_autoswitch_enabled" == "0" ]]; then
      printf 'disabled\n'
    elif [[ ! -x "$quota_manager_script" ]]; then
      printf 'missing-script\n'
    elif [[ ! -x "$quota_tool_bin" ]]; then
      printf 'missing-codex-quota\n'
    elif ! command -v jq >/dev/null 2>&1; then
      printf 'missing-jq\n'
    else
      printf 'ok\n'
    fi
  }

  last_quota_next_retry_at=""
  last_quota_selected_label=""

  if [[ "$quota_autoswitch_enabled" == "0" ]]; then
    log_runner "quota auto-switch disabled; waiting for external Codex auth refresh"
    printf 'CODEX_QUOTA_AUTOSWITCH_ENABLED=0\n' | tee -a "$output_file"
    last_quota_switch_status="disabled"
    return 1
  fi

  availability="$(quota_autoswitch_unavailable_reason)"
  if [[ "$availability" != "ok" ]]; then
    log_runner "quota auto-switch unavailable (${availability}); waiting for external Codex auth refresh"
    {
      printf 'CODEX_QUOTA_MANAGER_UNAVAILABLE=yes\n'
      printf 'CODEX_QUOTA_MANAGER_REASON=%s\n' "$availability"
    } | tee -a "$output_file"
    last_quota_switch_status="unavailable"
    return 1
  fi

  quota_autoswitch_attempt_count=$((quota_autoswitch_attempt_count + 1))
  active_workers="$(running_workers)"

  switch_cmd=(env "CODEX_QUOTA_BIN=${quota_tool_bin}" bash "$quota_manager_script")
  switch_cmd+=(--trigger-reason "$last_failure_reason")
  switch_cmd+=(--current-label "${last_attempt_start_quota_label}")
  switch_cmd+=(--five-hour-threshold "$quota_threshold")
  switch_cmd+=(--weekly-threshold "$quota_weekly_threshold")
  switch_cmd+=(--running-workers "$active_workers")
  if [[ -n "$quota_prefer_label" ]]; then
    switch_cmd+=(--prefer-label "$quota_prefer_label")
  fi

  log_runner "${last_failure_reason} detected; attempting failure-driven Codex account switch"

  # Run the manager with errexit off so its status can be inspected, then
  # put errexit back the way the caller had it.
  errexit_snapshot="$-"
  set +e
  switch_output="$(run_with_timeout "$quota_timeout_seconds" "${switch_cmd[@]}" 2>&1)"
  switch_rc=$?
  if [[ "$errexit_snapshot" == *e* ]]; then
    set -e
  else
    set +e
  fi

  if [[ "$switch_rc" != "0" ]]; then
    last_quota_next_retry_at="$(extract_kv_value "NEXT_RETRY_AT" "${switch_output:-}")"
    last_quota_selected_label="$(extract_kv_value "SELECTED_LABEL" "${switch_output:-}")"
    if [[ -n "${switch_output:-}" ]]; then
      printf '%s\n' "$switch_output" | tee -a "$output_file"
    fi

    case "$switch_rc" in
      10)
        last_quota_switch_status="deferred"
        log_runner "no eligible Codex account is ready yet; waiting for the next reset window"
        return 10
        ;;
      124)
        last_quota_switch_status="failed"
        log_runner "quota auto-switch timed out after ${quota_timeout_seconds}s"
        ;;
      *)
        last_quota_switch_status="failed"
        log_runner "quota auto-switch exited with status ${switch_rc}"
        ;;
    esac
    return "$switch_rc"
  fi

  last_quota_switch_status="$(extract_kv_value "SWITCH_DECISION" "$switch_output")"
  last_quota_next_retry_at="$(extract_kv_value "NEXT_RETRY_AT" "$switch_output")"
  last_quota_selected_label="$(extract_kv_value "SELECTED_LABEL" "$switch_output")"
  if [[ -n "$switch_output" ]]; then
    printf '%s\n' "$switch_output" | tee -a "$output_file"
  fi

  case "$last_quota_switch_status" in
    switched|current-ok)
      return 0
      ;;
    deferred)
      return 10
      ;;
    *)
      last_quota_switch_status="failed"
      return 1
      ;;
  esac
}
|
|
475
|
-
|
|
476
|
-
# Print everything in $output_file that lies beyond a byte offset.
#
# Arguments: $1 - byte offset that marks the start of the new output (required)
# Prints nothing and returns 0 when the file has not grown past the offset.
new_output_since() {
  local offset="${1:?start size required}"
  local current_size

  current_size="$(stat_file_size "$output_file" 2>/dev/null || printf '0')"
  if (( current_size > offset )); then
    # tail -c +N is 1-based, hence offset + 1.
    tail -c "+$((offset + 1))" "$output_file"
  fi
}
|
|
485
|
-
|
|
486
|
-
# Refresh $thread_id from the log output written after a byte offset.
#
# Arguments: $1 - byte offset into $output_file (required)
# The new output is passed through extract_thread_id. $thread_id is replaced
# only when that yields a non-empty id; a failing extraction is ignored.
update_thread_id_from_output() {
  local offset="${1:?start size required}"
  local candidate

  candidate="$(new_output_since "$offset" | extract_thread_id || true)"
  [[ -z "$candidate" ]] || thread_id="$candidate"
}
|
|
495
|
-
|
|
496
|
-
# Print the thread id carried by a single "thread.started" JSON event line.
#
# Arguments: $1 - one line of Codex output
# The match is textual and needs "type":"thread.started" to appear before
# "thread_id" on the line. Prints nothing for any other line. Always
# returns 0.
extract_thread_id_from_line() {
  local line="${1:-}"
  local event_pattern='.*"type"[[:space:]]*:[[:space:]]*"thread\.started".*"thread_id"[[:space:]]*:[[:space:]]*"([^"]+)".*'

  if [[ "$line" =~ $event_pattern ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  fi
}
|
|
500
|
-
|
|
501
|
-
# Record a thread id as soon as it shows up in the Codex output stream.
#
# Arguments: $1 - one line of Codex output
# When the line carries a thread id that differs from $thread_id, the global
# is updated and the runner state is rewritten as "running". Any other line
# is a no-op.
persist_thread_id_from_line() {
  local line="${1:-}"
  local candidate=""

  candidate="$(extract_thread_id_from_line "$line")"
  if [[ -z "$candidate" || "$candidate" == "$thread_id" ]]; then
    return 0
  fi

  thread_id="$candidate"
  write_state "running" ""
}
|
|
511
|
-
|
|
512
|
-
# Stop a producer process and its direct children.
#
# Arguments: $1 - pid of the producer (required)
# Sends SIGTERM to the children and to the process itself, polls until the
# process is gone or a roughly two-second deadline passes, then sends
# SIGKILL to whatever is left. Returns 0, including when the process was
# already gone.
terminate_codex_producer_tree() {
  local target_pid="${1:?pid required}"
  local give_up_at=""

  kill -0 "$target_pid" 2>/dev/null || return 0

  pkill -TERM -P "$target_pid" 2>/dev/null || true
  kill "$target_pid" 2>/dev/null || true

  give_up_at=$(( $(date +%s) + 2 ))
  until ! kill -0 "$target_pid" 2>/dev/null || (( $(date +%s) >= give_up_at )); do
    sleep 0.1
  done

  if kill -0 "$target_pid" 2>/dev/null; then
    pkill -KILL -P "$target_pid" 2>/dev/null || true
    kill -9 "$target_pid" 2>/dev/null || true
  fi
}
|
|
536
|
-
|
|
537
|
-
# Run one Codex attempt and stream its output into the run log.
#
# Arguments:
#   $1 - "initial" (start a new exec fed from $prompt_file) or "resume"
#        (resume $thread_id, feeding it the output of resume_prompt)
# Globals read: mode, safe_profile, bypass_profile, prompt_file, thread_id,
#   output_file, host_run_dir, codex_progress_heartbeat_seconds,
#   codex_stall_seconds.
# Globals written: last_attempt_start_size, last_attempt_started_epoch,
#   last_exit_code, and thread_id (through the persist/update helpers).
#
# Codex runs in a background subshell writing to a FIFO. This shell reads
# the FIFO line by line, tees each line to stdout and $output_file, touches
# a progress marker and records any thread id it sees. A second background
# subshell wakes up every $codex_progress_heartbeat_seconds to log a
# heartbeat and, when $codex_stall_seconds is greater than zero, terminates
# Codex if no output (or no further output) arrived within that time.
#
# The FIFO lives inside a private `mktemp -d` directory: a name obtained
# from `mktemp -u` is only a suggestion and can be claimed by someone else
# before mkfifo runs. An unknown phase is rejected before any resource is
# created (the script exits 1, as before).
# Errexit is switched on again before returning, as in the original.
stream_codex_exec() {
  local phase="${1:?phase required}"
  local stream_dir=""
  local stream_fifo=""
  local producer_pid=""
  local heartbeat_pid=""
  local progress_file=""
  local line=""

  case "$phase" in
    initial|resume) ;;
    *)
      echo "unknown codex exec phase: $phase" >&2
      exit 1
      ;;
  esac

  last_attempt_start_size="$(stat_file_size "$output_file" 2>/dev/null || printf '0')"
  last_attempt_started_epoch="$(date +%s)"
  progress_file="${host_run_dir}/.codex-progress.$$"
  rm -f "$progress_file"

  if ! stream_dir="$(mktemp -d "${TMPDIR:-/tmp}/codex-stream.XXXXXX")"; then
    echo "unable to create a private directory for the codex stream fifo" >&2
    return 1
  fi
  stream_fifo="${stream_dir}/stream.fifo"
  if ! mkfifo "$stream_fifo"; then
    rm -rf -- "$stream_dir"
    echo "unable to create codex stream fifo: ${stream_fifo}" >&2
    return 1
  fi

  # Producer: the Codex process itself, stdout and stderr merged into the FIFO.
  (
    case "${phase}:${mode}" in
      initial:safe)
        run_codex_command exec --json --profile "$safe_profile" --full-auto <"$prompt_file"
        ;;
      initial:bypass)
        run_codex_command exec --json --profile "$bypass_profile" --dangerously-bypass-approvals-and-sandbox <"$prompt_file"
        ;;
      resume:safe)
        resume_prompt | run_codex_command exec resume --json --full-auto "$thread_id" -
        ;;
      resume:bypass)
        resume_prompt | run_codex_command exec resume --json --dangerously-bypass-approvals-and-sandbox "$thread_id" -
        ;;
    esac
  ) >"$stream_fifo" 2>&1 &
  producer_pid="$!"

  # Heartbeat / stall watchdog.
  (
    local now elapsed last_progress_epoch idle_for
    while kill -0 "$producer_pid" 2>/dev/null; do
      sleep "$codex_progress_heartbeat_seconds"
      if ! kill -0 "$producer_pid" 2>/dev/null; then
        break
      fi
      now="$(date +%s)"
      elapsed=$((now - last_attempt_started_epoch))
      if (( codex_stall_seconds > 0 )); then
        if [[ ! -f "$progress_file" ]]; then
          # No output at all since the attempt started.
          if (( elapsed >= codex_stall_seconds )); then
            write_state "running" ""
            log_runner "stale-run no-codex-output-before-stall-threshold elapsed=${elapsed}s"
            terminate_codex_producer_tree "$producer_pid"
            break
          fi
        else
          # Output was seen before; measure the time since the last line.
          last_progress_epoch="$(stat_file_mtime "$progress_file" 2>/dev/null || printf '0')"
          if [[ -n "$last_progress_epoch" && "$last_progress_epoch" != "0" ]]; then
            idle_for=$((now - last_progress_epoch))
            if (( idle_for >= codex_stall_seconds )); then
              write_state "running" ""
              log_runner "stale-run no-codex-progress-before-stall-threshold elapsed=${elapsed}s idle=${idle_for}s"
              terminate_codex_producer_tree "$producer_pid"
              break
            fi
          fi
        fi
      fi
      write_state "running" ""
      log_runner "heartbeat waiting-for-codex-output elapsed=${elapsed}s"
    done
  ) &
  heartbeat_pid="$!"

  # Reader: ends when the producer closes the FIFO.
  while IFS= read -r line || [[ -n "$line" ]]; do
    printf '%s\n' "$line" | tee -a "$output_file"
    touch "$progress_file" 2>/dev/null || true
    persist_thread_id_from_line "$line"
  done <"$stream_fifo"

  if [[ -n "$heartbeat_pid" ]] && kill -0 "$heartbeat_pid" 2>/dev/null; then
    # Stop the in-flight `sleep` first; otherwise it outlives the heartbeat
    # and keeps this script's output descriptors open for up to one interval.
    pkill -TERM -P "$heartbeat_pid" 2>/dev/null || true
    kill "$heartbeat_pid" 2>/dev/null || true
    wait "$heartbeat_pid" 2>/dev/null || true
  fi

  rm -rf -- "$stream_dir"
  rm -f "$progress_file"

  set +e
  wait "$producer_pid" 2>/dev/null
  last_exit_code="$?"
  set -e

  # Fallback: re-scan this attempt's output in case the line-by-line match
  # missed the thread id.
  update_thread_id_from_output "$last_attempt_start_size"
}
|
|
642
|
-
|
|
643
|
-
# Read Codex JSON-lines output on stdin and print the thread id of the LAST
# "thread.started" event, without a trailing newline.
# Lines that do not start with "{" or that fail to parse as JSON are skipped.
# Prints nothing when no such event carries a thread id.
extract_thread_id() {
  "$python_bin" -c '
import json
import sys

latest = ""
for raw in sys.stdin:
    text = raw.strip()
    if not text.startswith("{"):
        continue
    try:
        event = json.loads(text)
    except Exception:
        continue
    if event.get("type") != "thread.started":
        continue
    candidate = event.get("thread_id")
    if candidate:
        latest = str(candidate)

if latest:
    sys.stdout.write(latest)
'
}
|
|
664
|
-
|
|
665
|
-
# Classify why the most recent Codex attempt failed.
#
# Only the last 120 lines of the supplied text are inspected. Patterns are
# checked in a fixed priority order and the first match wins:
#   1. no-codex-output-before-stall-threshold
#   2. no-codex-progress-before-stall-threshold
#   3. usage-limit
#   4. auth-401
#   5. account-banned
#   6. auth-failure
# If nothing matches and last_exit_code is non-empty and not "0", the generic
# token "worker-exit-failed" is printed. Otherwise nothing is printed.
#
# Globals:   last_exit_code (read; treated as empty when unset)
# Arguments: $1 - output text produced by the failed attempt (may be empty)
# Outputs:   at most one reason token, newline-terminated, on stdout
# Returns:   0
classify_failure_reason() {
  local chunk="${1:-}"
  local recent_chunk

  recent_chunk="$(tail -n 120 <<<"$chunk")"

  if grep -Eiq 'no-codex-output-before-stall-threshold' <<<"$recent_chunk"; then
    printf 'no-codex-output-before-stall-threshold\n'
    return 0
  fi

  if grep -Eiq 'no-codex-progress-before-stall-threshold' <<<"$recent_chunk"; then
    printf 'no-codex-progress-before-stall-threshold\n'
    return 0
  fi

  if grep -Eiq "You've hit your usage limit|You have reached your Codex usage limits|visit https://chatgpt.com/codex/settings/usage|Upgrade to Pro|rate limit exceeded|quota exceeded|usage cap (reached|exceeded)|usage quota (reached|exceeded)" <<<"$recent_chunk"; then
    printf 'usage-limit\n'
    return 0
  fi

  # The status code must not be the tail of a longer number: a bare "401"
  # alternative with no left boundary would also match "1401 ms" or
  # "2401 bytes" and misreport an unrelated failure as an auth error.
  if grep -Eiq "(^|[^0-9])401([^0-9]|$)|unauthorized|invalid credentials|invalid api key|authentication failed with status 401|received 401" <<<"$recent_chunk"; then
    printf 'auth-401\n'
    return 0
  fi

  if grep -Eiq "account (is )?(banned|suspended|disabled)|access revoked|account revoked|forbidden due to policy|account blocked|policy violation" <<<"$recent_chunk"; then
    printf 'account-banned\n'
    return 0
  fi

  if grep -Eiq "Authentication required|Please log in|Please login|Please authenticate|login required|run codex login|codex login required|logged out|not logged in|expired session|session expired|token expired|reauthenticate|unauthenticated|auth(entication)? failed|credentials expired" <<<"$recent_chunk"; then
    printf 'auth-failure\n'
    return 0
  fi

  # No recognizable signature: fall back to the generic token, but only when
  # the worker actually reported a non-zero exit status.
  if [[ -n "${last_exit_code:-}" && "${last_exit_code:-}" != "0" ]]; then
    printf 'worker-exit-failed\n'
  fi
}
|
|
705
|
-
|
|
706
|
-
# Decide whether a failed attempt stalled before any item activity.
#
# Looks at the last 120 lines of the supplied text and succeeds only when
# both a "thread.started" and a "turn.started" event are present while no
# "item.started", "item.completed" or "turn.completed" event is.
#
# Arguments: $1 - output text produced by the failed attempt (may be empty)
# Returns:   0 when the chunk matches the startup-stall shape, 1 otherwise
failure_chunk_indicates_startup_stall() {
  local chunk="${1:-}"
  local tail_text

  tail_text="$(tail -n 120 <<<"$chunk")"

  if grep -q '"type":"thread.started"' <<<"$tail_text" \
    && grep -q '"type":"turn.started"' <<<"$tail_text" \
    && ! grep -Eq '"type":"item\.(started|completed)"' <<<"$tail_text" \
    && ! grep -q '"type":"turn.completed"' <<<"$tail_text"; then
    return 0
  fi

  return 1
}
|
|
721
|
-
|
|
722
|
-
# Emit the prompt text used when resuming an interrupted Codex thread.
#
# Globals: host_run_dir (read), sandbox_run_dir (read)
# Outputs: the multi-line prompt on stdout, one trailing newline per line
resume_prompt() {
  # '%s\n' is used as the format so that lines beginning with "-" are treated
  # as data rather than as printf options.
  printf '%s\n' \
    "The previous Codex exec turn in this same thread was interrupted because the host refreshed Codex authentication after a quota or auth failure." \
    "" \
    "Continue the same task from the next unfinished step only. Do not restart completed work unless you need to verify or repair it." \
    "" \
    "If you need to reorient, inspect the current git status plus the existing run artifacts in:" \
    "- Host run dir: ${host_run_dir}" \
    "- Sandbox run dir: ${sandbox_run_dir}"
}
|
|
733
|
-
|
|
734
|
-
# Probe the Codex login state.
#
# Runs `run_codex_command login status` with stdout and stderr discarded.
#
# Returns: the exit status of that command (0 is treated as healthy by callers)
codex_login_healthy() {
  run_codex_command login status &>/dev/null
}
|
|
737
|
-
|
|
738
|
-
# Block until Codex authentication looks usable again, or a timeout expires.
#
# Polls in a loop. On every pass the current auth fingerprint is recorded in
# last_auth_fingerprint. The loop resumes (returns 0) when codex_login_healthy
# succeeds and, for quota-style triggers (usage-limit, auth-401,
# account-banned), at least one of the following also holds, checked in this
# order:
#   - the auth fingerprint differs from the baseline
#   - both quota labels are non-empty and differ
#   - both switch signatures are non-empty and differ
#   - last_quota_switch_status is "switched" and a quota label is present
# For any other trigger a healthy login alone is sufficient.
#
# Globals:
#   auth_refresh_timeout_seconds, auth_refresh_poll_seconds (read)
#   last_quota_switch_status (read)
#   auth_wait_started_at, last_trigger_reason, last_auth_fingerprint (written)
#   last_failure_reason (written on timeout)
# Arguments:
#   $1 - baseline auth fingerprint (required)
#   $2 - trigger reason (required)
#   $3 - baseline quota label (optional)
#   $4 - baseline quota switch signature (optional)
# Returns:
#   0 after writing state "running"; 1 after writing state "failed" with
#   reason "auth-refresh-timeout"
wait_for_auth_refresh() {
  local baseline_fingerprint="${1:?baseline fingerprint required}"
  local trigger_reason="${2:?trigger reason required}"
  local baseline_quota_label="${3:-}"
  local baseline_switch_signature="${4:-}"
  local deadline now remaining sleep_seconds
  local current_fingerprint current_quota_label current_switch_signature
  local recovery_target resume_message

  recovery_target="$(codex_recovery_target)"
  auth_wait_started_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  last_trigger_reason="$trigger_reason"
  write_state "waiting-auth-refresh" "$trigger_reason"

  deadline=$(( $(date +%s) + auth_refresh_timeout_seconds ))
  while true; do
    current_fingerprint="$(auth_fingerprint)"
    last_auth_fingerprint="$current_fingerprint"

    # A non-empty resume_message means "recovery detected on this pass".
    resume_message=""
    case "$trigger_reason" in
      usage-limit | auth-401 | account-banned)
        current_quota_label="$(quota_active_label)"
        current_switch_signature="$(quota_switch_signature)"
        if codex_login_healthy; then
          if [[ "$current_fingerprint" != "$baseline_fingerprint" ]]; then
            resume_message="detected refreshed Codex auth after quota interruption; resuming ${recovery_target}"
          elif [[ -n "$baseline_quota_label" && -n "$current_quota_label" && "$current_quota_label" != "$baseline_quota_label" ]]; then
            resume_message="detected rotated Codex quota account (${baseline_quota_label} -> ${current_quota_label}); resuming ${recovery_target}"
          elif [[ -n "$baseline_switch_signature" && -n "$current_switch_signature" && "$current_switch_signature" != "$baseline_switch_signature" ]]; then
            resume_message="detected quota switch state refresh; resuming ${recovery_target}"
          elif [[ "$last_quota_switch_status" == "switched" && -n "$current_quota_label" ]]; then
            resume_message="quota manager reports healthy Codex account ${current_quota_label}; resuming ${recovery_target}"
          fi
        fi
        ;;
      *)
        if codex_login_healthy; then
          if [[ "$current_fingerprint" == "$baseline_fingerprint" ]]; then
            resume_message="Codex auth is healthy again; resuming ${recovery_target}"
          else
            resume_message="detected refreshed Codex auth; resuming ${recovery_target}"
          fi
        fi
        ;;
    esac

    if [[ -n "$resume_message" ]]; then
      log_runner "$resume_message"
      auth_wait_started_at=""
      write_state "running" ""
      return 0
    fi

    now="$(date +%s)"
    if (( now >= deadline )); then
      last_failure_reason="auth-refresh-timeout"
      write_state "failed" "$last_failure_reason"
      return 1
    fi

    # Sleep for the poll interval, clamped to the time left before the
    # deadline and never less than one second.
    remaining=$(( deadline - now ))
    sleep_seconds="$auth_refresh_poll_seconds"
    if (( sleep_seconds > remaining )); then
      sleep_seconds=$remaining
    fi
    if (( sleep_seconds < 1 )); then
      sleep_seconds=1
    fi
    sleep "$sleep_seconds"
  done
}
|
|
823
|
-
|
|
824
|
-
# Start a fresh Codex exec by delegating to stream_codex_exec in "initial" mode.
#
# Returns: the exit status of stream_codex_exec
run_initial_exec() {
  stream_codex_exec "initial"
}
|
|
827
|
-
|
|
828
|
-
# Continue an existing Codex thread by delegating to stream_codex_exec in
# "resume" mode.
#
# Returns: the exit status of stream_codex_exec
run_resume_exec() {
  stream_codex_exec "resume"
}
|
|
831
|
-
|
|
832
|
-
# Run one Codex attempt (initial or resume) and decide what happens next.
#
# Flow:
#   1. Bump the attempt counter, snapshot the active quota label and write
#      state "running".
#   2. Run the initial exec when thread_id is empty, otherwise the resume exec.
#   3. On exit code "0": write state "succeeded" and return 0.
#   4. Otherwise classify the new output and dispatch on the reason:
#      - usage-limit / auth-failure / auth-401 / account-banned: give up when
#        the resume budget is spent; for everything except auth-failure try a
#        quota rotation first (bounded by max_quota_autoswitch_attempts, and
#        aborted when the rotation returns 10); then wait for refreshed auth.
#      - stall reasons: when the output looks like a startup stall and the
#        rotation budget allows it, rotate the account and retry from a fresh
#        thread; otherwise fail.
#      - anything else: fail.
#
# Globals (read): thread_id, resume_count, max_resume_attempts,
#   quota_autoswitch_attempt_count, max_quota_autoswitch_attempts,
#   last_exit_code, last_attempt_start_size, last_quota_next_retry_at
# Globals (written): attempt, last_quota_switch_status,
#   last_attempt_start_quota_label, last_failure_reason, last_auth_fingerprint,
#   resume_count, thread_id, auth_wait_started_at
# Returns:
#   0 - attempt succeeded
#   1 - attempt failed for good (state "failed" has been written, or
#       wait_for_auth_refresh reported failure)
#   2 - caller should invoke attempt_run again
attempt_run() {
  local reason failure_chunk startup_stall
  local auth_before_switch quota_label_before_switch quota_switch_signature_before_switch
  local quota_switch_result shell_flags_before_quota_switch

  attempt=$((attempt + 1))
  last_quota_switch_status=""
  last_attempt_start_quota_label="$(quota_active_label)"
  write_state "running" ""

  if [[ -n "$thread_id" ]]; then
    log_runner "resuming Codex thread ${thread_id} (resume ${resume_count}/${max_resume_attempts})"
    run_resume_exec
  else
    log_runner "starting Codex exec attempt ${attempt}"
    run_initial_exec
  fi

  if [[ "${last_exit_code}" == "0" ]]; then
    last_failure_reason=""
    write_state "succeeded" ""
    return 0
  fi

  failure_chunk="$(new_output_since "$last_attempt_start_size")"
  reason="$(classify_failure_reason "$failure_chunk")"
  last_failure_reason="${reason:-worker-exit-failed}"

  # Only stall-type failures are examined for the startup-stall shape.
  startup_stall="no"
  case "$last_failure_reason" in
    no-codex-output-before-stall-threshold | no-codex-progress-before-stall-threshold)
      if failure_chunk_indicates_startup_stall "$failure_chunk"; then
        startup_stall="yes"
      fi
      ;;
  esac

  case "$last_failure_reason" in
    usage-limit | auth-failure | auth-401 | account-banned)
      if (( resume_count >= max_resume_attempts )); then
        last_failure_reason="resume-attempts-exhausted"
        write_state "failed" "$last_failure_reason"
        return 1
      fi

      auth_before_switch="$(auth_fingerprint)"
      quota_label_before_switch="$last_attempt_start_quota_label"
      quota_switch_signature_before_switch="$(quota_switch_signature)"
      last_auth_fingerprint="$auth_before_switch"

      # A plain auth-failure skips rotation and goes straight to waiting.
      case "$last_failure_reason" in
        usage-limit | auth-401 | account-banned)
          if (( quota_autoswitch_attempt_count >= max_quota_autoswitch_attempts )); then
            log_runner "automatic Codex quota switching already ran ${quota_autoswitch_attempt_count} time(s) in this worker; refusing another rotation"
            last_failure_reason="quota-switch-attempt-limit"
            write_state "failed" "$last_failure_reason"
            return 1
          fi

          write_state "switching-account" "$last_failure_reason"

          # Run the rotation with errexit off so its status can be captured,
          # then put errexit back the way it was.
          shell_flags_before_quota_switch="$-"
          set +e
          run_quota_autoswitch
          quota_switch_result=$?
          if [[ "$shell_flags_before_quota_switch" == *e* ]]; then
            set -e
          else
            set +e
          fi

          if [[ "$quota_switch_result" == "10" ]]; then
            log_runner "quota manager deferred rotation until ${last_quota_next_retry_at:-unknown}; automatic timed re-tries are disabled for safety"
            last_failure_reason="quota-switch-deferred"
            write_state "failed" "$last_failure_reason"
            return 1
          fi
          ;;
      esac

      if wait_for_auth_refresh "$auth_before_switch" "$last_failure_reason" "$quota_label_before_switch" "$quota_switch_signature_before_switch"; then
        resume_count=$((resume_count + 1))
        return 2
      fi
      return 1
      ;;

    no-codex-output-before-stall-threshold | no-codex-progress-before-stall-threshold)
      if [[ "$startup_stall" == "yes" ]] && (( quota_autoswitch_attempt_count < max_quota_autoswitch_attempts )); then
        auth_before_switch="$(auth_fingerprint)"
        quota_label_before_switch="$last_attempt_start_quota_label"
        quota_switch_signature_before_switch="$(quota_switch_signature)"
        last_auth_fingerprint="$auth_before_switch"
        write_state "switching-account" "$last_failure_reason"
        log_runner "startup-stall detected before first Codex tool activity; attempting Codex account rotation"

        shell_flags_before_quota_switch="$-"
        set +e
        run_quota_autoswitch
        quota_switch_result=$?
        if [[ "$shell_flags_before_quota_switch" == *e* ]]; then
          set -e
        else
          set +e
        fi

        case "$quota_switch_result" in
          0)
            # Rotation worked: forget the stalled thread and start over.
            thread_id=""
            auth_wait_started_at=""
            write_state "running" ""
            return 2
            ;;
          10)
            log_runner "startup-stall rotation deferred until ${last_quota_next_retry_at:-unknown}"
            last_failure_reason="quota-switch-deferred"
            write_state "failed" "$last_failure_reason"
            return 1
            ;;
        esac
      fi

      write_state "failed" "$last_failure_reason"
      return 1
      ;;

    *)
      write_state "failed" "$last_failure_reason"
      return 1
      ;;
  esac
}
|
|
945
|
-
|
|
946
|
-
# Driver: keep calling attempt_run until it reports a final outcome.
#   status 0 -> exit 0
#   status 2 -> run another attempt
#   other    -> exit with the worker's last exit code (1 when unset/empty)
write_state "running" ""

while true; do
  # errexit is switched off around the call so a non-zero status can be
  # inspected instead of terminating the script.
  set +e
  attempt_run
  attempt_status=$?
  set -e

  case "$attempt_status" in
    0)
      exit 0
      ;;
    2)
      continue
      ;;
    *)
      exit "${last_exit_code:-1}"
      ;;
  esac
done
|