agent-control-plane 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,8 +75,167 @@ QUOTA_LOCK_DIR="${STATE_ROOT}/quota-preflight.lock"
75
75
  QUOTA_PID_FILE="${QUOTA_LOCK_DIR}/pid"
76
76
  python_bin="$(flow_resolve_python_bin || true)"
77
77
 
78
# Stale lock detection and cleanup.
#
# Inspects the scheduler lock directories under STATE_ROOT and removes a lock
# directory when either:
#   * the PID recorded in <lock>/pid is alive but its parent PID is 1
#     (treated as an orphan), or
#   * the recorded PID is missing, malformed or no longer running AND the pid
#     file is older than the given maximum age.
#
# Globals:   STATE_ROOT (read)
# Arguments: $1 - maximum lock age in seconds (default 1800 = 30 minutes)
# Outputs:   one warning line on stdout per removed lock; one
#            "stale_lock_detected" event per removed lock via log_event
# Returns:   0
cleanup_stale_locks() {
  local max_age_seconds=${1:-1800} # default 30 minutes
  local lock_dir pid_file pid ppid now mtime lock_age
  local lock_dirs=(
    "${STATE_ROOT}/heartbeat-loop.lock"
    "${STATE_ROOT}/quota-preflight.lock"
  )

  for lock_dir in "${lock_dirs[@]}"; do
    pid_file="${lock_dir}/pid"
    [[ -f "$pid_file" ]] || continue

    pid=$(tr -d '[:space:]' 2>/dev/null <"$pid_file" || true)

    # Only a purely numeric PID is probed; anything else (empty, negative,
    # garbage) is handled as "owner not running" and falls to the age check.
    if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null; then
      # Process is still running, check if parent is init (orphan).
      # `|| true`: the process may exit between kill -0 and ps; that must not
      # abort a caller running under `set -e -o pipefail`.
      ppid=$(ps -p "$pid" -o ppid= 2>/dev/null | tr -d '[:space:]' || true)
      if [[ "$ppid" == "1" ]]; then
        log_event "stale_lock_detected" "type" "orphan" "pid" "$pid" "lock_dir" "$lock_dir"
        echo "Warning: Removing orphan lock (PID $pid, lock: $lock_dir)"
        rm -rf -- "${lock_dir:?}"
      fi
      continue
    fi

    # Process not running, check lock age.
    # GNU/busybox syntax (-c %Y) must be tried first: on GNU coreutils
    # `stat -f` means "file system status", so `stat -f %m FILE` prints
    # unrelated text to stdout before failing, which would corrupt the value.
    # BSD/macOS stat rejects -c without writing to stdout, so the fallback to
    # `-f %m` is clean there.
    mtime=$(stat -c %Y "$pid_file" 2>/dev/null \
      || stat -f %m "$pid_file" 2>/dev/null \
      || true)

    # If the mtime cannot be determined (e.g. the file vanished because the
    # lock was just released), leave the lock alone rather than deleting it.
    [[ "$mtime" =~ ^[0-9]+$ ]] || continue

    now=$(date +%s)
    lock_age=$((now - mtime))
    if [[ $lock_age -gt $max_age_seconds ]]; then
      log_event "stale_lock_detected" "type" "timeout" "pid" "$pid" "age_seconds" "$lock_age" "lock_dir" "$lock_dir"
      echo "Warning: Removing stale lock (PID $pid, age: ${lock_age}s, lock: $lock_dir)"
      rm -rf -- "${lock_dir:?}"
    fi
  done

  return 0
}
112
+
113
+ # Structured logging for scheduler observability
114
+ LOG_FILE="${STATE_ROOT}/scheduler-events.jsonl"
115
+ mkdir -p "$(dirname "${LOG_FILE}")"
116
+
117
# Escape a string so it can be embedded inside a JSON string literal.
# The result is returned in the global _LOG_EVENT_ESCAPED (no subshell, so
# trailing newlines in the input are preserved and escaped).
_log_event_json_escape() {
  local s=${1-}
  s=${s//\\/\\\\}      # backslash first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  _LOG_EVENT_ESCAPED=$s
}

# Append one structured event as a single JSON line to LOG_FILE.
#
# Globals:   LOG_FILE (read; appended to)
# Arguments: $1      - event name
#            $2..$n  - alternating key / value pairs; every value is written
#                      as a JSON string. A trailing key without a value is
#                      written with an empty string value.
# Outputs:   nothing on stdout; one line appended to LOG_FILE of the form
#            {"timestamp": "<UTC ISO-8601>", "event": "<name>", "pid": <$$>, "k": "v", ...}
log_event() {
  local event_type=${1-}
  if [[ $# -gt 0 ]]; then
    shift
  fi

  local timestamp key value
  timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"

  _log_event_json_escape "$event_type"
  event_type=$_LOG_EVENT_ESCAPED

  local extra_fields=""
  while [[ $# -gt 0 ]]; do
    _log_event_json_escape "$1"
    key=$_LOG_EVENT_ESCAPED
    _log_event_json_escape "${2-}"
    value=$_LOG_EVENT_ESCAPED
    extra_fields+=", \"${key}\": \"${value}\""

    # `shift 2` fails WITHOUT shifting when only one argument is left, which
    # would spin this loop forever; consume the dangling key explicitly.
    if [[ $# -ge 2 ]]; then
      shift 2
    else
      shift
    fi
  done

  printf '{"timestamp": "%s", "event": "%s", "pid": %s%s}\n' \
    "$timestamp" "$event_type" "$$" "$extra_fields" >>"${LOG_FILE}"
}
129
+
130
# Health check: monitor system resources.
#
# Samples CPU load, memory usage and disk usage of STATE_ROOT, logs them as a
# "system_resources" event, and logs a "resource_warning" event for each value
# above its threshold (cpu > 80, memory > 90, disk > 90; all in percent).
# Any reading that is unavailable or not a plain non-negative integer is
# reported as 0.
#
# Globals:   STATE_ROOT (read)
# Arguments: none
# Outputs:   events via log_event
# Returns:   0 when no threshold is exceeded, 1 when at least one is.
#            Callers running under `set -e` must guard the call
#            (e.g. `check_system_resources || true`).
check_system_resources() {
  local cpu_usage="" mem_usage="" disk_usage=""
  local load_1min cores
  local warn=0

  # CPU usage (1-min load average / number of cores)
  if command -v nproc >/dev/null 2>&1 && command -v awk >/dev/null 2>&1; then
    load_1min=$(awk '{print $1; exit}' /proc/loadavg 2>/dev/null || true)
    cores=$(nproc 2>/dev/null || echo "1")
    cpu_usage=$(awk -v load="${load_1min:-0}" -v cores="${cores:-1}" \
      'BEGIN { if (cores + 0 <= 0) cores = 1; printf "%.0f", (load / cores) * 100 }' \
      2>/dev/null || true)
  fi

  # Memory usage
  if command -v free >/dev/null 2>&1; then
    mem_usage=$(free 2>/dev/null \
      | awk '/Mem:/ && $2 > 0 {printf "%.0f", ($3/$2)*100; exit}' 2>/dev/null || true)
  elif [[ -f /proc/meminfo ]]; then
    mem_usage=$(awk '
      /MemTotal/     { total = $2 }
      /MemAvailable/ { avail = $2 }
      END { if (total > 0) printf "%.0f", ((total - avail) / total) * 100 }
    ' /proc/meminfo 2>/dev/null || true)
  fi

  # Disk usage for STATE_ROOT. -P forces the POSIX one-line-per-filesystem
  # layout so a long device name cannot push the capacity column off line 2.
  disk_usage=$(df -P "${STATE_ROOT}" 2>/dev/null \
    | awk 'NR==2 {gsub(/%/,"",$5); print $5}' || true)

  # Anything that is not a plain integer (empty, "-", "nan", "inf", multi-line
  # output) would break the numeric comparisons below and pollute the log.
  [[ "$cpu_usage" =~ ^[0-9]+$ ]] || cpu_usage=0
  [[ "$mem_usage" =~ ^[0-9]+$ ]] || mem_usage=0
  [[ "$disk_usage" =~ ^[0-9]+$ ]] || disk_usage=0

  # Log resource status
  log_event "system_resources" "cpu_pct" "${cpu_usage}" "mem_pct" "${mem_usage}" "disk_pct" "${disk_usage}"

  # Warnings
  if [[ "${cpu_usage}" -gt 80 ]]; then
    log_event "resource_warning" "type" "cpu" "value" "${cpu_usage}"
    warn=1
  fi
  if [[ "${mem_usage}" -gt 90 ]]; then
    log_event "resource_warning" "type" "memory" "value" "${mem_usage}"
    warn=1
  fi
  if [[ "${disk_usage}" -gt 90 ]]; then
    log_event "resource_warning" "type" "disk" "value" "${disk_usage}"
    warn=1
  fi

  return "$warn"
}
176
+
177
# Collect scheduler metrics for observability.
#
# Gathers four counters and logs them as one "scheduler_metrics" event:
#   active_sessions  - tmux session lines containing "agent-"
#   queued_issues    - open issues in REPO_SLUG labelled "agent-keep-open",
#                      as returned by `gh issue list`
#                      (NOTE(review): gh applies its own default result limit,
#                      so this is capped unless --limit is added — confirm the
#                      intended ceiling)
#   completed_today  - *.json files under HISTORY_ROOT modified in the last
#                      24h that contain '"status": "completed"'
#   failed_today     - same, for '"status": "failed"'
# Every counter falls back to 0 when its tool is missing or fails.
#
# Globals:   REPO_SLUG (read, optional), HISTORY_ROOT (read, optional)
# Arguments: none
# Outputs:   one event via log_event
collect_metrics() {
  local active_sessions=0
  local queued_issues=0
  local completed_today=0
  local failed_today=0

  # Count active tmux sessions.
  # grep -c already prints "0" when nothing matches (and exits 1), so the
  # fallback must not print a second value; `|| true` only swallows the status.
  if command -v tmux >/dev/null 2>&1; then
    active_sessions=$(tmux ls 2>/dev/null | grep -c "agent-" || true)
  fi

  # Count open issues carrying the agent-keep-open label.
  # gh emits the JSON array on a single line, so counting matching lines is
  # wrong; let gh's built-in jq report the array length instead.
  if command -v gh >/dev/null 2>&1 && [[ -n "${REPO_SLUG:-}" ]]; then
    queued_issues=$(gh issue list --repo "${REPO_SLUG}" --label "agent-keep-open" \
      --state open --json number --jq 'length' 2>/dev/null || true)
  fi

  # Count completed/failed sessions from history (last 24h).
  # `-exec … {} +` hands file names to grep without word splitting, so paths
  # containing spaces are counted correctly.
  if [[ -d "${HISTORY_ROOT:-}" ]]; then
    completed_today=$(find "${HISTORY_ROOT}" -name "*.json" -mtime 0 \
      -exec grep -l '"status": "completed"' {} + 2>/dev/null \
      | wc -l | tr -d '[:space:]' || true)
    failed_today=$(find "${HISTORY_ROOT}" -name "*.json" -mtime 0 \
      -exec grep -l '"status": "failed"' {} + 2>/dev/null \
      | wc -l | tr -d '[:space:]' || true)
  fi

  # Guarantee single-line integer values in the log.
  [[ "$active_sessions" =~ ^[0-9]+$ ]] || active_sessions=0
  [[ "$queued_issues" =~ ^[0-9]+$ ]] || queued_issues=0
  [[ "$completed_today" =~ ^[0-9]+$ ]] || completed_today=0
  [[ "$failed_today" =~ ^[0-9]+$ ]] || failed_today=0

  # Log metrics
  log_event "scheduler_metrics" \
    "active_sessions" "$active_sessions" \
    "queued_issues" "$queued_issues" \
    "completed_today" "$completed_today" \
    "failed_today" "$failed_today"
}
207
+
208
+ # Error tracking for scheduler observability
209
+ ERROR_LOG="${STATE_ROOT}/scheduler-errors.jsonl"
210
+ error_count=0
211
+
212
# Record a scheduler error.
#
# Increments the global error counter, appends one JSON line to ERROR_LOG,
# mirrors the error as a "scheduler_error" event, and once more than 10 errors
# have been recorded also logs "error_threshold_exceeded" and prints a warning
# on stderr.
#
# Globals:   error_count (read/written), ERROR_LOG (read; appended to)
# Arguments: $1 - error type
#            $2 - error message
# Outputs:   JSON line appended to ERROR_LOG; events via log_event;
#            warning on stderr past the threshold
track_error() {
  local error_type=${1-}
  local error_msg=${2-}
  local timestamp
  timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"

  # Plain assignment instead of ((error_count++)): the arithmetic command
  # returns status 1 when the pre-increment value is 0, which terminates a
  # script running under `set -e` on the very first error.
  error_count=$((${error_count:-0} + 1))

  # Escape both fields for use inside JSON string literals (backslash first).
  local field
  local -a escaped=()
  for field in "$error_type" "$error_msg"; do
    field=${field//\\/\\\\}
    field=${field//\"/\\\"}
    field=${field//$'\n'/\\n}
    field=${field//$'\r'/\\r}
    field=${field//$'\t'/\\t}
    escaped+=("$field")
  done

  # Log to JSONL
  printf '{"timestamp": "%s", "type": "%s", "message": "%s", "pid": %s}\n' \
    "$timestamp" "${escaped[0]}" "${escaped[1]}" "$$" >>"${ERROR_LOG}"

  # Also log as event (raw values; log_event owns its own formatting)
  log_event "scheduler_error" "type" "${error_type}" "message" "${error_msg}"

  # Alert if too many errors
  if [[ $error_count -gt 10 ]]; then
    log_event "error_threshold_exceeded" "count" "$error_count"
    echo "Warning: High error count detected ($error_count errors)" >&2
  fi
}
232
+
78
233
  mkdir -p "${AGENT_ROOT}" "${RUNS_ROOT}" "${STATE_ROOT}" "${HISTORY_ROOT}" "${WORKTREE_ROOT}" "${MEMORY_DIR}"
79
234
 
235
+ cleanup_stale_locks 1800 # Clean locks older than 30 minutes
236
+ collect_metrics
237
+ log_event "heartbeat_start" "repo_slug" "${REPO_SLUG}"
238
+
80
239
  if [[ -z "${python_bin}" || ! -x "${python_bin}" ]]; then
81
240
  echo "unable to resolve a runnable python interpreter for heartbeat-safe-auto.sh" >&2
82
241
  exit 1
@@ -605,6 +764,7 @@ write_shared_loop_status "running" ""
605
764
  --heavy-deferred-message "E2E-heavy issues remain queued until the single e2e slot is free."; then
606
765
  write_shared_loop_status "idle" "0"
607
766
  printf '[%s] shared heartbeat loop end status=0\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
767
+ log_event "heartbeat_complete" "status" "0"
608
768
  else
609
769
  loop_status=$?
610
770
  write_shared_loop_status "idle" "${loop_status}"
@@ -612,6 +772,7 @@ else
612
772
  printf 'HEARTBEAT_LOOP_TIMEOUT=yes\n'
613
773
  fi
614
774
  printf '[%s] shared heartbeat loop end status=%s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "${loop_status}"
775
+ log_event "heartbeat_complete" "status" "${loop_status}"
615
776
  exit "${loop_status}"
616
777
  fi
617
778
 
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5
+ # shellcheck source=/dev/null
6
+ source "${SCRIPT_DIR}/flow-config-lib.sh"
7
+
8
+ FLOW_SKILL_DIR="$(resolve_flow_skill_dir "${BASH_SOURCE[0]}")"
9
+ PROFILE_REGISTRY_ROOT="$(resolve_flow_profile_registry_root)"
10
+ CONFIG_YAML="$(resolve_flow_config_yaml "${BASH_SOURCE[0]}")"
11
+ # Do NOT export execution env for the current profile here — render-flow-config
12
+ # is meant to render the SELECTED profile's config (via CONFIG_YAML), and exporting
13
+ # the ambient profile's vars into the shell causes config_or_env to silently override
14
+ # per-profile YAML with defaults from the current resident worker's own config.
15
+ # Also, ambient env vars from the shell are cleared below so they don't leak into
16
+ # profile-smoke or other callers.
17
+ for _clean in ACP_CODING_WORKER ACP_OPENCLAW_MODEL ACP_CLAUDE_MODEL \
18
+ ACP_CLAUDE_TIMEOUT_SECONDS ACP_CLAUDE_MAX_ATTEMPTS ACP_CLAUDE_RETRY_BACKOFF_SECONDS \
19
+ ACP_OPENCLAW_THINKING ACP_OPENCLAW_TIMEOUT_SECONDS \
20
+ CODING_WORKER; do
21
+ unset "${_clean}" 2>/dev/null || true
22
+ done
23
+ unset _clean
24
+ AVAILABLE_PROFILES="$(flow_list_profile_ids "${FLOW_SKILL_DIR}" | paste -sd, -)"
25
+ INSTALLED_PROFILES="$(flow_list_installed_profile_ids | paste -sd, -)"
26
+ PROFILE_ID="$(flow_resolve_adapter_id "${CONFIG_YAML}")"
27
+ PROFILE_SELECTION_MODE="$(flow_profile_selection_mode "${FLOW_SKILL_DIR}")"
28
+ PROFILE_SELECTION_HINT="$(flow_profile_selection_hint "${FLOW_SKILL_DIR}")"
29
+ PROFILE_NOTES="$(flow_resolve_profile_notes_file "${CONFIG_YAML}")"
30
+
31
# Resolve one effective setting and print it followed by a newline.
#
# Lookup order:
#   1. the first non-empty environment variable among the space-separated
#      names in $1;
#   2. otherwise, when a config key is given and CONFIG_YAML is an existing
#      file, whatever flow_config_get prints for that key;
#   3. otherwise an empty line.
#
# Globals:   CONFIG_YAML (read)
# Arguments: $1 - space-separated environment variable names (required)
#            $2 - config key to look up in CONFIG_YAML (optional)
# Outputs:   the resolved value on stdout
config_or_env() {
  local candidates="${1:?env names required}"
  local yaml_key="${2:-}"
  local candidate=""
  local resolved=""

  # Word splitting of the name list is intentional here.
  for candidate in ${candidates}; do
    resolved="${!candidate:-}"
    [[ -z "${resolved}" ]] && continue
    printf '%s\n' "${resolved}"
    return 0
  done

  # No environment override: without a usable key/file there is nothing to
  # look up, so emit the empty value.
  if [[ -z "${yaml_key}" || ! -f "${CONFIG_YAML}" ]]; then
    printf '\n'
    return
  fi

  flow_config_get "${CONFIG_YAML}" "${yaml_key}"
  return 0
}
52
+
53
+ printf 'FLOW_SKILL_DIR=%s\n' "${FLOW_SKILL_DIR}"
54
+ printf 'PROFILE_REGISTRY_ROOT=%s\n' "${PROFILE_REGISTRY_ROOT}"
55
+ printf 'CONFIG_YAML=%s\n' "${CONFIG_YAML}"
56
+ printf 'PROFILE_ID=%s\n' "${PROFILE_ID}"
57
+ printf 'PROFILE_SELECTION_MODE=%s\n' "${PROFILE_SELECTION_MODE}"
58
+ if [[ -n "${PROFILE_SELECTION_HINT}" ]]; then
59
+ printf 'PROFILE_SELECTION_HINT=%s\n' "${PROFILE_SELECTION_HINT}"
60
+ fi
61
+ printf 'AVAILABLE_PROFILES=%s\n' "${AVAILABLE_PROFILES}"
62
+ printf 'INSTALLED_PROFILES=%s\n' "${INSTALLED_PROFILES}"
63
+ printf 'PROFILE_NOTES=%s\n' "${PROFILE_NOTES}"
64
+ if [[ -f "${PROFILE_NOTES}" ]]; then
65
+ printf 'PROFILE_NOTES_EXISTS=yes\n'
66
+ else
67
+ printf 'PROFILE_NOTES_EXISTS=no\n'
68
+ fi
69
+ printf 'EFFECTIVE_REPO_ROOT=%s\n' "$(config_or_env 'ACP_REPO_ROOT' repo.root)"
70
+ printf 'EFFECTIVE_AGENT_REPO_ROOT=%s\n' "$(config_or_env 'ACP_AGENT_REPO_ROOT' runtime.agent_repo_root)"
71
+ printf 'EFFECTIVE_WORKTREE_ROOT=%s\n' "$(config_or_env 'ACP_WORKTREE_ROOT' runtime.worktree_root)"
72
+ printf 'EFFECTIVE_RUNS_ROOT=%s\n' "$(config_or_env 'ACP_RUNS_ROOT' runtime.runs_root)"
73
+ printf 'EFFECTIVE_STATE_ROOT=%s\n' "$(config_or_env 'ACP_STATE_ROOT' runtime.state_root)"
74
+ printf 'EFFECTIVE_RETAINED_REPO_ROOT=%s\n' "$(config_or_env 'ACP_RETAINED_REPO_ROOT' runtime.retained_repo_root)"
75
+ printf 'EFFECTIVE_VSCODE_WORKSPACE_FILE=%s\n' "$(config_or_env 'ACP_VSCODE_WORKSPACE_FILE' runtime.vscode_workspace_file)"
76
+ printf 'EFFECTIVE_CODING_WORKER=%s\n' "$(config_or_env 'ACP_CODING_WORKER' execution.coding_worker)"
77
+ printf 'EFFECTIVE_PROVIDER_QUOTA_COOLDOWNS=%s\n' "$(config_or_env 'ACP_PROVIDER_QUOTA_COOLDOWNS' execution.provider_quota.cooldowns)"
78
+ printf 'EFFECTIVE_PROVIDER_POOL_ORDER=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_ORDER' execution.provider_pool_order)"
79
+ printf 'EFFECTIVE_PROVIDER_POOL_NAME=%s\n' "$(config_or_env 'ACP_ACTIVE_PROVIDER_POOL_NAME')"
80
+ printf 'EFFECTIVE_PROVIDER_POOL_BACKEND=%s\n' "$(config_or_env 'ACP_ACTIVE_PROVIDER_BACKEND')"
81
+ printf 'EFFECTIVE_PROVIDER_POOL_MODEL=%s\n' "$(config_or_env 'ACP_ACTIVE_PROVIDER_MODEL')"
82
+ printf 'EFFECTIVE_PROVIDER_POOL_KEY=%s\n' "$(config_or_env 'ACP_ACTIVE_PROVIDER_KEY')"
83
+ printf 'EFFECTIVE_PROVIDER_POOLS_EXHAUSTED=%s\n' "$(config_or_env 'ACP_PROVIDER_POOLS_EXHAUSTED')"
84
+ printf 'EFFECTIVE_PROVIDER_POOL_SELECTION_REASON=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_SELECTION_REASON')"
85
+ printf 'EFFECTIVE_PROVIDER_POOL_NEXT_ATTEMPT_EPOCH=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_NEXT_ATTEMPT_EPOCH')"
86
+ printf 'EFFECTIVE_PROVIDER_POOL_NEXT_ATTEMPT_AT=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_NEXT_ATTEMPT_AT')"
87
+ printf 'EFFECTIVE_PROVIDER_POOL_LAST_REASON=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_LAST_REASON')"
88
+ printf 'EFFECTIVE_CODEX_PROFILE_SAFE=%s\n' "$(config_or_env 'ACP_CODEX_PROFILE_SAFE' execution.safe_profile)"
89
+ printf 'EFFECTIVE_CODEX_PROFILE_BYPASS=%s\n' "$(config_or_env 'ACP_CODEX_PROFILE_BYPASS' execution.bypass_profile)"
90
+ printf 'EFFECTIVE_CLAUDE_MODEL=%s\n' "$(config_or_env 'ACP_CLAUDE_MODEL' execution.claude.model)"
91
+ printf 'EFFECTIVE_CLAUDE_PERMISSION_MODE=%s\n' "$(config_or_env 'ACP_CLAUDE_PERMISSION_MODE' execution.claude.permission_mode)"
92
+ printf 'EFFECTIVE_CLAUDE_EFFORT=%s\n' "$(config_or_env 'ACP_CLAUDE_EFFORT' execution.claude.effort)"
93
+ printf 'EFFECTIVE_CLAUDE_TIMEOUT_SECONDS=%s\n' "$(config_or_env 'ACP_CLAUDE_TIMEOUT_SECONDS' execution.claude.timeout_seconds)"
94
+ printf 'EFFECTIVE_CLAUDE_MAX_ATTEMPTS=%s\n' "$(config_or_env 'ACP_CLAUDE_MAX_ATTEMPTS' execution.claude.max_attempts)"
95
+ printf 'EFFECTIVE_CLAUDE_RETRY_BACKOFF_SECONDS=%s\n' "$(config_or_env 'ACP_CLAUDE_RETRY_BACKOFF_SECONDS' execution.claude.retry_backoff_seconds)"
96
+ printf 'EFFECTIVE_OPENCLAW_MODEL=%s\n' "$(config_or_env 'ACP_OPENCLAW_MODEL' execution.openclaw.model)"
97
+ printf 'EFFECTIVE_OPENCLAW_THINKING=%s\n' "$(config_or_env 'ACP_OPENCLAW_THINKING' execution.openclaw.thinking)"
98
+ printf 'EFFECTIVE_OPENCLAW_TIMEOUT_SECONDS=%s\n' "$(config_or_env 'ACP_OPENCLAW_TIMEOUT_SECONDS' execution.openclaw.timeout_seconds)"
@@ -78,6 +78,29 @@ sync_skill_copies() {
78
78
  if [[ -n "${TARGET_FLOW_COMPAT_ALIAS}" ]]; then
79
79
  sync_tree_into_target "${FLOW_SKILL_SOURCE}" "${TARGET_FLOW_COMPAT_ALIAS}"
80
80
  fi
81
+
82
+ # Explicitly ensure profile-smoke.sh is synced to runtime home
83
+ local profile_smoke_source="${FLOW_SKILL_SOURCE}/tools/bin/profile-smoke.sh"
84
+ local profile_smoke_target="${FLOW_SKILL_TARGET}/tools/bin/profile-smoke.sh"
85
+ if [[ -f "${profile_smoke_source}" ]]; then
86
+ mkdir -p "$(dirname "${profile_smoke_target}")"
87
+ cp "${profile_smoke_source}" "${profile_smoke_target}"
88
+ chmod +x "${profile_smoke_target}"
89
+ fi
90
+
91
+ # Ensure test scripts are synced for regression coverage
92
+ for test_script in \
93
+ "${FLOW_SKILL_SOURCE}/tools/bin/kick-scheduler-wrapper.sh" \
94
+ "${FLOW_SKILL_SOURCE}/tools/tests/test-kick-scheduler-wrapper.sh" \
95
+ "${FLOW_SKILL_SOURCE}/tools/tests/test-runtime-operator-smoke.sh" \
96
+ "${FLOW_SKILL_SOURCE}/tools/tests/test-package-tarball-surface.sh"; do
97
+ if [[ -f "${test_script}" ]]; then
98
+ target_script="${FLOW_SKILL_TARGET}${test_script#${FLOW_SKILL_SOURCE}}"
99
+ mkdir -p "$(dirname "${target_script}")"
100
+ cp "${test_script}" "${target_script}"
101
+ chmod +x "${target_script}"
102
+ fi
103
+ done
81
104
  }
82
105
 
83
106
  refresh_legacy_profile_templates() {