npm - agent-control-plane - Versions diffs - 0.7.1 → 0.8.0 - Mend

agent-control-plane 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/README.md +305 -7
package/hooks/pr-reconcile-hooks.sh +12 -1
package/package.json +9 -7
package/tools/bin/flow-runtime-doctor.sh +67 -0
package/tools/bin/heartbeat-safe-auto.sh +161 -0
package/tools/bin/render-flow-config.sh +98 -0
package/tools/bin/sync-shared-agent-home.sh +23 -0
package/tools/dashboard/__pycache__/server.cpython-311.pyc +0 -0
package/tools/dashboard/app-v2.js +1120 -0
package/tools/dashboard/app.js +129 -38
package/tools/dashboard/index-inline.html +1533 -0
package/tools/dashboard/index-v2.html +45 -0
package/tools/dashboard/server.py +64 -15
package/tools/dashboard/styles.css +595 -521
package/tools/bin/profile-activate.sh +0 -109
package/tools/bin/profile-adopt.sh +0 -225
package/tools/bin/profile-smoke.sh +0 -461
package/tools/bin/test-smoke.sh +0 -119

package/tools/bin/heartbeat-safe-auto.sh CHANGED Viewed

@@ -75,8 +75,167 @@ QUOTA_LOCK_DIR="${STATE_ROOT}/quota-preflight.lock"
 QUOTA_PID_FILE="${QUOTA_LOCK_DIR}/pid"
 python_bin="$(flow_resolve_python_bin || true)"
+# Stale lock detection and cleanup
+cleanup_stale_locks() {
+  local lock_dir pid_file pid max_age_seconds=${1:-1800}  # default 30 minutes
+  local lock_dirs=(
+    "${STATE_ROOT}/heartbeat-loop.lock"
+    "${STATE_ROOT}/quota-preflight.lock"
+  )
+  for lock_dir in "${lock_dirs[@]}"; do
+    pid_file="${lock_dir}/pid"
+    if [[ -f "$pid_file" ]]; then
+      pid=$(cat "$pid_file" 2>/dev/null | tr -d '[:space:]')
+      if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
+        # Process is still running, check if parent is init (orphan)
+        local ppid
+        ppid=$(ps -p "$pid" -o ppid= 2>/dev/null | tr -d '[:space:]')
+        if [[ "$ppid" == "1" ]]; then
+          log_event "stale_lock_detected" "type" "orphan" "pid" "$pid" "lock_dir" "$lock_dir"
+          echo "Warning: Removing orphan lock (PID $pid, lock: $lock_dir)"
+          rm -rf "$lock_dir"
+        fi
+      else
+        # Process not running, check lock age
+        local lock_age
+        lock_age=$(($(date +%s) - $(stat -f %m "$pid_file" 2>/dev/null || stat -c %Y "$pid_file" 2>/dev/null || echo "0")))
+        if [[ $lock_age -gt $max_age_seconds ]]; then
+          log_event "stale_lock_detected" "type" "timeout" "pid" "$pid" "age_seconds" "$lock_age" "lock_dir" "$lock_dir"
+          echo "Warning: Removing stale lock (PID $pid, age: ${lock_age}s, lock: $lock_dir)"
+          rm -rf "$lock_dir"
+        fi
+      fi
+    fi
+  done
+}
+# Structured logging for scheduler observability
+LOG_FILE="${STATE_ROOT}/scheduler-events.jsonl"
+mkdir -p "$(dirname "${LOG_FILE}")"
+log_event() {
+  local event_type="$1"
+  shift
+  local timestamp
+  timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+  local extra_fields=""
+  while [[ $# -gt 0 ]]; do
+    extra_fields="${extra_fields}, \"$1\": \"$2\""
+    shift 2
+  done
+  echo "{\"timestamp\": \"${timestamp}\", \"event\": \"${event_type}\", \"pid\": ${$}${extra_fields}}" >> "${LOG_FILE}"
+}
+# Health check: monitor system resources
+check_system_resources() {
+  local cpu_usage mem_usage disk_usage
+  local warn=0
+  # CPU usage (1-min load average / number of cores)
+  if command -v nproc >/dev/null 2>&1 && command -v awk >/dev/null 2>&1; then
+    local load_1min disk_avail disk_total
+    load_1min=$(cat /proc/loadavg 2>/dev/null | awk '{print $1}' || echo "0")
+    local cores
+    cores=$(nproc 2>/dev/null || echo "1")
+    cpu_usage=$(echo "$load_1min $cores" | awk '{printf "%.0f", ($1/$2)*100}' 2>/dev/null || echo "0")
+  fi
+  # Memory usage
+  if command -v free >/dev/null 2>&1; then
+    mem_usage=$(free | awk '/Mem:/ {printf "%.0f", ($3/$2)*100}' 2>/dev/null || echo "0")
+  elif [[ -f /proc/meminfo ]]; then
+    local mem_total mem_available
+    mem_total=$(grep MemTotal /proc/meminfo 2>/dev/null | awk '{print $2}' || echo "1")
+    mem_available=$(grep MemAvailable /proc/meminfo 2>/dev/null | awk '{print $2}' || echo "0")
+    mem_usage=$(echo "$mem_total $mem_available" | awk '{printf "%.0f", (($1-$2)/$1)*100}' 2>/dev/null || echo "0")
+  fi
+  # Disk usage for STATE_ROOT
+  disk_usage=$(df "${STATE_ROOT}" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}' || echo "0")
+  # Log resource status
+  log_event "system_resources" "cpu_pct" "${cpu_usage:-0}" "mem_pct" "${mem_usage:-0}" "disk_pct" "${disk_usage:-0}"
+  # Warnings
+  if [[ "${cpu_usage:-0}" -gt 80 ]]; then
+    log_event "resource_warning" "type" "cpu" "value" "${cpu_usage}"
+    warn=1
+  fi
+  if [[ "${mem_usage:-0}" -gt 90 ]]; then
+    log_event "resource_warning" "type" "memory" "value" "${mem_usage}"
+    warn=1
+  fi
+  if [[ "${disk_usage:-0}" -gt 90 ]]; then
+    log_event "resource_warning" "type" "disk" "value" "${disk_usage}"
+    warn=1
+  fi
+  return $warn
+}
+# Collect scheduler metrics for observability
+collect_metrics() {
+  local active_sessions=0
+  local queued_issues=0
+  local completed_today=0
+  local failed_today=0
+  # Count active tmux sessions for this repo
+  if command -v tmux >/dev/null 2>&1; then
+    active_sessions=$(tmux ls 2>/dev/null | grep -c "agent-" || echo "0")
+  fi
+  # Count queued issues (issues with agent-keep-open label but no active session)
+  if command -v gh >/dev/null 2>&1 && [[ -n "${REPO_SLUG:-}" ]]; then
+    queued_issues=$(gh issue list --repo "${REPO_SLUG}" --label "agent-keep-open" --state open --json number 2>/dev/null | grep -c '"number"' || echo "0")
+  fi
+  # Count completed/failed sessions from history (last 24h)
+  if [[ -d "${HISTORY_ROOT}" ]]; then
+    completed_today=$(find "${HISTORY_ROOT}" -name "*.json" -mtime 0 2>/dev/null | xargs grep -l '"status": "completed"' 2>/dev/null | wc -l || echo "0")
+    failed_today=$(find "${HISTORY_ROOT}" -name "*.json" -mtime 0 2>/dev/null | xargs grep -l '"status": "failed"' 2>/dev/null | wc -l || echo "0")
+  fi
+  # Log metrics
+  log_event "scheduler_metrics" \
+    "active_sessions" "$active_sessions" \
+    "queued_issues" "$queued_issues" \
+    "completed_today" "$completed_today" \
+    "failed_today" "$failed_today"
+}
+# Error tracking for scheduler observability
+ERROR_LOG="${STATE_ROOT}/scheduler-errors.jsonl"
+error_count=0
+# Escape a string for use inside a JSON string literal (backslash, double
+# quote, newline, carriage return, tab). Prints the escaped text to stdout.
+_track_error_json_escape() {
+  local text="${1-}"
+  local nl=$'\n' cr=$'\r' tab=$'\t'
+  text="${text//\\/\\\\}"
+  text="${text//\"/\\\"}"
+  text="${text//$nl/\\n}"
+  text="${text//$cr/\\r}"
+  text="${text//$tab/\\t}"
+  printf '%s' "$text"
+}
+# Record a scheduler error.
+# Arguments: $1 - error type, $2 - error message.
+# Globals:   ERROR_LOG (read; one JSON line is appended to it),
+#            error_count (incremented).
+# Outputs:   also emits a "scheduler_error" event through log_event and, once
+#            more than 10 errors were counted, an "error_threshold_exceeded"
+#            event plus a warning on stderr.
+track_error() {
+  local error_type="$1"
+  local error_msg="$2"
+  local timestamp safe_type safe_msg
+  timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+  # Plain assignment instead of ((error_count++)): the arithmetic command
+  # returns status 1 when the expression evaluates to 0, which would abort an
+  # errexit script on the very first tracked error.
+  error_count=$(( ${error_count:-0} + 1 ))
+  # Log to JSONL
+  safe_type="$(_track_error_json_escape "${error_type}")"
+  safe_msg="$(_track_error_json_escape "${error_msg}")"
+  printf '%s\n' "{\"timestamp\": \"${timestamp}\", \"type\": \"${safe_type}\", \"message\": \"${safe_msg}\", \"pid\": ${$}}" >> "${ERROR_LOG}"
+  # Also log as event (raw values are passed on unchanged)
+  log_event "scheduler_error" "type" "${error_type}" "message" "${error_msg}"
+  # Alert if too many errors
+  if [[ $error_count -gt 10 ]]; then
+    log_event "error_threshold_exceeded" "count" "$error_count"
+    echo "Warning: High error count detected ($error_count errors)" >&2
+  fi
+}
 mkdir -p "${AGENT_ROOT}" "${RUNS_ROOT}" "${STATE_ROOT}" "${HISTORY_ROOT}" "${WORKTREE_ROOT}" "${MEMORY_DIR}"
+cleanup_stale_locks 1800  # Clean locks older than 30 minutes
+collect_metrics
+log_event "heartbeat_start" "repo_slug" "${REPO_SLUG}"
 if [[ -z "${python_bin}" || ! -x "${python_bin}" ]]; then
   echo "unable to resolve a runnable python interpreter for heartbeat-safe-auto.sh" >&2
   exit 1
@@ -605,6 +764,7 @@ write_shared_loop_status "running" ""
       --heavy-deferred-message "E2E-heavy issues remain queued until the single e2e slot is free."; then
   write_shared_loop_status "idle" "0"
   printf '[%s] shared heartbeat loop end status=0\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+  log_event "heartbeat_complete" "status" "0"
 else
   loop_status=$?
   write_shared_loop_status "idle" "${loop_status}"
@@ -612,6 +772,7 @@ else
     printf 'HEARTBEAT_LOOP_TIMEOUT=yes\n'
   fi
   printf '[%s] shared heartbeat loop end status=%s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "${loop_status}"
+  log_event "heartbeat_complete" "status" "${loop_status}"
   exit "${loop_status}"
 fi

package/tools/bin/render-flow-config.sh ADDED Viewed

@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# render-flow-config.sh: print the resolved flow configuration as KEY=VALUE
+# lines on stdout (profile metadata first, then the EFFECTIVE_* settings).
+# All resolve_* / flow_* helpers used here come from flow-config-lib.sh, which
+# is sourced from this script's own directory.
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=/dev/null
+source "${SCRIPT_DIR}/flow-config-lib.sh"
+FLOW_SKILL_DIR="$(resolve_flow_skill_dir "${BASH_SOURCE[0]}")"
+PROFILE_REGISTRY_ROOT="$(resolve_flow_profile_registry_root)"
+CONFIG_YAML="$(resolve_flow_config_yaml "${BASH_SOURCE[0]}")"
+# Do NOT export execution env for the current profile here — render-flow-config
+# is meant to render the SELECTED profile's config (via CONFIG_YAML), and exporting
+# the ambient profile's vars into the shell causes config_or_env to silently override
+# per-profile YAML with defaults from the current resident worker's own config.
+# Also, ambient env vars from the shell are cleared below so they don't leak into
+# profile-smoke or other callers.
+for _clean in ACP_CODING_WORKER ACP_OPENCLAW_MODEL ACP_CLAUDE_MODEL \
+  ACP_CLAUDE_TIMEOUT_SECONDS ACP_CLAUDE_MAX_ATTEMPTS ACP_CLAUDE_RETRY_BACKOFF_SECONDS \
+  ACP_OPENCLAW_THINKING ACP_OPENCLAW_TIMEOUT_SECONDS \
+  CODING_WORKER; do
+  unset "${_clean}" 2>/dev/null || true
+done
+unset _clean
+# Profile metadata; the two profile lists are joined into comma-separated
+# strings by paste.
+AVAILABLE_PROFILES="$(flow_list_profile_ids "${FLOW_SKILL_DIR}" | paste -sd, -)"
+INSTALLED_PROFILES="$(flow_list_installed_profile_ids | paste -sd, -)"
+PROFILE_ID="$(flow_resolve_adapter_id "${CONFIG_YAML}")"
+PROFILE_SELECTION_MODE="$(flow_profile_selection_mode "${FLOW_SKILL_DIR}")"
+PROFILE_SELECTION_HINT="$(flow_profile_selection_hint "${FLOW_SKILL_DIR}")"
+PROFILE_NOTES="$(flow_resolve_profile_notes_file "${CONFIG_YAML}")"
+config_or_env() {
+  local env_names="${1:?env names required}"
+  local config_key="${2:-}"
+  local env_name=""
+  local value=""
+  for env_name in ${env_names}; do
+    value="${!env_name:-}"
+    if [[ -n "${value}" ]]; then
+      printf '%s\n' "${value}"
+      return 0
+    fi
+  done
+  if [[ -n "${config_key}" && -f "${CONFIG_YAML}" ]]; then
+    flow_config_get "${CONFIG_YAML}" "${config_key}"
+    return 0
+  fi
+  printf '\n'
+}
+printf 'FLOW_SKILL_DIR=%s\n' "${FLOW_SKILL_DIR}"
+printf 'PROFILE_REGISTRY_ROOT=%s\n' "${PROFILE_REGISTRY_ROOT}"
+printf 'CONFIG_YAML=%s\n' "${CONFIG_YAML}"
+printf 'PROFILE_ID=%s\n' "${PROFILE_ID}"
+printf 'PROFILE_SELECTION_MODE=%s\n' "${PROFILE_SELECTION_MODE}"
+if [[ -n "${PROFILE_SELECTION_HINT}" ]]; then
+  printf 'PROFILE_SELECTION_HINT=%s\n' "${PROFILE_SELECTION_HINT}"
+fi
+printf 'AVAILABLE_PROFILES=%s\n' "${AVAILABLE_PROFILES}"
+printf 'INSTALLED_PROFILES=%s\n' "${INSTALLED_PROFILES}"
+printf 'PROFILE_NOTES=%s\n' "${PROFILE_NOTES}"
+if [[ -f "${PROFILE_NOTES}" ]]; then
+  printf 'PROFILE_NOTES_EXISTS=yes\n'
+else
+  printf 'PROFILE_NOTES_EXISTS=no\n'
+fi
+printf 'EFFECTIVE_REPO_ROOT=%s\n' "$(config_or_env 'ACP_REPO_ROOT' repo.root)"
+printf 'EFFECTIVE_AGENT_REPO_ROOT=%s\n' "$(config_or_env 'ACP_AGENT_REPO_ROOT' runtime.agent_repo_root)"
+printf 'EFFECTIVE_WORKTREE_ROOT=%s\n' "$(config_or_env 'ACP_WORKTREE_ROOT' runtime.worktree_root)"
+printf 'EFFECTIVE_RUNS_ROOT=%s\n' "$(config_or_env 'ACP_RUNS_ROOT' runtime.runs_root)"
+printf 'EFFECTIVE_STATE_ROOT=%s\n' "$(config_or_env 'ACP_STATE_ROOT' runtime.state_root)"
+printf 'EFFECTIVE_RETAINED_REPO_ROOT=%s\n' "$(config_or_env 'ACP_RETAINED_REPO_ROOT' runtime.retained_repo_root)"
+printf 'EFFECTIVE_VSCODE_WORKSPACE_FILE=%s\n' "$(config_or_env 'ACP_VSCODE_WORKSPACE_FILE' runtime.vscode_workspace_file)"
+printf 'EFFECTIVE_CODING_WORKER=%s\n' "$(config_or_env 'ACP_CODING_WORKER' execution.coding_worker)"
+printf 'EFFECTIVE_PROVIDER_QUOTA_COOLDOWNS=%s\n' "$(config_or_env 'ACP_PROVIDER_QUOTA_COOLDOWNS' execution.provider_quota.cooldowns)"
+printf 'EFFECTIVE_PROVIDER_POOL_ORDER=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_ORDER' execution.provider_pool_order)"
+printf 'EFFECTIVE_PROVIDER_POOL_NAME=%s\n' "$(config_or_env 'ACP_ACTIVE_PROVIDER_POOL_NAME')"
+printf 'EFFECTIVE_PROVIDER_POOL_BACKEND=%s\n' "$(config_or_env 'ACP_ACTIVE_PROVIDER_BACKEND')"
+printf 'EFFECTIVE_PROVIDER_POOL_MODEL=%s\n' "$(config_or_env 'ACP_ACTIVE_PROVIDER_MODEL')"
+printf 'EFFECTIVE_PROVIDER_POOL_KEY=%s\n' "$(config_or_env 'ACP_ACTIVE_PROVIDER_KEY')"
+printf 'EFFECTIVE_PROVIDER_POOLS_EXHAUSTED=%s\n' "$(config_or_env 'ACP_PROVIDER_POOLS_EXHAUSTED')"
+printf 'EFFECTIVE_PROVIDER_POOL_SELECTION_REASON=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_SELECTION_REASON')"
+printf 'EFFECTIVE_PROVIDER_POOL_NEXT_ATTEMPT_EPOCH=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_NEXT_ATTEMPT_EPOCH')"
+printf 'EFFECTIVE_PROVIDER_POOL_NEXT_ATTEMPT_AT=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_NEXT_ATTEMPT_AT')"
+printf 'EFFECTIVE_PROVIDER_POOL_LAST_REASON=%s\n' "$(config_or_env 'ACP_PROVIDER_POOL_LAST_REASON')"
+printf 'EFFECTIVE_CODEX_PROFILE_SAFE=%s\n' "$(config_or_env 'ACP_CODEX_PROFILE_SAFE' execution.safe_profile)"
+printf 'EFFECTIVE_CODEX_PROFILE_BYPASS=%s\n' "$(config_or_env 'ACP_CODEX_PROFILE_BYPASS' execution.bypass_profile)"
+printf 'EFFECTIVE_CLAUDE_MODEL=%s\n' "$(config_or_env 'ACP_CLAUDE_MODEL' execution.claude.model)"
+printf 'EFFECTIVE_CLAUDE_PERMISSION_MODE=%s\n' "$(config_or_env 'ACP_CLAUDE_PERMISSION_MODE' execution.claude.permission_mode)"
+printf 'EFFECTIVE_CLAUDE_EFFORT=%s\n' "$(config_or_env 'ACP_CLAUDE_EFFORT' execution.claude.effort)"
+printf 'EFFECTIVE_CLAUDE_TIMEOUT_SECONDS=%s\n' "$(config_or_env 'ACP_CLAUDE_TIMEOUT_SECONDS' execution.claude.timeout_seconds)"
+printf 'EFFECTIVE_CLAUDE_MAX_ATTEMPTS=%s\n' "$(config_or_env 'ACP_CLAUDE_MAX_ATTEMPTS' execution.claude.max_attempts)"
+printf 'EFFECTIVE_CLAUDE_RETRY_BACKOFF_SECONDS=%s\n' "$(config_or_env 'ACP_CLAUDE_RETRY_BACKOFF_SECONDS' execution.claude.retry_backoff_seconds)"
+printf 'EFFECTIVE_OPENCLAW_MODEL=%s\n' "$(config_or_env 'ACP_OPENCLAW_MODEL' execution.openclaw.model)"
+printf 'EFFECTIVE_OPENCLAW_THINKING=%s\n' "$(config_or_env 'ACP_OPENCLAW_THINKING' execution.openclaw.thinking)"
+printf 'EFFECTIVE_OPENCLAW_TIMEOUT_SECONDS=%s\n' "$(config_or_env 'ACP_OPENCLAW_TIMEOUT_SECONDS' execution.openclaw.timeout_seconds)"

package/tools/bin/sync-shared-agent-home.sh CHANGED Viewed

@@ -78,6 +78,29 @@ sync_skill_copies() {
   if [[ -n "${TARGET_FLOW_COMPAT_ALIAS}" ]]; then
     sync_tree_into_target "${FLOW_SKILL_SOURCE}" "${TARGET_FLOW_COMPAT_ALIAS}"
   fi
+  # Explicitly ensure profile-smoke.sh is synced to runtime home
+  local profile_smoke_source="${FLOW_SKILL_SOURCE}/tools/bin/profile-smoke.sh"
+  local profile_smoke_target="${FLOW_SKILL_TARGET}/tools/bin/profile-smoke.sh"
+  if [[ -f "${profile_smoke_source}" ]]; then
+    mkdir -p "$(dirname "${profile_smoke_target}")"
+    cp "${profile_smoke_source}" "${profile_smoke_target}"
+    chmod +x "${profile_smoke_target}"
+  fi
+  # Ensure test scripts are synced for regression coverage
+  for test_script in \
+    "${FLOW_SKILL_SOURCE}/tools/bin/kick-scheduler-wrapper.sh" \
+    "${FLOW_SKILL_SOURCE}/tools/tests/test-kick-scheduler-wrapper.sh" \
+    "${FLOW_SKILL_SOURCE}/tools/tests/test-runtime-operator-smoke.sh" \
+    "${FLOW_SKILL_SOURCE}/tools/tests/test-package-tarball-surface.sh"; do
+    if [[ -f "${test_script}" ]]; then
+      target_script="${FLOW_SKILL_TARGET}${test_script#${FLOW_SKILL_SOURCE}}"
+      mkdir -p "$(dirname "${target_script}")"
+      cp "${test_script}" "${target_script}"
+      chmod +x "${target_script}"
+    fi
+  done
 }
 refresh_legacy_profile_templates() {

package/tools/dashboard/__pycache__/server.cpython-311.pyc ADDED Viewed

Binary file