@adaptic/maestro 1.1.3 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED

@@ -296,14 +296,63 @@ configure_app_launches() {
     # The poller uses IMAP directly, not the browser
     log "Safari: not needed as Login Item (Gmail uses IMAP polling)"

-    # Ensure required directories exist
-
-
-
+    # Ensure required directories exist (as the real user, not root)
+    local real_user="${SUDO_USER:-$CURRENT_USER}"
+    local dirs=("$AGENT_DIR/logs/huddle" "$AGENT_DIR/state/huddle" "$AGENT_DIR/logs/watchdog")
+    for d in "${dirs[@]}"; do
+        mkdir -p "$d"
+        chown "$real_user:staff" "$d" 2>/dev/null || true
+    done
+    ok "Log and state directories verified (owned by $real_user)"
 }

 # =============================================================================
-# 7.
+# 7. MEMORY WATCHDOG
+# =============================================================================
+
+configure_memory_watchdog() {
+    section "Memory Watchdog (OOM Protection)"
+
+    local WATCHDOG_SCRIPT="$AGENT_DIR/scripts/watchdog/memory-watchdog.sh"
+    local PLIST_SOURCE="$AGENT_DIR/scripts/watchdog/ai.maestro.memory-watchdog.plist"
+
+    if [ ! -f "$WATCHDOG_SCRIPT" ]; then
+        fail "Watchdog script not found at $WATCHDOG_SCRIPT"
+        return 0
+    fi
+
+    chmod +x "$WATCHDOG_SCRIPT"
+    ok "Watchdog script executable"
+
+    # Determine the real user's home for LaunchAgents
+    local real_user="${SUDO_USER:-$CURRENT_USER}"
+    local real_home
+    real_home=$(eval echo "~$real_user")
+    local PLIST_DEST="$real_home/Library/LaunchAgents/ai.maestro.memory-watchdog.plist"
+
+    # Install the plist (update paths for this agent directory)
+    sed "s|/Users/sophie/maestro|$AGENT_DIR|g" "$PLIST_SOURCE" > "$PLIST_DEST"
+    chown "$real_user:staff" "$PLIST_DEST" 2>/dev/null || true
+
+    # Load it (as the real user)
+    if [ "$(id -u)" -eq 0 ]; then
+        su "$real_user" -c "launchctl unload '$PLIST_DEST' 2>/dev/null; launchctl load '$PLIST_DEST'" 2>/dev/null || true
+    else
+        launchctl unload "$PLIST_DEST" 2>/dev/null || true
+        launchctl load "$PLIST_DEST" 2>/dev/null || true
+    fi
+
+    ok "Memory watchdog launchd agent installed (runs every 30s)"
+    log "Thresholds: warn=60% critical=75% emergency=85% max_procs=8"
+    log "Override via env: WATCHDOG_WARN_PERCENT, WATCHDOG_CRITICAL_PERCENT, etc."
+
+    # Create log directory with correct ownership
+    mkdir -p "$AGENT_DIR/logs/watchdog"
+    chown "$real_user:staff" "$AGENT_DIR/logs/watchdog" 2>/dev/null || true
+}
+
+# =============================================================================
+# 8. ORCHESTRATOR AND SUBSYSTEMS ON BOOT
 # =============================================================================

 configure_boot_services() {

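Note on the `real_user` resolution above: LaunchAgents live in a user's `~/Library/LaunchAgents` and must be owned and loaded by that user, so the function resolves the invoking user even when the setup script runs under sudo. A minimal standalone sketch of the pattern (using `$USER` where the script has `$CURRENT_USER`):

    # Under `sudo ./setup.sh`, $USER is root but $SUDO_USER is the invoker;
    # the agent must be installed into *that* user's LaunchAgents directory.
    real_user="${SUDO_USER:-$USER}"
    real_home=$(eval echo "~$real_user")   # tilde-expands another user's home
    echo "LaunchAgents dir: $real_home/Library/LaunchAgents"
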
@@ -331,7 +380,7 @@ configure_boot_services() {

     # Check which are loaded
     local loaded_count
-    loaded_count=$(launchctl list 2>/dev/null | grep -c "ai.adaptic" ||
+    loaded_count=$(launchctl list 2>/dev/null | grep -c "ai.adaptic" || true)
     if [ "$loaded_count" -gt 0 ]; then
         ok "$loaded_count agents currently loaded"
     else

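The `|| true` added above matters because `grep -c` exits non-zero when the count is 0; under `set -e` (which the fix strongly suggests this script uses) that would abort the function even though "0" is a valid answer. A minimal reproduction:

    set -e
    # Without `|| true`, this line terminates the script when nothing matches;
    # with it, grep's "0" on stdout is still captured and execution continues.
    loaded_count=$(launchctl list | grep -c "ai.adaptic" || true)
    echo "loaded: $loaded_count"
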
@@ -364,7 +413,7 @@ configure_boot_services() {
 }

 # =============================================================================
-#
+# 9. SYSTEM VERIFICATION
 # =============================================================================

 verify_all() {

@@ -440,6 +489,30 @@ verify_all() {
         issues=$((issues + 1))
     fi

+    # Check memory watchdog
+    if launchctl list 2>/dev/null | grep -q "ai.maestro.memory-watchdog"; then
+        ok "Memory watchdog: running"
+    elif [ -f "$HOME/Library/LaunchAgents/ai.maestro.memory-watchdog.plist" ]; then
+        warn "Memory watchdog: installed but not loaded"
+    else
+        fail "Memory watchdog: not installed"
+        issues=$((issues + 1))
+    fi
+
+    # Check heartbeat freshness
+    local heartbeat_file="$AGENT_DIR/state/heartbeat"
+    if [ -f "$heartbeat_file" ]; then
+        local hb_age
+        hb_age=$(( $(date +%s) - $(date -j -f "%Y-%m-%dT%H:%M:%SZ" "$(cat "$heartbeat_file")" +%s 2>/dev/null || echo 0) ))
+        if [ "$hb_age" -lt 120 ]; then
+            ok "Heartbeat: fresh (${hb_age}s ago)"
+        else
+            warn "Heartbeat: stale (${hb_age}s ago — watchdog may not be running)"
+        fi
+    else
+        warn "Heartbeat: no heartbeat file found"
+    fi
+
     # Summary
     echo ""
     if [ "$issues" -eq 0 ]; then

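The heartbeat-age arithmetic relies on BSD `date -j -f`, which parses a timestamp without touching the clock (macOS-specific; GNU date would use `date -d`). A standalone sketch with an illustrative timestamp:

    hb="2025-01-01T00:00:00Z"   # example heartbeat contents
    hb_epoch=$(date -j -f "%Y-%m-%dT%H:%M:%SZ" "$hb" +%s 2>/dev/null || echo 0)
    hb_age=$(( $(date +%s) - hb_epoch ))
    echo "heartbeat age: ${hb_age}s"

One caveat: `-j -f` interprets the string in the local timezone, while the watchdog writes the heartbeat with `date -u`, so on a machine not set to UTC the computed age can carry a timezone offset; running the parse under `TZ=UTC` avoids that.
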
@@ -473,6 +546,7 @@ main() {
             configure_slack_cdp
             configure_audio
             configure_app_launches
+            configure_memory_watchdog
             configure_boot_services
             ;;
         --full|full|"")

@@ -482,6 +556,7 @@ main() {
             configure_slack_cdp
             configure_audio
             configure_app_launches
+            configure_memory_watchdog
             configure_boot_services

             section "Configuration Complete"

package/scripts/spawn-session.sh
CHANGED

@@ -28,6 +28,26 @@ if [ ! -f "$TASK_FILE" ]; then
     exit 1
 fi

+# --- Resource gate: refuse to spawn if machine is under pressure -------------
+MAX_CLAUDE_PROCS="${WATCHDOG_MAX_CLAUDE_PROCS:-8}"
+TOTAL_RAM_KB=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1 / 1024}')
+SPAWN_BLOCK_PERCENT="${WATCHDOG_CRITICAL_PERCENT:-75}"
+
+claude_count=$(/bin/ps -eo comm | grep -ci "claude" | tr -d ' ' || true)
+claude_rss_kb=$(/bin/ps -eo rss,comm | grep -i "claude" | grep -v "Claude.app" | grep -v "Claude Helper" | awk '{sum+=$1} END {print sum+0}')
+claude_pct=$(( (claude_rss_kb * 100) / TOTAL_RAM_KB ))
+
+if [ "$claude_count" -ge "$MAX_CLAUDE_PROCS" ]; then
+    echo "[$TIMESTAMP] BLOCKED: Too many claude processes ($claude_count >= $MAX_CLAUDE_PROCS)" >&2
+    exit 1
+fi
+
+if [ "$claude_pct" -ge "$SPAWN_BLOCK_PERCENT" ]; then
+    echo "[$TIMESTAMP] BLOCKED: Claude memory usage ${claude_pct}% >= ${SPAWN_BLOCK_PERCENT}% threshold" >&2
+    exit 1
+fi
+# -----------------------------------------------------------------------------
+
 # Create session directory
 mkdir -p "$SESSION_DIR" "$(dirname "$LOG_FILE")"

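The gate reads the same environment knobs as the watchdog, so limits can be tightened per spawn. An illustrative invocation (the task-file argument is assumed from the excerpt's `$TASK_FILE`; the values are arbitrary):

    # Refuse to spawn beyond 4 claude processes or past 60% claude RSS:
    WATCHDOG_MAX_CLAUDE_PROCS=4 WATCHDOG_CRITICAL_PERCENT=60 \
        ./scripts/spawn-session.sh <task-file>

Note that the process count (`grep -ci "claude"` over all command names) includes `Claude.app` and its helpers, while the RSS sum excludes them, so the two checks measure slightly different process sets.
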
package/scripts/watchdog/ai.maestro.memory-watchdog.plist
ADDED

@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>ai.maestro.memory-watchdog</string>
+
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>/Users/sophie/maestro/scripts/watchdog/memory-watchdog.sh</string>
+    </array>
+
+    <key>StartInterval</key>
+    <integer>30</integer>
+
+    <key>RunAtLoad</key>
+    <true/>
+
+    <key>StandardOutPath</key>
+    <string>/Users/sophie/maestro/logs/watchdog/launchd-stdout.log</string>
+
+    <key>StandardErrorPath</key>
+    <string>/Users/sophie/maestro/logs/watchdog/launchd-stderr.log</string>
+
+    <key>EnvironmentVariables</key>
+    <dict>
+        <key>PATH</key>
+        <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
+    </dict>
+
+    <key>Nice</key>
+    <integer>10</integer>
+
+    <key>ProcessType</key>
+    <string>Background</string>
+
+    <key>ThrottleInterval</key>
+    <integer>10</integer>
+</dict>
+</plist>

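With `RunAtLoad` plus a 30-second `StartInterval`, launchd runs the script immediately on load and then every 30 seconds; `ThrottleInterval` only rate-limits respawns if a run exits too quickly. To confirm the agent is alive after install (log path shown as rewritten for an install under `~/maestro`):

    launchctl list | grep ai.maestro.memory-watchdog    # label and last exit status
    tail -f ~/maestro/logs/watchdog/launchd-stderr.log  # stderr captured by launchd
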
package/scripts/watchdog/force-reboot.sh
ADDED

@@ -0,0 +1,148 @@
+#!/bin/bash
+# =============================================================================
+# force-reboot.sh — Remote/Automated Force Reboot for Frozen Mac Mini
+# =============================================================================
+#
+# This script provides multiple escalation levels for rebooting a frozen
+# or unresponsive Mac mini running the Maestro agent system.
+#
+# Usage:
+#   ./scripts/watchdog/force-reboot.sh --graceful   # Try graceful shutdown first
+#   ./scripts/watchdog/force-reboot.sh --force      # Immediate reboot (requires sudo)
+#   ./scripts/watchdog/force-reboot.sh --status     # Check heartbeat and uptime
+#
+# Remote usage (from another machine via SSH):
+#   ssh sophie@mac-mini.local "~/maestro/scripts/watchdog/force-reboot.sh --graceful"
+#
+# =============================================================================
+#
+# FORCE-REBOOT OPTIONS WHEN THE MACHINE IS COMPLETELY FROZEN:
+#
+# If SSH is unresponsive and the GUI is frozen, the only options are:
+#
+# 1. SMART PLUG (recommended for headless servers):
+#    - Install a WiFi smart plug (TP-Link Kasa, Shelly, etc.) on the Mac mini power
+#    - Power cycle via the smart plug's app or API
+#    - Combined with `pmset autorestart 1` (already configured), the Mac mini
+#      will boot automatically when power is restored
+#    - Example (TP-Link Kasa):
+#      kasa --host <plug-ip> --type plug off && sleep 5 && kasa --host <plug-ip> --type plug on
+#    - Example (Shelly):
+#      curl -s "http://<shelly-ip>/relay/0?turn=off" && sleep 5 && curl -s "http://<shelly-ip>/relay/0?turn=on"
+#
+# 2. PARSEC (if display is frozen but network is up):
+#    - Parsec may still be responsive even if the GUI is frozen
+#    - Connect via Parsec and use Cmd+Ctrl+Power to force restart
+#
+# 3. APPLE REMOTE MANAGEMENT (ARD):
+#    - If enabled, use "Send UNIX Command" from another Mac:
+#      sudo shutdown -r now
+#
+# 4. PHYSICAL POWER CYCLE:
+#    - Unplug and replug the Mac mini power cable
+#    - With `pmset autorestart 1`, it will boot automatically
+#
+# 5. SSH + SYSDIAGNOSE RESET (partial freeze, SSH still works):
+#    - ssh sophie@mac-mini.local "sudo reboot"
+#    - Or: ssh sophie@mac-mini.local "sudo shutdown -r now"
+#
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+MAESTRO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+HEARTBEAT_FILE="$MAESTRO_DIR/state/heartbeat"
+LOG_FILE="$MAESTRO_DIR/logs/watchdog/$(date +%Y-%m-%d)-reboot.log"
+
+log() {
+    local ts
+    ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+    echo "[$ts] $1" | tee -a "$LOG_FILE"
+}
+
+check_heartbeat() {
+    if [ ! -f "$HEARTBEAT_FILE" ]; then
+        echo "No heartbeat file found — watchdog may not be running"
+        return 1
+    fi
+
+    local hb_time hb_age
+    hb_time=$(cat "$HEARTBEAT_FILE")
+    hb_age=$(( $(date +%s) - $(date -j -f "%Y-%m-%dT%H:%M:%SZ" "$hb_time" +%s 2>/dev/null || echo 0) ))
+
+    echo "Last heartbeat: $hb_time (${hb_age}s ago)"
+    echo "System uptime: $(uptime)"
+
+    if [ "$hb_age" -gt 120 ]; then
+        echo "WARNING: Heartbeat is stale (>2 min) — watchdog may be stuck"
+        return 1
+    fi
+    return 0
+}
+
+graceful_reboot() {
+    log "Initiating graceful reboot..."
+
+    # 1. Trigger emergency stop to prevent new work
+    echo "Memory watchdog initiated graceful reboot" > "$MAESTRO_DIR/.emergency-stop"
+    log "Emergency stop flag set"
+
+    # 2. Give running sessions 30 seconds to wrap up
+    log "Waiting 30s for running sessions to complete..."
+    sleep 30
+
+    # 3. Kill remaining claude processes gracefully
+    log "Sending SIGTERM to remaining claude processes..."
+    pkill -TERM -f "claude" 2>/dev/null || true
+    sleep 5
+
+    # 4. Commit any uncommitted state
+    log "Attempting to commit state..."
+    (
+        cd "$MAESTRO_DIR"
+        git add -A state/ logs/ 2>/dev/null || true
+        git commit -m "chore: pre-reboot state save (watchdog-initiated)" 2>/dev/null || true
+        git push origin main 2>/dev/null || true
+    ) || true
+
+    # 5. Reboot
+    log "Rebooting now..."
+    sudo shutdown -r now 2>/dev/null || {
+        log "sudo shutdown failed — trying osascript..."
+        osascript -e 'tell application "System Events" to restart' 2>/dev/null || {
+            log "All reboot methods failed — manual intervention required"
+            exit 1
+        }
+    }
+}
+
+force_reboot() {
+    log "FORCE REBOOT initiated"
+    echo "Force reboot requested" > "$MAESTRO_DIR/.emergency-stop" 2>/dev/null || true
+    sudo reboot 2>/dev/null || sudo shutdown -r now 2>/dev/null || {
+        log "sudo reboot failed — requires password or system is frozen"
+        echo "ERROR: Cannot force reboot without sudo. Options:"
+        echo "  1. Run: sudo reboot"
+        echo "  2. Power cycle via smart plug"
+        echo "  3. Physical power cycle"
+        exit 1
+    }
+}
+
+case "${1:---status}" in
+    --status)
+        check_heartbeat
+        ;;
+    --graceful)
+        graceful_reboot
+        ;;
+    --force)
+        force_reboot
+        ;;
+    *)
+        echo "Usage: $0 [--status|--graceful|--force]"
+        exit 1
+        ;;
+esac

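Because `--status` exits non-zero when the heartbeat is missing or stale, the script composes into an external monitor. A hypothetical watcher running on another machine, echoing the smart-plug fallback from the header comments (the host and `<shelly-ip>` are illustrative):

    #!/bin/bash
    # Escalation: status check -> graceful reboot -> smart-plug power cycle
    host="sophie@mac-mini.local"
    if ! ssh -o ConnectTimeout=10 "$host" "~/maestro/scripts/watchdog/force-reboot.sh --status"; then
        ssh -o ConnectTimeout=10 "$host" "~/maestro/scripts/watchdog/force-reboot.sh --graceful" || {
            curl -s "http://<shelly-ip>/relay/0?turn=off"
            sleep 5
            curl -s "http://<shelly-ip>/relay/0?turn=on"
        }
    fi
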
package/scripts/watchdog/memory-watchdog.sh
ADDED

@@ -0,0 +1,325 @@
+#!/bin/bash
+# =============================================================================
+# memory-watchdog.sh — System Resource Watchdog for Maestro Agent Deployment
+# =============================================================================
+#
+# Monitors memory usage and process count for Claude Code subprocesses.
+# Kills runaway subagents when thresholds are exceeded.
+# Triggers emergency stop at critical levels.
+#
+# Runs every 30 seconds via launchd (ai.maestro.memory-watchdog.plist).
+#
+# Thresholds (configurable via environment or defaults below):
+#   WARNING:   >60% RAM used by claude processes → log warning
+#   CRITICAL:  >75% RAM used by claude processes → kill oldest subagents
+#   EMERGENCY: >85% total system memory pressure → emergency stop
+#
+# Usage:
+#   ./scripts/watchdog/memory-watchdog.sh            # Normal run
+#   ./scripts/watchdog/memory-watchdog.sh --check    # Report status only
+#   ./scripts/watchdog/memory-watchdog.sh --dry-run  # Show what would be killed
+#
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+MAESTRO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# Auto-detect agent directory (sophie-ai, wundr, etc.)
+# The watchdog runs from maestro but protects the whole machine
+AGENT_DIR="${AGENT_DIR:-}"
+
+# --- Configuration -----------------------------------------------------------
+
+# Total physical RAM in KB
+TOTAL_RAM_KB=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1 / 1024}')
+
+# Thresholds as percentage of total RAM for claude process RSS
+CLAUDE_WARN_PERCENT="${WATCHDOG_WARN_PERCENT:-60}"
+CLAUDE_CRITICAL_PERCENT="${WATCHDOG_CRITICAL_PERCENT:-75}"
+
+# System-wide memory pressure threshold (percentage)
+SYSTEM_EMERGENCY_PERCENT="${WATCHDOG_EMERGENCY_PERCENT:-85}"
+
+# Max concurrent claude CLI processes (excluding the main interactive session)
+MAX_CLAUDE_PROCS="${WATCHDOG_MAX_CLAUDE_PROCS:-8}"
+
+# Minimum age (seconds) before a claude process can be killed
+# Protects freshly-spawned sessions from being killed immediately
+MIN_AGE_SECONDS="${WATCHDOG_MIN_AGE:-60}"
+
+# Log file
+LOG_DIR="$MAESTRO_DIR/logs/watchdog"
+LOG_FILE="$LOG_DIR/$(date +%Y-%m-%d)-watchdog.jsonl"
+
+# State file for tracking actions across runs
+STATE_FILE="$MAESTRO_DIR/state/watchdog-state.yaml"
+
+# --- Helpers -----------------------------------------------------------------
+
+mkdir -p "$LOG_DIR" "$(dirname "$STATE_FILE")"
+
+log_event() {
+    local level="$1" event="$2" detail="${3:-}"
+    local ts
+    ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+    printf '{"ts":"%s","level":"%s","event":"%s","detail":%s}\n' \
+        "$ts" "$level" "$event" "$(echo "$detail" | python3 -c 'import sys,json; print(json.dumps(sys.stdin.read().strip()))' 2>/dev/null || echo '""')" \
+        >> "$LOG_FILE"
+}
+
+# --- Memory Metrics ----------------------------------------------------------
+
+get_system_memory_pressure() {
+    # macOS memory_pressure returns a percentage of total memory under pressure
+    # We parse the "System-wide memory free percentage" line
+    local free_pct
+    free_pct=$(memory_pressure 2>/dev/null | grep "System-wide memory free percentage:" | awk '{print $NF}' | tr -d '%')
+    if [ -z "$free_pct" ]; then
+        # Fallback: calculate from vm_stat
+        local page_size free_pages
+        page_size=$(vm_stat | head -1 | grep -o '[0-9]*')
+        free_pages=$(vm_stat | awk '/Pages free/ {print $NF}' | tr -d '.')
+        local free_kb=$(( (free_pages * page_size) / 1024 ))
+        free_pct=$(( (free_kb * 100) / TOTAL_RAM_KB ))
+    fi
+    echo $(( 100 - free_pct ))
+}
+
+get_claude_processes() {
+    # Returns: PID RSS_KB ELAPSED_SECONDS COMMAND
+    # Finds all claude CLI processes (both 'claude' and node-based claude)
+    # Excludes this watchdog script itself
+    /bin/ps -eo pid,rss,etime,comm | \
+        grep -i "claude" | \
+        grep -v "Claude.app" | \
+        grep -v "Claude Helper" | \
+        grep -v "grep" | \
+        grep -v "watchdog" | \
+        awk '{
+            pid = $1
+            rss = $2
+            etime = $3
+            comm = $4
+            # Parse etime (formats: SS, MM:SS, HH:MM:SS, D-HH:MM:SS)
+            n = split(etime, parts, ":")
+            if (n == 1) { secs = parts[1] }
+            else if (n == 2) { secs = parts[1]*60 + parts[2] }
+            else if (n == 3) {
+                # Check for D- prefix
+                if (index(parts[1], "-") > 0) {
+                    split(parts[1], dp, "-")
+                    secs = dp[1]*86400 + dp[2]*3600 + parts[2]*60 + parts[3]
+                } else {
+                    secs = parts[1]*3600 + parts[2]*60 + parts[3]
+                }
+            }
+            printf "%s %s %s %s\n", pid, rss, secs, comm
+        }' | sort -k2 -rn
+}
+
+get_total_claude_rss_kb() {
+    get_claude_processes | awk '{sum += $2} END {print sum+0}'
+}
+
+count_claude_processes() {
+    get_claude_processes | wc -l | tr -d ' '
+}
+
+# --- Identification ----------------------------------------------------------
+
+identify_main_session() {
+    # The main interactive claude session is typically:
+    #   1. The one connected to a TTY
+    #   2. The longest-running one
+    #   3. The parent of subagent processes
+    # We use the longest-running as a heuristic
+    get_claude_processes | tail -1 | awk '{print $1}'
+}
+
+# --- Actions -----------------------------------------------------------------
+
+kill_oldest_subagents() {
+    local target_count="$1"
+    local dry_run="${2:-false}"
+    local main_pid
+    main_pid=$(identify_main_session)
+    local killed=0
+
+    # Get processes sorted by RSS descending (kill biggest first)
+    # Skip the main session
+    while IFS=' ' read -r pid rss age comm; do
+        if [ "$pid" = "$main_pid" ]; then
+            continue
+        fi
+        if [ "$age" -lt "$MIN_AGE_SECONDS" ]; then
+            log_event "info" "skip_young" "PID $pid age ${age}s < ${MIN_AGE_SECONDS}s minimum"
+            continue
+        fi
+        if [ "$killed" -ge "$target_count" ]; then
+            break
+        fi
+
+        local rss_mb=$(( rss / 1024 ))
+        if [ "$dry_run" = "true" ]; then
+            echo "  [DRY RUN] Would kill PID $pid (${rss_mb}MB, age ${age}s)"
+            log_event "info" "dry_run_kill" "PID=$pid RSS=${rss_mb}MB age=${age}s"
+        else
+            echo "  Killing PID $pid (${rss_mb}MB, age ${age}s)"
+            kill -TERM "$pid" 2>/dev/null || true
+            log_event "warn" "killed_subagent" "PID=$pid RSS=${rss_mb}MB age=${age}s signal=TERM"
+            killed=$((killed + 1))
+
+            # Give it 5 seconds to exit gracefully, then SIGKILL
+            (
+                sleep 5
+                if kill -0 "$pid" 2>/dev/null; then
+                    kill -9 "$pid" 2>/dev/null || true
+                    # Log to file directly since this is a subshell
+                    printf '{"ts":"%s","level":"warn","event":"force_killed","detail":"PID=%s did not exit after SIGTERM"}\n' \
+                        "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$pid" >> "$LOG_FILE"
+                fi
+            ) &
+        fi
+    done <<< "$(get_claude_processes)"
+
+    echo "$killed"
+}
+
+trigger_emergency_stop() {
+    local reason="$1"
+    local stop_file="$MAESTRO_DIR/.emergency-stop"
+
+    if [ -f "$stop_file" ]; then
+        log_event "warn" "emergency_already_active" "$reason"
+        return 0
+    fi
+
+    echo "$reason" > "$stop_file"
+    log_event "critical" "emergency_stop_triggered" "$reason"
+
+    # Also create in any detected agent directories
+    for agent_dir in "$HOME"/sophie-ai "$HOME"/wundr; do
+        if [ -d "$agent_dir" ]; then
+            echo "$reason" > "$agent_dir/.emergency-stop" 2>/dev/null || true
+        fi
+    done
+
+    # Kill ALL non-main claude subprocesses
+    local main_pid
+    main_pid=$(identify_main_session)
+    while IFS=' ' read -r pid rss age comm; do
+        if [ "$pid" != "$main_pid" ] && [ -n "$pid" ]; then
+            kill -TERM "$pid" 2>/dev/null || true
+        fi
+    done <<< "$(get_claude_processes)"
+
+    echo "EMERGENCY STOP: $reason"
+}
+
+# --- Main Logic --------------------------------------------------------------
+
+run_watchdog() {
+    local mode="${1:-run}"
+    local dry_run="false"
+    [ "$mode" = "--dry-run" ] && dry_run="true"
+    [ "$mode" = "--check" ] && dry_run="true"
+
+    # Collect metrics
+    local claude_rss_kb claude_count system_pressure
+    claude_rss_kb=$(get_total_claude_rss_kb)
+    claude_count=$(count_claude_processes)
+    system_pressure=$(get_system_memory_pressure)
+
+    local claude_rss_mb=$(( claude_rss_kb / 1024 ))
+    local claude_pct=$(( (claude_rss_kb * 100) / TOTAL_RAM_KB ))
+    local warn_threshold_mb=$(( (TOTAL_RAM_KB * CLAUDE_WARN_PERCENT) / 100 / 1024 ))
+    local critical_threshold_mb=$(( (TOTAL_RAM_KB * CLAUDE_CRITICAL_PERCENT) / 100 / 1024 ))
+
+    # Report
+    if [ "$mode" = "--check" ]; then
+        echo "=== Memory Watchdog Status ==="
+        echo "Total RAM: $(( TOTAL_RAM_KB / 1024 ))MB"
+        echo "Claude processes: $claude_count (max: $MAX_CLAUDE_PROCS)"
+        echo "Claude RSS total: ${claude_rss_mb}MB (${claude_pct}% of RAM)"
+        echo "  Warning threshold: ${warn_threshold_mb}MB (${CLAUDE_WARN_PERCENT}%)"
+        echo "  Critical threshold: ${critical_threshold_mb}MB (${CLAUDE_CRITICAL_PERCENT}%)"
+        echo "System mem pressure: ${system_pressure}% (emergency at ${SYSTEM_EMERGENCY_PERCENT}%)"
+        echo ""
+        echo "Claude processes:"
+        get_claude_processes | while IFS=' ' read -r pid rss age comm; do
+            echo "  PID $pid: $(( rss / 1024 ))MB, age ${age}s — $comm"
+        done
+        echo "=============================="
+        return 0
+    fi
+
+    # --- Level 1: System emergency (memory pressure) ---------------------------
+    if [ "$system_pressure" -ge "$SYSTEM_EMERGENCY_PERCENT" ]; then
+        log_event "critical" "system_memory_emergency" "pressure=${system_pressure}% >= ${SYSTEM_EMERGENCY_PERCENT}%"
+        if [ "$dry_run" = "false" ]; then
+            trigger_emergency_stop "Memory pressure ${system_pressure}% exceeded ${SYSTEM_EMERGENCY_PERCENT}% emergency threshold"
+        else
+            echo "[DRY RUN] Would trigger EMERGENCY STOP (pressure=${system_pressure}%)"
+        fi
+        return 0
+    fi
+
+    # --- Level 2: Claude RSS critical — kill biggest subagents -----------------
+    if [ "$claude_pct" -ge "$CLAUDE_CRITICAL_PERCENT" ]; then
+        log_event "warn" "claude_memory_critical" "RSS=${claude_rss_mb}MB (${claude_pct}%) >= ${CLAUDE_CRITICAL_PERCENT}%"
+        echo "CRITICAL: Claude processes using ${claude_rss_mb}MB (${claude_pct}% of RAM)"
+
+        # Kill enough to get below warning threshold
+        local target_rss_kb=$(( (TOTAL_RAM_KB * CLAUDE_WARN_PERCENT) / 100 ))
+        local excess_kb=$(( claude_rss_kb - target_rss_kb ))
+        # Estimate ~600MB per process
+        local procs_to_kill=$(( (excess_kb / 614400) + 1 ))
+        [ "$procs_to_kill" -lt 1 ] && procs_to_kill=1
+
+        echo "  Killing $procs_to_kill subagent(s) to free ~$(( excess_kb / 1024 ))MB..."
+        kill_oldest_subagents "$procs_to_kill" "$dry_run"
+        return 0
+    fi
+
+    # --- Level 3: Too many concurrent processes --------------------------------
+    if [ "$claude_count" -gt "$MAX_CLAUDE_PROCS" ]; then
+        local excess=$(( claude_count - MAX_CLAUDE_PROCS ))
+        log_event "warn" "too_many_claude_procs" "count=${claude_count} > max=${MAX_CLAUDE_PROCS}"
+        echo "WARNING: $claude_count claude processes running (max: $MAX_CLAUDE_PROCS)"
+        echo "  Killing $excess oldest subagent(s)..."
+        kill_oldest_subagents "$excess" "$dry_run"
+        return 0
+    fi
+
+    # --- Level 4: Claude RSS warning — log only --------------------------------
+    if [ "$claude_pct" -ge "$CLAUDE_WARN_PERCENT" ]; then
+        log_event "warn" "claude_memory_warning" "RSS=${claude_rss_mb}MB (${claude_pct}%) >= ${CLAUDE_WARN_PERCENT}%"
+        return 0
+    fi
+
+    # --- All clear -------------------------------------------------------------
+    # Only log periodically (every 5 minutes = every 10th run at 30s interval)
+    local run_count_file="$MAESTRO_DIR/state/watchdog-run-count"
+    local run_count=0
+    [ -f "$run_count_file" ] && run_count=$(cat "$run_count_file")
+    run_count=$(( (run_count + 1) % 10 ))
+    echo "$run_count" > "$run_count_file"
+
+    if [ "$run_count" -eq 0 ]; then
+        log_event "info" "healthy" "claude_procs=${claude_count} rss=${claude_rss_mb}MB (${claude_pct}%) pressure=${system_pressure}%"
+    fi
+}
+
+# --- Write heartbeat ---------------------------------------------------------
+
+write_heartbeat() {
+    local heartbeat_file="$MAESTRO_DIR/state/heartbeat"
+    date -u +"%Y-%m-%dT%H:%M:%SZ" > "$heartbeat_file"
+}
+
+# --- Entry point -------------------------------------------------------------
+
+write_heartbeat
+run_watchdog "${1:-run}"
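The two read-only modes make the escalation ladder testable before trusting it with live sessions: `--check` prints the status report and exits, and the env overrides can force a code path without real memory pressure. For example (threshold values deliberately absurd to trigger the critical branch):

    ./scripts/watchdog/memory-watchdog.sh --check        # status report only
    WATCHDOG_WARN_PERCENT=1 WATCHDOG_CRITICAL_PERCENT=2 \
        ./scripts/watchdog/memory-watchdog.sh --dry-run  # logs would-be kills, kills nothing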