@adaptic/maestro 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/setup/boot-claude-session.sh +89 -0
- package/scripts/setup/configure-macos.sh +166 -6
- package/scripts/spawn-session.sh +20 -0
- package/scripts/watchdog/ai.maestro.memory-watchdog.plist +41 -0
- package/scripts/watchdog/force-reboot.sh +148 -0
- package/scripts/watchdog/memory-watchdog.sh +329 -0
package/package.json
CHANGED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# =============================================================================
|
|
3
|
+
# boot-claude-session.sh — Launch interactive Claude Code session on boot
|
|
4
|
+
# =============================================================================
|
|
5
|
+
#
|
|
6
|
+
# Opens Terminal.app with a Claude Code interactive session in the agent's
|
|
7
|
+
# working directory. Called via launchd at login.
|
|
8
|
+
#
|
|
9
|
+
# This is separate from the sophie-daemon (which runs headless as a Node.js
|
|
10
|
+
# process for polling/dispatching). This script provides a visible, interactive
|
|
11
|
+
# Claude Code session that the operator can observe and interact with.
|
|
12
|
+
#
|
|
13
|
+
# Usage:
|
|
14
|
+
# ./scripts/setup/boot-claude-session.sh [agent-dir]
|
|
15
|
+
#
|
|
16
|
+
# =============================================================================
|
|
17
|
+
|
|
18
|
+
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MAESTRO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Resolve the agent directory: explicit first argument wins, otherwise fall
# back to ~/sophie-ai when it exists.
AGENT_DIR="${1:-}"
if [ -z "$AGENT_DIR" ]; then
  if [ -d "$HOME/sophie-ai" ]; then
    AGENT_DIR="$HOME/sophie-ai"
  else
    echo "ERROR: No agent directory specified and ~/sophie-ai not found" >&2
    exit 1
  fi
fi

# Fail early instead of opening a Terminal window whose `cd` would fail and
# silently prevent claude from starting.
if [ ! -d "$AGENT_DIR" ]; then
  echo "ERROR: Agent directory not found: $AGENT_DIR" >&2
  exit 1
fi

AGENT_NAME=$(basename "$AGENT_DIR")
LOG_FILE="$MAESTRO_DIR/logs/watchdog/$(date +%Y-%m-%d)-boot-session.log"
mkdir -p "$(dirname "$LOG_FILE")"

# log MESSAGE — print MESSAGE with a UTC timestamp and append it to LOG_FILE.
log() {
  echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] $1" | tee -a "$LOG_FILE"
}

# Wait for the system to settle after login (network, services, etc.)
log "Waiting 15s for system to settle..."
sleep 15

# Check for emergency stop (agent-level flag first, then maestro-level flag)
if [ -f "$AGENT_DIR/.emergency-stop" ]; then
  log "Emergency stop active — not starting Claude session"
  exit 0
fi

if [ -f "$MAESTRO_DIR/.emergency-stop" ]; then
  log "Emergency stop active in maestro — not starting Claude session"
  exit 0
fi

# Locate the claude CLI: PATH lookup first (`command -v` is the portable
# builtin; `which` is an external tool with inconsistent behavior), then a few
# well-known install locations.
CLAUDE_PATH="$(command -v claude 2>/dev/null || true)"
if [ -z "$CLAUDE_PATH" ]; then
  for candidate in "$HOME/.local/bin/claude" "/opt/homebrew/bin/claude" "/usr/local/bin/claude"; do
    if [ -x "$candidate" ]; then
      CLAUDE_PATH="$candidate"
      break
    fi
  done
fi

if [ -z "$CLAUDE_PATH" ]; then
  log "ERROR: claude CLI not found"
  exit 1
fi

log "Starting Claude Code session in $AGENT_DIR (claude: $CLAUDE_PATH)"

# Open Terminal.app with Claude Code running in the agent directory.
# Values are handed to AppleScript as argv and shell-quoted there with
# `quoted form of`, so a path containing quotes or spaces can neither break
# the AppleScript string nor inject shell commands. The heredoc delimiter is
# quoted so the shell performs no expansion inside the AppleScript.
osascript - "$AGENT_DIR" "$CLAUDE_PATH" "$AGENT_NAME — Claude Code" "$(date +%Y-%m-%d)" <<'APPLESCRIPT'
on run argv
  set agentDir to item 1 of argv
  set claudePath to item 2 of argv
  set windowTitle to item 3 of argv
  set bootDate to item 4 of argv

  set bannerLines to {"╔══════════════════════════════════════════════════════════╗", "║ Sophie AI — Boot Session (" & bootDate & ") ║", "╠══════════════════════════════════════════════════════════╣", "║ Agent dir: " & agentDir, "║ Claude: " & claudePath, "╚══════════════════════════════════════════════════════════╝", ""}

  set shellCmd to "cd " & quoted form of agentDir & " && clear"
  repeat with bannerLine in bannerLines
    set shellCmd to shellCmd & " && echo " & quoted form of (contents of bannerLine)
  end repeat
  set shellCmd to shellCmd & " && " & quoted form of claudePath

  tell application "Terminal"
    activate
    -- Open a new window with claude running in the agent directory
    do script shellCmd
    -- Set the window title
    set custom title of front window to windowTitle
  end tell
end run
APPLESCRIPT

log "Terminal window opened with Claude Code session"
|
|
@@ -70,6 +70,32 @@ configure_auto_login() {
|
|
|
70
70
|
defaults write com.apple.loginwindow LoginwindowLaunchesRelaunchApps -bool false
|
|
71
71
|
|
|
72
72
|
ok "Auto-login enabled for $SUDO_USER"
|
|
73
|
+
|
|
74
|
+
# --- Disable all password gates that could block unattended operation -------
|
|
75
|
+
local real_home
|
|
76
|
+
real_home=$(eval echo "~$SUDO_USER")
|
|
77
|
+
|
|
78
|
+
# Disable "Require password after sleep or screen saver"
|
|
79
|
+
sudo -u "$SUDO_USER" defaults write com.apple.screensaver askForPassword -int 0
|
|
80
|
+
sudo -u "$SUDO_USER" defaults write com.apple.screensaver askForPasswordDelay -int 0
|
|
81
|
+
ok "Screen saver password prompt disabled"
|
|
82
|
+
|
|
83
|
+
# Disable screen lock via sysadminctl
|
|
84
|
+
sysadminctl -screenLock off 2>/dev/null || true
|
|
85
|
+
ok "Screen lock disabled"
|
|
86
|
+
|
|
87
|
+
# Disable idle logout (Privacy & Security > Log out when idle)
|
|
88
|
+
defaults write /Library/Preferences/.GlobalPreferences com.apple.autologout.AutoLogOutDelay -int 0
|
|
89
|
+
ok "Idle auto-logout disabled"
|
|
90
|
+
|
|
91
|
+
# Disable screen lock on wake for current user
|
|
92
|
+
sudo -u "$SUDO_USER" defaults -currentHost write com.apple.screensaver idleTime -int 0
|
|
93
|
+
ok "Screen saver idle timer set to never"
|
|
94
|
+
|
|
95
|
+
# Prevent display from sleeping and requiring login on wake
|
|
96
|
+
# (already handled by pmset displaysleep 0, but belt-and-suspenders)
|
|
97
|
+
defaults write /Library/Preferences/com.apple.loginwindow DisableScreenLock -bool true 2>/dev/null || true
|
|
98
|
+
|
|
73
99
|
warn "NOTE: FileVault must be disabled for auto-login to work"
|
|
74
100
|
warn "Check: fdesetup status"
|
|
75
101
|
|
|
@@ -296,14 +322,122 @@ configure_app_launches() {
|
|
|
296
322
|
# The poller uses IMAP directly, not the browser
|
|
297
323
|
log "Safari: not needed as Login Item (Gmail uses IMAP polling)"
|
|
298
324
|
|
|
299
|
-
# Ensure required directories exist
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
325
|
+
# Ensure required directories exist (as the real user, not root)
|
|
326
|
+
local real_user="${SUDO_USER:-$CURRENT_USER}"
|
|
327
|
+
local dirs=("$AGENT_DIR/logs/huddle" "$AGENT_DIR/state/huddle" "$AGENT_DIR/logs/watchdog")
|
|
328
|
+
for d in "${dirs[@]}"; do
|
|
329
|
+
mkdir -p "$d"
|
|
330
|
+
chown "$real_user:staff" "$d" 2>/dev/null || true
|
|
331
|
+
done
|
|
332
|
+
ok "Log and state directories verified (owned by $real_user)"
|
|
333
|
+
|
|
334
|
+
# --- Claude Code boot session (opens Terminal with interactive claude) ------
|
|
335
|
+
local BOOT_SCRIPT="$AGENT_DIR/scripts/setup/boot-claude-session.sh"
|
|
336
|
+
if [ -f "$BOOT_SCRIPT" ]; then
|
|
337
|
+
chmod +x "$BOOT_SCRIPT"
|
|
338
|
+
|
|
339
|
+
# Resolve the real user's home BEFORE building PLIST_PATH. Previously
# PLIST_PATH was assigned while real_home was still undeclared, so the plist
# path collapsed to /Library/LaunchAgents/... (or aborted under `set -u`).
local real_home
real_home=$(eval echo "~$real_user")
local PLIST_NAME="ai.maestro.boot-claude-session"
local PLIST_PATH="$real_home/Library/LaunchAgents/${PLIST_NAME}.plist"
|
|
343
|
+
|
|
344
|
+
# Detect the agent working directory (prefer sophie-ai)
|
|
345
|
+
local boot_agent_dir="$HOME/sophie-ai"
|
|
346
|
+
[ -d "$boot_agent_dir" ] || boot_agent_dir="$AGENT_DIR"
|
|
347
|
+
|
|
348
|
+
cat > "$PLIST_PATH" << PLIST_EOF
|
|
349
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
350
|
+
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
|
351
|
+
<plist version="1.0">
|
|
352
|
+
<dict>
|
|
353
|
+
<key>Label</key>
|
|
354
|
+
<string>${PLIST_NAME}</string>
|
|
355
|
+
<key>ProgramArguments</key>
|
|
356
|
+
<array>
|
|
357
|
+
<string>/bin/bash</string>
|
|
358
|
+
<string>${BOOT_SCRIPT}</string>
|
|
359
|
+
<string>${boot_agent_dir}</string>
|
|
360
|
+
</array>
|
|
361
|
+
<key>RunAtLoad</key>
|
|
362
|
+
<true/>
|
|
363
|
+
<key>StandardOutPath</key>
|
|
364
|
+
<string>${AGENT_DIR}/logs/watchdog/boot-session-stdout.log</string>
|
|
365
|
+
<key>StandardErrorPath</key>
|
|
366
|
+
<string>${AGENT_DIR}/logs/watchdog/boot-session-stderr.log</string>
|
|
367
|
+
<key>EnvironmentVariables</key>
|
|
368
|
+
<dict>
|
|
369
|
+
<key>PATH</key>
|
|
370
|
+
<string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:${real_home}/.local/bin</string>
|
|
371
|
+
<key>HOME</key>
|
|
372
|
+
<string>${real_home}</string>
|
|
373
|
+
</dict>
|
|
374
|
+
</dict>
|
|
375
|
+
</plist>
|
|
376
|
+
PLIST_EOF
|
|
377
|
+
|
|
378
|
+
chown "$real_user:staff" "$PLIST_PATH" 2>/dev/null || true
|
|
379
|
+
|
|
380
|
+
# Load as the real user
|
|
381
|
+
if [ "$(id -u)" -eq 0 ]; then
|
|
382
|
+
su "$real_user" -c "launchctl unload '$PLIST_PATH' 2>/dev/null; launchctl load '$PLIST_PATH'" 2>/dev/null || true
|
|
383
|
+
else
|
|
384
|
+
launchctl unload "$PLIST_PATH" 2>/dev/null || true
|
|
385
|
+
launchctl load "$PLIST_PATH" 2>/dev/null || true
|
|
386
|
+
fi
|
|
387
|
+
|
|
388
|
+
ok "Claude Code boot session agent installed (opens Terminal + claude on login)"
|
|
389
|
+
else
|
|
390
|
+
warn "Boot session script not found at $BOOT_SCRIPT"
|
|
391
|
+
fi
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
# =============================================================================
|
|
395
|
+
# 7. MEMORY WATCHDOG
|
|
396
|
+
# =============================================================================
|
|
397
|
+
|
|
398
|
+
# Install and (re)load the memory watchdog as a per-user launchd agent.
#
# Globals read: AGENT_DIR, SUDO_USER, CURRENT_USER
# Helpers used: section, ok, fail, log (defined elsewhere in this file)
# Returns:      0 always; problems are reported through `fail` so the rest of
#               the configuration run can continue.
configure_memory_watchdog() {
  section "Memory Watchdog (OOM Protection)"

  local WATCHDOG_SCRIPT="$AGENT_DIR/scripts/watchdog/memory-watchdog.sh"
  local PLIST_SOURCE="$AGENT_DIR/scripts/watchdog/ai.maestro.memory-watchdog.plist"

  if [ ! -f "$WATCHDOG_SCRIPT" ]; then
    fail "Watchdog script not found at $WATCHDOG_SCRIPT"
    return 0
  fi

  # Without this check a missing template makes the redirect below create an
  # empty plist, which would then be handed to launchctl.
  if [ ! -f "$PLIST_SOURCE" ]; then
    fail "Watchdog plist template not found at $PLIST_SOURCE"
    return 0
  fi

  chmod +x "$WATCHDOG_SCRIPT"
  ok "Watchdog script executable"

  # Determine the real user's home for LaunchAgents
  local real_user="${SUDO_USER:-$CURRENT_USER}"
  local real_home
  real_home=$(eval echo "~$real_user")
  local launch_agents_dir="$real_home/Library/LaunchAgents"
  local PLIST_DEST="$launch_agents_dir/ai.maestro.memory-watchdog.plist"

  # Create the log directory BEFORE loading the agent: the plist points its
  # stdout/stderr paths into it and the job runs at load.
  mkdir -p "$AGENT_DIR/logs/watchdog"
  chown "$real_user:staff" "$AGENT_DIR/logs/watchdog" 2>/dev/null || true

  # The LaunchAgents directory may not exist yet for this user.
  if [ ! -d "$launch_agents_dir" ]; then
    mkdir -p "$launch_agents_dir"
    chown "$real_user:staff" "$launch_agents_dir" 2>/dev/null || true
  fi

  # Install the plist (update paths for this agent directory). Escape the
  # characters that are special in the sed replacement / delimiter.
  local escaped_agent_dir
  escaped_agent_dir=$(printf '%s' "$AGENT_DIR" | sed 's/[|&\\]/\\&/g')
  if ! sed "s|/Users/sophie/maestro|$escaped_agent_dir|g" "$PLIST_SOURCE" > "$PLIST_DEST"; then
    fail "Could not write watchdog plist to $PLIST_DEST"
    return 0
  fi
  chown "$real_user:staff" "$PLIST_DEST" 2>/dev/null || true

  # Load it (as the real user); unload first so a changed plist takes effect.
  if [ "$(id -u)" -eq 0 ]; then
    su "$real_user" -c "launchctl unload '$PLIST_DEST' 2>/dev/null; launchctl load '$PLIST_DEST'" 2>/dev/null || true
  else
    launchctl unload "$PLIST_DEST" 2>/dev/null || true
    launchctl load "$PLIST_DEST" 2>/dev/null || true
  fi

  ok "Memory watchdog launchd agent installed (runs every 30s)"
  log "Thresholds: warn=60% critical=75% emergency=85% max_procs=8"
  log "Override via env: WATCHDOG_WARN_PERCENT, WATCHDOG_CRITICAL_PERCENT, etc."
}
|
|
304
438
|
|
|
305
439
|
# =============================================================================
|
|
306
|
-
#
|
|
440
|
+
# 8. ORCHESTRATOR AND SUBSYSTEMS ON BOOT
|
|
307
441
|
# =============================================================================
|
|
308
442
|
|
|
309
443
|
configure_boot_services() {
|
|
@@ -364,7 +498,7 @@ configure_boot_services() {
|
|
|
364
498
|
}
|
|
365
499
|
|
|
366
500
|
# =============================================================================
|
|
367
|
-
#
|
|
501
|
+
# 9. SYSTEM VERIFICATION
|
|
368
502
|
# =============================================================================
|
|
369
503
|
|
|
370
504
|
verify_all() {
|
|
@@ -440,6 +574,30 @@ verify_all() {
|
|
|
440
574
|
issues=$((issues + 1))
|
|
441
575
|
fi
|
|
442
576
|
|
|
577
|
+
# Check memory watchdog
|
|
578
|
+
if launchctl list 2>/dev/null | grep -q "ai.maestro.memory-watchdog"; then
|
|
579
|
+
ok "Memory watchdog: running"
|
|
580
|
+
elif [ -f "$HOME/Library/LaunchAgents/ai.maestro.memory-watchdog.plist" ]; then
|
|
581
|
+
warn "Memory watchdog: installed but not loaded"
|
|
582
|
+
else
|
|
583
|
+
fail "Memory watchdog: not installed"
|
|
584
|
+
issues=$((issues + 1))
|
|
585
|
+
fi
|
|
586
|
+
|
|
587
|
+
# Check heartbeat freshness
|
|
588
|
+
local heartbeat_file="$AGENT_DIR/state/heartbeat"
|
|
589
|
+
if [ -f "$heartbeat_file" ]; then
|
|
590
|
+
local hb_age
|
|
591
|
+
hb_age=$(( $(date +%s) - $(date -j -f "%Y-%m-%dT%H:%M:%SZ" "$(cat "$heartbeat_file")" +%s 2>/dev/null || echo 0) ))
|
|
592
|
+
if [ "$hb_age" -lt 120 ]; then
|
|
593
|
+
ok "Heartbeat: fresh (${hb_age}s ago)"
|
|
594
|
+
else
|
|
595
|
+
warn "Heartbeat: stale (${hb_age}s ago — watchdog may not be running)"
|
|
596
|
+
fi
|
|
597
|
+
else
|
|
598
|
+
warn "Heartbeat: no heartbeat file found"
|
|
599
|
+
fi
|
|
600
|
+
|
|
443
601
|
# Summary
|
|
444
602
|
echo ""
|
|
445
603
|
if [ "$issues" -eq 0 ]; then
|
|
@@ -473,6 +631,7 @@ main() {
|
|
|
473
631
|
configure_slack_cdp
|
|
474
632
|
configure_audio
|
|
475
633
|
configure_app_launches
|
|
634
|
+
configure_memory_watchdog
|
|
476
635
|
configure_boot_services
|
|
477
636
|
;;
|
|
478
637
|
--full|full|"")
|
|
@@ -482,6 +641,7 @@ main() {
|
|
|
482
641
|
configure_slack_cdp
|
|
483
642
|
configure_audio
|
|
484
643
|
configure_app_launches
|
|
644
|
+
configure_memory_watchdog
|
|
485
645
|
configure_boot_services
|
|
486
646
|
|
|
487
647
|
section "Configuration Complete"
|
package/scripts/spawn-session.sh
CHANGED
|
@@ -28,6 +28,26 @@ if [ ! -f "$TASK_FILE" ]; then
|
|
|
28
28
|
exit 1
|
|
29
29
|
fi
|
|
30
30
|
|
|
31
|
+
# --- Resource gate: refuse to spawn if machine is under pressure -------------
# Limits are shared with the memory watchdog via the WATCHDOG_* variables.
MAX_CLAUDE_PROCS="${WATCHDOG_MAX_CLAUDE_PROCS:-8}"
SPAWN_BLOCK_PERCENT="${WATCHDOG_CRITICAL_PERCENT:-75}"

# Total physical RAM in KB. sysctl is called by absolute path because a
# launchd-provided PATH may not contain /usr/sbin. 0 means "unknown".
TOTAL_RAM_KB=$(/usr/sbin/sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f", $1 / 1024}') || TOTAL_RAM_KB=0
TOTAL_RAM_KB="${TOTAL_RAM_KB:-0}"

# Count claude processes and sum their RSS in a single pass, using the same
# filter for both numbers (the desktop app and its helpers are excluded).
# awk always exits 0, so "no claude process running" cannot abort the script
# the way a non-matching grep can under `set -e`/`pipefail`.
claude_stats=$(/bin/ps -eo rss,comm | awk '
  { lowered = tolower($0) }
  lowered ~ /claude/ && $0 !~ /Claude\.app/ && $0 !~ /Claude Helper/ { n++; sum += $1 }
  END { printf "%d %d", n, sum }
') || claude_stats="0 0"
claude_count="${claude_stats%% *}"
claude_rss_kb="${claude_stats##* }"

# Avoid a division by zero when total RAM could not be determined; in that
# case only the process-count gate applies.
claude_pct=0
if [ "$TOTAL_RAM_KB" -gt 0 ]; then
  claude_pct=$(( (claude_rss_kb * 100) / TOTAL_RAM_KB ))
fi

if [ "$claude_count" -ge "$MAX_CLAUDE_PROCS" ]; then
  echo "[$TIMESTAMP] BLOCKED: Too many claude processes ($claude_count >= $MAX_CLAUDE_PROCS)" >&2
  exit 1
fi

if [ "$claude_pct" -ge "$SPAWN_BLOCK_PERCENT" ]; then
  echo "[$TIMESTAMP] BLOCKED: Claude memory usage ${claude_pct}% >= ${SPAWN_BLOCK_PERCENT}% threshold" >&2
  exit 1
fi
# -----------------------------------------------------------------------------
|
|
50
|
+
|
|
31
51
|
# Create session directory
|
|
32
52
|
mkdir -p "$SESSION_DIR" "$(dirname "$LOG_FILE")"
|
|
33
53
|
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
|
3
|
+
<plist version="1.0">
|
|
4
|
+
<dict>
|
|
5
|
+
<key>Label</key>
|
|
6
|
+
<string>ai.maestro.memory-watchdog</string>
|
|
7
|
+
|
|
8
|
+
<key>ProgramArguments</key>
|
|
9
|
+
<array>
|
|
10
|
+
<string>/bin/bash</string>
|
|
11
|
+
<string>/Users/sophie/maestro/scripts/watchdog/memory-watchdog.sh</string>
|
|
12
|
+
</array>
|
|
13
|
+
|
|
14
|
+
<key>StartInterval</key>
|
|
15
|
+
<integer>30</integer>
|
|
16
|
+
|
|
17
|
+
<key>RunAtLoad</key>
|
|
18
|
+
<true/>
|
|
19
|
+
|
|
20
|
+
<key>StandardOutPath</key>
|
|
21
|
+
<string>/Users/sophie/maestro/logs/watchdog/launchd-stdout.log</string>
|
|
22
|
+
|
|
23
|
+
<key>StandardErrorPath</key>
|
|
24
|
+
<string>/Users/sophie/maestro/logs/watchdog/launchd-stderr.log</string>
|
|
25
|
+
|
|
26
|
+
<key>EnvironmentVariables</key>
|
|
27
|
+
<dict>
|
|
28
|
+
<key>PATH</key>
|
|
29
|
+
<string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
|
|
30
|
+
</dict>
|
|
31
|
+
|
|
32
|
+
<key>Nice</key>
|
|
33
|
+
<integer>10</integer>
|
|
34
|
+
|
|
35
|
+
<key>ProcessType</key>
|
|
36
|
+
<string>Background</string>
|
|
37
|
+
|
|
38
|
+
<key>ThrottleInterval</key>
|
|
39
|
+
<integer>10</integer>
|
|
40
|
+
</dict>
|
|
41
|
+
</plist>
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# =============================================================================
|
|
3
|
+
# force-reboot.sh — Remote/Automated Force Reboot for Frozen Mac Mini
|
|
4
|
+
# =============================================================================
|
|
5
|
+
#
|
|
6
|
+
# This script provides multiple escalation levels for rebooting a frozen
|
|
7
|
+
# or unresponsive Mac mini running the Maestro agent system.
|
|
8
|
+
#
|
|
9
|
+
# Usage:
|
|
10
|
+
# ./scripts/watchdog/force-reboot.sh --graceful # Try graceful shutdown first
|
|
11
|
+
# ./scripts/watchdog/force-reboot.sh --force # Immediate reboot (requires sudo)
|
|
12
|
+
# ./scripts/watchdog/force-reboot.sh --status # Check heartbeat and uptime
|
|
13
|
+
#
|
|
14
|
+
# Remote usage (from another machine via SSH):
|
|
15
|
+
# ssh sophie@mac-mini.local "~/maestro/scripts/watchdog/force-reboot.sh --graceful"
|
|
16
|
+
#
|
|
17
|
+
# =============================================================================
|
|
18
|
+
#
|
|
19
|
+
# FORCE-REBOOT OPTIONS WHEN THE MACHINE IS COMPLETELY FROZEN:
|
|
20
|
+
#
|
|
21
|
+
# If SSH is unresponsive and the GUI is frozen, the only options are:
|
|
22
|
+
#
|
|
23
|
+
# 1. SMART PLUG (recommended for headless servers):
|
|
24
|
+
# - Install a WiFi smart plug (TP-Link Kasa, Shelly, etc.) on the Mac mini power
|
|
25
|
+
# - Power cycle via the smart plug's app or API
|
|
26
|
+
# - Combined with `pmset autorestart 1` (already configured), the Mac mini
|
|
27
|
+
# will boot automatically when power is restored
|
|
28
|
+
# - Example (TP-Link Kasa):
|
|
29
|
+
# kasa --host <plug-ip> --type plug off && sleep 5 && kasa --host <plug-ip> --type plug on
|
|
30
|
+
# - Example (Shelly):
|
|
31
|
+
# curl -s "http://<shelly-ip>/relay/0?turn=off" && sleep 5 && curl -s "http://<shelly-ip>/relay/0?turn=on"
|
|
32
|
+
#
|
|
33
|
+
# 2. PARSEC (if display is frozen but network is up):
|
|
34
|
+
# - Parsec may still be responsive even if the GUI is frozen
|
|
35
|
+
# - Connect via Parsec and use Cmd+Ctrl+Power to force restart
|
|
36
|
+
#
|
|
37
|
+
# 3. APPLE REMOTE MANAGEMENT (ARD):
|
|
38
|
+
# - If enabled, use "Send UNIX Command" from another Mac:
|
|
39
|
+
# sudo shutdown -r now
|
|
40
|
+
#
|
|
41
|
+
# 4. PHYSICAL POWER CYCLE:
|
|
42
|
+
# - Unplug and replug the Mac mini power cable
|
|
43
|
+
# - With `pmset autorestart 1`, it will boot automatically
|
|
44
|
+
#
|
|
45
|
+
# 5. SSH REBOOT (partial freeze, SSH still works):
|
|
46
|
+
# - ssh sophie@mac-mini.local "sudo reboot"
|
|
47
|
+
# - Or: ssh sophie@mac-mini.local "sudo shutdown -r now"
|
|
48
|
+
#
|
|
49
|
+
# =============================================================================
|
|
50
|
+
|
|
51
|
+
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Absolute directory of this script, and the maestro root two levels above it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
MAESTRO_DIR=$(cd "${SCRIPT_DIR}/../.." && pwd)

# Heartbeat file read by --status, and the per-day reboot log.
HEARTBEAT_FILE="${MAESTRO_DIR}/state/heartbeat"
LOG_FILE="${MAESTRO_DIR}/logs/watchdog/$(date +%Y-%m-%d)-reboot.log"
|
|
58
|
+
|
|
59
|
+
# log MESSAGE — print MESSAGE prefixed with a UTC timestamp and append it to
# LOG_FILE.
#
# The log directory is created on demand, and a failure to write the log is
# deliberately non-fatal: under `set -euo pipefail` a failing `tee` would
# otherwise abort the script before the reboot is attempted.
log() {
  local ts log_dir
  ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  log_dir=$(dirname "$LOG_FILE")
  mkdir -p "$log_dir" 2>/dev/null || true
  # tee still writes to stdout when the file cannot be opened.
  printf '[%s] %s\n' "$ts" "$1" | tee -a "$LOG_FILE" 2>/dev/null || true
}
|
|
64
|
+
|
|
65
|
+
# check_heartbeat — report the age of the heartbeat and the system uptime.
#
# Globals read: HEARTBEAT_FILE (expected to hold a UTC timestamp formatted as
#               %Y-%m-%dT%H:%M:%SZ)
# Outputs:      status lines on stdout
# Returns:      0 if the heartbeat is at most 120s old,
#               1 if the file is missing or the heartbeat is stale.
check_heartbeat() {
  if [ ! -f "$HEARTBEAT_FILE" ]; then
    echo "No heartbeat file found — watchdog may not be running"
    return 1
  fi

  local hb_time hb_epoch hb_age
  hb_time=$(cat "$HEARTBEAT_FILE")

  # The trailing "Z" means UTC, so parse with -u; without it the timestamp is
  # read as local time and the age is wrong by the machine's UTC offset.
  # BSD date (macOS) is tried first, then GNU date; 0 if neither can parse it.
  hb_epoch=$(date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$hb_time" +%s 2>/dev/null \
    || date -u -d "$hb_time" +%s 2>/dev/null \
    || echo 0)
  hb_age=$(( $(date +%s) - hb_epoch ))

  echo "Last heartbeat: $hb_time (${hb_age}s ago)"
  echo "System uptime: $(uptime)"

  if [ "$hb_age" -gt 120 ]; then
    echo "WARNING: Heartbeat is stale (>2 min) — watchdog may be stuck"
    return 1
  fi
  return 0
}
|
|
84
|
+
|
|
85
|
+
# graceful_reboot — stop new work, let sessions wind down, save state, reboot.
#
# Globals read: MAESTRO_DIR
# Side effects: writes $MAESTRO_DIR/.emergency-stop, sends SIGTERM to processes
#               matching "claude", attempts a git commit/push of state/ and
#               logs/, then reboots the machine.
# Exits 1 if neither `sudo shutdown` nor the osascript restart succeeds.
#
# NOTE(review): nothing in this function removes .emergency-stop again, and
# boot-claude-session.sh refuses to start while $MAESTRO_DIR/.emergency-stop
# exists — confirm something clears the flag after the reboot.
graceful_reboot() {
  log "Initiating graceful reboot..."

  # 1. Trigger emergency stop to prevent new work
  echo "Memory watchdog initiated graceful reboot" > "$MAESTRO_DIR/.emergency-stop"
  log "Emergency stop flag set"

  # 2. Give running sessions 30 seconds to wrap up
  log "Waiting 30s for running sessions to complete..."
  sleep 30

  # 3. Kill remaining claude processes gracefully
  # -f matches against the full command line, so any process whose arguments
  # contain "claude" is signalled, not only the CLI itself.
  log "Sending SIGTERM to remaining claude processes..."
  pkill -TERM -f "claude" 2>/dev/null || true
  sleep 5

  # 4. Commit any uncommitted state
  # Runs in a subshell so the `cd` does not leak; every git step is
  # best-effort and must not prevent the reboot.
  log "Attempting to commit state..."
  (
    cd "$MAESTRO_DIR"
    git add -A state/ logs/ 2>/dev/null || true
    git commit -m "chore: pre-reboot state save (watchdog-initiated)" 2>/dev/null || true
    git push origin main 2>/dev/null || true
  ) || true

  # 5. Reboot
  # Escalation: sudo shutdown first, then a System Events restart request.
  log "Rebooting now..."
  sudo shutdown -r now 2>/dev/null || {
    log "sudo shutdown failed — trying osascript..."
    osascript -e 'tell application "System Events" to restart' 2>/dev/null || {
      log "All reboot methods failed — manual intervention required"
      exit 1
    }
  }
}
|
|
120
|
+
|
|
121
|
+
# force_reboot — reboot immediately without waiting for sessions or saving
# state. Sets the emergency-stop flag (best effort), then tries `sudo reboot`
# followed by `sudo shutdown -r now`. If both fail, prints the manual options
# and exits the script with status 1.
force_reboot() {
  log "FORCE REBOOT initiated"
  echo "Force reboot requested" > "$MAESTRO_DIR/.emergency-stop" 2>/dev/null || true

  if sudo reboot 2>/dev/null; then
    return 0
  fi
  if sudo shutdown -r now 2>/dev/null; then
    return 0
  fi

  log "sudo reboot failed — requires password or system is frozen"
  printf '%s\n' \
    "ERROR: Cannot force reboot without sudo. Options:" \
    " 1. Run: sudo reboot" \
    " 2. Power cycle via smart plug" \
    " 3. Physical power cycle"
  exit 1
}
|
|
133
|
+
|
|
134
|
+
# Entry point: select the action from the first argument (default: --status).
requested_mode="${1:---status}"
if [[ "$requested_mode" == "--status" ]]; then
  check_heartbeat
elif [[ "$requested_mode" == "--graceful" ]]; then
  graceful_reboot
elif [[ "$requested_mode" == "--force" ]]; then
  force_reboot
else
  echo "Usage: $0 [--status|--graceful|--force]"
  exit 1
fi
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# =============================================================================
|
|
3
|
+
# memory-watchdog.sh — System Resource Watchdog for Maestro Agent Deployment
|
|
4
|
+
# =============================================================================
|
|
5
|
+
#
|
|
6
|
+
# Monitors memory usage and process count for Claude Code subprocesses.
|
|
7
|
+
# Kills runaway subagents when thresholds are exceeded.
|
|
8
|
+
# Triggers emergency stop at critical levels.
|
|
9
|
+
#
|
|
10
|
+
# Runs every 30 seconds via launchd (ai.maestro.memory-watchdog.plist).
|
|
11
|
+
#
|
|
12
|
+
# Thresholds (configurable via environment or defaults below):
|
|
13
|
+
# WARNING: >60% RAM used by claude processes → log warning
|
|
14
|
+
# CRITICAL: >75% RAM used by claude processes → kill oldest subagents
|
|
15
|
+
# EMERGENCY: >85% total system memory pressure → emergency stop
|
|
16
|
+
#
|
|
17
|
+
# Usage:
|
|
18
|
+
# ./scripts/watchdog/memory-watchdog.sh # Normal run
|
|
19
|
+
# ./scripts/watchdog/memory-watchdog.sh --check # Report status only
|
|
20
|
+
# ./scripts/watchdog/memory-watchdog.sh --dry-run # Show what would be killed
|
|
21
|
+
#
|
|
22
|
+
# =============================================================================
|
|
23
|
+
|
|
24
|
+
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
MAESTRO_DIR=$(cd "${SCRIPT_DIR}/../.." && pwd)

# Agent directory (sophie-ai, wundr, etc.) may be supplied by the environment;
# it defaults to empty. The watchdog runs from maestro but protects the whole
# machine.
AGENT_DIR="${AGENT_DIR:-}"

# --- Ensure system paths are available ----------------------------------------
# Put the system directories first: the PATH handed over by launchd may not
# include /usr/sbin, which is where sysctl lives.
export PATH="/usr/sbin:/usr/bin:/bin:/usr/local/bin:/opt/homebrew/bin:${PATH}"

# --- Configuration -----------------------------------------------------------

# Physical RAM, converted from bytes (hw.memsize) to KB.
TOTAL_RAM_KB=$(sysctl -n hw.memsize | awk '{ printf "%.0f", $1 / 1024 }')

# Claude-process RSS thresholds, as a percentage of total RAM.
CLAUDE_WARN_PERCENT="${WATCHDOG_WARN_PERCENT:-60}"
CLAUDE_CRITICAL_PERCENT="${WATCHDOG_CRITICAL_PERCENT:-75}"

# System-wide memory pressure threshold (percentage).
SYSTEM_EMERGENCY_PERCENT="${WATCHDOG_EMERGENCY_PERCENT:-85}"

# Upper bound on concurrent claude CLI processes (the main interactive session
# is not counted).
MAX_CLAUDE_PROCS="${WATCHDOG_MAX_CLAUDE_PROCS:-8}"

# A claude process younger than this many seconds is never killed, so freshly
# spawned sessions are not terminated immediately.
MIN_AGE_SECONDS="${WATCHDOG_MIN_AGE:-60}"

# Per-day JSON-lines log.
LOG_DIR="${MAESTRO_DIR}/logs/watchdog"
LOG_FILE="${LOG_DIR}/$(date +%Y-%m-%d)-watchdog.jsonl"

# State carried from one run to the next.
STATE_FILE="${MAESTRO_DIR}/state/watchdog-state.yaml"

# --- Helpers -----------------------------------------------------------------

mkdir -p "${LOG_DIR}" "$(dirname "${STATE_FILE}")"
|
|
66
|
+
|
|
67
|
+
#######################################
# Append one JSON line describing an event to the watchdog log.
# Globals:   LOG_FILE (read; the line is appended to it)
# Arguments: $1 - level (e.g. info, warn, critical)
#            $2 - event name
#            $3 - free-form detail text (optional)
# Outputs:   nothing on stdout; one JSONL record appended to LOG_FILE
#######################################
log_event() {
  local level="$1" event="$2" detail="${3:-}"
  local ts detail_json=""
  ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  # Preferred path: let python3 produce a correctly escaped JSON string.
  if ! detail_json=$(printf '%s\n' "$detail" \
      | python3 -c 'import sys,json; print(json.dumps(sys.stdin.read().strip()))' 2>/dev/null) \
      || [ -z "$detail_json" ]; then
    # Fallback when python3 is missing or fails: escape in pure bash so the
    # detail is still recorded instead of being replaced by an empty string.
    # Quoted-variable patterns keep this working on old bash releases.
    local bs='\' dq='"' nl=$'\n' cr=$'\r' tab=$'\t'
    local escaped="$detail"
    escaped=${escaped//"$bs"/$bs$bs}
    escaped=${escaped//"$dq"/$bs$dq}
    escaped=${escaped//"$nl"/${bs}n}
    escaped=${escaped//"$cr"/${bs}r}
    escaped=${escaped//"$tab"/${bs}t}
    detail_json="\"${escaped}\""
  fi

  printf '{"ts":"%s","level":"%s","event":"%s","detail":%s}\n' \
    "$ts" "$level" "$event" "$detail_json" \
    >> "$LOG_FILE"
}
|
|
75
|
+
|
|
76
|
+
# --- Memory Metrics ----------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
#######################################
# Print the system-wide "memory used" percentage (0-100) as an integer.
#
# Primary source is the "System-wide memory free percentage" line printed by
# the macOS memory_pressure tool. If that is unavailable the value is
# estimated from vm_stat, treating free + inactive + speculative pages as
# available (counting only "Pages free" grossly overstates pressure).
#
# If neither source yields a usable number, 0 is printed and a warning goes
# to stderr: a measurement failure must not look like 100% pressure, because
# the caller reacts to high values by triggering an emergency stop.
#
# Globals:   TOTAL_RAM_KB (read; only used by the vm_stat fallback)
# Outputs:   integer percentage on stdout
#######################################
get_system_memory_pressure() {
  local free_pct=""

  # "|| true": a failing pipeline still leaves whatever was captured.
  free_pct=$(memory_pressure 2>/dev/null \
    | awk '/System-wide memory free percentage:/ { gsub(/%/, "", $NF); print $NF }') || true
  case "$free_pct" in
    ''|*[!0-9]*) free_pct="" ;;
  esac

  if [ -z "$free_pct" ]; then
    # Fallback: calculate from vm_stat
    local vm_out page_size avail_pages
    vm_out=$(vm_stat 2>/dev/null) || vm_out=""
    # First line looks like: "... (page size of 16384 bytes)"
    page_size=$(printf '%s\n' "$vm_out" \
      | awk 'NR == 1 { if (match($0, /[0-9]+/)) print substr($0, RSTART, RLENGTH) }')
    avail_pages=$(printf '%s\n' "$vm_out" \
      | awk '/^Pages (free|inactive|speculative):/ { gsub(/\./, "", $NF); sum += $NF }
             END { printf "%d", sum }')
    case "$page_size" in
      ''|*[!0-9]*) page_size=0 ;;
    esac
    case "$avail_pages" in
      ''|*[!0-9]*) avail_pages=0 ;;
    esac

    if [ "$page_size" -gt 0 ] && [ "${TOTAL_RAM_KB:-0}" -gt 0 ] 2>/dev/null; then
      local free_kb=$(( (avail_pages * page_size) / 1024 ))
      free_pct=$(( (free_kb * 100) / TOTAL_RAM_KB ))
    fi
  fi

  if [ -z "$free_pct" ]; then
    echo "memory-watchdog: unable to determine memory pressure; reporting 0" >&2
    echo 0
    return 0
  fi

  # Clamp so the result always stays within 0..100.
  if [ "$free_pct" -gt 100 ]; then
    free_pct=100
  fi
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  echo $(( 100 - 10#$free_pct ))
}
|
|
93
|
+
|
|
94
|
+
#######################################
# List claude CLI processes, largest resident set first.
#
# Output: one line per process: "PID RSS_KB ELAPSED_SECONDS COMMAND",
#         sorted by RSS descending.
#
# A ps line is kept when it contains "claude" (case-insensitive) and does
# not match Claude.app, "Claude Helper", grep, or watchdog (the latter two
# exclude this script and its own helpers).
#
# All filtering happens in one awk program: a chain of greps exits non-zero
# when nothing matches, which under `set -o pipefail` made this function
# fail (and callers abort) whenever no claude process was running.
#######################################
get_claude_processes() {
  /bin/ps -eo pid,rss,etime,comm \
    | awk '
        tolower($0) !~ /claude/ { next }
        /Claude.app/ || /Claude Helper/ || /grep/ || /watchdog/ { next }
        {
          # Parse etime (formats: SS, MM:SS, HH:MM:SS, D-HH:MM:SS)
          etime = $3
          days = 0
          secs = 0
          dash = index(etime, "-")
          if (dash > 0) {
            days = substr(etime, 1, dash - 1)
            etime = substr(etime, dash + 1)
          }
          n = split(etime, parts, ":")
          if (n == 1)      { secs = parts[1] }
          else if (n == 2) { secs = parts[1] * 60 + parts[2] }
          else if (n == 3) { secs = parts[1] * 3600 + parts[2] * 60 + parts[3] }
          secs += days * 86400

          # Keep the whole command even when its path contains spaces.
          cmd = $4
          for (i = 5; i <= NF; i++) { cmd = cmd " " $i }

          printf "%s %s %d %s\n", $1, $2, secs, cmd
        }' \
    | sort -k2,2 -rn
}
|
|
125
|
+
|
|
126
|
+
# Print the combined RSS (KB, column 2) of all rows reported by
# get_claude_processes; prints 0 when there are no rows.
get_total_claude_rss_kb() {
  get_claude_processes \
    | awk 'BEGIN { total = 0 } { total += $2 } END { print total }'
}
|
|
129
|
+
|
|
130
|
+
# Print how many rows get_claude_processes reports (0 when there are none).
count_claude_processes() {
  get_claude_processes | awk 'END { print NR }'
}
|
|
133
|
+
|
|
134
|
+
# --- Identification ----------------------------------------------------------
|
|
135
|
+
|
|
136
|
+
#######################################
# Print the PID of the process treated as the main interactive session.
#
# The main interactive claude session is typically:
#   1. The one connected to a TTY
#   2. The longest-running one
#   3. The parent of subagent processes
# The longest-running process is used as the heuristic, i.e. the row of
# get_claude_processes with the largest elapsed-seconds value (column 3).
# That list is sorted by RSS, so taking its last row would select the
# smallest process rather than the oldest one.
#
# Outputs: the PID on stdout, or nothing when no process is listed.
# Returns: 0 even when the process listing is empty or fails.
#######################################
identify_main_session() {
  { get_claude_processes || true; } \
    | awk '
        $1 != "" && (best == "" || $3 + 0 > max) { max = $3 + 0; best = $1 }
        END { if (best != "") print best }'
}
|
|
144
|
+
|
|
145
|
+
# --- Actions -----------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
#######################################
# Terminate up to N claude subagent processes, largest RSS first.
#
# Walks the rows of get_claude_processes (already sorted by RSS descending),
# skipping the main session and any process younger than MIN_AGE_SECONDS.
# Each selected process gets SIGTERM, followed by SIGKILL from a background
# subshell if it is still alive after 5 seconds.
#
# Globals:   MIN_AGE_SECONDS (read), LOG_FILE (appended to by the
#            background force-kill subshell)
# Arguments: $1 - maximum number of processes to select
#            $2 - "true" for dry run (report only); defaults to "false"
# Outputs:   progress lines on stdout, then the number of processes
#            actually signalled (always 0 in dry-run mode)
#######################################
kill_oldest_subagents() {
  local target_count="$1"
  local dry_run="${2:-false}"
  local main_pid=""
  local killed=0    # processes actually signalled
  local selected=0  # processes chosen so far (real or dry run)
  local pid rss age comm rss_mb

  # A failed lookup must not abort the caller; an empty main_pid simply
  # means no row is exempted.
  main_pid=$(identify_main_session) || true

  # Get processes sorted by RSS descending (kill biggest first)
  # Skip the main session
  while IFS=' ' read -r pid rss age comm; do
    # An empty process list still feeds one blank line into the loop;
    # ignore it and any row whose numeric fields are malformed.
    case "$pid" in ''|*[!0-9]*) continue ;; esac
    case "$rss" in ''|*[!0-9]*) continue ;; esac
    case "$age" in ''|*[!0-9]*) continue ;; esac

    if [ "$pid" = "$main_pid" ]; then
      continue
    fi
    if [ "$age" -lt "$MIN_AGE_SECONDS" ]; then
      log_event "info" "skip_young" "PID $pid age ${age}s < ${MIN_AGE_SECONDS}s minimum"
      continue
    fi
    # Count dry-run selections too, so --dry-run lists exactly the
    # processes a real run would target.
    if [ "$selected" -ge "$target_count" ]; then
      break
    fi
    selected=$((selected + 1))

    rss_mb=$(( rss / 1024 ))
    if [ "$dry_run" = "true" ]; then
      echo " [DRY RUN] Would kill PID $pid (${rss_mb}MB, age ${age}s)"
      log_event "info" "dry_run_kill" "PID=$pid RSS=${rss_mb}MB age=${age}s"
    else
      echo " Killing PID $pid (${rss_mb}MB, age ${age}s)"
      kill -TERM "$pid" 2>/dev/null || true
      log_event "warn" "killed_subagent" "PID=$pid RSS=${rss_mb}MB age=${age}s signal=TERM"
      killed=$((killed + 1))

      # Give it 5 seconds to exit gracefully, then SIGKILL.
      # stdout/stderr are detached so a caller capturing this function's
      # output is not kept waiting on the background subshell.
      (
        sleep 5
        if kill -0 "$pid" 2>/dev/null; then
          kill -9 "$pid" 2>/dev/null || true
          # Log to file directly since this is a subshell
          printf '{"ts":"%s","level":"warn","event":"force_killed","detail":"PID=%s did not exit after SIGTERM"}\n' \
            "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$pid" >> "$LOG_FILE"
        fi
      ) >/dev/null 2>&1 &
    fi
  done <<< "$(get_claude_processes)"

  echo "$killed"
}
|
|
193
|
+
|
|
194
|
+
#######################################
# Enter emergency-stop mode: drop marker files and terminate subagents.
#
# Writes the reason to $MAESTRO_DIR/.emergency-stop and to .emergency-stop
# inside $HOME/sophie-ai and $HOME/wundr when those directories exist, then
# sends SIGTERM to every claude process except the main session.
# If the maestro marker already exists, only a log entry is written and
# nothing else happens.
#
# Globals:   MAESTRO_DIR, HOME (read)
# Arguments: $1 - human-readable reason, stored in the marker files
# Outputs:   "EMERGENCY STOP: <reason>" on stdout when newly triggered
# Returns:   0
#######################################
trigger_emergency_stop() {
  local reason="$1"
  local stop_file="$MAESTRO_DIR/.emergency-stop"
  local agent_dir main_pid pid rss age comm

  if [ -f "$stop_file" ]; then
    log_event "warn" "emergency_already_active" "$reason"
    return 0
  fi

  echo "$reason" > "$stop_file"
  log_event "critical" "emergency_stop_triggered" "$reason"

  # Also create in any detected agent directories (best effort: an
  # unwritable agent directory must not stop the emergency procedure).
  for agent_dir in "$HOME"/sophie-ai "$HOME"/wundr; do
    if [ -d "$agent_dir" ]; then
      { echo "$reason" > "$agent_dir/.emergency-stop"; } 2>/dev/null || true
    fi
  done

  # Kill ALL non-main claude subprocesses.
  # A failed main-session lookup must not abort the stop under `set -e`;
  # with an empty main_pid no process is exempted.
  main_pid=$(identify_main_session) || main_pid=""
  while IFS=' ' read -r pid rss age comm; do
    # Skip the blank row produced by an empty list and malformed rows.
    case "$pid" in ''|*[!0-9]*) continue ;; esac
    if [ "$pid" != "$main_pid" ]; then
      kill -TERM "$pid" 2>/dev/null || true
    fi
  done <<< "$(get_claude_processes)"

  echo "EMERGENCY STOP: $reason"
}
|
|
224
|
+
|
|
225
|
+
# --- Main Logic --------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
#######################################
# One watchdog pass: collect metrics, then act on the first level that trips.
#
#   --check    print a status report and return
#   Level 1    system memory pressure >= SYSTEM_EMERGENCY_PERCENT
#              → emergency stop
#   Level 2    claude RSS >= CLAUDE_CRITICAL_PERCENT of RAM
#              → kill enough subagents to get back under the warning level
#   Level 3    more than MAX_CLAUDE_PROCS claude processes → kill the excess
#   Level 4    claude RSS >= CLAUDE_WARN_PERCENT → log only
#   otherwise  log "healthy" on every 10th all-clear run
#
# Globals:   TOTAL_RAM_KB, CLAUDE_WARN_PERCENT, CLAUDE_CRITICAL_PERCENT,
#            SYSTEM_EMERGENCY_PERCENT, MAX_CLAUDE_PROCS, MAESTRO_DIR (read)
# Arguments: $1 - "run" (default), "--check", or "--dry-run"
# Outputs:   status / action messages on stdout
# Returns:   0, or 1 when TOTAL_RAM_KB is not a positive integer
#######################################
run_watchdog() {
  local mode="${1:-run}"
  local dry_run="false"
  if [ "$mode" = "--dry-run" ] || [ "$mode" = "--check" ]; then
    dry_run="true"
  fi

  # Every percentage below divides by TOTAL_RAM_KB; refuse to continue
  # rather than die on a division by zero.
  if ! [ "${TOTAL_RAM_KB:-0}" -gt 0 ] 2>/dev/null; then
    log_event "error" "invalid_total_ram" "TOTAL_RAM_KB=${TOTAL_RAM_KB:-}"
    echo "ERROR: cannot determine total RAM (TOTAL_RAM_KB=${TOTAL_RAM_KB:-})" >&2
    return 1
  fi

  # Collect metrics. "|| true" keeps whatever the helper printed while
  # stopping a non-zero status (e.g. an empty process list under pipefail)
  # from aborting the whole watchdog under `set -e`.
  local claude_rss_kb claude_count system_pressure
  claude_rss_kb=$(get_total_claude_rss_kb) || true
  claude_count=$(count_claude_processes) || true
  system_pressure=$(get_system_memory_pressure) || true

  # Treat missing or non-numeric readings as 0 so the comparisons below
  # never operate on garbage.
  case "$claude_rss_kb" in ''|*[!0-9]*) claude_rss_kb=0 ;; esac
  case "$claude_count" in ''|*[!0-9]*) claude_count=0 ;; esac
  case "$system_pressure" in ''|*[!0-9]*) system_pressure=0 ;; esac

  local claude_rss_mb=$(( claude_rss_kb / 1024 ))
  local claude_pct=$(( (claude_rss_kb * 100) / TOTAL_RAM_KB ))
  local warn_threshold_mb=$(( (TOTAL_RAM_KB * CLAUDE_WARN_PERCENT) / 100 / 1024 ))
  local critical_threshold_mb=$(( (TOTAL_RAM_KB * CLAUDE_CRITICAL_PERCENT) / 100 / 1024 ))

  # Report
  if [ "$mode" = "--check" ]; then
    echo "=== Memory Watchdog Status ==="
    echo "Total RAM: $(( TOTAL_RAM_KB / 1024 ))MB"
    echo "Claude processes: $claude_count (max: $MAX_CLAUDE_PROCS)"
    echo "Claude RSS total: ${claude_rss_mb}MB (${claude_pct}% of RAM)"
    echo " Warning threshold: ${warn_threshold_mb}MB (${CLAUDE_WARN_PERCENT}%)"
    echo " Critical threshold: ${critical_threshold_mb}MB (${CLAUDE_CRITICAL_PERCENT}%)"
    echo "System mem pressure: ${system_pressure}% (emergency at ${SYSTEM_EMERGENCY_PERCENT}%)"
    echo ""
    echo "Claude processes:"
    # The loop runs in a pipeline subshell, so its variables stay local.
    { get_claude_processes || true; } | while IFS=' ' read -r pid rss age comm; do
      [ -n "$pid" ] || continue
      echo " PID $pid: $(( rss / 1024 ))MB, age ${age}s — $comm"
    done
    echo "=============================="
    return 0
  fi

  # --- Level 1: System emergency (memory pressure) ---------------------------
  if [ "$system_pressure" -ge "$SYSTEM_EMERGENCY_PERCENT" ]; then
    log_event "critical" "system_memory_emergency" "pressure=${system_pressure}% >= ${SYSTEM_EMERGENCY_PERCENT}%"
    if [ "$dry_run" = "false" ]; then
      trigger_emergency_stop "Memory pressure ${system_pressure}% exceeded ${SYSTEM_EMERGENCY_PERCENT}% emergency threshold"
    else
      echo "[DRY RUN] Would trigger EMERGENCY STOP (pressure=${system_pressure}%)"
    fi
    return 0
  fi

  # --- Level 2: Claude RSS critical — kill biggest subagents -----------------
  if [ "$claude_pct" -ge "$CLAUDE_CRITICAL_PERCENT" ]; then
    log_event "warn" "claude_memory_critical" "RSS=${claude_rss_mb}MB (${claude_pct}%) >= ${CLAUDE_CRITICAL_PERCENT}%"
    echo "CRITICAL: Claude processes using ${claude_rss_mb}MB (${claude_pct}% of RAM)"

    # Kill enough to get below warning threshold
    local target_rss_kb=$(( (TOTAL_RAM_KB * CLAUDE_WARN_PERCENT) / 100 ))
    local excess_kb=$(( claude_rss_kb - target_rss_kb ))
    # Estimate ~600MB per process
    local procs_to_kill=$(( (excess_kb / 614400) + 1 ))
    if [ "$procs_to_kill" -lt 1 ]; then
      procs_to_kill=1
    fi

    echo " Killing $procs_to_kill subagent(s) to free ~$(( excess_kb / 1024 ))MB..."
    kill_oldest_subagents "$procs_to_kill" "$dry_run"
    return 0
  fi

  # --- Level 3: Too many concurrent processes --------------------------------
  if [ "$claude_count" -gt "$MAX_CLAUDE_PROCS" ]; then
    local excess=$(( claude_count - MAX_CLAUDE_PROCS ))
    log_event "warn" "too_many_claude_procs" "count=${claude_count} > max=${MAX_CLAUDE_PROCS}"
    echo "WARNING: $claude_count claude processes running (max: $MAX_CLAUDE_PROCS)"
    echo " Killing $excess oldest subagent(s)..."
    kill_oldest_subagents "$excess" "$dry_run"
    return 0
  fi

  # --- Level 4: Claude RSS warning — log only --------------------------------
  if [ "$claude_pct" -ge "$CLAUDE_WARN_PERCENT" ]; then
    log_event "warn" "claude_memory_warning" "RSS=${claude_rss_mb}MB (${claude_pct}%) >= ${CLAUDE_WARN_PERCENT}%"
    return 0
  fi

  # --- All clear -------------------------------------------------------------
  # Only log periodically (every 5 minutes = every 10th run at 30s interval)
  local run_count_file="$MAESTRO_DIR/state/watchdog-run-count"
  local run_count=0
  if [ -f "$run_count_file" ]; then
    run_count=$(cat "$run_count_file" 2>/dev/null) || run_count=0
  fi
  # A truncated or corrupted counter file restarts the cycle instead of
  # breaking the arithmetic; 10# guards against octal parsing.
  case "$run_count" in ''|*[!0-9]*) run_count=0 ;; esac
  run_count=$(( (10#$run_count + 1) % 10 ))
  echo "$run_count" > "$run_count_file"

  if [ "$run_count" -eq 0 ]; then
    log_event "info" "healthy" "claude_procs=${claude_count} rss=${claude_rss_mb}MB (${claude_pct}%) pressure=${system_pressure}%"
  fi
}
|
|
318
|
+
|
|
319
|
+
# --- Write heartbeat ---------------------------------------------------------
|
|
320
|
+
|
|
321
|
+
# Record the current UTC time (ISO-8601, "Z" suffix) in
# $MAESTRO_DIR/state/heartbeat, replacing any previous content.
write_heartbeat() {
  local stamp
  stamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s\n' "$stamp" > "$MAESTRO_DIR/state/heartbeat"
}
|
|
325
|
+
|
|
326
|
+
# --- Entry point -------------------------------------------------------------
|
|
327
|
+
|
|
328
|
+
# Refresh the heartbeat file, then perform a single watchdog pass in the
# mode given as the first argument ("run" when none is supplied).
main() {
  write_heartbeat
  run_watchdog "${1:-run}"
}

main "$@"
|