@adaptic/maestro 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adaptic/maestro",
3
- "version": "1.1.4",
3
+ "version": "1.1.6",
4
4
  "description": "Maestro — Autonomous AI agent operating system. Deploy AI employees on dedicated Mac minis.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -0,0 +1,89 @@
1
#!/bin/bash
# =============================================================================
# boot-claude-session.sh — Launch interactive Claude Code session on boot
# =============================================================================
#
# Opens Terminal.app with a Claude Code interactive session in the agent's
# working directory. Called via launchd at login.
#
# This is separate from the sophie-daemon (which runs headless as a Node.js
# process for polling/dispatching). This script provides a visible, interactive
# Claude Code session that the operator can observe and interact with.
#
# Usage:
#   ./scripts/setup/boot-claude-session.sh [agent-dir]
#
# Exit status:
#   0  session opened, or an emergency-stop flag is present (nothing started)
#   1  no usable agent directory, or the claude CLI could not be found
#
# =============================================================================

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MAESTRO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Detect agent directory — prefer sophie-ai if it exists
AGENT_DIR="${1:-}"
if [ -z "$AGENT_DIR" ]; then
  if [ -d "$HOME/sophie-ai" ]; then
    AGENT_DIR="$HOME/sophie-ai"
  else
    echo "ERROR: No agent directory specified and ~/sophie-ai not found" >&2
    exit 1
  fi
fi

# Fail early instead of opening a Terminal window whose `cd` would fail.
if [ ! -d "$AGENT_DIR" ]; then
  echo "ERROR: Agent directory not found: $AGENT_DIR" >&2
  exit 1
fi

AGENT_NAME=$(basename "$AGENT_DIR")
LOG_FILE="$MAESTRO_DIR/logs/watchdog/$(date +%Y-%m-%d)-boot-session.log"
mkdir -p "$(dirname "$LOG_FILE")"

# Print a UTC-timestamped message to stdout and append it to $LOG_FILE.
log() {
  echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] $1" | tee -a "$LOG_FILE"
}

# Wait for the system to settle after login (network, services, etc.)
log "Waiting 15s for system to settle..."
sleep 15

# Check for emergency stop
if [ -f "$AGENT_DIR/.emergency-stop" ]; then
  log "Emergency stop active — not starting Claude session"
  exit 0
fi

if [ -f "$MAESTRO_DIR/.emergency-stop" ]; then
  log "Emergency stop active in maestro — not starting Claude session"
  exit 0
fi

# Locate the claude CLI: PATH first (single `command -v` lookup), then the
# common install locations.
CLAUDE_PATH="$(command -v claude 2>/dev/null || true)"
if [ -z "$CLAUDE_PATH" ]; then
  for p in "$HOME/.local/bin/claude" "/opt/homebrew/bin/claude" "/usr/local/bin/claude"; do
    if [ -x "$p" ]; then
      CLAUDE_PATH="$p"
      break
    fi
  done
fi

if [ -z "$CLAUDE_PATH" ]; then
  log "ERROR: claude CLI not found"
  exit 1
fi

log "Starting Claude Code session in $AGENT_DIR (claude: $CLAUDE_PATH)"

BOOT_DATE="$(date +%Y-%m-%d)"

# Open Terminal.app with Claude Code running in the agent directory.
# Values are passed as osascript arguments and shell-quoted with AppleScript's
# `quoted form of`, so paths containing spaces or quotes cannot break (or
# inject into) the command line typed into the new Terminal window.
osascript - "$AGENT_DIR" "$CLAUDE_PATH" "$AGENT_NAME" "$BOOT_DATE" <<'APPLESCRIPT'
on run argv
  set agentDir to item 1 of argv
  set claudePath to item 2 of argv
  set agentName to item 3 of argv
  set bootDate to item 4 of argv

  set shellCmd to "cd " & quoted form of agentDir & " && clear"
  set shellCmd to shellCmd & " && echo '╔══════════════════════════════════════════════════════════╗'"
  set shellCmd to shellCmd & " && echo " & quoted form of ("║ Sophie AI — Boot Session (" & bootDate & ") ║")
  set shellCmd to shellCmd & " && echo '╠══════════════════════════════════════════════════════════╣'"
  set shellCmd to shellCmd & " && echo " & quoted form of ("║ Agent dir: " & agentDir)
  set shellCmd to shellCmd & " && echo " & quoted form of ("║ Claude: " & claudePath)
  set shellCmd to shellCmd & " && echo '╚══════════════════════════════════════════════════════════╝'"
  set shellCmd to shellCmd & " && echo ''"
  set shellCmd to shellCmd & " && " & quoted form of claudePath

  tell application "Terminal"
    activate
    -- Open a new window with claude running in the agent directory
    do script shellCmd
    -- Set the window title
    set custom title of front window to agentName & " — Claude Code"
  end tell
end run
APPLESCRIPT

log "Terminal window opened with Claude Code session"
@@ -70,6 +70,32 @@ configure_auto_login() {
70
70
  defaults write com.apple.loginwindow LoginwindowLaunchesRelaunchApps -bool false
71
71
 
72
72
  ok "Auto-login enabled for $SUDO_USER"
73
+
74
+ # --- Disable all password gates that could block unattended operation -------
75
+ local real_home
76
+ real_home=$(eval echo "~$SUDO_USER")
77
+
78
+ # Disable "Require password after sleep or screen saver"
79
+ sudo -u "$SUDO_USER" defaults write com.apple.screensaver askForPassword -int 0
80
+ sudo -u "$SUDO_USER" defaults write com.apple.screensaver askForPasswordDelay -int 0
81
+ ok "Screen saver password prompt disabled"
82
+
83
+ # Disable screen lock via sysadminctl
84
+ sysadminctl -screenLock off 2>/dev/null || true
85
+ ok "Screen lock disabled"
86
+
87
+ # Disable idle logout (Privacy & Security > Log out when idle)
88
+ defaults write /Library/Preferences/.GlobalPreferences com.apple.autologout.AutoLogOutDelay -int 0
89
+ ok "Idle auto-logout disabled"
90
+
91
+ # Disable screen lock on wake for current user
92
+ sudo -u "$SUDO_USER" defaults -currentHost write com.apple.screensaver idleTime -int 0
93
+ ok "Screen saver idle timer set to never"
94
+
95
+ # Prevent display from sleeping and requiring login on wake
96
+ # (already handled by pmset displaysleep 0, but belt-and-suspenders)
97
+ defaults write /Library/Preferences/com.apple.loginwindow DisableScreenLock -bool true 2>/dev/null || true
98
+
73
99
  warn "NOTE: FileVault must be disabled for auto-login to work"
74
100
  warn "Check: fdesetup status"
75
101
 
@@ -296,14 +322,122 @@ configure_app_launches() {
296
322
  # The poller uses IMAP directly, not the browser
297
323
  log "Safari: not needed as Login Item (Gmail uses IMAP polling)"
298
324
 
299
- # Ensure required directories exist
300
- mkdir -p "$AGENT_DIR/logs/huddle"
301
- mkdir -p "$AGENT_DIR/state/huddle"
302
- ok "Log and state directories verified"
325
+ # Ensure required directories exist (as the real user, not root)
326
+ local real_user="${SUDO_USER:-$CURRENT_USER}"
327
+ local dirs=("$AGENT_DIR/logs/huddle" "$AGENT_DIR/state/huddle" "$AGENT_DIR/logs/watchdog")
328
+ for d in "${dirs[@]}"; do
329
+ mkdir -p "$d"
330
+ chown "$real_user:staff" "$d" 2>/dev/null || true
331
+ done
332
+ ok "Log and state directories verified (owned by $real_user)"
333
+
334
+ # --- Claude Code boot session (opens Terminal with interactive claude) ------
335
+ local BOOT_SCRIPT="$AGENT_DIR/scripts/setup/boot-claude-session.sh"
336
+ if [ -f "$BOOT_SCRIPT" ]; then
337
+ chmod +x "$BOOT_SCRIPT"
338
+
339
local PLIST_NAME="ai.maestro.boot-claude-session"
# Resolve the real user's home BEFORE building PLIST_PATH from it; otherwise
# the path expands with an empty/unset real_home and points at
# /Library/LaunchAgents instead of the user's own LaunchAgents directory.
local real_home
real_home=$(eval echo "~$real_user")
local PLIST_PATH="$real_home/Library/LaunchAgents/${PLIST_NAME}.plist"
343
+
344
+ # Detect the agent working directory (prefer sophie-ai)
345
+ local boot_agent_dir="$HOME/sophie-ai"
346
+ [ -d "$boot_agent_dir" ] || boot_agent_dir="$AGENT_DIR"
347
+
348
+ cat > "$PLIST_PATH" << PLIST_EOF
349
+ <?xml version="1.0" encoding="UTF-8"?>
350
+ <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
351
+ <plist version="1.0">
352
+ <dict>
353
+ <key>Label</key>
354
+ <string>${PLIST_NAME}</string>
355
+ <key>ProgramArguments</key>
356
+ <array>
357
+ <string>/bin/bash</string>
358
+ <string>${BOOT_SCRIPT}</string>
359
+ <string>${boot_agent_dir}</string>
360
+ </array>
361
+ <key>RunAtLoad</key>
362
+ <true/>
363
+ <key>StandardOutPath</key>
364
+ <string>${AGENT_DIR}/logs/watchdog/boot-session-stdout.log</string>
365
+ <key>StandardErrorPath</key>
366
+ <string>${AGENT_DIR}/logs/watchdog/boot-session-stderr.log</string>
367
+ <key>EnvironmentVariables</key>
368
+ <dict>
369
+ <key>PATH</key>
370
+ <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:${real_home}/.local/bin</string>
371
+ <key>HOME</key>
372
+ <string>${real_home}</string>
373
+ </dict>
374
+ </dict>
375
+ </plist>
376
+ PLIST_EOF
377
+
378
+ chown "$real_user:staff" "$PLIST_PATH" 2>/dev/null || true
379
+
380
+ # Load as the real user
381
+ if [ "$(id -u)" -eq 0 ]; then
382
+ su "$real_user" -c "launchctl unload '$PLIST_PATH' 2>/dev/null; launchctl load '$PLIST_PATH'" 2>/dev/null || true
383
+ else
384
+ launchctl unload "$PLIST_PATH" 2>/dev/null || true
385
+ launchctl load "$PLIST_PATH" 2>/dev/null || true
386
+ fi
387
+
388
+ ok "Claude Code boot session agent installed (opens Terminal + claude on login)"
389
+ else
390
+ warn "Boot session script not found at $BOOT_SCRIPT"
391
+ fi
392
+ }
393
+
394
+ # =============================================================================
395
+ # 7. MEMORY WATCHDOG
396
+ # =============================================================================
397
+
398
# Install and load the memory watchdog launchd agent for the real (non-root)
# user.
#
# Globals read: AGENT_DIR, SUDO_USER, CURRENT_USER
# Helpers used: section, ok, fail, log (defined elsewhere in this script)
# Returns:      0 always; a missing script/template is reported via `fail`
#               and the rest of the setup is allowed to continue.
configure_memory_watchdog() {
  section "Memory Watchdog (OOM Protection)"

  local WATCHDOG_SCRIPT="$AGENT_DIR/scripts/watchdog/memory-watchdog.sh"
  local PLIST_SOURCE="$AGENT_DIR/scripts/watchdog/ai.maestro.memory-watchdog.plist"

  if [ ! -f "$WATCHDOG_SCRIPT" ]; then
    fail "Watchdog script not found at $WATCHDOG_SCRIPT"
    return 0
  fi

  # Without this check a missing template makes sed fail and leaves an empty
  # plist behind in LaunchAgents.
  if [ ! -f "$PLIST_SOURCE" ]; then
    fail "Watchdog plist template not found at $PLIST_SOURCE"
    return 0
  fi

  chmod +x "$WATCHDOG_SCRIPT"
  ok "Watchdog script executable"

  # Determine the real user's home for LaunchAgents
  local real_user="${SUDO_USER:-$CURRENT_USER}"
  local real_home
  real_home=$(eval echo "~$real_user")
  local launch_agents_dir="$real_home/Library/LaunchAgents"
  local PLIST_DEST="$launch_agents_dir/ai.maestro.memory-watchdog.plist"
  local log_dir="$AGENT_DIR/logs/watchdog"

  # Create the log directory (with correct ownership) BEFORE loading the
  # agent: the plist uses RunAtLoad and points its stdout/stderr there.
  mkdir -p "$log_dir"
  chown "$real_user:staff" "$log_dir" 2>/dev/null || true

  # The LaunchAgents directory may not exist yet for this user.
  if [ ! -d "$launch_agents_dir" ]; then
    mkdir -p "$launch_agents_dir"
    chown "$real_user:staff" "$launch_agents_dir" 2>/dev/null || true
  fi

  # Install the plist (update paths for this agent directory). Escape the
  # characters that are special in a sed replacement (\, & and the | delimiter).
  local escaped_agent_dir
  escaped_agent_dir=$(printf '%s' "$AGENT_DIR" | sed -e 's/[\\&|]/\\&/g')
  if ! sed "s|/Users/sophie/maestro|${escaped_agent_dir}|g" "$PLIST_SOURCE" > "$PLIST_DEST"; then
    fail "Could not write watchdog plist to $PLIST_DEST"
    return 0
  fi
  chown "$real_user:staff" "$PLIST_DEST" 2>/dev/null || true

  # Load it (as the real user); unload first so a re-run picks up changes.
  if [ "$(id -u)" -eq 0 ]; then
    su "$real_user" -c "launchctl unload '$PLIST_DEST' 2>/dev/null; launchctl load '$PLIST_DEST'" 2>/dev/null || true
  else
    launchctl unload "$PLIST_DEST" 2>/dev/null || true
    launchctl load "$PLIST_DEST" 2>/dev/null || true
  fi

  ok "Memory watchdog launchd agent installed (runs every 30s)"
  log "Thresholds: warn=60% critical=75% emergency=85% max_procs=8"
  log "Override via env: WATCHDOG_WARN_PERCENT, WATCHDOG_CRITICAL_PERCENT, etc."
}
304
438
 
305
439
  # =============================================================================
306
- # 7. ORCHESTRATOR AND SUBSYSTEMS ON BOOT
440
+ # 8. ORCHESTRATOR AND SUBSYSTEMS ON BOOT
307
441
  # =============================================================================
308
442
 
309
443
  configure_boot_services() {
@@ -364,7 +498,7 @@ configure_boot_services() {
364
498
  }
365
499
 
366
500
  # =============================================================================
367
- # 8. SYSTEM VERIFICATION
501
+ # 9. SYSTEM VERIFICATION
368
502
  # =============================================================================
369
503
 
370
504
  verify_all() {
@@ -440,6 +574,30 @@ verify_all() {
440
574
  issues=$((issues + 1))
441
575
  fi
442
576
 
577
+ # Check memory watchdog
578
+ if launchctl list 2>/dev/null | grep -q "ai.maestro.memory-watchdog"; then
579
+ ok "Memory watchdog: running"
580
+ elif [ -f "$HOME/Library/LaunchAgents/ai.maestro.memory-watchdog.plist" ]; then
581
+ warn "Memory watchdog: installed but not loaded"
582
+ else
583
+ fail "Memory watchdog: not installed"
584
+ issues=$((issues + 1))
585
+ fi
586
+
587
# Check heartbeat freshness
local heartbeat_file="$AGENT_DIR/state/heartbeat"
if [ -f "$heartbeat_file" ]; then
  local hb_epoch hb_age
  # The heartbeat is an ISO-8601 timestamp with a trailing "Z" (UTC), so it
  # must be parsed with -u; without it the age is off by the local UTC offset.
  hb_epoch=$(date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$(cat "$heartbeat_file")" +%s 2>/dev/null || true)
  case "$hb_epoch" in
    ''|*[!0-9]*)
      warn "Heartbeat: unreadable timestamp in $heartbeat_file"
      ;;
    *)
      hb_age=$(( $(date +%s) - hb_epoch ))
      if [ "$hb_age" -lt 120 ]; then
        ok "Heartbeat: fresh (${hb_age}s ago)"
      else
        warn "Heartbeat: stale (${hb_age}s ago — watchdog may not be running)"
      fi
      ;;
  esac
else
  warn "Heartbeat: no heartbeat file found"
fi
600
+
443
601
  # Summary
444
602
  echo ""
445
603
  if [ "$issues" -eq 0 ]; then
@@ -473,6 +631,7 @@ main() {
473
631
  configure_slack_cdp
474
632
  configure_audio
475
633
  configure_app_launches
634
+ configure_memory_watchdog
476
635
  configure_boot_services
477
636
  ;;
478
637
  --full|full|"")
@@ -482,6 +641,7 @@ main() {
482
641
  configure_slack_cdp
483
642
  configure_audio
484
643
  configure_app_launches
644
+ configure_memory_watchdog
485
645
  configure_boot_services
486
646
 
487
647
  section "Configuration Complete"
@@ -28,6 +28,26 @@ if [ ! -f "$TASK_FILE" ]; then
28
28
  exit 1
29
29
  fi
30
30
 
31
# --- Resource gate: refuse to spawn if machine is under pressure -------------
# Thresholds are shared with the memory watchdog via the WATCHDOG_* variables.
MAX_CLAUDE_PROCS="${WATCHDOG_MAX_CLAUDE_PROCS:-8}"
SPAWN_BLOCK_PERCENT="${WATCHDOG_CRITICAL_PERCENT:-75}"

# Total physical RAM in KB. /usr/sbin is prepended because sysctl lives there
# and may be missing from a launchd-provided PATH.
TOTAL_RAM_KB=$(PATH="/usr/sbin:$PATH" sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f", $1 / 1024}' || true)
case "$TOTAL_RAM_KB" in
  ''|*[!0-9]*) TOTAL_RAM_KB=0 ;;
esac

# One ps pass yields both the count and the summed RSS, using the SAME filter
# for both (the desktop app and its helpers are excluded). awk always exits 0,
# so "no claude process running" no longer looks like a pipeline failure.
gate_stats=$(/bin/ps -eo rss,comm 2>/dev/null | awk '
  tolower($0) ~ /claude/ && $0 !~ /Claude.app/ && $0 !~ /Claude Helper/ {
    count++
    rss_sum += $1
  }
  END { print count + 0, rss_sum + 0 }
' || true)
claude_count=${gate_stats%% *}
claude_rss_kb=${gate_stats##* }
case "$claude_count" in ''|*[!0-9]*) claude_count=0 ;; esac
case "$claude_rss_kb" in ''|*[!0-9]*) claude_rss_kb=0 ;; esac

# Guard the division: if RAM size could not be determined, only the
# process-count gate is enforced.
if [ "$TOTAL_RAM_KB" -gt 0 ]; then
  claude_pct=$(( (claude_rss_kb * 100) / TOTAL_RAM_KB ))
else
  claude_pct=0
fi

if [ "$claude_count" -ge "$MAX_CLAUDE_PROCS" ]; then
  echo "[$TIMESTAMP] BLOCKED: Too many claude processes ($claude_count >= $MAX_CLAUDE_PROCS)" >&2
  exit 1
fi

if [ "$claude_pct" -ge "$SPAWN_BLOCK_PERCENT" ]; then
  echo "[$TIMESTAMP] BLOCKED: Claude memory usage ${claude_pct}% >= ${SPAWN_BLOCK_PERCENT}% threshold" >&2
  exit 1
fi
# -----------------------------------------------------------------------------
50
+
31
51
  # Create session directory
32
52
  mkdir -p "$SESSION_DIR" "$(dirname "$LOG_FILE")"
33
53
 
@@ -0,0 +1,41 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3
+ <plist version="1.0">
4
+ <dict>
5
+ <key>Label</key>
6
+ <string>ai.maestro.memory-watchdog</string>
7
+
8
+ <key>ProgramArguments</key>
9
+ <array>
10
+ <string>/bin/bash</string>
11
+ <string>/Users/sophie/maestro/scripts/watchdog/memory-watchdog.sh</string>
12
+ </array>
13
+
14
+ <key>StartInterval</key>
15
+ <integer>30</integer>
16
+
17
+ <key>RunAtLoad</key>
18
+ <true/>
19
+
20
+ <key>StandardOutPath</key>
21
+ <string>/Users/sophie/maestro/logs/watchdog/launchd-stdout.log</string>
22
+
23
+ <key>StandardErrorPath</key>
24
+ <string>/Users/sophie/maestro/logs/watchdog/launchd-stderr.log</string>
25
+
26
+ <key>EnvironmentVariables</key>
27
+ <dict>
28
+ <key>PATH</key>
29
+ <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
30
+ </dict>
31
+
32
+ <key>Nice</key>
33
+ <integer>10</integer>
34
+
35
+ <key>ProcessType</key>
36
+ <string>Background</string>
37
+
38
+ <key>ThrottleInterval</key>
39
+ <integer>10</integer>
40
+ </dict>
41
+ </plist>
@@ -0,0 +1,148 @@
1
+ #!/bin/bash
2
+ # =============================================================================
3
+ # force-reboot.sh — Remote/Automated Force Reboot for Frozen Mac Mini
4
+ # =============================================================================
5
+ #
6
+ # This script provides multiple escalation levels for rebooting a frozen
7
+ # or unresponsive Mac mini running the Maestro agent system.
8
+ #
9
+ # Usage:
10
+ # ./scripts/watchdog/force-reboot.sh --graceful # Try graceful shutdown first
11
+ # ./scripts/watchdog/force-reboot.sh --force # Immediate reboot (requires sudo)
12
+ # ./scripts/watchdog/force-reboot.sh --status # Check heartbeat and uptime
13
+ #
14
+ # Remote usage (from another machine via SSH):
15
+ # ssh sophie@mac-mini.local "~/maestro/scripts/watchdog/force-reboot.sh --graceful"
16
+ #
17
+ # =============================================================================
18
+ #
19
+ # FORCE-REBOOT OPTIONS WHEN THE MACHINE IS COMPLETELY FROZEN:
20
+ #
21
+ # If SSH is unresponsive and the GUI is frozen, the only options are:
22
+ #
23
+ # 1. SMART PLUG (recommended for headless servers):
24
+ # - Install a WiFi smart plug (TP-Link Kasa, Shelly, etc.) on the Mac mini power
25
+ # - Power cycle via the smart plug's app or API
26
+ # - Combined with `pmset autorestart 1` (already configured), the Mac mini
27
+ # will boot automatically when power is restored
28
+ # - Example (TP-Link Kasa):
29
+ # kasa --host <plug-ip> --type plug off && sleep 5 && kasa --host <plug-ip> --type plug on
30
+ # - Example (Shelly):
31
+ # curl -s "http://<shelly-ip>/relay/0?turn=off" && sleep 5 && curl -s "http://<shelly-ip>/relay/0?turn=on"
32
+ #
33
+ # 2. PARSEC (if display is frozen but network is up):
34
+ # - Parsec may still be responsive even if the GUI is frozen
35
+ # - Connect via Parsec and use Cmd+Ctrl+Power to force restart
36
+ #
37
+ # 3. APPLE REMOTE MANAGEMENT (ARD):
38
+ # - If enabled, use "Send UNIX Command" from another Mac:
39
+ # sudo shutdown -r now
40
+ #
41
+ # 4. PHYSICAL POWER CYCLE:
42
+ # - Unplug and replug the Mac mini power cable
43
+ # - With `pmset autorestart 1`, it will boot automatically
44
+ #
45
+ # 5. SSH + SYSDIAGNOSE RESET (partial freeze, SSH still works):
46
+ # - ssh sophie@mac-mini.local "sudo reboot"
47
+ # - Or: ssh sophie@mac-mini.local "sudo shutdown -r now"
48
+ #
49
+ # =============================================================================
50
+
51
+ set -euo pipefail
52
+
53
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
54
+ MAESTRO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
55
+
56
+ HEARTBEAT_FILE="$MAESTRO_DIR/state/heartbeat"
57
+ LOG_FILE="$MAESTRO_DIR/logs/watchdog/$(date +%Y-%m-%d)-reboot.log"
58
+
59
# Print a UTC-timestamped message to stdout and append it to $LOG_FILE.
#
# Arguments: $1 - message text
# Globals:   LOG_FILE (read)
# Returns:   0 always. Logging is best-effort: the log directory is created
#            on demand, and a failure to write the file never aborts the
#            caller (this script runs under `set -euo pipefail`, and a reboot
#            must not be cancelled because a log line could not be saved).
log() {
  local ts line
  ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  line="[$ts] ${1:-}"
  printf '%s\n' "$line"
  mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
  { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null || true
}
64
+
65
# Report the age of the watchdog heartbeat and the system uptime.
#
# Globals: HEARTBEAT_FILE (read) — file holding one UTC timestamp in the form
#          YYYY-MM-DDTHH:MM:SSZ
# Outputs: human-readable status lines on stdout
# Returns: 0 if the heartbeat is at most 120 seconds old;
#          1 if the file is missing, unparseable, or stale
check_heartbeat() {
  if [ ! -f "$HEARTBEAT_FILE" ]; then
    echo "No heartbeat file found — watchdog may not be running"
    return 1
  fi

  local hb_time hb_epoch hb_age
  hb_time=$(cat "$HEARTBEAT_FILE")

  # The trailing "Z" means UTC, so parse with -u; parsing as local time skews
  # the age by the machine's UTC offset. BSD date (macOS) is tried first,
  # GNU date second so the check also works when run from a Linux host.
  hb_epoch=""
  if [ -n "$hb_time" ]; then
    hb_epoch=$(date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$hb_time" +%s 2>/dev/null \
      || date -u -d "$hb_time" +%s 2>/dev/null \
      || true)
  fi

  case "$hb_epoch" in
    ''|*[!0-9]*)
      echo "WARNING: Heartbeat timestamp is unreadable: '$hb_time'"
      return 1
      ;;
  esac

  hb_age=$(( $(date +%s) - hb_epoch ))

  echo "Last heartbeat: $hb_time (${hb_age}s ago)"
  echo "System uptime: $(uptime)"

  if [ "$hb_age" -gt 120 ]; then
    echo "WARNING: Heartbeat is stale (>2 min) — watchdog may be stuck"
    return 1
  fi
  return 0
}
84
+
85
# Orderly reboot: raise the emergency-stop flag, let sessions finish, stop
# remaining claude processes, save state to git, then restart the machine.
# Exits the script with status 1 when every reboot method fails.
graceful_reboot() {
  local stop_flag="$MAESTRO_DIR/.emergency-stop"

  log "Initiating graceful reboot..."

  # Step 1 — block new work by raising the emergency-stop flag
  echo "Memory watchdog initiated graceful reboot" > "$stop_flag"
  log "Emergency stop flag set"

  # Step 2 — grace period for sessions that are still running
  log "Waiting 30s for running sessions to complete..."
  sleep 30

  # Step 3 — ask whatever claude processes remain to terminate
  log "Sending SIGTERM to remaining claude processes..."
  pkill -TERM -f "claude" 2>/dev/null || true
  sleep 5

  # Step 4 — best-effort save of state and logs; every git failure is ignored
  log "Attempting to commit state..."
  (
    cd "$MAESTRO_DIR"
    git add -A state/ logs/ 2>/dev/null || true
    git commit -m "chore: pre-reboot state save (watchdog-initiated)" 2>/dev/null || true
    git push origin main 2>/dev/null || true
  ) || true

  # Step 5 — reboot: sudo first, then System Events as a fallback
  log "Rebooting now..."
  if ! sudo shutdown -r now 2>/dev/null; then
    log "sudo shutdown failed — trying osascript..."
    if ! osascript -e 'tell application "System Events" to restart' 2>/dev/null; then
      log "All reboot methods failed — manual intervention required"
      exit 1
    fi
  fi
}
120
+
121
# Immediate reboot with no grace period. Tries `sudo reboot`, then
# `sudo shutdown -r now`; if both fail, prints the manual options and exits 1.
force_reboot() {
  log "FORCE REBOOT initiated"

  # Best effort: the flag file may be unwritable, which must not stop the reboot
  echo "Force reboot requested" > "$MAESTRO_DIR/.emergency-stop" 2>/dev/null || true

  if sudo reboot 2>/dev/null; then
    return 0
  fi
  if sudo shutdown -r now 2>/dev/null; then
    return 0
  fi

  log "sudo reboot failed — requires password or system is frozen"
  echo "ERROR: Cannot force reboot without sudo. Options:"
  echo " 1. Run: sudo reboot"
  echo " 2. Power cycle via smart plug"
  echo " 3. Physical power cycle"
  exit 1
}
133
+
134
+ case "${1:---status}" in
135
+ --status)
136
+ check_heartbeat
137
+ ;;
138
+ --graceful)
139
+ graceful_reboot
140
+ ;;
141
+ --force)
142
+ force_reboot
143
+ ;;
144
+ *)
145
+ echo "Usage: $0 [--status|--graceful|--force]"
146
+ exit 1
147
+ ;;
148
+ esac
@@ -0,0 +1,329 @@
1
+ #!/bin/bash
2
+ # =============================================================================
3
+ # memory-watchdog.sh — System Resource Watchdog for Maestro Agent Deployment
4
+ # =============================================================================
5
+ #
6
+ # Monitors memory usage and process count for Claude Code subprocesses.
7
+ # Kills runaway subagents when thresholds are exceeded.
8
+ # Triggers emergency stop at critical levels.
9
+ #
10
+ # Runs every 30 seconds via launchd (ai.maestro.memory-watchdog.plist).
11
+ #
12
+ # Thresholds (configurable via environment or defaults below):
13
+ # WARNING: >60% RAM used by claude processes → log warning
14
+ # CRITICAL: >75% RAM used by claude processes → kill oldest subagents
15
+ # EMERGENCY: >85% total system memory pressure → emergency stop
16
+ #
17
+ # Usage:
18
+ # ./scripts/watchdog/memory-watchdog.sh # Normal run
19
+ # ./scripts/watchdog/memory-watchdog.sh --check # Report status only
20
+ # ./scripts/watchdog/memory-watchdog.sh --dry-run # Show what would be killed
21
+ #
22
+ # =============================================================================
23
+
24
+ set -euo pipefail
25
+
26
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
27
+ MAESTRO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
28
+
29
+ # Auto-detect agent directory (sophie-ai, wundr, etc.)
30
+ # The watchdog runs from maestro but protects the whole machine
31
+ AGENT_DIR="${AGENT_DIR:-}"
32
+
33
+ # --- Ensure system paths are available ----------------------------------------
34
+ # launchd PATH may not include /usr/sbin where sysctl lives
35
+ export PATH="/usr/sbin:/usr/bin:/bin:/usr/local/bin:/opt/homebrew/bin:$PATH"
36
+
37
+ # --- Configuration -----------------------------------------------------------
38
+
39
+ # Total physical RAM in KB
40
+ TOTAL_RAM_KB=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1 / 1024}')
41
+
42
+ # Thresholds as percentage of total RAM for claude process RSS
43
+ CLAUDE_WARN_PERCENT="${WATCHDOG_WARN_PERCENT:-60}"
44
+ CLAUDE_CRITICAL_PERCENT="${WATCHDOG_CRITICAL_PERCENT:-75}"
45
+
46
+ # System-wide memory pressure threshold (percentage)
47
+ SYSTEM_EMERGENCY_PERCENT="${WATCHDOG_EMERGENCY_PERCENT:-85}"
48
+
49
+ # Max concurrent claude CLI processes (excluding the main interactive session)
50
+ MAX_CLAUDE_PROCS="${WATCHDOG_MAX_CLAUDE_PROCS:-8}"
51
+
52
+ # Minimum age (seconds) before a claude process can be killed
53
+ # Protects freshly-spawned sessions from being killed immediately
54
+ MIN_AGE_SECONDS="${WATCHDOG_MIN_AGE:-60}"
55
+
56
+ # Log file
57
+ LOG_DIR="$MAESTRO_DIR/logs/watchdog"
58
+ LOG_FILE="$LOG_DIR/$(date +%Y-%m-%d)-watchdog.jsonl"
59
+
60
+ # State file for tracking actions across runs
61
+ STATE_FILE="$MAESTRO_DIR/state/watchdog-state.yaml"
62
+
63
+ # --- Helpers -----------------------------------------------------------------
64
+
65
+ mkdir -p "$LOG_DIR" "$(dirname "$STATE_FILE")"
66
+
67
# Append one JSON object ({"ts","level","event","detail"}) as a line to
# $LOG_FILE.
#
# Arguments: $1 - level, $2 - event name, $3 - optional free-text detail
# The detail is JSON-encoded by python3; if that fails an empty string ("")
# is logged in its place. Level and event are written as given.
log_event() {
  local level="$1"
  local event="$2"
  local detail="${3:-}"
  local ts detail_json

  ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  detail_json="$(echo "$detail" | python3 -c 'import sys,json; print(json.dumps(sys.stdin.read().strip()))' 2>/dev/null || echo '""')"

  printf '{"ts":"%s","level":"%s","event":"%s","detail":%s}\n' \
    "$ts" "$level" "$event" "$detail_json" \
    >> "$LOG_FILE"
}
75
+
76
+ # --- Memory Metrics ----------------------------------------------------------
77
+
78
# Print system-wide memory pressure as an integer percentage (100 - free%).
#
# Primary source: the "System-wide memory free percentage:" line printed by
# `memory_pressure`. Fallback: `vm_stat`, counting free + inactive +
# speculative pages as available. Counting only "Pages free" would report
# near-100% pressure, because the free-page count alone is normally a small
# fraction of RAM.
#
# Globals: TOTAL_RAM_KB (read; used by the vm_stat fallback only)
# Outputs: integer 0-100 on stdout; 0 plus a note on stderr when neither
#          source yields a usable number (unknown is not treated as an
#          emergency)
# Returns: 0
get_system_memory_pressure() {
  local free_pct free_kb

  free_pct=$(memory_pressure 2>/dev/null | awk '
    /System-wide memory free percentage:/ { v = $NF; gsub(/%/, "", v) }
    END { if (v != "") print v }
  ' || true)
  case "$free_pct" in
    ''|*[!0-9]*) free_pct="" ;;
  esac

  if [ -z "$free_pct" ]; then
    # Fallback: calculate from vm_stat (page size is on its first line)
    free_kb=$(vm_stat 2>/dev/null | awk '
      NR == 1 && match($0, /[0-9]+/) { page_size = substr($0, RSTART, RLENGTH) + 0 }
      /^Pages (free|inactive|speculative):/ { n = $NF; gsub(/\./, "", n); pages += n }
      END { if (page_size > 0) printf "%.0f", (pages * page_size) / 1024 }
    ' || true)
    case "$free_kb" in
      ''|*[!0-9]*) free_kb="" ;;
    esac

    if [ -n "$free_kb" ] && [ "${TOTAL_RAM_KB:-0}" -gt 0 ]; then
      free_pct=$(( (free_kb * 100) / TOTAL_RAM_KB ))
    else
      echo "memory-watchdog: unable to determine memory pressure" >&2
      echo 0
      return 0
    fi
  fi

  # Clamp so rounding can never yield a negative pressure value
  if [ "$free_pct" -gt 100 ]; then
    free_pct=100
  fi
  echo $(( 100 - free_pct ))
}
93
+
94
# List claude CLI processes, one per line: "PID RSS_KB ELAPSED_SECONDS COMMAND",
# sorted by RSS descending.
#
# A process is listed when its ps line contains "claude" (case-insensitive)
# and none of "Claude.app", "Claude Helper", "grep" or "watchdog" (which
# keeps the desktop app and this watchdog itself out of the list).
#
# Filtering is done inside awk rather than a grep chain: grep exits non-zero
# when nothing matches, which under `set -euo pipefail` made every caller
# abort whenever no claude process was running.
get_claude_processes() {
  /bin/ps -eo pid,rss,etime,comm | awk '
    {
      if (tolower($0) !~ /claude/) next
      if ($0 ~ /Claude.app/ || $0 ~ /Claude Helper/ || $0 ~ /grep/ || $0 ~ /watchdog/) next

      # Parse etime (formats: SS, MM:SS, HH:MM:SS, D-HH:MM:SS)
      secs = 0
      days = 0
      clock = $3
      dash = index(clock, "-")
      if (dash > 0) {
        days = substr(clock, 1, dash - 1)
        clock = substr(clock, dash + 1)
      }
      n = split(clock, parts, ":")
      if (n == 1)      { secs = parts[1] }
      else if (n == 2) { secs = parts[1] * 60 + parts[2] }
      else if (n == 3) { secs = parts[1] * 3600 + parts[2] * 60 + parts[3] }
      secs += days * 86400

      printf "%s %s %d %s\n", $1, $2, secs, $4
    }' | sort -k2,2nr
}
125
+
126
# Print the summed RSS (KB, second column) of every process listed by
# get_claude_processes; prints 0 when the list is empty.
get_total_claude_rss_kb() {
  get_claude_processes | awk '
    { total_kb += $2 }
    END { print total_kb + 0 }
  '
}
129
+
130
# Print how many lines (processes) get_claude_processes reports, without
# the padding spaces that wc may emit.
count_claude_processes() {
  local line_count
  line_count=$(get_claude_processes | wc -l)
  echo "${line_count// /}"
}
133
+
134
+ # --- Identification ----------------------------------------------------------
135
+
136
# Print the PID of the presumed main interactive claude session.
#
# The main session is typically the one attached to a TTY, the
# longest-running one, and the parent of the subagents. The heuristic used
# here is "longest-running": the process with the largest elapsed time
# (third column of get_claude_processes).
#
# The list from get_claude_processes is sorted by RSS, so taking its last
# line (`tail -1`) would select the smallest-RSS process, not the oldest;
# the age column is therefore compared explicitly.
#
# Outputs: the PID on stdout, or nothing when no claude process is listed.
identify_main_session() {
  get_claude_processes | awk '
    NF >= 3 && ($3 + 0) >= oldest {
      oldest = $3 + 0
      main_pid = $1
    }
    END { if (main_pid != "") print main_pid }
  '
}
144
+
145
+ # --- Actions -----------------------------------------------------------------
146
+
147
# Terminate up to N claude subagent processes, largest RSS first.
#
# Arguments: $1 - maximum number of processes to kill
#            $2 - "true" for dry run (report only); default "false"
# Globals:   MIN_AGE_SECONDS, LOG_FILE (read)
# Outputs:   one line per selected process, then the number of processes
#            that were actually signalled (always 0 in a dry run)
#
# Candidates come from get_claude_processes (sorted by RSS descending). The
# main session and processes younger than MIN_AGE_SECONDS are skipped. Each
# victim gets SIGTERM, then SIGKILL if it is still alive 5 seconds later.
kill_oldest_subagents() {
  local target_count="$1"
  local dry_run="${2:-false}"
  local main_pid
  main_pid=$(identify_main_session)
  local killed=0
  local selected=0
  local pid rss age comm rss_mb

  while IFS=' ' read -r pid rss age comm; do
    # The here-string yields one empty line when there are no processes
    [ -n "$pid" ] || continue

    # Stop once enough processes were picked — also in dry-run mode, so the
    # preview lists exactly what a real run would kill.
    if [ "$selected" -ge "$target_count" ]; then
      break
    fi
    if [ "$pid" = "$main_pid" ]; then
      continue
    fi
    if [ "$age" -lt "$MIN_AGE_SECONDS" ]; then
      log_event "info" "skip_young" "PID $pid age ${age}s < ${MIN_AGE_SECONDS}s minimum"
      continue
    fi

    rss_mb=$(( rss / 1024 ))
    selected=$((selected + 1))

    if [ "$dry_run" = "true" ]; then
      echo " [DRY RUN] Would kill PID $pid (${rss_mb}MB, age ${age}s)"
      log_event "info" "dry_run_kill" "PID=$pid RSS=${rss_mb}MB age=${age}s"
    else
      echo " Killing PID $pid (${rss_mb}MB, age ${age}s)"
      kill -TERM "$pid" 2>/dev/null || true
      log_event "warn" "killed_subagent" "PID=$pid RSS=${rss_mb}MB age=${age}s signal=TERM"
      killed=$((killed + 1))

      # Give it 5 seconds to exit gracefully, then SIGKILL
      (
        sleep 5
        if kill -0 "$pid" 2>/dev/null; then
          kill -9 "$pid" 2>/dev/null || true
          # Log to file directly since this is a subshell
          printf '{"ts":"%s","level":"warn","event":"force_killed","detail":"PID=%s did not exit after SIGTERM"}\n' \
            "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$pid" >> "$LOG_FILE"
        fi
      ) &
    fi
  done <<< "$(get_claude_processes)"

  # Wait for the SIGKILL escalations to finish. The script exits right after
  # this function returns, and the backgrounded subshells would otherwise be
  # orphaned (and may be reaped with the job) before they can escalate.
  if [ "$killed" -gt 0 ]; then
    wait
  fi

  echo "$killed"
}
193
+
194
# Raise the emergency-stop flag and SIGTERM every claude process except the
# main session.
#
# Arguments: $1 - reason text, written into the flag file(s)
# If the flag already exists in $MAESTRO_DIR, only a log entry is written
# and nothing else happens.
trigger_emergency_stop() {
  local reason="$1"
  local stop_file="$MAESTRO_DIR/.emergency-stop"

  # Already stopped: record the repeat trigger and leave everything as is
  if [ -f "$stop_file" ]; then
    log_event "warn" "emergency_already_active" "$reason"
    return 0
  fi

  echo "$reason" > "$stop_file"
  log_event "critical" "emergency_stop_triggered" "$reason"

  # Mirror the flag into the known agent directories that exist (best effort)
  for agent_dir in "$HOME"/sophie-ai "$HOME"/wundr; do
    [ -d "$agent_dir" ] || continue
    echo "$reason" > "$agent_dir/.emergency-stop" 2>/dev/null || true
  done

  # SIGTERM every listed claude process other than the main session
  local main_pid
  main_pid=$(identify_main_session)
  while IFS=' ' read -r pid rss age comm; do
    if [ -z "$pid" ] || [ "$pid" = "$main_pid" ]; then
      continue
    fi
    kill -TERM "$pid" 2>/dev/null || true
  done <<< "$(get_claude_processes)"

  echo "EMERGENCY STOP: $reason"
}
224
+
225
+ # --- Main Logic --------------------------------------------------------------
226
+
227
# One watchdog pass: collect metrics, then act on the first level that applies.
#
# Arguments: $1 - "--check" (print a status report only), "--dry-run"
#                 (report what would be done), anything else: normal run
# Order of checks: system memory pressure -> claude RSS critical ->
# too many claude processes -> claude RSS warning -> all clear.
run_watchdog() {
  local mode="${1:-run}"
  local dry_run="false"
  case "$mode" in
    --dry-run|--check) dry_run="true" ;;
  esac

  # Collect metrics
  local claude_rss_kb claude_count system_pressure
  claude_rss_kb=$(get_total_claude_rss_kb)
  claude_count=$(count_claude_processes)
  system_pressure=$(get_system_memory_pressure)

  local claude_rss_mb=$(( claude_rss_kb / 1024 ))
  local claude_pct=$(( (claude_rss_kb * 100) / TOTAL_RAM_KB ))
  local warn_threshold_mb=$(( (TOTAL_RAM_KB * CLAUDE_WARN_PERCENT) / 100 / 1024 ))
  local critical_threshold_mb=$(( (TOTAL_RAM_KB * CLAUDE_CRITICAL_PERCENT) / 100 / 1024 ))

  # Status report only — no thresholds are acted upon
  if [ "$mode" = "--check" ]; then
    cat <<EOF
=== Memory Watchdog Status ===
Total RAM: $(( TOTAL_RAM_KB / 1024 ))MB
Claude processes: $claude_count (max: $MAX_CLAUDE_PROCS)
Claude RSS total: ${claude_rss_mb}MB (${claude_pct}% of RAM)
 Warning threshold: ${warn_threshold_mb}MB (${CLAUDE_WARN_PERCENT}%)
 Critical threshold: ${critical_threshold_mb}MB (${CLAUDE_CRITICAL_PERCENT}%)
System mem pressure: ${system_pressure}% (emergency at ${SYSTEM_EMERGENCY_PERCENT}%)

Claude processes:
EOF
    get_claude_processes | while IFS=' ' read -r pid rss age comm; do
      echo " PID $pid: $(( rss / 1024 ))MB, age ${age}s — $comm"
    done
    echo "=============================="
    return 0
  fi

  if [ "$system_pressure" -ge "$SYSTEM_EMERGENCY_PERCENT" ]; then
    # Level 1: system-wide memory pressure — emergency stop
    log_event "critical" "system_memory_emergency" "pressure=${system_pressure}% >= ${SYSTEM_EMERGENCY_PERCENT}%"
    if [ "$dry_run" = "false" ]; then
      trigger_emergency_stop "Memory pressure ${system_pressure}% exceeded ${SYSTEM_EMERGENCY_PERCENT}% emergency threshold"
    else
      echo "[DRY RUN] Would trigger EMERGENCY STOP (pressure=${system_pressure}%)"
    fi

  elif [ "$claude_pct" -ge "$CLAUDE_CRITICAL_PERCENT" ]; then
    # Level 2: claude RSS critical — kill the biggest subagents
    log_event "warn" "claude_memory_critical" "RSS=${claude_rss_mb}MB (${claude_pct}%) >= ${CLAUDE_CRITICAL_PERCENT}%"
    echo "CRITICAL: Claude processes using ${claude_rss_mb}MB (${claude_pct}% of RAM)"

    # Aim for the warning threshold, estimating ~600MB (614400 KB) per process
    local target_rss_kb=$(( (TOTAL_RAM_KB * CLAUDE_WARN_PERCENT) / 100 ))
    local excess_kb=$(( claude_rss_kb - target_rss_kb ))
    local procs_to_kill=$(( (excess_kb / 614400) + 1 ))
    if [ "$procs_to_kill" -lt 1 ]; then
      procs_to_kill=1
    fi

    echo " Killing $procs_to_kill subagent(s) to free ~$(( excess_kb / 1024 ))MB..."
    kill_oldest_subagents "$procs_to_kill" "$dry_run"

  elif [ "$claude_count" -gt "$MAX_CLAUDE_PROCS" ]; then
    # Level 3: more concurrent claude processes than allowed
    local excess=$(( claude_count - MAX_CLAUDE_PROCS ))
    log_event "warn" "too_many_claude_procs" "count=${claude_count} > max=${MAX_CLAUDE_PROCS}"
    echo "WARNING: $claude_count claude processes running (max: $MAX_CLAUDE_PROCS)"
    echo " Killing $excess oldest subagent(s)..."
    kill_oldest_subagents "$excess" "$dry_run"

  elif [ "$claude_pct" -ge "$CLAUDE_WARN_PERCENT" ]; then
    # Level 4: claude RSS warning — log only
    log_event "warn" "claude_memory_warning" "RSS=${claude_rss_mb}MB (${claude_pct}%) >= ${CLAUDE_WARN_PERCENT}%"

  else
    # All clear: a "healthy" entry is logged only on every 10th such pass,
    # tracked through a small counter file.
    local run_count_file="$MAESTRO_DIR/state/watchdog-run-count"
    local run_count=0
    if [ -f "$run_count_file" ]; then
      run_count=$(cat "$run_count_file")
    fi
    run_count=$(( (run_count + 1) % 10 ))
    echo "$run_count" > "$run_count_file"

    if [ "$run_count" -eq 0 ]; then
      log_event "info" "healthy" "claude_procs=${claude_count} rss=${claude_rss_mb}MB (${claude_pct}%) pressure=${system_pressure}%"
    fi
  fi

  return 0
}
318
+
319
+ # --- Write heartbeat ---------------------------------------------------------
320
+
321
# Record the current UTC time (YYYY-MM-DDTHH:MM:SSZ) in
# $MAESTRO_DIR/state/heartbeat.
#
# The timestamp is written to a temporary file in the same directory and
# then renamed over the heartbeat file, so a concurrent reader never sees
# the file truncated or half-written.
write_heartbeat() {
  local heartbeat_file="$MAESTRO_DIR/state/heartbeat"
  local tmp_file="${heartbeat_file}.tmp.$$"

  date -u +"%Y-%m-%dT%H:%M:%SZ" > "$tmp_file"
  mv -f "$tmp_file" "$heartbeat_file"
}
325
+
326
+ # --- Entry point -------------------------------------------------------------
327
+
328
+ write_heartbeat
329
+ run_watchdog "${1:-run}"