@adaptic/maestro 1.1.4 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adaptic/maestro",
3
- "version": "1.1.4",
3
+ "version": "1.1.5",
4
4
  "description": "Maestro — Autonomous AI agent operating system. Deploy AI employees on dedicated Mac minis.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -296,14 +296,63 @@ configure_app_launches() {
296
296
  # The poller uses IMAP directly, not the browser
297
297
  log "Safari: not needed as Login Item (Gmail uses IMAP polling)"
298
298
 
299
- # Ensure required directories exist
300
- mkdir -p "$AGENT_DIR/logs/huddle"
301
- mkdir -p "$AGENT_DIR/state/huddle"
302
- ok "Log and state directories verified"
299
+ # Ensure required directories exist (as the real user, not root)
300
+ local real_user="${SUDO_USER:-$CURRENT_USER}"
301
+ local dirs=("$AGENT_DIR/logs/huddle" "$AGENT_DIR/state/huddle" "$AGENT_DIR/logs/watchdog")
302
+ for d in "${dirs[@]}"; do
303
+ mkdir -p "$d"
304
+ chown "$real_user:staff" "$d" 2>/dev/null || true
305
+ done
306
+ ok "Log and state directories verified (owned by $real_user)"
303
307
  }
304
308
 
305
309
  # =============================================================================
306
- # 7. ORCHESTRATOR AND SUBSYSTEMS ON BOOT
310
+ # 7. MEMORY WATCHDOG
311
+ # =============================================================================
312
+
313
# Installs the memory watchdog as a per-user launchd agent and (re)loads it.
#
# Globals:
#   AGENT_DIR    (read) agent checkout; substituted for the template path
#                /Users/sophie/maestro inside the plist
#   SUDO_USER    (read) preferred owner when the installer runs under sudo
#   CURRENT_USER (read) owner used when SUDO_USER is unset
# Outputs:
#   Progress through the section/ok/fail/warn/log helpers of this installer.
# Returns:
#   0 in all handled cases; problems are reported via fail/warn so that the
#   remaining configuration steps still run.
configure_memory_watchdog() {
  section "Memory Watchdog (OOM Protection)"

  local WATCHDOG_SCRIPT="$AGENT_DIR/scripts/watchdog/memory-watchdog.sh"
  local PLIST_SOURCE="$AGENT_DIR/scripts/watchdog/ai.maestro.memory-watchdog.plist"

  if [ ! -f "$WATCHDOG_SCRIPT" ]; then
    fail "Watchdog script not found at $WATCHDOG_SCRIPT"
    return 0
  fi

  # Without the template, sed would leave an empty plist behind.
  if [ ! -f "$PLIST_SOURCE" ]; then
    fail "Watchdog plist template not found at $PLIST_SOURCE"
    return 0
  fi

  # Determine the real user's home for LaunchAgents.
  # The name is validated first because it is passed through eval below.
  local real_user="${SUDO_USER:-$CURRENT_USER}"
  case "$real_user" in
    '' | *[!A-Za-z0-9._-]*)
      fail "Cannot resolve home directory for user '$real_user'"
      return 0
      ;;
  esac
  local real_home
  real_home=$(eval echo "~$real_user")
  if [ ! -d "$real_home" ]; then
    fail "Home directory for $real_user not found ($real_home)"
    return 0
  fi

  chmod +x "$WATCHDOG_SCRIPT"
  ok "Watchdog script executable"

  local launch_agents_dir="$real_home/Library/LaunchAgents"
  local PLIST_DEST="$launch_agents_dir/ai.maestro.memory-watchdog.plist"

  # Create the log directory BEFORE loading: the plist sets RunAtLoad and
  # points StandardOutPath/StandardErrorPath into this directory.
  mkdir -p "$AGENT_DIR/logs/watchdog"
  chown "$real_user:staff" "$AGENT_DIR/logs/watchdog" 2>/dev/null || true

  # A fresh account may not have a LaunchAgents directory yet.
  if [ ! -d "$launch_agents_dir" ]; then
    mkdir -p "$launch_agents_dir"
    chown "$real_user:staff" "$launch_agents_dir" 2>/dev/null || true
  fi

  # Install the plist (update paths for this agent directory).
  # Escape characters that are special in a sed replacement using '|'.
  local escaped_agent_dir
  escaped_agent_dir=$(printf '%s' "$AGENT_DIR" | sed 's/[\\&|]/\\&/g')
  if ! sed "s|/Users/sophie/maestro|$escaped_agent_dir|g" "$PLIST_SOURCE" > "$PLIST_DEST"; then
    rm -f "$PLIST_DEST"
    fail "Could not write $PLIST_DEST"
    return 0
  fi
  chown "$real_user:staff" "$PLIST_DEST" 2>/dev/null || true

  # Load it (as the real user); unload first so a re-run picks up changes.
  local loaded="false"
  if [ "$(id -u)" -eq 0 ]; then
    if su "$real_user" -c "launchctl unload '$PLIST_DEST' 2>/dev/null; launchctl load '$PLIST_DEST'" 2>/dev/null; then
      loaded="true"
    fi
  else
    launchctl unload "$PLIST_DEST" 2>/dev/null || true
    if launchctl load "$PLIST_DEST" 2>/dev/null; then
      loaded="true"
    fi
  fi

  if [ "$loaded" = "true" ]; then
    ok "Memory watchdog launchd agent installed (runs every 30s)"
  else
    warn "Memory watchdog plist installed at $PLIST_DEST but launchctl load failed"
  fi
  log "Thresholds: warn=60% critical=75% emergency=85% max_procs=8"
  log "Override via env: WATCHDOG_WARN_PERCENT, WATCHDOG_CRITICAL_PERCENT, etc."
}
353
+
354
+ # =============================================================================
355
+ # 8. ORCHESTRATOR AND SUBSYSTEMS ON BOOT
307
356
  # =============================================================================
308
357
 
309
358
  configure_boot_services() {
@@ -364,7 +413,7 @@ configure_boot_services() {
364
413
  }
365
414
 
366
415
  # =============================================================================
367
- # 8. SYSTEM VERIFICATION
416
+ # 9. SYSTEM VERIFICATION
368
417
  # =============================================================================
369
418
 
370
419
  verify_all() {
@@ -440,6 +489,30 @@ verify_all() {
440
489
  issues=$((issues + 1))
441
490
  fi
442
491
 
492
+ # Check memory watchdog
493
+ if launchctl list 2>/dev/null | grep -q "ai.maestro.memory-watchdog"; then
494
+ ok "Memory watchdog: running"
495
+ elif [ -f "$HOME/Library/LaunchAgents/ai.maestro.memory-watchdog.plist" ]; then
496
+ warn "Memory watchdog: installed but not loaded"
497
+ else
498
+ fail "Memory watchdog: not installed"
499
+ issues=$((issues + 1))
500
+ fi
501
+
502
+ # Check heartbeat freshness
503
+ local heartbeat_file="$AGENT_DIR/state/heartbeat"
504
+ if [ -f "$heartbeat_file" ]; then
505
+ local hb_age
506
+ hb_age=$(( $(date +%s) - $(date -j -f "%Y-%m-%dT%H:%M:%SZ" "$(cat "$heartbeat_file")" +%s 2>/dev/null || echo 0) ))
507
+ if [ "$hb_age" -lt 120 ]; then
508
+ ok "Heartbeat: fresh (${hb_age}s ago)"
509
+ else
510
+ warn "Heartbeat: stale (${hb_age}s ago — watchdog may not be running)"
511
+ fi
512
+ else
513
+ warn "Heartbeat: no heartbeat file found"
514
+ fi
515
+
443
516
  # Summary
444
517
  echo ""
445
518
  if [ "$issues" -eq 0 ]; then
@@ -473,6 +546,7 @@ main() {
473
546
  configure_slack_cdp
474
547
  configure_audio
475
548
  configure_app_launches
549
+ configure_memory_watchdog
476
550
  configure_boot_services
477
551
  ;;
478
552
  --full|full|"")
@@ -482,6 +556,7 @@ main() {
482
556
  configure_slack_cdp
483
557
  configure_audio
484
558
  configure_app_launches
559
+ configure_memory_watchdog
485
560
  configure_boot_services
486
561
 
487
562
  section "Configuration Complete"
@@ -28,6 +28,26 @@ if [ ! -f "$TASK_FILE" ]; then
28
28
  exit 1
29
29
  fi
30
30
 
31
# --- Resource gate: refuse to spawn if machine is under pressure -------------
# Limits are read from the WATCHDOG_* environment variables, with defaults.
MAX_CLAUDE_PROCS="${WATCHDOG_MAX_CLAUDE_PROCS:-8}"
TOTAL_RAM_KB=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1 / 1024}')
SPAWN_BLOCK_PERCENT="${WATCHDOG_CRITICAL_PERCENT:-75}"

# Take ONE ps snapshot and apply ONE filter for both the count and the RSS sum,
# so the two numbers describe the same set of processes. Lines mentioning
# Claude.app or Claude Helper are excluded from both. awk (unlike grep) exits 0
# when nothing matches, so an idle machine cannot fail the pipeline.
claude_stats=$(/bin/ps -eo rss,comm | awk '
  tolower($0) ~ /claude/ && $0 !~ /Claude\.app/ && $0 !~ /Claude Helper/ {
    count++
    sum += $1
  }
  END { printf "%d %d\n", count, sum }
') || claude_stats="0 0"
read -r claude_count claude_rss_kb <<< "$claude_stats"
claude_count="${claude_count:-0}"
claude_rss_kb="${claude_rss_kb:-0}"

# Guard the division: if total RAM could not be determined, skip the
# percentage gate rather than crash on a division by zero.
if [ "${TOTAL_RAM_KB:-0}" -gt 0 ] 2>/dev/null; then
  claude_pct=$(( (claude_rss_kb * 100) / TOTAL_RAM_KB ))
else
  claude_pct=0
fi

if [ "$claude_count" -ge "$MAX_CLAUDE_PROCS" ]; then
  echo "[$TIMESTAMP] BLOCKED: Too many claude processes ($claude_count >= $MAX_CLAUDE_PROCS)" >&2
  exit 1
fi

if [ "$claude_pct" -ge "$SPAWN_BLOCK_PERCENT" ]; then
  echo "[$TIMESTAMP] BLOCKED: Claude memory usage ${claude_pct}% >= ${SPAWN_BLOCK_PERCENT}% threshold" >&2
  exit 1
fi
# -----------------------------------------------------------------------------
50
+
31
51
  # Create session directory
32
52
  mkdir -p "$SESSION_DIR" "$(dirname "$LOG_FILE")"
33
53
 
@@ -0,0 +1,41 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3
+ <plist version="1.0">
4
+ <dict>
5
+ <key>Label</key>
6
+ <string>ai.maestro.memory-watchdog</string>
7
+
8
+ <key>ProgramArguments</key>
9
+ <array>
10
+ <string>/bin/bash</string>
11
+ <string>/Users/sophie/maestro/scripts/watchdog/memory-watchdog.sh</string>
12
+ </array>
13
+
14
+ <key>StartInterval</key>
15
+ <integer>30</integer>
16
+
17
+ <key>RunAtLoad</key>
18
+ <true/>
19
+
20
+ <key>StandardOutPath</key>
21
+ <string>/Users/sophie/maestro/logs/watchdog/launchd-stdout.log</string>
22
+
23
+ <key>StandardErrorPath</key>
24
+ <string>/Users/sophie/maestro/logs/watchdog/launchd-stderr.log</string>
25
+
26
+ <key>EnvironmentVariables</key>
27
+ <dict>
28
+ <key>PATH</key>
29
+ <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
30
+ </dict>
31
+
32
+ <key>Nice</key>
33
+ <integer>10</integer>
34
+
35
+ <key>ProcessType</key>
36
+ <string>Background</string>
37
+
38
+ <key>ThrottleInterval</key>
39
+ <integer>10</integer>
40
+ </dict>
41
+ </plist>
@@ -0,0 +1,148 @@
1
+ #!/bin/bash
2
+ # =============================================================================
3
+ # force-reboot.sh — Remote/Automated Force Reboot for Frozen Mac Mini
4
+ # =============================================================================
5
+ #
6
+ # This script provides multiple escalation levels for rebooting a frozen
7
+ # or unresponsive Mac mini running the Maestro agent system.
8
+ #
9
+ # Usage:
10
+ # ./scripts/watchdog/force-reboot.sh --graceful # Try graceful shutdown first
11
+ # ./scripts/watchdog/force-reboot.sh --force # Immediate reboot (requires sudo)
12
+ # ./scripts/watchdog/force-reboot.sh --status # Check heartbeat and uptime
13
+ #
14
+ # Remote usage (from another machine via SSH):
15
+ # ssh sophie@mac-mini.local "~/maestro/scripts/watchdog/force-reboot.sh --graceful"
16
+ #
17
+ # =============================================================================
18
+ #
19
+ # FORCE-REBOOT OPTIONS WHEN THE MACHINE IS COMPLETELY FROZEN:
20
+ #
21
+ # If the GUI is frozen (and SSH may be unresponsive too), the remaining options are:
22
+ #
23
+ # 1. SMART PLUG (recommended for headless servers):
24
+ # - Install a WiFi smart plug (TP-Link Kasa, Shelly, etc.) on the Mac mini power
25
+ # - Power cycle via the smart plug's app or API
26
+ # - Combined with `pmset autorestart 1` (already configured), the Mac mini
27
+ # will boot automatically when power is restored
28
+ # - Example (TP-Link Kasa):
29
+ # kasa --host <plug-ip> --type plug off && sleep 5 && kasa --host <plug-ip> --type plug on
30
+ # - Example (Shelly):
31
+ # curl -s "http://<shelly-ip>/relay/0?turn=off" && sleep 5 && curl -s "http://<shelly-ip>/relay/0?turn=on"
32
+ #
33
+ # 2. PARSEC (if display is frozen but network is up):
34
+ # - Parsec may still be responsive even if the GUI is frozen
35
+ # - Connect via Parsec and use Cmd+Ctrl+Power to force restart
36
+ #
37
+ # 3. APPLE REMOTE MANAGEMENT (ARD):
38
+ # - If enabled, use "Send UNIX Command" from another Mac:
39
+ # sudo shutdown -r now
40
+ #
41
+ # 4. PHYSICAL POWER CYCLE:
42
+ # - Unplug and replug the Mac mini power cable
43
+ # - With `pmset autorestart 1`, it will boot automatically
44
+ #
45
+ # 5. SSH REBOOT (partial freeze, SSH still works):
46
+ # - ssh sophie@mac-mini.local "sudo reboot"
47
+ # - Or: ssh sophie@mac-mini.local "sudo shutdown -r now"
48
+ #
49
+ # =============================================================================
50
+
51
+ set -euo pipefail
52
+
53
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
54
+ MAESTRO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
55
+
56
+ HEARTBEAT_FILE="$MAESTRO_DIR/state/heartbeat"
57
+ LOG_FILE="$MAESTRO_DIR/logs/watchdog/$(date +%Y-%m-%d)-reboot.log"
58
+
59
# Prints a UTC-timestamped message to stdout and appends it to LOG_FILE.
#
# Logging is best-effort: this script runs under `set -euo pipefail`, so a
# missing log directory or unwritable log file must not abort a reboot.
#
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Returns:   0 always
log() {
  local ts
  ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
  # tee still copies to stdout when it cannot open the file; ignore its status.
  echo "[$ts] $1" | tee -a "$LOG_FILE" || true
}
64
+
65
# Converts a heartbeat timestamp (UTC, format YYYY-MM-DDTHH:MM:SSZ) to epoch
# seconds on stdout. The stamp is parsed as UTC (-u) because the trailing "Z"
# is only a literal in the format string. Returns non-zero if parsing fails.
_heartbeat_epoch() {
  local stamp="$1"
  if date -j -f "%s" "0" "+%s" >/dev/null 2>&1; then
    # BSD/macOS date
    date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$stamp" "+%s" 2>/dev/null
  else
    # GNU date
    date -u -d "$stamp" "+%s" 2>/dev/null
  fi
}

# Reports the age of the heartbeat file and the system uptime.
#
# Globals: HEARTBEAT_FILE (read)
# Outputs: human-readable status on stdout
# Returns: 0 if the heartbeat is at most 120 seconds old; 1 if the file is
#          missing, unparseable, or older than that
check_heartbeat() {
  if [ ! -f "$HEARTBEAT_FILE" ]; then
    echo "No heartbeat file found — watchdog may not be running"
    return 1
  fi

  local hb_time hb_epoch hb_age
  hb_time=$(cat "$HEARTBEAT_FILE")

  if ! hb_epoch=$(_heartbeat_epoch "$hb_time") || [ -z "$hb_epoch" ]; then
    echo "Last heartbeat: $hb_time (unparseable timestamp)"
    echo "System uptime: $(uptime)"
    echo "WARNING: Heartbeat timestamp could not be parsed — watchdog may be stuck"
    return 1
  fi
  hb_age=$(( $(date +%s) - hb_epoch ))

  echo "Last heartbeat: $hb_time (${hb_age}s ago)"
  echo "System uptime: $(uptime)"

  if [ "$hb_age" -gt 120 ]; then
    echo "WARNING: Heartbeat is stale (>2 min) — watchdog may be stuck"
    return 1
  fi
  return 0
}
84
+
85
# Performs an orderly reboot: blocks new work, lets sessions finish, stops
# remaining claude processes, saves state to git, then restarts the machine.
#
# Globals: MAESTRO_DIR (read)
# Outputs: progress via log
# Returns: does not return on success; exits 1 if every reboot method fails
graceful_reboot() {
  # Upper bound (seconds) for the pre-reboot push; a hung network or
  # credential prompt must never prevent the reboot itself.
  local push_timeout=30

  log "Initiating graceful reboot..."

  # 1. Trigger emergency stop to prevent new work
  echo "Memory watchdog initiated graceful reboot" > "$MAESTRO_DIR/.emergency-stop"
  log "Emergency stop flag set"

  # 2. Give running sessions 30 seconds to wrap up
  log "Waiting 30s for running sessions to complete..."
  sleep 30

  # 3. Kill remaining claude processes gracefully
  log "Sending SIGTERM to remaining claude processes..."
  pkill -TERM -f "claude" 2>/dev/null || true
  sleep 5

  # 4. Commit any uncommitted state (best-effort, time-boxed)
  log "Attempting to commit state..."
  (
    # Never run git in whatever directory we happen to be in.
    cd "$MAESTRO_DIR" || exit 1
    git add -A state/ logs/ 2>/dev/null || true
    git commit -m "chore: pre-reboot state save (watchdog-initiated)" 2>/dev/null || true

    # Push in the background with terminal prompts disabled, then wait at
    # most push_timeout seconds before giving up on it.
    GIT_TERMINAL_PROMPT=0 git push origin main 2>/dev/null &
    push_pid=$!
    waited=0
    while kill -0 "$push_pid" 2>/dev/null && [ "$waited" -lt "$push_timeout" ]; do
      sleep 1
      waited=$((waited + 1))
    done
    if kill -0 "$push_pid" 2>/dev/null; then
      kill -TERM "$push_pid" 2>/dev/null || true
    fi
    wait "$push_pid" 2>/dev/null || true
  ) || true

  # 5. Reboot
  log "Rebooting now..."
  sudo shutdown -r now 2>/dev/null || {
    log "sudo shutdown failed — trying osascript..."
    osascript -e 'tell application "System Events" to restart' 2>/dev/null || {
      log "All reboot methods failed — manual intervention required"
      exit 1
    }
  }
}
120
+
121
# Reboots immediately, without waiting for sessions or saving state.
# Tries `sudo reboot`, then `sudo shutdown -r now`; if neither works it
# prints the manual alternatives and exits with status 1.
#
# Globals: MAESTRO_DIR (read)
force_reboot() {
  log "FORCE REBOOT initiated"

  # Best-effort marker; a failure to write it must not stop the reboot.
  echo "Force reboot requested" > "$MAESTRO_DIR/.emergency-stop" 2>/dev/null || true

  if sudo reboot 2>/dev/null; then
    return 0
  fi
  if sudo shutdown -r now 2>/dev/null; then
    return 0
  fi

  log "sudo reboot failed — requires password or system is frozen"
  echo "ERROR: Cannot force reboot without sudo. Options:"
  echo "  1. Run: sudo reboot"
  echo "  2. Power cycle via smart plug"
  echo "  3. Physical power cycle"
  exit 1
}
133
+
134
# Command-line dispatch; with no argument the script reports status.
requested_action="${1:---status}"

if [ "$requested_action" = "--status" ]; then
  check_heartbeat
elif [ "$requested_action" = "--graceful" ]; then
  graceful_reboot
elif [ "$requested_action" = "--force" ]; then
  force_reboot
else
  echo "Usage: $0 [--status|--graceful|--force]"
  exit 1
fi
@@ -0,0 +1,325 @@
1
+ #!/bin/bash
2
+ # =============================================================================
3
+ # memory-watchdog.sh — System Resource Watchdog for Maestro Agent Deployment
4
+ # =============================================================================
5
+ #
6
+ # Monitors memory usage and process count for Claude Code subprocesses.
7
+ # Kills runaway subagents when thresholds are exceeded.
8
+ # Triggers emergency stop at critical levels.
9
+ #
10
+ # Runs every 30 seconds via launchd (ai.maestro.memory-watchdog.plist).
11
+ #
12
+ # Thresholds (configurable via environment or defaults below):
13
+ # WARNING: >60% RAM used by claude processes → log warning
14
+ # CRITICAL: >75% RAM used by claude processes → kill oldest subagents
15
+ # EMERGENCY: >85% total system memory pressure → emergency stop
16
+ #
17
+ # Usage:
18
+ # ./scripts/watchdog/memory-watchdog.sh # Normal run
19
+ # ./scripts/watchdog/memory-watchdog.sh --check # Report status only
20
+ # ./scripts/watchdog/memory-watchdog.sh --dry-run # Show what would be killed
21
+ #
22
+ # =============================================================================
23
+
24
+ set -euo pipefail
25
+
26
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
27
+ MAESTRO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
28
+
29
+ # Auto-detect agent directory (sophie-ai, wundr, etc.)
30
+ # The watchdog runs from maestro but protects the whole machine
31
+ AGENT_DIR="${AGENT_DIR:-}"
32
+
33
+ # --- Configuration -----------------------------------------------------------
34
+
35
+ # Total physical RAM in KB
36
+ TOTAL_RAM_KB=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1 / 1024}')
37
+
38
+ # Thresholds as percentage of total RAM for claude process RSS
39
+ CLAUDE_WARN_PERCENT="${WATCHDOG_WARN_PERCENT:-60}"
40
+ CLAUDE_CRITICAL_PERCENT="${WATCHDOG_CRITICAL_PERCENT:-75}"
41
+
42
+ # System-wide memory pressure threshold (percentage)
43
+ SYSTEM_EMERGENCY_PERCENT="${WATCHDOG_EMERGENCY_PERCENT:-85}"
44
+
45
+ # Max concurrent claude CLI processes (excluding the main interactive session)
46
+ MAX_CLAUDE_PROCS="${WATCHDOG_MAX_CLAUDE_PROCS:-8}"
47
+
48
+ # Minimum age (seconds) before a claude process can be killed
49
+ # Protects freshly-spawned sessions from being killed immediately
50
+ MIN_AGE_SECONDS="${WATCHDOG_MIN_AGE:-60}"
51
+
52
+ # Log file
53
+ LOG_DIR="$MAESTRO_DIR/logs/watchdog"
54
+ LOG_FILE="$LOG_DIR/$(date +%Y-%m-%d)-watchdog.jsonl"
55
+
56
+ # State file for tracking actions across runs
57
+ STATE_FILE="$MAESTRO_DIR/state/watchdog-state.yaml"
58
+
59
+ # --- Helpers -----------------------------------------------------------------
60
+
61
+ mkdir -p "$LOG_DIR" "$(dirname "$STATE_FILE")"
62
+
63
# Appends one JSON object per call to LOG_FILE (JSON Lines).
#
# Globals:   LOG_FILE (read)
# Arguments: $1 - level, $2 - event name, $3 - optional free-text detail
# The detail is JSON-encoded by python3; if that fails, an empty JSON string
# is written instead. Level and event are written verbatim.
log_event() {
  local severity="$1"
  local event_name="$2"
  local message="${3:-}"
  local stamp encoded_detail

  stamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  encoded_detail="$(echo "$message" | python3 -c 'import sys,json; print(json.dumps(sys.stdin.read().strip()))' 2>/dev/null || echo '""')"

  printf '{"ts":"%s","level":"%s","event":"%s","detail":%s}\n' \
    "$stamp" "$severity" "$event_name" "$encoded_detail" \
    >> "$LOG_FILE"
}
71
+
72
+ # --- Memory Metrics ----------------------------------------------------------
73
+
74
# Prints system-wide memory usage as an integer percentage (100 - free%).
#
# Primary source: the "System-wide memory free percentage" line printed by
# the macOS `memory_pressure` tool.
# Fallback: vm_stat. Free, inactive and speculative pages are counted as
# available; counting only "Pages free" would report near-100% usage on a
# healthy system. NOTE(review): this is an approximation of what
# memory_pressure reports — confirm against real machines.
#
# Globals: TOTAL_RAM_KB (read, fallback only)
# Outputs: percentage on stdout; a warning on stderr if nothing could be read
# Returns: 0 always. When no statistics are available it prints 0 so that a
#          broken probe cannot trigger an emergency stop by itself.
get_system_memory_pressure() {
  local free_pct=""

  free_pct=$(memory_pressure 2>/dev/null | awk '
    /System-wide memory free percentage:/ { v = $NF; sub(/%$/, "", v); pct = v }
    END { if (pct != "") print pct }
  ') || free_pct=""
  case "$free_pct" in
    '' | *[!0-9]*) free_pct="" ;;
  esac

  if [ -z "$free_pct" ]; then
    # Fallback: calculate from vm_stat
    free_pct=$(vm_stat 2>/dev/null | awk -v total_kb="${TOTAL_RAM_KB:-0}" '
      NR == 1 && match($0, /page size of [0-9]+/) {
        # 13 = length of the text "page size of "
        page_size = substr($0, RSTART + 13, RLENGTH - 13)
      }
      /^Pages (free|inactive|speculative):/ {
        v = $NF
        sub(/\.$/, "", v)
        pages += v
      }
      END {
        if (page_size > 0 && total_kb > 0) {
          printf "%d\n", (pages * page_size / 1024) * 100 / total_kb
        }
      }
    ') || free_pct=""
    case "$free_pct" in
      '' | *[!0-9]*) free_pct="" ;;
    esac
  fi

  if [ -z "$free_pct" ]; then
    echo "get_system_memory_pressure: unable to read memory statistics; reporting 0" >&2
    echo 0
    return 0
  fi

  if [ "$free_pct" -gt 100 ]; then
    free_pct=100
  fi
  echo $(( 100 - free_pct ))
}
89
+
90
# Lists claude CLI processes, one per line, largest RSS first.
#
# Output columns: PID RSS_KB ELAPSED_SECONDS COMMAND
# (COMMAND is the first whitespace-separated word of the ps `comm` field.)
#
# A line is included when it contains "claude" (case-insensitive) and does not
# mention Claude.app, Claude Helper, grep or watchdog.
#
# The filter runs inside awk rather than grep because grep exits 1 when
# nothing matches; under `set -euo pipefail` that made every caller fail —
# and the watchdog exit — whenever no claude process was running.
#
# Returns: 0 with empty output when there are no matching processes.
get_claude_processes() {
  /bin/ps -eo pid,rss,etime,comm | awk '
    tolower($0) !~ /claude/ { next }
    /Claude\.app|Claude Helper|grep|watchdog/ { next }
    {
      # etime is [[DD-]HH:]MM:SS; peel off the optional day prefix first.
      days = 0
      clock = $3
      if (split(clock, day_split, "-") == 2) {
        days = day_split[1]
        clock = day_split[2]
      }
      n = split(clock, unit, ":")
      secs = 0
      for (i = 1; i <= n; i++) {
        secs = secs * 60 + unit[i]
      }
      secs += days * 86400
      printf "%s %s %d %s\n", $1, $2, secs, $4
    }
  ' | sort -k2,2nr
}
121
+
122
# Prints the summed RSS (KB) of all processes listed by get_claude_processes;
# prints 0 when the list is empty.
get_total_claude_rss_kb() {
  get_claude_processes \
    | awk 'BEGIN { total_kb = 0 } { total_kb += $2 } END { print total_kb }'
}
125
+
126
# Prints how many lines get_claude_processes produced, without padding.
count_claude_processes() {
  get_claude_processes \
    | wc -l \
    | awk '{ print $1 }'
}
129
+
130
+ # --- Identification ----------------------------------------------------------
131
+
132
# Prints the PID of the claude process assumed to be the main interactive
# session, or nothing when no claude process is listed.
#
# Heuristic: the longest-running process (largest ELAPSED_SECONDS, column 3 of
# get_claude_processes). The list arrives sorted by RSS, so taking its last
# line would select the smallest process rather than the oldest one; the
# maximum age is therefore computed explicitly here. On equal ages the first
# listed process wins.
identify_main_session() {
  # An empty or failed listing simply means "no main session".
  { get_claude_processes || true; } | awk '
    NF >= 3 && (!seen || $3 + 0 > max_age) {
      seen = 1
      max_age = $3 + 0
      pid = $1
    }
    END { if (seen) print pid }
  '
}
140
+
141
+ # --- Actions -----------------------------------------------------------------
142
+
143
# Terminates up to target_count claude subagents.
#
# Despite the name, candidates are taken in the order get_claude_processes
# lists them (largest RSS first). The main session and processes younger than
# MIN_AGE_SECONDS are never selected.
#
# Globals:   MIN_AGE_SECONDS (read)
# Arguments: $1 - maximum number of processes to terminate
#            $2 - "true" for dry run (report only); default "false"
# Outputs:   one line per selected process, then the number actually killed
#
# SIGTERM is sent first. The function then waits up to ~5 seconds for the
# targets to exit and sends SIGKILL to any survivor. This is done in the
# foreground: a backgrounded escalation is lost when the script exits under
# launchd, which kills leftover processes in the job's process group.
kill_oldest_subagents() {
  local target_count="$1"
  local dry_run="${2:-false}"
  local main_pid
  main_pid=$(identify_main_session)
  local killed=0
  local selected=0
  local -a pending=()
  local pid rss age comm rss_mb

  while IFS=' ' read -r pid rss age comm; do
    # A here-string of empty output still yields one empty line.
    if [ -z "$pid" ]; then
      continue
    fi
    if [ "$pid" = "$main_pid" ]; then
      continue
    fi
    if [ "$age" -lt "$MIN_AGE_SECONDS" ]; then
      log_event "info" "skip_young" "PID $pid age ${age}s < ${MIN_AGE_SECONDS}s minimum"
      continue
    fi
    # Counted in dry-run mode too, so the preview matches a real run.
    if [ "$selected" -ge "$target_count" ]; then
      break
    fi
    selected=$((selected + 1))

    rss_mb=$(( rss / 1024 ))
    if [ "$dry_run" = "true" ]; then
      echo "  [DRY RUN] Would kill PID $pid (${rss_mb}MB, age ${age}s)"
      log_event "info" "dry_run_kill" "PID=$pid RSS=${rss_mb}MB age=${age}s"
    else
      echo "  Killing PID $pid (${rss_mb}MB, age ${age}s)"
      kill -TERM "$pid" 2>/dev/null || true
      log_event "warn" "killed_subagent" "PID=$pid RSS=${rss_mb}MB age=${age}s signal=TERM"
      killed=$((killed + 1))
      pending+=("$pid")
    fi
  done <<< "$(get_claude_processes)"

  if [ "${#pending[@]}" -gt 0 ]; then
    # Give the targets up to 5 seconds (10 x 0.5s) to exit gracefully.
    local polls=0 any_alive target
    while [ "$polls" -lt 10 ]; do
      any_alive="false"
      for target in "${pending[@]}"; do
        if kill -0 "$target" 2>/dev/null; then
          any_alive="true"
          break
        fi
      done
      if [ "$any_alive" = "false" ]; then
        break
      fi
      sleep 0.5
      polls=$((polls + 1))
    done

    # Then SIGKILL whatever is still running.
    for target in "${pending[@]}"; do
      if kill -0 "$target" 2>/dev/null; then
        kill -9 "$target" 2>/dev/null || true
        log_event "warn" "force_killed" "PID=$target did not exit after SIGTERM"
      fi
    done
  fi

  echo "$killed"
}
189
+
190
# Raises the emergency stop: writes the reason to the stop-flag files and
# sends SIGTERM to every listed claude process except the main session.
#
# Globals:   MAESTRO_DIR (read), HOME (read),
#            AGENT_DIR (read, optional) - additional agent directory to flag
# Arguments: $1 - reason text stored in the flag files
# Outputs:   "EMERGENCY STOP: <reason>" on stdout when newly triggered
# Returns:   0; if the flag already exists in MAESTRO_DIR it only logs that
#            and does nothing else
trigger_emergency_stop() {
  local reason="$1"
  local stop_file="$MAESTRO_DIR/.emergency-stop"

  if [ -f "$stop_file" ]; then
    log_event "warn" "emergency_already_active" "$reason"
    return 0
  fi

  echo "$reason" > "$stop_file"
  log_event "critical" "emergency_stop_triggered" "$reason"

  # Also create the flag in the configured agent directory (when AGENT_DIR is
  # set) and in the known agent directories under $HOME.
  local agent_dir
  for agent_dir in "${AGENT_DIR:-}" "$HOME"/sophie-ai "$HOME"/wundr; do
    if [ -n "$agent_dir" ] && [ -d "$agent_dir" ]; then
      echo "$reason" > "$agent_dir/.emergency-stop" 2>/dev/null || true
    fi
  done

  # Kill ALL non-main claude subprocesses
  local main_pid
  main_pid=$(identify_main_session)
  local pid rss age comm
  while IFS=' ' read -r pid rss age comm; do
    if [ "$pid" != "$main_pid" ] && [ -n "$pid" ]; then
      kill -TERM "$pid" 2>/dev/null || true
    fi
  done <<< "$(get_claude_processes)"

  echo "EMERGENCY STOP: $reason"
}
220
+
221
+ # --- Main Logic --------------------------------------------------------------
222
+
223
# Runs one watchdog pass: collects metrics, then applies the first matching
# escalation level.
#
# Arguments: $1 - mode: "run" (default), "--check" (print a status report and
#                 take no action) or "--dry-run" (show what would be done)
# Globals:   TOTAL_RAM_KB, CLAUDE_WARN_PERCENT, CLAUDE_CRITICAL_PERCENT,
#            SYSTEM_EMERGENCY_PERCENT, MAX_CLAUDE_PROCS, MAESTRO_DIR (all read)
# Returns:   0 after a completed pass; 2 for an unrecognised mode. Unknown
#            modes are rejected so that a mistyped flag (e.g. "--chek") cannot
#            fall through to a real run that kills processes.
#
# Levels, checked in this order (each one returns after acting):
#   1. system memory pressure >= emergency threshold -> emergency stop
#   2. claude RSS >= critical threshold              -> kill subagents
#   3. claude process count > maximum                -> kill subagents
#   4. claude RSS >= warning threshold               -> log only
run_watchdog() {
  local mode="${1:-run}"
  local dry_run="false"

  case "$mode" in
    run) ;;
    --dry-run | --check) dry_run="true" ;;
    *)
      echo "Usage: $0 [--check|--dry-run]" >&2
      return 2
      ;;
  esac

  # Collect metrics
  local claude_rss_kb claude_count system_pressure
  claude_rss_kb=$(get_total_claude_rss_kb)
  claude_count=$(count_claude_processes)
  system_pressure=$(get_system_memory_pressure)

  local claude_rss_mb=$(( claude_rss_kb / 1024 ))
  local claude_pct=$(( (claude_rss_kb * 100) / TOTAL_RAM_KB ))
  local warn_threshold_mb=$(( (TOTAL_RAM_KB * CLAUDE_WARN_PERCENT) / 100 / 1024 ))
  local critical_threshold_mb=$(( (TOTAL_RAM_KB * CLAUDE_CRITICAL_PERCENT) / 100 / 1024 ))

  # Report
  if [ "$mode" = "--check" ]; then
    echo "=== Memory Watchdog Status ==="
    echo "Total RAM:           $(( TOTAL_RAM_KB / 1024 ))MB"
    echo "Claude processes:    $claude_count (max: $MAX_CLAUDE_PROCS)"
    echo "Claude RSS total:    ${claude_rss_mb}MB (${claude_pct}% of RAM)"
    echo "  Warning threshold: ${warn_threshold_mb}MB (${CLAUDE_WARN_PERCENT}%)"
    echo "  Critical threshold: ${critical_threshold_mb}MB (${CLAUDE_CRITICAL_PERCENT}%)"
    echo "System mem pressure: ${system_pressure}% (emergency at ${SYSTEM_EMERGENCY_PERCENT}%)"
    echo ""
    echo "Claude processes:"
    get_claude_processes | while IFS=' ' read -r pid rss age comm; do
      echo "  PID $pid: $(( rss / 1024 ))MB, age ${age}s — $comm"
    done
    echo "=============================="
    return 0
  fi

  # --- Level 1: System emergency (memory pressure) ---------------------------
  if [ "$system_pressure" -ge "$SYSTEM_EMERGENCY_PERCENT" ]; then
    log_event "critical" "system_memory_emergency" "pressure=${system_pressure}% >= ${SYSTEM_EMERGENCY_PERCENT}%"
    if [ "$dry_run" = "false" ]; then
      trigger_emergency_stop "Memory pressure ${system_pressure}% exceeded ${SYSTEM_EMERGENCY_PERCENT}% emergency threshold"
    else
      echo "[DRY RUN] Would trigger EMERGENCY STOP (pressure=${system_pressure}%)"
    fi
    return 0
  fi

  # --- Level 2: Claude RSS critical — kill biggest subagents -----------------
  if [ "$claude_pct" -ge "$CLAUDE_CRITICAL_PERCENT" ]; then
    log_event "warn" "claude_memory_critical" "RSS=${claude_rss_mb}MB (${claude_pct}%) >= ${CLAUDE_CRITICAL_PERCENT}%"
    echo "CRITICAL: Claude processes using ${claude_rss_mb}MB (${claude_pct}% of RAM)"

    # Kill enough to get below warning threshold
    local target_rss_kb=$(( (TOTAL_RAM_KB * CLAUDE_WARN_PERCENT) / 100 ))
    local excess_kb=$(( claude_rss_kb - target_rss_kb ))
    # Estimate ~600MB per process (614400 KB), plus one for the remainder
    local procs_to_kill=$(( (excess_kb / 614400) + 1 ))
    if [ "$procs_to_kill" -lt 1 ]; then
      procs_to_kill=1
    fi

    echo "  Killing $procs_to_kill subagent(s) to free ~$(( excess_kb / 1024 ))MB..."
    kill_oldest_subagents "$procs_to_kill" "$dry_run"
    return 0
  fi

  # --- Level 3: Too many concurrent processes --------------------------------
  if [ "$claude_count" -gt "$MAX_CLAUDE_PROCS" ]; then
    local excess=$(( claude_count - MAX_CLAUDE_PROCS ))
    log_event "warn" "too_many_claude_procs" "count=${claude_count} > max=${MAX_CLAUDE_PROCS}"
    echo "WARNING: $claude_count claude processes running (max: $MAX_CLAUDE_PROCS)"
    echo "  Killing $excess oldest subagent(s)..."
    kill_oldest_subagents "$excess" "$dry_run"
    return 0
  fi

  # --- Level 4: Claude RSS warning — log only --------------------------------
  if [ "$claude_pct" -ge "$CLAUDE_WARN_PERCENT" ]; then
    log_event "warn" "claude_memory_warning" "RSS=${claude_rss_mb}MB (${claude_pct}%) >= ${CLAUDE_WARN_PERCENT}%"
    return 0
  fi

  # --- All clear -------------------------------------------------------------
  # Only log periodically (every 5 minutes = every 10th run at 30s interval)
  local run_count_file="$MAESTRO_DIR/state/watchdog-run-count"
  local run_count=0
  if [ -f "$run_count_file" ]; then
    run_count=$(cat "$run_count_file" 2>/dev/null) || run_count=0
  fi
  # A truncated or corrupted counter restarts at 0 instead of breaking the
  # arithmetic below.
  case "$run_count" in
    '' | *[!0-9]*) run_count=0 ;;
  esac
  run_count=$(( (10#$run_count + 1) % 10 ))
  echo "$run_count" > "$run_count_file"

  if [ "$run_count" -eq 0 ]; then
    log_event "info" "healthy" "claude_procs=${claude_count} rss=${claude_rss_mb}MB (${claude_pct}%) pressure=${system_pressure}%"
  fi
}
314
+
315
+ # --- Write heartbeat ---------------------------------------------------------
316
+
317
# Records the current UTC time (YYYY-MM-DDTHH:MM:SSZ) in the heartbeat file,
# replacing any previous content.
#
# Globals: MAESTRO_DIR (read)
write_heartbeat() {
  local beat_path="$MAESTRO_DIR/state/heartbeat"
  local stamp
  stamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s\n' "$stamp" > "$beat_path"
}
321
+
322
+ # --- Entry point -------------------------------------------------------------
323
+
324
+ write_heartbeat
325
+ run_watchdog "${1:-run}"