nexo-brain 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,645 @@
1
+ #!/bin/bash
2
+ # ============================================================================
3
+ # NEXO Watchdog — Health monitor with two-level auto-repair
4
+ # ============================================================================
5
+ # Monitors all NEXO core LaunchAgents, cron jobs, and infrastructure.
6
+ # Level 1: Mechanical repair (launchctl bootstrap/kickstart, chmod)
7
+ # Level 2: Launches NEXO CLI for intelligent diagnosis and fix
8
+ #
9
+ # Install: Add to LaunchAgents for periodic execution (every 5 min recommended)
10
+ # ============================================================================
11
# Unset variables are errors and a pipeline fails if any stage fails.
# `-e` is not enabled, so a failing check does not abort the whole run.
set -uo pipefail

# === PATHS ===
HOME_DIR="$HOME"
NEXO_DIR="$HOME_DIR/claude/nexo-mcp"
OPS_DIR="$HOME_DIR/claude/operations"
LOG_DIR="$HOME_DIR/claude/logs"
LOG="$LOG_DIR/watchdog.log"
STATUS_JSON="$OPS_DIR/watchdog-status.json"
REPORT_TXT="$OPS_DIR/watchdog-report.txt"
ALERT_FILE="$OPS_DIR/.watchdog-alert"
FAIL_COUNT_FILE="$HOME_DIR/claude/scripts/.watchdog-fails"
MAX_FAILS=3

# Create every directory this script writes state into. The directory holding
# the failure counter (which also holds the Level 2 repair lock) was previously
# not created, so on a fresh install those writes failed.
mkdir -p "$LOG_DIR" "$OPS_DIR" "${FAIL_COUNT_FILE%/*}"

# One timestamp per run, so every log line of a run carries the same stamp and
# all file ages are measured against the same instant.
TS=$(date "+%Y-%m-%d %H:%M:%S")
TS_EPOCH=$(date +%s)

# Append one timestamped line to the main watchdog log.
#   $1 - message text
log() { echo "[$TS] $1" >> "$LOG"; }
31
+
32
+ # ============================================================================
33
+ # HELPER FUNCTIONS
34
+ # ============================================================================
35
+
36
# Numeric id of the current user; the repair helpers use it to build the
# launchctl "gui/<uid>" target.
UID_NUM="$(id -u)"
# Dedicated log that only receives repair actions.
REPAIR_LOG="$LOG_DIR/watchdog-repairs.log"
# Number of items fixed automatically during this run.
TOTAL_HEALED=0

# Record a repair action in the repairs log and mirror it into the main log.
#   $1 - description of the repair
log_repair() {
  local entry="REPAIR: $1"
  echo "[$TS] $entry" >> "$REPAIR_LOG"
  log "$entry"
}
41
+
42
# Succeed when `launchctl list` accepts the given job label.
#   $1 - launchd label (e.g. com.nexo.immune)
# All launchctl output is discarded; only the exit status is used.
is_loaded() {
  launchctl list "$1" >/dev/null 2>&1
}
45
+
46
# Print the age of a file in seconds, measured against TS_EPOCH.
#   $1 - path to inspect
# Outputs: seconds since last modification, or the sentinel 999999 when the
#          path is not a regular file.
# Globals: TS_EPOCH (read); falls back to the current time if it is unset.
file_age() {
  local path=$1
  local now=${TS_EPOCH:-}
  local mtime

  if [ ! -f "$path" ]; then
    echo 999999
    return 0
  fi
  [ -n "$now" ] || now=$(date +%s)

  # Try the GNU form first: BSD/macOS stat rejects -c without writing anything
  # to stdout, whereas GNU `stat -f %m FILE` prints filesystem details for FILE
  # before failing, which used to pollute the captured value on Linux.
  mtime=$(stat -c %Y "$path" 2>/dev/null) \
    || mtime=$(stat -f %m "$path" 2>/dev/null) \
    || mtime=0

  # Never feed anything but digits into the arithmetic below.
  case "$mtime" in
    '' | *[!0-9]*) mtime=0 ;;
  esac

  echo $(( now - mtime ))
}
56
+
57
# Render an age in seconds as a short human-readable string.
#   $1 - age in seconds (integer)
# Outputs: "never" for the 999999 sentinel and above, otherwise
#          "Nd Nh ago", "Nh Nm ago", "Nm ago" or "Ns ago".
#          "unknown" when the argument is missing or not an integer.
# Negative ages (file timestamps in the future) are clamped to 0.
format_age() {
  local secs=${1:-}

  if ! [[ "$secs" =~ ^-?[0-9]+$ ]]; then
    echo "unknown"
    return 0
  fi

  if [[ "$secs" == -* ]]; then
    secs=0
  else
    # Force base 10 so a value with leading zeros is not read as octal.
    secs=$((10#$secs))
  fi

  if (( secs >= 999999 )); then
    echo "never"
  elif (( secs >= 86400 )); then
    echo "$((secs / 86400))d $((secs % 86400 / 3600))h ago"
  elif (( secs >= 3600 )); then
    echo "$((secs / 3600))h $((secs % 3600 / 60))m ago"
  elif (( secs >= 60 )); then
    echo "$((secs / 60))m ago"
  else
    echo "${secs}s ago"
  fi
}
71
+
72
# Count lines matching ERROR_PATTERNS within the last 50 lines of a log file.
#   $1 - path to the log file
# Outputs: exactly one non-negative integer on stdout (0 when the file is
#          missing, empty, or has no matching lines).
# Globals: ERROR_PATTERNS (read) - extended regex of error markers.
check_errors() {
  local logfile="$1"
  local hits

  if [ -f "$logfile" ] && [ -s "$logfile" ]; then
    # `grep -c` already prints "0" when nothing matches but exits 1; adding
    # `|| echo 0` therefore produced "0\n0". Capture the count and normalise it
    # instead of reacting to the exit status.
    hits=$(tail -n 50 "$logfile" 2>/dev/null | grep -cE -- "$ERROR_PATTERNS" 2>/dev/null)
    case "$hits" in
      '' | *[!0-9]*) hits=0 ;;
    esac
    echo "$hits"
  else
    echo 0
  fi
}
80
+
81
# Succeed when at least one process command line matches the given pattern.
#   $1 - pattern handed to `pgrep -f` (matched against the full command line)
# Returns: 0 if a match exists, non-zero otherwise or when the pattern is empty.
process_running() {
  [ -n "${1:-}" ] || return 1
  # `--` keeps a pattern that starts with "-" from being parsed as an option.
  pgrep -f -- "$1" > /dev/null 2>&1
}
88
+
89
# Make a string safe to embed between double quotes in a JSON document.
#   $1 - raw text
# Outputs: the text on a single line with backslashes and double quotes
#          escaped, newlines/tabs/carriage returns turned into spaces and any
#          remaining ASCII control characters removed. No trailing newline or
#          padding is added.
json_escape() {
  local s=${1-}

  # Backslashes first, otherwise the ones added for quotes would be doubled.
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/ }
  s=${s//$'\r'/ }
  s=${s//$'\t'/ }

  # printf instead of echo so values such as "-n" are emitted verbatim.
  # Raw control characters are not allowed inside JSON strings; drop the rest.
  printf '%s' "$s" | LC_ALL=C tr -d '\001-\037'
}
92
+
93
+ # ============================================================================
94
+ # AUTO-REPAIR FUNCTIONS
95
+ # ============================================================================
96
+
97
# Level 1 repair for a LaunchAgent.
#   $1 - launchd label
#   $2 - process pattern (may be empty for agents without a long-running process)
# Returns: 0 when a repair was performed and verified, 1 otherwise.
# Not loaded  -> bootstrap the plist from ~/Library/LaunchAgents.
# Loaded but the process is absent -> kickstart the job.
try_repair_launchagent() {
  local label="$1"
  local pattern="$2"
  local agent_plist="$HOME_DIR/Library/LaunchAgents/${label}.plist"

  if ! is_loaded "$label"; then
    # Nothing to bootstrap from when the plist file is missing.
    [ -f "$agent_plist" ] || return 1
    launchctl bootstrap "gui/$UID_NUM" "$agent_plist" 2>/dev/null
    sleep 1
    is_loaded "$label" || return 1
    log_repair "$label: bootstrapped successfully"
    return 0
  fi

  # From here on the agent is loaded; only a missing process can be repaired.
  [ -n "$pattern" ] || return 1
  if process_running "$pattern"; then
    return 1
  fi

  launchctl kickstart "gui/$UID_NUM/$label" 2>/dev/null
  sleep 2
  process_running "$pattern" || return 1
  log_repair "$label: kickstarted process '$pattern'"
  return 0
}
127
+
128
# Level 1 repair for a cron script: restore a missing executable bit.
#   $1 - path to the script
# Returns: 0 when the file exists, was not executable and now is; 1 otherwise
#          (including when the file does not exist or was already executable).
try_repair_cron() {
  local target="$1"

  [ -f "$target" ] || return 1
  [ ! -x "$target" ] || return 1

  chmod +x "$target"
  [ -x "$target" ] || return 1

  log_repair "$target: made executable"
  return 0
}
141
+
142
# Level 1 repair for stale backups: run the backup script once and confirm
# that a backup file younger than 60 seconds exists afterwards.
# Returns: 0 when a fresh backup is found, 1 otherwise.
# Globals: NEXO_DIR (read).
try_repair_backup() {
  local runner="$NEXO_DIR/backup_cron.sh"
  local latest
  local latest_age

  [ -x "$runner" ] || return 1

  "$runner" 2>/dev/null
  sleep 1

  # Most recently modified backup, if any.
  latest=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
  [ -n "$latest" ] || return 1

  latest_age=$(file_age "$latest")
  [ "$latest_age" -lt 60 ] || return 1

  log_repair "backup_cron.sh: ran successfully, fresh backup created"
  return 0
}
160
+
161
+ # ============================================================================
162
+ # MONITOR REGISTRY — NEXO Core Services
163
+ # ============================================================================
164
+ # Format: NAME|PLIST_ID|LOG_STDOUT|LOG_STDERR|MAX_STALE_SECS|PROCESS_GREP|SCHEDULE_DESC
165
+ #
166
+ # Users can add custom monitors in ~/claude/config/watchdog-monitors.conf
167
+ # (same format, one per line, # for comments)
168
+ # ============================================================================
169
# Built-in monitor records, one entry per NEXO LaunchAgent. Fields are
# pipe-separated:
#   NAME|PLIST_ID|LOG_STDOUT|LOG_STDERR|MAX_STALE_SECS|PROCESS_GREP|SCHEDULE_DESC
MONITORS=()
MONITORS+=("Auto-Close Sessions|com.nexo.auto-close-sessions|$HOME_DIR/claude/coordination/auto-close-stdout.log|$HOME_DIR/claude/coordination/auto-close-stderr.log|900||Every 5 min")
MONITORS+=("Catchup|com.nexo.catchup|$HOME_DIR/claude/logs/catchup-stdout.log|$HOME_DIR/claude/logs/catchup-stderr.log|0||RunAtLoad once")
MONITORS+=("Cognitive Decay|com.nexo.cognitive-decay|$HOME_DIR/claude/logs/cognitive-decay-stdout.log|$HOME_DIR/claude/logs/cognitive-decay-stderr.log|90000||Daily 3:00 AM")
MONITORS+=("Evolution|com.nexo.evolution|$HOME_DIR/claude/logs/evolution-stdout.log|$HOME_DIR/claude/logs/evolution-stderr.log|0||Weekly Sun 3:00 AM")
MONITORS+=("GitHub Monitor|com.nexo.github-monitor|$HOME_DIR/claude/logs/github-monitor-stdout.log|$HOME_DIR/claude/logs/github-monitor-stderr.log|90000||Daily 8:00 AM")
MONITORS+=("Immune|com.nexo.immune|$HOME_DIR/claude/coordination/immune-stdout.log|$HOME_DIR/claude/coordination/immune-stderr.log|3600||Every 30 min")
MONITORS+=("Postmortem|com.nexo.postmortem|$HOME_DIR/claude/logs/postmortem-stdout.log|$HOME_DIR/claude/logs/postmortem-stderr.log|90000||Daily 23:30")
MONITORS+=("Prevent Sleep|com.nexo.prevent-sleep|||0|caffeinate|KeepAlive")
MONITORS+=("Self Audit|com.nexo.self-audit|$HOME_DIR/claude/logs/self-audit-stdout.log|$HOME_DIR/claude/logs/self-audit-stderr.log|90000||Daily 7:00 AM")
MONITORS+=("Sleep|com.nexo.sleep|$HOME_DIR/claude/coordination/sleep-stdout.log|$HOME_DIR/claude/coordination/sleep-stderr.log|90000||Daily 4:00 AM")
MONITORS+=("Synthesis|com.nexo.synthesis|$HOME_DIR/claude/coordination/synthesis-stdout.log|$HOME_DIR/claude/coordination/synthesis-stderr.log|10800||Every 2 hours")
182
+
183
# Load user-defined monitors if file exists
USER_MONITORS_FILE="$HOME_DIR/claude/config/watchdog-monitors.conf"

# Append the monitor records found in a config file to MONITORS.
#   $1 - path to the config file (same record format as MONITORS, one per line)
# Blank lines, whitespace-only lines and lines whose first non-blank character
# is "#" are ignored. A missing file is not an error.
# Globals: MONITORS (appended).
load_user_monitors() {
  local conf=$1
  local line
  local skip_re='^[[:space:]]*(#|$)'

  [ -f "$conf" ] || return 0

  # `|| [ -n "$line" ]` keeps the final record when the file does not end with
  # a newline; plain `read` reports failure for that last line.
  while IFS= read -r line || [ -n "$line" ]; do
    # Tolerate files saved with CRLF line endings.
    line=${line%$'\r'}
    if [[ "$line" =~ $skip_re ]]; then
      continue
    fi
    MONITORS+=("$line")
  done < "$conf"
}

load_user_monitors "$USER_MONITORS_FILE"
192
+
193
# Cron job records, pipe-separated:
#   NAME|SCRIPT|CHECK_PATH|MAX_STALE_SECS|SCHEDULE
# CHECK_PATH is either a directory (its newest entry is inspected) or a file.
CRON_MONITORS=()
CRON_MONITORS+=("Backup Cron|$NEXO_DIR/backup_cron.sh|$NEXO_DIR/backups/|7200|Hourly")

# Extended regex of markers counted as errors in the last 50 lines of a
# monitor's stderr log.
ERROR_PATTERNS="Traceback|Error:|CRITICAL|FATAL|ModuleNotFoundError|PermissionError|FileNotFoundError|ConnectionRefused|Errno"
200
+
201
+ # ============================================================================
202
+ # RUN CHECKS
203
+ # ============================================================================
204
+
205
TOTAL_PASS=0
TOTAL_WARN=0
TOTAL_FAIL=0
JSON_AGENTS=""
REPORT_LINES=""
FAILED_MONITORS=() # Track failed monitors for Level 2 repair

# Escape a value for use as a JSON string field.
# json_escape may leave one trailing blank (a newline translated to a space);
# drop it so names and schedules are emitted exactly as configured.
_json_field() {
  local escaped
  escaped=$(json_escape "$1")
  printf '%s' "${escaped% }"
}

# Run every check for one monitor record and fold the result into the run
# totals, the JSON fragment list and the text report.
#   $1 - record "NAME|PLIST_ID|LOG_STDOUT|LOG_STDERR|MAX_STALE_SECS|PROCESS_GREP|SCHEDULE_DESC"
# Globals written: TOTAL_PASS, TOTAL_WARN, TOTAL_FAIL, TOTAL_HEALED,
#                  JSON_AGENTS, REPORT_LINES, FAILED_MONITORS.
check_monitor() {
  local name plist_id log_stdout log_stderr max_stale proc_grep schedule
  IFS='|' read -r name plist_id log_stdout log_stderr max_stale proc_grep schedule <<< "$1"

  local status="PASS" details="" loaded="unknown" stale_age="n/a"
  local error_count=0 proc_alive="n/a"
  local age icon json_item

  # Records can come from a user-edited file: a missing or non-numeric limit
  # means "no staleness limit" instead of an integer-comparison error.
  [[ "$max_stale" =~ ^[0-9]+$ ]] || max_stale=0
  max_stale=$((10#$max_stale))

  # Check 1: LaunchAgent loaded?
  if is_loaded "$plist_id"; then
    loaded="yes"
  else
    loaded="no"
    if try_repair_launchagent "$plist_id" "$proc_grep"; then
      loaded="yes"
      status="HEALED"
      details="${details}Self-healed: bootstrapped. "
      TOTAL_HEALED=$((TOTAL_HEALED + 1))
    else
      status="FAIL"
      details="${details}Not loaded in launchctl (repair failed). "
    fi
  fi

  # Check 2: Process alive? (only for KeepAlive / long-running)
  if [ -n "$proc_grep" ]; then
    if process_running "$proc_grep"; then
      proc_alive="yes"
    else
      proc_alive="no"
      if [ "$status" != "FAIL" ] && [ "$status" != "HEALED" ]; then
        if try_repair_launchagent "$plist_id" "$proc_grep"; then
          proc_alive="yes"
          status="HEALED"
          details="${details}Self-healed: kickstarted. "
          TOTAL_HEALED=$((TOTAL_HEALED + 1))
        else
          status="WARN"
          details="${details}Process '$proc_grep' not running (repair failed). "
        fi
      elif [ "$status" = "HEALED" ]; then
        # Just bootstrapped: give the process a moment before re-checking.
        sleep 1
        if process_running "$proc_grep"; then
          proc_alive="yes"
        else
          details="${details}Process '$proc_grep' still not running after bootstrap. "
        fi
      fi
    fi
  fi

  # Check 3: Log staleness (FAIL beyond 3x the limit, WARN beyond the limit)
  if [ -n "$log_stdout" ] && [ "$max_stale" -gt 0 ]; then
    age=$(file_age "$log_stdout")
    stale_age=$(format_age "$age")
    if [ "$age" -gt $(( max_stale * 3 )) ]; then
      status="FAIL"
      details="${details}Log stale: $stale_age (limit: $(format_age "$max_stale")). "
    elif [ "$age" -gt "$max_stale" ]; then
      [ "$status" = "PASS" ] && status="WARN"
      details="${details}Log slightly stale: $stale_age. "
    fi
  elif [ -n "$log_stdout" ]; then
    # No limit configured: report the age for information only.
    if [ -f "$log_stdout" ]; then
      age=$(file_age "$log_stdout")
      stale_age=$(format_age "$age")
    else
      stale_age="no log file"
    fi
  fi

  # Check 4: Errors in stderr log
  if [ -n "$log_stderr" ]; then
    error_count=$(check_errors "$log_stderr")
    # The count is emitted as a bare JSON number and used in an integer test,
    # so reduce it to its first line and insist on digits.
    error_count=${error_count%%$'\n'*}
    [[ "$error_count" =~ ^[0-9]+$ ]] || error_count=0
    if [ "$error_count" -gt 5 ]; then
      [ "$status" = "PASS" ] && status="WARN"
      details="${details}${error_count} errors in recent stderr. "
    fi
  fi

  [ -z "$details" ] && details="All checks passed"

  case "$status" in
    PASS | HEALED) TOTAL_PASS=$((TOTAL_PASS + 1)) ;;
    WARN) TOTAL_WARN=$((TOTAL_WARN + 1)) ;;
    FAIL)
      TOTAL_FAIL=$((TOTAL_FAIL + 1))
      FAILED_MONITORS+=("${name}|${plist_id}|${log_stdout}|${log_stderr}|${proc_grep}|${schedule}|${details}")
      ;;
  esac

  # JSON
  printf -v json_item \
    '    {"name":"%s","plist":"%s","status":"%s","loaded":"%s","process":"%s","last_activity":"%s","stderr_errors":%s,"schedule":"%s","details":"%s"}' \
    "$(_json_field "$name")" "$(_json_field "$plist_id")" "$status" "$loaded" \
    "$proc_alive" "$stale_age" "$error_count" "$(_json_field "$schedule")" \
    "$(json_escape "$details")"
  if [ -n "$JSON_AGENTS" ]; then
    JSON_AGENTS="${JSON_AGENTS},
${json_item}"
  else
    JSON_AGENTS="$json_item"
  fi

  # Report
  case "$status" in
    PASS) icon="PASS" ;;
    HEALED) icon="HEAL" ;;
    WARN) icon="WARN" ;;
    FAIL) icon="FAIL" ;;
    *) icon="????" ;;
  esac
  REPORT_LINES="${REPORT_LINES} [${icon}] ${name} (${schedule})
        Loaded: ${loaded} | Process: ${proc_alive} | Last: ${stale_age} | Errors: ${error_count}
        ${details}
"
}

# The ${arr[@]+...} form keeps an empty list from tripping `set -u` on bash 3.2.
for monitor in ${MONITORS[@]+"${MONITORS[@]}"}; do
  [[ "$monitor" =~ ^[[:space:]]*# ]] && continue
  check_monitor "$monitor"
done
321
+
322
+ # --- Cron job checks ---
323
CRON_JSON=""
CRON_REPORT=""

# Run the checks for one cron record and fold the result into the run totals,
# the cron JSON fragment list and the cron section of the text report.
#   $1 - record "NAME|SCRIPT|CHECK_PATH|MAX_STALE_SECS|SCHEDULE"
# CHECK_PATH may be a directory (its newest entry is inspected) or a file.
# Globals written: TOTAL_PASS, TOTAL_WARN, TOTAL_FAIL, TOTAL_HEALED,
#                  CRON_JSON, CRON_REPORT.
check_cron_job() {
  local name script check_path max_stale schedule
  IFS='|' read -r name script check_path max_stale schedule <<< "$1"

  local c_status="PASS" c_details="" age_str="n/a"
  local newest="" target="" age icon cron_item escaped_details

  # The script itself must be executable; try the chmod repair once.
  if [ ! -x "$script" ]; then
    if try_repair_cron "$script"; then
      c_status="HEALED"
      c_details="Self-healed: made executable. "
      TOTAL_HEALED=$((TOTAL_HEALED + 1))
    else
      c_status="FAIL"
      c_details="Script not executable or missing (repair failed). "
    fi
  fi

  # Decide which file's modification time represents the job's last output.
  if [ -d "$check_path" ]; then
    newest=$(ls -t "$check_path" 2>/dev/null | head -1)
    if [ -n "$newest" ]; then
      # Join with exactly one slash, whether or not CHECK_PATH ends in "/".
      target="${check_path%/}/${newest}"
    else
      # Missing output is a warning, but must not hide an earlier FAIL.
      [ "$c_status" = "FAIL" ] || c_status="WARN"
      c_details="${c_details}No output files found. "
      age_str="no files"
    fi
  elif [ -f "$check_path" ]; then
    target="$check_path"
  fi

  # Staleness: FAIL beyond 3x the limit, WARN beyond the limit.
  if [ -n "$target" ]; then
    age=$(file_age "$target")
    age_str=$(format_age "$age")
    if [ "$age" -gt $(( max_stale * 3 )) ]; then
      c_status="FAIL"
      c_details="${c_details}Output stale: $age_str. "
    elif [ "$age" -gt "$max_stale" ]; then
      [ "$c_status" = "PASS" ] && c_status="WARN"
      c_details="${c_details}Output slightly stale: $age_str. "
    fi
  fi

  [ -z "$c_details" ] && c_details="All checks passed"

  case "$c_status" in
    PASS | HEALED) TOTAL_PASS=$((TOTAL_PASS + 1)) ;;
    WARN) TOTAL_WARN=$((TOTAL_WARN + 1)) ;;
    FAIL) TOTAL_FAIL=$((TOTAL_FAIL + 1)) ;;
  esac

  escaped_details=$(json_escape "$c_details")
  cron_item="    {\"name\":\"$name\",\"script\":\"$script\",\"status\":\"$c_status\",\"last_output\":\"$age_str\",\"schedule\":\"$schedule\",\"details\":\"$escaped_details\"}"
  if [ -n "$CRON_JSON" ]; then
    CRON_JSON="${CRON_JSON},
${cron_item}"
  else
    CRON_JSON="$cron_item"
  fi

  case "$c_status" in
    PASS) icon="PASS" ;;
    HEALED) icon="HEAL" ;;
    WARN) icon="WARN" ;;
    FAIL) icon="FAIL" ;;
    *) icon="????" ;;
  esac
  CRON_REPORT="${CRON_REPORT} [${icon}] ${name} (${schedule})
        Last output: ${age_str}
        ${c_details}
"
}

# The ${arr[@]+...} form keeps an empty list from tripping `set -u` on bash 3.2.
for cron_entry in ${CRON_MONITORS[@]+"${CRON_MONITORS[@]}"}; do
  check_cron_job "$cron_entry"
done
393
+
394
+ # ============================================================================
395
+ # INFRASTRUCTURE CHECKS
396
+ # ============================================================================
397
+
398
# --- SQLite integrity ---
# Runs PRAGMA integrity_check on nexo.db. Only a genuine integrity failure
# triggers the restore from the newest backup. A missing database, a missing
# sqlite3 binary or a database that is merely locked is reported as WARN and
# never overwrites the live file.
SQLITE_STATUS="PASS"
SQLITE_DETAIL=""
NEXO_DB="$NEXO_DIR/nexo.db"
if [ ! -f "$NEXO_DB" ]; then
  # Do not let sqlite3 create an empty database just by checking it.
  SQLITE_STATUS="WARN"
  SQLITE_DETAIL="nexo.db not found"
  TOTAL_WARN=$((TOTAL_WARN + 1))
elif ! command -v sqlite3 > /dev/null 2>&1; then
  SQLITE_STATUS="WARN"
  SQLITE_DETAIL="sqlite3 CLI not found; integrity check skipped"
  TOTAL_WARN=$((TOTAL_WARN + 1))
else
  # Keep stderr: it tells a locked database apart from a damaged one.
  INTEGRITY=$(sqlite3 "$NEXO_DB" "PRAGMA integrity_check;" 2>&1)
  SQLITE_RC=$?
  if [ "$SQLITE_RC" -eq 0 ] && [ "$INTEGRITY" = "ok" ]; then
    SQLITE_DETAIL="Integrity OK"
    TOTAL_PASS=$((TOTAL_PASS + 1))
  else
    case "$INTEGRITY" in
      *"database is locked"* | *"database table is locked"*)
        # Another process holds the database; this says nothing about its health.
        SQLITE_STATUS="WARN"
        SQLITE_DETAIL="Database locked; integrity check skipped"
        log "WARN: SQLite integrity check skipped, database locked"
        TOTAL_WARN=$((TOTAL_WARN + 1))
        ;;
      *)
        [ -n "$INTEGRITY" ] || INTEGRITY="sqlite3 exited with status $SQLITE_RC"
        SQLITE_STATUS="FAIL"
        SQLITE_DETAIL="Integrity check: $INTEGRITY"
        log "CRITICAL: SQLite integrity check failed: $INTEGRITY"
        TOTAL_FAIL=$((TOTAL_FAIL + 1))
        # Save corrupt copy before restoring
        CORRUPT_COPY="$NEXO_DB.corrupt.$(date +%s)"
        cp "$NEXO_DB" "$CORRUPT_COPY" 2>/dev/null
        LATEST_BACKUP=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
        if [ -n "$LATEST_BACKUP" ]; then
          if cp "$LATEST_BACKUP" "$NEXO_DB"; then
            # WAL/journal files left by the damaged database must not be paired
            # with the restored file; park them next to the corrupt copy.
            for sidecar in -wal -shm -journal; do
              if [ -e "$NEXO_DB$sidecar" ]; then
                mv -f "$NEXO_DB$sidecar" "$CORRUPT_COPY$sidecar" 2>/dev/null
              fi
            done
            log "RESTORED from $LATEST_BACKUP"
            SQLITE_DETAIL="${SQLITE_DETAIL}. Restored from backup."
          else
            log "CRITICAL: restore from $LATEST_BACKUP failed"
            SQLITE_DETAIL="${SQLITE_DETAIL}. Restore from backup failed."
          fi
        fi
        ;;
    esac
  fi
fi
419
+
420
# --- Cognitive DB check ---
# Same integrity probe as for nexo.db, without any automatic restore.
# A missing file, a missing sqlite3 binary or a locked database is a WARN;
# only a real integrity_check failure counts as FAIL.
COG_STATUS="PASS"
COG_DETAIL=""
COG_DB="$NEXO_DIR/cognitive.db"
if [ ! -f "$COG_DB" ]; then
  COG_STATUS="WARN"
  COG_DETAIL="cognitive.db not found"
  TOTAL_WARN=$((TOTAL_WARN + 1))
elif ! command -v sqlite3 > /dev/null 2>&1; then
  COG_STATUS="WARN"
  COG_DETAIL="sqlite3 CLI not found; integrity check skipped"
  TOTAL_WARN=$((TOTAL_WARN + 1))
else
  # Keep stderr: it tells a locked database apart from a damaged one.
  COG_INT=$(sqlite3 "$COG_DB" "PRAGMA integrity_check;" 2>&1)
  COG_RC=$?
  if [ "$COG_RC" -eq 0 ] && [ "$COG_INT" = "ok" ]; then
    COG_DETAIL="Integrity OK"
    TOTAL_PASS=$((TOTAL_PASS + 1))
  else
    case "$COG_INT" in
      *"database is locked"* | *"database table is locked"*)
        COG_STATUS="WARN"
        COG_DETAIL="Database locked; integrity check skipped"
        TOTAL_WARN=$((TOTAL_WARN + 1))
        ;;
      *)
        [ -n "$COG_INT" ] || COG_INT="sqlite3 exited with status $COG_RC"
        COG_STATUS="FAIL"
        COG_DETAIL="Cognitive DB integrity: $COG_INT"
        TOTAL_FAIL=$((TOTAL_FAIL + 1))
        ;;
    esac
  fi
fi
439
+
440
# --- Backup freshness ---
# Looks at the most recently modified nexo-*.db backup. No backup at all is a
# FAIL; a backup older than 7200 seconds triggers one repair attempt and is
# reported as HEALED (counted as a pass) or WARN depending on the outcome.
BACKUP_STATUS="PASS"
BACKUP_DETAIL=""
LATEST_BACKUP=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
if [ -z "$LATEST_BACKUP" ]; then
  BACKUP_STATUS="FAIL"
  BACKUP_DETAIL="No backups found"
  TOTAL_FAIL=$((TOTAL_FAIL + 1))
else
  BACKUP_AGE=$(file_age "$LATEST_BACKUP")
  BACKUP_AGE_STR=$(format_age "$BACKUP_AGE")
  if ! [ "$BACKUP_AGE" -gt 7200 ]; then
    # Recent enough: nothing to do.
    BACKUP_DETAIL="Last backup: $BACKUP_AGE_STR"
    TOTAL_PASS=$((TOTAL_PASS + 1))
  elif try_repair_backup; then
    BACKUP_STATUS="HEALED"
    BACKUP_DETAIL="Self-healed: backup was stale ($BACKUP_AGE_STR), ran fresh backup"
    TOTAL_HEALED=$((TOTAL_HEALED + 1))
    TOTAL_PASS=$((TOTAL_PASS + 1))
  else
    BACKUP_STATUS="WARN"
    BACKUP_DETAIL="Last backup: $BACKUP_AGE_STR (>2h, repair failed)"
    TOTAL_WARN=$((TOTAL_WARN + 1))
  fi
fi
467
+
468
+ # ============================================================================
469
+ # WRITE JSON STATUS
470
+ # ============================================================================
471
# Aggregate the run: TOTAL is the sum of the pass, warn and fail counters, and
# OVERALL is the worst level seen (any FAIL beats any WARN, which beats PASS).
# TOTAL_HEALED is reported separately and is not added to TOTAL.
+ TOTAL=$((TOTAL_PASS + TOTAL_WARN + TOTAL_FAIL))
472
+ OVERALL="PASS"
473
+ [ "$TOTAL_WARN" -gt 0 ] && OVERALL="WARN"
474
+ [ "$TOTAL_FAIL" -gt 0 ] && OVERALL="FAIL"
475
+
# Overwrite the status file with a JSON snapshot of this run. The here-document
# delimiter is unquoted, so the variables and the json_escape command
# substitutions in the body are expanded at the moment the file is written.
# JSON_AGENTS and CRON_JSON are inserted as pre-built, comma-separated objects.
476
+ cat > "$STATUS_JSON" <<JSONEOF
477
+ {
478
+ "timestamp": "$TS",
479
+ "summary": {
480
+ "total": $TOTAL,
481
+ "pass": $TOTAL_PASS,
482
+ "warn": $TOTAL_WARN,
483
+ "fail": $TOTAL_FAIL,
484
+ "healed": $TOTAL_HEALED,
485
+ "overall": "$OVERALL"
486
+ },
487
+ "launch_agents": [
488
+ $JSON_AGENTS
489
+ ],
490
+ "cron_jobs": [
491
+ $CRON_JSON
492
+ ],
493
+ "infrastructure": {
494
+ "sqlite": {"status": "$SQLITE_STATUS", "detail": "$(json_escape "$SQLITE_DETAIL")"},
495
+ "cognitive_db": {"status": "$COG_STATUS", "detail": "$(json_escape "$COG_DETAIL")"},
496
+ "backups": {"status": "$BACKUP_STATUS", "detail": "$(json_escape "$BACKUP_DETAIL")"}
497
+ }
498
+ }
499
+ JSONEOF
500
+
501
+ # ============================================================================
502
+ # WRITE HUMAN-READABLE REPORT
503
+ # ============================================================================
504
# Overwrite the plain-text report for this run. The here-document is unquoted,
# so every variable in the body is expanded; ${#MONITORS[@]} is the number of
# entries in the MONITORS array. The bracketed status tags matter to later
# code: the alert step greps this file for lines containing "[FAIL]".
+ cat > "$REPORT_TXT" <<REPORTEOF
505
+ ======================================================
506
+ NEXO WATCHDOG REPORT — $TS
507
+ ======================================================
508
+ PASS: $TOTAL_PASS | HEALED: $TOTAL_HEALED | WARN: $TOTAL_WARN | FAIL: $TOTAL_FAIL | TOTAL: $TOTAL
509
+ OVERALL: $OVERALL
510
+ ======================================================
511
+
512
+ -- LaunchAgents (${#MONITORS[@]}) ---------------------
513
+ $REPORT_LINES
514
+ -- Cron Jobs ------------------------------------------
515
+ $CRON_REPORT
516
+ -- Infrastructure -------------------------------------
517
+ [$SQLITE_STATUS] SQLite nexo.db: $SQLITE_DETAIL
518
+ [$COG_STATUS] Cognitive DB: $COG_DETAIL
519
+ [$BACKUP_STATUS] Backups: $BACKUP_DETAIL
520
+
521
+ -- End of Report --------------------------------------
522
+ REPORTEOF
523
+
524
+ # ============================================================================
525
+ # ALERT FILE
526
+ # ============================================================================
527
# The alert file exists only while the latest run has failures. It carries the
# run timestamp, the fail/warn counters and up to ten "[FAIL]" lines copied
# from the text report; a run without failures removes it.
if [ "$TOTAL_FAIL" -gt 0 ]; then
  {
    printf 'timestamp=%s\n' "$TS"
    printf 'fail_count=%s\n' "$TOTAL_FAIL"
    printf 'warn_count=%s\n' "$TOTAL_WARN"
    printf 'failures:\n'
    grep '\[FAIL\]' "$REPORT_TXT" | head -10 | sed 's/^/ /'
  } > "$ALERT_FILE"
  log "ALERT: $TOTAL_FAIL failures detected"
else
  rm -f "$ALERT_FILE"
fi
539
+
540
+ # ============================================================================
541
+ # CONSECUTIVE FAILURE TRACKING
542
+ # ============================================================================
543
# Maintain the count of consecutive runs that ended with at least one FAIL.
# Globals: FAIL_COUNT_FILE (read) - file holding the streak counter
#          TOTAL_FAIL (read)      - failures in the current run
#          MAX_FAILS (read)       - streak length that triggers a log alert
#          FAILS (written)        - streak value used for this run
# The counter is incremented on a failing run and reset to 0 otherwise.
# Unreadable or non-numeric counter content is treated as 0.
track_consecutive_failures() {
  local counter_file=${FAIL_COUNT_FILE:-}
  [ -n "$counter_file" ] || return 0

  # The counter lives in a directory that may not exist yet.
  mkdir -p "$(dirname "$counter_file")" 2>/dev/null

  FAILS=$(cat "$counter_file" 2>/dev/null)
  [[ "$FAILS" =~ ^[0-9]+$ ]] || FAILS=0
  # Force base 10 so a value with leading zeros is not read as octal.
  FAILS=$((10#$FAILS))

  if [ "${TOTAL_FAIL:-0}" -gt 0 ]; then
    FAILS=$((FAILS + 1))
    echo "$FAILS" > "$counter_file"
    if [ "$FAILS" -ge "${MAX_FAILS:-3}" ]; then
      log "ALERT: $FAILS consecutive runs with failures"
    fi
  else
    echo "0" > "$counter_file"
  fi
}

track_consecutive_failures
553
+
554
+ # ============================================================================
555
+ # LEVEL 2 AUTO-REPAIR: Launch NEXO for intelligent diagnosis
556
+ # ============================================================================
557
REPAIR_LOCK="$HOME_DIR/claude/scripts/.watchdog-nexo-repair.lock"
REPAIR_COOLDOWN=1800 # 30 min between NEXO repair attempts

# Print a diagnostic section for every record in FAILED_MONITORS.
# Record format: NAME|PLIST_ID|LOG_STDOUT|LOG_STDERR|PROCESS_GREP|SCHEDULE|DETAILS
# Outputs nothing when no monitor failed.
collect_failure_details() {
  local failed m_name m_plist m_stdout m_stderr m_proc m_sched m_details
  local stderr_tail stdout_tail

  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # bash 3.2, which is what /bin/bash is on macOS.
  for failed in ${FAILED_MONITORS[@]+"${FAILED_MONITORS[@]}"}; do
    IFS='|' read -r m_name m_plist m_stdout m_stderr m_proc m_sched m_details <<< "$failed"

    stderr_tail=""
    if [ -n "$m_stderr" ] && [ -f "$m_stderr" ]; then
      stderr_tail=$(tail -n 20 "$m_stderr" 2>/dev/null)
    fi
    stdout_tail=""
    if [ -n "$m_stdout" ] && [ -f "$m_stdout" ]; then
      stdout_tail=$(tail -n 10 "$m_stdout" 2>/dev/null)
    fi

    printf '\n--- %s (%s) ---\n' "$m_name" "$m_plist"
    printf 'Schedule: %s\n' "$m_sched"
    printf 'Failure reason: %s\n' "$m_details"
    printf 'Plist: ~/Library/LaunchAgents/%s.plist\n' "$m_plist"
    printf 'Process grep: %s\n' "$m_proc"
    printf 'Stderr (last 20 lines):\n%s\n' "$stderr_tail"
    printf 'Stdout (last 10 lines):\n%s\n' "$stdout_tail"
  done
}

# Write the Level 2 repair prompt.
#   $1 - destination file
#   $2 - failure details to embed
write_repair_prompt() {
  cat > "$1" <<NEXOPROMPT
WATCHDOG LEVEL 2 REPAIR — Automated diagnostic session.

The watchdog detected services that failed mechanical repair (launchctl bootstrap/kickstart).
Your job: diagnose the root cause and fix it. Do NOT ask the user anything — resolve autonomously.

FAILURES:
$2

STEPS:
1. Read the plist file to understand the service configuration
2. Check stderr/stdout logs for the actual error
3. Fix the root cause (missing file, bad config, dependency issue, etc.)
4. Reload the service and verify it is running
5. Log what you did to ~/claude/logs/watchdog-repair-result.log

CONSTRAINTS:
- Do NOT modify CLAUDE.md or any protected file
- Do NOT start interactive conversations
- Keep it under 5 minutes
- Log what you did to ~/claude/logs/watchdog-repair-result.log
NEXOPROMPT
}

# Print the path of an executable `claude` CLI: PATH first, then the usual
# install locations (PATH is often minimal under cron/launchd).
# `-f`/`-x` follow symlinks, so symlinked installs are found as well.
# Returns 1 when nothing usable is found.
find_claude_bin() {
  local candidate
  candidate=$(command -v claude 2>/dev/null)
  if [ -n "$candidate" ] && [ -x "$candidate" ]; then
    printf '%s\n' "$candidate"
    return 0
  fi
  for candidate in \
    "$HOME_DIR/.claude/local/bin/claude" \
    /usr/local/bin/claude \
    /opt/homebrew/bin/claude \
    "$HOME_DIR/.local/bin/claude" \
    "$HOME_DIR/.npm-global/bin/claude"; do
    if [ -f "$candidate" ] && [ -x "$candidate" ]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

if [ "${TOTAL_FAIL:-0}" -gt 0 ]; then
  LOCK_AGE=999999
  SKIP_REPAIR=false
  # The lock file's modification time enforces the cooldown between attempts.
  if [ -f "$REPAIR_LOCK" ]; then
    LOCK_AGE=$(file_age "$REPAIR_LOCK")
    if [ "$LOCK_AGE" -lt "$REPAIR_COOLDOWN" ]; then
      log "NEXO repair skipped: cooldown (${LOCK_AGE}s < ${REPAIR_COOLDOWN}s)"
      SKIP_REPAIR=true
    fi
  fi

  if [ "$SKIP_REPAIR" = false ]; then
    # Collect failure details from tracked FAILED_MONITORS array
    FAIL_DETAILS=$(collect_failure_details)

    if [ -n "$FAIL_DETAILS" ]; then
      # Without the directory the lock cannot be created and the cooldown
      # would never take effect.
      mkdir -p "$(dirname "$REPAIR_LOCK")" 2>/dev/null
      touch "$REPAIR_LOCK"
      log "Launching NEXO Level 2 repair..."

      # The template ends in X's: BSD mktemp only substitutes trailing X's, so
      # a template with a suffix after them yields a fixed, predictable name.
      REPAIR_PROMPT_FILE=$(mktemp "${TMPDIR:-/tmp}/nexo-repair.XXXXXX") || REPAIR_PROMPT_FILE=""

      if [ -z "$REPAIR_PROMPT_FILE" ]; then
        log "NEXO repair ABORTED: could not create prompt file"
      else
        write_repair_prompt "$REPAIR_PROMPT_FILE" "$FAIL_DETAILS"

        # Find claude CLI (may not be in PATH for cron/LaunchAgent)
        CLAUDE_BIN=$(find_claude_bin) || CLAUDE_BIN=""

        if [ -n "$CLAUDE_BIN" ] && [ -x "$CLAUDE_BIN" ]; then
          # SECURITY NOTE(review): the prompt embeds raw log tails and the CLI
          # runs with --dangerously-skip-permissions, so log content can steer
          # an unattended session. Behaviour kept as-is; review before relying
          # on it with logs that may contain untrusted text.
          # Paths are passed as positional parameters instead of being spliced
          # into the command string, so quotes in a path cannot break it.
          nohup bash -c '"$1" --print --dangerously-skip-permissions -p "$(cat "$2")" >> "$3" 2>&1; rm -f "$2"' \
            nexo-repair "$CLAUDE_BIN" "$REPAIR_PROMPT_FILE" "$LOG_DIR/watchdog-nexo-repair.log" &
          log "NEXO repair launched (PID: $!)"
        else
          log "NEXO repair ABORTED: claude CLI not found in PATH"
          rm -f "$REPAIR_PROMPT_FILE"
        fi
      fi
    fi
  fi
fi
641
+
642
+ # ============================================================================
643
+ # LOG SUMMARY
644
+ # ============================================================================
645
# Append a one-line summary of this run's counters to the watchdog log.
+ log "Complete: PASS=$TOTAL_PASS HEALED=$TOTAL_HEALED WARN=$TOTAL_WARN FAIL=$TOTAL_FAIL"