nexo-brain 1.2.2 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -16
- package/README.md +2 -2
- package/package.json +4 -4
- package/src/__pycache__/db.cpython-314.pyc +0 -0
- package/src/__pycache__/tools_credentials.cpython-314.pyc +0 -0
- package/src/dashboard/__pycache__/__init__.cpython-314.pyc +0 -0
- package/src/dashboard/__pycache__/app.cpython-314.pyc +0 -0
- package/src/plugins/__pycache__/episodic_memory.cpython-314.pyc +0 -0
- package/src/plugins/guard.py +36 -20
- package/src/rules/__init__ 2.py +0 -0
- package/src/rules/__pycache__/migrate.cpython-314.pyc +0 -0
- package/src/rules/core-rules 2.json +329 -0
- package/src/rules/migrate 2.py +207 -0
- package/src/scripts/nexo-watchdog.sh +645 -0
|
@@ -0,0 +1,645 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# ============================================================================
|
|
3
|
+
# NEXO Watchdog — Health monitor with two-level auto-repair
|
|
4
|
+
# ============================================================================
|
|
5
|
+
# Monitors all NEXO core LaunchAgents, cron jobs, and infrastructure.
|
|
6
|
+
# Level 1: Mechanical repair (launchctl bootstrap/kickstart, chmod)
|
|
7
|
+
# Level 2: Launches NEXO CLI for intelligent diagnosis and fix
|
|
8
|
+
#
|
|
9
|
+
# Install: Add to LaunchAgents for periodic execution (every 5 min recommended)
|
|
10
|
+
# ============================================================================
|
|
11
|
+
# Abort on unset variables and propagate failures through pipelines.
# (No -e: individual checks are expected to fail and are handled inline.)
set -uo pipefail

# === PATHS ===
# All watchdog inputs and outputs live under ~/claude.
HOME_DIR="$HOME"
NEXO_DIR="$HOME_DIR/claude/nexo-mcp"
OPS_DIR="$HOME_DIR/claude/operations"
LOG_DIR="$HOME_DIR/claude/logs"
LOG="$LOG_DIR/watchdog.log"
STATUS_JSON="$OPS_DIR/watchdog-status.json"
REPORT_TXT="$OPS_DIR/watchdog-report.txt"
ALERT_FILE="$OPS_DIR/.watchdog-alert"
FAIL_COUNT_FILE="$HOME_DIR/claude/scripts/.watchdog-fails"
# Number of consecutive failing runs after which an extra ALERT line is logged.
MAX_FAILS=3

# Create every directory the script later writes into. The directory holding
# FAIL_COUNT_FILE (~/claude/scripts) also holds the Level 2 repair lock; when
# it is missing, both the fail counter and the repair cooldown silently break.
mkdir -p "$LOG_DIR" "$OPS_DIR" "${FAIL_COUNT_FILE%/*}"

# One timestamp per run so every log line and age calculation agrees.
TS=$(date "+%Y-%m-%d %H:%M:%S")
TS_EPOCH=$(date +%s)
|
|
29
|
+
|
|
30
|
+
# Append one line, prefixed with this run's timestamp, to the watchdog log.
# Globals:   TS (read), LOG (read; file is appended to)
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$TS" "$1" >> "$LOG"
}
|
|
31
|
+
|
|
32
|
+
# ============================================================================
|
|
33
|
+
# HELPER FUNCTIONS
|
|
34
|
+
# ============================================================================
|
|
35
|
+
|
|
36
|
+
UID_NUM=$(id -u)
|
|
37
|
+
REPAIR_LOG="$LOG_DIR/watchdog-repairs.log"
|
|
38
|
+
TOTAL_HEALED=0
|
|
39
|
+
|
|
40
|
+
# Record a successful repair in the dedicated repair log and mirror it into
# the main watchdog log.
# Globals:   TS (read), REPAIR_LOG (read; file is appended to)
# Arguments: $1 - description of the repair
log_repair() {
  local entry="REPAIR: $1"
  echo "[$TS] $entry" >> "$REPAIR_LOG"
  log "$entry"
}
|
|
41
|
+
|
|
42
|
+
# Succeed when launchctl knows the given service label.
# Arguments: $1 - launchd label (e.g. com.nexo.immune)
# Returns:   exit status of `launchctl list <label>`; all output is discarded
is_loaded() {
  local label="$1"
  launchctl list "$label" > /dev/null 2>&1
}
|
|
45
|
+
|
|
46
|
+
# Print how many seconds ago a file was last modified, relative to the
# start of this run.
# Globals:   TS_EPOCH (read) - epoch seconds captured at startup
# Arguments: $1 - path to inspect
# Outputs:   age in seconds on stdout; 999999 when $1 is not a regular file
file_age() {
  local target="$1"
  local mod_epoch=""

  if [ ! -f "$target" ]; then
    echo 999999
    return 0
  fi

  # Try the GNU form first. Running the BSD form (`stat -f %m`) against GNU
  # stat prints filesystem information on stdout *and* fails, which used to
  # leave garbage in mod_epoch and break the arithmetic below. BSD/macOS stat
  # rejects -c without printing anything, so this order is safe on both.
  mod_epoch=$(stat -c %Y "$target" 2>/dev/null) \
    || mod_epoch=$(stat -f %m "$target" 2>/dev/null) \
    || mod_epoch=0

  # Never feed anything but a plain integer into the arithmetic expansion.
  case "$mod_epoch" in
    '' | *[!0-9]*) mod_epoch=0 ;;
  esac

  echo $(( TS_EPOCH - mod_epoch ))
}
|
|
56
|
+
|
|
57
|
+
# Render an age in seconds as a short human-readable string.
# Arguments: $1 - age in seconds (999999 or more means "no data")
# Outputs:   "never", "<d>d <h>h ago", "<h>h <m>m ago", "<m>m ago" or "<s>s ago"
format_age() {
  local elapsed=$1
  local days hours minutes

  # Sentinel produced by file_age for missing files.
  if [ "$elapsed" -ge 999999 ]; then
    echo "never"
    return
  fi

  if [ "$elapsed" -ge 86400 ]; then
    days=$(( elapsed / 86400 ))
    hours=$(( elapsed % 86400 / 3600 ))
    echo "${days}d ${hours}h ago"
    return
  fi

  if [ "$elapsed" -ge 3600 ]; then
    hours=$(( elapsed / 3600 ))
    minutes=$(( elapsed % 3600 / 60 ))
    echo "${hours}h ${minutes}m ago"
    return
  fi

  if [ "$elapsed" -ge 60 ]; then
    minutes=$(( elapsed / 60 ))
    echo "${minutes}m ago"
    return
  fi

  echo "${elapsed}s ago"
}
|
|
71
|
+
|
|
72
|
+
# Count lines matching ERROR_PATTERNS in the last 50 lines of a log file.
# Globals:   ERROR_PATTERNS (read) - extended regex of error markers
# Arguments: $1 - path to the log file
# Outputs:   exactly one integer on stdout (0 for missing or empty files)
check_errors() {
  local logfile="$1"
  local hits=0

  if [ -f "$logfile" ] && [ -s "$logfile" ]; then
    # grep -c prints "0" AND exits non-zero when nothing matches, so a
    # trailing `|| echo 0` would emit a second "0" and hand callers the
    # two-line value "0\n0". Capture the count and ignore the exit status.
    hits=$(tail -n 50 "$logfile" 2>/dev/null | grep -cE -- "$ERROR_PATTERNS" 2>/dev/null) || true
  fi

  # Callers compare the result with `-gt`; guarantee a plain integer.
  case "$hits" in
    '' | *[!0-9]*) hits=0 ;;
  esac

  echo "$hits"
}
|
|
80
|
+
|
|
81
|
+
# Succeed when at least one process command line matches the pattern.
# Arguments: $1 - pattern handed to `pgrep -f`; an empty pattern never matches
# Returns:   0 if pgrep finds a match, non-zero otherwise
process_running() {
  local pattern="$1"

  # An empty pattern would match every process; treat it as "not running".
  [ -n "$pattern" ] || return 1

  pgrep -f "$pattern" > /dev/null 2>&1
}
|
|
88
|
+
|
|
89
|
+
# Make a string safe to embed between double quotes in the JSON status file.
# Backslashes and double quotes are backslash-escaped; tabs, carriage returns
# and newlines are flattened to single spaces.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout, followed by one trailing space (the
#            flattened final newline; kept so existing output is unchanged)
json_escape() {
  # printf instead of echo: echo would swallow inputs such as "-n" or "-e".
  # tr handles the control characters because `\t` inside a sed pattern is
  # not portable between GNU and BSD sed.
  printf '%s\n' "$1" \
    | sed 's/\\/\\\\/g; s/"/\\"/g' \
    | tr '\t\r\n' '   '
}
|
|
92
|
+
|
|
93
|
+
# ============================================================================
|
|
94
|
+
# AUTO-REPAIR FUNCTIONS
|
|
95
|
+
# ============================================================================
|
|
96
|
+
|
|
97
|
+
# Level 1 repair for a LaunchAgent.
#   - Agent not loaded: bootstrap its plist from ~/Library/LaunchAgents.
#   - Agent loaded but its process is missing: kickstart the service.
# Globals:   HOME_DIR, UID_NUM (read)
# Arguments: $1 - launchd label
#            $2 - process pattern for long-running agents (may be empty)
# Returns:   0 when a repair was performed and verified, 1 otherwise
try_repair_launchagent() {
  local label="$1"
  local proc_pattern="$2"
  local domain="gui/$UID_NUM"
  local plist_path="$HOME_DIR/Library/LaunchAgents/${label}.plist"

  # Case 1: the agent is not registered with launchd at all.
  if ! is_loaded "$label"; then
    [ -f "$plist_path" ] || return 1
    launchctl bootstrap "$domain" "$plist_path" 2>/dev/null
    sleep 1
    is_loaded "$label" || return 1
    log_repair "$label: bootstrapped successfully"
    return 0
  fi

  # Case 2: registered, but the expected process is gone. Only applies to
  # agents that declare a process pattern.
  [ -n "$proc_pattern" ] || return 1
  if process_running "$proc_pattern"; then
    return 1
  fi

  launchctl kickstart "$domain/$label" 2>/dev/null
  sleep 2
  process_running "$proc_pattern" || return 1
  log_repair "$label: kickstarted process '$proc_pattern'"
  return 0
}
|
|
127
|
+
|
|
128
|
+
# Level 1 repair for a cron script: restore a missing executable bit.
# Arguments: $1 - path to the script
# Returns:   0 when the script existed, was not executable and now is;
#            1 in every other case (missing, already executable, chmod failed)
try_repair_cron() {
  local target="$1"

  [ -f "$target" ] || return 1
  if [ -x "$target" ]; then
    return 1
  fi

  chmod +x "$target"
  [ -x "$target" ] || return 1

  log_repair "$target: made executable"
  return 0
}
|
|
141
|
+
|
|
142
|
+
# Level 1 repair for stale backups: run the backup script once and confirm
# that the newest nexo-*.db backup is less than a minute old afterwards.
# Globals:   NEXO_DIR (read)
# Returns:   0 when a fresh backup exists after the run, 1 otherwise
try_repair_backup() {
  local runner="$NEXO_DIR/backup_cron.sh"
  local latest backup_age

  [ -x "$runner" ] || return 1

  "$runner" 2>/dev/null
  sleep 1

  # Newest backup by modification time.
  latest=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
  [ -n "$latest" ] || return 1

  backup_age=$(file_age "$latest")
  if [ "$backup_age" -lt 60 ]; then
    log_repair "backup_cron.sh: ran successfully, fresh backup created"
    return 0
  fi

  return 1
}
|
|
160
|
+
|
|
161
|
+
# ============================================================================
|
|
162
|
+
# MONITOR REGISTRY — NEXO Core Services
|
|
163
|
+
# ============================================================================
|
|
164
|
+
# Format: NAME|PLIST_ID|LOG_STDOUT|LOG_STDERR|MAX_STALE_SECS|PROCESS_GREP|SCHEDULE_DESC
|
|
165
|
+
#
|
|
166
|
+
# Users can add custom monitors in ~/claude/config/watchdog-monitors.conf
|
|
167
|
+
# (same format, one per line, # for comments)
|
|
168
|
+
# ============================================================================
|
|
169
|
+
MONITORS=(
|
|
170
|
+
"Auto-Close Sessions|com.nexo.auto-close-sessions|$HOME_DIR/claude/coordination/auto-close-stdout.log|$HOME_DIR/claude/coordination/auto-close-stderr.log|900||Every 5 min"
|
|
171
|
+
"Catchup|com.nexo.catchup|$HOME_DIR/claude/logs/catchup-stdout.log|$HOME_DIR/claude/logs/catchup-stderr.log|0||RunAtLoad once"
|
|
172
|
+
"Cognitive Decay|com.nexo.cognitive-decay|$HOME_DIR/claude/logs/cognitive-decay-stdout.log|$HOME_DIR/claude/logs/cognitive-decay-stderr.log|90000||Daily 3:00 AM"
|
|
173
|
+
"Evolution|com.nexo.evolution|$HOME_DIR/claude/logs/evolution-stdout.log|$HOME_DIR/claude/logs/evolution-stderr.log|0||Weekly Sun 3:00 AM"
|
|
174
|
+
"GitHub Monitor|com.nexo.github-monitor|$HOME_DIR/claude/logs/github-monitor-stdout.log|$HOME_DIR/claude/logs/github-monitor-stderr.log|90000||Daily 8:00 AM"
|
|
175
|
+
"Immune|com.nexo.immune|$HOME_DIR/claude/coordination/immune-stdout.log|$HOME_DIR/claude/coordination/immune-stderr.log|3600||Every 30 min"
|
|
176
|
+
"Postmortem|com.nexo.postmortem|$HOME_DIR/claude/logs/postmortem-stdout.log|$HOME_DIR/claude/logs/postmortem-stderr.log|90000||Daily 23:30"
|
|
177
|
+
"Prevent Sleep|com.nexo.prevent-sleep|||0|caffeinate|KeepAlive"
|
|
178
|
+
"Self Audit|com.nexo.self-audit|$HOME_DIR/claude/logs/self-audit-stdout.log|$HOME_DIR/claude/logs/self-audit-stderr.log|90000||Daily 7:00 AM"
|
|
179
|
+
"Sleep|com.nexo.sleep|$HOME_DIR/claude/coordination/sleep-stdout.log|$HOME_DIR/claude/coordination/sleep-stderr.log|90000||Daily 4:00 AM"
|
|
180
|
+
"Synthesis|com.nexo.synthesis|$HOME_DIR/claude/coordination/synthesis-stdout.log|$HOME_DIR/claude/coordination/synthesis-stderr.log|10800||Every 2 hours"
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Load user-defined monitors if file exists
|
|
184
|
+
# Append user-defined monitors from a config file to the MONITORS array.
# File format: one monitor per line, same pipe-separated layout as the
# built-in registry; blank lines and lines starting with '#' are ignored.
# Globals:   MONITORS (appended to)
# Arguments: $1 - path to the config file (a missing file is not an error)
# Returns:   0
load_user_monitors() {
  local conf_file="$1"
  local line um_name um_plist um_out um_err um_stale um_rest

  [ -f "$conf_file" ] || return 0

  # `|| [ -n "$line" ]` keeps the final line when the file lacks a trailing
  # newline.
  while IFS= read -r line || [ -n "$line" ]; do
    line=${line%$'\r'}                               # tolerate CRLF files
    [[ "$line" =~ ^[[:space:]]*# ]] && continue      # comment
    [[ "$line" =~ ^[[:space:]]*$ ]] && continue      # blank / whitespace only

    IFS='|' read -r um_name um_plist um_out um_err um_stale um_rest <<< "$line"

    # Without a plist id there is nothing to monitor.
    if [ -z "$um_plist" ]; then
      log "WARN: ignoring malformed monitor entry in $conf_file: $line"
      continue
    fi

    # MAX_STALE_SECS feeds integer comparisons later on; normalise anything
    # that is not a plain integer to 0 (= no staleness check) instead of
    # letting those comparisons error out.
    case "$um_stale" in
      '' | *[!0-9]*)
        log "WARN: invalid MAX_STALE_SECS '$um_stale' for '$um_name' in $conf_file, using 0"
        line="${um_name}|${um_plist}|${um_out}|${um_err}|0|${um_rest}"
        ;;
    esac

    MONITORS+=("$line")
  done < "$conf_file"

  return 0
}

USER_MONITORS_FILE="$HOME_DIR/claude/config/watchdog-monitors.conf"
load_user_monitors "$USER_MONITORS_FILE"
|
|
192
|
+
|
|
193
|
+
# Cron jobs to check (NAME|SCRIPT|CHECK_PATH|MAX_STALE_SECS|SCHEDULE)
|
|
194
|
+
CRON_MONITORS=(
|
|
195
|
+
"Backup Cron|$NEXO_DIR/backup_cron.sh|$NEXO_DIR/backups/|7200|Hourly"
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
# Error patterns to search in stderr logs (last 50 lines)
|
|
199
|
+
ERROR_PATTERNS="Traceback|Error:|CRITICAL|FATAL|ModuleNotFoundError|PermissionError|FileNotFoundError|ConnectionRefused|Errno"
|
|
200
|
+
|
|
201
|
+
# ============================================================================
|
|
202
|
+
# RUN CHECKS
|
|
203
|
+
# ============================================================================
|
|
204
|
+
|
|
205
|
+
TOTAL_PASS=0
|
|
206
|
+
TOTAL_WARN=0
|
|
207
|
+
TOTAL_FAIL=0
|
|
208
|
+
JSON_AGENTS=""
|
|
209
|
+
REPORT_LINES=""
|
|
210
|
+
FAILED_MONITORS=() # Track failed monitors for Level 2 repair
|
|
211
|
+
|
|
212
|
+
for monitor in "${MONITORS[@]}"; do
|
|
213
|
+
[[ "$monitor" =~ ^[[:space:]]*# ]] && continue
|
|
214
|
+
IFS='|' read -r name plist_id log_stdout log_stderr max_stale proc_grep schedule <<< "$monitor"
|
|
215
|
+
|
|
216
|
+
status="PASS"
|
|
217
|
+
details=""
|
|
218
|
+
loaded="unknown"
|
|
219
|
+
stale_age="n/a"
|
|
220
|
+
error_count=0
|
|
221
|
+
proc_alive="n/a"
|
|
222
|
+
|
|
223
|
+
# Check 1: LaunchAgent loaded?
|
|
224
|
+
if is_loaded "$plist_id"; then
|
|
225
|
+
loaded="yes"
|
|
226
|
+
else
|
|
227
|
+
loaded="no"
|
|
228
|
+
if try_repair_launchagent "$plist_id" "$proc_grep"; then
|
|
229
|
+
loaded="yes"
|
|
230
|
+
status="HEALED"
|
|
231
|
+
details="${details}Self-healed: bootstrapped. "
|
|
232
|
+
TOTAL_HEALED=$((TOTAL_HEALED + 1))
|
|
233
|
+
else
|
|
234
|
+
status="FAIL"
|
|
235
|
+
details="${details}Not loaded in launchctl (repair failed). "
|
|
236
|
+
fi
|
|
237
|
+
fi
|
|
238
|
+
|
|
239
|
+
# Check 2: Process alive? (only for KeepAlive / long-running)
|
|
240
|
+
if [ -n "$proc_grep" ]; then
|
|
241
|
+
if process_running "$proc_grep"; then
|
|
242
|
+
proc_alive="yes"
|
|
243
|
+
else
|
|
244
|
+
proc_alive="no"
|
|
245
|
+
if [ "$status" != "FAIL" ] && [ "$status" != "HEALED" ]; then
|
|
246
|
+
if try_repair_launchagent "$plist_id" "$proc_grep"; then
|
|
247
|
+
proc_alive="yes"
|
|
248
|
+
status="HEALED"
|
|
249
|
+
details="${details}Self-healed: kickstarted. "
|
|
250
|
+
TOTAL_HEALED=$((TOTAL_HEALED + 1))
|
|
251
|
+
else
|
|
252
|
+
status="WARN"
|
|
253
|
+
details="${details}Process '$proc_grep' not running (repair failed). "
|
|
254
|
+
fi
|
|
255
|
+
elif [ "$status" = "HEALED" ]; then
|
|
256
|
+
sleep 1
|
|
257
|
+
if process_running "$proc_grep"; then
|
|
258
|
+
proc_alive="yes"
|
|
259
|
+
else
|
|
260
|
+
details="${details}Process '$proc_grep' still not running after bootstrap. "
|
|
261
|
+
fi
|
|
262
|
+
fi
|
|
263
|
+
fi
|
|
264
|
+
fi
|
|
265
|
+
|
|
266
|
+
# Check 3: Log staleness
|
|
267
|
+
if [ -n "$log_stdout" ] && [ "$max_stale" -gt 0 ]; then
|
|
268
|
+
age=$(file_age "$log_stdout")
|
|
269
|
+
stale_age=$(format_age "$age")
|
|
270
|
+
if [ "$age" -gt $(( max_stale * 3 )) ]; then
|
|
271
|
+
status="FAIL"
|
|
272
|
+
details="${details}Log stale: $stale_age (limit: $(format_age "$max_stale")). "
|
|
273
|
+
elif [ "$age" -gt "$max_stale" ]; then
|
|
274
|
+
[ "$status" = "PASS" ] && status="WARN"
|
|
275
|
+
details="${details}Log slightly stale: $stale_age. "
|
|
276
|
+
fi
|
|
277
|
+
elif [ -n "$log_stdout" ]; then
|
|
278
|
+
if [ -f "$log_stdout" ]; then
|
|
279
|
+
age=$(file_age "$log_stdout")
|
|
280
|
+
stale_age=$(format_age "$age")
|
|
281
|
+
else
|
|
282
|
+
stale_age="no log file"
|
|
283
|
+
fi
|
|
284
|
+
fi
|
|
285
|
+
|
|
286
|
+
# Check 4: Errors in stderr log
|
|
287
|
+
if [ -n "$log_stderr" ]; then
|
|
288
|
+
error_count=$(check_errors "$log_stderr")
|
|
289
|
+
if [ "$error_count" -gt 5 ]; then
|
|
290
|
+
[ "$status" = "PASS" ] && status="WARN"
|
|
291
|
+
details="${details}${error_count} errors in recent stderr. "
|
|
292
|
+
fi
|
|
293
|
+
fi
|
|
294
|
+
|
|
295
|
+
[ -z "$details" ] && details="All checks passed"
|
|
296
|
+
|
|
297
|
+
case "$status" in
|
|
298
|
+
PASS|HEALED) TOTAL_PASS=$((TOTAL_PASS + 1)) ;;
|
|
299
|
+
WARN) TOTAL_WARN=$((TOTAL_WARN + 1)) ;;
|
|
300
|
+
FAIL)
|
|
301
|
+
TOTAL_FAIL=$((TOTAL_FAIL + 1))
|
|
302
|
+
FAILED_MONITORS+=("${name}|${plist_id}|${log_stdout}|${log_stderr}|${proc_grep}|${schedule}|${details}")
|
|
303
|
+
;;
|
|
304
|
+
esac
|
|
305
|
+
|
|
306
|
+
# JSON
|
|
307
|
+
escaped_details=$(json_escape "$details")
|
|
308
|
+
json_item=" {\"name\":\"$name\",\"plist\":\"$plist_id\",\"status\":\"$status\",\"loaded\":\"$loaded\",\"process\":\"$proc_alive\",\"last_activity\":\"$stale_age\",\"stderr_errors\":$error_count,\"schedule\":\"$schedule\",\"details\":\"$escaped_details\"}"
|
|
309
|
+
[ -n "$JSON_AGENTS" ] && JSON_AGENTS="${JSON_AGENTS},
|
|
310
|
+
${json_item}" || JSON_AGENTS="$json_item"
|
|
311
|
+
|
|
312
|
+
# Report
|
|
313
|
+
case "$status" in
|
|
314
|
+
PASS) icon="PASS" ;; HEALED) icon="HEAL" ;; WARN) icon="WARN" ;; FAIL) icon="FAIL" ;; *) icon="????" ;;
|
|
315
|
+
esac
|
|
316
|
+
REPORT_LINES="${REPORT_LINES} [${icon}] ${name} (${schedule})
|
|
317
|
+
Loaded: ${loaded} | Process: ${proc_alive} | Last: ${stale_age} | Errors: ${error_count}
|
|
318
|
+
${details}
|
|
319
|
+
"
|
|
320
|
+
done
|
|
321
|
+
|
|
322
|
+
# --- Cron job checks ---
|
|
323
|
+
CRON_JSON=""
|
|
324
|
+
CRON_REPORT=""
|
|
325
|
+
for cron_entry in "${CRON_MONITORS[@]}"; do
|
|
326
|
+
IFS='|' read -r name script check_path max_stale schedule <<< "$cron_entry"
|
|
327
|
+
|
|
328
|
+
c_status="PASS"
|
|
329
|
+
c_details=""
|
|
330
|
+
age_str="n/a"
|
|
331
|
+
|
|
332
|
+
if [ ! -x "$script" ]; then
|
|
333
|
+
if try_repair_cron "$script"; then
|
|
334
|
+
c_status="HEALED"
|
|
335
|
+
c_details="Self-healed: made executable. "
|
|
336
|
+
TOTAL_HEALED=$((TOTAL_HEALED + 1))
|
|
337
|
+
else
|
|
338
|
+
c_status="FAIL"
|
|
339
|
+
c_details="Script not executable or missing (repair failed). "
|
|
340
|
+
fi
|
|
341
|
+
fi
|
|
342
|
+
|
|
343
|
+
if [ -d "$check_path" ]; then
|
|
344
|
+
newest=$(ls -t "$check_path" 2>/dev/null | head -1)
|
|
345
|
+
if [ -n "$newest" ]; then
|
|
346
|
+
age=$(file_age "${check_path}${newest}")
|
|
347
|
+
age_str=$(format_age "$age")
|
|
348
|
+
if [ "$age" -gt $(( max_stale * 3 )) ]; then
|
|
349
|
+
c_status="FAIL"
|
|
350
|
+
c_details="${c_details}Output stale: $age_str. "
|
|
351
|
+
elif [ "$age" -gt "$max_stale" ]; then
|
|
352
|
+
[ "$c_status" = "PASS" ] && c_status="WARN"
|
|
353
|
+
c_details="${c_details}Output slightly stale: $age_str. "
|
|
354
|
+
fi
|
|
355
|
+
else
|
|
356
|
+
c_status="WARN"
|
|
357
|
+
c_details="${c_details}No output files found. "
|
|
358
|
+
age_str="no files"
|
|
359
|
+
fi
|
|
360
|
+
elif [ -f "$check_path" ]; then
|
|
361
|
+
age=$(file_age "$check_path")
|
|
362
|
+
age_str=$(format_age "$age")
|
|
363
|
+
if [ "$age" -gt $(( max_stale * 3 )) ]; then
|
|
364
|
+
c_status="FAIL"
|
|
365
|
+
c_details="${c_details}Output stale: $age_str. "
|
|
366
|
+
elif [ "$age" -gt "$max_stale" ]; then
|
|
367
|
+
[ "$c_status" = "PASS" ] && c_status="WARN"
|
|
368
|
+
c_details="${c_details}Output slightly stale: $age_str. "
|
|
369
|
+
fi
|
|
370
|
+
fi
|
|
371
|
+
|
|
372
|
+
[ -z "$c_details" ] && c_details="All checks passed"
|
|
373
|
+
|
|
374
|
+
case "$c_status" in
|
|
375
|
+
PASS|HEALED) TOTAL_PASS=$((TOTAL_PASS + 1)) ;;
|
|
376
|
+
WARN) TOTAL_WARN=$((TOTAL_WARN + 1)) ;;
|
|
377
|
+
FAIL) TOTAL_FAIL=$((TOTAL_FAIL + 1)) ;;
|
|
378
|
+
esac
|
|
379
|
+
|
|
380
|
+
escaped_details=$(json_escape "$c_details")
|
|
381
|
+
cron_item=" {\"name\":\"$name\",\"script\":\"$script\",\"status\":\"$c_status\",\"last_output\":\"$age_str\",\"schedule\":\"$schedule\",\"details\":\"$escaped_details\"}"
|
|
382
|
+
[ -n "$CRON_JSON" ] && CRON_JSON="${CRON_JSON},
|
|
383
|
+
${cron_item}" || CRON_JSON="$cron_item"
|
|
384
|
+
|
|
385
|
+
case "$c_status" in
|
|
386
|
+
PASS) icon="PASS" ;; HEALED) icon="HEAL" ;; WARN) icon="WARN" ;; FAIL) icon="FAIL" ;; *) icon="????" ;;
|
|
387
|
+
esac
|
|
388
|
+
CRON_REPORT="${CRON_REPORT} [${icon}] ${name} (${schedule})
|
|
389
|
+
Last output: ${age_str}
|
|
390
|
+
${c_details}
|
|
391
|
+
"
|
|
392
|
+
done
|
|
393
|
+
|
|
394
|
+
# ============================================================================
|
|
395
|
+
# INFRASTRUCTURE CHECKS
|
|
396
|
+
# ============================================================================
|
|
397
|
+
|
|
398
|
+
# --- SQLite integrity ---
|
|
399
|
+
# Verify nexo.db with PRAGMA integrity_check. Only a real integrity failure
# triggers the restore-from-backup path.
SQLITE_STATUS="PASS"
SQLITE_DETAIL=""
# Keep stderr and the exit code so "could not run the check" can be told
# apart from "the check ran and found damage".
INTEGRITY=$(sqlite3 "$NEXO_DIR/nexo.db" "PRAGMA integrity_check;" 2>&1)
INTEGRITY_RC=$?
if [ "$INTEGRITY_RC" -eq 0 ] && [ "$INTEGRITY" = "ok" ]; then
  SQLITE_DETAIL="Integrity OK"
  TOTAL_PASS=$((TOTAL_PASS + 1))
elif [ "$INTEGRITY_RC" -eq 127 ] || [[ "$INTEGRITY" == *"is locked"* ]]; then
  # sqlite3 is not installed (127) or another process holds the database.
  # Neither is evidence of corruption, so the live database must NOT be
  # overwritten with a backup here.
  SQLITE_STATUS="WARN"
  SQLITE_DETAIL="Integrity check could not run: $INTEGRITY"
  log "WARN: SQLite integrity check could not run (rc=$INTEGRITY_RC): $INTEGRITY"
  TOTAL_WARN=$((TOTAL_WARN + 1))
else
  [ -n "$INTEGRITY" ] || INTEGRITY="CORRUPT"
  SQLITE_STATUS="FAIL"
  SQLITE_DETAIL="Integrity check: $INTEGRITY"
  log "CRITICAL: SQLite integrity check failed: $INTEGRITY"
  TOTAL_FAIL=$((TOTAL_FAIL + 1))
  # Save corrupt copy before restoring
  cp "$NEXO_DIR/nexo.db" "$NEXO_DIR/nexo.db.corrupt.$(date +%s)" 2>/dev/null
  LATEST_BACKUP=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
  if [ -n "$LATEST_BACKUP" ]; then
    # Report the restore only when the copy actually succeeded.
    if cp "$LATEST_BACKUP" "$NEXO_DIR/nexo.db"; then
      log "RESTORED from $LATEST_BACKUP"
      SQLITE_DETAIL="${SQLITE_DETAIL}. Restored from backup."
    else
      log "CRITICAL: restore from $LATEST_BACKUP failed"
      SQLITE_DETAIL="${SQLITE_DETAIL}. Restore from backup failed."
    fi
  fi
fi
|
|
419
|
+
|
|
420
|
+
# --- Cognitive DB check ---
|
|
421
|
+
COG_STATUS="PASS"
|
|
422
|
+
COG_DETAIL=""
|
|
423
|
+
COG_DB="$NEXO_DIR/cognitive.db"
|
|
424
|
+
if [ -f "$COG_DB" ]; then
|
|
425
|
+
COG_INT=$(sqlite3 "$COG_DB" "PRAGMA integrity_check;" 2>/dev/null || echo "CORRUPT")
|
|
426
|
+
if [ "$COG_INT" != "ok" ]; then
|
|
427
|
+
COG_STATUS="FAIL"
|
|
428
|
+
COG_DETAIL="Cognitive DB integrity: $COG_INT"
|
|
429
|
+
TOTAL_FAIL=$((TOTAL_FAIL + 1))
|
|
430
|
+
else
|
|
431
|
+
COG_DETAIL="Integrity OK"
|
|
432
|
+
TOTAL_PASS=$((TOTAL_PASS + 1))
|
|
433
|
+
fi
|
|
434
|
+
else
|
|
435
|
+
COG_STATUS="WARN"
|
|
436
|
+
COG_DETAIL="cognitive.db not found"
|
|
437
|
+
TOTAL_WARN=$((TOTAL_WARN + 1))
|
|
438
|
+
fi
|
|
439
|
+
|
|
440
|
+
# --- Backup freshness ---
|
|
441
|
+
BACKUP_STATUS="PASS"
|
|
442
|
+
BACKUP_DETAIL=""
|
|
443
|
+
LATEST_BACKUP=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
|
|
444
|
+
if [ -n "$LATEST_BACKUP" ]; then
|
|
445
|
+
BACKUP_AGE=$(file_age "$LATEST_BACKUP")
|
|
446
|
+
BACKUP_AGE_STR=$(format_age "$BACKUP_AGE")
|
|
447
|
+
if [ "$BACKUP_AGE" -gt 7200 ]; then
|
|
448
|
+
if try_repair_backup; then
|
|
449
|
+
BACKUP_STATUS="HEALED"
|
|
450
|
+
BACKUP_DETAIL="Self-healed: backup was stale ($BACKUP_AGE_STR), ran fresh backup"
|
|
451
|
+
TOTAL_HEALED=$((TOTAL_HEALED + 1))
|
|
452
|
+
TOTAL_PASS=$((TOTAL_PASS + 1))
|
|
453
|
+
else
|
|
454
|
+
BACKUP_STATUS="WARN"
|
|
455
|
+
BACKUP_DETAIL="Last backup: $BACKUP_AGE_STR (>2h, repair failed)"
|
|
456
|
+
TOTAL_WARN=$((TOTAL_WARN + 1))
|
|
457
|
+
fi
|
|
458
|
+
else
|
|
459
|
+
BACKUP_DETAIL="Last backup: $BACKUP_AGE_STR"
|
|
460
|
+
TOTAL_PASS=$((TOTAL_PASS + 1))
|
|
461
|
+
fi
|
|
462
|
+
else
|
|
463
|
+
BACKUP_STATUS="FAIL"
|
|
464
|
+
BACKUP_DETAIL="No backups found"
|
|
465
|
+
TOTAL_FAIL=$((TOTAL_FAIL + 1))
|
|
466
|
+
fi
|
|
467
|
+
|
|
468
|
+
# ============================================================================
|
|
469
|
+
# WRITE JSON STATUS
|
|
470
|
+
# ============================================================================
|
|
471
|
+
TOTAL=$((TOTAL_PASS + TOTAL_WARN + TOTAL_FAIL))
|
|
472
|
+
OVERALL="PASS"
|
|
473
|
+
[ "$TOTAL_WARN" -gt 0 ] && OVERALL="WARN"
|
|
474
|
+
[ "$TOTAL_FAIL" -gt 0 ] && OVERALL="FAIL"
|
|
475
|
+
|
|
476
|
+
cat > "$STATUS_JSON" <<JSONEOF
|
|
477
|
+
{
|
|
478
|
+
"timestamp": "$TS",
|
|
479
|
+
"summary": {
|
|
480
|
+
"total": $TOTAL,
|
|
481
|
+
"pass": $TOTAL_PASS,
|
|
482
|
+
"warn": $TOTAL_WARN,
|
|
483
|
+
"fail": $TOTAL_FAIL,
|
|
484
|
+
"healed": $TOTAL_HEALED,
|
|
485
|
+
"overall": "$OVERALL"
|
|
486
|
+
},
|
|
487
|
+
"launch_agents": [
|
|
488
|
+
$JSON_AGENTS
|
|
489
|
+
],
|
|
490
|
+
"cron_jobs": [
|
|
491
|
+
$CRON_JSON
|
|
492
|
+
],
|
|
493
|
+
"infrastructure": {
|
|
494
|
+
"sqlite": {"status": "$SQLITE_STATUS", "detail": "$(json_escape "$SQLITE_DETAIL")"},
|
|
495
|
+
"cognitive_db": {"status": "$COG_STATUS", "detail": "$(json_escape "$COG_DETAIL")"},
|
|
496
|
+
"backups": {"status": "$BACKUP_STATUS", "detail": "$(json_escape "$BACKUP_DETAIL")"}
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
JSONEOF
|
|
500
|
+
|
|
501
|
+
# ============================================================================
|
|
502
|
+
# WRITE HUMAN-READABLE REPORT
|
|
503
|
+
# ============================================================================
|
|
504
|
+
cat > "$REPORT_TXT" <<REPORTEOF
|
|
505
|
+
======================================================
|
|
506
|
+
NEXO WATCHDOG REPORT — $TS
|
|
507
|
+
======================================================
|
|
508
|
+
PASS: $TOTAL_PASS | HEALED: $TOTAL_HEALED | WARN: $TOTAL_WARN | FAIL: $TOTAL_FAIL | TOTAL: $TOTAL
|
|
509
|
+
OVERALL: $OVERALL
|
|
510
|
+
======================================================
|
|
511
|
+
|
|
512
|
+
-- LaunchAgents (${#MONITORS[@]}) ---------------------
|
|
513
|
+
$REPORT_LINES
|
|
514
|
+
-- Cron Jobs ------------------------------------------
|
|
515
|
+
$CRON_REPORT
|
|
516
|
+
-- Infrastructure -------------------------------------
|
|
517
|
+
[$SQLITE_STATUS] SQLite nexo.db: $SQLITE_DETAIL
|
|
518
|
+
[$COG_STATUS] Cognitive DB: $COG_DETAIL
|
|
519
|
+
[$BACKUP_STATUS] Backups: $BACKUP_DETAIL
|
|
520
|
+
|
|
521
|
+
-- End of Report --------------------------------------
|
|
522
|
+
REPORTEOF
|
|
523
|
+
|
|
524
|
+
# ============================================================================
|
|
525
|
+
# ALERT FILE
|
|
526
|
+
# ============================================================================
|
|
527
|
+
if [ "$TOTAL_FAIL" -gt 0 ]; then
|
|
528
|
+
{
|
|
529
|
+
echo "timestamp=$TS"
|
|
530
|
+
echo "fail_count=$TOTAL_FAIL"
|
|
531
|
+
echo "warn_count=$TOTAL_WARN"
|
|
532
|
+
echo "failures:"
|
|
533
|
+
grep '\[FAIL\]' "$REPORT_TXT" | head -10 | sed 's/^/ /'
|
|
534
|
+
} > "$ALERT_FILE"
|
|
535
|
+
log "ALERT: $TOTAL_FAIL failures detected"
|
|
536
|
+
else
|
|
537
|
+
rm -f "$ALERT_FILE"
|
|
538
|
+
fi
|
|
539
|
+
|
|
540
|
+
# ============================================================================
|
|
541
|
+
# CONSECUTIVE FAILURE TRACKING
|
|
542
|
+
# ============================================================================
|
|
543
|
+
# Maintain the count of consecutive runs that ended with at least one FAIL.
# The counter is reset to 0 by any run without failures; once it reaches
# MAX_FAILS an extra ALERT line is logged on every failing run.
# Globals:   FAIL_COUNT_FILE, TOTAL_FAIL, MAX_FAILS (read); FAILS (written)
track_consecutive_failures() {
  FAILS=0
  if [ -r "$FAIL_COUNT_FILE" ]; then
    IFS= read -r FAILS < "$FAIL_COUNT_FILE" || true
  fi
  # A truncated or hand-edited counter file must not break the arithmetic
  # below; anything that is not a plain integer restarts the streak.
  case "$FAILS" in
    '' | *[!0-9]*) FAILS=0 ;;
  esac

  if [ "$TOTAL_FAIL" -gt 0 ]; then
    # 10# forces base 10 so values such as "08" are not parsed as octal.
    FAILS=$(( 10#$FAILS + 1 ))
    echo "$FAILS" > "$FAIL_COUNT_FILE"
    if [ "$FAILS" -ge "$MAX_FAILS" ]; then
      log "ALERT: $FAILS consecutive runs with failures"
    fi
  else
    echo "0" > "$FAIL_COUNT_FILE"
  fi
}

track_consecutive_failures
|
|
553
|
+
|
|
554
|
+
# ============================================================================
|
|
555
|
+
# LEVEL 2 AUTO-REPAIR: Launch NEXO for intelligent diagnosis
|
|
556
|
+
# ============================================================================
|
|
557
|
+
# When this run recorded failures, hand the details of every failed
# LaunchAgent monitor to the claude CLI in a detached background job.
# A lock file's modification time enforces a cooldown between attempts.
REPAIR_LOCK="$HOME_DIR/claude/scripts/.watchdog-nexo-repair.lock"
REPAIR_COOLDOWN=1800 # 30 min between NEXO repair attempts

if [ "$TOTAL_FAIL" -gt 0 ]; then
  LOCK_AGE=999999
  SKIP_REPAIR=false
  if [ -f "$REPAIR_LOCK" ]; then
    LOCK_AGE=$(file_age "$REPAIR_LOCK")
    if [ "$LOCK_AGE" -lt "$REPAIR_COOLDOWN" ]; then
      log "NEXO repair skipped: cooldown (${LOCK_AGE}s < ${REPAIR_COOLDOWN}s)"
      SKIP_REPAIR=true
    fi
  fi

  if ! $SKIP_REPAIR; then
    # Collect failure details from tracked FAILED_MONITORS array.
    # The ${arr[@]+...} form is required: TOTAL_FAIL can be non-zero while
    # FAILED_MONITORS is empty (cron / infrastructure failures), and bash
    # older than 4.4 treats "${arr[@]}" of an empty array as an unbound
    # variable under `set -u`, aborting the whole run.
    FAIL_DETAILS=""
    for failed in ${FAILED_MONITORS[@]+"${FAILED_MONITORS[@]}"}; do
      IFS='|' read -r m_name m_plist m_stdout m_stderr m_proc m_sched m_details <<< "$failed"
      STDERR_TAIL=""
      if [ -n "$m_stderr" ] && [ -f "$m_stderr" ]; then
        STDERR_TAIL=$(tail -n 20 "$m_stderr" 2>/dev/null)
      fi
      STDOUT_TAIL=""
      if [ -n "$m_stdout" ] && [ -f "$m_stdout" ]; then
        STDOUT_TAIL=$(tail -n 10 "$m_stdout" 2>/dev/null)
      fi
      FAIL_DETAILS="${FAIL_DETAILS}
--- ${m_name} (${m_plist}) ---
Schedule: ${m_sched}
Failure reason: ${m_details}
Plist: ~/Library/LaunchAgents/${m_plist}.plist
Process grep: ${m_proc}
Stderr (last 20 lines):
${STDERR_TAIL}
Stdout (last 10 lines):
${STDOUT_TAIL}
"
    done

    if [ -n "$FAIL_DETAILS" ]; then
      # The lock directory is not guaranteed to exist; without the lock the
      # cooldown never takes effect.
      mkdir -p "${REPAIR_LOCK%/*}"
      touch "$REPAIR_LOCK"
      log "Launching NEXO Level 2 repair..."

      # The X's must be the last characters of the template: BSD/macOS
      # mktemp does not randomise a template that has a suffix after them.
      REPAIR_PROMPT_FILE=$(mktemp "${TMPDIR:-/tmp}/nexo-repair.XXXXXX") || REPAIR_PROMPT_FILE=""
      if [ -z "$REPAIR_PROMPT_FILE" ]; then
        log "NEXO repair ABORTED: could not create prompt file"
      else
        cat > "$REPAIR_PROMPT_FILE" <<NEXOPROMPT
WATCHDOG LEVEL 2 REPAIR — Automated diagnostic session.

The watchdog detected services that failed mechanical repair (launchctl bootstrap/kickstart).
Your job: diagnose the root cause and fix it. Do NOT ask the user anything — resolve autonomously.

FAILURES:
${FAIL_DETAILS}

STEPS:
1. Read the plist file to understand the service configuration
2. Check stderr/stdout logs for the actual error
3. Fix the root cause (missing file, bad config, dependency issue, etc.)
4. Reload the service and verify it is running
5. Log what you did to ~/claude/logs/watchdog-repair-result.log

CONSTRAINTS:
- Do NOT modify CLAUDE.md or any protected file
- Do NOT start interactive conversations
- Keep it under 5 minutes
- Log what you did to ~/claude/logs/watchdog-repair-result.log
NEXOPROMPT

        # Find claude CLI (may not be in PATH for cron/LaunchAgent)
        CLAUDE_BIN=$(command -v claude 2>/dev/null || echo "$HOME_DIR/.claude/local/bin/claude")
        if [ ! -x "$CLAUDE_BIN" ]; then
          CLAUDE_BIN=$(find /usr/local/bin /opt/homebrew/bin "$HOME_DIR/.local/bin" "$HOME_DIR/.npm-global/bin" -name claude -type f 2>/dev/null | head -1)
        fi

        if [ -n "$CLAUDE_BIN" ] && [ -x "$CLAUDE_BIN" ]; then
          # Paths are passed as positional arguments rather than spliced into
          # the command string, so quotes or spaces in them cannot break (or
          # inject into) the inner shell command. The job removes the prompt
          # file itself once the CLI has finished.
          nohup bash -c '"$1" --print --dangerously-skip-permissions -p "$(cat "$2")" >> "$3" 2>&1; rm -f "$2"' \
            nexo-repair "$CLAUDE_BIN" "$REPAIR_PROMPT_FILE" "$LOG_DIR/watchdog-nexo-repair.log" &
          log "NEXO repair launched (PID: $!)"
        else
          log "NEXO repair ABORTED: claude CLI not found in PATH"
          rm -f "$REPAIR_PROMPT_FILE"
        fi
      fi
    fi
  fi
fi
|
|
641
|
+
|
|
642
|
+
# ============================================================================
|
|
643
|
+
# LOG SUMMARY
|
|
644
|
+
# ============================================================================
|
|
645
|
+
log "Complete: PASS=$TOTAL_PASS HEALED=$TOTAL_HEALED WARN=$TOTAL_WARN FAIL=$TOTAL_FAIL"
|