nexo-brain 5.3.26 → 5.3.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/src/server.py +3 -0
- package/src/tools_sessions.py +6 -1
- package/src/dashboard/static/favicon 2.svg +0 -32
- package/src/dashboard/static/nexo-logo 2.png +0 -0
- package/src/dashboard/static/nexo-logo 2.svg +0 -40
- package/src/dashboard/static/style 2.css +0 -2458
- package/src/dashboard/templates/adaptive 2.html +0 -118
- package/src/dashboard/templates/artifacts 2.html +0 -133
- package/src/dashboard/templates/backups 2.html +0 -136
- package/src/dashboard/templates/base 2.html +0 -417
- package/src/dashboard/templates/calendar 2.html +0 -591
- package/src/dashboard/templates/chat 2.html +0 -356
- package/src/dashboard/templates/claims 2.html +0 -259
- package/src/dashboard/templates/cortex 2.html +0 -321
- package/src/dashboard/templates/credentials 2.html +0 -128
- package/src/dashboard/templates/crons 2.html +0 -370
- package/src/dashboard/templates/dashboard 2.html +0 -494
- package/src/dashboard/templates/dreams 2.html +0 -252
- package/src/dashboard/templates/email 2.html +0 -160
- package/src/dashboard/templates/evolution 2.html +0 -189
- package/src/dashboard/templates/feed 2.html +0 -249
- package/src/dashboard/templates/followup_health 2.html +0 -170
- package/src/dashboard/templates/graph 2.html +0 -201
- package/src/dashboard/templates/guard 2.html +0 -259
- package/src/dashboard/templates/inbox 2.html +0 -251
- package/src/dashboard/templates/memory 2.html +0 -420
- package/src/dashboard/templates/operations 2.html +0 -608
- package/src/dashboard/templates/plugins 2.html +0 -185
- package/src/dashboard/templates/protocol 2.html +0 -199
- package/src/dashboard/templates/rules 2.html +0 -246
- package/src/dashboard/templates/sentiment 2.html +0 -247
- package/src/dashboard/templates/sessions 2.html +0 -218
- package/src/dashboard/templates/skills 2.html +0 -329
- package/src/dashboard/templates/somatic 2.html +0 -73
- package/src/dashboard/templates/triggers 2.html +0 -133
- package/src/dashboard/templates/trust 2.html +0 -360
- package/src/db/__init__ 2.py +0 -259
- package/src/db/_core 2.py +0 -437
- package/src/db/_credentials 2.py +0 -124
- package/src/db/_episodic 2.py +0 -762
- package/src/db/_evolution 2.py +0 -54
- package/src/db/_fts 2.py +0 -406
- package/src/db/_goal_profiles 2.py +0 -376
- package/src/db/_hot_context 2.py +0 -660
- package/src/db/_outcomes 2.py +0 -800
- package/src/db/_personal_scripts 2.py +0 -582
- package/src/db/_sessions 2.py +0 -330
- package/src/db/_tasks 2.py +0 -91
- package/src/db/_watchers 2.py +0 -173
- package/src/doctor/formatters 2.py +0 -52
- package/src/doctor/models 2.py +0 -69
- package/src/doctor/planes 2.py +0 -87
- package/src/doctor/providers/__init__ 2.py +0 -1
- package/src/doctor/providers/deep 2.py +0 -367
- package/src/evolution_cycle 2.py +0 -519
- package/src/hooks/auto_capture 2.py +0 -208
- package/src/hooks/caffeinate-guard 2.sh +0 -8
- package/src/hooks/capture-session 2.sh +0 -21
- package/src/hooks/capture-tool-logs 2.sh +0 -158
- package/src/hooks/daily-briefing-check 2.sh +0 -33
- package/src/hooks/heartbeat-enforcement 2.py +0 -90
- package/src/hooks/heartbeat-posttool 2.sh +0 -18
- package/src/hooks/inbox-hook 2.sh +0 -76
- package/src/hooks/post-compact 2.sh +0 -152
- package/src/hooks/pre-compact 2.sh +0 -169
- package/src/hooks/protocol-guardrail 2.sh +0 -10
- package/src/hooks/protocol-pretool-guardrail 2.sh +0 -9
- package/src/hooks/session-stop 2.sh +0 -52
- package/src/kg_populate 2.py +0 -292
- package/src/maintenance 2.py +0 -53
- package/src/memory_backends 2.py +0 -71
- package/src/migrate_embeddings 2.py +0 -124
- package/src/nexo_sdk 2.py +0 -103
- package/src/observability 2.py +0 -199
- package/src/plugin_loader 2.py +0 -217
- package/src/plugins/__init__ 2.py +0 -0
- package/src/plugins/artifact_registry 2.py +0 -450
- package/src/plugins/backup 2.py +0 -127
- package/src/plugins/claims_tools 2.py +0 -119
- package/src/plugins/cognitive_memory 2.py +0 -609
- package/src/plugins/core_rules 2.py +0 -252
- package/src/plugins/cortex 2.py +0 -1155
- package/src/plugins/entities 2.py +0 -67
- package/src/plugins/episodic_memory 2.py +0 -560
- package/src/plugins/evolution 2.py +0 -167
- package/src/plugins/goal_engine 2.py +0 -142
- package/src/plugins/guard 2.py +0 -862
- package/src/plugins/impact 2.py +0 -29
- package/src/plugins/knowledge_graph_tools 2.py +0 -137
- package/src/plugins/media_memory_tools 2.py +0 -98
- package/src/plugins/memory_export 2.py +0 -196
- package/src/plugins/outcomes 2.py +0 -130
- package/src/plugins/personal_scripts 2.py +0 -117
- package/src/plugins/preferences 2.py +0 -47
- package/src/plugins/protocol 2.py +0 -1449
- package/src/plugins/simple_api 2.py +0 -106
- package/src/plugins/skills 2.py +0 -341
- package/src/plugins/state_watchers 2.py +0 -79
- package/src/plugins/update 2.py +0 -986
- package/src/plugins/user_state_tools 2.py +0 -43
- package/src/plugins/workflow 2.py +0 -588
- package/src/protocol_settings 2.py +0 -59
- package/src/public_contribution 2.py +0 -466
- package/src/public_evolution_queue 2.py +0 -241
- package/src/requirements 2.txt +0 -14
- package/src/retroactive_learnings 2.py +0 -373
- package/src/rules/__init__ 2.py +0 -0
- package/src/rules/core-rules 2.json +0 -331
- package/src/rules/migrate 2.py +0 -207
- package/src/runtime_power 2.py +0 -874
- package/src/script_registry 2.py +0 -1559
- package/src/scripts/check-context 2.py +0 -272
- package/src/scripts/deep-sleep/apply_findings 2.py +0 -2327
- package/src/scripts/deep-sleep/collect 2.py +0 -928
- package/src/scripts/deep-sleep/extract 2.py +0 -330
- package/src/scripts/deep-sleep/extract-prompt 2.md +0 -285
- package/src/scripts/deep-sleep/synthesize 2.py +0 -312
- package/src/scripts/deep-sleep/synthesize-prompt 2.md +0 -336
- package/src/scripts/nexo-agent-run 2.py +0 -75
- package/src/scripts/nexo-auto-update 2.py +0 -6
- package/src/scripts/nexo-backup 2.sh +0 -25
- package/src/scripts/nexo-brain-activation 2.sh +0 -140
- package/src/scripts/nexo-catchup 2.py +0 -300
- package/src/scripts/nexo-cognitive-decay 2.py +0 -257
- package/src/scripts/nexo-cortex-cycle 2.py +0 -293
- package/src/scripts/nexo-cron-wrapper 2.sh +0 -53
- package/src/scripts/nexo-daily-self-audit 2.py +0 -2161
- package/src/scripts/nexo-dashboard 2.sh +0 -29
- package/src/scripts/nexo-deep-sleep 2.sh +0 -86
- package/src/scripts/nexo-evolution-run 2.py +0 -1664
- package/src/scripts/nexo-followup-hygiene 2.py +0 -139
- package/src/scripts/nexo-hook-record 2.py +0 -42
- package/src/scripts/nexo-immune 2.py +0 -936
- package/src/scripts/nexo-impact-scorer 2.py +0 -117
- package/src/scripts/nexo-inbox-hook 2.sh +0 -74
- package/src/scripts/nexo-install 2.py +0 -6
- package/src/scripts/nexo-learning-housekeep 2.py +0 -401
- package/src/scripts/nexo-learning-validator 2.py +0 -266
- package/src/scripts/nexo-migrate 2.py +0 -260
- package/src/scripts/nexo-outcome-checker 2.py +0 -127
- package/src/scripts/nexo-postmortem-consolidator 2.py +0 -456
- package/src/scripts/nexo-pre-commit 2.py +0 -120
- package/src/scripts/nexo-prevent-sleep 2.sh +0 -35
- package/src/scripts/nexo-proactive-dashboard 2.py +0 -354
- package/src/scripts/nexo-reflection 2.py +0 -256
- package/src/scripts/nexo-runtime-preflight 2.py +0 -274
- package/src/scripts/nexo-sleep 2.py +0 -631
- package/src/scripts/nexo-snapshot-restore 2.sh +0 -35
- package/src/scripts/nexo-sync-clients 2.py +0 -16
- package/src/scripts/nexo-synthesis 2.py +0 -475
- package/src/scripts/nexo-tcc-approve 2.sh +0 -79
- package/src/scripts/nexo-update 2.sh +0 -306
- package/src/scripts/nexo-watchdog 2.sh +0 -1207
- package/src/scripts/nexo-watchdog-smoke 2.py +0 -119
- package/src/scripts/rehydrate_learnings_from_archive 2.py +0 -245
- package/src/server 2.py +0 -1296
- package/src/skills/run-nexo-audit-phase/guide 2.md +0 -43
- package/src/skills/run-nexo-audit-phase/skill 2.json +0 -59
- package/src/skills/run-nexo-core-fix-cycle/guide 2.md +0 -17
- package/src/skills/run-nexo-core-fix-cycle/script 2.py +0 -276
- package/src/skills/run-nexo-core-fix-cycle/skill 2.json +0 -58
- package/src/skills/run-release-final-audit/guide 2.md +0 -16
- package/src/skills/run-release-final-audit/script 2.py +0 -259
- package/src/skills/run-release-final-audit/skill 2.json +0 -77
- package/src/skills/run-runtime-doctor/guide 2.md +0 -12
- package/src/skills/run-runtime-doctor/script 2.py +0 -21
- package/src/skills/run-runtime-doctor/skill 2.json +0 -25
- package/src/skills_runtime 2.py +0 -932
- package/src/state_watchers_runtime 2.py +0 -475
- package/src/storage_router 2.py +0 -32
- package/src/system_catalog 2.py +0 -786
- package/src/tools_coordination 2.py +0 -103
- package/src/tools_credentials 2.py +0 -68
- package/src/tools_drive 2.py +0 -487
- package/src/tools_hot_context 2.py +0 -163
- package/src/tools_learnings 2.py +0 -612
- package/src/tools_menu 2.py +0 -229
- package/src/tools_reminders 2.py +0 -88
- package/src/tools_reminders_crud 2.py +0 -363
- package/src/tools_sessions 2.py +0 -1054
- package/src/tools_system_catalog 2.py +0 -19
- package/src/tools_task_history 2.py +0 -57
- package/src/tools_transcripts 2.py +0 -98
- package/src/transcript_utils 2.py +0 -412
- package/src/user_context 2.py +0 -46
- package/src/user_data_portability 2.py +0 -328
- package/src/user_state_model 2.py +0 -170
- package/templates/CLAUDE.md 2.template +0 -108
- package/templates/CODEX.AGENTS.md 2.template +0 -66
- package/templates/launchagents/README 2.md +0 -132
- package/templates/launchagents/com.nexo.auto-close-sessions 2.plist +0 -39
- package/templates/launchagents/com.nexo.catchup 2.plist +0 -39
- package/templates/launchagents/com.nexo.cognitive-decay 2.plist +0 -40
- package/templates/launchagents/com.nexo.dashboard 2.plist +0 -43
- package/templates/launchagents/com.nexo.deep-sleep 2.plist +0 -43
- package/templates/launchagents/com.nexo.evolution 2.plist +0 -44
- package/templates/launchagents/com.nexo.followup-hygiene 2.plist +0 -45
- package/templates/launchagents/com.nexo.immune 2.plist +0 -41
- package/templates/launchagents/com.nexo.postmortem 2.plist +0 -45
- package/templates/launchagents/com.nexo.self-audit 2.plist +0 -47
- package/templates/launchagents/com.nexo.synthesis 2.plist +0 -45
- package/templates/launchagents/com.nexo.watchdog 2.plist +0 -37
- package/templates/nexo_helper 2.py +0 -301
- package/templates/openclaw 2.json +0 -13
- package/templates/plugin-template 2.py +0 -40
- package/templates/script-template 2.py +0 -59
- package/templates/script-template 2.sh +0 -13
- package/templates/skill-script-template 2.py +0 -48
- package/templates/skill-template 2.md +0 -33
|
@@ -1,1207 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
# ============================================================================
|
|
3
|
-
# NEXO Watchdog — Comprehensive health monitor for all NEXO services
|
|
4
|
-
# Schedule: every 30 minutes (interval_seconds: 1800)
|
|
5
|
-
# ============================================================================
|
|
6
|
-
# Monitors ALL LaunchAgents, cron jobs, and background processes.
|
|
7
|
-
# Outputs: watchdog-status.json (machine), watchdog-report.txt (human),
|
|
8
|
-
# .watchdog-alert (if any FAIL detected)
|
|
9
|
-
# ============================================================================
|
|
10
|
-
# Strict-ish mode: unset variables are errors and a pipeline fails if any
# stage fails. `-e` is deliberately not enabled.
set -u
set -o pipefail

# === PATHS ===
# Everything lives under NEXO_HOME (overridable from the environment).
HOME_DIR="${HOME}"
NEXO_HOME="${NEXO_HOME:-${HOME}/.nexo}"
NEXO_DIR="${NEXO_HOME}"
CORTEX_DIR="${NEXO_HOME}/brain"
OPS_DIR="${NEXO_HOME}/operations"
LOG_DIR="${NEXO_HOME}/logs"
DB_PATH="${NEXO_HOME}/data/nexo.db"

# Files written by this script.
LOG="${LOG_DIR}/watchdog.log"
STATUS_JSON="${OPS_DIR}/watchdog-status.json"
REPORT_TXT="${OPS_DIR}/watchdog-report.txt"
ALERT_FILE="${OPS_DIR}/.watchdog-alert"

# Bookkeeping kept next to the installed scripts.
HASH_REGISTRY="${NEXO_HOME}/scripts/.watchdog-hashes"
FAIL_COUNT_FILE="${NEXO_HOME}/scripts/.watchdog-fails"
MAX_FAILS=3

mkdir -p "${LOG_DIR}" "${OPS_DIR}"

# One timestamp per run: every log line of this invocation shares it.
TS="$(date '+%Y-%m-%d %H:%M:%S')"
TS_EPOCH="$(date +%s)"

# Append one timestamped line to the watchdog log.
#   $1 - message text
log() {
    echo "[$TS] $1" >> "$LOG"
}
|
|
34
|
-
|
|
35
|
-
# ============================================================================
|
|
36
|
-
# MONITOR REGISTRY — generated dynamically from manifest.json
|
|
37
|
-
# ============================================================================
|
|
38
|
-
# Format: NAME|PLIST_ID|LOG_STDOUT|LOG_STDERR|MAX_STALE_SECS|PROCESS_GREP|SCHEDULE_DESC|TYPE|RECOVERY_POLICY
|
|
39
|
-
#
|
|
40
|
-
# MAX_STALE_SECS: how old stdout log can be before WARN.
|
|
41
|
-
# 0 = skip staleness check (for one-shot or infrequent tasks)
|
|
42
|
-
# WARN at MAX_STALE_SECS, FAIL at 3x MAX_STALE_SECS
|
|
43
|
-
# PROCESS_GREP: pattern to grep in ps (empty = skip process check)
|
|
44
|
-
# ============================================================================
|
|
45
|
-
# Core monitors are built from crons/manifest.json (single source of truth).
|
|
46
|
-
# The NEXO_CODE env var must point to the repo src/ directory.
|
|
47
|
-
# Add personal (non-manifest) monitors to PERSONAL_MONITORS below.
|
|
48
|
-
# NEXO_CODE defaults to the parent of the directory holding this script,
# unless the caller already exported a non-empty value.
if [ -z "${NEXO_CODE:-}" ]; then
    NEXO_CODE=$(cd "$(dirname "$0")/.." 2>/dev/null && pwd)
fi

# Manifest lookup order: NEXO_HOME (packaged install) wins over NEXO_CODE (dev/repo).
MANIFEST_FILE="$NEXO_CODE/crons/manifest.json"
if [ -f "$NEXO_HOME/crons/manifest.json" ]; then
    MANIFEST_FILE="$NEXO_HOME/crons/manifest.json"
fi
|
|
55
|
-
|
|
56
|
-
#######################################
# Print one monitor record per enabled cron found in the manifest.
#
# Record format (one per line, pipe-delimited):
#   NAME|SERVICE_ID|LOG_STDOUT|LOG_STDERR|MAX_STALE_SECS|PROCESS_GREP|SCHEDULE_DESC|TYPE|RECOVERY_POLICY
#
# Globals (read): MANIFEST_FILE, NEXO_HOME, LOG
# Outputs:        records on stdout; Python diagnostics are appended to $LOG
# Returns:        0 when the manifest is missing (a warning is logged),
#                 otherwise the exit status of the Python helper.
#
# The paths are handed to Python through argv and the program text comes from
# a quoted here-doc, so a quote or backslash inside NEXO_HOME / MANIFEST_FILE
# can no longer corrupt the Python source.
#######################################
_build_monitors_from_manifest() {
    if [ ! -f "$MANIFEST_FILE" ]; then
        log "WARNING: manifest.json not found at $MANIFEST_FILE — no core monitors loaded"
        return
    fi

    python3 - "$NEXO_HOME" "$MANIFEST_FILE" 2>>"$LOG" <<'PYEOF'
import json
import platform
import sys

nexo_home, manifest_file = sys.argv[1], sys.argv[2]
is_mac = platform.system() == 'Darwin'
optionals_file = nexo_home + '/config/optionals.json'
schedule_file = nexo_home + '/config/schedule.json'
optionals = {}
automation_default = True

with open(manifest_file) as f:
    data = json.load(f)

# Both config files are optional: any read/parse problem falls back to defaults.
try:
    with open(optionals_file) as f:
        maybe = json.load(f)
    if isinstance(maybe, dict):
        optionals = {str(k): bool(v) for k, v in maybe.items()}
except Exception:
    optionals = {}

try:
    with open(schedule_file) as f:
        schedule = json.load(f)
    if isinstance(schedule, dict):
        automation_default = bool(schedule.get('automation_enabled', True))
except Exception:
    automation_default = True

for c in data.get('crons', []):
    cid = c['id']

    # Optional crons are skipped unless enabled; the 'automation' group
    # defaults to schedule.json's automation_enabled flag, others to off.
    optional_key = c.get('optional')
    if optional_key == 'automation':
        optional_enabled = optionals.get(optional_key, automation_default)
    else:
        optional_enabled = optionals.get(optional_key, False)
    if optional_key and not optional_enabled:
        continue

    name = cid.replace('-', ' ').title()
    # Use the right service identifier per platform
    if is_mac:
        svc_id = 'com.nexo.' + cid
    else:
        svc_id = 'nexo-' + cid + '.timer'
    stdout_log = nexo_home + '/logs/' + cid + '-stdout.log'
    stderr_log = nexo_home + '/logs/' + cid + '-stderr.log'

    recovery_policy = c.get('recovery_policy')
    if not recovery_policy:
        if c.get('keep_alive') or 'interval_seconds' in c:
            recovery_policy = 'restart'
        elif 'schedule' in c:
            recovery_policy = 'catchup'
        else:
            recovery_policy = 'none'
    run_at_load = bool(
        c.get('run_at_load')
        or (c.get('run_on_boot') and 'interval_seconds' in c and not c.get('keep_alive'))
    )

    # Derive max_stale_secs and schedule_desc from schedule config
    if c.get('keep_alive'):
        max_stale = 0
        schedule_desc = 'KeepAlive'
        proc_grep = c.get('script', '').split('/')[-1]
    elif 'interval_seconds' in c:
        iv = c['interval_seconds']
        # Allow 2x the interval before WARN
        max_stale = iv * 2
        if iv >= 3600:
            schedule_desc = f'Every {iv // 3600}h'
        else:
            schedule_desc = f'Every {iv // 60} min'
        if run_at_load:
            schedule_desc += ' + boot'
        proc_grep = ''
    elif 'schedule' in c:
        s = c['schedule']
        h = s.get('hour', 0)
        m = s.get('minute', 0)
        if 'weekday' in s:
            days = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
            # launchd accepts both 0 and 7 for Sunday; without the modulo a
            # weekday of 7 raised IndexError and dropped every later monitor.
            # NOTE(review): assumes the manifest follows the launchd convention.
            schedule_desc = f'Weekly {days[s["weekday"] % 7]} {h}:{m:02d}'
            max_stale = 0  # weekly tasks: skip staleness
        else:
            schedule_desc = f'Daily {h}:{m:02d}'
            max_stale = 90000  # ~25h
        proc_grep = ''
    elif run_at_load:
        max_stale = 0
        schedule_desc = 'RunAtLoad once'
        proc_grep = ''
    else:
        max_stale = 0
        schedule_desc = 'unknown'
        proc_grep = ''

    mon_type = 'core' if c.get('core') else 'personal'

    print(f'{name}|{svc_id}|{stdout_log}|{stderr_log}|{max_stale}|{proc_grep}|{schedule_desc}|{mon_type}|{recovery_policy}')
PYEOF
    local rc=$?
    if [ "$rc" -ne 0 ]; then
        # Previously every failure was discarded via 2>/dev/null, leaving a
        # silently shortened monitor list.
        log "WARNING: failed to build monitors from $MANIFEST_FILE (python3 exit $rc) — monitor list may be incomplete"
    fi
    return "$rc"
}
|
|
160
|
-
|
|
161
|
-
# Core monitors: every non-empty record printed by the manifest reader.
MONITORS=()
while IFS= read -r line; do
    if [ -n "$line" ]; then
        MONITORS+=("$line")
    fi
done < <(_build_monitors_from_manifest)

# Personal (non-manifest) monitors — add yours below.
# These are NOT in manifest.json and won't be synced by cron-sync.
PERSONAL_MONITORS=(
    # "My Service|com.nexo.my-service|$NEXO_HOME/logs/my-service.log||3600||Every 30 min|personal"
)
# Expansion kept exactly as written so the empty-array behaviour under
# `set -u` is unchanged.
MONITORS+=("${PERSONAL_MONITORS[@]+"${PERSONAL_MONITORS[@]}"}")

# Cron jobs to check (NAME|SCRIPT|CHECK_PATH|MAX_STALE_SECS|SCHEDULE)
# Core cron monitors are loaded from manifest above.
# Maintainer-only monitors go here (guarded by NEXO_MAINTAINER env var).
CRON_MONITORS=()
case "${NEXO_MAINTAINER:-}" in
    1)
        CRON_MONITORS+=("Backup|$NEXO_DIR/scripts/nexo-backup.sh|$NEXO_DIR/backups/|7200|Hourly")
        ;;
esac

# Error patterns to search in stderr logs (last 50 lines)
ERROR_PATTERNS="Traceback|Error:|CRITICAL|FATAL|ModuleNotFoundError|PermissionError|FileNotFoundError|ConnectionRefused|Errno|Operation not permitted|SyntaxError|sqlite3\\.OperationalError"
|
|
185
|
-
|
|
186
|
-
# ============================================================================
|
|
187
|
-
# HELPER FUNCTIONS
|
|
188
|
-
# ============================================================================
|
|
189
|
-
|
|
190
|
-
# Numeric uid of the invoking user; used to address the per-user launchd domain.
UID_NUM="$(id -u)"
REPAIR_LOG="${LOG_DIR}/watchdog-repairs.log"
TOTAL_HEALED=0

# IS_MACOS holds the literal command name `true` or `false`, so callers can
# write `if $IS_MACOS; then ...`.
case "$(uname)" in
    Darwin) IS_MACOS=true ;;
    *)      IS_MACOS=false ;;
esac

# Record a repair action in both the dedicated repair log and the main log.
#   $1 - description of the repair
log_repair() {
    echo "[$TS] REPAIR: $1" >> "$REPAIR_LOG"
    log "REPAIR: $1"
}
|
|
197
|
-
|
|
198
|
-
# Report whether a service is registered with the platform's service manager.
#   $1 - launchd label (macOS) or systemd timer unit name (Linux)
# Returns the exit status of the underlying query; all output is discarded.
is_loaded() {
    local unit="$1"
    if ! $IS_MACOS; then
        # On Linux, "loaded" means the user-level systemd timer is enabled.
        systemctl --user is-enabled "$unit" &>/dev/null
        return
    fi
    launchctl print "gui/$UID_NUM/$unit" &>/dev/null
}
|
|
206
|
-
|
|
207
|
-
# ============================================================================
|
|
208
|
-
# AUTO-REPAIR FUNCTIONS
|
|
209
|
-
# ============================================================================
|
|
210
|
-
|
|
211
|
-
# Attempt a first-level repair of a launchd agent (macOS only).
#   $1 - launchd label; its plist is expected in ~/Library/LaunchAgents
#   $2 - process pattern for long-running services (empty = no process check)
# Returns 0 only when a repair was performed and confirmed, 1 otherwise
# (including on non-macOS hosts and when nothing needed repairing).
try_repair_launchagent() {
    if ! $IS_MACOS; then
        return 1
    fi

    local label="$1"
    local pattern="$2"
    local agent_plist="$HOME_DIR/Library/LaunchAgents/${label}.plist"
    local domain="gui/$UID_NUM"

    # Repair 1: agent is not loaded — bootstrap it from its plist, if present.
    if ! is_loaded "$label"; then
        [ -f "$agent_plist" ] || return 1
        launchctl bootstrap "$domain" "$agent_plist" 2>/dev/null
        sleep 1
        is_loaded "$label" || return 1
        log_repair "$label: bootstrapped successfully"
        return 0
    fi

    # Repair 2: agent is loaded but its process is gone — kickstart it.
    [ -n "$pattern" ] || return 1
    if process_running "$pattern"; then
        return 1
    fi
    launchctl kickstart "$domain/$label" 2>/dev/null
    sleep 2
    process_running "$pattern" || return 1
    log_repair "$label: kickstarted process '$pattern'"
    return 0
}
|
|
242
|
-
|
|
243
|
-
# Attempt a first-level repair of a user-level systemd timer (non-macOS only).
#   $1 - timer unit name, e.g. nexo-foo.timer
# Returns 0 only when a repair was performed and confirmed, 1 otherwise
# (including on macOS and when the timer was already enabled and active).
try_repair_systemd() {
    $IS_MACOS && return 1
    local timer_unit="$1"
    # The previously derived "${timer_unit%.timer}.service" local was never
    # read in this function and has been removed.

    # Repair 1: Timer not enabled — try to enable and start
    if ! systemctl --user is-enabled "$timer_unit" &>/dev/null; then
        systemctl --user daemon-reload 2>/dev/null
        systemctl --user enable --now "$timer_unit" 2>/dev/null
        sleep 1
        if systemctl --user is-enabled "$timer_unit" &>/dev/null; then
            log_repair "$timer_unit: enabled and started"
            return 0
        fi
        return 1
    fi

    # Repair 2: Timer enabled but not active — start it
    if ! systemctl --user is-active "$timer_unit" &>/dev/null; then
        systemctl --user start "$timer_unit" 2>/dev/null
        sleep 1
        if systemctl --user is-active "$timer_unit" &>/dev/null; then
            log_repair "$timer_unit: restarted"
            return 0
        fi
    fi

    return 1
}
|
|
272
|
-
|
|
273
|
-
# Repair a cron script that exists but has lost its execute bit.
#   $1 - path to the script
# Returns 0 when the file was made executable, 1 when there was nothing to
# fix (missing file, already executable) or chmod had no effect.
try_repair_cron() {
    local target="$1"

    # Only a regular file without the execute bit is repairable here.
    if [ ! -f "$target" ] || [ -x "$target" ]; then
        return 1
    fi

    chmod +x "$target"
    [ -x "$target" ] || return 1
    log_repair "$target: made executable"
    return 0
}
|
|
287
|
-
|
|
288
|
-
# Run a scheduled job immediately because its scheduled run was missed.
#   $1 - service id: launchd label on macOS, systemd timer unit on Linux
# macOS: `launchctl kickstart -k`, output appended to watchdog-reexec.log.
# Linux: starts the .service unit that corresponds to the given .timer.
# Returns 0 when the start command succeeded, 1 otherwise.
try_reexecute_missed_cron() {
    local svc_id="$1"

    if ! $IS_MACOS; then
        # Linux: start the corresponding service unit directly
        local unit="${svc_id%.timer}.service"
        log "Re-executing missed cron: $svc_id → systemctl start $unit"
        if ! systemctl --user start "$unit" 2>/dev/null; then
            log "Re-execute failed for $svc_id"
            return 1
        fi
        log_repair "$svc_id: re-executed via systemctl start $unit"
        return 0
    fi

    log "Re-executing missed cron via launchctl kickstart: $svc_id"
    if ! launchctl kickstart -k "gui/$UID_NUM/$svc_id" >> "$LOG_DIR/watchdog-reexec.log" 2>&1; then
        log "Re-execute failed for $svc_id"
        return 1
    fi
    log_repair "$svc_id: re-executed missed cron via launchctl kickstart"
    return 0
}
|
|
312
|
-
|
|
313
|
-
# Set to true once a catch-up run has been triggered successfully, so that
# later requests in the same watchdog run become no-ops.
CATCHUP_REQUESTED=false

# Trigger the catch-up job at most once per watchdog run.
# Returns 0 when a catch-up was already requested or was started now,
# 1 when starting it failed (a later call will try again).
try_request_catchup() {
    $CATCHUP_REQUESTED && return 0

    local catchup_svc="nexo-catchup.timer"
    if $IS_MACOS; then
        catchup_svc="com.nexo.catchup"
    fi

    try_reexecute_missed_cron "$catchup_svc" || return 1
    CATCHUP_REQUESTED=true
    return 0
}
|
|
330
|
-
|
|
331
|
-
# After a Level 2 repair, confirm that the service looks healthy.
#   $1 - service id (launchd label / systemd timer)
#   $2 - stdout log path (may be empty)
#   $3 - process pattern for long-running services (empty = scheduled job)
#   $4 - monitor type, defaults to "core"
# Returns 1 when the service is not loaded, or when a process pattern was
# given and no matching process appears within 30 seconds; 0 otherwise.
try_verify_repair() {
    local plist_id="$1"
    local log_stdout="$2"
    local proc_grep="$3"
    local mon_type="${4:-core}"
    local max_wait=30

    log "Verifying repair for $plist_id..."

    # Check 1: the service must be registered at all.
    if ! is_loaded "$plist_id"; then
        log "Verify FAILED: $plist_id still not loaded"
        return 1
    fi

    # Check 2: long-running services — poll every 5s until max_wait for the process.
    if [ -n "$proc_grep" ]; then
        local waited
        for (( waited = 0; waited < max_wait; waited += 5 )); do
            if process_running "$proc_grep"; then
                log "Verify OK: $plist_id process running after ${waited}s"
                return 0
            fi
            sleep 5
        done
        log "Verify FAILED: $plist_id process not running after ${max_wait}s"
        return 1
    fi

    # Check 3: core scheduled crons — a cron_runs record younger than 5 minutes.
    if [ "$mon_type" = "core" ]; then
        local cron_id run_info run_age
        cron_id=$(cron_id_from_service "$plist_id")
        run_info=$(cron_last_run_info "$cron_id" || true)
        if [ -n "$run_info" ]; then
            # Only the first field (age in seconds) is needed; the rest is discarded.
            IFS='|' read -r run_age _ <<< "$run_info"
            if [ -n "$run_age" ] && [ "$run_age" -lt 300 ]; then
                log "Verify OK: $plist_id cron_runs updated ${run_age}s ago"
                return 0
            fi
        fi
    fi

    # Otherwise accept a stdout log touched within the last 5 minutes.
    if [ -n "$log_stdout" ] && [ -f "$log_stdout" ]; then
        local age
        age=$(file_age "$log_stdout")
        if [ "$age" -lt 300 ]; then
            log "Verify OK: $plist_id log updated ${age}s ago"
            return 0
        fi
    fi

    # For a scheduled service, being loaded is considered sufficient.
    log "Verify OK: $plist_id is loaded (scheduled service)"
    return 0
}
|
|
391
|
-
|
|
392
|
-
# Run the core backup script and confirm it produced a fresh backup.
#
# Globals (read): NEXO_DIR, SCRIPT_DIR (optional)
# Returns 0 when the newest "$NEXO_DIR/backups/nexo-*.db" is under 60 seconds
# old after the run, 1 otherwise (no executable backup script, no backup
# file, or only stale backups).
try_repair_backup() {
    # Use the core backup script (nexo-backup.sh)
    local backup_script="$NEXO_DIR/scripts/nexo-backup.sh"
    if [ ! -x "$backup_script" ]; then
        # SCRIPT_DIR is not assigned in the visible part of this script, and a
        # bare "$SCRIPT_DIR" aborts the whole run under `set -u` when unset.
        # Fall back to the directory this script lives in.
        local fallback_dir="${SCRIPT_DIR:-}"
        if [ -z "$fallback_dir" ]; then
            fallback_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd)
        fi
        backup_script="$fallback_dir/nexo-backup.sh"
    fi
    [ -x "$backup_script" ] || return 1

    bash "$backup_script" 2>/dev/null
    sleep 1

    # Pick the most recently modified backup via globbing instead of parsing
    # `ls -t` output.
    local candidate newest=""
    for candidate in "$NEXO_DIR/backups/nexo-"*.db; do
        [ -e "$candidate" ] || continue   # unmatched glob stays literal
        if [ -z "$newest" ] || [ "$candidate" -nt "$newest" ]; then
            newest="$candidate"
        fi
    done
    [ -n "$newest" ] || return 1

    # file_age already handles the macOS/Linux stat difference and a failing stat.
    local age
    age=$(file_age "$newest")
    if [ "$age" -lt 60 ]; then
        log_repair "nexo-backup.sh: ran successfully, fresh backup created"
        return 0
    fi
    return 1
}
|
|
411
|
-
|
|
412
|
-
# Print how many seconds before TS_EPOCH a file was last modified.
#   $1 - path
# Prints 999999 when the path is not a regular file. If stat fails, the mtime
# is taken as 0, so the printed value is TS_EPOCH itself.
file_age() {
    local path="$1"
    local mtime

    if [ ! -f "$path" ]; then
        echo 999999
        return
    fi

    # BSD stat (macOS) and GNU stat use different flags for the mtime epoch.
    if $IS_MACOS; then
        mtime=$(stat -f %m "$path" 2>/dev/null || echo 0)
    else
        mtime=$(stat -c %Y "$path" 2>/dev/null || echo 0)
    fi
    echo $(( TS_EPOCH - mtime ))
}
|
|
425
|
-
|
|
426
|
-
# Render an age in seconds as a short human-readable string.
#   $1 - age in seconds; values >= 999999 are printed as "never"
# Prints one of: "never", "Nd Nh ago", "Nh Nm ago", "Nm ago", "Ns ago".
format_age() {
    local secs=$1

    if [ "$secs" -ge 999999 ]; then
        echo "never"
        return
    fi
    if [ "$secs" -ge 86400 ]; then
        local days=$((secs / 86400))
        local hours=$((secs % 86400 / 3600))
        echo "${days}d ${hours}h ago"
        return
    fi
    if [ "$secs" -ge 3600 ]; then
        local hrs=$((secs / 3600))
        local mins=$((secs % 3600 / 60))
        echo "${hrs}h ${mins}m ago"
        return
    fi
    if [ "$secs" -ge 60 ]; then
        echo "$((secs / 60))m ago"
        return
    fi
    echo "${secs}s ago"
}
|
|
440
|
-
|
|
441
|
-
# Count lines matching ERROR_PATTERNS among the last 50 lines of a log file.
#   $1 - log file path
# Prints the count; prints 0 for a missing or empty file.
check_errors() {
    local logfile="$1"

    if [ ! -f "$logfile" ] || [ ! -s "$logfile" ]; then
        echo 0
        return
    fi

    local count
    # grep -c exits non-zero when nothing matches; that is not an error here.
    count=$(tail -50 "$logfile" 2>/dev/null | grep -cE "$ERROR_PATTERNS" 2>/dev/null) || true
    echo "${count:-0}"
}
|
|
451
|
-
|
|
452
|
-
# Succeed when a process whose full command line matches the pattern exists.
#   $1 - pattern passed to `pgrep -f`
# An empty pattern means "no process check requested" and counts as success.
process_running() {
    [ -z "$1" ] && return 0
    pgrep -f "$1" > /dev/null 2>&1
}
|
|
459
|
-
|
|
460
|
-
# Derive the manifest cron id from a platform service identifier.
#   $1 - launchd label (com.nexo.<id>) on macOS,
#        systemd timer unit (nexo-<id>.timer) elsewhere
# Prints the id with the platform prefix/suffix removed; an identifier that
# does not carry them is printed unchanged.
cron_id_from_service() {
    local svc_id="$1"
    local cron_id

    if $IS_MACOS; then
        cron_id="${svc_id#com.nexo.}"
    else
        # Pure parameter expansion: no subshell + sed per monitor.
        cron_id="${svc_id#nexo-}"
        cron_id="${cron_id%.timer}"
    fi
    # printf, unlike echo, never treats the value as an option (e.g. "-n").
    printf '%s\n' "$cron_id"
}
|
|
468
|
-
|
|
469
|
-
# Print the most recent cron_runs row for a cron id.
#   $1 - cron id
# Output (single row, pipe-separated):
#   age_secs|started_at|ended_at|exit_code|error|summary
# Returns 1 when the database file does not exist; otherwise the exit status
# of sqlite3 (its stderr is discarded). Prints nothing when no row matches.
cron_last_run_info() {
    local cron_id="$1"
    [ ! -f "$DB_PATH" ] && return 1

    # The id is embedded in a SQL string literal, so double every single quote.
    # Without this, an id containing ' breaks the statement or alters the query.
    local quote="'"
    local cron_id_sql="${cron_id//$quote/$quote$quote}"

    sqlite3 -separator '|' "$DB_PATH" "
        SELECT
            CAST(strftime('%s','now') - strftime('%s', started_at) AS INTEGER) AS age_secs,
            COALESCE(started_at, ''),
            COALESCE(ended_at, ''),
            COALESCE(exit_code, ''),
            COALESCE(error, ''),
            COALESCE(summary, '')
        FROM cron_runs
        WHERE cron_id = '$cron_id_sql'
        ORDER BY id DESC
        LIMIT 1;
    " 2>/dev/null
}
|
|
486
|
-
|
|
487
|
-
# Classify the most likely failure cause from the last 50 lines of a log.
#   $1 - log file path
# Prints one of "tcc", "dependency", "syntax", "schema" (first match in that
# order), or nothing when the file is missing/empty or no marker is found.
# Always returns 0.
classify_log_issue() {
    local logfile="$1"
    if [ ! -f "$logfile" ] || [ ! -s "$logfile" ]; then
        return 0
    fi

    local tail_text
    tail_text=$(tail -n 50 "$logfile" 2>/dev/null || true)

    # Literal substring matching in the shell. The previous
    # `echo "$tail_text" | grep -q` form can report "no match" under
    # `set -o pipefail`: grep -q exits at the first hit and echo may then die
    # of SIGPIPE, failing the pipeline. `case` also matches the '.' in
    # sqlite3.OperationalError literally rather than as a regex wildcard.
    case "$tail_text" in
        *"Operation not permitted"*)  echo "tcc" ;;
        *"ModuleNotFoundError"*)      echo "dependency" ;;
        *"SyntaxError"*)              echo "syntax" ;;
        *"sqlite3.OperationalError"*) echo "schema" ;;
    esac
    return 0
}
|
|
504
|
-
|
|
505
|
-
# Escape strings for JSON
|
|
506
|
-
# Escape a string for embedding inside a double-quoted JSON string value.
#   $1 - raw text
# Backslashes and double quotes are backslash-escaped; tabs, carriage returns
# and newlines each become a single space, since raw control characters are
# not valid inside JSON strings.
# The output keeps the single trailing space the previous echo|sed|tr
# pipeline produced, so existing output stays byte-compatible.
json_escape() {
    local text="$1"
    text=${text//\\/\\\\}      # \  ->  \\   (must run before quotes are escaped)
    text=${text//\"/\\\"}      # "  ->  \"
    text=${text//$'\t'/ }
    text=${text//$'\r'/ }
    text=${text//$'\n'/ }
    # printf instead of echo: values such as "-n" or "-e" are data, not options.
    printf '%s ' "$text"
}
|
|
509
|
-
|
|
510
|
-
# ============================================================================
|
|
511
|
-
# RUN CHECKS
|
|
512
|
-
# ============================================================================
|
|
513
|
-
|
|
514
|
-
TOTAL_PASS=0
TOTAL_WARN=0
TOTAL_FAIL=0
JSON_AGENTS=""
REPORT_LINES=""
FAILED_MONITORS=()  # Track failed monitors for Level 2 repair

# Evaluate every MONITORS entry. Each entry is a '|'-separated record:
#   name|plist_id|log_stdout|log_stderr|max_stale|proc_grep|schedule|mon_type|recovery_policy
# For each one this loop runs four checks (service loaded, process alive,
# staleness, stderr errors), attempts mechanical repair where a helper exists,
# and appends the result to JSON_AGENTS, REPORT_LINES and the TOTAL_* counters.
for monitor in "${MONITORS[@]}"; do
    # Skip comment lines
    [[ "$monitor" =~ ^[[:space:]]*# ]] && continue
    IFS='|' read -r name plist_id log_stdout log_stderr max_stale proc_grep schedule mon_type recovery_policy <<< "$monitor"
    mon_type="${mon_type:-core}"
    recovery_policy="${recovery_policy:-restart}"
    # An empty max_stale field would make every "-gt" test below error out;
    # treat it as 0, which disables the staleness thresholds.
    max_stale="${max_stale:-0}"

    status="PASS"
    details=""
    loaded="unknown"
    stale_age="n/a"
    error_count=0
    proc_alive="n/a"
    error_kind=""
    cron_id=$(cron_id_from_service "$plist_id")
    latest_run_has_record=false
    latest_run_failed=false

    # Check 1: Service loaded? (launchd on macOS, systemd on Linux)
    if is_loaded "$plist_id"; then
        loaded="yes"
    else
        loaded="no"
        # AUTO-REPAIR: try platform-appropriate repair
        repair_ok=false
        if $IS_MACOS; then
            try_repair_launchagent "$plist_id" "$proc_grep" && repair_ok=true
        else
            try_repair_systemd "$plist_id" && repair_ok=true
        fi
        if $repair_ok; then
            loaded="yes"
            status="HEALED"
            details="${details}Self-healed: service re-registered. "
            TOTAL_HEALED=$((TOTAL_HEALED + 1))
        else
            status="FAIL"
            details="${details}Service not loaded (repair failed). "
        fi
    fi

    # Check 2: Process alive? (only for KeepAlive / long-running)
    if [ -n "$proc_grep" ]; then
        if process_running "$proc_grep"; then
            proc_alive="yes"
        else
            proc_alive="no"
            # AUTO-REPAIR: try to kickstart (platform-appropriate)
            if [ "$status" != "FAIL" ] && [ "$status" != "HEALED" ]; then
                if ($IS_MACOS && try_repair_launchagent "$plist_id" "$proc_grep") || \
                   (! $IS_MACOS && try_repair_systemd "$plist_id"); then
                    proc_alive="yes"
                    status="HEALED"
                    details="${details}Self-healed: kickstarted. "
                    TOTAL_HEALED=$((TOTAL_HEALED + 1))
                else
                    status="WARN"
                    details="${details}Process '$proc_grep' not running (repair failed). "
                fi
            elif [ "$status" = "HEALED" ]; then
                # Already healed by bootstrap, check if process came up
                sleep 1
                if process_running "$proc_grep"; then
                    proc_alive="yes"
                else
                    details="${details}Process '$proc_grep' still not running after bootstrap. "
                fi
            fi
        fi
    fi

    # Check 3: Staleness + AUTO RE-EXECUTE missed crons
    if [ "$mon_type" = "core" ] && [ "$max_stale" -gt 0 ]; then
        # Core monitors are judged by their latest cron_runs record.
        run_info=$(cron_last_run_info "$cron_id" || true)
        if [ -n "$run_info" ]; then
            latest_run_has_record=true
            IFS='|' read -r age _ _ last_exit last_error last_summary <<< "$run_info"
            age="${age:-999999}"
            stale_age=$(format_age "$age")
            if [ -n "$last_exit" ] && [ "$last_exit" != "0" ]; then
                latest_run_failed=true
                status="FAIL"
                details="${details}Last run exited ${last_exit}. "
                [ -n "$last_error" ] && details="${details}Error: ${last_error}. "
            fi
            if [ "$age" -gt $(( max_stale * 3 )) ]; then
                if [ "$recovery_policy" = "catchup" ]; then
                    if try_request_catchup; then
                        status="HEALED"
                        details="${details}Self-healed: requested catchup for missed window (last run: $stale_age). "
                        TOTAL_HEALED=$((TOTAL_HEALED + 1))
                    else
                        status="FAIL"
                        details="${details}cron_runs stale: $stale_age (limit: $(format_age "$max_stale")). Catchup request failed. "
                    fi
                else
                    if try_reexecute_missed_cron "$plist_id"; then
                        status="HEALED"
                        details="${details}Self-healed: re-executed missed cron (last run: $stale_age). "
                        TOTAL_HEALED=$((TOTAL_HEALED + 1))
                    else
                        status="FAIL"
                        details="${details}cron_runs stale: $stale_age (limit: $(format_age "$max_stale")). Re-execute failed. "
                    fi
                fi
            elif [ "$age" -gt "$max_stale" ]; then
                [ "$status" = "PASS" ] && status="WARN"
                details="${details}cron_runs slightly stale: $stale_age. "
            elif [ -z "$details" ] && [ -n "$last_summary" ]; then
                details="${details}Last run summary: ${last_summary}. "
            fi
        else
            stale_age="no cron_runs entry"
            if [ "$recovery_policy" = "catchup" ]; then
                if try_request_catchup; then
                    status="HEALED"
                    details="${details}Self-healed: requested catchup for missing cron_runs entry. "
                    TOTAL_HEALED=$((TOTAL_HEALED + 1))
                else
                    status="FAIL"
                    details="${details}No cron_runs entry recorded yet and catchup request failed. "
                fi
            else
                if try_reexecute_missed_cron "$plist_id"; then
                    status="HEALED"
                    details="${details}Self-healed: executed missing cron for first run. "
                    TOTAL_HEALED=$((TOTAL_HEALED + 1))
                else
                    status="FAIL"
                    details="${details}No cron_runs entry recorded yet. "
                fi
            fi
        fi
    elif [ -n "$log_stdout" ] && [ "$max_stale" -gt 0 ]; then
        # Non-core monitors are judged by the age of their stdout log.
        age=$(file_age "$log_stdout")
        stale_age=$(format_age "$age")
        if [ "$age" -gt $(( max_stale * 3 )) ]; then
            # Severely stale — try to re-execute the missed cron
            if try_reexecute_missed_cron "$plist_id"; then
                status="HEALED"
                details="${details}Self-healed: re-executed missed cron (was stale: $stale_age). "
                TOTAL_HEALED=$((TOTAL_HEALED + 1))
            else
                status="FAIL"
                details="${details}Log stale: $stale_age (limit: $(format_age "$max_stale")). Re-execute failed. "
            fi
        elif [ "$age" -gt "$max_stale" ]; then
            [ "$status" = "PASS" ] && status="WARN"
            details="${details}Log slightly stale: $stale_age. "
        fi
    elif [ -n "$log_stdout" ]; then
        # No staleness limit: only report the log age, never change status.
        if [ -f "$log_stdout" ]; then
            age=$(file_age "$log_stdout")
            stale_age=$(format_age "$age")
        else
            stale_age="no log file"
        fi
    fi

    # Check 4: Errors in stderr log
    if [ -n "$log_stderr" ]; then
        consider_stderr=true
        # A loaded core service whose latest recorded run succeeded is not
        # re-judged on its stderr log.
        if [ "$mon_type" = "core" ] && $latest_run_has_record && ! $latest_run_failed && [ "$loaded" = "yes" ]; then
            consider_stderr=false
        fi
        if $consider_stderr; then
            error_count=$(check_errors "$log_stderr")
            # An empty count would break the numeric test below and produce
            # invalid JSON ("stderr_errors":,) further down.
            error_count="${error_count:-0}"
            error_kind=$(classify_log_issue "$log_stderr" || true)
            if [ "$error_count" -gt 5 ]; then
                [ "$status" = "PASS" ] && status="WARN"
                details="${details}${error_count} errors in recent stderr. "
            fi
            case "$error_kind" in
                tcc)
                    status="FAIL"
                    details="${details}Recent stderr shows macOS TCC/Sandbox denial ('Operation not permitted'). "
                    ;;
                dependency)
                    [ "$status" = "PASS" ] && status="WARN"
                    details="${details}Recent stderr shows missing Python dependency. "
                    ;;
                syntax)
                    status="FAIL"
                    details="${details}Recent stderr shows syntax error. "
                    ;;
                schema)
                    status="FAIL"
                    details="${details}Recent stderr shows DB/schema mismatch. "
                    ;;
            esac
        fi
    fi

    [ -z "$details" ] && details="All checks passed"

    # HEALED counts as PASS for overall status
    case "$status" in
        PASS|HEALED)
            TOTAL_PASS=$((TOTAL_PASS + 1))
            ;;
        WARN)
            TOTAL_WARN=$((TOTAL_WARN + 1))
            ;;
        FAIL)
            TOTAL_FAIL=$((TOTAL_FAIL + 1))
            FAILED_MONITORS+=("${name}|${plist_id}|${log_stdout}|${log_stderr}|${proc_grep}|${schedule}|${mon_type}|${details}")
            ;;
    esac

    # JSON
    escaped_details=$(json_escape "$details")
    json_item=" {\"name\":\"$name\",\"plist\":\"$plist_id\",\"status\":\"$status\",\"type\":\"$mon_type\",\"loaded\":\"$loaded\",\"process\":\"$proc_alive\",\"last_activity\":\"$stale_age\",\"stderr_errors\":$error_count,\"schedule\":\"$schedule\",\"details\":\"$escaped_details\"}"
    if [ -n "$JSON_AGENTS" ]; then
        JSON_AGENTS="${JSON_AGENTS},
${json_item}"
    else
        JSON_AGENTS="$json_item"
    fi

    # Report
    case "$status" in
        PASS) icon="PASS" ;;
        HEALED) icon="HEAL" ;;
        WARN) icon="WARN" ;;
        FAIL) icon="FAIL" ;;
        *) icon="$status" ;;
    esac
    REPORT_LINES="${REPORT_LINES} [${icon}] ${name} (${schedule})
Loaded: ${loaded} | Process: ${proc_alive} | Last: ${stale_age} | Errors: ${error_count}
${details}
"
done
|
|
741
|
-
|
|
742
|
-
# --- Cron job checks ---
|
|
743
|
-
CRON_JSON=""
CRON_REPORT=""
# Evaluate every CRON_MONITORS entry. Each entry is a '|'-separated record:
#   name|script|check_path|max_stale|schedule
# check_path may be a directory (its newest entry is inspected) or a file.
for cron_entry in ${CRON_MONITORS[@]+"${CRON_MONITORS[@]}"}; do
    IFS='|' read -r name script check_path max_stale schedule <<< "$cron_entry"

    c_status="PASS"
    c_details=""
    age_str="n/a"

    # Check script exists and is executable
    if [ ! -x "$script" ]; then
        # AUTO-REPAIR: try chmod
        if try_repair_cron "$script"; then
            c_status="HEALED"
            c_details="Self-healed: made executable. "
            TOTAL_HEALED=$((TOTAL_HEALED + 1))
        else
            c_status="FAIL"
            c_details="Script not executable or missing (repair failed). "
        fi
    fi

    # Check output freshness
    if [ -d "$check_path" ]; then
        newest=$(ls -t "$check_path" 2>/dev/null | head -1)
        if [ -n "$newest" ]; then
            # Join with an explicit slash so a directory configured without a
            # trailing "/" still yields a valid path.
            age=$(file_age "${check_path%/}/${newest}")
            age_str=$(format_age "$age")
            if [ "$age" -gt $(( max_stale * 3 )) ]; then
                c_status="FAIL"
                c_details="${c_details}Output stale: $age_str. "
            elif [ "$age" -gt "$max_stale" ]; then
                [ "$c_status" = "PASS" ] && c_status="WARN"
                c_details="${c_details}Output slightly stale: $age_str. "
            fi
        else
            c_status="WARN"
            c_details="${c_details}No output files found. "
            age_str="no files"
        fi
    elif [ -f "$check_path" ]; then
        age=$(file_age "$check_path")
        age_str=$(format_age "$age")
        if [ "$age" -gt $(( max_stale * 3 )) ]; then
            c_status="FAIL"
            c_details="${c_details}Output stale: $age_str. "
        elif [ "$age" -gt "$max_stale" ]; then
            [ "$c_status" = "PASS" ] && c_status="WARN"
            c_details="${c_details}Output slightly stale: $age_str. "
        fi
    fi

    [ -z "$c_details" ] && c_details="All checks passed"

    # HEALED counts as PASS in the totals.
    case "$c_status" in
        PASS|HEALED) TOTAL_PASS=$((TOTAL_PASS + 1)) ;;
        WARN) TOTAL_WARN=$((TOTAL_WARN + 1)) ;;
        FAIL) TOTAL_FAIL=$((TOTAL_FAIL + 1)) ;;
    esac

    escaped_details=$(json_escape "$c_details")
    cron_item=" {\"name\":\"$name\",\"script\":\"$script\",\"status\":\"$c_status\",\"last_output\":\"$age_str\",\"schedule\":\"$schedule\",\"details\":\"$escaped_details\"}"
    if [ -n "$CRON_JSON" ]; then
        CRON_JSON="${CRON_JSON},
${cron_item}"
    else
        CRON_JSON="$cron_item"
    fi

    case "$c_status" in
        PASS) icon="PASS" ;;
        HEALED) icon="HEAL" ;;
        WARN) icon="WARN" ;;
        FAIL) icon="FAIL" ;;
    esac
    CRON_REPORT="${CRON_REPORT} [${icon}] ${name} (${schedule})
Last output: ${age_str}
${c_details}
"
done
|
|
816
|
-
|
|
817
|
-
# ============================================================================
|
|
818
|
-
# INFRASTRUCTURE CHECKS
|
|
819
|
-
# ============================================================================
|
|
820
|
-
|
|
821
|
-
# --- SQLite integrity ---
|
|
822
|
-
# Run PRAGMA integrity_check on the main database. Any result other than "ok"
# (including sqlite3 itself failing, reported as CORRUPT) counts as a failure
# and triggers a restore from the newest nexo-*.db backup, if one exists.
SQLITE_STATUS="PASS"
SQLITE_DETAIL=""
INTEGRITY=$(sqlite3 "$NEXO_DIR/data/nexo.db" "PRAGMA integrity_check;" 2>/dev/null || echo "CORRUPT")
if [ "$INTEGRITY" != "ok" ]; then
    SQLITE_STATUS="FAIL"
    SQLITE_DETAIL="Integrity check: $INTEGRITY"
    log "CRITICAL: SQLite integrity check failed: $INTEGRITY"
    TOTAL_FAIL=$((TOTAL_FAIL + 1))
    LATEST_BACKUP=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
    if [ -n "$LATEST_BACKUP" ]; then
        # Only claim a restore when the copy actually succeeded.
        if cp "$LATEST_BACKUP" "$NEXO_DIR/data/nexo.db"; then
            # A -wal/-shm pair left over from the damaged database must not be
            # paired with the restored file.
            rm -f "$NEXO_DIR/data/nexo.db-wal" "$NEXO_DIR/data/nexo.db-shm"
            log "RESTORED from $LATEST_BACKUP"
            SQLITE_DETAIL="${SQLITE_DETAIL}. Restored from backup."
        else
            log "CRITICAL: restore from $LATEST_BACKUP failed"
            SQLITE_DETAIL="${SQLITE_DETAIL}. Restore from backup failed."
        fi
    fi
else
    SQLITE_DETAIL="Integrity OK"
    TOTAL_PASS=$((TOTAL_PASS + 1))
fi
|
|
840
|
-
|
|
841
|
-
# --- Immutable file integrity ---
|
|
842
|
-
# Compare every file listed in HASH_REGISTRY ("path|sha256" per line) with its
# recorded hash. A mismatching file is restored from the newest snapshot when
# a copy exists there. Any mismatch disables Evolution in the objective file;
# a clean run re-enables it, but only if this watchdog was what disabled it.
IMMUTABLE_STATUS="PASS"
IMMUTABLE_DETAIL=""
OBJECTIVE="$CORTEX_DIR/evolution-objective.json"
if [ -f "$HASH_REGISTRY" ]; then
    TAMPERED=0
    # "|| [ -n "$filepath" ]" keeps a final line that has no trailing newline.
    while IFS='|' read -r filepath expected_hash || [ -n "$filepath" ]; do
        if [ -f "$filepath" ]; then
            # shasum is not installed everywhere; fall back to the coreutils
            # sha256sum so a missing tool does not flag every file as modified.
            if command -v shasum >/dev/null 2>&1; then
                ACTUAL=$(shasum -a 256 "$filepath" | cut -d' ' -f1)
            else
                ACTUAL=$(sha256sum "$filepath" | cut -d' ' -f1)
            fi
            if [ "$ACTUAL" != "$expected_hash" ]; then
                TAMPERED=$((TAMPERED + 1))
                log "CRITICAL: Immutable file modified: $filepath"
                LATEST_SNAP=$(ls -td "$NEXO_HOME/snapshots/"*/ 2>/dev/null | head -1)
                # HOME_DIR is quoted inside the expansion so it is stripped as
                # a literal prefix rather than interpreted as a glob pattern.
                snap_rel="${filepath#"$HOME_DIR"/}"
                if [ -n "$LATEST_SNAP" ] && [ -f "${LATEST_SNAP}files/${snap_rel}" ]; then
                    cp "${LATEST_SNAP}files/${snap_rel}" "$filepath"
                    log "RESTORED immutable file from snapshot"
                fi
            fi
        fi
    done < "$HASH_REGISTRY"

    if [ "$TAMPERED" -gt 0 ]; then
        IMMUTABLE_STATUS="FAIL"
        IMMUTABLE_DETAIL="$TAMPERED immutable files tampered"
        TOTAL_FAIL=$((TOTAL_FAIL + 1))
        if [ -f "$OBJECTIVE" ]; then
            # The path is passed as an argument instead of being pasted into
            # the Python source, so quotes in the path cannot break the code.
            if python3 -c "
import json, sys
path = sys.argv[1]
with open(path) as f:
    d = json.load(f)
d['evolution_enabled'] = False
d['disabled_reason'] = 'Immutable file tampered — watchdog disabled Evolution'
with open(path, 'w') as f:
    json.dump(d, f, indent=2)
" "$OBJECTIVE" 2>/dev/null; then
                log "DISABLED Evolution due to immutable file tampering"
            else
                log "WARNING: failed to disable Evolution in $OBJECTIVE"
            fi
        fi
    else
        IMMUTABLE_DETAIL="All files intact"
        TOTAL_PASS=$((TOTAL_PASS + 1))
        if [ -f "$OBJECTIVE" ]; then
            # Re-enable only when the stored reason carries this watchdog's
            # marker text; an unreadable objective file is left untouched.
            if python3 -c "
import json, sys
from pathlib import Path
obj = Path(sys.argv[1])
try:
    data = json.loads(obj.read_text())
except Exception:
    raise SystemExit(0)
reason = data.get('disabled_reason', '') or ''
if data.get('evolution_enabled') is False and 'watchdog disabled Evolution' in reason:
    data['evolution_enabled'] = True
    data.pop('disabled_reason', None)
    obj.write_text(json.dumps(data, indent=2, ensure_ascii=False))
    print('REENABLED')
" "$OBJECTIVE" 2>/dev/null | grep -q "REENABLED"; then
                log "REENABLED Evolution after immutable integrity recovered"
            fi
        fi
    fi
else
    IMMUTABLE_DETAIL="No hash registry (skipped)"
    TOTAL_PASS=$((TOTAL_PASS + 1))
fi
|
|
900
|
-
|
|
901
|
-
# --- Backup freshness ---
|
|
902
|
-
# Judge backup freshness by the modification time of the newest nexo-*.db
# backup. No backup at all is a FAIL. A backup older than 7200 seconds
# triggers try_repair_backup and becomes HEALED or WARN depending on its result.
BACKUP_STATUS="PASS"
BACKUP_DETAIL=""
LATEST_BACKUP=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
if [ -z "$LATEST_BACKUP" ]; then
    BACKUP_STATUS="FAIL"
    BACKUP_DETAIL="No backups found"
    TOTAL_FAIL=$((TOTAL_FAIL + 1))
else
    # stat takes different flags on macOS (-f %m) and Linux (-c %Y).
    if $IS_MACOS; then
        BACKUP_AGE=$(( TS_EPOCH - $(stat -f %m "$LATEST_BACKUP") ))
    else
        BACKUP_AGE=$(( TS_EPOCH - $(stat -c %Y "$LATEST_BACKUP") ))
    fi
    BACKUP_AGE_STR=$(format_age "$BACKUP_AGE")

    if [ "$BACKUP_AGE" -gt 7200 ]; then
        # AUTO-REPAIR: run backup now
        if try_repair_backup; then
            BACKUP_STATUS="HEALED"
            BACKUP_DETAIL="Self-healed: backup was stale ($BACKUP_AGE_STR), ran fresh backup"
            TOTAL_HEALED=$((TOTAL_HEALED + 1))
            TOTAL_PASS=$((TOTAL_PASS + 1))
        else
            BACKUP_STATUS="WARN"
            BACKUP_DETAIL="Last backup: $BACKUP_AGE_STR (>2h, repair failed)"
            TOTAL_WARN=$((TOTAL_WARN + 1))
        fi
    else
        BACKUP_DETAIL="Last backup: $BACKUP_AGE_STR"
        TOTAL_PASS=$((TOTAL_PASS + 1))
    fi
fi
|
|
929
|
-
|
|
930
|
-
# --- Cognitive DB check ---
|
|
931
|
-
# Integrity check for cognitive.db: a missing file is a WARN; any
# integrity_check result other than "ok" is a FAIL (sqlite3 errors are
# reported as CORRUPT).
COG_STATUS="PASS"
COG_DETAIL=""
COG_DB="$NEXO_DIR/data/cognitive.db"
if [ ! -f "$COG_DB" ]; then
    COG_STATUS="WARN"
    COG_DETAIL="cognitive.db not found"
    TOTAL_WARN=$((TOTAL_WARN + 1))
else
    COG_INT=$(sqlite3 "$COG_DB" "PRAGMA integrity_check;" 2>/dev/null || echo "CORRUPT")
    if [ "$COG_INT" = "ok" ]; then
        COG_DETAIL="Integrity OK"
        TOTAL_PASS=$((TOTAL_PASS + 1))
    else
        COG_STATUS="FAIL"
        COG_DETAIL="Cognitive DB integrity: $COG_INT"
        TOTAL_FAIL=$((TOTAL_FAIL + 1))
    fi
fi
|
|
949
|
-
|
|
950
|
-
# ============================================================================
|
|
951
|
-
# WRITE JSON STATUS
|
|
952
|
-
# ============================================================================
|
|
953
|
-
# Derive the overall verdict (FAIL outranks WARN, which outranks PASS) and
# write the machine-readable status document to STATUS_JSON.
TOTAL=$((TOTAL_PASS + TOTAL_WARN + TOTAL_FAIL))
if [ "$TOTAL_FAIL" -gt 0 ]; then
    OVERALL="FAIL"
elif [ "$TOTAL_WARN" -gt 0 ]; then
    OVERALL="WARN"
else
    OVERALL="PASS"
fi

cat <<JSONEOF > "$STATUS_JSON"
{
"timestamp": "$TS",
"summary": {
"total": $TOTAL,
"pass": $TOTAL_PASS,
"warn": $TOTAL_WARN,
"fail": $TOTAL_FAIL,
"healed": $TOTAL_HEALED,
"overall": "$OVERALL"
},
"launch_agents": [
$JSON_AGENTS
],
"cron_jobs": [
$CRON_JSON
],
"infrastructure": {
"sqlite": {"status": "$SQLITE_STATUS", "detail": "$(json_escape "$SQLITE_DETAIL")"},
"cognitive_db": {"status": "$COG_STATUS", "detail": "$(json_escape "$COG_DETAIL")"},
"immutable_files": {"status": "$IMMUTABLE_STATUS", "detail": "$(json_escape "$IMMUTABLE_DETAIL")"},
"backups": {"status": "$BACKUP_STATUS", "detail": "$(json_escape "$BACKUP_DETAIL")"}
}
}
JSONEOF
|
|
983
|
-
|
|
984
|
-
# ============================================================================
|
|
985
|
-
# WRITE HUMAN-READABLE REPORT
|
|
986
|
-
# ============================================================================
|
|
987
|
-
# Write the human-readable report to REPORT_TXT.
cat <<REPORTEOF > "$REPORT_TXT"
======================================================
NEXO WATCHDOG REPORT — $TS
======================================================
PASS: $TOTAL_PASS | HEALED: $TOTAL_HEALED | WARN: $TOTAL_WARN | FAIL: $TOTAL_FAIL | TOTAL: $TOTAL
OVERALL: $OVERALL
======================================================

-- LaunchAgents (${#MONITORS[@]}) ---------------------
$REPORT_LINES
-- Cron Jobs ------------------------------------------
$CRON_REPORT
-- Infrastructure -------------------------------------
[$SQLITE_STATUS] SQLite nexo.db: $SQLITE_DETAIL
[$COG_STATUS] Cognitive DB: $COG_DETAIL
[$IMMUTABLE_STATUS] Immutable Files: $IMMUTABLE_DETAIL
[$BACKUP_STATUS] Backups: $BACKUP_DETAIL

-- End of Report --------------------------------------
REPORTEOF

# ============================================================================
# ALERT FILE
# ============================================================================
# The alert file exists only while at least one check is failing. It holds
# the counters plus up to ten [FAIL] lines taken from the report written above.
if [ "$TOTAL_FAIL" -gt 0 ]; then
    {
        printf 'timestamp=%s\n' "$TS"
        printf 'fail_count=%s\n' "$TOTAL_FAIL"
        printf 'warn_count=%s\n' "$TOTAL_WARN"
        printf 'failures:\n'
        grep '\[FAIL\]' "$REPORT_TXT" | head -10 | sed 's/^/ /'
    } > "$ALERT_FILE"
    log "ALERT: $TOTAL_FAIL failures detected"
else
    rm -f "$ALERT_FILE"
fi
|
|
1023
|
-
|
|
1024
|
-
# ============================================================================
|
|
1025
|
-
# CONSECUTIVE FAILURE TRACKING + NOTIFICATION
|
|
1026
|
-
# ============================================================================
|
|
1027
|
-
# Persist the number of consecutive runs that ended with at least one FAIL.
# The counter file is reset to 0 by the first clean run; reaching MAX_FAILS
# only produces log lines here (the notification hook is left to the user).
FAILS=$(cat "$FAIL_COUNT_FILE" 2>/dev/null || echo 0)
if [ "$TOTAL_FAIL" -gt 0 ]; then
    FAILS=$((FAILS + 1))
    printf '%s\n' "$FAILS" > "$FAIL_COUNT_FILE"
    if [ "$FAILS" -ge "$MAX_FAILS" ]; then
        log "ALERT: $FAILS consecutive runs with failures"
        # Configure your own notification method here (optional)
        # Example: send email, Slack webhook, desktop notification, etc.
        log "NOTIFICATION: $FAILS consecutive failures ($TOTAL_FAIL items FAIL)"
    fi
else
    printf '%s\n' "0" > "$FAIL_COUNT_FILE"
fi
|
|
1040
|
-
|
|
1041
|
-
# ============================================================================
|
|
1042
|
-
# LEVEL 2 AUTO-REPAIR: Launch NEXO for intelligent diagnosis
|
|
1043
|
-
# ============================================================================
|
|
1044
|
-
# Only triggers if: (a) there are FAILs after mechanical repair, (b) no NEXO
|
|
1045
|
-
# repair is already running, (c) no interactive session is active (avoid conflict)
|
|
1046
|
-
# Level 2 repair: when FAILs remain after mechanical repair, hand the failure
# details to an automation backend (the agent runner script, or the claude CLI
# as fallback) running in the background, then verify each failed monitor once
# it finishes. REPAIR_LOCK's modification time enforces a cooldown between
# attempts.
REPAIR_LOCK="$NEXO_HOME/scripts/.watchdog-nexo-repair.lock"
REPAIR_COOLDOWN=1800  # 30 min between NEXO repair attempts

if [ "$TOTAL_FAIL" -gt 0 ]; then
    # Check cooldown — don't spam NEXO invocations
    LOCK_AGE=999999
    SKIP_REPAIR=false
    if [ -f "$REPAIR_LOCK" ]; then
        if $IS_MACOS; then
            LOCK_AGE=$(( TS_EPOCH - $(stat -f %m "$REPAIR_LOCK" 2>/dev/null || echo 0) ))
        else
            LOCK_AGE=$(( TS_EPOCH - $(stat -c %Y "$REPAIR_LOCK" 2>/dev/null || echo 0) ))
        fi
        if [ "$LOCK_AGE" -lt "$REPAIR_COOLDOWN" ]; then
            log "NEXO repair skipped: cooldown (${LOCK_AGE}s < ${REPAIR_COOLDOWN}s)"
            SKIP_REPAIR=true
        fi
    fi

    if ! $SKIP_REPAIR; then
        # Collect failure details from tracked FAILED_MONITORS array
        FAIL_DETAILS=""
        HAS_CORE_FAILS=false
        for failed in ${FAILED_MONITORS[@]+"${FAILED_MONITORS[@]}"}; do
            IFS='|' read -r m_name m_plist m_stdout m_stderr m_proc m_sched m_type m_details <<< "$failed"
            STDERR_TAIL=""
            if [ -n "$m_stderr" ] && [ -f "$m_stderr" ]; then
                STDERR_TAIL=$(tail -n 20 "$m_stderr" 2>/dev/null)
            fi
            STDOUT_TAIL=""
            if [ -n "$m_stdout" ] && [ -f "$m_stdout" ]; then
                STDOUT_TAIL=$(tail -n 10 "$m_stdout" 2>/dev/null)
            fi
            [ "$m_type" = "core" ] && HAS_CORE_FAILS=true
            # The "~" is literal text for the prompt, not a path to expand.
            if $IS_MACOS; then
                m_service_config="~/Library/LaunchAgents/${m_plist}.plist"
            else
                m_service_config="~/.config/systemd/user/${m_plist}"
            fi
            FAIL_DETAILS="${FAIL_DETAILS}
--- ${m_name} (${m_plist}) [${m_type}] ---
Schedule: ${m_sched}
Type: ${m_type}
Failure reason: ${m_details}
Service config: ${m_service_config}
Process grep: ${m_proc}
Stderr (last 20 lines):
${STDERR_TAIL}
Stdout (last 10 lines):
${STDOUT_TAIL}
"
        done

        # Only launch if we actually have fail details
        if [ -n "$FAIL_DETAILS" ]; then
            touch "$REPAIR_LOCK"
            log "Launching NEXO Level 2 repair..."

            # Build propagation instructions if core services failed
            # Only runs when NEXO_MAINTAINER=1 and NEXO_PUBLIC_REPO is configured
            PROPAGATE_BLOCK=""
            if [ "${NEXO_MAINTAINER:-}" = "1" ]; then
                NEXO_PUBLIC_REPO="${NEXO_PUBLIC_REPO:-}"
                if $HAS_CORE_FAILS && [ -n "$NEXO_PUBLIC_REPO" ] && [ -d "$NEXO_PUBLIC_REPO/.git" ]; then
                    PROPAGATE_BLOCK="
PROPAGATION (for [core] fixes ONLY):
If your fix modifies a file under $NEXO_HOME/ (server.py, db/, plugins/, scripts/):
1. Commit the fix locally with a descriptive message
2. Copy the changed files (sanitized — no personal data) to $NEXO_PUBLIC_REPO/src/
3. Bump patch version in $NEXO_PUBLIC_REPO/package.json
4. Commit + push
5. Create a GitHub release with gh release create
Do NOT propagate fixes for [personal] services — those stay local only."
                fi
            fi

            # Write prompt to temp file (avoids heredoc quoting issues in subshell)
            REPAIR_PROMPT_FILE=$(mktemp /tmp/nexo-repair-XXXXXXXX)
            cat > "$REPAIR_PROMPT_FILE" <<NEXOPROMPT
WATCHDOG LEVEL 2 REPAIR — Automated diagnostic session.

The watchdog detected services that failed mechanical repair (launchctl/systemctl re-registration).
Your job: diagnose the root cause and fix it. Do NOT ask the user anything — resolve autonomously.

Each failure is tagged [core] or [personal]:
- [core] = part of NEXO public package — fix may need propagation to public repo
- [personal] = user-specific service — fix stays local only

FAILURES:
${FAIL_DETAILS}

STEPS:
1. Read the service config (plist on macOS, systemd unit on Linux) to understand the service
2. Check stderr/stdout logs for the actual error
3. Fix the root cause (missing file, bad config, dependency issue, etc.)
4. Reload the service and verify it is running (launchctl on macOS, systemctl on Linux)
5. Log what you did to $NEXO_HOME/logs/watchdog-repair-result.log
${PROPAGATE_BLOCK}

CONSTRAINTS:
- Do NOT modify CLAUDE.md, AGENTS.md, or any protected file
- Do NOT start interactive conversations
- Keep it under 5 minutes
- Log what you did to $NEXO_HOME/logs/watchdog-repair-result.log
NEXOPROMPT

            # Launch NEXO in background with the configured automation backend.
            # Keep the hardened Claude fallback for older runtimes or partial installs.
            AGENT_RUNNER="$NEXO_HOME/scripts/nexo-agent-run.py"
            NEXO_PYTHON="$NEXO_HOME/.venv/bin/python3"
            if [ ! -x "$NEXO_PYTHON" ]; then
                NEXO_PYTHON=$(command -v python3 2>/dev/null || echo "python3")
            fi
            REPAIR_LOG="$LOG_DIR/watchdog-nexo-repair.log"

            # REPAIR_PID stays empty unless a backend was really started, so
            # an aborted launch is never logged as launched or waited on.
            REPAIR_PID=""
            if [ -f "$AGENT_RUNNER" ]; then
                # Paths are passed as positional parameters rather than pasted
                # into the command string, so spaces or quotes in them are safe.
                nohup bash -c '"$1" "$2" --prompt-file "$3" >> "$4" 2>&1; rm -f "$3"' \
                    nexo-repair "$NEXO_PYTHON" "$AGENT_RUNNER" "$REPAIR_PROMPT_FILE" "$REPAIR_LOG" &
                REPAIR_PID=$!
            else
                CLAUDE_BIN=$(command -v claude 2>/dev/null || echo "$HOME_DIR/.claude/local/bin/claude")
                if [ ! -x "$CLAUDE_BIN" ]; then
                    # find exits non-zero when a search root does not exist;
                    # "|| true" keeps that from aborting under set -e/pipefail.
                    CLAUDE_BIN=$(find /usr/local/bin /opt/homebrew/bin "$HOME_DIR/.local/bin" "$HOME_DIR/.npm-global/bin" -name claude -type f 2>/dev/null | head -1 || true)
                fi

                if [ -n "$CLAUDE_BIN" ] && [ -x "$CLAUDE_BIN" ]; then
                    nohup bash -c '"$1" --print --dangerously-skip-permissions -p "$(cat "$2")" >> "$3" 2>&1; rm -f "$2"' \
                        nexo-repair "$CLAUDE_BIN" "$REPAIR_PROMPT_FILE" "$REPAIR_LOG" &
                    REPAIR_PID=$!
                else
                    log "NEXO repair ABORTED: no automation backend wrapper and no claude CLI fallback found"
                    rm -f "$REPAIR_PROMPT_FILE"
                fi
            fi

            if [ -n "$REPAIR_PID" ]; then
                log "NEXO repair launched (PID: $REPAIR_PID)"

                # Wait for repair to complete (max 5 min) then verify
                (
                    wait_count=0
                    while kill -0 "$REPAIR_PID" 2>/dev/null && [ "$wait_count" -lt 60 ]; do
                        sleep 5
                        wait_count=$((wait_count + 1))
                    done

                    if [ "$wait_count" -ge 60 ]; then
                        log "NEXO repair timed out after 5 min"
                        kill "$REPAIR_PID" 2>/dev/null
                        # The killed wrapper never reaches its own cleanup.
                        rm -f "$REPAIR_PROMPT_FILE"
                    else
                        log "NEXO repair completed. Verifying fixes..."
                        # Verify each failed monitor
                        VERIFY_PASS=0
                        VERIFY_FAIL=0
                        for failed in ${FAILED_MONITORS[@]+"${FAILED_MONITORS[@]}"}; do
                            IFS='|' read -r v_name v_plist v_stdout v_stderr v_proc v_sched v_type v_details <<< "$failed"
                            if try_verify_repair "$v_plist" "$v_stdout" "$v_proc" "$v_type"; then
                                VERIFY_PASS=$((VERIFY_PASS + 1))
                                log "VERIFY OK: $v_name"
                            else
                                VERIFY_FAIL=$((VERIFY_FAIL + 1))
                                log "VERIFY FAIL: $v_name — still broken after repair"
                            fi
                        done
                        log "Post-repair verification: $VERIFY_PASS passed, $VERIFY_FAIL failed"
                        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Verification: $VERIFY_PASS OK, $VERIFY_FAIL FAIL" >> "$REPAIR_LOG"
                    fi
                ) &
            fi
        fi
    fi
fi
|
|
1203
|
-
|
|
1204
|
-
# ============================================================================
|
|
1205
|
-
# LOG SUMMARY
|
|
1206
|
-
# ============================================================================
|
|
1207
|
-
# Final one-line summary of this run's counters, written through log().
log "Complete: PASS=$TOTAL_PASS HEALED=$TOTAL_HEALED WARN=$TOTAL_WARN FAIL=$TOTAL_FAIL"
|