nexo-brain 5.3.26 → 5.3.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/package.json +1 -1
  3. package/src/server.py +3 -0
  4. package/src/tools_sessions.py +6 -1
  5. package/src/dashboard/static/favicon 2.svg +0 -32
  6. package/src/dashboard/static/nexo-logo 2.png +0 -0
  7. package/src/dashboard/static/nexo-logo 2.svg +0 -40
  8. package/src/dashboard/static/style 2.css +0 -2458
  9. package/src/dashboard/templates/adaptive 2.html +0 -118
  10. package/src/dashboard/templates/artifacts 2.html +0 -133
  11. package/src/dashboard/templates/backups 2.html +0 -136
  12. package/src/dashboard/templates/base 2.html +0 -417
  13. package/src/dashboard/templates/calendar 2.html +0 -591
  14. package/src/dashboard/templates/chat 2.html +0 -356
  15. package/src/dashboard/templates/claims 2.html +0 -259
  16. package/src/dashboard/templates/cortex 2.html +0 -321
  17. package/src/dashboard/templates/credentials 2.html +0 -128
  18. package/src/dashboard/templates/crons 2.html +0 -370
  19. package/src/dashboard/templates/dashboard 2.html +0 -494
  20. package/src/dashboard/templates/dreams 2.html +0 -252
  21. package/src/dashboard/templates/email 2.html +0 -160
  22. package/src/dashboard/templates/evolution 2.html +0 -189
  23. package/src/dashboard/templates/feed 2.html +0 -249
  24. package/src/dashboard/templates/followup_health 2.html +0 -170
  25. package/src/dashboard/templates/graph 2.html +0 -201
  26. package/src/dashboard/templates/guard 2.html +0 -259
  27. package/src/dashboard/templates/inbox 2.html +0 -251
  28. package/src/dashboard/templates/memory 2.html +0 -420
  29. package/src/dashboard/templates/operations 2.html +0 -608
  30. package/src/dashboard/templates/plugins 2.html +0 -185
  31. package/src/dashboard/templates/protocol 2.html +0 -199
  32. package/src/dashboard/templates/rules 2.html +0 -246
  33. package/src/dashboard/templates/sentiment 2.html +0 -247
  34. package/src/dashboard/templates/sessions 2.html +0 -218
  35. package/src/dashboard/templates/skills 2.html +0 -329
  36. package/src/dashboard/templates/somatic 2.html +0 -73
  37. package/src/dashboard/templates/triggers 2.html +0 -133
  38. package/src/dashboard/templates/trust 2.html +0 -360
  39. package/src/db/__init__ 2.py +0 -259
  40. package/src/db/_core 2.py +0 -437
  41. package/src/db/_credentials 2.py +0 -124
  42. package/src/db/_episodic 2.py +0 -762
  43. package/src/db/_evolution 2.py +0 -54
  44. package/src/db/_fts 2.py +0 -406
  45. package/src/db/_goal_profiles 2.py +0 -376
  46. package/src/db/_hot_context 2.py +0 -660
  47. package/src/db/_outcomes 2.py +0 -800
  48. package/src/db/_personal_scripts 2.py +0 -582
  49. package/src/db/_sessions 2.py +0 -330
  50. package/src/db/_tasks 2.py +0 -91
  51. package/src/db/_watchers 2.py +0 -173
  52. package/src/doctor/formatters 2.py +0 -52
  53. package/src/doctor/models 2.py +0 -69
  54. package/src/doctor/planes 2.py +0 -87
  55. package/src/doctor/providers/__init__ 2.py +0 -1
  56. package/src/doctor/providers/deep 2.py +0 -367
  57. package/src/evolution_cycle 2.py +0 -519
  58. package/src/hooks/auto_capture 2.py +0 -208
  59. package/src/hooks/caffeinate-guard 2.sh +0 -8
  60. package/src/hooks/capture-session 2.sh +0 -21
  61. package/src/hooks/capture-tool-logs 2.sh +0 -158
  62. package/src/hooks/daily-briefing-check 2.sh +0 -33
  63. package/src/hooks/heartbeat-enforcement 2.py +0 -90
  64. package/src/hooks/heartbeat-posttool 2.sh +0 -18
  65. package/src/hooks/inbox-hook 2.sh +0 -76
  66. package/src/hooks/post-compact 2.sh +0 -152
  67. package/src/hooks/pre-compact 2.sh +0 -169
  68. package/src/hooks/protocol-guardrail 2.sh +0 -10
  69. package/src/hooks/protocol-pretool-guardrail 2.sh +0 -9
  70. package/src/hooks/session-stop 2.sh +0 -52
  71. package/src/kg_populate 2.py +0 -292
  72. package/src/maintenance 2.py +0 -53
  73. package/src/memory_backends 2.py +0 -71
  74. package/src/migrate_embeddings 2.py +0 -124
  75. package/src/nexo_sdk 2.py +0 -103
  76. package/src/observability 2.py +0 -199
  77. package/src/plugin_loader 2.py +0 -217
  78. package/src/plugins/__init__ 2.py +0 -0
  79. package/src/plugins/artifact_registry 2.py +0 -450
  80. package/src/plugins/backup 2.py +0 -127
  81. package/src/plugins/claims_tools 2.py +0 -119
  82. package/src/plugins/cognitive_memory 2.py +0 -609
  83. package/src/plugins/core_rules 2.py +0 -252
  84. package/src/plugins/cortex 2.py +0 -1155
  85. package/src/plugins/entities 2.py +0 -67
  86. package/src/plugins/episodic_memory 2.py +0 -560
  87. package/src/plugins/evolution 2.py +0 -167
  88. package/src/plugins/goal_engine 2.py +0 -142
  89. package/src/plugins/guard 2.py +0 -862
  90. package/src/plugins/impact 2.py +0 -29
  91. package/src/plugins/knowledge_graph_tools 2.py +0 -137
  92. package/src/plugins/media_memory_tools 2.py +0 -98
  93. package/src/plugins/memory_export 2.py +0 -196
  94. package/src/plugins/outcomes 2.py +0 -130
  95. package/src/plugins/personal_scripts 2.py +0 -117
  96. package/src/plugins/preferences 2.py +0 -47
  97. package/src/plugins/protocol 2.py +0 -1449
  98. package/src/plugins/simple_api 2.py +0 -106
  99. package/src/plugins/skills 2.py +0 -341
  100. package/src/plugins/state_watchers 2.py +0 -79
  101. package/src/plugins/update 2.py +0 -986
  102. package/src/plugins/user_state_tools 2.py +0 -43
  103. package/src/plugins/workflow 2.py +0 -588
  104. package/src/protocol_settings 2.py +0 -59
  105. package/src/public_contribution 2.py +0 -466
  106. package/src/public_evolution_queue 2.py +0 -241
  107. package/src/requirements 2.txt +0 -14
  108. package/src/retroactive_learnings 2.py +0 -373
  109. package/src/rules/__init__ 2.py +0 -0
  110. package/src/rules/core-rules 2.json +0 -331
  111. package/src/rules/migrate 2.py +0 -207
  112. package/src/runtime_power 2.py +0 -874
  113. package/src/script_registry 2.py +0 -1559
  114. package/src/scripts/check-context 2.py +0 -272
  115. package/src/scripts/deep-sleep/apply_findings 2.py +0 -2327
  116. package/src/scripts/deep-sleep/collect 2.py +0 -928
  117. package/src/scripts/deep-sleep/extract 2.py +0 -330
  118. package/src/scripts/deep-sleep/extract-prompt 2.md +0 -285
  119. package/src/scripts/deep-sleep/synthesize 2.py +0 -312
  120. package/src/scripts/deep-sleep/synthesize-prompt 2.md +0 -336
  121. package/src/scripts/nexo-agent-run 2.py +0 -75
  122. package/src/scripts/nexo-auto-update 2.py +0 -6
  123. package/src/scripts/nexo-backup 2.sh +0 -25
  124. package/src/scripts/nexo-brain-activation 2.sh +0 -140
  125. package/src/scripts/nexo-catchup 2.py +0 -300
  126. package/src/scripts/nexo-cognitive-decay 2.py +0 -257
  127. package/src/scripts/nexo-cortex-cycle 2.py +0 -293
  128. package/src/scripts/nexo-cron-wrapper 2.sh +0 -53
  129. package/src/scripts/nexo-daily-self-audit 2.py +0 -2161
  130. package/src/scripts/nexo-dashboard 2.sh +0 -29
  131. package/src/scripts/nexo-deep-sleep 2.sh +0 -86
  132. package/src/scripts/nexo-evolution-run 2.py +0 -1664
  133. package/src/scripts/nexo-followup-hygiene 2.py +0 -139
  134. package/src/scripts/nexo-hook-record 2.py +0 -42
  135. package/src/scripts/nexo-immune 2.py +0 -936
  136. package/src/scripts/nexo-impact-scorer 2.py +0 -117
  137. package/src/scripts/nexo-inbox-hook 2.sh +0 -74
  138. package/src/scripts/nexo-install 2.py +0 -6
  139. package/src/scripts/nexo-learning-housekeep 2.py +0 -401
  140. package/src/scripts/nexo-learning-validator 2.py +0 -266
  141. package/src/scripts/nexo-migrate 2.py +0 -260
  142. package/src/scripts/nexo-outcome-checker 2.py +0 -127
  143. package/src/scripts/nexo-postmortem-consolidator 2.py +0 -456
  144. package/src/scripts/nexo-pre-commit 2.py +0 -120
  145. package/src/scripts/nexo-prevent-sleep 2.sh +0 -35
  146. package/src/scripts/nexo-proactive-dashboard 2.py +0 -354
  147. package/src/scripts/nexo-reflection 2.py +0 -256
  148. package/src/scripts/nexo-runtime-preflight 2.py +0 -274
  149. package/src/scripts/nexo-sleep 2.py +0 -631
  150. package/src/scripts/nexo-snapshot-restore 2.sh +0 -35
  151. package/src/scripts/nexo-sync-clients 2.py +0 -16
  152. package/src/scripts/nexo-synthesis 2.py +0 -475
  153. package/src/scripts/nexo-tcc-approve 2.sh +0 -79
  154. package/src/scripts/nexo-update 2.sh +0 -306
  155. package/src/scripts/nexo-watchdog 2.sh +0 -1207
  156. package/src/scripts/nexo-watchdog-smoke 2.py +0 -119
  157. package/src/scripts/rehydrate_learnings_from_archive 2.py +0 -245
  158. package/src/server 2.py +0 -1296
  159. package/src/skills/run-nexo-audit-phase/guide 2.md +0 -43
  160. package/src/skills/run-nexo-audit-phase/skill 2.json +0 -59
  161. package/src/skills/run-nexo-core-fix-cycle/guide 2.md +0 -17
  162. package/src/skills/run-nexo-core-fix-cycle/script 2.py +0 -276
  163. package/src/skills/run-nexo-core-fix-cycle/skill 2.json +0 -58
  164. package/src/skills/run-release-final-audit/guide 2.md +0 -16
  165. package/src/skills/run-release-final-audit/script 2.py +0 -259
  166. package/src/skills/run-release-final-audit/skill 2.json +0 -77
  167. package/src/skills/run-runtime-doctor/guide 2.md +0 -12
  168. package/src/skills/run-runtime-doctor/script 2.py +0 -21
  169. package/src/skills/run-runtime-doctor/skill 2.json +0 -25
  170. package/src/skills_runtime 2.py +0 -932
  171. package/src/state_watchers_runtime 2.py +0 -475
  172. package/src/storage_router 2.py +0 -32
  173. package/src/system_catalog 2.py +0 -786
  174. package/src/tools_coordination 2.py +0 -103
  175. package/src/tools_credentials 2.py +0 -68
  176. package/src/tools_drive 2.py +0 -487
  177. package/src/tools_hot_context 2.py +0 -163
  178. package/src/tools_learnings 2.py +0 -612
  179. package/src/tools_menu 2.py +0 -229
  180. package/src/tools_reminders 2.py +0 -88
  181. package/src/tools_reminders_crud 2.py +0 -363
  182. package/src/tools_sessions 2.py +0 -1054
  183. package/src/tools_system_catalog 2.py +0 -19
  184. package/src/tools_task_history 2.py +0 -57
  185. package/src/tools_transcripts 2.py +0 -98
  186. package/src/transcript_utils 2.py +0 -412
  187. package/src/user_context 2.py +0 -46
  188. package/src/user_data_portability 2.py +0 -328
  189. package/src/user_state_model 2.py +0 -170
  190. package/templates/CLAUDE.md 2.template +0 -108
  191. package/templates/CODEX.AGENTS.md 2.template +0 -66
  192. package/templates/launchagents/README 2.md +0 -132
  193. package/templates/launchagents/com.nexo.auto-close-sessions 2.plist +0 -39
  194. package/templates/launchagents/com.nexo.catchup 2.plist +0 -39
  195. package/templates/launchagents/com.nexo.cognitive-decay 2.plist +0 -40
  196. package/templates/launchagents/com.nexo.dashboard 2.plist +0 -43
  197. package/templates/launchagents/com.nexo.deep-sleep 2.plist +0 -43
  198. package/templates/launchagents/com.nexo.evolution 2.plist +0 -44
  199. package/templates/launchagents/com.nexo.followup-hygiene 2.plist +0 -45
  200. package/templates/launchagents/com.nexo.immune 2.plist +0 -41
  201. package/templates/launchagents/com.nexo.postmortem 2.plist +0 -45
  202. package/templates/launchagents/com.nexo.self-audit 2.plist +0 -47
  203. package/templates/launchagents/com.nexo.synthesis 2.plist +0 -45
  204. package/templates/launchagents/com.nexo.watchdog 2.plist +0 -37
  205. package/templates/nexo_helper 2.py +0 -301
  206. package/templates/openclaw 2.json +0 -13
  207. package/templates/plugin-template 2.py +0 -40
  208. package/templates/script-template 2.py +0 -59
  209. package/templates/script-template 2.sh +0 -13
  210. package/templates/skill-script-template 2.py +0 -48
  211. package/templates/skill-template 2.md +0 -33
@@ -1,1207 +0,0 @@
1
- #!/bin/bash
2
- # ============================================================================
3
- # NEXO Watchdog — Comprehensive health monitor for all NEXO services
4
- # Schedule: every 30 minutes (interval_seconds: 1800)
5
- # ============================================================================
6
- # Monitors ALL LaunchAgents, cron jobs, and background processes.
7
- # Outputs: watchdog-status.json (machine), watchdog-report.txt (human),
8
- # .watchdog-alert (if any FAIL detected)
9
- # ============================================================================
10
# Strict mode: unset variables are errors and a pipeline fails if any stage
# fails. Note that -e is intentionally not part of this set.
set -uo pipefail

# === PATHS ===
# Every location is derived from NEXO_HOME, which the environment may override.
HOME_DIR="$HOME"
NEXO_HOME="${NEXO_HOME:-$HOME/.nexo}"
NEXO_DIR="$NEXO_HOME"
CORTEX_DIR="${NEXO_HOME}/brain"
OPS_DIR="${NEXO_HOME}/operations"
LOG_DIR="${NEXO_HOME}/logs"
DB_PATH="${NEXO_HOME}/data/nexo.db"

# Outputs written by this script.
LOG="${LOG_DIR}/watchdog.log"
STATUS_JSON="${OPS_DIR}/watchdog-status.json"
REPORT_TXT="${OPS_DIR}/watchdog-report.txt"
ALERT_FILE="${OPS_DIR}/.watchdog-alert"

# Bookkeeping files kept next to the installed scripts.
HASH_REGISTRY="${NEXO_HOME}/scripts/.watchdog-hashes"
FAIL_COUNT_FILE="${NEXO_HOME}/scripts/.watchdog-fails"
MAX_FAILS=3

mkdir -p "$LOG_DIR" "$OPS_DIR"

# Timestamps are captured once, so every message of a run shares the same TS.
TS=$(date '+%F %T')
TS_EPOCH=$(date +%s)

# Append one timestamped line to the watchdog log.
#   $1 - message text
log() {
    printf '[%s] %s\n' "$TS" "$1" >> "$LOG"
}
34
-
35
- # ============================================================================
36
- # MONITOR REGISTRY — generated dynamically from manifest.json
37
- # ============================================================================
38
- # Format: NAME|PLIST_ID|LOG_STDOUT|LOG_STDERR|MAX_STALE_SECS|PROCESS_GREP|SCHEDULE_DESC|TYPE
39
- #
40
- # MAX_STALE_SECS: how old stdout log can be before WARN.
41
- # 0 = skip staleness check (for one-shot or infrequent tasks)
42
- # WARN at MAX_STALE_SECS, FAIL at 3x MAX_STALE_SECS
43
- # PROCESS_GREP: pattern to grep in ps (empty = skip process check)
44
- # ============================================================================
45
- # Core monitors are built from crons/manifest.json (single source of truth).
46
- # The NEXO_CODE env var must point to the repo src/ directory.
47
- # Add personal (non-manifest) monitors to PERSONAL_MONITORS below.
48
# NEXO_CODE defaults to the parent of the directory holding this script.
NEXO_CODE="${NEXO_CODE:-$(cd "$(dirname "$0")/.." 2>/dev/null && pwd)}"

# Manifest lookup order: NEXO_HOME first (packaged install), then NEXO_CODE
# (dev/repo checkout).
MANIFEST_FILE="$NEXO_CODE/crons/manifest.json"
if [ -f "$NEXO_HOME/crons/manifest.json" ]; then
    MANIFEST_FILE="$NEXO_HOME/crons/manifest.json"
fi
55
-
56
# Emit one pipe-delimited monitor record per enabled cron in manifest.json.
#
# Record format (one per line on stdout):
#   NAME|SERVICE_ID|STDOUT_LOG|STDERR_LOG|MAX_STALE_SECS|PROCESS_GREP|SCHEDULE_DESC|TYPE|RECOVERY_POLICY
#
# Globals read:
#   MANIFEST_FILE - path of the manifest to parse
#   NEXO_HOME     - root used for log paths and config/{optionals,schedule}.json
#   LOG           - optional; when writable it receives Python error output
# Returns:
#   The status of `log` (no output) when the manifest is missing, otherwise
#   the exit status of python3.
_build_monitors_from_manifest() {
    if [ ! -f "$MANIFEST_FILE" ]; then
        log "WARNING: manifest.json not found at $MANIFEST_FILE — no core monitors loaded"
        return
    fi

    # Keep Python diagnostics instead of discarding them: a malformed manifest
    # would otherwise silently produce an empty monitor list.
    local err_sink=/dev/null
    if [ -n "${LOG:-}" ] && { : >> "$LOG"; } 2>/dev/null; then
        err_sink="$LOG"
    fi

    # Paths are handed over through the environment rather than pasted into
    # the Python source, so quotes or backslashes in them cannot break it.
    WATCHDOG_NEXO_HOME="$NEXO_HOME" WATCHDOG_MANIFEST="$MANIFEST_FILE" \
        python3 - 2>>"$err_sink" <<'PYEOF'
import json
import os
import platform

nexo_home = os.environ['WATCHDOG_NEXO_HOME']
manifest_file = os.environ['WATCHDOG_MANIFEST']
is_mac = platform.system() == 'Darwin'
optionals_file = nexo_home + '/config/optionals.json'
schedule_file = nexo_home + '/config/schedule.json'
optionals = {}
automation_default = True

with open(manifest_file) as f:
    data = json.load(f)

# Both config files are optional: any read/parse problem keeps the defaults.
try:
    with open(optionals_file) as f:
        maybe = json.load(f)
    if isinstance(maybe, dict):
        optionals = {str(k): bool(v) for k, v in maybe.items()}
except Exception:
    optionals = {}

try:
    with open(schedule_file) as f:
        schedule = json.load(f)
    if isinstance(schedule, dict):
        automation_default = bool(schedule.get('automation_enabled', True))
except Exception:
    automation_default = True

DAYS = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']

for c in data.get('crons', []):
    cid = c['id']
    optional_key = c.get('optional')
    # 'automation' follows schedule.json unless optionals.json overrides it;
    # every other optional is off unless explicitly enabled.
    if optional_key == 'automation':
        optional_enabled = optionals.get(optional_key, automation_default)
    else:
        optional_enabled = optionals.get(optional_key, False)
    if optional_key and not optional_enabled:
        continue
    name = cid.replace('-', ' ').title()
    # Use the right service identifier per platform
    if is_mac:
        svc_id = 'com.nexo.' + cid
    else:
        svc_id = 'nexo-' + cid + '.timer'
    stdout_log = nexo_home + '/logs/' + cid + '-stdout.log'
    stderr_log = nexo_home + '/logs/' + cid + '-stderr.log'

    recovery_policy = c.get('recovery_policy')
    if not recovery_policy:
        if c.get('keep_alive') or 'interval_seconds' in c:
            recovery_policy = 'restart'
        elif 'schedule' in c:
            recovery_policy = 'catchup'
        else:
            recovery_policy = 'none'
    run_at_load = bool(
        c.get('run_at_load')
        or (c.get('run_on_boot') and 'interval_seconds' in c and not c.get('keep_alive'))
    )

    # Derive max_stale_secs and schedule_desc from schedule config
    if c.get('keep_alive'):
        max_stale = 0
        schedule_desc = 'KeepAlive'
        proc_grep = c.get('script', '').split('/')[-1]
    elif 'interval_seconds' in c:
        iv = c['interval_seconds']
        # Allow 2x the interval before WARN
        max_stale = iv * 2
        if iv >= 3600:
            schedule_desc = f'Every {iv // 3600}h'
        else:
            schedule_desc = f'Every {iv // 60} min'
        if run_at_load:
            schedule_desc += ' + boot'
        proc_grep = ''
    elif 'schedule' in c:
        s = c['schedule']
        h = s.get('hour', 0)
        m = s.get('minute', 0)
        if 'weekday' in s:
            # launchd accepts both 0 and 7 for Sunday; fold 7 back onto 0
            # instead of raising IndexError.
            day = DAYS[int(s['weekday']) % 7]
            schedule_desc = f'Weekly {day} {h}:{m:02d}'
            max_stale = 0  # weekly tasks: skip staleness
        else:
            schedule_desc = f'Daily {h}:{m:02d}'
            max_stale = 90000  # ~25h
        proc_grep = ''
    elif run_at_load:
        max_stale = 0
        schedule_desc = 'RunAtLoad once'
        proc_grep = ''
    else:
        max_stale = 0
        schedule_desc = 'unknown'
        proc_grep = ''

    mon_type = 'core' if c.get('core') else 'personal'

    print(f'{name}|{svc_id}|{stdout_log}|{stderr_log}|{max_stale}|{proc_grep}|{schedule_desc}|{mon_type}|{recovery_policy}')
PYEOF
}
160
-
161
# Core monitors: one record per non-empty line printed by the manifest helper.
MONITORS=()
while IFS= read -r line; do
    if [ -z "$line" ]; then
        continue
    fi
    MONITORS+=("$line")
done < <(_build_monitors_from_manifest)

# Personal (non-manifest) monitors — add yours below.
# They are not part of manifest.json, so cron-sync will not manage them.
PERSONAL_MONITORS=(
    # "My Service|com.nexo.my-service|$NEXO_HOME/logs/my-service.log||3600||Every 30 min|personal"
)
# The ${arr[@]+…} guard keeps `set -u` from tripping on an empty array.
MONITORS+=("${PERSONAL_MONITORS[@]+"${PERSONAL_MONITORS[@]}"}")

# Cron jobs to check (NAME|SCRIPT|CHECK_PATH|MAX_STALE_SECS|SCHEDULE).
# Core cron monitors come from the manifest above; the entries here are
# maintainer-only and require NEXO_MAINTAINER=1.
CRON_MONITORS=()
case "${NEXO_MAINTAINER:-}" in
    1)
        CRON_MONITORS+=(
            "Backup|$NEXO_DIR/scripts/nexo-backup.sh|$NEXO_DIR/backups/|7200|Hourly"
        )
        ;;
esac

# Extended-regex alternation matched against the last 50 lines of stderr logs.
ERROR_PATTERNS="Traceback|Error:|CRITICAL|FATAL|ModuleNotFoundError|PermissionError|FileNotFoundError|ConnectionRefused|Errno|Operation not permitted|SyntaxError|sqlite3\\.OperationalError"
185
-
186
- # ============================================================================
187
- # HELPER FUNCTIONS
188
- # ============================================================================
189
-
190
UID_NUM=$(id -u)
REPAIR_LOG="$LOG_DIR/watchdog-repairs.log"
TOTAL_HEALED=0

# IS_MACOS holds the command name `true` or `false`, so it can be run
# directly as a condition (`if $IS_MACOS; then …`).
case "$(uname)" in
    Darwin) IS_MACOS=true ;;
    *)      IS_MACOS=false ;;
esac

# Record a successful repair in the repair log and in the main watchdog log.
#   $1 - description of the repair
log_repair() {
    printf '[%s] REPAIR: %s\n' "$TS" "$1" >> "$REPAIR_LOG"
    log "REPAIR: $1"
}
197
-
198
# Succeed when the given service is registered with the platform's manager.
#   $1 - launchd label (macOS) or systemd unit name (elsewhere)
# macOS asks launchctl to print the service in the user's GUI domain; other
# platforms ask systemd whether the user unit is enabled.
is_loaded() {
    local unit="$1"
    if ! $IS_MACOS; then
        systemctl --user is-enabled "$unit" &>/dev/null
        return
    fi
    launchctl print "gui/$UID_NUM/$unit" &>/dev/null
}
206
-
207
- # ============================================================================
208
- # AUTO-REPAIR FUNCTIONS
209
- # ============================================================================
210
-
211
# Attempt a launchd-level repair of one LaunchAgent (macOS only).
#   $1 - launchd label; its plist is expected under ~/Library/LaunchAgents
#   $2 - process pattern for long-running services (may be empty)
# Returns 0 only when a repair was performed and confirmed, 1 otherwise.
try_repair_launchagent() {
    if ! $IS_MACOS; then
        return 1
    fi
    local label="$1"
    local pattern="$2"
    local plist_path="$HOME_DIR/Library/LaunchAgents/${label}.plist"

    if is_loaded "$label"; then
        # Loaded: the only remaining repair is kickstarting a service whose
        # expected process is missing.
        if [ -z "$pattern" ] || process_running "$pattern"; then
            return 1
        fi
        launchctl kickstart "gui/$UID_NUM/$label" 2>/dev/null
        sleep 2
        if process_running "$pattern"; then
            log_repair "$label: kickstarted process '$pattern'"
            return 0
        fi
        return 1
    fi

    # Not loaded: bootstrap from the plist, if one exists.
    [ -f "$plist_path" ] || return 1
    launchctl bootstrap "gui/$UID_NUM" "$plist_path" 2>/dev/null
    sleep 1
    if is_loaded "$label"; then
        log_repair "$label: bootstrapped successfully"
        return 0
    fi
    return 1
}
242
-
243
# Attempt a systemd-level repair of one user timer (non-macOS only).
#   $1 - timer unit name (e.g. nexo-foo.timer)
# Returns 0 only when a repair was performed and confirmed, 1 otherwise
# (including when the timer was already enabled and active).
try_repair_systemd() {
    $IS_MACOS && return 1
    local timer_unit="$1"

    # Repair 1: Timer not enabled — reload unit files, then enable and start.
    if ! systemctl --user is-enabled "$timer_unit" &>/dev/null; then
        systemctl --user daemon-reload 2>/dev/null
        systemctl --user enable --now "$timer_unit" 2>/dev/null
        sleep 1
        if systemctl --user is-enabled "$timer_unit" &>/dev/null; then
            log_repair "$timer_unit: enabled and started"
            return 0
        fi
        return 1
    fi

    # Repair 2: Timer enabled but not active — start it.
    if ! systemctl --user is-active "$timer_unit" &>/dev/null; then
        systemctl --user start "$timer_unit" 2>/dev/null
        sleep 1
        if systemctl --user is-active "$timer_unit" &>/dev/null; then
            log_repair "$timer_unit: restarted"
            return 0
        fi
    fi

    return 1
}
272
-
273
# Make a cron script executable when that is the only thing wrong with it.
#   $1 - path of the script
# Returns 0 when the file existed, was not executable, and is executable
# after chmod; returns 1 in every other case.
try_repair_cron() {
    local target="$1"

    [ -f "$target" ] || return 1
    [ -x "$target" ] && return 1

    chmod +x "$target"
    [ -x "$target" ] || return 1

    log_repair "$target: made executable"
    return 0
}
287
-
288
# Trigger an immediate run of a scheduled job that missed its window.
#   $1 - service id: launchd label on macOS, timer unit name elsewhere
# macOS uses `launchctl kickstart -k` and appends its output to
# watchdog-reexec.log; elsewhere the timer's matching .service unit is
# started directly. Returns 0 when the trigger command succeeded.
try_reexecute_missed_cron() {
    local svc_id="$1"

    if ! $IS_MACOS; then
        # Linux: start the corresponding service unit directly
        local unit="${svc_id%.timer}.service"
        log "Re-executing missed cron: $svc_id → systemctl start $unit"
        if ! systemctl --user start "$unit" 2>/dev/null; then
            log "Re-execute failed for $svc_id"
            return 1
        fi
        log_repair "$svc_id: re-executed via systemctl start $unit"
        return 0
    fi

    log "Re-executing missed cron via launchctl kickstart: $svc_id"
    if ! launchctl kickstart -k "gui/$UID_NUM/$svc_id" >> "$LOG_DIR/watchdog-reexec.log" 2>&1; then
        log "Re-execute failed for $svc_id"
        return 1
    fi
    log_repair "$svc_id: re-executed missed cron via launchctl kickstart"
    return 0
}
312
-
313
# Set to true once the catchup job has been triggered during this run.
CATCHUP_REQUESTED=false

# Trigger the catchup job at most once per watchdog run.
# Returns 0 when catchup was already requested or was triggered just now,
# 1 when the trigger failed.
try_request_catchup() {
    $CATCHUP_REQUESTED && return 0

    local svc="nexo-catchup.timer"
    $IS_MACOS && svc="com.nexo.catchup"

    try_reexecute_missed_cron "$svc" || return 1
    CATCHUP_REQUESTED=true
    return 0
}
330
-
331
# Confirm that a repaired service actually looks healthy.
#   $1 - service id (launchd label or systemd timer)
#   $2 - stdout log path of the service (may be empty)
#   $3 - process pattern for long-running services (may be empty)
#   $4 - monitor type, defaults to "core"
# Returns 1 when the service is not loaded, or when a process pattern was
# given and no matching process shows up within 30 seconds (polled every 5).
# Returns 0 otherwise: for scheduled services, being loaded is sufficient.
try_verify_repair() {
    local svc="$1"
    local out_log="$2"
    local pattern="$3"
    local kind="${4:-core}"
    local max_wait=30

    log "Verifying repair for $svc..."

    # Check 1: Is it loaded?
    if ! is_loaded "$svc"; then
        log "Verify FAILED: $svc still not loaded"
        return 1
    fi

    # Check 2: Process running? (for KeepAlive services)
    if [ -n "$pattern" ]; then
        local waited
        for (( waited = 0; waited < max_wait; waited += 5 )); do
            if process_running "$pattern"; then
                log "Verify OK: $svc process running after ${waited}s"
                return 0
            fi
            sleep 5
        done
        log "Verify FAILED: $svc process not running after ${max_wait}s"
        return 1
    fi

    # Check 3: For scheduled crons, a cron_runs row younger than five minutes
    # counts as proof of a fresh run.
    if [ "$kind" = "core" ]; then
        local cron_id run_info run_age
        cron_id=$(cron_id_from_service "$svc")
        run_info=$(cron_last_run_info "$cron_id" || true)
        if [ -n "$run_info" ]; then
            IFS='|' read -r run_age _ _ _ _ _ <<< "$run_info"
            if [ -n "$run_age" ] && [ "$run_age" -lt 300 ]; then
                log "Verify OK: $svc cron_runs updated ${run_age}s ago"
                return 0
            fi
        fi
    fi

    # Same five-minute freshness rule, applied to the stdout log file.
    if [ -n "$out_log" ] && [ -f "$out_log" ]; then
        local age
        age=$(file_age "$out_log")
        if [ "$age" -lt 300 ]; then
            log "Verify OK: $svc log updated ${age}s ago"
            return 0
        fi
    fi

    # If we get here for a scheduled service, it's loaded which is sufficient
    log "Verify OK: $svc is loaded (scheduled service)"
    return 0
}
391
-
392
# Run the core backup script (nexo-backup.sh) once and report whether a
# fresh backup exists afterwards.
#
# Globals read: NEXO_DIR, SCRIPT_DIR (optional fallback location of the
#               script), TS_EPOCH (through file_age).
# Returns: 0 when the newest $NEXO_DIR/backups/nexo-*.db is less than 60
#          seconds old relative to TS_EPOCH, 1 otherwise.
try_repair_backup() {
    local backup_script="$NEXO_DIR/scripts/nexo-backup.sh"
    if [ ! -x "$backup_script" ]; then
        # SCRIPT_DIR may be unset; the default keeps `set -u` from aborting
        # the whole watchdog here and simply lets the -x check below fail.
        backup_script="${SCRIPT_DIR:-}/nexo-backup.sh"
    fi
    [ -x "$backup_script" ] || return 1

    bash "$backup_script" 2>/dev/null
    sleep 1

    # Pick the most recently modified backup with a glob and -nt instead of
    # parsing `ls -t` output.
    local newest="" candidate
    for candidate in "$NEXO_DIR/backups/nexo-"*.db; do
        [ -e "$candidate" ] || continue
        if [ -z "$newest" ] || [ "$candidate" -nt "$newest" ]; then
            newest="$candidate"
        fi
    done
    [ -n "$newest" ] || return 1

    # file_age already handles the macOS/Linux stat difference and falls
    # back to a large age if stat fails.
    local age
    age=$(file_age "$newest")
    if [ "$age" -lt 60 ]; then
        log_repair "nexo-backup.sh: ran successfully, fresh backup created"
        return 0
    fi
    return 1
}
411
-
412
# Print the age of a file in seconds, measured against TS_EPOCH.
#   $1 - file path
# Prints 999999 when the path is not a regular file. If stat fails, the
# modification time is taken as 0.
file_age() {
    local target="$1"
    if [ ! -f "$target" ]; then
        echo 999999
        return
    fi

    # BSD stat (macOS) and GNU stat take different flags for the mtime epoch.
    local stat_args=(-c %Y)
    $IS_MACOS && stat_args=(-f %m)

    local mtime
    mtime=$(stat "${stat_args[@]}" "$target" 2>/dev/null || echo 0)
    echo $(( TS_EPOCH - mtime ))
}
425
-
426
# Render an age in seconds as a short human-readable string.
#   $1 - age in seconds (999999 or more is the "never" sentinel)
format_age() {
    local secs=$1

    if [ "$secs" -ge 999999 ]; then
        echo "never"
        return
    fi
    if [ "$secs" -ge 86400 ]; then
        local days=$((secs / 86400))
        local rem_hours=$((secs % 86400 / 3600))
        echo "${days}d ${rem_hours}h ago"
        return
    fi
    if [ "$secs" -ge 3600 ]; then
        local hours=$((secs / 3600))
        local rem_mins=$((secs % 3600 / 60))
        echo "${hours}h ${rem_mins}m ago"
        return
    fi
    if [ "$secs" -ge 60 ]; then
        echo "$((secs / 60))m ago"
        return
    fi
    echo "${secs}s ago"
}
440
-
441
- check_errors() {
442
- local logfile="$1"
443
- if [ -f "$logfile" ] && [ -s "$logfile" ]; then
444
- local count
445
- count=$(tail -50 "$logfile" 2>/dev/null | grep -cE "$ERROR_PATTERNS" 2>/dev/null) || true
446
- echo "${count:-0}"
447
- else
448
- echo 0
449
- fi
450
- }
451
-
452
# Succeed when a process whose full command line matches the pattern exists.
#   $1 - pattern handed to `pgrep -f`; an empty pattern counts as success
process_running() {
    [ -n "$1" ] || return 0
    pgrep -f "$1" > /dev/null 2>&1
}
459
-
460
# Print the manifest cron id for a platform service id.
#   $1 - "com.nexo.<id>" on macOS, "nexo-<id>.timer" elsewhere
cron_id_from_service() {
    local svc_id="$1"
    if $IS_MACOS; then
        echo "${svc_id#com.nexo.}"
        return
    fi
    local trimmed="${svc_id#nexo-}"
    echo "${trimmed%.timer}"
}
468
-
469
# Print the most recent cron_runs row for a cron id as one pipe-separated
# line: age_secs|started_at|ended_at|exit_code|error|summary
#   $1 - cron id
# Returns 1 when the database file does not exist; otherwise the exit
# status of sqlite3. sqlite3 error output is discarded.
cron_last_run_info() {
    local cron_id="$1"
    [ ! -f "$DB_PATH" ] && return 1

    # The id is embedded in a SQL string literal, so double any single quote
    # in it; otherwise a quote would end the literal early.
    local quote="'"
    local cron_id_sql="${cron_id//$quote/$quote$quote}"

    sqlite3 -separator '|' "$DB_PATH" "
        SELECT
            CAST(strftime('%s','now') - strftime('%s', started_at) AS INTEGER) AS age_secs,
            COALESCE(started_at, ''),
            COALESCE(ended_at, ''),
            COALESCE(exit_code, ''),
            COALESCE(error, ''),
            COALESCE(summary, '')
        FROM cron_runs
        WHERE cron_id = '$cron_id_sql'
        ORDER BY id DESC
        LIMIT 1;
    " 2>/dev/null
}
486
-
487
# Classify the dominant problem visible in the last 50 lines of a log.
#   $1 - log file path
# Prints one of: tcc, dependency, syntax, schema — the first category, in
# that order, whose marker text appears. Prints nothing when the file is
# missing, empty, or contains none of the markers. Always returns 0.
classify_log_issue() {
    local logfile="$1"
    if [ ! -f "$logfile" ] || [ ! -s "$logfile" ]; then
        return 0
    fi
    local tail_text
    tail_text=$(tail -50 "$logfile" 2>/dev/null || true)

    # Match inside the shell rather than through `echo | grep -q`: with
    # pipefail set, grep -q exiting at the first hit can kill the writer
    # with SIGPIPE and turn a real match into a failed pipeline.
    case "$tail_text" in
        *"Operation not permitted"*) echo "tcc" ;;
        *"ModuleNotFoundError"*)     echo "dependency" ;;
        *"SyntaxError"*)             echo "syntax" ;;
        *"sqlite3.OperationalError"*) echo "schema" ;;
    esac
    return 0
}
504
-
505
# Escape a string for embedding inside a JSON string literal.
#   $1 - raw text
# Backslashes and double quotes are backslash-escaped. Tabs, newlines and
# carriage returns each become a single space, and any remaining ASCII
# control characters are dropped, since JSON strings may not contain raw
# control characters. The output ends with one space (the converted final
# newline) and has no trailing newline.
json_escape() {
    # printf instead of echo so inputs such as "-n" are treated as data.
    printf '%s\n' "$1" \
        | sed 's/\\/\\\\/g; s/"/\\"/g' \
        | tr '\t\n\r' '   ' \
        | tr -d '\001-\010\013\014\016-\037'
}
509
-
510
- # ============================================================================
511
- # RUN CHECKS
512
- # ============================================================================
513
-
514
- TOTAL_PASS=0
515
- TOTAL_WARN=0
516
- TOTAL_FAIL=0
517
- JSON_AGENTS=""
518
- REPORT_LINES=""
519
- FAILED_MONITORS=() # Track failed monitors for Level 2 repair
520
-
521
- for monitor in "${MONITORS[@]}"; do
522
- # Skip comment lines
523
- [[ "$monitor" =~ ^[[:space:]]*# ]] && continue
524
- IFS='|' read -r name plist_id log_stdout log_stderr max_stale proc_grep schedule mon_type recovery_policy <<< "$monitor"
525
- mon_type="${mon_type:-core}"
526
- recovery_policy="${recovery_policy:-restart}"
527
-
528
- status="PASS"
529
- details=""
530
- loaded="unknown"
531
- stale_age="n/a"
532
- error_count=0
533
- proc_alive="n/a"
534
- error_kind=""
535
- cron_id=$(cron_id_from_service "$plist_id")
536
- latest_run_has_record=false
537
- latest_run_failed=false
538
-
539
- # Check 1: Service loaded? (launchd on macOS, systemd on Linux)
540
- if is_loaded "$plist_id"; then
541
- loaded="yes"
542
- else
543
- loaded="no"
544
- # AUTO-REPAIR: try platform-appropriate repair
545
- repair_ok=false
546
- if $IS_MACOS; then
547
- try_repair_launchagent "$plist_id" "$proc_grep" && repair_ok=true
548
- else
549
- try_repair_systemd "$plist_id" && repair_ok=true
550
- fi
551
- if $repair_ok; then
552
- loaded="yes"
553
- status="HEALED"
554
- details="${details}Self-healed: service re-registered. "
555
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
556
- else
557
- status="FAIL"
558
- details="${details}Service not loaded (repair failed). "
559
- fi
560
- fi
561
-
562
- # Check 2: Process alive? (only for KeepAlive / long-running)
563
- if [ -n "$proc_grep" ]; then
564
- if process_running "$proc_grep"; then
565
- proc_alive="yes"
566
- else
567
- proc_alive="no"
568
- # AUTO-REPAIR: try to kickstart (platform-appropriate)
569
- if [ "$status" != "FAIL" ] && [ "$status" != "HEALED" ]; then
570
- if ($IS_MACOS && try_repair_launchagent "$plist_id" "$proc_grep") || \
571
- (! $IS_MACOS && try_repair_systemd "$plist_id"); then
572
- proc_alive="yes"
573
- status="HEALED"
574
- details="${details}Self-healed: kickstarted. "
575
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
576
- else
577
- status="WARN"
578
- details="${details}Process '$proc_grep' not running (repair failed). "
579
- fi
580
- elif [ "$status" = "HEALED" ]; then
581
- # Already healed by bootstrap, check if process came up
582
- sleep 1
583
- if process_running "$proc_grep"; then
584
- proc_alive="yes"
585
- else
586
- details="${details}Process '$proc_grep' still not running after bootstrap. "
587
- fi
588
- fi
589
- fi
590
- fi
591
-
592
- # Check 3: Staleness + AUTO RE-EXECUTE missed crons
593
- if [ "$mon_type" = "core" ] && [ "$max_stale" -gt 0 ]; then
594
- run_info=$(cron_last_run_info "$cron_id" || true)
595
- if [ -n "$run_info" ]; then
596
- latest_run_has_record=true
597
- IFS='|' read -r age _ _ last_exit last_error last_summary <<< "$run_info"
598
- age="${age:-999999}"
599
- stale_age=$(format_age "$age")
600
- if [ -n "$last_exit" ] && [ "$last_exit" != "0" ]; then
601
- latest_run_failed=true
602
- status="FAIL"
603
- details="${details}Last run exited ${last_exit}. "
604
- [ -n "$last_error" ] && details="${details}Error: ${last_error}. "
605
- fi
606
- if [ "$age" -gt $(( max_stale * 3 )) ]; then
607
- if [ "$recovery_policy" = "catchup" ]; then
608
- if try_request_catchup; then
609
- status="HEALED"
610
- details="${details}Self-healed: requested catchup for missed window (last run: $stale_age). "
611
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
612
- else
613
- status="FAIL"
614
- details="${details}cron_runs stale: $stale_age (limit: $(format_age "$max_stale")). Catchup request failed. "
615
- fi
616
- else
617
- if try_reexecute_missed_cron "$plist_id"; then
618
- status="HEALED"
619
- details="${details}Self-healed: re-executed missed cron (last run: $stale_age). "
620
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
621
- else
622
- status="FAIL"
623
- details="${details}cron_runs stale: $stale_age (limit: $(format_age "$max_stale")). Re-execute failed. "
624
- fi
625
- fi
626
- elif [ "$age" -gt "$max_stale" ]; then
627
- [ "$status" = "PASS" ] && status="WARN"
628
- details="${details}cron_runs slightly stale: $stale_age. "
629
- elif [ -z "$details" ] && [ -n "$last_summary" ]; then
630
- details="${details}Last run summary: ${last_summary}. "
631
- fi
632
- else
633
- stale_age="no cron_runs entry"
634
- if [ "$recovery_policy" = "catchup" ]; then
635
- if try_request_catchup; then
636
- status="HEALED"
637
- details="${details}Self-healed: requested catchup for missing cron_runs entry. "
638
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
639
- else
640
- status="FAIL"
641
- details="${details}No cron_runs entry recorded yet and catchup request failed. "
642
- fi
643
- else
644
- if try_reexecute_missed_cron "$plist_id"; then
645
- status="HEALED"
646
- details="${details}Self-healed: executed missing cron for first run. "
647
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
648
- else
649
- status="FAIL"
650
- details="${details}No cron_runs entry recorded yet. "
651
- fi
652
- fi
653
- fi
654
- elif [ -n "$log_stdout" ] && [ "$max_stale" -gt 0 ]; then
655
- age=$(file_age "$log_stdout")
656
- stale_age=$(format_age "$age")
657
- if [ "$age" -gt $(( max_stale * 3 )) ]; then
658
- # Severely stale — try to re-execute the missed cron
659
- if try_reexecute_missed_cron "$plist_id"; then
660
- status="HEALED"
661
- details="${details}Self-healed: re-executed missed cron (was stale: $stale_age). "
662
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
663
- else
664
- status="FAIL"
665
- details="${details}Log stale: $stale_age (limit: $(format_age "$max_stale")). Re-execute failed. "
666
- fi
667
- elif [ "$age" -gt "$max_stale" ]; then
668
- [ "$status" = "PASS" ] && status="WARN"
669
- details="${details}Log slightly stale: $stale_age. "
670
- fi
671
- elif [ -n "$log_stdout" ]; then
672
- if [ -f "$log_stdout" ]; then
673
- age=$(file_age "$log_stdout")
674
- stale_age=$(format_age "$age")
675
- else
676
- stale_age="no log file"
677
- fi
678
- fi
679
-
680
- # Check 4: Errors in stderr log
681
- if [ -n "$log_stderr" ]; then
682
- consider_stderr=true
683
- if [ "$mon_type" = "core" ] && $latest_run_has_record && ! $latest_run_failed && [ "$loaded" = "yes" ]; then
684
- consider_stderr=false
685
- fi
686
- if $consider_stderr; then
687
- error_count=$(check_errors "$log_stderr")
688
- error_kind=$(classify_log_issue "$log_stderr" || true)
689
- if [ "$error_count" -gt 5 ]; then
690
- [ "$status" = "PASS" ] && status="WARN"
691
- details="${details}${error_count} errors in recent stderr. "
692
- fi
693
- case "$error_kind" in
694
- tcc)
695
- status="FAIL"
696
- details="${details}Recent stderr shows macOS TCC/Sandbox denial ('Operation not permitted'). "
697
- ;;
698
- dependency)
699
- [ "$status" = "PASS" ] && status="WARN"
700
- details="${details}Recent stderr shows missing Python dependency. "
701
- ;;
702
- syntax)
703
- status="FAIL"
704
- details="${details}Recent stderr shows syntax error. "
705
- ;;
706
- schema)
707
- status="FAIL"
708
- details="${details}Recent stderr shows DB/schema mismatch. "
709
- ;;
710
- esac
711
- fi
712
- fi
713
-
714
- [ -z "$details" ] && details="All checks passed"
715
-
716
- # HEALED counts as PASS for overall status
717
- case "$status" in
718
- PASS|HEALED) TOTAL_PASS=$((TOTAL_PASS + 1)) ;;
719
- WARN) TOTAL_WARN=$((TOTAL_WARN + 1)) ;;
720
- FAIL)
721
- TOTAL_FAIL=$((TOTAL_FAIL + 1))
722
- FAILED_MONITORS+=("${name}|${plist_id}|${log_stdout}|${log_stderr}|${proc_grep}|${schedule}|${mon_type}|${details}")
723
- ;;
724
- esac
725
-
726
- # JSON
727
- escaped_details=$(json_escape "$details")
728
- json_item=" {\"name\":\"$name\",\"plist\":\"$plist_id\",\"status\":\"$status\",\"type\":\"$mon_type\",\"loaded\":\"$loaded\",\"process\":\"$proc_alive\",\"last_activity\":\"$stale_age\",\"stderr_errors\":$error_count,\"schedule\":\"$schedule\",\"details\":\"$escaped_details\"}"
729
- [ -n "$JSON_AGENTS" ] && JSON_AGENTS="${JSON_AGENTS},
730
- ${json_item}" || JSON_AGENTS="$json_item"
731
-
732
- # Report
733
- case "$status" in
734
- PASS) icon="PASS" ;; HEALED) icon="HEAL" ;; WARN) icon="WARN" ;; FAIL) icon="FAIL" ;;
735
- esac
736
- REPORT_LINES="${REPORT_LINES} [${icon}] ${name} (${schedule})
737
- Loaded: ${loaded} | Process: ${proc_alive} | Last: ${stale_age} | Errors: ${error_count}
738
- ${details}
739
- "
740
- done
741
-
742
- # --- Cron job checks ---
743
- CRON_JSON=""
744
- CRON_REPORT=""
745
- for cron_entry in ${CRON_MONITORS[@]+"${CRON_MONITORS[@]}"}; do
746
- IFS='|' read -r name script check_path max_stale schedule <<< "$cron_entry"
747
-
748
- c_status="PASS"
749
- c_details=""
750
- age_str="n/a"
751
-
752
- # Check script exists and is executable
753
- if [ ! -x "$script" ]; then
754
- # AUTO-REPAIR: try chmod
755
- if try_repair_cron "$script"; then
756
- c_status="HEALED"
757
- c_details="Self-healed: made executable. "
758
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
759
- else
760
- c_status="FAIL"
761
- c_details="Script not executable or missing (repair failed). "
762
- fi
763
- fi
764
-
765
- # Check output freshness
766
- if [ -d "$check_path" ]; then
767
- newest=$(ls -t "$check_path" 2>/dev/null | head -1)
768
- if [ -n "$newest" ]; then
769
- age=$(file_age "${check_path}${newest}")
770
- age_str=$(format_age "$age")
771
- if [ "$age" -gt $(( max_stale * 3 )) ]; then
772
- c_status="FAIL"
773
- c_details="${c_details}Output stale: $age_str. "
774
- elif [ "$age" -gt "$max_stale" ]; then
775
- [ "$c_status" = "PASS" ] && c_status="WARN"
776
- c_details="${c_details}Output slightly stale: $age_str. "
777
- fi
778
- else
779
- c_status="WARN"
780
- c_details="${c_details}No output files found. "
781
- age_str="no files"
782
- fi
783
- elif [ -f "$check_path" ]; then
784
- age=$(file_age "$check_path")
785
- age_str=$(format_age "$age")
786
- if [ "$age" -gt $(( max_stale * 3 )) ]; then
787
- c_status="FAIL"
788
- c_details="${c_details}Output stale: $age_str. "
789
- elif [ "$age" -gt "$max_stale" ]; then
790
- [ "$c_status" = "PASS" ] && c_status="WARN"
791
- c_details="${c_details}Output slightly stale: $age_str. "
792
- fi
793
- fi
794
-
795
- [ -z "$c_details" ] && c_details="All checks passed"
796
-
797
- case "$c_status" in
798
- PASS|HEALED) TOTAL_PASS=$((TOTAL_PASS + 1)) ;;
799
- WARN) TOTAL_WARN=$((TOTAL_WARN + 1)) ;;
800
- FAIL) TOTAL_FAIL=$((TOTAL_FAIL + 1)) ;;
801
- esac
802
-
803
- escaped_details=$(json_escape "$c_details")
804
- cron_item=" {\"name\":\"$name\",\"script\":\"$script\",\"status\":\"$c_status\",\"last_output\":\"$age_str\",\"schedule\":\"$schedule\",\"details\":\"$escaped_details\"}"
805
- [ -n "$CRON_JSON" ] && CRON_JSON="${CRON_JSON},
806
- ${cron_item}" || CRON_JSON="$cron_item"
807
-
808
- case "$c_status" in
809
- PASS) icon="PASS" ;; HEALED) icon="HEAL" ;; WARN) icon="WARN" ;; FAIL) icon="FAIL" ;;
810
- esac
811
- CRON_REPORT="${CRON_REPORT} [${icon}] ${name} (${schedule})
812
- Last output: ${age_str}
813
- ${c_details}
814
- "
815
- done
816
-
817
- # ============================================================================
818
- # INFRASTRUCTURE CHECKS
819
- # ============================================================================
820
-
821
- # --- SQLite integrity ---
822
- SQLITE_STATUS="PASS"
823
- SQLITE_DETAIL=""
824
- INTEGRITY=$(sqlite3 "$NEXO_DIR/data/nexo.db" "PRAGMA integrity_check;" 2>/dev/null || echo "CORRUPT")
825
- if [ "$INTEGRITY" != "ok" ]; then
826
- SQLITE_STATUS="FAIL"
827
- SQLITE_DETAIL="Integrity check: $INTEGRITY"
828
- log "CRITICAL: SQLite integrity check failed: $INTEGRITY"
829
- TOTAL_FAIL=$((TOTAL_FAIL + 1))
830
- LATEST_BACKUP=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
831
- if [ -n "$LATEST_BACKUP" ]; then
832
- cp "$LATEST_BACKUP" "$NEXO_DIR/data/nexo.db"
833
- log "RESTORED from $LATEST_BACKUP"
834
- SQLITE_DETAIL="${SQLITE_DETAIL}. Restored from backup."
835
- fi
836
- else
837
- SQLITE_DETAIL="Integrity OK"
838
- TOTAL_PASS=$((TOTAL_PASS + 1))
839
- fi
840
-
841
- # --- Immutable file integrity ---
842
- IMMUTABLE_STATUS="PASS"
843
- IMMUTABLE_DETAIL=""
844
- OBJECTIVE="$CORTEX_DIR/evolution-objective.json"
845
- if [ -f "$HASH_REGISTRY" ]; then
846
- TAMPERED=0
847
- while IFS='|' read -r filepath expected_hash; do
848
- if [ -f "$filepath" ]; then
849
- ACTUAL=$(shasum -a 256 "$filepath" | cut -d' ' -f1)
850
- if [ "$ACTUAL" != "$expected_hash" ]; then
851
- TAMPERED=$((TAMPERED + 1))
852
- log "CRITICAL: Immutable file modified: $filepath"
853
- LATEST_SNAP=$(ls -td "$NEXO_HOME/snapshots/"*/ 2>/dev/null | head -1)
854
- if [ -n "$LATEST_SNAP" ] && [ -f "${LATEST_SNAP}files/${filepath#$HOME_DIR/}" ]; then
855
- cp "${LATEST_SNAP}files/${filepath#$HOME_DIR/}" "$filepath"
856
- log "RESTORED immutable file from snapshot"
857
- fi
858
- fi
859
- fi
860
- done < "$HASH_REGISTRY"
861
- if [ "$TAMPERED" -gt 0 ]; then
862
- IMMUTABLE_STATUS="FAIL"
863
- IMMUTABLE_DETAIL="$TAMPERED immutable files tampered"
864
- TOTAL_FAIL=$((TOTAL_FAIL + 1))
865
- if [ -f "$OBJECTIVE" ]; then
866
- python3 -c "
867
- import json
868
- with open('$OBJECTIVE') as f: d = json.load(f)
869
- d['evolution_enabled'] = False
870
- d['disabled_reason'] = 'Immutable file tampered — watchdog disabled Evolution'
871
- with open('$OBJECTIVE', 'w') as f: json.dump(d, f, indent=2)
872
- " 2>/dev/null
873
- log "DISABLED Evolution due to immutable file tampering"
874
- fi
875
- else
876
- IMMUTABLE_DETAIL="All files intact"
877
- TOTAL_PASS=$((TOTAL_PASS + 1))
878
- if [ -f "$OBJECTIVE" ]; then
879
- python3 -c "
880
- import json
881
- from pathlib import Path
882
- obj = Path('$OBJECTIVE')
883
- try:
884
- data = json.loads(obj.read_text())
885
- except Exception:
886
- raise SystemExit(0)
887
- reason = data.get('disabled_reason', '') or ''
888
- if data.get('evolution_enabled') is False and 'watchdog disabled Evolution' in reason:
889
- data['evolution_enabled'] = True
890
- data.pop('disabled_reason', None)
891
- obj.write_text(json.dumps(data, indent=2, ensure_ascii=False))
892
- print('REENABLED')
893
- " 2>/dev/null | grep -q "REENABLED" && log "REENABLED Evolution after immutable integrity recovered"
894
- fi
895
- fi
896
- else
897
- IMMUTABLE_DETAIL="No hash registry (skipped)"
898
- TOTAL_PASS=$((TOTAL_PASS + 1))
899
- fi
900
-
901
- # --- Backup freshness ---
902
- BACKUP_STATUS="PASS"
903
- BACKUP_DETAIL=""
904
- LATEST_BACKUP=$(ls -t "$NEXO_DIR/backups/nexo-"*.db 2>/dev/null | head -1)
905
- if [ -n "$LATEST_BACKUP" ]; then
906
- if $IS_MACOS; then BACKUP_AGE=$(( TS_EPOCH - $(stat -f %m "$LATEST_BACKUP") )); else BACKUP_AGE=$(( TS_EPOCH - $(stat -c %Y "$LATEST_BACKUP") )); fi
907
- BACKUP_AGE_STR=$(format_age "$BACKUP_AGE")
908
- if [ "$BACKUP_AGE" -gt 7200 ]; then
909
- # AUTO-REPAIR: run backup now
910
- if try_repair_backup; then
911
- BACKUP_STATUS="HEALED"
912
- BACKUP_DETAIL="Self-healed: backup was stale ($BACKUP_AGE_STR), ran fresh backup"
913
- TOTAL_HEALED=$((TOTAL_HEALED + 1))
914
- TOTAL_PASS=$((TOTAL_PASS + 1))
915
- else
916
- BACKUP_STATUS="WARN"
917
- BACKUP_DETAIL="Last backup: $BACKUP_AGE_STR (>2h, repair failed)"
918
- TOTAL_WARN=$((TOTAL_WARN + 1))
919
- fi
920
- else
921
- BACKUP_DETAIL="Last backup: $BACKUP_AGE_STR"
922
- TOTAL_PASS=$((TOTAL_PASS + 1))
923
- fi
924
- else
925
- BACKUP_STATUS="FAIL"
926
- BACKUP_DETAIL="No backups found"
927
- TOTAL_FAIL=$((TOTAL_FAIL + 1))
928
- fi
929
-
930
- # --- Cognitive DB check ---
931
- COG_STATUS="PASS"
932
- COG_DETAIL=""
933
- COG_DB="$NEXO_DIR/data/cognitive.db"
934
- if [ -f "$COG_DB" ]; then
935
- COG_INT=$(sqlite3 "$COG_DB" "PRAGMA integrity_check;" 2>/dev/null || echo "CORRUPT")
936
- if [ "$COG_INT" != "ok" ]; then
937
- COG_STATUS="FAIL"
938
- COG_DETAIL="Cognitive DB integrity: $COG_INT"
939
- TOTAL_FAIL=$((TOTAL_FAIL + 1))
940
- else
941
- COG_DETAIL="Integrity OK"
942
- TOTAL_PASS=$((TOTAL_PASS + 1))
943
- fi
944
- else
945
- COG_STATUS="WARN"
946
- COG_DETAIL="cognitive.db not found"
947
- TOTAL_WARN=$((TOTAL_WARN + 1))
948
- fi
949
-
950
- # ============================================================================
951
- # WRITE JSON STATUS
952
- # ============================================================================
953
- TOTAL=$((TOTAL_PASS + TOTAL_WARN + TOTAL_FAIL))
954
- OVERALL="PASS"
955
- [ "$TOTAL_WARN" -gt 0 ] && OVERALL="WARN"
956
- [ "$TOTAL_FAIL" -gt 0 ] && OVERALL="FAIL"
957
-
958
- cat > "$STATUS_JSON" <<JSONEOF
959
- {
960
- "timestamp": "$TS",
961
- "summary": {
962
- "total": $TOTAL,
963
- "pass": $TOTAL_PASS,
964
- "warn": $TOTAL_WARN,
965
- "fail": $TOTAL_FAIL,
966
- "healed": $TOTAL_HEALED,
967
- "overall": "$OVERALL"
968
- },
969
- "launch_agents": [
970
- $JSON_AGENTS
971
- ],
972
- "cron_jobs": [
973
- $CRON_JSON
974
- ],
975
- "infrastructure": {
976
- "sqlite": {"status": "$SQLITE_STATUS", "detail": "$(json_escape "$SQLITE_DETAIL")"},
977
- "cognitive_db": {"status": "$COG_STATUS", "detail": "$(json_escape "$COG_DETAIL")"},
978
- "immutable_files": {"status": "$IMMUTABLE_STATUS", "detail": "$(json_escape "$IMMUTABLE_DETAIL")"},
979
- "backups": {"status": "$BACKUP_STATUS", "detail": "$(json_escape "$BACKUP_DETAIL")"}
980
- }
981
- }
982
- JSONEOF
983
-
984
- # ============================================================================
985
- # WRITE HUMAN-READABLE REPORT
986
- # ============================================================================
987
- cat > "$REPORT_TXT" <<REPORTEOF
988
- ======================================================
989
- NEXO WATCHDOG REPORT — $TS
990
- ======================================================
991
- PASS: $TOTAL_PASS | HEALED: $TOTAL_HEALED | WARN: $TOTAL_WARN | FAIL: $TOTAL_FAIL | TOTAL: $TOTAL
992
- OVERALL: $OVERALL
993
- ======================================================
994
-
995
- -- LaunchAgents (${#MONITORS[@]}) ---------------------
996
- $REPORT_LINES
997
- -- Cron Jobs ------------------------------------------
998
- $CRON_REPORT
999
- -- Infrastructure -------------------------------------
1000
- [$SQLITE_STATUS] SQLite nexo.db: $SQLITE_DETAIL
1001
- [$COG_STATUS] Cognitive DB: $COG_DETAIL
1002
- [$IMMUTABLE_STATUS] Immutable Files: $IMMUTABLE_DETAIL
1003
- [$BACKUP_STATUS] Backups: $BACKUP_DETAIL
1004
-
1005
- -- End of Report --------------------------------------
1006
- REPORTEOF
1007
-
1008
- # ============================================================================
1009
- # ALERT FILE
1010
- # ============================================================================
1011
- if [ "$TOTAL_FAIL" -gt 0 ]; then
1012
- {
1013
- echo "timestamp=$TS"
1014
- echo "fail_count=$TOTAL_FAIL"
1015
- echo "warn_count=$TOTAL_WARN"
1016
- echo "failures:"
1017
- grep '\[FAIL\]' "$REPORT_TXT" | head -10 | sed 's/^/ /'
1018
- } > "$ALERT_FILE"
1019
- log "ALERT: $TOTAL_FAIL failures detected"
1020
- else
1021
- rm -f "$ALERT_FILE"
1022
- fi
1023
-
1024
- # ============================================================================
1025
- # CONSECUTIVE FAILURE TRACKING + NOTIFICATION
1026
- # ============================================================================
1027
- FAILS=$(cat "$FAIL_COUNT_FILE" 2>/dev/null || echo 0)
1028
- if [ "$TOTAL_FAIL" -gt 0 ]; then
1029
- FAILS=$((FAILS + 1))
1030
- echo "$FAILS" > "$FAIL_COUNT_FILE"
1031
- if [ "$FAILS" -ge "$MAX_FAILS" ]; then
1032
- log "ALERT: $FAILS consecutive runs with failures"
1033
- # Configure your own notification method here (optional)
1034
- # Example: send email, Slack webhook, desktop notification, etc.
1035
- log "NOTIFICATION: $FAILS consecutive failures ($TOTAL_FAIL items FAIL)"
1036
- fi
1037
- else
1038
- echo "0" > "$FAIL_COUNT_FILE"
1039
- fi
1040
-
1041
- # ============================================================================
1042
- # LEVEL 2 AUTO-REPAIR: Launch NEXO for intelligent diagnosis
1043
- # ============================================================================
1044
- # Only triggers if: (a) there are FAILs after mechanical repair, (b) no NEXO
1045
- # repair is already running, (c) no interactive session is active (avoid conflict)
1046
- REPAIR_LOCK="$NEXO_HOME/scripts/.watchdog-nexo-repair.lock"
1047
- REPAIR_COOLDOWN=1800 # 30 min between NEXO repair attempts
1048
-
1049
- if [ "$TOTAL_FAIL" -gt 0 ]; then
1050
- # Check cooldown — don't spam NEXO invocations
1051
- LOCK_AGE=999999
1052
- SKIP_REPAIR=false
1053
- if [ -f "$REPAIR_LOCK" ]; then
1054
- if $IS_MACOS; then LOCK_AGE=$(( TS_EPOCH - $(stat -f %m "$REPAIR_LOCK" 2>/dev/null || echo 0) )); else LOCK_AGE=$(( TS_EPOCH - $(stat -c %Y "$REPAIR_LOCK" 2>/dev/null || echo 0) )); fi
1055
- if [ "$LOCK_AGE" -lt "$REPAIR_COOLDOWN" ]; then
1056
- log "NEXO repair skipped: cooldown (${LOCK_AGE}s < ${REPAIR_COOLDOWN}s)"
1057
- SKIP_REPAIR=true
1058
- fi
1059
- fi
1060
-
1061
- if ! $SKIP_REPAIR; then
1062
- # Collect failure details from tracked FAILED_MONITORS array
1063
- FAIL_DETAILS=""
1064
- HAS_CORE_FAILS=false
1065
- for failed in ${FAILED_MONITORS[@]+"${FAILED_MONITORS[@]}"}; do
1066
- IFS='|' read -r m_name m_plist m_stdout m_stderr m_proc m_sched m_type m_details <<< "$failed"
1067
- STDERR_TAIL=""
1068
- if [ -n "$m_stderr" ] && [ -f "$m_stderr" ]; then
1069
- STDERR_TAIL=$(tail -20 "$m_stderr" 2>/dev/null | head -20)
1070
- fi
1071
- STDOUT_TAIL=""
1072
- if [ -n "$m_stdout" ] && [ -f "$m_stdout" ]; then
1073
- STDOUT_TAIL=$(tail -10 "$m_stdout" 2>/dev/null | head -10)
1074
- fi
1075
- [ "$m_type" = "core" ] && HAS_CORE_FAILS=true
1076
- FAIL_DETAILS="${FAIL_DETAILS}
1077
- --- ${m_name} (${m_plist}) [${m_type}] ---
1078
- Schedule: ${m_sched}
1079
- Type: ${m_type}
1080
- Failure reason: ${m_details}
1081
- Service config: $($IS_MACOS && echo "~/Library/LaunchAgents/${m_plist}.plist" || echo "~/.config/systemd/user/${m_plist}")
1082
- Process grep: ${m_proc}
1083
- Stderr (last 20 lines):
1084
- ${STDERR_TAIL}
1085
- Stdout (last 10 lines):
1086
- ${STDOUT_TAIL}
1087
- "
1088
- done
1089
-
1090
- # Only launch if we actually have fail details
1091
- if [ -n "$FAIL_DETAILS" ]; then
1092
- touch "$REPAIR_LOCK"
1093
- log "Launching NEXO Level 2 repair..."
1094
-
1095
- # Build propagation instructions if core services failed
1096
- # Only runs when NEXO_MAINTAINER=1 and NEXO_PUBLIC_REPO is configured
1097
- PROPAGATE_BLOCK=""
1098
- if [ "${NEXO_MAINTAINER:-}" = "1" ]; then
1099
- NEXO_PUBLIC_REPO="${NEXO_PUBLIC_REPO:-}"
1100
- if $HAS_CORE_FAILS && [ -n "$NEXO_PUBLIC_REPO" ] && [ -d "$NEXO_PUBLIC_REPO/.git" ]; then
1101
- PROPAGATE_BLOCK="
1102
- PROPAGATION (for [core] fixes ONLY):
1103
- If your fix modifies a file under $NEXO_HOME/ (server.py, db/, plugins/, scripts/):
1104
- 1. Commit the fix locally with a descriptive message
1105
- 2. Copy the changed files (sanitized — no personal data) to $NEXO_PUBLIC_REPO/src/
1106
- 3. Bump patch version in $NEXO_PUBLIC_REPO/package.json
1107
- 4. Commit + push
1108
- 5. Create a GitHub release with gh release create
1109
- Do NOT propagate fixes for [personal] services — those stay local only."
1110
- fi
1111
- fi
1112
-
1113
- # Write prompt to temp file (avoids heredoc quoting issues in subshell)
1114
- REPAIR_PROMPT_FILE=$(mktemp /tmp/nexo-repair-XXXXXXXX)
1115
- cat > "$REPAIR_PROMPT_FILE" <<NEXOPROMPT
1116
- WATCHDOG LEVEL 2 REPAIR — Automated diagnostic session.
1117
-
1118
- The watchdog detected services that failed mechanical repair (launchctl/systemctl re-registration).
1119
- Your job: diagnose the root cause and fix it. Do NOT ask the user anything — resolve autonomously.
1120
-
1121
- Each failure is tagged [core] or [personal]:
1122
- - [core] = part of NEXO public package — fix may need propagation to public repo
1123
- - [personal] = user-specific service — fix stays local only
1124
-
1125
- FAILURES:
1126
- ${FAIL_DETAILS}
1127
-
1128
- STEPS:
1129
- 1. Read the service config (plist on macOS, systemd unit on Linux) to understand the service
1130
- 2. Check stderr/stdout logs for the actual error
1131
- 3. Fix the root cause (missing file, bad config, dependency issue, etc.)
1132
- 4. Reload the service and verify it is running (launchctl on macOS, systemctl on Linux)
1133
- 5. Log what you did to $NEXO_HOME/logs/watchdog-repair-result.log
1134
- ${PROPAGATE_BLOCK}
1135
-
1136
- CONSTRAINTS:
1137
- - Do NOT modify CLAUDE.md, AGENTS.md, or any protected file
1138
- - Do NOT start interactive conversations
1139
- - Keep it under 5 minutes
1140
- - Log what you did to $NEXO_HOME/logs/watchdog-repair-result.log
1141
- NEXOPROMPT
1142
-
1143
- # Launch NEXO in background with the configured automation backend.
1144
- # Keep the hardened Claude fallback for older runtimes or partial installs.
1145
- AGENT_RUNNER="$NEXO_HOME/scripts/nexo-agent-run.py"
1146
- NEXO_PYTHON="$NEXO_HOME/.venv/bin/python3"
1147
- if [ ! -x "$NEXO_PYTHON" ]; then
1148
- NEXO_PYTHON=$(command -v python3 2>/dev/null || echo "python3")
1149
- fi
1150
-
1151
- if [ -f "$AGENT_RUNNER" ]; then
1152
- nohup bash -c "\"$NEXO_PYTHON\" \"$AGENT_RUNNER\" --prompt-file '$REPAIR_PROMPT_FILE' >> '$LOG_DIR/watchdog-nexo-repair.log' 2>&1; rm -f '$REPAIR_PROMPT_FILE'" &
1153
- else
1154
- CLAUDE_BIN=$(command -v claude 2>/dev/null || echo "$HOME_DIR/.claude/local/bin/claude")
1155
- if [ ! -x "$CLAUDE_BIN" ]; then
1156
- CLAUDE_BIN=$(find /usr/local/bin /opt/homebrew/bin "$HOME_DIR/.local/bin" "$HOME_DIR/.npm-global/bin" -name claude -type f 2>/dev/null | head -1)
1157
- fi
1158
-
1159
- if [ -n "$CLAUDE_BIN" ] && [ -x "$CLAUDE_BIN" ]; then
1160
- nohup bash -c "\"$CLAUDE_BIN\" --print --dangerously-skip-permissions -p \"\$(cat '$REPAIR_PROMPT_FILE')\" >> '$LOG_DIR/watchdog-nexo-repair.log' 2>&1; rm -f '$REPAIR_PROMPT_FILE'" &
1161
- else
1162
- log "NEXO repair ABORTED: no automation backend wrapper and no claude CLI fallback found"
1163
- rm -f "$REPAIR_PROMPT_FILE"
1164
- fi
1165
- fi
1166
-
1167
- REPAIR_PID=$!
1168
- log "NEXO repair launched (PID: $REPAIR_PID)"
1169
-
1170
- # Wait for repair to complete (max 5 min) then verify
1171
- (
1172
- wait_count=0
1173
- while kill -0 $REPAIR_PID 2>/dev/null && [ $wait_count -lt 60 ]; do
1174
- sleep 5
1175
- wait_count=$((wait_count + 1))
1176
- done
1177
-
1178
- if [ $wait_count -ge 60 ]; then
1179
- log "NEXO repair timed out after 5 min"
1180
- kill $REPAIR_PID 2>/dev/null
1181
- else
1182
- log "NEXO repair completed. Verifying fixes..."
1183
- # Verify each failed monitor
1184
- VERIFY_PASS=0
1185
- VERIFY_FAIL=0
1186
- for failed in ${FAILED_MONITORS[@]+"${FAILED_MONITORS[@]}"}; do
1187
- IFS='|' read -r v_name v_plist v_stdout v_stderr v_proc v_sched v_type v_details <<< "$failed"
1188
- if try_verify_repair "$v_plist" "$v_stdout" "$v_proc" "$v_type"; then
1189
- VERIFY_PASS=$((VERIFY_PASS + 1))
1190
- log "VERIFY OK: $v_name"
1191
- else
1192
- VERIFY_FAIL=$((VERIFY_FAIL + 1))
1193
- log "VERIFY FAIL: $v_name — still broken after repair"
1194
- fi
1195
- done
1196
- log "Post-repair verification: $VERIFY_PASS passed, $VERIFY_FAIL failed"
1197
- echo "[$(date '+%Y-%m-%d %H:%M:%S')] Verification: $VERIFY_PASS OK, $VERIFY_FAIL FAIL" >> "$LOG_DIR/watchdog-nexo-repair.log"
1198
- fi
1199
- ) &
1200
- fi
1201
- fi
1202
- fi
1203
-
1204
- # ============================================================================
1205
- # LOG SUMMARY
1206
- # ============================================================================
1207
- log "Complete: PASS=$TOTAL_PASS HEALED=$TOTAL_HEALED WARN=$TOTAL_WARN FAIL=$TOTAL_FAIL"