nexo-brain 5.3.20 → 5.3.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/src/auto_update.py +11 -8
- package/src/dashboard/static/favicon 2.svg +32 -0
- package/src/dashboard/static/nexo-logo 2.png +0 -0
- package/src/dashboard/static/nexo-logo 2.svg +40 -0
- package/src/dashboard/static/style 2.css +2458 -0
- package/src/dashboard/templates/adaptive 2.html +118 -0
- package/src/dashboard/templates/artifacts 2.html +133 -0
- package/src/dashboard/templates/backups 2.html +136 -0
- package/src/dashboard/templates/base 2.html +417 -0
- package/src/dashboard/templates/calendar 2.html +591 -0
- package/src/dashboard/templates/chat 2.html +356 -0
- package/src/dashboard/templates/claims 2.html +259 -0
- package/src/dashboard/templates/cortex 2.html +321 -0
- package/src/dashboard/templates/credentials 2.html +128 -0
- package/src/dashboard/templates/crons 2.html +370 -0
- package/src/dashboard/templates/dashboard 2.html +494 -0
- package/src/dashboard/templates/dreams 2.html +252 -0
- package/src/dashboard/templates/email 2.html +160 -0
- package/src/dashboard/templates/evolution 2.html +189 -0
- package/src/dashboard/templates/feed 2.html +249 -0
- package/src/dashboard/templates/followup_health 2.html +170 -0
- package/src/dashboard/templates/graph 2.html +201 -0
- package/src/dashboard/templates/guard 2.html +259 -0
- package/src/dashboard/templates/inbox 2.html +251 -0
- package/src/dashboard/templates/memory 2.html +420 -0
- package/src/dashboard/templates/operations 2.html +608 -0
- package/src/dashboard/templates/plugins 2.html +185 -0
- package/src/dashboard/templates/protocol 2.html +199 -0
- package/src/dashboard/templates/rules 2.html +246 -0
- package/src/dashboard/templates/sentiment 2.html +247 -0
- package/src/dashboard/templates/sessions 2.html +218 -0
- package/src/dashboard/templates/skills 2.html +329 -0
- package/src/dashboard/templates/somatic 2.html +73 -0
- package/src/dashboard/templates/triggers 2.html +133 -0
- package/src/dashboard/templates/trust 2.html +360 -0
- package/src/db/__init__ 2.py +259 -0
- package/src/db/_core 2.py +437 -0
- package/src/db/_credentials 2.py +124 -0
- package/src/db/_episodic 2.py +762 -0
- package/src/db/_evolution 2.py +54 -0
- package/src/db/_fts 2.py +406 -0
- package/src/db/_goal_profiles 2.py +376 -0
- package/src/db/_hot_context 2.py +660 -0
- package/src/db/_outcomes 2.py +800 -0
- package/src/db/_personal_scripts 2.py +582 -0
- package/src/db/_sessions 2.py +330 -0
- package/src/db/_tasks 2.py +91 -0
- package/src/db/_watchers 2.py +173 -0
- package/src/doctor/formatters 2.py +52 -0
- package/src/doctor/models 2.py +69 -0
- package/src/doctor/planes 2.py +87 -0
- package/src/doctor/providers/__init__ 2.py +1 -0
- package/src/doctor/providers/deep 2.py +367 -0
- package/src/evolution_cycle 2.py +519 -0
- package/src/hooks/auto_capture 2.py +208 -0
- package/src/hooks/caffeinate-guard 2.sh +8 -0
- package/src/hooks/capture-session 2.sh +21 -0
- package/src/hooks/capture-tool-logs 2.sh +158 -0
- package/src/hooks/daily-briefing-check 2.sh +33 -0
- package/src/hooks/heartbeat-enforcement 2.py +90 -0
- package/src/hooks/heartbeat-posttool 2.sh +18 -0
- package/src/hooks/inbox-hook 2.sh +76 -0
- package/src/hooks/post-compact 2.sh +152 -0
- package/src/hooks/pre-compact 2.sh +169 -0
- package/src/hooks/protocol-guardrail 2.sh +10 -0
- package/src/hooks/protocol-pretool-guardrail 2.sh +9 -0
- package/src/hooks/session-stop 2.sh +52 -0
- package/src/kg_populate 2.py +292 -0
- package/src/maintenance 2.py +53 -0
- package/src/memory_backends 2.py +71 -0
- package/src/migrate_embeddings 2.py +124 -0
- package/src/nexo_sdk 2.py +103 -0
- package/src/observability 2.py +199 -0
- package/src/plugin_loader 2.py +217 -0
- package/src/plugins/__init__ 2.py +0 -0
- package/src/plugins/artifact_registry 2.py +450 -0
- package/src/plugins/backup 2.py +127 -0
- package/src/plugins/claims_tools 2.py +119 -0
- package/src/plugins/cognitive_memory 2.py +609 -0
- package/src/plugins/core_rules 2.py +252 -0
- package/src/plugins/cortex 2.py +1155 -0
- package/src/plugins/entities 2.py +67 -0
- package/src/plugins/episodic_memory 2.py +560 -0
- package/src/plugins/evolution 2.py +167 -0
- package/src/plugins/goal_engine 2.py +142 -0
- package/src/plugins/guard 2.py +862 -0
- package/src/plugins/impact 2.py +29 -0
- package/src/plugins/knowledge_graph_tools 2.py +137 -0
- package/src/plugins/media_memory_tools 2.py +98 -0
- package/src/plugins/memory_export 2.py +196 -0
- package/src/plugins/outcomes 2.py +130 -0
- package/src/plugins/personal_scripts 2.py +117 -0
- package/src/plugins/preferences 2.py +47 -0
- package/src/plugins/protocol 2.py +1449 -0
- package/src/plugins/simple_api 2.py +106 -0
- package/src/plugins/skills 2.py +341 -0
- package/src/plugins/state_watchers 2.py +79 -0
- package/src/plugins/update 2.py +986 -0
- package/src/plugins/user_state_tools 2.py +43 -0
- package/src/plugins/workflow 2.py +588 -0
- package/src/protocol_settings 2.py +59 -0
- package/src/public_contribution 2.py +466 -0
- package/src/public_evolution_queue 2.py +241 -0
- package/src/requirements 2.txt +14 -0
- package/src/retroactive_learnings 2.py +373 -0
- package/src/rules/__init__ 2.py +0 -0
- package/src/rules/core-rules 2.json +331 -0
- package/src/rules/migrate 2.py +207 -0
- package/src/runtime_power 2.py +874 -0
- package/src/script_registry 2.py +1559 -0
- package/src/scripts/check-context 2.py +272 -0
- package/src/scripts/deep-sleep/apply_findings 2.py +2327 -0
- package/src/scripts/deep-sleep/collect 2.py +928 -0
- package/src/scripts/deep-sleep/extract 2.py +330 -0
- package/src/scripts/deep-sleep/extract-prompt 2.md +285 -0
- package/src/scripts/deep-sleep/synthesize 2.py +312 -0
- package/src/scripts/deep-sleep/synthesize-prompt 2.md +336 -0
- package/src/scripts/nexo-agent-run 2.py +75 -0
- package/src/scripts/nexo-auto-update 2.py +6 -0
- package/src/scripts/nexo-backup 2.sh +25 -0
- package/src/scripts/nexo-brain-activation 2.sh +140 -0
- package/src/scripts/nexo-catchup 2.py +300 -0
- package/src/scripts/nexo-cognitive-decay 2.py +257 -0
- package/src/scripts/nexo-cortex-cycle 2.py +293 -0
- package/src/scripts/nexo-cron-wrapper 2.sh +53 -0
- package/src/scripts/nexo-daily-self-audit 2.py +2161 -0
- package/src/scripts/nexo-dashboard 2.sh +29 -0
- package/src/scripts/nexo-deep-sleep 2.sh +86 -0
- package/src/scripts/nexo-evolution-run 2.py +1664 -0
- package/src/scripts/nexo-followup-hygiene 2.py +139 -0
- package/src/scripts/nexo-hook-record 2.py +42 -0
- package/src/scripts/nexo-immune 2.py +936 -0
- package/src/scripts/nexo-impact-scorer 2.py +117 -0
- package/src/scripts/nexo-inbox-hook 2.sh +74 -0
- package/src/scripts/nexo-install 2.py +6 -0
- package/src/scripts/nexo-learning-housekeep 2.py +401 -0
- package/src/scripts/nexo-learning-validator 2.py +266 -0
- package/src/scripts/nexo-migrate 2.py +260 -0
- package/src/scripts/nexo-outcome-checker 2.py +127 -0
- package/src/scripts/nexo-postmortem-consolidator 2.py +456 -0
- package/src/scripts/nexo-pre-commit 2.py +120 -0
- package/src/scripts/nexo-prevent-sleep 2.sh +35 -0
- package/src/scripts/nexo-proactive-dashboard 2.py +354 -0
- package/src/scripts/nexo-reflection 2.py +256 -0
- package/src/scripts/nexo-runtime-preflight 2.py +274 -0
- package/src/scripts/nexo-sleep 2.py +631 -0
- package/src/scripts/nexo-snapshot-restore 2.sh +35 -0
- package/src/scripts/nexo-sync-clients 2.py +16 -0
- package/src/scripts/nexo-synthesis 2.py +475 -0
- package/src/scripts/nexo-tcc-approve 2.sh +79 -0
- package/src/scripts/nexo-update 2.sh +306 -0
- package/src/scripts/nexo-watchdog 2.sh +1207 -0
- package/src/scripts/nexo-watchdog-smoke 2.py +119 -0
- package/src/scripts/rehydrate_learnings_from_archive 2.py +245 -0
- package/src/server 2.py +1296 -0
- package/src/skills/run-nexo-audit-phase/guide 2.md +43 -0
- package/src/skills/run-nexo-audit-phase/skill 2.json +59 -0
- package/src/skills/run-nexo-core-fix-cycle/guide 2.md +17 -0
- package/src/skills/run-nexo-core-fix-cycle/script 2.py +276 -0
- package/src/skills/run-nexo-core-fix-cycle/skill 2.json +58 -0
- package/src/skills/run-release-final-audit/guide 2.md +16 -0
- package/src/skills/run-release-final-audit/script 2.py +259 -0
- package/src/skills/run-release-final-audit/skill 2.json +77 -0
- package/src/skills/run-runtime-doctor/guide 2.md +12 -0
- package/src/skills/run-runtime-doctor/script 2.py +21 -0
- package/src/skills/run-runtime-doctor/skill 2.json +25 -0
- package/src/skills_runtime 2.py +932 -0
- package/src/state_watchers_runtime 2.py +475 -0
- package/src/storage_router 2.py +32 -0
- package/src/system_catalog 2.py +786 -0
- package/src/tools_coordination 2.py +103 -0
- package/src/tools_credentials 2.py +68 -0
- package/src/tools_drive 2.py +487 -0
- package/src/tools_hot_context 2.py +163 -0
- package/src/tools_learnings 2.py +612 -0
- package/src/tools_menu 2.py +229 -0
- package/src/tools_reminders 2.py +88 -0
- package/src/tools_reminders_crud 2.py +363 -0
- package/src/tools_sessions 2.py +1054 -0
- package/src/tools_system_catalog 2.py +19 -0
- package/src/tools_task_history 2.py +57 -0
- package/src/tools_transcripts 2.py +98 -0
- package/src/transcript_utils 2.py +412 -0
- package/src/user_context 2.py +46 -0
- package/src/user_data_portability 2.py +328 -0
- package/src/user_state_model 2.py +170 -0
- package/templates/CLAUDE.md 2.template +108 -0
- package/templates/CODEX.AGENTS.md 2.template +66 -0
- package/templates/launchagents/README 2.md +132 -0
- package/templates/launchagents/com.nexo.auto-close-sessions 2.plist +39 -0
- package/templates/launchagents/com.nexo.catchup 2.plist +39 -0
- package/templates/launchagents/com.nexo.cognitive-decay 2.plist +40 -0
- package/templates/launchagents/com.nexo.dashboard 2.plist +43 -0
- package/templates/launchagents/com.nexo.deep-sleep 2.plist +43 -0
- package/templates/launchagents/com.nexo.evolution 2.plist +44 -0
- package/templates/launchagents/com.nexo.followup-hygiene 2.plist +45 -0
- package/templates/launchagents/com.nexo.immune 2.plist +41 -0
- package/templates/launchagents/com.nexo.postmortem 2.plist +45 -0
- package/templates/launchagents/com.nexo.self-audit 2.plist +47 -0
- package/templates/launchagents/com.nexo.synthesis 2.plist +45 -0
- package/templates/launchagents/com.nexo.watchdog 2.plist +37 -0
- package/templates/nexo_helper 2.py +301 -0
- package/templates/openclaw 2.json +13 -0
- package/templates/plugin-template 2.py +40 -0
- package/templates/script-template 2.py +59 -0
- package/templates/script-template 2.sh +13 -0
- package/templates/skill-script-template 2.py +48 -0
- package/templates/skill-template 2.md +33 -0
|
@@ -0,0 +1,936 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
NEXO Immune System — Health monitor & auto-repair.
|
|
4
|
+
|
|
5
|
+
Runs every 30 minutes via LaunchAgent. Checks tokens, LaunchAgents, DBs,
|
|
6
|
+
scripts, logs, disk, and remote server crons. Auto-repairs what it can,
|
|
7
|
+
alerts via notification on NEW failures.
|
|
8
|
+
|
|
9
|
+
Zero external dependencies. Stdlib + sqlite3 + urllib only.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import fcntl
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import shlex
|
|
17
|
+
import signal
|
|
18
|
+
import sqlite3
|
|
19
|
+
import ssl
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
import time
|
|
23
|
+
from datetime import datetime, date, timedelta
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
from client_preferences import resolve_user_model as _resolve_user_model
|
|
29
|
+
_USER_MODEL = _resolve_user_model()
|
|
30
|
+
except Exception:
|
|
31
|
+
_USER_MODEL = ""
|
|
32
|
+
|
|
33
|
+
NEXO_HOME = Path(os.environ.get("NEXO_HOME", str(Path.home() / ".nexo")))
|
|
34
|
+
_script_dir = Path(__file__).resolve().parent
|
|
35
|
+
_repo_src = _script_dir.parent
|
|
36
|
+
NEXO_CODE = Path(os.environ.get("NEXO_CODE", str(_repo_src) if (_repo_src / "server.py").exists() else str(NEXO_HOME)))
|
|
37
|
+
if str(NEXO_CODE) not in sys.path:
|
|
38
|
+
sys.path.insert(0, str(NEXO_CODE))
|
|
39
|
+
|
|
40
|
+
from agent_runner import AutomationBackendUnavailableError, run_automation_prompt
|
|
41
|
+
|
|
42
|
+
from urllib.request import Request, urlopen
|
|
43
|
+
from urllib.error import URLError, HTTPError
|
|
44
|
+
|
|
45
|
+
# ─── SSL context for macOS (certifi or system certs) ─────────────────────────
|
|
46
|
+
def _make_ssl_context():
|
|
47
|
+
"""Create an SSL context that works on macOS with Python.org Python."""
|
|
48
|
+
# Try certifi first (pip-installed)
|
|
49
|
+
try:
|
|
50
|
+
import certifi
|
|
51
|
+
ctx = ssl.create_default_context(cafile=certifi.where())
|
|
52
|
+
return ctx
|
|
53
|
+
except ImportError:
|
|
54
|
+
pass
|
|
55
|
+
# Try macOS system certificates
|
|
56
|
+
for ca_path in [
|
|
57
|
+
"/etc/ssl/cert.pem",
|
|
58
|
+
"/usr/local/etc/openssl/cert.pem",
|
|
59
|
+
"/usr/local/etc/openssl@3/cert.pem",
|
|
60
|
+
"/opt/homebrew/etc/openssl@3/cert.pem",
|
|
61
|
+
]:
|
|
62
|
+
if os.path.exists(ca_path):
|
|
63
|
+
ctx = ssl.create_default_context(cafile=ca_path)
|
|
64
|
+
return ctx
|
|
65
|
+
# Last resort: unverified (still better than crashing)
|
|
66
|
+
ctx = ssl.create_default_context()
|
|
67
|
+
ctx.check_hostname = False
|
|
68
|
+
ctx.verify_mode = ssl.CERT_NONE
|
|
69
|
+
return ctx
|
|
70
|
+
|
|
71
|
+
SSL_CTX = _make_ssl_context()
|
|
72
|
+
|
|
73
|
+
# ─── Paths ────────────────────────────────────────────────────────────────────
|
|
74
|
+
HOME = Path.home()
|
|
75
|
+
CLAUDE_DIR = NEXO_HOME
|
|
76
|
+
COORD_DIR = CLAUDE_DIR / "coordination"
|
|
77
|
+
BRAIN_DIR = CLAUDE_DIR / "brain"
|
|
78
|
+
SCRIPTS_DIR = CLAUDE_DIR / "scripts"
|
|
79
|
+
|
|
80
|
+
IMMUNE_STATUS = COORD_DIR / "immune-status.json"
|
|
81
|
+
IMMUNE_LOG = COORD_DIR / "immune-log.json"
|
|
82
|
+
LOCK_FILE = COORD_DIR / "immune-process.lock"
|
|
83
|
+
|
|
84
|
+
# Configure your alert script here (optional)
|
|
85
|
+
# ALERT_SCRIPT = SCRIPTS_DIR / "my-notify.sh"
|
|
86
|
+
|
|
87
|
+
CLAUDE_MEM_DB = HOME / ".claude-mem" / "claude-mem.db"
|
|
88
|
+
|
|
89
|
+
LAUNCH_AGENTS_DIR = HOME / "Library" / "LaunchAgents"
|
|
90
|
+
CLAUDE_CLI = HOME / ".local" / "bin" / "claude"
|
|
91
|
+
|
|
92
|
+
NOW = datetime.now()
|
|
93
|
+
TODAY = date.today()
|
|
94
|
+
|
|
95
|
+
# ─── Config ───────────────────────────────────────────────────────────────────
|
|
96
|
+
|
|
97
|
+
# Token checks — configure for your services.
|
|
98
|
+
# Supported types: file_text (read file, optional test_url), json_field (check for refresh_token),
|
|
99
|
+
# service_account (check for private_key/client_email), hardcoded (direct URL test)
|
|
100
|
+
TOKEN_CHECKS = [
|
|
101
|
+
# Example: uncomment and configure for your services
|
|
102
|
+
# {
|
|
103
|
+
# "name": "My API",
|
|
104
|
+
# "path": "~/.nexo/my_api_token.txt",
|
|
105
|
+
# "type": "file_text",
|
|
106
|
+
# "test_url": "https://api.example.com/health?token={token}",
|
|
107
|
+
# },
|
|
108
|
+
# {
|
|
109
|
+
# "name": "My Service Account",
|
|
110
|
+
# "path": "~/.nexo/service-account.json",
|
|
111
|
+
# "type": "service_account",
|
|
112
|
+
# },
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
EXPECTED_AGENTS = [
|
|
116
|
+
"com.nexo.immune",
|
|
117
|
+
"com.nexo.sleep",
|
|
118
|
+
"com.nexo.synthesis",
|
|
119
|
+
]
|
|
120
|
+
|
|
121
|
+
# SSH check interval — only every 2 hours, not every 30 min
|
|
122
|
+
SSH_CHECK_INTERVAL_HOURS = 2
|
|
123
|
+
|
|
124
|
+
# Log size thresholds (bytes)
|
|
125
|
+
LOG_WARN_SIZE = 10 * 1024 * 1024 # 10 MB
|
|
126
|
+
LOG_FAIL_SIZE = 50 * 1024 * 1024 # 50 MB
|
|
127
|
+
LOG_TRUNCATE_SIZE = 50 * 1024 * 1024 # 50 MB — auto-truncate threshold
|
|
128
|
+
|
|
129
|
+
# Disk thresholds (percentage used)
|
|
130
|
+
DISK_WARN_PCT = 85
|
|
131
|
+
DISK_FAIL_PCT = 95
|
|
132
|
+
|
|
133
|
+
# Quiet hours — alert notifications are suppressed during this window
|
|
134
|
+
QUIET_START = 23 # 23:00
|
|
135
|
+
QUIET_END = 7 # 07:00
|
|
136
|
+
|
|
137
|
+
# Skip execution hours (deep night)
|
|
138
|
+
SKIP_START = 0 # 00:00
|
|
139
|
+
SKIP_END = 6 # 06:00
|
|
140
|
+
|
|
141
|
+
# Max entries in immune-log.json
|
|
142
|
+
MAX_LOG_ENTRIES = 500
|
|
143
|
+
|
|
144
|
+
# HTTP timeout for token checks
|
|
145
|
+
HTTP_TIMEOUT = 10
|
|
146
|
+
|
|
147
|
+
# SSH timeout
|
|
148
|
+
SSH_TIMEOUT = 15
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
152
|
+
|
|
153
|
+
_LOAD_JSON_UNSET = object()


def load_json(path, default=_LOAD_JSON_UNSET):
    """Read and parse a JSON file, falling back to ``default`` on failure.

    Args:
        path: ``pathlib.Path`` of the file to read.
        default: value returned when the file is missing, unreadable or
            not valid JSON. When omitted, a new empty dict is returned.
            An explicit ``None`` is honoured, so callers can tell
            "no usable data" apart from an empty object.

    Returns:
        The parsed JSON value, or the fallback described above.
    """
    fallback = {} if default is _LOAD_JSON_UNSET else default
    if not path.exists():
        return fallback
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return fallback
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def save_json(path, data):
    """Serialize ``data`` as indented JSON and write it to ``path``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
    file is always encoded as UTF-8 rather than the locale's encoding.

    Args:
        path: ``pathlib.Path`` of the destination file (overwritten).
        data: JSON-serializable value.
    """
    text = json.dumps(data, indent=2, ensure_ascii=False)
    path.write_text(text, encoding="utf-8")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def is_quiet_hours():
    """Tell whether NOW falls inside the quiet window.

    The window is the half-open hour range [QUIET_START, QUIET_END). When
    the start hour is greater than the end hour the window wraps past
    midnight.
    """
    hour = NOW.hour
    wraps_midnight = QUIET_START > QUIET_END
    if not wraps_midnight:
        return QUIET_START <= hour < QUIET_END
    return hour >= QUIET_START or hour < QUIET_END
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def is_skip_hours():
    """Tell whether NOW falls inside the skip-execution window.

    The window is the half-open hour range [SKIP_START, SKIP_END). A start
    hour greater than the end hour is treated as a window that wraps past
    midnight, matching how is_quiet_hours() interprets its bounds.
    """
    hour = NOW.hour
    if SKIP_START > SKIP_END:
        return hour >= SKIP_START or hour < SKIP_END
    return SKIP_START <= hour < SKIP_END
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def send_alert(title, message):
    """Emit an alert unless quiet hours are in effect.

    This implementation only prints to stdout; it does not invoke any
    external notifier. Replace or extend it for active notifications
    (script, email, Slack, etc.).

    Args:
        title: short alert headline.
        message: alert body; omitted from the output when suppressed.

    Returns:
        True when the alert was emitted, False when it was suppressed.
    """
    if not is_quiet_hours():
        print(f" [ALERT] {title}: {message}")
        return True
    print(f" [QUIET] Suppressed alert: {title}")
    return False
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def http_get(url, headers=None, timeout=HTTP_TIMEOUT):
    """Perform an HTTP GET and never raise.

    Args:
        url: absolute URL to fetch.
        headers: optional mapping of request header names to values.
        timeout: timeout in seconds passed to ``urlopen``.

    Returns:
        ``(status, body)`` on a completed exchange. For HTTP error
        statuses the body is the server's response payload (falling back
        to the exception text when the payload is empty or unreadable), so
        callers can inspect API error details. ``(0, error_string)`` when
        no HTTP response was obtained.
    """
    try:
        req = Request(url)
        for key, value in (headers or {}).items():
            req.add_header(key, value)
        with urlopen(req, timeout=timeout, context=SSL_CTX) as resp:
            return resp.status, resp.read().decode("utf-8", errors="replace")
    except HTTPError as e:
        # HTTPError doubles as the response object; its payload usually
        # carries the API's error description.
        try:
            payload = e.read().decode("utf-8", errors="replace")
        except Exception:
            payload = ""  # best effort: fall back to the exception text
        finally:
            try:
                e.close()
            except Exception:
                pass
        return e.code, payload or str(e)
    except URLError as e:
        return 0, str(e.reason)
    except Exception as e:
        return 0, str(e)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def run_cmd(cmd, timeout=30):
    """Execute a command directly (no shell) and capture its output.

    Args:
        cmd: either an argv sequence or a string, which is tokenized with
            ``shlex.split``.
        timeout: seconds to wait before giving up.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams stripped. On
        timeout returns ``(-1, "", "timeout")``; on any other failure to
        run the command returns ``(-1, "", <error text>)``.
    """
    try:
        if isinstance(cmd, str):
            args = shlex.split(cmd)
        else:
            args = list(cmd)
        proc = subprocess.run(
            args, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as exc:
        return -1, "", str(exc)
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def pid_alive(pid):
    """Check whether a process with the given PID currently exists.

    Uses signal 0, which performs the existence/permission check without
    delivering a signal.

    Args:
        pid: process id as an int.

    Returns:
        True when the process exists (including when it belongs to another
        user and may not be signalled), False otherwise. PIDs <= 0 are
        never treated as alive: they address process groups rather than a
        single process.
    """
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # EPERM means the process exists but is owned by someone else.
        return True
    except (OSError, OverflowError):
        # OverflowError: value does not fit the platform's pid type.
        return False
    return True
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# ─── Check Functions ──────────────────────────────────────────────────────────
|
|
235
|
+
|
|
236
|
+
def _token_check_read_json_object(path):
    """Parse ``path`` as a JSON object; return None if unreadable or not an object."""
    try:
        data = json.loads(path.read_text())
    except (OSError, ValueError):
        return None
    return data if isinstance(data, dict) else None


def _token_check_file_text(tc):
    """Validate a plain-text token file, optionally probing ``test_url``."""
    path = Path(tc["path"]).expanduser()
    if not path.exists():
        return "FAIL", f"Token file missing: {path}"
    token = path.read_text().strip()
    if not token:
        return "FAIL", "Token file empty"
    if "test_url" not in tc:
        return "OK", "Token present (no test_url configured)"
    code, body = http_get(tc["test_url"].format(token=token))
    if code == 200:
        return "OK", "HTTP 200 OK"
    if code == 190 or (isinstance(body, str) and "expired" in body.lower()):
        return "FAIL", f"Token expired (HTTP {code})"
    return "FAIL", f"HTTP {code}: {body[:200]}"


def _token_check_json_field(tc):
    """Validate that a JSON credentials file contains ``refresh_token``."""
    path = Path(tc["path"]).expanduser()
    if not path.exists():
        return "FAIL", f"Token file missing: {path}"
    data = _token_check_read_json_object(path)
    if data is None:
        return "FAIL", "Invalid JSON"
    if "refresh_token" not in data:
        return "FAIL", "No refresh_token in JSON"
    return "OK", "refresh_token present"


def _token_check_service_account(tc):
    """Validate that a service-account file has ``private_key`` and ``client_email``."""
    path = Path(tc["path"]).expanduser()
    if not path.exists():
        return "FAIL", f"Service account file missing: {path}"
    data = _token_check_read_json_object(path)
    if data is None:
        return "FAIL", "Invalid JSON"
    if "private_key" not in data or "client_email" not in data:
        return "FAIL", "Missing private_key or client_email"
    return "OK", f"SA: {data.get('client_email', '?')[:40]}"


def _token_check_hardcoded(tc):
    """Probe ``test_url`` with the configured token sent in the ``header`` field."""
    code, body = http_get(tc["test_url"], headers={tc["header"]: tc["token"]})
    if code == 200:
        return "OK", "HTTP 200 OK"
    if code == 401:
        return "FAIL", "Token unauthorized (401)"
    return "FAIL", f"HTTP {code}: {body[:200]}"


def check_tokens():
    """Run every entry of TOKEN_CHECKS and report one result per entry.

    Each entry is dispatched on its ``type`` ("file_text", "json_field",
    "service_account" or "hardcoded"). An unrecognized type is reported as
    FAIL rather than silently passing. Any exception raised while checking
    an entry is converted into a FAIL result for that entry.

    Returns:
        list of dicts with keys ``name``, ``status`` ("OK"/"FAIL") and
        ``detail``.
    """
    handlers = {
        "file_text": _token_check_file_text,
        "json_field": _token_check_json_field,
        "service_account": _token_check_service_account,
        "hardcoded": _token_check_hardcoded,
    }
    results = []

    for tc in TOKEN_CHECKS:
        result = {"name": tc["name"], "status": "OK", "detail": ""}
        try:
            handler = handlers.get(tc["type"])
            if handler is None:
                status, detail = "FAIL", f"Unknown token check type: {tc['type']!r}"
            else:
                status, detail = handler(tc)
            result["status"] = status
            result["detail"] = detail
        except Exception as e:
            # A misconfigured entry must not abort the remaining checks.
            result["status"] = "FAIL"
            result["detail"] = f"Exception: {str(e)[:200]}"
        results.append(result)

    return results
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def check_launch_agents():
    """Verify that every agent in EXPECTED_AGENTS is loaded, reloading if not.

    Loaded labels are taken from the third tab-separated column of
    ``launchctl list``. A missing agent whose plist exists in
    LAUNCH_AGENTS_DIR is loaded with ``launchctl load``; a successful
    reload is reported as WARN with ``repaired=True``.

    If ``launchctl list`` itself fails, the loaded state is unknown, so no
    reload is attempted and every agent is reported as FAIL with the
    command error.

    Returns:
        list of dicts with keys ``name``, ``status``, ``detail`` and
        ``repaired``.
    """
    rc, stdout, stderr = run_cmd(["launchctl", "list"])
    if rc != 0:
        reason = (stderr or "unknown error")[:100]
        return [
            {
                "name": agent,
                "status": "FAIL",
                "detail": f"launchctl list failed: {reason}",
                "repaired": False,
            }
            for agent in EXPECTED_AGENTS
        ]

    loaded_labels = set()
    for line in stdout.splitlines():
        parts = line.split("\t")
        if len(parts) >= 3:
            loaded_labels.add(parts[2])

    results = []
    for agent in EXPECTED_AGENTS:
        result = {"name": agent, "status": "OK", "detail": "", "repaired": False}

        if agent in loaded_labels:
            result["detail"] = "Loaded"
            results.append(result)
            continue

        plist = LAUNCH_AGENTS_DIR / f"{agent}.plist"
        if not plist.exists():
            result["status"] = "FAIL"
            result["detail"] = f"Unloaded, plist not found: {plist}"
            results.append(result)
            continue

        # argv form: the path is passed verbatim, so quotes or spaces in
        # the home directory cannot break tokenization.
        rc, _out, err = run_cmd(["launchctl", "load", str(plist)])
        if rc == 0:
            result["status"] = "WARN"
            result["detail"] = "Was unloaded, auto-loaded successfully"
            result["repaired"] = True
        else:
            result["status"] = "FAIL"
            result["detail"] = f"Unloaded, auto-load failed: {err[:100]}"
        results.append(result)

    return results
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def check_databases():
    """Run ``PRAGMA integrity_check`` on the known SQLite databases.

    Checks nexo.db and cognitive.db under ``NEXO_HOME / "data"`` and the
    file at CLAUDE_MEM_DB. A missing file, a non-"ok" integrity result, or
    any error while opening/querying is reported as FAIL.

    Returns:
        list of dicts with keys ``name``, ``status`` and ``detail``.
    """
    results = []

    dbs = [
        ("nexo.db", NEXO_HOME / "data" / "nexo.db"),
        ("cognitive.db", NEXO_HOME / "data" / "cognitive.db"),
        ("claude-mem.db", CLAUDE_MEM_DB),
    ]

    for name, path in dbs:
        result = {"name": name, "status": "OK", "detail": ""}

        if not path.exists():
            result["status"] = "FAIL"
            result["detail"] = f"File missing: {path}"
            results.append(result)
            continue

        try:
            conn = sqlite3.connect(str(path), timeout=5)
            try:
                row = conn.execute("PRAGMA integrity_check").fetchone()
            finally:
                # Always release the handle, even when the PRAGMA raises
                # (e.g. locked or corrupt database).
                conn.close()
            check_result = row[0] if row else "no result returned"
            if check_result == "ok":
                size_mb = path.stat().st_size / (1024 * 1024)
                result["detail"] = f"Integrity OK ({size_mb:.1f} MB)"
            else:
                result["status"] = "FAIL"
                result["detail"] = f"Integrity failed: {check_result[:200]}"
        except Exception as e:
            result["status"] = "FAIL"
            result["detail"] = f"Error: {str(e)[:200]}"

        results.append(result)

    return results
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def check_scripts():
    """Inspect ``*.lock`` files in COORD_DIR and remove stale PID locks.

    This process's own LOCK_FILE is skipped. For each other lock file:

    * content that is an ASCII decimal PID: kept if the process is alive,
      otherwise the file is deleted and reported as WARN/repaired;
    * any other non-empty content: reported, left untouched;
    * empty (or whitespace-only) file: reported, left untouched.

    Errors while inspecting a file are recorded in ``detail`` without
    changing its status.

    Returns:
        list of dicts with keys ``name``, ``status``, ``detail`` and
        ``repaired``.
    """
    results = []

    for lf in COORD_DIR.glob("*.lock"):
        if lf == LOCK_FILE:
            continue  # Skip our own lock
        result = {"name": f"lock:{lf.name}", "status": "OK", "detail": "", "repaired": False}
        try:
            content = lf.read_text().strip()
            if not content:
                result["detail"] = "Empty lock file"
            elif content.isascii() and content.isdigit():
                # isascii() guard: str.isdigit() also accepts characters
                # such as superscript digits that int() cannot parse.
                pid = int(content)
                if pid_alive(pid):
                    result["detail"] = f"PID {pid} alive"
                else:
                    # Auto-repair: remove stale lock
                    lf.unlink()
                    result["status"] = "WARN"
                    result["detail"] = f"PID {pid} dead — lock removed"
                    result["repaired"] = True
            else:
                result["detail"] = f"Non-PID content: {content[:50]}"
        except Exception as e:
            result["detail"] = f"Error checking: {e}"
        results.append(result)

    return results
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def check_logs():
    """Report the size of known log files and shrink the oversized ones.

    Files that do not exist are skipped. A file at or above LOG_FAIL_SIZE
    is reported as FAIL and truncated in place (JSON logs to 200 entries,
    text logs to 1000 lines); a file at or above LOG_WARN_SIZE is reported
    as WARN; anything smaller is OK.

    Returns:
        list of dicts with keys ``name``, ``status``, ``detail`` and
        ``repaired``.
    """
    mib = 1024 * 1024
    json_names = (
        "heartbeat-log.json",
        "reflection-log.json",
        "immune-log.json",
        "ops-board.json",
        "messages.json",
    )
    text_names = (
        "heartbeat-stdout.log",
        "heartbeat-stderr.log",
        "reflection-stdout.log",
        "reflection-stderr.log",
        "immune-stdout.log",
        "immune-stderr.log",
    )
    candidates = [COORD_DIR / fname for fname in json_names + text_names]

    report = []
    for log_path in candidates:
        if not log_path.exists():
            continue

        entry = {"name": log_path.name, "status": "OK", "detail": "", "repaired": False}
        size = log_path.stat().st_size
        size_mb = size / mib

        if size >= LOG_FAIL_SIZE:
            entry["status"] = "FAIL"
            entry["detail"] = f"{size_mb:.1f} MB — exceeds {LOG_FAIL_SIZE // (1024*1024)} MB"

            # Shrink the file in place and append the outcome to the detail.
            try:
                if log_path.suffix != ".json":
                    _truncate_text_log(log_path, keep_lines=1000)
                else:
                    _truncate_json_log(log_path, keep_entries=200)
                new_size = log_path.stat().st_size / mib
                entry["detail"] += f" -> truncated to {new_size:.1f} MB"
                entry["repaired"] = True
            except Exception as e:
                entry["detail"] += f" -> truncate failed: {e}"
        elif size >= LOG_WARN_SIZE:
            entry["status"] = "WARN"
            entry["detail"] = f"{size_mb:.1f} MB — approaching limit"
        else:
            entry["detail"] = f"{size_mb:.2f} MB"

        report.append(entry)

    return report
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def _truncate_json_log(path, keep_entries=200):
    """Trim a JSON log so its lists hold at most ``keep_entries`` items.

    A top-level list longer than the limit is cut to its last
    ``keep_entries`` items and written back. For a top-level dict, every
    list value longer than the limit is cut the same way and the dict is
    always written back. Other payloads are left untouched.
    """
    payload = load_json(path, default=[])

    if isinstance(payload, list):
        if len(payload) > keep_entries:
            save_json(path, payload[-keep_entries:])
        return

    if not isinstance(payload, dict):
        return

    # Dict-shaped logs: trim each oversized list value in place.
    for field in payload:
        value = payload[field]
        if isinstance(value, list) and len(value) > keep_entries:
            payload[field] = value[-keep_entries:]
    save_json(path, payload)
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _truncate_text_log(path, keep_lines=1000):
|
|
507
|
+
"""Truncate a text log to the last N lines."""
|
|
508
|
+
lines = path.read_text().splitlines()
|
|
509
|
+
if len(lines) > keep_lines:
|
|
510
|
+
path.write_text("\n".join(lines[-keep_lines:]) + "\n")
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def check_disk():
    """Measure usage of the filesystem mounted at "/" via ``os.statvfs``.

    Usage is computed as (total - available-to-unprivileged) / total.
    Status is FAIL at or above DISK_FAIL_PCT, WARN at or above
    DISK_WARN_PCT, otherwise OK. Any error while measuring is reported as
    FAIL.

    Returns:
        a one-element list holding a dict with keys ``name``, ``status``
        and ``detail``.
    """
    entry = {"name": "disk:/", "status": "OK", "detail": ""}

    try:
        vfs = os.statvfs("/")
        total = vfs.f_frsize * vfs.f_blocks
        avail = vfs.f_frsize * vfs.f_bavail
        pct = ((total - avail) / total) * 100 if total > 0 else 0

        gib = 1024 ** 3
        avail_gb = avail / gib
        total_gb = total / gib

        # The detail text is the same for every severity; only status varies.
        if pct >= DISK_FAIL_PCT:
            entry["status"] = "FAIL"
        elif pct >= DISK_WARN_PCT:
            entry["status"] = "WARN"
        entry["detail"] = f"{pct:.1f}% used ({avail_gb:.1f} GB free of {total_gb:.0f} GB)"
    except Exception as e:
        entry["status"] = "FAIL"
        entry["detail"] = f"Error: {e}"

    return [entry]
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def check_server_crons():
    """Run the optional remote-server health check over SSH.

    Configure ``SSH_SERVER_CMD`` below with your server details if you want
    remote health checks. Leave it empty to skip.

    The command is executed at most once per ``SSH_CHECK_INTERVAL_HOURS``,
    based on the ``last_ssh_check`` timestamp stored in the status file.

    Returns:
        A ``(results, ran)`` tuple. ``results`` is a one-element list with a
        ``name``/``status``/``detail`` dict; ``ran`` is True only when the SSH
        command was actually executed during this call.
    """
    result = {"name": "remote-server", "status": "OK", "detail": ""}

    # Configure your SSH health check command here (empty = skip)
    # Example: 'ssh -p 22 user@myserver.example.com "echo OK"'
    SSH_SERVER_CMD = ""

    if not SSH_SERVER_CMD:
        result["detail"] = "No remote server configured (SSH_SERVER_CMD empty)"
        return [result], False

    # Decide whether enough time has passed since the last SSH check.
    status = load_json(IMMUNE_STATUS)
    if not isinstance(status, dict):
        status = {}
    last_ssh_str = status.get("last_ssh_check", "")
    should_run = True

    if last_ssh_str:
        try:
            last_ssh = datetime.strptime(last_ssh_str, "%Y-%m-%d %H:%M")
            hours_ago = (NOW - last_ssh).total_seconds() / 3600
        except (ValueError, TypeError):
            # Unparseable or unusable timestamp: fall through and run the check.
            pass
        else:
            # A timestamp in the future (clock moved backwards) gives a negative
            # age; treat it as stale rather than skipping until the clock catches up.
            if 0 <= hours_ago < SSH_CHECK_INTERVAL_HOURS:
                result["detail"] = f"Skipped (last check {hours_ago:.1f}h ago, interval {SSH_CHECK_INTERVAL_HOURS}h)"
                should_run = False

    if should_run:
        rc, stdout, stderr = run_cmd(SSH_SERVER_CMD, timeout=SSH_TIMEOUT)

        if rc == 0:
            result["detail"] = f"Server OK: {stdout[:100]}"
        else:
            result["status"] = "FAIL"
            err_short = (stderr or "unknown error")[:150]
            result["detail"] = f"SSH failed (rc={rc}): {err_short}"

    return [result], should_run
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
# ─── Alerting ─────────────────────────────────────────────────────────────────
|
|
592
|
+
|
|
593
|
+
def get_system_uptime_minutes():
    """Return the system uptime in minutes.

    Two sources are tried in order:

    1. ``/proc/uptime`` (Linux), whose first field is seconds since boot.
    2. ``sysctl -n kern.boottime`` (macOS/BSD), whose output contains the boot
       time as ``sec = <epoch seconds>``.

    Returns:
        Uptime in minutes as a float, or ``9999`` when neither source is
        usable, so callers assume a long-running system.
    """
    import re as _re

    try:
        with open("/proc/uptime", encoding="ascii") as fh:
            return float(fh.read().split()[0]) / 60
    except (OSError, ValueError, IndexError):
        # No procfs (e.g. macOS) or unexpected content: try sysctl instead.
        pass

    try:
        r = subprocess.run(
            ["sysctl", "-n", "kern.boottime"],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # sysctl missing, timed out, or produced undecodable output.
        return 9999

    if r.returncode == 0:
        # Format: { sec = 1709000000, usec = 0 } ...
        m = _re.search(r'sec\s*=\s*(\d+)', r.stdout)
        if m:
            boot_ts = int(m.group(1))
            return (time.time() - boot_ts) / 60

    return 9999  # Assume long uptime if we can't determine
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def detect_new_failures(current_results, previous_status):
    """Compare current results with the previous run to find NEW problems.

    Args:
        current_results: Mapping of category name to a list of result dicts
            (``name``, ``status``, ``detail``).
        previous_status: Status dict saved by the previous run; its optional
            ``checks`` entry has the same shape as ``current_results``.

    Returns:
        The result dicts that should trigger an alert.

    Behavior:
        * Boot grace: nothing is reported within 10 minutes of system boot,
          and the consecutive-failure counters are left untouched.
        * Every check keeps a streak of consecutive non-OK runs in
          ``immune-consecutive-failures.json``; an OK result resets it.
        * Debounce: server/SSH checks need a streak of 2, all others 1.
        * A FAIL is new when the streak meets the threshold and either the
          previous status was not FAIL or the threshold had not been met on
          the previous run. WARN follows the same rule against a previous OK.

    The "threshold had not been met" clause is what lets a debounced check
    alert on its second consecutive FAIL. Comparing against the previous
    status alone can never fire there, because the first failure has already
    been recorded as FAIL.
    """
    # Boot grace period — suppress alerts when network may still be settling
    uptime = get_system_uptime_minutes()
    if uptime < 10:
        print(f" [GRACE] System uptime {uptime:.0f}min < 10min — suppressing alerts")
        return []

    prev_checks = {
        f"{category}:{item.get('name', '')}": item.get("status", "OK")
        for category, items in previous_status.get("checks", {}).items()
        for item in items
    }

    # Load consecutive failure counts for debounce
    consec_file = COORD_DIR / "immune-consecutive-failures.json"
    consec = load_json(consec_file, default={})
    if not isinstance(consec, dict):
        consec = {}

    new_failures = []
    for category, items in current_results.items():
        for item in items:
            key = f"{category}:{item.get('name', '')}"
            current_status = item.get("status", "OK")
            prev_stat = prev_checks.get(key, "OK")

            if current_status not in ("FAIL", "WARN"):
                consec[key] = 0
                continue

            streak = consec.get(key, 0) + 1
            consec[key] = streak

            # Debounce: server/SSH checks need 2+ consecutive failures
            is_server_check = category == "server" or "ssh" in key.lower()
            min_consecutive = 2 if is_server_check else 1
            if streak < min_consecutive:
                continue

            # True when the previous run had already met the threshold, i.e.
            # this problem was already eligible for an alert.
            threshold_met_before = streak - 1 >= min_consecutive

            if current_status == "FAIL":
                is_new = prev_stat != "FAIL" or not threshold_met_before
            else:
                is_new = prev_stat == "OK" or not threshold_met_before

            if is_new:
                new_failures.append(item)

    save_json(consec_file, consec)
    return new_failures
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def send_failure_alerts(new_failures):
    """Send one alert summarising the new failures, at most once per 30 minutes.

    FAIL entries take priority: when any are present only they are reported
    (first five listed, the rest counted). WARN entries are reported (first
    three listed) only when there is no FAIL. The time of the last alert is
    kept in ``immune-last-alert.txt`` and refreshed only when ``send_alert``
    returns a truthy value.

    Args:
        new_failures: Result dicts with ``name``, ``status`` and ``detail``.
    """
    if not new_failures:
        return

    stamp_format = "%Y-%m-%d %H:%M"

    # Global alert cooldown — max 1 alert per 30 minutes
    cooldown_file = COORD_DIR / "immune-last-alert.txt"
    if cooldown_file.exists():
        try:
            previous_alert = datetime.strptime(cooldown_file.read_text().strip(), stamp_format)
            minutes_since = (NOW - previous_alert).total_seconds() / 60
            if minutes_since < 30:
                print(f" [COOLDOWN] Last alert {minutes_since:.0f}min ago — suppressing")
                return
        except Exception:
            # An unreadable cooldown marker must not block alerting.
            pass

    fails = [entry for entry in new_failures if entry["status"] == "FAIL"]
    warns = [entry for entry in new_failures if entry["status"] == "WARN"]

    sent = False
    if fails:
        body = "\n".join(f"- {entry['name']}: {entry['detail']}" for entry in fails[:5])
        hidden = len(fails) - 5
        if hidden > 0:
            body += f"\n... +{hidden} more"
        sent = send_alert("NEXO Immune FAIL", f"{len(fails)} new failure(s):\n{body}")
    elif warns:
        body = "\n".join(f"- {entry['name']}: {entry['detail']}" for entry in warns[:3])
        sent = send_alert("NEXO Immune WARN", f"{len(warns)} new warning(s):\n{body}")

    if sent:
        cooldown_file.write_text(NOW.strftime(stamp_format))
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
# ─── Main ─────────────────────────────────────────────────────────────────────
|
|
705
|
+
|
|
706
|
+
def main():
    """Entry point: print the banner, apply the gates, then run all checks.

    The run is skipped during the configured skip hours. Otherwise the
    coordination directory is created and an exclusive, non-blocking
    ``flock`` on ``LOCK_FILE`` ensures a single instance; if the lock cannot
    be taken the function prints a notice and returns. The lock is released
    and the file closed after the checks finish, even if they raise.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"NEXO Immune System — {NOW.strftime('%Y-%m-%d %H:%M:%S')}")
    print(banner)

    # Skip hours gate
    if is_skip_hours():
        print(f"[SKIP] Hour {NOW.hour} is within skip range ({SKIP_START}:00-{SKIP_END}:00). Exiting.")
        return

    # Ensure coordination directory exists
    COORD_DIR.mkdir(parents=True, exist_ok=True)

    # Process lock (fcntl)
    lock_handle = None
    try:
        lock_handle = open(LOCK_FILE, "w")
        fcntl.flock(lock_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except (IOError, OSError):
        print("[LOCKED] Another immune instance is running. Exiting.")
        # The handle exists only when open() succeeded and flock() failed.
        if lock_handle:
            lock_handle.close()
        return

    try:
        _run_checks(lock_handle)
    finally:
        # Best-effort cleanup; a failure to unlock must not mask the outcome.
        try:
            fcntl.flock(lock_handle, fcntl.LOCK_UN)
            lock_handle.close()
        except Exception:
            pass
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
def _run_checks(lock_fd):
    """Execute all seven checks, report them, alert, and persist the outcome.

    Steps, in order: run each check category and print its results, print the
    summary and any auto-repairs, detect and alert on new failures, save the
    status file, append a summary entry to the bounded log, and finally hand
    the findings to the CLI triage when there is any FAIL, more than two
    WARNs, or at least one repair.

    Args:
        lock_fd: The held lock file handle; not used inside this function.
    """
    previous_status = load_json(IMMUNE_STATUS)
    stamp = NOW.strftime("%Y-%m-%d %H:%M")

    all_results = {}
    repairs = []

    def _report(category, repair_template=None):
        """Print one category's results and record repairs when a template is given."""
        for entry in all_results[category]:
            print(f" [{entry['status']}] {entry['name']}: {entry['detail']}")
            if repair_template is not None and entry.get("repaired"):
                repairs.append(repair_template.format(entry["name"]))

    # 1. Tokens
    print("\n[1/7] Checking tokens...")
    all_results["tokens"] = check_tokens()
    _report("tokens")

    # 2. LaunchAgents
    print("\n[2/7] Checking LaunchAgents...")
    all_results["agents"] = check_launch_agents()
    _report("agents", "LaunchAgent {} reloaded")

    # 3. Databases
    print("\n[3/7] Checking databases...")
    all_results["databases"] = check_databases()
    _report("databases")

    # 4. Scripts & locks
    print("\n[4/7] Checking scripts & locks...")
    all_results["scripts"] = check_scripts()
    _report("scripts", "Stale lock {} removed")

    # 5. Logs
    print("\n[5/7] Checking log sizes...")
    all_results["logs"] = check_logs()
    _report("logs", "Log {} truncated")

    # 6. Disk
    print("\n[6/7] Checking disk usage...")
    all_results["disk"] = check_disk()
    _report("disk")

    # 7. Server crons
    print("\n[7/7] Checking server crons...")
    server_results, ssh_ran = check_server_crons()
    all_results["server"] = server_results
    _report("server")

    # ─── Summary ──────────────────────────────────────────────────────────
    counts = {"OK": 0, "WARN": 0, "FAIL": 0}
    for entries in all_results.values():
        for entry in entries:
            label = entry.get("status", "OK")
            # Statuses other than the three known ones are not counted.
            if label in counts:
                counts[label] += 1

    total = sum(counts.values())
    rule = "─" * 60

    print(f"\n{rule}")
    print(f"SUMMARY: {total} checks — {counts['OK']} OK, {counts['WARN']} WARN, {counts['FAIL']} FAIL")
    if repairs:
        print(f"AUTO-REPAIRS: {len(repairs)}")
        for note in repairs:
            print(f" - {note}")
    print(f"{rule}\n")

    # ─── Detect new failures & alert ──────────────────────────────────────
    new_failures = detect_new_failures(all_results, previous_status)
    if not new_failures:
        print("[OK] No new failures.")
    else:
        print(f"[ALERT] {len(new_failures)} new failure(s)/warning(s) detected:")
        for nf in new_failures:
            print(f" - [{nf['status']}] {nf['name']}: {nf['detail']}")
        send_failure_alerts(new_failures)

    # ─── Save status ──────────────────────────────────────────────────────
    status = {
        "last_run": stamp,
        "counts": counts,
        "repairs": repairs,
        "new_failures": len(new_failures),
        "checks": all_results,
    }
    # Carry the previous SSH timestamp forward when the SSH check did not run.
    if ssh_ran:
        status["last_ssh_check"] = stamp
    elif "last_ssh_check" in previous_status:
        status["last_ssh_check"] = previous_status["last_ssh_check"]

    save_json(IMMUNE_STATUS, status)

    # ─── Append to log ────────────────────────────────────────────────────
    history = load_json(IMMUNE_LOG, default=[])
    if not isinstance(history, list):
        history = []
    history.append({
        "ts": stamp,
        "ok": counts["OK"],
        "warn": counts["WARN"],
        "fail": counts["FAIL"],
        "repairs": len(repairs),
        "new_failures": len(new_failures),
    })
    if len(history) > MAX_LOG_ENTRIES:
        history = history[-MAX_LOG_ENTRIES:]
    save_json(IMMUNE_LOG, history)

    print(f"Status saved to {IMMUNE_STATUS}")
    print(f"Log appended to {IMMUNE_LOG} ({len(history)} entries)")

    # ─── Stage B: CLI interpretation (only when issues found) ────────────
    needs_triage = counts["FAIL"] > 0 or counts["WARN"] > 2 or repairs
    if needs_triage:
        _run_cli_triage(all_results, repairs, counts)
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
def _run_cli_triage(all_results: dict, repairs: list, counts: dict):
    """Pass all findings to the configured automation backend for triage.

    The findings are serialised to JSON, embedded in a prompt, and handed to
    ``run_automation_prompt``, which is asked to write a short report to
    ``immune-triage.md`` in the coordination directory. Every failure mode
    is reported on stdout and never raised to the caller.

    Args:
        all_results: Mapping of category name to its list of result dicts.
        repairs: Human-readable descriptions of the auto-repairs performed.
        counts: Number of OK / WARN / FAIL results.
    """
    # Single source of truth for the timeout so the log message cannot drift.
    timeout_seconds = 21600

    triage_file = COORD_DIR / "immune-triage.md"
    findings_json = json.dumps({
        "timestamp": NOW.strftime("%Y-%m-%d %H:%M"),
        "counts": counts,
        "repairs": repairs,
        "checks": all_results,
    }, indent=2, default=str)

    prompt = f"""You are the NEXO Immune System triage analyst.

Below are the raw health check results from a scheduled scan. Your job:

1. Identify which failures are REAL problems vs transient/expected
2. Group related issues (e.g. SSH failure + server cron failure = same root cause)
3. Prioritize: what needs attention NOW vs can wait
4. For each real issue, suggest a specific remediation action
5. Note any patterns across recent runs if visible

Write a concise triage report to: {triage_file}

Format:
## Immune Triage — YYYY-MM-DD HH:MM

### Critical (act now)
- ...

### Monitor (watch next run)
- ...

### Resolved (auto-repaired)
- ...

### Patterns
- ...

Raw findings:
{findings_json}

Write the report. Be concise — max 40 lines."""

    print("\n[TRIAGE] Running CLI interpretation...")
    try:
        result = run_automation_prompt(
            prompt,
            model=_USER_MODEL or "opus",
            timeout=timeout_seconds,
            output_format="text",
            allowed_tools="Read,Write,Edit,Glob,Grep,Bash,mcp__nexo__*",
        )
        if result.returncode == 0:
            print(f"[TRIAGE] Report written to {triage_file}")
        else:
            # stderr may be None when the backend did not capture it.
            stderr_head = (result.stderr or "")[:200]
            print(f"[TRIAGE] CLI exited {result.returncode}: {stderr_head}")
    except AutomationBackendUnavailableError as e:
        print(f"[TRIAGE] Skipping triage: {e}")
    except subprocess.TimeoutExpired:
        print(f"[TRIAGE] CLI timed out ({timeout_seconds}s)")
    except Exception as e:
        print(f"[TRIAGE] Error: {e}")
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
# Run the health checks only when this file is executed as a script.
if __name__ == "__main__":
    main()
|