nexo-brain 0.2.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -72
- package/bin/nexo-brain 2.js +610 -0
- package/package.json +2 -2
- package/scripts/pre-commit-check 2.sh +55 -0
- package/src/cognitive.py +1582 -56
- package/src/db.py +49 -25
- package/src/hooks/auto_capture.py +208 -0
- package/src/plugins/cognitive_memory.py +276 -17
- package/src/scripts/nexo-catchup.py +32 -15
- package/src/scripts/nexo-cognitive-decay.py +2 -4
- package/src/scripts/nexo-daily-self-audit.py +148 -29
- package/src/scripts/nexo-immune.py +869 -0
- package/src/scripts/nexo-postmortem-consolidator.py +42 -40
- package/src/scripts/nexo-sleep.py +90 -39
- package/src/scripts/nexo-synthesis.py +78 -76
- package/src/tools_sessions.py +2 -2
- package/templates/CLAUDE.md 2.template +89 -0
- package/templates/CLAUDE.md.template +1 -1
|
@@ -0,0 +1,869 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
NEXO Immune System — Health monitor & auto-repair.
|
|
4
|
+
|
|
5
|
+
Runs every 30 minutes via LaunchAgent. Checks tokens, LaunchAgents, DBs,
|
|
6
|
+
scripts, logs, disk, and server crons. Auto-repairs what it can, alerts
|
|
7
|
+
User via WhatsApp only on NEW failures.
|
|
8
|
+
|
|
9
|
+
Zero external dependencies. Stdlib + sqlite3 + urllib only.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import fcntl
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import shlex
|
|
17
|
+
import shutil
|
|
18
|
+
import signal
|
|
19
|
+
import sqlite3
|
|
20
|
+
import ssl
|
|
21
|
+
import subprocess
|
|
22
|
+
import sys
|
|
23
|
+
import time
|
|
24
|
+
from datetime import datetime, date, timedelta
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from urllib.request import Request, urlopen
|
|
27
|
+
from urllib.error import URLError, HTTPError
|
|
28
|
+
|
|
29
|
+
# ─── SSL context for macOS (certifi or system certs) ─────────────────────────
|
|
30
|
+
def _make_ssl_context():
    """Create a certificate-verifying SSL context that works on macOS.

    CA bundle lookup order:
      1. ``certifi`` (if the package is installed),
      2. well-known system / Homebrew OpenSSL bundle paths,
      3. Python's default trust store.

    Returns:
        ssl.SSLContext: a context with certificate and hostname verification
        enabled in every case.
    """
    # Prefer certifi's CA bundle when the package is available.
    try:
        import certifi
    except ImportError:
        pass
    else:
        return ssl.create_default_context(cafile=certifi.where())

    # Otherwise try the usual macOS / Homebrew CA bundle locations.
    for ca_path in (
        "/etc/ssl/cert.pem",
        "/usr/local/etc/openssl/cert.pem",
        "/usr/local/etc/openssl@3/cert.pem",
        "/opt/homebrew/etc/openssl@3/cert.pem",
    ):
        if os.path.exists(ca_path):
            return ssl.create_default_context(cafile=ca_path)

    # Last resort: the default trust store, with verification left ON.
    # This context is used to send API tokens, so silently disabling
    # verification (CERT_NONE) would expose them to interception. If no CA
    # can be found the request fails and the check is reported as FAIL.
    return ssl.create_default_context()
|
|
54
|
+
|
|
55
|
+
# Shared TLS context, built once at import time; http_get() passes it to urlopen().
SSL_CTX = _make_ssl_context()
|
|
56
|
+
|
|
57
|
+
# ─── Paths ────────────────────────────────────────────────────────────────────
HOME = Path.home()
# NOTE(review): this is ~/claude (no leading dot), while TOKEN_CHECKS below reads
# from ~/.claude — confirm that both locations are intended.
CLAUDE_DIR = HOME / "claude"
COORD_DIR = CLAUDE_DIR / "coordination"
BRAIN_DIR = CLAUDE_DIR / "brain"
SCRIPTS_DIR = CLAUDE_DIR / "scripts"

# Latest full report (also read back on the next run to detect NEW failures).
IMMUNE_STATUS = COORD_DIR / "immune-status.json"
# Rolling per-run summary log, capped at MAX_LOG_ENTRIES entries.
IMMUNE_LOG = COORD_DIR / "immune-log.json"
# flock()-ed by main() so that only one instance runs at a time.
LOCK_FILE = COORD_DIR / "immune-process.lock"

# Script invoked by send_wa_alert() with (title, message) as arguments.
WA_NOTIFY = SCRIPTS_DIR / "nexo-whatsapp-notify.sh"

CLAUDE_MEM_DB = HOME / ".claude-mem" / "claude-mem.db"

LAUNCH_AGENTS_DIR = HOME / "Library" / "LaunchAgents"

# Captured once at import time; every time-based decision in a run uses this value.
NOW = datetime.now()
# Not referenced by any code visible in this file.
TODAY = date.today()
|
|
76
|
+
|
|
77
|
+
# ─── Config ───────────────────────────────────────────────────────────────────

# Credentials verified by check_tokens(). Supported "type" values:
#   file_text       — token is the stripped file content; if "test_url" is given
#                     it is formatted with {token} and fetched via http_get()
#   json_field      — JSON file that must contain a "refresh_token" key
#   service_account — JSON file that must contain "private_key" and "client_email"
#   hardcoded       — inline "token" sent in the "header" header to "test_url"
TOKEN_CHECKS = [
    {
        "name": "Meta Ads",
        "path": "~/.claude/meta_token.txt",
        "type": "file_text",
        "test_url": "https://graph.facebook.com/v21.0/me?access_token={token}",
    },
    {
        "name": "Instagram",
        "path": "~/.claude/instagram_token.txt",
        "type": "file_text",
        "test_url": "https://graph.instagram.com/v21.0/me?access_token={token}",
    },
    {
        "name": "YouTube",
        "path": "~/.claude/youtube_token.json",
        "type": "json_field",
    },
    {
        "name": "X/Twitter",
        "path": "~/.claude/x_credentials.json",
        "type": "json_field",
    },
    {
        "name": "GA4 Service Account",
        "path": "~/.claude/ga4-service-account.json",
        "type": "service_account",
    },
    # Example: Shopify Admin token check
    # {
    #     "name": "Shopify Admin",
    #     "type": "hardcoded",
    #     "test_url": "https://YOUR_STORE.myshopify.com/admin/api/2024-01/shop.json",
    #     "token": "YOUR_SHOPIFY_ADMIN_TOKEN",
    #     "header": "X-Shopify-Access-Token",
    # },
]

# launchd labels that check_launch_agents() expects in `launchctl list`;
# a missing one is reloaded from LAUNCH_AGENTS_DIR/<label>.plist.
EXPECTED_AGENTS = [
    "com.nexo.immune",
    "com.nexo.sleep",
    "com.nexo.synthesis",
]

# SSH check interval — only every 2 hours, not every 30 min
SSH_CHECK_INTERVAL_HOURS = 2

# Log size thresholds (bytes)
LOG_WARN_SIZE = 10 * 1024 * 1024  # 10 MB
LOG_FAIL_SIZE = 50 * 1024 * 1024  # 50 MB
# NOTE(review): not referenced in the visible code — check_logs() truncates
# at LOG_FAIL_SIZE instead.
LOG_TRUNCATE_SIZE = 50 * 1024 * 1024  # 50 MB — auto-truncate threshold

# Disk thresholds (percentage used)
DISK_WARN_PCT = 85
DISK_FAIL_PCT = 95

# Quiet hours — no WhatsApp alerts
QUIET_START = 23  # 23:00
QUIET_END = 7  # 07:00

# Skip execution hours (deep night)
SKIP_START = 0  # 00:00
SKIP_END = 6  # 06:00

# Max entries in immune-log.json
MAX_LOG_ENTRIES = 500

# HTTP timeout for token checks
HTTP_TIMEOUT = 10

# SSH timeout
# Not referenced by any code visible in this file.
SSH_TIMEOUT = 15
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
154
|
+
|
|
155
|
+
def load_json(path, default=None):
    """Read and parse a JSON file, falling back instead of raising.

    Args:
        path: ``pathlib.Path`` of the file to read.
        default: value returned when the file is missing or cannot be
            read/parsed. ``None`` (the default) means "return an empty dict".

    Returns:
        The decoded JSON value, or the fallback described above.
    """
    fallback = {} if default is None else default
    if path.exists():
        try:
            return json.loads(path.read_text())
        except Exception:
            # Unreadable or malformed file: behave as if it were absent.
            return fallback
    return fallback
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def save_json(path, data):
    """Serialize ``data`` as pretty-printed JSON and write it to ``path`` atomically.

    The payload is written to a temporary sibling file and then moved over the
    target with ``os.replace``, so an interrupted run never leaves a
    half-written file (which load_json() would silently read back as empty).

    Args:
        path: ``pathlib.Path`` of the destination file.
        data: JSON-serializable value.

    Raises:
        TypeError / ValueError: if ``data`` is not JSON-serializable (the
            existing file is left untouched).
        OSError: if the file cannot be written or replaced.
    """
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    # Same directory as the target so os.replace stays on one filesystem.
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_text(payload)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a stray temp file behind on failure.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def is_quiet_hours():
    """Return True when NOW falls inside the WhatsApp quiet window.

    The window is [QUIET_START, QUIET_END) in whole hours and may wrap past
    midnight (the configured 23:00 - 07:00 does).
    """
    hour = NOW.hour
    wraps_midnight = QUIET_START > QUIET_END
    if not wraps_midnight:
        return QUIET_START <= hour < QUIET_END
    # Window spans midnight: late evening OR early morning.
    return hour >= QUIET_START or hour < QUIET_END
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def is_skip_hours():
    """Return True when NOW's hour lies in [SKIP_START, SKIP_END) (00:00 - 06:00).

    Unlike is_quiet_hours(), this range is not treated as wrapping midnight.
    """
    current_hour = NOW.hour
    return current_hour >= SKIP_START and current_hour < SKIP_END
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def send_wa_alert(title, message):
    """Send a WhatsApp alert through the WA_NOTIFY script unless in quiet hours.

    Args:
        title: short alert title (first argument to the script).
        message: alert body (second argument to the script).

    Returns:
        bool: True only if the script ran and exited with status 0; False if
        the alert was suppressed, the script could not be run, timed out, or
        exited non-zero.
    """
    if is_quiet_hours():
        print(f" [QUIET] Suppressed WA alert: {title}")
        return False

    try:
        proc = subprocess.run(
            [str(WA_NOTIFY), title, message],
            timeout=15,
            capture_output=True,
        )
    except Exception as e:
        print(f" [WA] Failed to send: {e}")
        return False

    # A non-zero exit means the notifier itself failed; reporting success here
    # would start the alert cooldown even though nothing was delivered.
    if proc.returncode != 0:
        err = (proc.stderr or b"").decode("utf-8", errors="replace").strip()
        print(f" [WA] Failed to send: exit code {proc.returncode} {err[:200]}")
        return False

    print(f" [WA] Sent alert: {title}")
    return True
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def http_get(url, headers=None, timeout=HTTP_TIMEOUT):
    """Perform a simple HTTP GET using the shared SSL context.

    Args:
        url: absolute URL to fetch.
        headers: optional mapping of request headers.
        timeout: socket timeout in seconds.

    Returns:
        tuple: ``(status_code, body)`` for any HTTP response, including error
        responses (4xx/5xx), or ``(0, error_string)`` when no HTTP response
        was obtained (DNS, TLS, timeout, malformed URL, ...).
    """
    try:
        req = Request(url)
        if headers:
            for k, v in headers.items():
                req.add_header(k, v)
        with urlopen(req, timeout=timeout, context=SSL_CTX) as resp:
            body = resp.read().decode("utf-8", errors="replace")
            return resp.status, body
    except HTTPError as e:
        # Return the server's error payload rather than just "HTTP Error 400":
        # callers inspect the body (e.g. for "expired") to classify failures.
        try:
            body = e.read().decode("utf-8", errors="replace")
        except Exception:
            body = ""
        return e.code, body or str(e)
    except URLError as e:
        return 0, str(e.reason)
    except Exception as e:
        return 0, str(e)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def run_cmd(cmd, timeout=30):
    """Execute a command directly (no shell) and capture its output.

    Args:
        cmd: either a command string (tokenized with ``shlex.split``) or an
            iterable of argv items.
        timeout: seconds to wait before giving up.

    Returns:
        tuple: ``(returncode, stdout, stderr)`` with both streams stripped.
        ``(-1, "", "timeout")`` on timeout and ``(-1, "", <error text>)`` when
        the command could not be started.
    """
    try:
        if isinstance(cmd, str):
            args = shlex.split(cmd)
        else:
            args = list(cmd)
        proc = subprocess.run(
            args, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as exc:
        return -1, "", str(exc)
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def pid_alive(pid):
    """Check whether a process with the given PID currently exists.

    Uses signal 0, which performs the existence/permission check without
    delivering a signal.

    Args:
        pid: integer process ID.

    Returns:
        bool: True if the process exists (even if owned by another user),
        False if it does not exist or ``pid`` is not a valid single-process ID.
    """
    # pid <= 0 addresses process groups / "all processes", never one process.
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # EPERM: the process exists but we may not signal it — it is alive,
        # so its lock must not be treated as stale.
        return True
    except (OSError, OverflowError):
        return False
    return True
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ─── Check Functions ──────────────────────────────────────────────────────────
|
|
241
|
+
|
|
242
|
+
def check_tokens():
    """Check every credential listed in TOKEN_CHECKS.

    Per entry type:
      * ``file_text``: file must exist and be non-empty; if ``test_url`` is
        set, the URL (formatted with the token) must answer HTTP 200.
      * ``json_field``: file must be a JSON object with a ``refresh_token`` key.
      * ``service_account``: file must be a JSON object with ``private_key``
        and ``client_email`` keys.
      * ``hardcoded``: ``test_url`` is fetched with the token in ``header``.

    Returns:
        list[dict]: one ``{"name", "status", "detail"}`` dict per entry, with
        status "OK" or "FAIL". Unexpected errors are reported as FAIL.
    """
    def _read_json_object(path):
        """Return the parsed JSON object at ``path``, or None if unreadable/not an object."""
        # Parsed directly (not via load_json) because load_json maps invalid
        # JSON to {} and would make the "Invalid JSON" case undetectable.
        try:
            data = json.loads(path.read_text())
        except (OSError, ValueError):
            return None
        return data if isinstance(data, dict) else None

    results = []

    for tc in TOKEN_CHECKS:
        name = tc["name"]
        result = {"name": name, "status": "OK", "detail": ""}

        try:
            if tc["type"] == "file_text":
                path = Path(tc["path"]).expanduser()
                if not path.exists():
                    result["status"] = "FAIL"
                    result["detail"] = f"Token file missing: {path}"
                else:
                    token = path.read_text().strip()
                    if not token:
                        result["status"] = "FAIL"
                        result["detail"] = "Token file empty"
                    elif "test_url" in tc:
                        url = tc["test_url"].format(token=token)
                        code, body = http_get(url)
                        if code == 200:
                            result["detail"] = "HTTP 200 OK"
                        # NOTE(review): 190 looks like an API error code rather
                        # than an HTTP status; kept as-is, the body text match
                        # is what normally triggers this branch.
                        elif code == 190 or (isinstance(body, str) and "expired" in body.lower()):
                            result["status"] = "FAIL"
                            result["detail"] = f"Token expired (HTTP {code})"
                        else:
                            result["status"] = "FAIL"
                            result["detail"] = f"HTTP {code}: {body[:200]}"

            elif tc["type"] == "json_field":
                path = Path(tc["path"]).expanduser()
                if not path.exists():
                    result["status"] = "FAIL"
                    result["detail"] = f"Token file missing: {path}"
                else:
                    data = _read_json_object(path)
                    if data is None:
                        result["status"] = "FAIL"
                        result["detail"] = "Invalid JSON"
                    elif "refresh_token" not in data:
                        result["status"] = "FAIL"
                        result["detail"] = "No refresh_token in JSON"
                    else:
                        result["detail"] = "refresh_token present"

            elif tc["type"] == "service_account":
                path = Path(tc["path"]).expanduser()
                if not path.exists():
                    result["status"] = "FAIL"
                    result["detail"] = f"Service account file missing: {path}"
                else:
                    data = _read_json_object(path)
                    if data is None:
                        result["status"] = "FAIL"
                        result["detail"] = "Invalid JSON"
                    elif "private_key" not in data or "client_email" not in data:
                        result["status"] = "FAIL"
                        result["detail"] = "Missing private_key or client_email"
                    else:
                        result["detail"] = f"SA: {data.get('client_email', '?')[:40]}"

            elif tc["type"] == "hardcoded":
                url = tc["test_url"]
                headers = {tc["header"]: tc["token"]}
                code, body = http_get(url, headers=headers)
                if code == 200:
                    result["detail"] = "HTTP 200 OK"
                elif code == 401:
                    result["status"] = "FAIL"
                    result["detail"] = "Token unauthorized (401)"
                else:
                    result["status"] = "FAIL"
                    result["detail"] = f"HTTP {code}: {body[:200]}"

        except Exception as e:
            # One broken entry must not abort the remaining checks.
            result["status"] = "FAIL"
            result["detail"] = f"Exception: {str(e)[:200]}"

        results.append(result)

    return results
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def check_launch_agents():
    """Check that every label in EXPECTED_AGENTS is loaded; try to load missing ones.

    Loaded labels are taken from the third tab-separated column of
    ``launchctl list``. For a missing label, ``launchctl load`` is attempted on
    LAUNCH_AGENTS_DIR/<label>.plist.

    Returns:
        list[dict]: one ``{"name", "status", "detail", "repaired"}`` dict per
        agent. Status is "OK" (loaded), "WARN" (was unloaded, reloaded) or
        "FAIL" (reload failed or plist missing).
    """
    results = []

    # Get list of loaded agents
    rc, stdout, _ = run_cmd(["launchctl", "list"])
    loaded_labels = set()
    if rc == 0:
        for line in stdout.splitlines():
            parts = line.split("\t")
            if len(parts) >= 3:
                loaded_labels.add(parts[2])

    for agent in EXPECTED_AGENTS:
        result = {"name": agent, "status": "OK", "detail": "", "repaired": False}

        if agent in loaded_labels:
            result["detail"] = "Loaded"
        else:
            # Try auto-repair
            plist = LAUNCH_AGENTS_DIR / f"{agent}.plist"
            if plist.exists():
                # argv list instead of a quoted string: a path containing a
                # quote character can no longer break tokenization.
                rc, out, err = run_cmd(["launchctl", "load", str(plist)])
                if rc == 0:
                    result["status"] = "WARN"
                    result["detail"] = "Was unloaded, auto-loaded successfully"
                    result["repaired"] = True
                else:
                    result["status"] = "FAIL"
                    result["detail"] = f"Unloaded, auto-load failed: {err[:100]}"
            else:
                result["status"] = "FAIL"
                result["detail"] = f"Unloaded, plist not found: {plist}"

        results.append(result)

    return results
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def check_databases():
    """Run ``PRAGMA integrity_check`` on the known SQLite databases.

    Returns:
        list[dict]: one ``{"name", "status", "detail"}`` dict per database.
        Status is "OK" when the first result row is "ok", otherwise "FAIL"
        (missing file, failed check, or any error while opening/querying).
    """
    results = []

    dbs = [
        ("nexo.db", Path.home() / "claude" / "nexo-mcp" / "nexo.db"),
        ("cognitive.db", Path.home() / "claude" / "nexo-mcp" / "cognitive.db"),
        ("claude-mem.db", CLAUDE_MEM_DB),
    ]

    for name, path in dbs:
        result = {"name": name, "status": "OK", "detail": ""}

        if not path.exists():
            result["status"] = "FAIL"
            result["detail"] = f"File missing: {path}"
        else:
            conn = None
            try:
                conn = sqlite3.connect(str(path), timeout=5)
                row = conn.execute("PRAGMA integrity_check").fetchone()
                # Only the first row is inspected; "ok" there means a clean check.
                check_result = str(row[0]) if row else "no result returned"
                if check_result == "ok":
                    size_mb = path.stat().st_size / (1024 * 1024)
                    result["detail"] = f"Integrity OK ({size_mb:.1f} MB)"
                else:
                    result["status"] = "FAIL"
                    result["detail"] = f"Integrity failed: {check_result[:200]}"
            except Exception as e:
                result["status"] = "FAIL"
                result["detail"] = f"Error: {str(e)[:200]}"
            finally:
                # Always release the handle, including when the PRAGMA raises.
                if conn is not None:
                    conn.close()

        results.append(result)

    return results
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def check_scripts():
    """Inspect ``*.lock`` files in COORD_DIR and remove stale PID locks.

    A lock whose content is a PID of a process that no longer exists is
    deleted. This script's own LOCK_FILE is skipped.

    Returns:
        list[dict]: one ``{"name", "status", "detail", "repaired"}`` dict per
        lock file. Status is "WARN" when a stale lock was removed and "OK"
        otherwise (including when inspecting the file raised an error, which
        is only recorded in ``detail``).
    """
    results = []

    # Stale lock files (PID dead)
    for lf in list(COORD_DIR.glob("*.lock")):
        if lf == LOCK_FILE:
            continue  # Skip our own lock
        result = {"name": f"lock:{lf.name}", "status": "OK", "detail": "", "repaired": False}
        try:
            content = lf.read_text().strip()
            if content and content.isdigit():
                pid = int(content)
                if pid_alive(pid):
                    result["detail"] = f"PID {pid} alive"
                else:
                    # Auto-repair: remove stale lock
                    lf.unlink()
                    result["status"] = "WARN"
                    result["detail"] = f"PID {pid} dead — lock removed"
                    result["repaired"] = True
            elif content:
                # Non-empty, non-numeric content: report it, never delete it.
                result["detail"] = f"Non-PID content: {content[:50]}"
            else:
                # Empty file — the normal shape of an flock()-style lock.
                result["detail"] = "Empty lock file"
        except Exception as e:
            result["detail"] = f"Error checking: {e}"
        results.append(result)

    return results
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def check_logs():
    """Report the size of known log files and shrink the oversized ones.

    Files at or above LOG_FAIL_SIZE are marked FAIL and truncated (JSON logs
    to their last 200 entries, text logs to their last 1000 lines); files at
    or above LOG_WARN_SIZE are marked WARN. Missing files are ignored.

    Returns:
        list[dict]: one ``{"name", "status", "detail", "repaired"}`` dict per
        existing log file.
    """
    mib = 1024 * 1024

    # JSON logs first, then plain-text logs — results keep this order.
    json_names = (
        "heartbeat-log.json",
        "reflection-log.json",
        "immune-log.json",
        "ops-board.json",
        "messages.json",
    )
    text_names = (
        "heartbeat-stdout.log",
        "heartbeat-stderr.log",
        "reflection-stdout.log",
        "reflection-stderr.log",
        "immune-stdout.log",
        "immune-stderr.log",
    )
    candidates = [COORD_DIR / fname for fname in json_names + text_names]

    results = []
    for log_path in candidates:
        if not log_path.exists():
            continue

        entry = {"name": log_path.name, "status": "OK", "detail": "", "repaired": False}
        size = log_path.stat().st_size
        size_mb = size / mib

        if size < LOG_WARN_SIZE:
            entry["detail"] = f"{size_mb:.2f} MB"
        elif size < LOG_FAIL_SIZE:
            entry["status"] = "WARN"
            entry["detail"] = f"{size_mb:.1f} MB — approaching limit"
        else:
            entry["status"] = "FAIL"
            entry["detail"] = f"{size_mb:.1f} MB — exceeds {LOG_FAIL_SIZE // (1024*1024)} MB"

            # Auto-truncate; the entry stays FAIL for this run either way.
            try:
                if log_path.suffix == ".json":
                    _truncate_json_log(log_path, keep_entries=200)
                else:
                    _truncate_text_log(log_path, keep_lines=1000)
                new_size = log_path.stat().st_size / mib
                entry["detail"] += f" -> truncated to {new_size:.1f} MB"
                entry["repaired"] = True
            except Exception as e:
                entry["detail"] += f" -> truncate failed: {e}"

        results.append(entry)

    return results
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def _truncate_json_log(path, keep_entries=200):
    """Trim a JSON log file down to its most recent ``keep_entries`` entries.

    * Top-level list: rewritten only when it is longer than ``keep_entries``.
    * Top-level dict: every list value longer than ``keep_entries`` is
      trimmed, and the file is always rewritten.
    * Anything else is left untouched.
    """
    payload = load_json(path, default=[])

    if isinstance(payload, dict):
        # Some logs are dicts wrapping one or more list values.
        for field, value in payload.items():
            if isinstance(value, list) and len(value) > keep_entries:
                payload[field] = value[-keep_entries:]
        save_json(path, payload)
        return

    if isinstance(payload, list) and len(payload) > keep_entries:
        save_json(path, payload[-keep_entries:])
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
def _truncate_text_log(path, keep_lines=1000):
    """Truncate a text log in place to its last ``keep_lines`` lines.

    The file is processed as bytes, so logs containing bytes that are not
    valid text in the default encoding are still truncated, and only the
    retained tail is held in memory.

    Args:
        path: path of the log file.
        keep_lines: number of trailing lines to keep; values <= 0 keep nothing.

    The file is rewritten only when it has more than ``keep_lines`` lines;
    the rewritten file always ends with a newline.
    """
    from collections import deque

    total = 0
    tail = deque(maxlen=max(keep_lines, 0))
    with open(path, "rb") as fh:
        for raw_line in fh:
            total += 1
            tail.append(raw_line)

    if total <= keep_lines:
        return

    data = b"".join(tail)
    if data and not data.endswith(b"\n"):
        data += b"\n"
    with open(path, "wb") as fh:
        fh.write(data)
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def check_disk():
    """Measure usage of the root filesystem with ``os.statvfs("/")``.

    Returns:
        list[dict]: a single ``{"name": "disk:/", "status", "detail"}`` dict.
        Status is "FAIL" at or above DISK_FAIL_PCT percent used (or on error),
        "WARN" at or above DISK_WARN_PCT, otherwise "OK".
    """
    report = {"name": "disk:/", "status": "OK", "detail": ""}

    try:
        stats = os.statvfs("/")
        total_bytes = stats.f_frsize * stats.f_blocks
        free_bytes = stats.f_frsize * stats.f_bavail
        used_bytes = total_bytes - free_bytes
        pct = (used_bytes / total_bytes) * 100 if total_bytes > 0 else 0

        gib = 1024 ** 3
        avail_gb = free_bytes / gib
        total_gb = total_bytes / gib

        # The detail text is the same at every severity; only status differs.
        if pct >= DISK_FAIL_PCT:
            report["status"] = "FAIL"
        elif pct >= DISK_WARN_PCT:
            report["status"] = "WARN"
        report["detail"] = f"{pct:.1f}% used ({avail_gb:.1f} GB free of {total_gb:.0f} GB)"
    except Exception as e:
        report["status"] = "FAIL"
        report["detail"] = f"Error: {e}"

    return [report]
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def check_server_crons():
    """Placeholder for the remote server cron check (at most every 2 hours).

    No SSH command is executed in this version: when the interval has elapsed
    the check reports WARN "not configured". Configure SSH_HOST, SSH_PORT,
    SSH_USER and the cron check command below for your server.
    Example: check that a MySQL/cron log table has entries for today.

    Returns:
        tuple: ``(results, should_run)`` where ``results`` is a one-element
        list of ``{"name", "status", "detail"}`` and ``should_run`` is False
        when the previous check (``last_ssh_check`` in IMMUNE_STATUS) is more
        recent than SSH_CHECK_INTERVAL_HOURS.
    """
    results = []
    result = {"name": "server-crons", "status": "OK", "detail": ""}

    # ── Configure for your server ──────────────────────────────────────────────
    # SSH_HOST = "your-server.example.com"
    # SSH_PORT = 22
    # SSH_USER = "root"
    # CRON_CHECK_CMD = '"echo cron-check-not-configured"'
    # ───────────────────────────────────────────────────────────────────────────

    # Check if we should run (every 2 hours based on last check)
    status = load_json(IMMUNE_STATUS)
    last_ssh_str = status.get("last_ssh_check", "")
    should_run = True

    if last_ssh_str:
        try:
            last_ssh = datetime.strptime(last_ssh_str, "%Y-%m-%d %H:%M")
        except (TypeError, ValueError):
            # Unparseable timestamp: treat as "never checked" and run.
            pass
        else:
            hours_ago = (NOW - last_ssh).total_seconds() / 3600
            if hours_ago < SSH_CHECK_INTERVAL_HOURS:
                result["detail"] = f"Skipped (last check {hours_ago:.1f}h ago, interval {SSH_CHECK_INTERVAL_HOURS}h)"
                should_run = False

    if should_run:
        result["status"] = "WARN"
        result["detail"] = "Server cron check not configured — see check_server_crons() to set up SSH+command"

    results.append(result)

    return results, should_run
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
# ─── Alerting ─────────────────────────────────────────────────────────────────
|
|
592
|
+
|
|
593
|
+
def get_system_uptime_minutes():
    """Return system uptime in minutes, derived from ``sysctl -n kern.boottime``.

    Returns:
        float | int: minutes since boot, or 9999 ("assume long uptime") when
        the boot time cannot be determined.
    """
    try:
        r = subprocess.run(
            ["sysctl", "-n", "kern.boottime"],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError):
        return 9999  # Assume long uptime if we can't determine

    if r.returncode == 0:
        # Format: { sec = 1709000000, usec = 0 } ...
        # \b keeps the pattern from ever matching inside "usec".
        m = re.search(r'\bsec\s*=\s*(\d+)', r.stdout)
        if m:
            boot_ts = int(m.group(1))
            return (time.time() - boot_ts) / 60
    return 9999  # Assume long uptime if we can't determine
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def detect_new_failures(current_results, previous_status):
    """Compare current results with the previous run to find NEW problems.

    Rules:
      * Boot grace: within 10 minutes of system boot nothing is reported.
      * Regular checks are reported on a transition: anything -> FAIL when
        the previous status was not FAIL, or OK -> WARN.
      * Server/SSH checks are debounced: they are reported once, on the run
        where the number of consecutive FAIL/WARN results reaches 2. Runs in
        which the server check was skipped (detail starts with "Skipped")
        neither extend nor reset that streak.

    Args:
        current_results: mapping ``category -> list of result dicts``.
        previous_status: the status dict saved by the previous run (its
            ``"checks"`` key has the same shape as ``current_results``).

    Returns:
        list[dict]: the result dicts that should trigger an alert.

    Side effects:
        Updates COORD_DIR/immune-consecutive-failures.json.
    """
    # Boot grace period — suppress alerts when network may still be settling
    uptime = get_system_uptime_minutes()
    if uptime < 10:
        print(f" [GRACE] System uptime {uptime:.0f}min < 10min — suppressing alerts")
        return []

    prev_checks = {}
    for category, items in previous_status.get("checks", {}).items():
        for item in items:
            key = f"{category}:{item.get('name', '')}"
            prev_checks[key] = item.get("status", "OK")

    # Load consecutive failure counts for debounce
    consec_file = COORD_DIR / "immune-consecutive-failures.json"
    consec = load_json(consec_file, default={})
    if not isinstance(consec, dict):
        consec = {}

    new_failures = []
    for category, items in current_results.items():
        for item in items:
            key = f"{category}:{item.get('name', '')}"
            current_status = item.get("status", "OK")
            prev_stat = prev_checks.get(key, "OK")
            has_problem = current_status in ("FAIL", "WARN")

            # Debounce: server/SSH checks need 2+ consecutive failures
            is_server_check = category == "server" or "ssh" in key.lower()
            min_consecutive = 2 if is_server_check else 1

            # A skipped server check carries no information. Counting it as
            # "OK" would reset the streak between real runs, so the debounce
            # threshold could never be reached.
            if is_server_check and str(item.get("detail", "")).startswith("Skipped"):
                continue

            consec[key] = consec.get(key, 0) + 1 if has_problem else 0
            streak = consec[key]

            if min_consecutive > 1:
                # Alert exactly once, when the streak reaches the threshold.
                # (The plain transition test cannot be used here: by the
                # second failure the previous status is already FAIL.)
                if has_problem and streak == min_consecutive:
                    new_failures.append(item)
            elif current_status == "FAIL" and prev_stat != "FAIL":
                new_failures.append(item)
            elif current_status == "WARN" and prev_stat == "OK":
                new_failures.append(item)

    save_json(consec_file, consec)
    return new_failures
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def send_failure_alerts(new_failures):
    """Send one WhatsApp alert summarizing new failures (max 1 per 30 min).

    FAIL items take priority: when any are present only they are reported
    (first 5 listed); otherwise the first 3 WARN items are reported. The
    cooldown timestamp file is updated only when an alert was actually sent.

    Args:
        new_failures: list of result dicts with "name", "status", "detail".
    """
    if not new_failures:
        return

    # Global alert cooldown — max 1 WhatsApp alert per 30 minutes
    cooldown_file = COORD_DIR / "immune-last-alert.txt"
    if cooldown_file.exists():
        try:
            stamp = cooldown_file.read_text().strip()
            last_alert = datetime.strptime(stamp, "%Y-%m-%d %H:%M")
            minutes_since = (NOW - last_alert).total_seconds() / 60
            if minutes_since < 30:
                print(f" [COOLDOWN] Last alert {minutes_since:.0f}min ago — suppressing")
                return
        except Exception:
            # Unreadable cooldown file: behave as if no alert was sent before.
            pass

    fails = [nf for nf in new_failures if nf["status"] == "FAIL"]
    warns = [nf for nf in new_failures if nf["status"] == "WARN"]

    def _bullets(entries):
        return "\n".join(f"- {e['name']}: {e['detail']}" for e in entries)

    sent = False
    if fails:
        msg = _bullets(fails[:5])
        if len(fails) > 5:
            msg += f"\n... +{len(fails) - 5} more"
        sent = send_wa_alert(
            "NEXO Immune FAIL",
            f"{len(fails)} new failure(s):\n{msg}"
        )
    elif warns:
        msg = _bullets(warns[:3])
        sent = send_wa_alert(
            "NEXO Immune WARN",
            f"{len(warns)} new warning(s):\n{msg}"
        )

    if sent:
        cooldown_file.write_text(NOW.strftime("%Y-%m-%d %H:%M"))
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
# ─── Main ─────────────────────────────────────────────────────────────────────
|
|
705
|
+
|
|
706
|
+
def main():
    """Entry point: gate on skip hours, take the process lock, run all checks.

    Exits early (without running checks) during skip hours, when another
    instance already holds LOCK_FILE, or when the lock file cannot be
    opened/locked. The lock is always released after the checks finish.
    """
    print(f"\n{'='*60}")
    print(f"NEXO Immune System — {NOW.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # Skip hours gate
    if is_skip_hours():
        print(f"[SKIP] Hour {NOW.hour} is within skip range ({SKIP_START}:00-{SKIP_END}:00). Exiting.")
        return

    # Ensure coordination directory exists
    COORD_DIR.mkdir(parents=True, exist_ok=True)

    # Process lock (fcntl)
    try:
        lock_fd = open(LOCK_FILE, "w")
    except OSError as e:
        # Not a contention problem — report it as what it is.
        print(f"[ERROR] Cannot open lock file {LOCK_FILE}: {e}. Exiting.")
        return

    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        # Non-blocking lock refused: someone else holds it.
        print("[LOCKED] Another immune instance is running. Exiting.")
        lock_fd.close()
        return
    except OSError as e:
        print(f"[ERROR] Cannot lock {LOCK_FILE}: {e}. Exiting.")
        lock_fd.close()
        return

    try:
        _run_checks(lock_fd)
    finally:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
            lock_fd.close()
        except Exception:
            # Best effort: the OS releases the lock when the process exits.
            pass
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
def _run_checks(lock_fd):
    """Run the seven check stages, print a report, alert, and persist state.

    Args:
        lock_fd: the open lock file held by main(); not used by this function.

    Side effects:
        Prints the report, may send a WhatsApp alert, rewrites IMMUNE_STATUS
        and appends a summary entry to IMMUNE_LOG (capped at MAX_LOG_ENTRIES).
    """
    previous_status = load_json(IMMUNE_STATUS)

    all_results = {}
    repairs = []

    def _print_results(entries):
        for entry in entries:
            print(f" [{entry['status']}] {entry['name']}: {entry['detail']}")

    # (result key, progress label, check function, repair message template)
    stages = (
        ("tokens", "tokens", check_tokens, None),
        ("agents", "LaunchAgents", check_launch_agents, "LaunchAgent {} reloaded"),
        ("databases", "databases", check_databases, None),
        ("scripts", "scripts & locks", check_scripts, "Stale lock {} removed"),
        ("logs", "log sizes", check_logs, "Log {} truncated"),
        ("disk", "disk usage", check_disk, None),
    )
    for step, (key, label, check_fn, repair_msg) in enumerate(stages, start=1):
        print(f"\n[{step}/7] Checking {label}...")
        all_results[key] = check_fn()
        for entry in all_results[key]:
            print(f" [{entry['status']}] {entry['name']}: {entry['detail']}")
            if repair_msg is not None and entry.get("repaired"):
                repairs.append(repair_msg.format(entry["name"]))

    # 7. Server crons (also reports whether the periodic check was due)
    print("\n[7/7] Checking server crons...")
    server_results, ssh_ran = check_server_crons()
    all_results["server"] = server_results
    _print_results(all_results["server"])

    # ─── Summary ──────────────────────────────────────────────────────────
    counts = {"OK": 0, "WARN": 0, "FAIL": 0}
    for entries in all_results.values():
        for entry in entries:
            state = entry.get("status", "OK")
            if state in counts:
                counts[state] += 1

    total = sum(counts.values())

    print(f"\n{'─'*60}")
    print(f"SUMMARY: {total} checks — {counts['OK']} OK, {counts['WARN']} WARN, {counts['FAIL']} FAIL")
    if repairs:
        print(f"AUTO-REPAIRS: {len(repairs)}")
        for line in repairs:
            print(f" - {line}")
    print(f"{'─'*60}\n")

    # ─── Detect new failures & alert ──────────────────────────────────────
    new_failures = detect_new_failures(all_results, previous_status)
    if not new_failures:
        print("[OK] No new failures.")
    else:
        print(f"[ALERT] {len(new_failures)} new failure(s)/warning(s) detected:")
        for nf in new_failures:
            print(f" - [{nf['status']}] {nf['name']}: {nf['detail']}")
        send_failure_alerts(new_failures)

    # ─── Save status ──────────────────────────────────────────────────────
    timestamp = NOW.strftime("%Y-%m-%d %H:%M")
    status = {
        "last_run": timestamp,
        "counts": counts,
        "repairs": repairs,
        "new_failures": len(new_failures),
        "checks": all_results,
    }
    # Carry the previous SSH timestamp forward when the check was skipped.
    if ssh_ran:
        status["last_ssh_check"] = NOW.strftime("%Y-%m-%d %H:%M")
    elif "last_ssh_check" in previous_status:
        status["last_ssh_check"] = previous_status["last_ssh_check"]

    save_json(IMMUNE_STATUS, status)

    # ─── Append to log ────────────────────────────────────────────────────
    log_entry = {
        "ts": timestamp,
        "ok": counts["OK"],
        "warn": counts["WARN"],
        "fail": counts["FAIL"],
        "repairs": len(repairs),
        "new_failures": len(new_failures),
    }

    log = load_json(IMMUNE_LOG, default=[])
    if not isinstance(log, list):
        log = []
    log.append(log_entry)
    if len(log) > MAX_LOG_ENTRIES:
        log = log[-MAX_LOG_ENTRIES:]
    save_json(IMMUNE_LOG, log)

    print(f"Status saved to {IMMUNE_STATUS}")
    print(f"Log appended to {IMMUNE_LOG} ({len(log)} entries)")
|
|
866
|
+
|
|
867
|
+
|
|
868
|
+
# Run the health check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|