nexo-brain 0.2.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,869 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ NEXO Immune System — Health monitor & auto-repair.
4
+
5
+ Runs every 30 minutes via LaunchAgent. Checks tokens, LaunchAgents, DBs,
6
+ scripts, logs, disk, and server crons. Auto-repairs what it can, alerts
7
+ User via WhatsApp only on NEW failures.
8
+
9
+ Zero external dependencies. Stdlib + sqlite3 + urllib only.
10
+ """
11
+
12
+ import fcntl
13
+ import json
14
+ import os
15
+ import re
16
+ import shlex
17
+ import shutil
18
+ import signal
19
+ import sqlite3
20
+ import ssl
21
+ import subprocess
22
+ import sys
23
+ import time
24
+ from datetime import datetime, date, timedelta
25
+ from pathlib import Path
26
+ from urllib.request import Request, urlopen
27
+ from urllib.error import URLError, HTTPError
28
+
29
+ # ─── SSL context for macOS (certifi or system certs) ─────────────────────────
30
def _make_ssl_context():
    """Build the TLS context used for every outbound HTTPS request.

    CA bundles are tried in order: the optional ``certifi`` package (only if
    it is already installed), then a list of well-known macOS / Homebrew
    bundle locations, and finally Python's default trust store.

    Certificate and hostname verification are never disabled. The token
    checks in this file send access tokens over this context, so falling
    back to an unverified connection would expose them to interception; a
    failed handshake is reported by ``http_get`` as an ordinary error.

    Returns:
        ssl.SSLContext: a verifying client-side context.
    """
    candidates = []
    try:
        import certifi  # optional third-party bundle; never required
        candidates.append(certifi.where())
    except ImportError:
        pass
    candidates.extend([
        "/etc/ssl/cert.pem",
        "/usr/local/etc/openssl/cert.pem",
        "/usr/local/etc/openssl@3/cert.pem",
        "/opt/homebrew/etc/openssl@3/cert.pem",
    ])

    for ca_path in candidates:
        if not os.path.exists(ca_path):
            continue
        try:
            return ssl.create_default_context(cafile=ca_path)
        except OSError:
            # Unreadable or malformed bundle (ssl.SSLError is an OSError):
            # try the next candidate instead of aborting at import time.
            continue

    # No explicit bundle worked: use the interpreter's default trust store,
    # still with verification enabled.
    return ssl.create_default_context()


SSL_CTX = _make_ssl_context()
56
+
57
+ # ─── Paths ────────────────────────────────────────────────────────────────────
58
# Root directories of the installation, all derived from the user's home.
HOME = Path.home()
CLAUDE_DIR = HOME / "claude"
COORD_DIR = CLAUDE_DIR / "coordination"
BRAIN_DIR = CLAUDE_DIR / "brain"
SCRIPTS_DIR = CLAUDE_DIR / "scripts"

# State files written by this script: the latest full report, the rolling
# summary log, and the flock-based single-instance lock.
IMMUNE_STATUS = COORD_DIR / "immune-status.json"
IMMUNE_LOG = COORD_DIR / "immune-log.json"
LOCK_FILE = COORD_DIR / "immune-process.lock"

# External notifier script, invoked as: <script> <title> <message>.
WA_NOTIFY = SCRIPTS_DIR / "nexo-whatsapp-notify.sh"

# One of the SQLite databases whose integrity is checked.
CLAUDE_MEM_DB = HOME / ".claude-mem" / "claude-mem.db"

# Directory searched for <label>.plist files when reloading an agent.
LAUNCH_AGENTS_DIR = HOME / "Library" / "LaunchAgents"

# Captured once at import time so every timestamp and hour-based gate in a
# single run refers to the same instant (naive local time).
NOW = datetime.now()
TODAY = date.today()
76
+
77
+ # ─── Config ───────────────────────────────────────────────────────────────────
78
+
79
# Credentials to verify on every run. Each entry has a display "name" and a
# "type" that selects the check performed by check_tokens():
#   file_text       - "path" is a text file holding the token; when
#                     "test_url" is present, "{token}" is substituted and
#                     the URL is fetched.
#   json_field      - "path" is a JSON file that must contain "refresh_token".
#   service_account - "path" is a JSON file that must contain "private_key"
#                     and "client_email".
#   hardcoded       - "token" is sent in the "header" header to "test_url".
TOKEN_CHECKS = [
    {
        "name": "Meta Ads",
        "path": "~/.claude/meta_token.txt",
        "type": "file_text",
        "test_url": "https://graph.facebook.com/v21.0/me?access_token={token}",
    },
    {
        "name": "Instagram",
        "path": "~/.claude/instagram_token.txt",
        "type": "file_text",
        "test_url": "https://graph.instagram.com/v21.0/me?access_token={token}",
    },
    {
        "name": "YouTube",
        "path": "~/.claude/youtube_token.json",
        "type": "json_field",
    },
    {
        "name": "X/Twitter",
        "path": "~/.claude/x_credentials.json",
        "type": "json_field",
    },
    {
        "name": "GA4 Service Account",
        "path": "~/.claude/ga4-service-account.json",
        "type": "service_account",
    },
    # Example: Shopify Admin token check
    # {
    #     "name": "Shopify Admin",
    #     "type": "hardcoded",
    #     "test_url": "https://YOUR_STORE.myshopify.com/admin/api/2024-01/shop.json",
    #     "token": "YOUR_SHOPIFY_ADMIN_TOKEN",
    #     "header": "X-Shopify-Access-Token",
    # },
]

# launchd labels that must appear in `launchctl list`; a missing one is
# reloaded from LAUNCH_AGENTS_DIR/<label>.plist when that file exists.
EXPECTED_AGENTS = [
    "com.nexo.immune",
    "com.nexo.sleep",
    "com.nexo.synthesis",
]

# SSH check interval — only every 2 hours, not every 30 min
SSH_CHECK_INTERVAL_HOURS = 2

# Log size thresholds (bytes)
LOG_WARN_SIZE = 10 * 1024 * 1024  # 10 MB
LOG_FAIL_SIZE = 50 * 1024 * 1024  # 50 MB
# NOTE(review): no code in this file reads LOG_TRUNCATE_SIZE; check_logs()
# truncates at LOG_FAIL_SIZE instead — confirm which one is intended.
LOG_TRUNCATE_SIZE = 50 * 1024 * 1024  # 50 MB — auto-truncate threshold

# Disk thresholds (percentage used)
DISK_WARN_PCT = 85
DISK_FAIL_PCT = 95

# Quiet hours — no WhatsApp alerts (window wraps past midnight)
QUIET_START = 23  # 23:00
QUIET_END = 7  # 07:00

# Skip execution hours (deep night): main() exits immediately in this window
SKIP_START = 0  # 00:00
SKIP_END = 6  # 06:00

# Max entries in immune-log.json
MAX_LOG_ENTRIES = 500

# HTTP timeout for token checks (seconds)
HTTP_TIMEOUT = 10

# SSH timeout (seconds)
SSH_TIMEOUT = 15
151
+
152
+
153
+ # ─── Helpers ──────────────────────────────────────────────────────────────────
154
+
155
# Sentinel meaning "caller gave no default"; lets callers pass default=None
# and actually receive None back.
_NO_DEFAULT = object()


def load_json(path, default=_NO_DEFAULT):
    """Read and parse a JSON file, returning a fallback on any failure.

    Args:
        path: ``pathlib.Path`` of the file to read.
        default: value returned when the file is missing, unreadable, or
            not valid JSON. When omitted the fallback is a new empty dict.
            An explicit ``None`` is honoured, so callers can tell "no usable
            data" apart from an empty object.

    Returns:
        The decoded JSON value, or the fallback.
    """
    fallback = {} if default is _NO_DEFAULT else default
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: malformed JSON
        # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
        return fallback
162
+
163
+
164
def save_json(path, data):
    """Serialize *data* as indented JSON and write it to *path* atomically.

    The payload is written to a sibling temporary file and then moved over
    the target with ``os.replace``, so a crash or a full disk in the middle
    of the write cannot leave a truncated state file behind. UTF-8 is used
    explicitly because non-ASCII characters are emitted verbatim
    (``ensure_ascii=False``).

    Args:
        path: ``pathlib.Path`` of the destination file.
        data: any JSON-serializable value.

    Raises:
        TypeError / ValueError: if *data* cannot be serialized.
        OSError: if the file cannot be written.
    """
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up after a failed write.
        if tmp_path.exists():
            tmp_path.unlink()
166
+
167
+
168
def is_quiet_hours():
    """Return True when the run's hour lies in the WhatsApp quiet window.

    The window is [QUIET_START, QUIET_END) and may wrap past midnight
    (as it does for 23 -> 7).
    """
    hour = NOW.hour
    wraps_midnight = QUIET_START > QUIET_END
    if not wraps_midnight:
        return QUIET_START <= hour < QUIET_END
    # Wrapping window: quiet everywhere except the daytime gap between
    # QUIET_END and QUIET_START.
    return not (QUIET_END <= hour < QUIET_START)
174
+
175
+
176
def is_skip_hours():
    """Return True when the run's hour lies in the skip-execution window.

    The window is [SKIP_START, SKIP_END). Like ``is_quiet_hours`` it also
    supports a window that wraps past midnight (SKIP_START > SKIP_END);
    previously such a configuration silently never matched.
    """
    hour = NOW.hour
    if SKIP_START <= SKIP_END:
        return SKIP_START <= hour < SKIP_END
    return hour >= SKIP_START or hour < SKIP_END
179
+
180
+
181
def send_wa_alert(title, message):
    """Send a WhatsApp alert through the notifier script.

    Nothing is sent during quiet hours. The notifier's exit status is
    checked: a non-zero exit counts as a failed send, so callers that start
    a cooldown on success do not suppress later alerts after a send that
    never went out.

    Args:
        title: short alert title (first argument to the notifier).
        message: alert body (second argument to the notifier).

    Returns:
        bool: True only if the notifier ran and exited with status 0.
    """
    if is_quiet_hours():
        print(f" [QUIET] Suppressed WA alert: {title}")
        return False

    try:
        proc = subprocess.run(
            [str(WA_NOTIFY), title, message],
            timeout=15,
            capture_output=True,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # Missing or non-executable script, or the 15 s timeout expired.
        print(f" [WA] Failed to send: {e}")
        return False

    if proc.returncode != 0:
        stderr = proc.stderr.decode("utf-8", errors="replace").strip()
        print(f" [WA] Failed to send: exit {proc.returncode} {stderr[:200]}")
        return False

    print(f" [WA] Sent alert: {title}")
    return True
197
+
198
+
199
def http_get(url, headers=None, timeout=HTTP_TIMEOUT):
    """Perform a simple HTTP GET without raising.

    Args:
        url: full URL to fetch.
        headers: optional mapping of request headers.
        timeout: socket timeout in seconds.

    Returns:
        tuple: ``(status_code, body)`` for any HTTP response, including
        error statuses, or ``(0, error_string)`` when no HTTP response was
        obtained (DNS, TLS, timeout, ...). For error statuses the body is
        the server's response body, so callers can inspect the API error
        payload; if that body is empty or unreadable the exception text is
        returned instead.
    """
    try:
        req = Request(url)
        for k, v in (headers or {}).items():
            req.add_header(k, v)
        with urlopen(req, timeout=timeout, context=SSL_CTX) as resp:
            body = resp.read().decode("utf-8", errors="replace")
            return resp.status, body
    except HTTPError as e:
        # HTTPError doubles as the response object; its body carries the
        # structured error details that str(e) would throw away.
        try:
            err_body = e.read().decode("utf-8", errors="replace")
            e.close()
        except Exception:
            err_body = ""
        return e.code, err_body or str(e)
    except URLError as e:
        return 0, str(e.reason)
    except Exception as e:
        return 0, str(e)
215
+
216
+
217
def run_cmd(cmd, timeout=30):
    """Execute a command directly (no shell) and capture its output.

    Args:
        cmd: either a command string, split with ``shlex.split``, or an
            iterable of argv items.
        timeout: seconds to wait before giving up.

    Returns:
        tuple: ``(returncode, stdout, stderr)`` with both streams stripped.
        A timeout yields ``(-1, "", "timeout")``; any other failure yields
        ``(-1, "", <error text>)``.
    """
    try:
        if isinstance(cmd, str):
            args = shlex.split(cmd)
        else:
            args = list(cmd)
        completed = subprocess.run(
            args, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as exc:
        return -1, "", str(exc)
    return completed.returncode, completed.stdout.strip(), completed.stderr.strip()
229
+
230
+
231
def pid_alive(pid):
    """Return True if a process with the given PID currently exists.

    Uses signal 0, which performs the existence and permission checks
    without delivering a signal.

    Args:
        pid: process id to probe.

    Returns:
        bool: True if the process exists, including when it belongs to
        another user; False otherwise.
    """
    if pid <= 0:
        # 0 and negative values address process groups in os.kill, not a
        # single process, so they can never name a lock holder.
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but we may not signal it. Treating this as
        # dead would let callers delete a live process's lock file.
        return True
    except (OSError, OverflowError):
        # OverflowError: value too large to be a PID at all.
        return False
    return True
238
+
239
+
240
+ # ─── Check Functions ──────────────────────────────────────────────────────────
241
+
242
def _read_json_or_none(path):
    """Return the parsed JSON content of *path*, or None if unreadable or invalid."""
    try:
        return json.loads(path.read_text())
    except (OSError, ValueError):
        return None


def _api_error_code(body):
    """Return ``error.code`` from a JSON error body, or None if not present."""
    try:
        payload = json.loads(body)
    except (TypeError, ValueError):
        return None
    error = payload.get("error") if isinstance(payload, dict) else None
    return error.get("code") if isinstance(error, dict) else None


def _redact(text, secret):
    """Mask *secret* inside *text* so tokens never reach logs or alerts."""
    return text.replace(secret, "***") if secret else text


def _check_file_text_token(tc):
    """Check a token stored as plain text, optionally probing ``test_url``."""
    path = Path(tc["path"]).expanduser()
    if not path.exists():
        return "FAIL", f"Token file missing: {path}"
    token = path.read_text().strip()
    if not token:
        return "FAIL", "Token file empty"
    if "test_url" not in tc:
        return "OK", "Token present (not verified)"

    code, body = http_get(tc["test_url"].format(token=token))
    body = _redact(str(body), token)
    if code == 200:
        return "OK", "HTTP 200 OK"
    # The earlier code compared the HTTP status itself with 190, which can
    # never match; the value is looked up in the JSON error body instead.
    # NOTE(review): 190 is presumed to be the API's "token expired/invalid"
    # error code — confirm against the provider's error reference.
    if _api_error_code(body) == 190 or "expired" in body.lower():
        return "FAIL", f"Token expired (HTTP {code})"
    return "FAIL", f"HTTP {code}: {body[:200]}"


def _check_json_field_token(tc):
    """Check that a JSON credentials file contains a ``refresh_token``."""
    path = Path(tc["path"]).expanduser()
    if not path.exists():
        return "FAIL", f"Token file missing: {path}"
    data = _read_json_or_none(path)
    if data is None:
        return "FAIL", "Invalid JSON"
    if not isinstance(data, dict) or "refresh_token" not in data:
        return "FAIL", "No refresh_token in JSON"
    return "OK", "refresh_token present"


def _check_service_account(tc):
    """Check that a service-account file has a private key and client email."""
    path = Path(tc["path"]).expanduser()
    if not path.exists():
        return "FAIL", f"Service account file missing: {path}"
    data = _read_json_or_none(path)
    if data is None:
        return "FAIL", "Invalid JSON"
    if (
        not isinstance(data, dict)
        or "private_key" not in data
        or "client_email" not in data
    ):
        return "FAIL", "Missing private_key or client_email"
    return "OK", f"SA: {str(data.get('client_email', '?'))[:40]}"


def _check_hardcoded_token(tc):
    """Probe ``test_url`` with an inline token sent in a request header."""
    code, body = http_get(tc["test_url"], headers={tc["header"]: tc["token"]})
    if code == 200:
        return "OK", "HTTP 200 OK"
    if code == 401:
        return "FAIL", "Token unauthorized (401)"
    return "FAIL", f"HTTP {code}: {_redact(str(body), tc['token'])[:200]}"


# Maps a TOKEN_CHECKS "type" to the helper that verifies it.
_TOKEN_CHECKERS = {
    "file_text": _check_file_text_token,
    "json_field": _check_json_field_token,
    "service_account": _check_service_account,
    "hardcoded": _check_hardcoded_token,
}


def check_tokens():
    """Check every credential listed in TOKEN_CHECKS.

    Returns:
        list[dict]: one ``{"name", "status", "detail"}`` dict per entry.
        ``status`` is "OK" or "FAIL". An unrecognised "type" is reported as
        FAIL rather than silently passing, and any exception raised while
        checking an entry becomes a FAIL for that entry only.
    """
    results = []

    for tc in TOKEN_CHECKS:
        result = {"name": tc["name"], "status": "OK", "detail": ""}
        try:
            checker = _TOKEN_CHECKERS.get(tc["type"])
            if checker is None:
                status, detail = "FAIL", f"Unknown check type: {tc['type']!r}"
            else:
                status, detail = checker(tc)
        except Exception as e:
            status, detail = "FAIL", f"Exception: {str(e)[:200]}"

        result["status"] = status
        result["detail"] = detail
        results.append(result)

    return results
325
+
326
+
327
def _loaded_launch_agent_labels():
    """Return the labels printed by ``launchctl list``, or None if it failed."""
    rc, stdout, _ = run_cmd(["launchctl", "list"])
    if rc != 0:
        return None
    labels = set()
    for line in stdout.splitlines():
        parts = line.split("\t")
        # The label is read from the third tab-separated column.
        if len(parts) >= 3:
            labels.add(parts[2])
    return labels


def check_launch_agents():
    """Check that every expected LaunchAgent is loaded, reloading if needed.

    For each label in EXPECTED_AGENTS that is missing from ``launchctl
    list``, the matching plist in LAUNCH_AGENTS_DIR is loaded and the list
    is queried again to confirm the agent appeared.

    Returns:
        list[dict]: one ``{"name", "status", "detail", "repaired"}`` dict
        per agent. A successful reload is reported as WARN with
        ``repaired=True``.
    """
    results = []
    loaded_labels = _loaded_launch_agent_labels()

    for agent in EXPECTED_AGENTS:
        result = {"name": agent, "status": "OK", "detail": "", "repaired": False}

        if loaded_labels is None:
            # Without a listing we cannot tell loaded from unloaded; do not
            # attempt (and then report) bogus repairs.
            result["status"] = "FAIL"
            result["detail"] = "Could not query launchctl list"
        elif agent in loaded_labels:
            result["detail"] = "Loaded"
        else:
            plist = LAUNCH_AGENTS_DIR / f"{agent}.plist"
            if not plist.exists():
                result["status"] = "FAIL"
                result["detail"] = f"Unloaded, plist not found: {plist}"
            else:
                # argv form: no quoting issues with unusual paths.
                rc, _, err = run_cmd(["launchctl", "load", str(plist)])
                # The exit status alone is not treated as proof; the agent
                # must show up in a fresh listing. If the listing itself
                # fails, fall back to trusting the exit status.
                now_loaded = _loaded_launch_agent_labels()
                confirmed = now_loaded is None or agent in now_loaded
                if rc == 0 and confirmed:
                    result["status"] = "WARN"
                    result["detail"] = "Was unloaded, auto-loaded successfully"
                    result["repaired"] = True
                else:
                    result["status"] = "FAIL"
                    result["detail"] = f"Unloaded, auto-load failed: {err[:100]}"

        results.append(result)

    return results
364
+
365
+
366
def check_databases():
    """Run ``PRAGMA integrity_check`` on the known SQLite databases.

    Returns:
        list[dict]: one ``{"name", "status", "detail"}`` dict per database.
        A missing file, a failed integrity check, or any error while
        opening or querying is reported as FAIL.
    """
    results = []

    mcp_dir = CLAUDE_DIR / "nexo-mcp"
    dbs = [
        ("nexo.db", mcp_dir / "nexo.db"),
        ("cognitive.db", mcp_dir / "cognitive.db"),
        ("claude-mem.db", CLAUDE_MEM_DB),
    ]

    for name, path in dbs:
        result = {"name": name, "status": "OK", "detail": ""}

        if not path.exists():
            result["status"] = "FAIL"
            result["detail"] = f"File missing: {path}"
            results.append(result)
            continue

        conn = None
        try:
            conn = sqlite3.connect(str(path), timeout=5)
            row = conn.execute("PRAGMA integrity_check").fetchone()
            check_result = row[0] if row else "no result returned"
            if check_result == "ok":
                size_mb = path.stat().st_size / (1024 * 1024)
                result["detail"] = f"Integrity OK ({size_mb:.1f} MB)"
            else:
                result["status"] = "FAIL"
                result["detail"] = f"Integrity failed: {str(check_result)[:200]}"
        except Exception as e:
            result["status"] = "FAIL"
            result["detail"] = f"Error: {str(e)[:200]}"
        finally:
            # Close explicitly so the handle is released even when the
            # PRAGMA raises (locked or corrupt database).
            if conn is not None:
                conn.close()

        results.append(result)

    return results
401
+
402
+
403
def check_scripts():
    """Inspect ``*.lock`` files in COORD_DIR and remove stale ones.

    A lock whose content is a PID of a process that no longer exists is
    deleted. This script's own LOCK_FILE is skipped.

    Returns:
        list[dict]: one ``{"name", "status", "detail", "repaired"}`` dict
        per lock file. A removed stale lock is WARN with ``repaired=True``;
        everything else, including an error while inspecting, stays OK
        with an explanatory detail.
    """
    results = []

    # Materialize the listing first: files may be unlinked while looping.
    for lf in list(COORD_DIR.glob("*.lock")):
        if lf == LOCK_FILE:
            continue  # Skip our own lock
        result = {"name": f"lock:{lf.name}", "status": "OK", "detail": "", "repaired": False}
        try:
            content = lf.read_text().strip()
            if not content:
                result["detail"] = "Empty lock file"
            elif not content.isdigit():
                result["detail"] = f"Non-PID content: {content[:50]}"
            else:
                pid = int(content)
                if pid_alive(pid):
                    result["detail"] = f"PID {pid} alive"
                else:
                    # Auto-repair: remove stale lock
                    lf.unlink()
                    result["status"] = "WARN"
                    result["detail"] = f"PID {pid} dead — lock removed"
                    result["repaired"] = True
        except Exception as e:
            result["detail"] = f"Error checking: {e}"
        results.append(result)

    return results
438
+
439
+
440
def check_logs():
    """Check the size of known log files and shrink oversized ones.

    Files under LOG_WARN_SIZE are OK, files under LOG_FAIL_SIZE are WARN,
    and anything larger is FAIL and is truncated in place (JSON logs to the
    last 200 entries, text logs to the last 1000 lines). Missing files are
    skipped.

    Returns:
        list[dict]: one ``{"name", "status", "detail", "repaired"}`` dict
        per existing log. ``repaired`` is True only when truncation
        actually made the file smaller.
    """
    results = []

    # JSON logs to check
    json_logs = [
        COORD_DIR / "heartbeat-log.json",
        COORD_DIR / "reflection-log.json",
        COORD_DIR / "immune-log.json",
        COORD_DIR / "ops-board.json",
        COORD_DIR / "messages.json",
    ]

    # Text logs to check
    text_logs = [
        COORD_DIR / "heartbeat-stdout.log",
        COORD_DIR / "heartbeat-stderr.log",
        COORD_DIR / "reflection-stdout.log",
        COORD_DIR / "reflection-stderr.log",
        COORD_DIR / "immune-stdout.log",
        COORD_DIR / "immune-stderr.log",
    ]

    mb = 1024 * 1024
    for log_path in json_logs + text_logs:
        try:
            size = log_path.stat().st_size
        except OSError:
            # Missing, or removed by its owner between listing and stat.
            continue

        result = {"name": log_path.name, "status": "OK", "detail": "", "repaired": False}
        size_mb = size / mb

        if size >= LOG_FAIL_SIZE:
            result["status"] = "FAIL"
            result["detail"] = f"{size_mb:.1f} MB — exceeds {LOG_FAIL_SIZE // (1024*1024)} MB"

            # Auto-truncate
            try:
                if log_path.suffix == ".json":
                    _truncate_json_log(log_path, keep_entries=200)
                else:
                    _truncate_text_log(log_path, keep_lines=1000)
                new_bytes = log_path.stat().st_size
                if new_bytes < size:
                    result["detail"] += f" -> truncated to {new_bytes / mb:.1f} MB"
                    result["repaired"] = True
                else:
                    # The truncation helpers leave a file untouched when it
                    # has few (but huge) entries or an unexpected shape;
                    # do not claim a repair that did not happen.
                    result["detail"] += " -> truncate had no effect"
            except Exception as e:
                result["detail"] += f" -> truncate failed: {e}"

        elif size >= LOG_WARN_SIZE:
            result["status"] = "WARN"
            result["detail"] = f"{size_mb:.1f} MB — approaching limit"
        else:
            result["detail"] = f"{size_mb:.2f} MB"

        results.append(result)

    return results
496
+
497
+
498
def _truncate_json_log(path, keep_entries=200):
    """Shrink a JSON log so its lists hold at most the newest entries.

    A top-level list longer than *keep_entries* is cut to its tail and
    saved. For a top-level dict, every list value longer than
    *keep_entries* is cut to its tail and the dict is saved back. Anything
    else is left untouched.
    """
    data = load_json(path, default=[])

    if isinstance(data, list):
        if len(data) > keep_entries:
            save_json(path, data[-keep_entries:])
        return

    if isinstance(data, dict):
        # Some logs are dicts with a list value
        for key, value in data.items():
            if isinstance(value, list) and len(value) > keep_entries:
                data[key] = value[-keep_entries:]
        save_json(path, data)
510
+
511
+
512
+ def _truncate_text_log(path, keep_lines=1000):
513
+ """Truncate a text log to the last N lines."""
514
+ lines = path.read_text().splitlines()
515
+ if len(lines) > keep_lines:
516
+ path.write_text("\n".join(lines[-keep_lines:]) + "\n")
517
+
518
+
519
def check_disk():
    """Report usage of the root filesystem, measured with ``os.statvfs``.

    Returns:
        list[dict]: a single ``{"name", "status", "detail"}`` dict. Status
        is FAIL at or above DISK_FAIL_PCT, WARN at or above DISK_WARN_PCT,
        and FAIL if the filesystem cannot be queried.
    """
    report = {"name": "disk:/", "status": "OK", "detail": ""}
    gib = 1024 ** 3

    try:
        fs = os.statvfs("/")
        total_bytes = fs.f_frsize * fs.f_blocks
        free_bytes = fs.f_frsize * fs.f_bavail
        used_bytes = total_bytes - free_bytes
        pct = (used_bytes / total_bytes) * 100 if total_bytes > 0 else 0

        if pct >= DISK_FAIL_PCT:
            report["status"] = "FAIL"
        elif pct >= DISK_WARN_PCT:
            report["status"] = "WARN"
        # The same summary line is used at every severity.
        report["detail"] = (
            f"{pct:.1f}% used ({free_bytes / gib:.1f} GB free of {total_bytes / gib:.0f} GB)"
        )
    except Exception as e:
        report["status"] = "FAIL"
        report["detail"] = f"Error: {e}"

    return [report]
548
+
549
+
550
def check_server_crons():
    """Placeholder for the remote cron check, rate-limited to every 2 hours.

    As shipped, no SSH connection is made. When a check is due the result
    is a WARN telling the operator to configure it; otherwise the result is
    an OK "Skipped" entry. Configure SSH_HOST, SSH_PORT, SSH_USER and the
    cron check command for your server, e.g. check that a MySQL/cron log
    table has entries for today.

    Returns:
        tuple: ``(results, should_run)`` — a one-element list holding the
        result dict, and whether the check was due on this run.
    """
    entry = {"name": "server-crons", "status": "OK", "detail": ""}

    # ── Configure for your server ──────────────────────────────────────────
    # SSH_HOST = "your-server.example.com"
    # SSH_PORT = 22
    # SSH_USER = "root"
    # CRON_CHECK_CMD = '"echo cron-check-not-configured"'
    # ───────────────────────────────────────────────────────────────────────

    # Hours since the last recorded check; None when never run or when the
    # stored timestamp cannot be parsed.
    last_ssh_str = load_json(IMMUNE_STATUS).get("last_ssh_check", "")
    hours_ago = None
    if last_ssh_str:
        try:
            last_ssh = datetime.strptime(last_ssh_str, "%Y-%m-%d %H:%M")
            hours_ago = (NOW - last_ssh).total_seconds() / 3600
        except Exception:
            hours_ago = None

    should_run = hours_ago is None or hours_ago >= SSH_CHECK_INTERVAL_HOURS

    if should_run:
        entry["status"] = "WARN"
        entry["detail"] = "Server cron check not configured — see check_server_crons() to set up SSH+command"
    else:
        entry["detail"] = f"Skipped (last check {hours_ago:.1f}h ago, interval {SSH_CHECK_INTERVAL_HOURS}h)"

    return [entry], should_run
589
+
590
+
591
+ # ─── Alerting ─────────────────────────────────────────────────────────────────
592
+
593
def get_system_uptime_minutes():
    """Return minutes since boot, derived from ``sysctl -n kern.boottime``.

    Returns:
        float | int: uptime in minutes, or 9999 (treated as "long uptime")
        when the boot time cannot be determined.
    """
    unknown = 9999  # Assume long uptime if we can't determine
    try:
        proc = subprocess.run(
            ["sysctl", "-n", "kern.boottime"],
            capture_output=True, text=True, timeout=5
        )
        if proc.returncode != 0:
            return unknown
        # Format: { sec = 1709000000, usec = 0 } ...
        found = re.search(r'sec\s*=\s*(\d+)', proc.stdout)
        if found is None:
            return unknown
        return (time.time() - int(found.group(1))) / 60
    except Exception:
        return unknown
610
+
611
+
612
def detect_new_failures(current_results, previous_status):
    """Return the items whose problem is new compared with the last run.

    An item is new when it is FAIL and was not FAIL before, or WARN and was
    OK before. Server/SSH checks are debounced: they are reported only once
    they have been failing for two consecutive runs. Nothing is reported
    during the first 10 minutes after boot.

    Streak counters are persisted in
    ``COORD_DIR/immune-consecutive-failures.json``.

    Args:
        current_results: mapping of category -> list of result dicts.
        previous_status: the previously saved status dict; its "checks"
            key has the same shape as *current_results*.

    Returns:
        list[dict]: result dicts that should trigger an alert.
    """
    # Boot grace period — suppress alerts when network may still be settling
    uptime = get_system_uptime_minutes()
    if uptime < 10:
        print(f" [GRACE] System uptime {uptime:.0f}min < 10min — suppressing alerts")
        return []

    prev_checks = {}
    for category, items in previous_status.get("checks", {}).items():
        for item in items:
            key = f"{category}:{item.get('name', '')}"
            prev_checks[key] = item.get("status", "OK")

    # Load consecutive failure counts for debounce
    consec_file = COORD_DIR / "immune-consecutive-failures.json"
    consec = load_json(consec_file, default={})
    if not isinstance(consec, dict):
        consec = {}  # valid JSON of the wrong shape: start over

    new_failures = []
    for category, items in current_results.items():
        for item in items:
            key = f"{category}:{item.get('name', '')}"
            current_status = item.get("status", "OK")
            prev_stat = prev_checks.get(key, "OK")

            if current_status not in ("FAIL", "WARN"):
                consec[key] = 0
                continue

            previous_streak = consec.get(key, 0)
            if not isinstance(previous_streak, int):
                previous_streak = 0
            streak = previous_streak + 1
            consec[key] = streak

            # Debounce: server/SSH checks need 2+ consecutive failures
            is_server_check = category == "server" or "ssh" in key.lower()
            min_consecutive = 2 if is_server_check else 1
            if streak < min_consecutive:
                continue

            escalated = (
                (current_status == "FAIL" and prev_stat != "FAIL")
                or (current_status == "WARN" and prev_stat == "OK")
            )
            # For a debounced check the run that reaches the threshold
            # always has a failing previous status, so the transition test
            # alone could never fire; reaching the threshold is itself the
            # alert trigger.
            # NOTE(review): check_server_crons() reports OK ("Skipped")
            # between due checks, which resets this streak — confirm the
            # debounce can actually reach 2 with the current schedule.
            just_confirmed = min_consecutive > 1 and streak == min_consecutive

            if escalated or just_confirmed:
                new_failures.append(item)

    save_json(consec_file, consec)
    return new_failures
659
+
660
+
661
def send_failure_alerts(new_failures):
    """Send one WhatsApp alert summarising the new failures.

    At most one alert goes out per 30 minutes, tracked through
    ``COORD_DIR/immune-last-alert.txt``. FAIL items take precedence:
    warnings are sent only when there are no failures. The cooldown
    timestamp is written only after a successful send.
    """
    if not new_failures:
        return

    # Global alert cooldown — max 1 WhatsApp alert per 30 minutes
    stamp_format = "%Y-%m-%d %H:%M"
    cooldown_file = COORD_DIR / "immune-last-alert.txt"
    if cooldown_file.exists():
        try:
            last_alert = datetime.strptime(cooldown_file.read_text().strip(), stamp_format)
            minutes_since = (NOW - last_alert).total_seconds() / 60
            if minutes_since < 30:
                print(f" [COOLDOWN] Last alert {minutes_since:.0f}min ago — suppressing")
                return
        except Exception:
            # Unreadable or malformed timestamp: behave as if no cooldown.
            pass

    fails = []
    warns = []
    for failure in new_failures:
        severity = failure["status"]
        if severity == "FAIL":
            fails.append(failure)
        elif severity == "WARN":
            warns.append(failure)

    if fails:
        msg = "\n".join(f"- {f['name']}: {f['detail']}" for f in fails[:5])
        if len(fails) > 5:
            msg += f"\n... +{len(fails) - 5} more"
        sent = send_wa_alert(
            "NEXO Immune FAIL",
            f"{len(fails)} new failure(s):\n{msg}"
        )
    elif warns:
        msg = "\n".join(f"- {f['name']}: {f['detail']}" for f in warns[:3])
        sent = send_wa_alert(
            "NEXO Immune WARN",
            f"{len(warns)} new warning(s):\n{msg}"
        )
    else:
        sent = False

    if sent:
        cooldown_file.write_text(NOW.strftime(stamp_format))
702
+
703
+
704
+ # ─── Main ─────────────────────────────────────────────────────────────────────
705
+
706
def main():
    """Entry point: gate on skip hours, take the process lock, run the checks.

    Exits early during the skip window or when another instance already
    holds the exclusive lock on LOCK_FILE. The lock is always released once
    the checks finish, even if they raise.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"NEXO Immune System — {NOW.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{rule}")

    # Skip hours gate
    if is_skip_hours():
        print(f"[SKIP] Hour {NOW.hour} is within skip range ({SKIP_START}:00-{SKIP_END}:00). Exiting.")
        return

    # Ensure coordination directory exists
    COORD_DIR.mkdir(parents=True, exist_ok=True)

    # Process lock (fcntl): non-blocking, so a second instance fails fast.
    lock_fd = None
    try:
        lock_fd = open(LOCK_FILE, "w")
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        print("[LOCKED] Another immune instance is running. Exiting.")
        if lock_fd:
            lock_fd.close()
        return

    try:
        _run_checks(lock_fd)
    finally:
        # Best-effort release; a failure here must not mask a check error.
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
            lock_fd.close()
        except Exception:
            pass
738
+
739
+
740
def _run_checks(lock_fd):
    """Run all seven checks, print a report, send alerts, and persist state.

    Args:
        lock_fd: the open lock file held by the caller; not used here.

    Side effects: prints the report to stdout, may send a WhatsApp alert,
    rewrites IMMUNE_STATUS and appends a summary entry to IMMUNE_LOG
    (capped at MAX_LOG_ENTRIES).
    """
    previous_status = load_json(IMMUNE_STATUS)

    all_results = {}
    repairs = []

    def report(items, repair_template=None):
        # Print one line per result; record auto-repairs when a template
        # is given for this category.
        for entry in items:
            print(f" [{entry['status']}] {entry['name']}: {entry['detail']}")
            if repair_template is not None and entry.get("repaired"):
                repairs.append(repair_template.format(entry["name"]))

    # (header, result key, check function, repair message template)
    local_steps = (
        ("\n[1/7] Checking tokens...", "tokens", check_tokens, None),
        ("\n[2/7] Checking LaunchAgents...", "agents", check_launch_agents,
         "LaunchAgent {} reloaded"),
        ("\n[3/7] Checking databases...", "databases", check_databases, None),
        ("\n[4/7] Checking scripts & locks...", "scripts", check_scripts,
         "Stale lock {} removed"),
        ("\n[5/7] Checking log sizes...", "logs", check_logs,
         "Log {} truncated"),
        ("\n[6/7] Checking disk usage...", "disk", check_disk, None),
    )
    for header, key, runner, repair_template in local_steps:
        print(header)
        all_results[key] = runner()
        report(all_results[key], repair_template)

    # 7. Server crons (also reports whether the rate-limited check was due)
    print("\n[7/7] Checking server crons...")
    server_results, ssh_ran = check_server_crons()
    all_results["server"] = server_results
    report(all_results["server"])

    # ─── Summary ──────────────────────────────────────────────────────────
    counts = {"OK": 0, "WARN": 0, "FAIL": 0}
    for category_items in all_results.values():
        for item in category_items:
            state = item.get("status", "OK")
            if state in counts:
                counts[state] += 1

    total = sum(counts.values())
    divider = "─" * 60

    print(f"\n{divider}")
    print(f"SUMMARY: {total} checks — {counts['OK']} OK, {counts['WARN']} WARN, {counts['FAIL']} FAIL")
    if repairs:
        print(f"AUTO-REPAIRS: {len(repairs)}")
        for line in repairs:
            print(f" - {line}")
    print(f"{divider}\n")

    # ─── Detect new failures & alert ──────────────────────────────────────
    new_failures = detect_new_failures(all_results, previous_status)
    if not new_failures:
        print("[OK] No new failures.")
    else:
        print(f"[ALERT] {len(new_failures)} new failure(s)/warning(s) detected:")
        for nf in new_failures:
            print(f" - [{nf['status']}] {nf['name']}: {nf['detail']}")
        send_failure_alerts(new_failures)

    # ─── Save status ──────────────────────────────────────────────────────
    stamp = NOW.strftime("%Y-%m-%d %H:%M")
    status = {
        "last_run": stamp,
        "counts": counts,
        "repairs": repairs,
        "new_failures": len(new_failures),
        "checks": all_results,
    }
    # Carry the last SSH check time forward unless the check ran just now.
    if ssh_ran:
        status["last_ssh_check"] = stamp
    elif "last_ssh_check" in previous_status:
        status["last_ssh_check"] = previous_status["last_ssh_check"]

    save_json(IMMUNE_STATUS, status)

    # ─── Append to log ────────────────────────────────────────────────────
    log_entry = {
        "ts": stamp,
        "ok": counts["OK"],
        "warn": counts["WARN"],
        "fail": counts["FAIL"],
        "repairs": len(repairs),
        "new_failures": len(new_failures),
    }

    log = load_json(IMMUNE_LOG, default=[])
    if not isinstance(log, list):
        log = []
    log.append(log_entry)
    if len(log) > MAX_LOG_ENTRIES:
        log = log[-MAX_LOG_ENTRIES:]
    save_json(IMMUNE_LOG, log)

    print(f"Status saved to {IMMUNE_STATUS}")
    print(f"Log appended to {IMMUNE_LOG} ({len(log)} entries)")
866
+
867
+
868
# Script entry point: perform one health-check pass when executed directly.
if __name__ == "__main__":
    main()