nexo-brain 5.5.4 → 5.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,449 @@
1
+ """NEXO DB Guard — data-loss detection, validated backups, and self-heal primitives.
2
+
3
+ This module exists because v5.5.4 surfaced a data-loss incident where
4
+ ``~/.nexo/data/nexo.db`` was reset to a 4 KB empty-schema file between two
5
+ observed states (hourly backup 38 MB → pre-update backup 4 KB). The existing
6
+ ``plugins/update.py`` copied the already-empty DB into the ``pre-update-*``
7
+ directory and reported a successful backup, masking the problem.
8
+
9
+ Design principles:
10
+ - Pure stdlib (sqlite3 + pathlib). No NEXO imports, keeps the module import-safe
11
+ from installer, auto-update, and CLI paths even when the runtime is broken.
12
+ - Single source of truth for "what counts as a critical wipe".
13
+ - Every operation that writes to a DB is wrapped in a validation pass so a
14
+ silent failure leaves an explicit trail instead of a 4 KB placeholder.
15
+
16
+ Public surface (stable for use by plugins/update.py, plugins/recover.py,
17
+ auto_update.py):
18
+
19
+ CRITICAL_TABLES
20
+ WIPE_THRESHOLD_PCT
21
+ MIN_REFERENCE_ROWS
22
+
23
+ db_row_counts(path, tables) -> dict[str, int | None]
24
+ db_looks_wiped(path, tables, min_reference_rows) -> bool
25
+ find_latest_hourly_backup(backups_dir, max_age_seconds) -> Path | None
26
+ diff_row_counts(current, reference, tables) -> WipeReport
27
+ safe_sqlite_backup(source, dest) -> tuple[bool, str | None]
28
+ validate_backup_matches_source(source, dest, tables) -> tuple[bool, str | None]
29
+ kill_nexo_mcp_servers(dry_run) -> dict
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import os
35
+ import signal
36
+ import sqlite3
37
+ import subprocess
38
+ import time
39
+ from dataclasses import dataclass, field
40
+ from pathlib import Path
41
+
42
+
43
+ # ── Constants ───────────────────────────────────────────────────────────
44
+
45
# Row counts of these tables are the canonical signal that the DB holds real
# user data. The list is deliberately short: some of them (reminders, for
# example) are legitimately empty on a fresh install, which is why the wipe
# detector additionally demands a populated reference backup before firing.
# See MIN_REFERENCE_ROWS.
CRITICAL_TABLES: tuple[str, ...] = (
    "protocol_tasks", "followups",
    "reminders", "learnings",
    "session_diary", "guard_checks",
    "protocol_debt", "cron_runs",
    "change_log", "decisions",
)

# Floor on the summed CRITICAL_TABLES row count of a reference backup. Below
# this we refuse to treat the reference as proof of real data, because a fresh
# install and a wiped DB would be indistinguishable.
MIN_REFERENCE_ROWS = 50

# Percentage of CRITICAL_TABLES rows (relative to the reference) that must be
# gone before the situation is called a wipe. Deliberately high so ordinary
# churn such as reminder cleanup does not trip the detector.
WIPE_THRESHOLD_PCT = 80

# Size in bytes that a SQLite DB holding real data is expected to clearly
# exceed. A freshly created schema-only nexo.db is 4096 B; real data crosses
# this limit within minutes.
EMPTY_DB_SIZE_BYTES = 32 * 1024

# Glob pattern matching the backup files written by
# ``src/scripts/nexo-backup.sh``.
HOURLY_BACKUP_GLOB = "nexo-*.db"

# Maximum age, in seconds, of an hourly backup that may still serve as an
# automatic self-heal source. 48h matches nexo-backup.sh retention.
HOURLY_BACKUP_MAX_AGE = 48 * 3600
82
+
83
+
84
+ # ── Types ───────────────────────────────────────────────────────────────
85
+
86
@dataclass
class TableDiff:
    """Row counts of one table in the source DB versus the reference DB."""

    table: str
    source: int | None  # None when no count could be obtained for the source
    reference: int | None  # None when no count could be obtained for the reference
    lost_pct: float  # 0..100, meaningful only when reference > 0

    def is_regression(self, threshold_pct: float = WIPE_THRESHOLD_PCT) -> bool:
        """Tell whether this table lost at least ``threshold_pct`` percent of its rows.

        A reference that is unknown or empty can never regress. A table that
        has rows in the reference but no count at all in the source always
        counts as a regression, whatever ``lost_pct`` says.
        """
        if not self.reference:
            return False
        return self.source is None or self.lost_pct >= threshold_pct
99
+
100
+
101
@dataclass
class WipeReport:
    """Outcome of comparing critical-table row counts of a DB against a reference."""

    source_counts: dict[str, int | None] = field(default_factory=dict)
    reference_counts: dict[str, int | None] = field(default_factory=dict)
    table_diffs: list[TableDiff] = field(default_factory=list)
    total_source_rows: int = 0
    total_reference_rows: int = 0

    @property
    def overall_lost_pct(self) -> float:
        """Percentage of reference rows absent from the source, never negative.

        Yields 0.0 when the reference holds no rows, so an empty reference can
        never look like a loss.
        """
        baseline = self.total_reference_rows
        if baseline <= 0:
            return 0.0
        missing = max(baseline - self.total_source_rows, 0)
        return (missing / baseline) * 100.0

    def is_wipe(
        self,
        threshold_pct: float = WIPE_THRESHOLD_PCT,
        min_reference_rows: int = MIN_REFERENCE_ROWS,
    ) -> bool:
        """Decide whether the comparison amounts to a wipe.

        The reference must hold at least ``min_reference_rows`` rows. Given
        that, the verdict is True when the overall loss reaches
        ``threshold_pct``, or when two or more individual tables each regress
        by that threshold.
        """
        if self.total_reference_rows < min_reference_rows:
            return False
        if self.overall_lost_pct >= threshold_pct:
            return True
        # The aggregate did not cross the line; look at per-table drops instead.
        regressed = [diff for diff in self.table_diffs if diff.is_regression(threshold_pct)]
        return len(regressed) >= 2

    def summary_lines(self) -> list[str]:
        """Render the totals plus one line per regressed table, for display."""
        out = [
            f" source rows (critical tables): {self.total_source_rows}",
            f" reference rows (critical tables): {self.total_reference_rows}",
            f" overall loss: {self.overall_lost_pct:.1f}%",
        ]
        regressed = [diff for diff in self.table_diffs if diff.is_regression()]
        if not regressed:
            return out
        out.append(" regressions:")
        for diff in regressed:
            if diff.source is None:
                shown = "missing"
            else:
                shown = str(diff.source)
            out.append(f" - {diff.table}: {diff.reference} -> {shown} ({diff.lost_pct:.1f}% lost)")
        return out
143
+
144
+
145
+ # ── Row count primitives ────────────────────────────────────────────────
146
+
147
+ def _table_count(conn: sqlite3.Connection, table: str) -> int | None:
148
+ """Return COUNT(*) for ``table`` or None if the table is missing."""
149
+ row = conn.execute(
150
+ "SELECT name FROM sqlite_master WHERE type='table' AND name = ?",
151
+ (table,),
152
+ ).fetchone()
153
+ if row is None:
154
+ return None
155
+ cur = conn.execute(f"SELECT COUNT(*) FROM {table}")
156
+ result = cur.fetchone()
157
+ return int(result[0]) if result is not None else 0
158
+
159
+
160
def db_row_counts(path: str | Path, tables: tuple[str, ...] = CRITICAL_TABLES) -> dict[str, int | None]:
    """Count the rows of each requested table in the SQLite file at ``path``.

    Never raises for an unreadable database: a file that does not exist, a
    connection that cannot be opened, a table that is absent, and a count query
    that fails are all reported as None for the affected table(s).
    """
    db_file = Path(path)
    result: dict[str, int | None] = dict.fromkeys(tables)
    if not db_file.is_file():
        return result
    try:
        conn = sqlite3.connect(str(db_file), timeout=5)
    except Exception:
        return result
    try:
        for name in tables:
            try:
                result[name] = _table_count(conn, name)
            except Exception:
                # One unreadable table must not hide the counts of the others.
                result[name] = None
    finally:
        try:
            conn.close()
        except Exception:
            pass
    return result
182
+
183
+
184
def db_looks_wiped(
    path: str | Path,
    tables: tuple[str, ...] = CRITICAL_TABLES,
    min_reference_rows: int = MIN_REFERENCE_ROWS,
) -> bool:
    """Heuristic: the file exists and none of ``tables`` holds a single row.

    The verdict rests on row counts only; file size plays no part. A table
    counts as empty when it has 0 rows, when it is missing, or when its count
    could not be read, so a database that cannot be opened or queried is
    reported as wiped too.

    Args:
        path: Location of the SQLite database file.
        tables: Tables to inspect.
        min_reference_rows: Accepted for interface compatibility; this function
            does not use it.

    Returns:
        True when the file exists and no inspected table has rows. False when
        the DB is missing entirely (a separate condition handled by the caller:
        nothing to protect vs. something to restore), when the file cannot be
        stat'ed, or when at least one table has rows.
    """
    p = Path(path)
    if not p.is_file():
        return False
    try:
        # Only used as an accessibility probe: a file we cannot even stat is
        # not something we can pass judgement on.
        p.stat()
    except OSError:
        return False
    return _all_tables_empty_or_missing(db_row_counts(p, tables))
208
+
209
+
210
+ def _all_tables_empty_or_missing(counts: dict[str, int | None]) -> bool:
211
+ """True when every critical table is either missing or 0 rows."""
212
+ if not counts:
213
+ return False
214
+ for val in counts.values():
215
+ if val is not None and val > 0:
216
+ return False
217
+ return True
218
+
219
+
220
+ # ── Reference backup discovery ──────────────────────────────────────────
221
+
222
def find_latest_hourly_backup(
    backups_dir: str | Path,
    max_age_seconds: int = HOURLY_BACKUP_MAX_AGE,
    glob: str = HOURLY_BACKUP_GLOB,
    min_critical_rows: int = 1,
) -> Path | None:
    """Pick the most recently modified usable backup inside ``backups_dir``.

    A file matching ``glob`` is usable when it is a regular file, was modified
    no more than ``max_age_seconds`` ago, is larger than EMPTY_DB_SIZE_BYTES,
    and holds at least ``min_critical_rows`` rows summed over CRITICAL_TABLES.

    Returns None when the directory does not exist or nothing qualifies.
    """
    root = Path(backups_dir)
    if not root.is_dir():
        return None

    # Pass 1: filesystem metadata only, so no SQLite file is opened yet.
    started = time.time()
    survivors: list[tuple[float, Path]] = []
    for path in root.glob(glob):
        if not path.is_file():
            continue
        try:
            info = path.stat()
        except OSError:
            continue
        too_old = started - info.st_mtime > max_age_seconds
        too_small = info.st_size <= EMPTY_DB_SIZE_BYTES  # schema-only file
        if too_old or too_small:
            continue
        survivors.append((info.st_mtime, path))
    survivors.sort(key=lambda pair: pair[0], reverse=True)

    # Pass 2: open candidates newest first and stop at the first one that has
    # enough rows, so that a directory full of backups is not opened in full.
    for _mtime, path in survivors:
        row_total = sum(n for n in db_row_counts(path).values() if isinstance(n, int))
        if row_total >= min_critical_rows:
            return path
    return None
266
+
267
+
268
+ # ── Diff & wipe detection ───────────────────────────────────────────────
269
+
270
def diff_row_counts(
    current: str | Path,
    reference: str | Path,
    tables: tuple[str, ...] = CRITICAL_TABLES,
) -> WipeReport:
    """Build a WipeReport from the row counts of ``current`` and ``reference``.

    Each table contributes its known counts to the report totals and gets one
    TableDiff. The per-table loss is 0% when the reference count is unknown or
    zero, 100% when only the current count is unknown, and otherwise the
    relative drop clamped at 0% (growth is not a loss).
    """
    current_counts = db_row_counts(current, tables)
    baseline_counts = db_row_counts(reference, tables)
    report = WipeReport(source_counts=current_counts, reference_counts=baseline_counts)

    for name in tables:
        have = current_counts.get(name)
        had = baseline_counts.get(name)
        # Unknown counts add nothing to the totals.
        report.total_source_rows += have or 0
        report.total_reference_rows += had or 0

        if not had:
            pct = 0.0
        elif have is None:
            pct = 100.0
        else:
            pct = max(0.0, (had - have) / had * 100.0)
        report.table_diffs.append(
            TableDiff(table=name, source=have, reference=had, lost_pct=pct)
        )
    return report
303
+
304
+
305
+ # ── Validated SQLite backup ────────────────────────────────────────────
306
+
307
def safe_sqlite_backup(source: str | Path, dest: str | Path) -> tuple[bool, str | None]:
    """Copy ``source`` to ``dest`` via sqlite3's online backup API.

    Creates the destination directory if missing. Does NOT validate that the
    copy contains rows; that is the caller's job via
    validate_backup_matches_source().

    When the copy fails and ``dest`` did not exist before this call, the file
    that opening the destination connection created is removed again, so a
    failed backup does not leave an empty or partial database behind. A
    destination that already existed is never deleted.

    Args:
        source: Path of the SQLite database to copy.
        dest: Path of the backup file to write.

    Returns:
        (True, None) on success, (False, reason) on failure.
    """
    src = Path(source)
    dst = Path(dest)
    if not src.is_file():
        return False, f"source missing: {src}"
    try:
        dst.parent.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        return False, f"cannot create dest dir: {e}"
    try:
        dst_preexisting = dst.exists()
    except OSError:
        # Cannot tell; assume it existed so we never delete a file we do not own.
        dst_preexisting = True

    failure: str | None = None
    src_conn = None
    dst_conn = None
    try:
        src_conn = sqlite3.connect(str(src), timeout=30)
        dst_conn = sqlite3.connect(str(dst), timeout=30)
        src_conn.backup(dst_conn)
    except Exception as e:
        failure = f"sqlite3.backup failed: {e}"
    finally:
        # Close the destination first, then the source; both before any
        # cleanup so the file is no longer held open when it is unlinked.
        for conn in (dst_conn, src_conn):
            if conn is not None:
                try:
                    conn.close()
                except Exception:
                    pass

    if failure is None:
        return True, None
    if not dst_preexisting:
        try:
            dst.unlink()
        except OSError:
            pass  # Nothing was created, or it cannot be removed; still report the failure.
    return False, failure
338
+
339
+
340
def validate_backup_matches_source(
    source: str | Path,
    dest: str | Path,
    tables: tuple[str, ...] = CRITICAL_TABLES,
) -> tuple[bool, str | None]:
    """Check that a finished backup holds no fewer rows than its source.

    Only tables with a known row count in the source are compared. Such a
    table fails the check when the backup has no count for it or a smaller one.
    Tables without a source count are skipped.

    Returns (True, None) when everything matches, otherwise (False, message)
    where the message names the missing backup file or lists every offending
    table separated by "; ".
    """
    original = Path(source)
    backup = Path(dest)
    if not backup.is_file():
        return False, f"backup missing at {backup}"

    source_counts = db_row_counts(original, tables)
    dest_counts = db_row_counts(backup, tables)

    problems: list[str] = []
    for name in tables:
        expected = source_counts.get(name)
        if expected is None:
            continue  # nothing known about the source side to compare against
        actual = dest_counts.get(name)
        if actual is None:
            problems.append(f"{name}: source={expected} backup=missing")
        elif actual < expected:
            problems.append(f"{name}: source={expected} backup={actual}")

    if not problems:
        return True, None
    return False, "; ".join(problems)
371
+
372
+
373
+ # ── MCP server discovery / kill ─────────────────────────────────────────
374
+
375
def kill_nexo_mcp_servers(dry_run: bool = False) -> dict:
    """Best-effort SIGTERM of every process that looks like a NEXO MCP server.

    Processes are discovered from the output of ``ps``; the current process is
    always skipped. With ``dry_run`` the matches are reported but no signal is
    sent. Failures are collected in the result instead of being raised.

    Returns a dict with these keys:
        scanned: number of matching processes found.
        terminated: number of processes SIGTERM was delivered to without error
            (the exit itself is not confirmed).
        errors: human-readable failure messages.
        dry_run: the flag as passed in.
        pids: one {"pid", "command"} entry per match, command cut to 180 chars.
    """
    report: dict = {
        "scanned": 0,
        "terminated": 0,
        "errors": [],
        "dry_run": dry_run,
        "pids": [],
    }
    errors = report["errors"]

    if os.name != "posix":
        errors.append("unsupported platform")
        return report

    try:
        listing = subprocess.run(
            ["ps", "-axo", "pid=,command="],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception as exc:
        errors.append(f"ps failed: {exc}")
        return report
    if listing.returncode != 0:
        errors.append(f"ps exit {listing.returncode}: {listing.stderr.strip()[:200]}")
        return report

    own_pid = os.getpid()
    for raw_line in listing.stdout.splitlines():
        entry = raw_line.strip()
        if not entry:
            continue
        pid_text, _, remainder = entry.partition(" ")
        if not pid_text.isdigit():
            continue
        pid = int(pid_text)
        if pid == own_pid:
            continue
        command = remainder.strip()
        if not _looks_like_nexo_mcp(command):
            continue

        report["scanned"] += 1
        report["pids"].append({"pid": pid, "command": command[:180]})
        if dry_run:
            continue
        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # Gone between the ps snapshot and the signal; nothing left to do.
            pass
        except Exception as exc:
            errors.append(f"kill {pid} failed: {exc}")
        else:
            report["terminated"] += 1
    return report
434
+
435
+
436
+ def _looks_like_nexo_mcp(cmd: str) -> bool:
437
+ """Heuristic: is this command line a NEXO MCP server worth terminating?"""
438
+ if not cmd:
439
+ return False
440
+ lowered = cmd.lower()
441
+ # server.py is the MCP entrypoint; fastmcp is the framework marker; avoid
442
+ # matching the generic claude binary which may be running other servers.
443
+ if "server.py" in lowered and "nexo" in lowered:
444
+ return True
445
+ if "fastmcp" in lowered and "nexo" in lowered:
446
+ return True
447
+ if "nexo_sdk" in lowered or "nexo-mcp" in lowered:
448
+ return True
449
+ return False
@@ -1,8 +1,20 @@
1
- """Backup plugin — hourly SQLite backups with 7-day retention."""
1
+ """Backup plugin — hourly SQLite backups with 7-day retention.
2
+
3
+ v5.5.6: all three tools are rate-limited in-process so that a runaway MCP
4
+ client (tool-use loop in Claude Code, buggy Desktop handler, etc.) cannot
5
+ hammer ``sqlite3.Connection.backup()`` hundreds of times in minutes. The
6
+ v5.5.4 incident where an external loop caused ~8.5 GB of file-backed writes
7
+ in 37 minutes and corrupted nexo.db when the OS finally killed the process
8
+ is the exact scenario this limit prevents at the tool boundary — in addition
9
+ to the v5.5.5 self-heal that recovers from that class of wipe.
10
+ """
11
+ import glob
2
12
  import os
3
13
  import shutil
14
+ import sqlite3
15
+ import threading
4
16
  import time
5
- import glob
17
+
6
18
  from db import get_db
7
19
 
8
20
  NEXO_HOME = os.environ.get("NEXO_HOME", os.path.expanduser("~/.nexo"))
@@ -11,15 +23,63 @@ BACKUP_DIR = os.path.join(NEXO_HOME, "backups")
11
23
 
12
24
  RETENTION_DAYS = 7
13
25
 
26
+ # ── Rate limits (v5.5.6) ────────────────────────────────────────────
27
# Shortest allowed gap, in seconds, between two successive calls to each
# destructive/expensive backup tool. Every limit can be overridden through
# its own environment variable, for tests or deliberate recovery scenarios.
BACKUP_NOW_MIN_INTERVAL_SECS = int(os.environ.get("NEXO_BACKUP_MIN_INTERVAL_SECS", "30"))
BACKUP_RESTORE_MIN_INTERVAL_SECS = int(os.environ.get("NEXO_BACKUP_RESTORE_MIN_INTERVAL_SECS", "60"))

# Guards _last_call_ts, which maps a tool name to the time.time() value of its
# last accepted call (0.0 = never called).
_rate_limit_lock = threading.Lock()
_last_call_ts: dict[str, float] = dict.fromkeys(("backup_now", "backup_restore"), 0.0)
42
+
43
+
44
def _check_rate_limit(tool: str, min_interval: int) -> str | None:
    """Return a rate-limit error string if the tool is called too soon, else None.

    An accepted call records the current time as the tool's last call, and a
    rejected call leaves the recorded time untouched. A tool that has never
    been called (recorded time 0.0 or no entry) is always accepted.

    Args:
        tool: Key identifying the tool in the shared timestamp table.
        min_interval: Minimum number of seconds between two accepted calls.

    Returns:
        None when the call may proceed, otherwise a message for the client
        stating how long to wait.
    """
    import math  # local: only needed to round the remaining wait up

    now = time.time()
    with _rate_limit_lock:
        last = _last_call_ts.get(tool, 0.0)
        elapsed = now - last
        # A negative elapsed time means the wall clock was set back since the
        # last call. Treat that as expired instead of refusing every call
        # until the clock catches up again.
        if last > 0 and 0 <= elapsed < min_interval:
            # Round up so the message never says "Wait 0s" while still refusing.
            remaining = math.ceil(min_interval - elapsed)
            return (
                f"Rate-limited: {tool} called {int(elapsed)}s ago "
                f"(min {min_interval}s between calls). Wait {remaining}s. "
                "If you are seeing this message repeatedly, a client may be stuck in a "
                "tool-use loop — check NEXO transcripts and kill the runaway session."
            )
        _last_call_ts[tool] = now
    return None
60
+
61
+
62
def _reset_rate_limit_state_for_tests() -> None:
    """Test hook: put every tracked tool back into the "never called" state."""
    with _rate_limit_lock:
        _last_call_ts.update(dict.fromkeys(_last_call_ts, 0.0))
67
+
14
68
 
15
69
  def handle_backup_now() -> str:
16
- """Create an immediate backup of the NEXO database."""
70
+ """Create an immediate backup of the NEXO database.
71
+
72
+ Rate-limited to one call every BACKUP_NOW_MIN_INTERVAL_SECS (default 30 s).
73
+ """
74
+ err = _check_rate_limit("backup_now", BACKUP_NOW_MIN_INTERVAL_SECS)
75
+ if err is not None:
76
+ return err
77
+
17
78
  os.makedirs(BACKUP_DIR, exist_ok=True)
18
79
  timestamp = time.strftime("%Y-%m-%d-%H%M")
19
80
  dest = os.path.join(BACKUP_DIR, f"nexo-{timestamp}.db")
20
81
 
21
82
  # Use SQLite backup API for consistency
22
- import sqlite3
23
83
  src_conn = sqlite3.connect(DB_PATH)
24
84
  try:
25
85
  dst_conn = sqlite3.connect(dest)
@@ -56,16 +116,23 @@ def handle_backup_list() -> str:
56
116
  def handle_backup_restore(filename: str) -> str:
57
117
  """Restore database from a backup file. DESTRUCTIVE — replaces current DB.
58
118
 
119
+ Rate-limited to one call every BACKUP_RESTORE_MIN_INTERVAL_SECS (default
120
+ 60 s). A client hammering restore in a loop is the exact shape of the
121
+ v5.5.4 incident.
122
+
59
123
  Args:
60
124
  filename: Backup filename (e.g., 'nexo-2026-03-11-1200.db')
61
125
  """
126
+ err = _check_rate_limit("backup_restore", BACKUP_RESTORE_MIN_INTERVAL_SECS)
127
+ if err is not None:
128
+ return err
129
+
62
130
  src = os.path.join(BACKUP_DIR, filename)
63
131
  if not os.path.isfile(src):
64
132
  return f"Backup not found: {filename}"
65
133
 
66
134
  # Create safety backup first
67
135
  safety = os.path.join(BACKUP_DIR, f"nexo-pre-restore-{time.strftime('%Y%m%d%H%M%S')}.db")
68
- import sqlite3
69
136
  src_conn = sqlite3.connect(DB_PATH)
70
137
  try:
71
138
  dst_conn = sqlite3.connect(safety)