nexo-brain 7.36.0 → 7.37.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.36.0",
3
+ "version": "7.37.0",
4
4
  "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
5
5
  "author": {
6
6
  "name": "NEXO Brain",
package/README.md CHANGED
@@ -18,7 +18,7 @@
18
18
 
19
19
  [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
20
20
 
21
- Version `7.36.0` is the current packaged-runtime line. Minor release - local index disk reclaim: the local file/code index (`local-context.db`) no longer grows without bound. It now uses `auto_vacuum=INCREMENTAL` plus a one-time guarded `VACUUM` to convert existing databases, stores embeddings as compact float32 BLOBs instead of JSON text (~4-6x smaller, back-compatible dual-write/dual-read with a resumable backfill and kill switches), reclaims disk on purge/clear, and the daily self-audit now actively compacts at its size cap (`NEXO_LOCAL_INDEX_MAX_BYTES`) instead of only warning. An established index reclaims ~10-20GB immediately and grows several-fold slower; the backup subsystem was audited and is already bounded. Builds on v7.35.0 (selective forget + recurring-incident diagnostic templates).
21
+ Version `7.37.0` is the current packaged-runtime line. Minor release - transparent server self-heal: when an update lands while a Brain MCP server is already running, the running stdio child now re-execs itself in place (same process, same live MCP connection) instead of telling the user to restart, so the updated code runs immediately with nothing visible to the user. The heal is fail-open (non-POSIX platforms, a re-exec error, the resident HTTP service, or the `NEXO_DISABLE_SELFHEAL_REEXEC` kill switch all fall back to the prior safe hard-exit), loop-protected (bounded generations + same-target guard), deferred past any in-flight tool call, and also applied as a boot-time pre-serve check. Also fixes email-monitor zombie reinjection: an already-replied email left in 'processing' after a crash is closed as terminal 'processed' and never re-sent as a duplicate reply. Builds on v7.36.0 (local index disk reclaim).
22
22
 
23
23
  Previously in `7.31.9`: patch release over v7.31.8 - UI release closeout now has to prove the original reported symptom was reopened with observable evidence before claiming the release is ready.
24
24
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.36.0",
3
+ "version": "7.37.0",
4
4
  "mcpName": "io.github.wazionapps/nexo",
5
5
  "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
6
6
  "homepage": "https://nexo-brain.com",
@@ -1130,15 +1130,128 @@ def prime_process_fingerprint() -> str:
1130
1130
# Guard flag consulted by _schedule_drift_autoexit before it schedules anything.
_DRIFT_AUTOEXIT_SCHEDULED = False
# Exit status used by the plain (non-re-exec) drift exit.
_DRIFT_EXIT_CODE = 75
# Delay, in seconds, handed to loop.call_later for a scheduled drift exit and
# for each deferred re-exec retry.
_DRIFT_EXIT_DELAY_SECONDS = 0.5

# Anti crash-loop: upper bound on self-heal re-execs for one process chain.
# Once reached, the heal gives up and falls back to a plain exit, so a
# half-written update or an unreadable tree can never thrash.
_SELFHEAL_MAX_GENERATIONS = 3

# Number of tool calls currently executing. A re-exec is never performed
# mid-request (it would desync the JSON-RPC stream of a sibling call).
# Incremented/decremented in on_call_tool.
_INFLIGHT_TOOL_CALLS = 0

# Maximum number of times a re-exec may be postponed while tool calls are
# still in flight; after that the heal proceeds regardless.
_DRIFT_REEXEC_DEFER_MAX = 20
# How many postponements have been used so far in this process.
_drift_reexec_defers = 0
1133
1142
 
1134
1143
 
1135
- def _request_drift_exit() -> None:
1144
def _selfheal_reexec_disabled() -> bool:
    """Return True when the ``NEXO_DISABLE_SELFHEAL_REEXEC`` kill switch is on.

    The switch is on when the variable, stripped and lower-cased, equals
    ``1``, ``true`` or ``yes``. Unset, empty, or any other value means off.
    """
    raw = os.environ.get("NEXO_DISABLE_SELFHEAL_REEXEC", "")
    flag = str(raw or "").strip().lower()
    return flag in {"1", "true", "yes"}
1146
+
1147
+
1148
def _running_as_resident_service() -> bool:
    """Report whether this process is the resident HTTP runtime-service.

    That service serves multiple clients and retires itself through its own
    mechanism (start_resident_obsolescence_watch), so it must NOT execv.

    The answer comes from ``runtime_service.is_runtime_service_process()``.
    If that import or call fails for any reason, the ``NEXO_RUNTIME_SERVICE``
    environment sentinel (``1``/``true``/``yes``) is used instead.
    """
    try:
        # Imported lazily: a module-level import would be circular.
        from runtime_service import is_runtime_service_process as _is_service

        verdict = bool(_is_service())
    except Exception:
        sentinel = str(os.environ.get("NEXO_RUNTIME_SERVICE", "") or "")
        verdict = sentinel.strip().lower() in {"1", "true", "yes"}
    return verdict
1158
+
1159
+
1160
def _selfheal_teardown() -> None:
    """Close the database layers ahead of a self-heal re-exec.

    The intent is to release SQLite/WAL handles so the new process image does
    not fight its own locks. Each closer is attempted independently and every
    failure (missing module, close error) is ignored: a teardown problem must
    never block the heal.
    """

    def _close_local_context() -> None:
        from local_context.db import close_local_context_db

        close_local_context_db()

    def _close_main_db() -> None:
        from db import close_db

        close_db()

    for closer in (_close_local_context, _close_main_db):
        try:
            closer()
        except Exception:
            # Best-effort by design; move on to the next closer.
            continue
1175
+
1176
+
1177
def _drift_hard_exit() -> None:
    """Terminate the process immediately with the drift exit status.

    This is the fallback used whenever the in-place re-exec cannot run: the
    process exits so that a relaunching client (e.g. Claude Code) spawns a
    fresh process on the new code. If exiting with ``_DRIFT_EXIT_CODE``
    raises, the process exits with status 1 instead.
    """
    terminate = os._exit
    try:
        terminate(_DRIFT_EXIT_CODE)
    except Exception:
        terminate(1)
1140
1184
 
1141
1185
 
1186
def _request_drift_exit() -> None:
    """Heal a post-update fingerprint drift transparently via in-place re-exec.

    The live process is replaced with ``os.execv`` (same PID, same inherited
    stdio pipes to the MCP client) so it loads the new code on disk without
    the client/session breaking and without the user restarting anything.

    Order of operations:
      0. Kill switch, non-POSIX platform, or resident service -> plain exit.
      1. While tool calls are in flight, reschedule itself on the running
         event loop (at most ``_DRIFT_REEXEC_DEFER_MAX`` times).
      2. Anti-loop: if this chain already re-exec'd toward the same target
         fingerprint, or hit ``_SELFHEAL_MAX_GENERATIONS``, plain exit.
      3. Resolve the entrypoint (active snapshot ``server.py``, else
         ``sys.argv[1]``); none found -> plain exit.
      4. Best-effort teardown, stamp the anti-loop environment variables,
         flush stdio, and ``execv``.

    FAIL-OPEN: any exception on the way falls back to ``_drift_hard_exit``,
    so this is never worse than the plain exit. Returns ``None``; on a
    successful re-exec it does not return at all.
    """
    global _drift_reexec_defers
    try:
        # 0. Opt-out / non-posix / resident service -> plain exit.
        #    (execv on native Windows spawns+exits, dropping inherited stdio.)
        if _selfheal_reexec_disabled() or os.name != "posix" or _running_as_resident_service():
            _drift_hard_exit()
            return

        # 1. Never re-exec mid tool-call: defer until in-flight calls drain.
        if _INFLIGHT_TOOL_CALLS > 0 and _drift_reexec_defers < _DRIFT_REEXEC_DEFER_MAX:
            _drift_reexec_defers += 1
            try:
                loop = asyncio.get_running_loop()
                loop.call_later(_DRIFT_EXIT_DELAY_SECONDS, _request_drift_exit)
                return
            except RuntimeError:
                pass  # no running loop -> proceed to re-exec now

        # 2. Resolve the target fingerprint + anti-loop guards.
        try:
            target_fp = installed_runtime_fingerprint(use_cache=False) or ""
        except Exception:
            target_fp = ""
        already_healed_target = bool(target_fp) and os.environ.get("NEXO_SELFHEAL_GEN", "") == target_fp[:16]
        try:
            count = int(os.environ.get("NEXO_SELFHEAL_COUNT", "0") or "0")
        except ValueError:
            count = 0
        # We already re-exec'd toward this exact target (or hit the cap) and
        # STILL drift -> the update is broken/unstable; stop looping and exit
        # once so a relaunching client gets a clean process.
        if already_healed_target or count >= _SELFHEAL_MAX_GENERATIONS:
            _drift_hard_exit()
            return

        # 3. Resolve the new entrypoint (the active snapshot's server.py).
        server_path = ""
        try:
            candidate = active_runtime_root() / "server.py"
            if candidate.is_file():
                server_path = str(candidate)
        except Exception:
            server_path = ""
        if not server_path and len(sys.argv) > 1 and os.path.isfile(sys.argv[1]):
            server_path = sys.argv[1]
        if not server_path:
            _drift_hard_exit()
            return

        # 4. Best-effort teardown, stamp anti-loop env, re-exec in place.
        _selfheal_teardown()
        os.environ["NEXO_SELFHEAL_COUNT"] = str(count + 1)
        if target_fp:
            os.environ["NEXO_SELFHEAL_GEN"] = target_fp[:16]
        argv_tail = sys.argv[2:] if len(sys.argv) > 2 else []

        # execv replaces the process image without flushing Python-level file
        # buffers, so anything still buffered on stdout/stderr would be lost.
        # Flush best-effort; a closed or missing stream must not block the heal.
        for stream in (sys.stdout, sys.stderr):
            try:
                stream.flush()
            except Exception:
                pass

        os.execv(sys.executable, [sys.executable, server_path, *argv_tail])
    except Exception:
        # Fail-open: any failure (execv raised, teardown, platform) -> plain exit.
        _drift_hard_exit()
1253
+
1254
+
1142
1255
  def _schedule_drift_autoexit() -> None:
1143
1256
  global _DRIFT_AUTOEXIT_SCHEDULED
1144
1257
  if _DRIFT_AUTOEXIT_SCHEDULED:
@@ -1152,6 +1265,25 @@ def _schedule_drift_autoexit() -> None:
1152
1265
  loop.call_later(_DRIFT_EXIT_DELAY_SECONDS, _request_drift_exit)
1153
1266
 
1154
1267
 
1268
def maybe_selfheal_on_boot(client: str = "") -> bool:
    """Pre-serve drift check for a freshly spawned stdio child.

    If the child already loaded stale code (launched right after an update)
    and would only ever receive allowlisted tools, the per-call middleware
    never trips. This re-execs into the new code BEFORE the first request is
    served.

    Fail-open: when re-exec is unavailable (kill switch set, non-POSIX
    platform, or resident service) nothing is attempted and the process
    serves as-is. Without this guard those cases would reach the plain-exit
    fallback of ``_request_drift_exit`` and kill the child before it served
    anything. Any exception also results in ``False``.

    Call only in stdio-child mode (the resident HTTP service self-retires).

    Args:
        client: Client identifier forwarded to ``resolve_restart_required``.

    Returns:
        ``False`` when no heal was attempted. ``True`` only if
        ``_request_drift_exit`` returned; normally it does not, because
        ``execv`` replaces the process. Its anti-loop and missing-entrypoint
        guards can still end the process with a plain exit.
    """
    try:
        # Same preconditions as step 0 of _request_drift_exit, checked here
        # so that an unavailable re-exec means "serve as-is", not "exit".
        if _selfheal_reexec_disabled() or os.name != "posix" or _running_as_resident_service():
            return False
        state = resolve_restart_required(client=client)
        if not state.get("restart_required"):
            return False
        # Only code drift is healed at boot; other restart reasons are left
        # to the per-call middleware.
        if state.get("reason") not in ("fingerprint_mismatch", "version_mismatch"):
            return False
        _request_drift_exit()
        return True
    except Exception:
        return False
1285
+
1286
+
1155
1287
  @dataclass
1156
1288
  class RestartRequiredMiddleware(Middleware):
1157
1289
  client: str = ""
@@ -1214,11 +1346,18 @@ class RestartRequiredMiddleware(Middleware):
1214
1346
  )
1215
1347
 
1216
1348
  async def on_call_tool(self, context, call_next):
1349
+ global _INFLIGHT_TOOL_CALLS
1217
1350
  tool_name = str(getattr(context.message, "name", "") or "").strip()
1218
1351
  state = resolve_restart_required(client=self.client)
1219
1352
  state = self._ack_current_client_if_restarted(state)
1220
1353
  if not state["restart_required"] or tool_name in RESTART_ALLOWLIST:
1221
- return await call_next(context)
1354
+ # Track in-flight executions so a drift self-heal re-exec defers until
1355
+ # no tool call is mid-stream (avoids desyncing the JSON-RPC framing).
1356
+ _INFLIGHT_TOOL_CALLS += 1
1357
+ try:
1358
+ return await call_next(context)
1359
+ finally:
1360
+ _INFLIGHT_TOOL_CALLS -= 1
1222
1361
 
1223
1362
  payload = {
1224
1363
  "ok": False,
@@ -947,6 +947,131 @@ def _reconcile_finished_rows(conn, *, hours=24):
947
947
  return reconciled
948
948
 
949
949
 
950
def _replied_zombie_signal(conn, mid, *, has_sent_ledger):
    """Return ``(signal, sent_reference)`` if *mid* was already answered, else ``(None, None)``."""
    # Signal 1 — in-DB lifecycle marker keyed to this inbound MID.
    # An aggregate without GROUP BY always yields one row, so "no marker"
    # shows up as a NULL ts rather than as a missing row.
    ev = conn.execute(
        """
        SELECT event, MAX(timestamp) AS ts
        FROM email_events
        WHERE email_id = ?
          AND event IN ('replied', 'resolution', 'action_done')
        """,
        (mid,),
    ).fetchone()
    if ev and ev["ts"]:
        return f"email_event:{ev['event']}", ev["ts"]

    if not has_sent_ledger:
        return None, None

    # Signal 2 — durable outbound ledger pointing back at this MID.
    # instr() is a literal, case-sensitive substring test. LIKE would treat
    # '_' and '%' inside a Message-ID as wildcards and ignore ASCII case, so
    # a different MID could match and an unanswered email would be closed.
    sent = conn.execute(
        """
        SELECT message_id AS sent_mid, sent_at
        FROM sent_email_events
        WHERE in_reply_to = ?
           OR instr(references_header, ?) > 0
        ORDER BY sent_at DESC
        LIMIT 1
        """,
        (mid, mid),
    ).fetchone()
    if sent:
        return "sent_email_events", sent["sent_at"]
    return None, None


def _reconcile_replied_zombies(conn):
    """Close 'processing'/'pending' emails that were ALREADY replied to before
    the worker session marked them processed.

    Failure mode (self-critiques 1111/1112, 25-may-2026): a worker session
    sends the reply through ``nexo-send-reply.py`` but dies (exit -9) BEFORE
    it flips the DB row to a terminal status. The stuck/zombie recovery then
    resets the row to 'pending' and the daemon reinjects the MID, producing a
    DUPLICATE reply to the operator.

    This reconciler consults two durable signals that survive a session crash
    and, if either says the operator was already answered, closes the row as
    terminal ('processed') and logs a 'resolution' marker instead of letting
    it be reinjected:
      1. ``email_events`` lifecycle markers ('replied'/'resolution'/
         'action_done') for the inbound message id.
      2. ``sent_email_events`` rows whose In-Reply-To equals, or whose
         References header literally contains, the inbound ``message_id``.

    Matching is strictly per inbound message_id, so a fresh message in an
    already-answered thread (its own distinct MID) never false-positives.

    Args:
        conn: Open SQLite connection whose rows support access by column
            name. The caller owns the transaction; nothing is committed here.

    Returns:
        A list with one dict per closed email (``email_id``, ``subject``,
        ``previous_status``, ``signal``); empty when the ``emails`` table is
        missing or nothing matched.
    """
    if not _table_exists(conn, "emails"):
        return []

    cols = _email_table_columns(conn)
    has_sent_ledger = _table_exists(conn, "sent_email_events")

    rows = conn.execute(
        """
        SELECT message_id, subject, status
        FROM emails
        WHERE status IN ('processing', 'pending')
        """
    ).fetchall()

    # The SET clause depends only on the table's columns, so build it once.
    updates = ["status = 'processed'"]
    if "completed_at" in cols:
        updates.append(
            "completed_at = COALESCE(completed_at, datetime('now','localtime'))"
        )
    if "error" in cols:
        updates.append("error = NULL")
    update_sql = f"""
        UPDATE emails
        SET {', '.join(updates)}
        WHERE message_id = ?
          AND status IN ('processing', 'pending')
        """

    sanitized = []
    for row in rows:
        mid = row["message_id"]
        if not mid:
            continue

        signal, sent_reference = _replied_zombie_signal(
            conn, mid, has_sent_ledger=has_sent_ledger
        )
        if signal is None:
            continue

        conn.execute(update_sql, (mid,))
        _insert_event(
            conn,
            mid,
            "resolution",
            "Sanitized: reply already sent before BD close (zombie reconcile)",
            {
                "reason": "already_replied_reconciled",
                "previous_status": row["status"],
                "signal": signal,
                "sent_reference": sent_reference,
            },
        )
        log.warning(
            f"Sanitized already-replied zombie email: status={row['status']} "
            f"signal={signal} subj={(row['subject'] or '')[:40]} [{mid}] — "
            f"closed as 'processed', not reinjected"
        )
        sanitized.append(
            {
                "email_id": mid,
                "subject": row["subject"],
                "previous_status": row["status"],
                "signal": signal,
            }
        )

    return sanitized
1073
+
1074
+
950
1075
  def _recent_debt_flagged(conn, email_id, *, hours=6):
951
1076
  row = conn.execute(
952
1077
  """
@@ -1153,6 +1278,9 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
1153
1278
  return ""
1154
1279
  live_reconciled = _reconcile_processing_rows(conn)
1155
1280
  finished_reconciled = _reconcile_finished_rows(conn)
1281
+ # Close already-replied zombies BEFORE the 2h stuck-recovery below resets
1282
+ # them to 'pending', so the daemon never reinjects a MID we already answered.
1283
+ replied_sanitized = _reconcile_replied_zombies(conn)
1156
1284
 
1157
1285
  items = []
1158
1286
  now_label = datetime.now().isoformat(timespec="seconds")
@@ -1278,14 +1406,17 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
1278
1406
  conn.commit()
1279
1407
  conn.close()
1280
1408
 
1281
- if not items:
1409
+ if not items and not replied_sanitized:
1282
1410
  return ""
1283
1411
 
1284
- lines = ["== PENDING EMAIL DEBT DETECTED ==", "Prioritize closing or clarifying these threads before ignoring them:"]
1285
- for item in items[:max_items]:
1286
- lines.append(f"- {item['label']} ({item['detail']})")
1287
- if len(items) > max_items:
1288
- lines.append(f"- ... and {len(items) - max_items} more item(s)")
1412
+ lines = []
1413
+ if items:
1414
+ lines.append("== PENDING EMAIL DEBT DETECTED ==")
1415
+ lines.append("Prioritize closing or clarifying these threads before ignoring them:")
1416
+ for item in items[:max_items]:
1417
+ lines.append(f"- {item['label']} ({item['detail']})")
1418
+ if len(items) > max_items:
1419
+ lines.append(f"- ... and {len(items) - max_items} more item(s)")
1289
1420
  if recovered:
1290
1421
  lines.append("")
1291
1422
  lines.append(f"Auto-recovery applied: {len(recovered)} processing-stuck email(s) were reset to pending.")
@@ -1294,6 +1425,12 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
1294
1425
  lines.append(
1295
1426
  f"Reconciled {len(sent_reconciled)} processing email(s) with already-sent reply events; no re-open applied."
1296
1427
  )
1428
+ if replied_sanitized:
1429
+ lines.append("")
1430
+ lines.append(
1431
+ f"Sanitized {len(replied_sanitized)} already-replied email(s): closed as 'processed' "
1432
+ f"to prevent duplicate operator replies (no reinjection)."
1433
+ )
1297
1434
  total_reconciled = len(live_reconciled) + len(finished_reconciled)
1298
1435
  if total_reconciled:
1299
1436
  lines.append(f"Reconciled {total_reconciled} email(s) with inconsistent lifecycle state.")
package/src/server.py CHANGED
@@ -139,6 +139,7 @@ from tools_api_call import (
139
139
  from runtime_versioning import (
140
140
  RestartRequiredMiddleware,
141
141
  build_mcp_status,
142
+ maybe_selfheal_on_boot,
142
143
  prime_process_fingerprint,
143
144
  prime_process_version,
144
145
  )
@@ -3264,4 +3265,10 @@ if __name__ == "__main__":
3264
3265
  port=port,
3265
3266
  on_exit=lambda: (close_local_context_db(), close_db()),
3266
3267
  )
3268
+ else:
3269
+ # stdio child: if we booted already-stale (spawned right after an
3270
+ # update), re-exec into the new code transparently before serving —
3271
+ # covers the case where only allowlisted tools are called and the
3272
+ # per-call drift middleware would never trip. Fail-open.
3273
+ maybe_selfheal_on_boot(client=str(os.environ.get("NEXO_MCP_CLIENT", "") or "").strip())
3267
3274
  mcp.run(**run_kwargs)