npm - nexo-brain - Version diff - 7.36.0 → 7.37.0 - Mend

nexo-brain 7.36.0 → 7.37.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/.claude-plugin/plugin.json +1 -1
package/README.md +1 -1
package/package.json +1 -1
package/src/runtime_versioning.py +141 -2
package/src/scripts/nexo-email-monitor.py +143 -6
package/src/server.py +7 -0

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "nexo-brain",
-  "version": "7.36.0",
+  "version": "7.37.0",
   "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
   "author": {
     "name": "NEXO Brain",

package/README.md CHANGED Viewed

@@ -18,7 +18,7 @@
 [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
-Version `7.36.0` is the current packaged-runtime line. Minor release - local index disk reclaim: the local file/code index (`local-context.db`) no longer grows without bound. It now uses `auto_vacuum=INCREMENTAL` plus a one-time guarded `VACUUM` to convert existing databases, stores embeddings as compact float32 BLOBs instead of JSON text (~4-6x smaller, back-compatible dual-write/dual-read with a resumable backfill and kill switches), reclaims disk on purge/clear, and the daily self-audit now actively compacts at its size cap (`NEXO_LOCAL_INDEX_MAX_BYTES`) instead of only warning. An established index reclaims ~10-20GB immediately and grows several-fold slower; the backup subsystem was audited and is already bounded. Builds on v7.35.0 (selective forget + recurring-incident diagnostic templates).
+Version `7.37.0` is the current packaged-runtime line. Minor release - transparent server self-heal: when an update lands while a Brain MCP server is already running, the resident stdio child now re-execs itself in place (same process, same live MCP connection) instead of telling the user to restart, so the updated code runs immediately with nothing visible. Fail-open (non-POSIX, re-exec error, resident service, or `NEXO_DISABLE_SELFHEAL_REEXEC` kill switch all fall back to the prior safe hard-exit), anti-loop (bounded generations + same-target guard), defers past any in-flight tool call, and a boot-time pre-serve heal. Also fixes email-monitor zombie reinjection: an already-replied email left in 'processing' after a crash is closed as terminal 'processed' and never re-sent as a duplicate reply. Builds on v7.36.0 (local index disk reclaim).
 Previously in `7.31.9`: patch release over v7.31.8 - UI release closeout now has to prove the original reported symptom was reopened with observable evidence before claiming the release is ready.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "nexo-brain",
-  "version": "7.36.0",
+  "version": "7.37.0",
   "mcpName": "io.github.wazionapps/nexo",
   "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
   "homepage": "https://nexo-brain.com",

package/src/runtime_versioning.py CHANGED Viewed

@@ -1130,15 +1130,128 @@ def prime_process_fingerprint() -> str:
 _DRIFT_AUTOEXIT_SCHEDULED = False
 _DRIFT_EXIT_CODE = 75
 _DRIFT_EXIT_DELAY_SECONDS = 0.5
+# Anti crash-loop: cap how many times one process-chain may self-heal-reexec
+# before giving up and falling back to a plain exit. A half-written update or
+# an unreadable tree must never thrash.
+_SELFHEAL_MAX_GENERATIONS = 3
+# Tool calls currently executing: never re-exec mid-request (would desync the
+# JSON-RPC stream of a sibling call). Incremented/decremented in on_call_tool.
+_INFLIGHT_TOOL_CALLS = 0
+_DRIFT_REEXEC_DEFER_MAX = 20
+_drift_reexec_defers = 0
-def _request_drift_exit() -> None:
+def _selfheal_reexec_disabled() -> bool:
+    return str(os.environ.get("NEXO_DISABLE_SELFHEAL_REEXEC", "") or "").strip().lower() in {"1", "true", "yes"}
+def _running_as_resident_service() -> bool:
+    # The resident HTTP runtime-service serves multiple clients and has its own
+    # self-retire (start_resident_obsolescence_watch). It must NOT execv. Lazy
+    # import to avoid a circular import; fall back to an env sentinel.
+    try:
+        from runtime_service import is_runtime_service_process
+        return bool(is_runtime_service_process())
+    except Exception:
+        return str(os.environ.get("NEXO_RUNTIME_SERVICE", "") or "").strip().lower() in {"1", "true", "yes"}
+def _selfheal_teardown() -> None:
+    """Release SQLite/WAL handles before re-exec so the new image does not fight
+    its own locks. Best-effort: a teardown failure must never block the heal."""
+    try:
+        from local_context.db import close_local_context_db
+        close_local_context_db()
+    except Exception:
+        pass
+    try:
+        from db import close_db
+        close_db()
+    except Exception:
+        pass
+def _drift_hard_exit() -> None:
+    # Fallback (today's behavior): exit so a relaunching client (e.g. Claude
+    # Code) spawns a fresh process on the new code. Used when re-exec can't run.
     try:
         os._exit(_DRIFT_EXIT_CODE)
     except Exception:
         os._exit(1)
+def _request_drift_exit() -> None:
+    """Heal a post-update fingerprint drift TRANSPARENTLY: re-exec the live
+    process in place (os.execv -> same PID, same inherited stdio pipes to the
+    MCP client) so it loads the new code on disk without the client/session
+    breaking and without the user restarting anything. Falls back to a plain
+    exit on any obstacle. FAIL-OPEN: this must never be worse than today's exit.
+    """
+    global _drift_reexec_defers
+    try:
+        # 0. Opt-out / non-posix / resident service -> today's behavior.
+        #    (execv on native Windows spawns+exits, dropping inherited stdio.)
+        if _selfheal_reexec_disabled() or os.name != "posix" or _running_as_resident_service():
+            _drift_hard_exit()
+            return
+        # 1. Never re-exec mid tool-call: defer until in-flight calls drain.
+        if _INFLIGHT_TOOL_CALLS > 0 and _drift_reexec_defers < _DRIFT_REEXEC_DEFER_MAX:
+            _drift_reexec_defers += 1
+            try:
+                loop = asyncio.get_running_loop()
+                loop.call_later(_DRIFT_EXIT_DELAY_SECONDS, _request_drift_exit)
+                return
+            except RuntimeError:
+                pass  # no running loop -> proceed to re-exec now
+        # 2. Resolve the target fingerprint + anti-loop guards.
+        try:
+            target_fp = installed_runtime_fingerprint(use_cache=False) or ""
+        except Exception:
+            target_fp = ""
+        already_healed_target = bool(target_fp) and os.environ.get("NEXO_SELFHEAL_GEN", "") == target_fp[:16]
+        try:
+            count = int(os.environ.get("NEXO_SELFHEAL_COUNT", "0") or "0")
+        except ValueError:
+            count = 0
+        # We already re-exec'd toward this exact target (or hit the cap) and STILL
+        # drift -> the update is broken/unstable; stop looping, exit once so a
+        # relaunching client gets a clean process; a non-relaunching client keeps
+        # the stale-but-alive server returning mcp_restart_required.
+        if already_healed_target or count >= _SELFHEAL_MAX_GENERATIONS:
+            _drift_hard_exit()
+            return
+        # 3. Resolve the new entrypoint (the active snapshot's server.py).
+        server_path = ""
+        try:
+            candidate = active_runtime_root() / "server.py"
+            if candidate.is_file():
+                server_path = str(candidate)
+        except Exception:
+            server_path = ""
+        if not server_path and len(sys.argv) > 1 and os.path.isfile(sys.argv[1]):
+            server_path = sys.argv[1]
+        if not server_path:
+            _drift_hard_exit()
+            return
+        # 4. Best-effort teardown, stamp anti-loop env, re-exec in place.
+        _selfheal_teardown()
+        os.environ["NEXO_SELFHEAL_COUNT"] = str(count + 1)
+        if target_fp:
+            os.environ["NEXO_SELFHEAL_GEN"] = target_fp[:16]
+        argv_tail = sys.argv[2:] if len(sys.argv) > 2 else []
+        os.execv(sys.executable, [sys.executable, server_path, *argv_tail])
+    except Exception:
+        # Fail-open: any failure (execv raised, teardown, platform) -> plain exit.
+        _drift_hard_exit()
 def _schedule_drift_autoexit() -> None:
     global _DRIFT_AUTOEXIT_SCHEDULED
     if _DRIFT_AUTOEXIT_SCHEDULED:
@@ -1152,6 +1265,25 @@ def _schedule_drift_autoexit() -> None:
     loop.call_later(_DRIFT_EXIT_DELAY_SECONDS, _request_drift_exit)
+def maybe_selfheal_on_boot(client: str = "") -> bool:
+    """Pre-serve drift check: if a freshly-spawned stdio child already loaded
+    stale code (launched right after an update and would only ever receive
+    allowlisted tools, so the per-call middleware never trips), re-exec into the
+    new code BEFORE serving the first request. Normally does not return (execv
+    replaces the process). Fail-open: any error -> return False and serve as-is.
+    Call only in stdio-child mode (the resident HTTP service self-retires)."""
+    try:
+        state = resolve_restart_required(client=client)
+        if not state.get("restart_required"):
+            return False
+        if state.get("reason") not in ("fingerprint_mismatch", "version_mismatch"):
+            return False
+        _request_drift_exit()
+        return True
+    except Exception:
+        return False
 @dataclass
 class RestartRequiredMiddleware(Middleware):
     client: str = ""
@@ -1214,11 +1346,18 @@ class RestartRequiredMiddleware(Middleware):
         )
     async def on_call_tool(self, context, call_next):
+        global _INFLIGHT_TOOL_CALLS
         tool_name = str(getattr(context.message, "name", "") or "").strip()
         state = resolve_restart_required(client=self.client)
         state = self._ack_current_client_if_restarted(state)
         if not state["restart_required"] or tool_name in RESTART_ALLOWLIST:
-            return await call_next(context)
+            # Track in-flight executions so a drift self-heal re-exec defers until
+            # no tool call is mid-stream (avoids desyncing the JSON-RPC framing).
+            _INFLIGHT_TOOL_CALLS += 1
+            try:
+                return await call_next(context)
+            finally:
+                _INFLIGHT_TOOL_CALLS -= 1
         payload = {
             "ok": False,

package/src/scripts/nexo-email-monitor.py CHANGED Viewed

@@ -947,6 +947,131 @@ def _reconcile_finished_rows(conn, *, hours=24):
     return reconciled
+def _reconcile_replied_zombies(conn):
+    """Close 'processing'/'pending' emails that were ALREADY replied to before
+    the worker session marked them processed.
+    Failure mode (self-critiques 1111/1112, 25-may-2026): a worker session
+    sends the reply through ``nexo-send-reply.py`` but dies (exit -9) BEFORE
+    it flips the BD row to a terminal status. The stuck/zombie recovery then
+    resets the row to 'pending' and the daemon reinjects the MID, producing a
+    DUPLICATE reply to the operator.
+    This reconciler consults two durable signals that survive a session crash
+    and, if either says the operator was already answered, closes the row as
+    terminal ('processed') and logs a 'resolution' marker instead of letting it
+    be reinjected:
+      1. ``email_events`` lifecycle markers ('replied'/'resolution'/
+         'action_done') written by ``record_reply_lifecycle()`` at send time.
+      2. ``sent_email_events`` rows whose In-Reply-To / References point back at
+         the inbound ``message_id`` (the durable outbound ledger written by
+         ``record_sent_email()``).
+    Matching is strictly per inbound message_id, so a fresh message in an
+    already-answered thread (its own distinct MID) never false-positives.
+    """
+    if not _table_exists(conn, "emails"):
+        return []
+    cols = _email_table_columns(conn)
+    has_sent_ledger = _table_exists(conn, "sent_email_events")
+    rows = conn.execute(
+        """
+        SELECT message_id, subject, status
+        FROM emails
+        WHERE status IN ('processing', 'pending')
+        """
+    ).fetchall()
+    sanitized = []
+    for row in rows:
+        mid = row["message_id"]
+        if not mid:
+            continue
+        signal = None
+        sent_reference = None
+        # Signal 1 — in-DB lifecycle marker keyed to this inbound MID.
+        ev = conn.execute(
+            """
+            SELECT event, MAX(timestamp) AS ts
+            FROM email_events
+            WHERE email_id = ?
+              AND event IN ('replied', 'resolution', 'action_done')
+            """,
+            (mid,),
+        ).fetchone()
+        if ev and ev["ts"]:
+            signal = f"email_event:{ev['event']}"
+            sent_reference = ev["ts"]
+        # Signal 2 — durable outbound ledger pointing back at this MID.
+        if signal is None and has_sent_ledger:
+            sent = conn.execute(
+                """
+                SELECT message_id AS sent_mid, sent_at
+                FROM sent_email_events
+                WHERE in_reply_to = ?
+                   OR references_header LIKE '%' || ? || '%'
+                ORDER BY sent_at DESC
+                LIMIT 1
+                """,
+                (mid, mid),
+            ).fetchone()
+            if sent:
+                signal = "sent_email_events"
+                sent_reference = sent["sent_at"]
+        if signal is None:
+            continue
+        updates = ["status = 'processed'"]
+        if "completed_at" in cols:
+            updates.append(
+                "completed_at = COALESCE(completed_at, datetime('now','localtime'))"
+            )
+        if "error" in cols:
+            updates.append("error = NULL")
+        conn.execute(
+            f"""
+            UPDATE emails
+            SET {', '.join(updates)}
+            WHERE message_id = ?
+              AND status IN ('processing', 'pending')
+            """,
+            (mid,),
+        )
+        _insert_event(
+            conn,
+            mid,
+            "resolution",
+            "Sanitized: reply already sent before BD close (zombie reconcile)",
+            {
+                "reason": "already_replied_reconciled",
+                "previous_status": row["status"],
+                "signal": signal,
+                "sent_reference": sent_reference,
+            },
+        )
+        log.warning(
+            f"Sanitized already-replied zombie email: status={row['status']} "
+            f"signal={signal} subj={(row['subject'] or '')[:40]} [{mid}] — "
+            f"closed as 'processed', not reinjected"
+        )
+        sanitized.append(
+            {
+                "email_id": mid,
+                "subject": row["subject"],
+                "previous_status": row["status"],
+                "signal": signal,
+            }
+        )
+    return sanitized
 def _recent_debt_flagged(conn, email_id, *, hours=6):
     row = conn.execute(
         """
@@ -1153,6 +1278,9 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
         return ""
     live_reconciled = _reconcile_processing_rows(conn)
     finished_reconciled = _reconcile_finished_rows(conn)
+    # Close already-replied zombies BEFORE the 2h stuck-recovery below resets
+    # them to 'pending', so the daemon never reinjects a MID we already answered.
+    replied_sanitized = _reconcile_replied_zombies(conn)
     items = []
     now_label = datetime.now().isoformat(timespec="seconds")
@@ -1278,14 +1406,17 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
     conn.commit()
     conn.close()
-    if not items:
+    if not items and not replied_sanitized:
         return ""
-    lines = ["== PENDING EMAIL DEBT DETECTED ==", "Prioritize closing or clarifying these threads before ignoring them:"]
-    for item in items[:max_items]:
-        lines.append(f"- {item['label']} ({item['detail']})")
-    if len(items) > max_items:
-        lines.append(f"- ... and {len(items) - max_items} more item(s)")
+    lines = []
+    if items:
+        lines.append("== PENDING EMAIL DEBT DETECTED ==")
+        lines.append("Prioritize closing or clarifying these threads before ignoring them:")
+        for item in items[:max_items]:
+            lines.append(f"- {item['label']} ({item['detail']})")
+        if len(items) > max_items:
+            lines.append(f"- ... and {len(items) - max_items} more item(s)")
     if recovered:
         lines.append("")
         lines.append(f"Auto-recovery applied: {len(recovered)} processing-stuck email(s) were reset to pending.")
@@ -1294,6 +1425,12 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
         lines.append(
             f"Reconciled {len(sent_reconciled)} processing email(s) with already-sent reply events; no re-open applied."
         )
+    if replied_sanitized:
+        lines.append("")
+        lines.append(
+            f"Sanitized {len(replied_sanitized)} already-replied email(s): closed as 'processed' "
+            f"to prevent duplicate operator replies (no reinjection)."
+        )
     total_reconciled = len(live_reconciled) + len(finished_reconciled)
     if total_reconciled:
         lines.append(f"Reconciled {total_reconciled} email(s) with inconsistent lifecycle state.")

package/src/server.py CHANGED Viewed

@@ -139,6 +139,7 @@ from tools_api_call import (
 from runtime_versioning import (
     RestartRequiredMiddleware,
     build_mcp_status,
+    maybe_selfheal_on_boot,
     prime_process_fingerprint,
     prime_process_version,
 )
@@ -3264,4 +3265,10 @@ if __name__ == "__main__":
                 port=port,
                 on_exit=lambda: (close_local_context_db(), close_db()),
             )
+        else:
+            # stdio child: if we booted already-stale (spawned right after an
+            # update), re-exec into the new code transparently before serving —
+            # covers the case where only allowlisted tools are called and the
+            # per-call drift middleware would never trip. Fail-open.
+            maybe_selfheal_on_boot(client=str(os.environ.get("NEXO_MCP_CLIENT", "") or "").strip())
         mcp.run(**run_kwargs)