nexo-brain 7.35.0 → 7.37.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.35.0",
3
+ "version": "7.37.0",
4
4
  "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
5
5
  "author": {
6
6
  "name": "NEXO Brain",
package/README.md CHANGED
@@ -18,7 +18,7 @@
18
18
 
19
19
  [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
20
20
 
21
- Version `7.35.0` is the current packaged-runtime line. Minor release - Cognitive OS Ola 4: selective forget lets you delete a leaked secret or a wrong memory and prove it is gone (zeroed across every live store, `secure_delete=ON`) or correct a fact reversibly, recurring failure archetypes are distilled into reusable diagnostic templates primed before a matching action (strong/weak marker tiers so benign success phrasing never triggers them, guidance-only), and closing a local-only followup-runner is no longer mis-flagged as an external real-world action. Builds on v7.34.0 (working memory + self-error learning + associative graph + deep-sleep rewrite + evals).
21
+ Version `7.37.0` is the current packaged-runtime line. Minor release - transparent server self-heal: when an update lands while a Brain MCP server is already running, the resident stdio child now re-execs itself in place (same process, same live MCP connection) instead of telling the user to restart, so the updated code runs immediately with nothing visible. Fail-open (non-POSIX, re-exec error, resident service, or `NEXO_DISABLE_SELFHEAL_REEXEC` kill switch all fall back to the prior safe hard-exit), anti-loop (bounded generations + same-target guard), defers past any in-flight tool call, and a boot-time pre-serve heal. Also fixes email-monitor zombie reinjection: an already-replied email left in 'processing' after a crash is closed as terminal 'processed' and never re-sent as a duplicate reply. Builds on v7.36.0 (local index disk reclaim).
22
22
 
23
23
  Previously in `7.31.9`: patch release over v7.31.8 - UI release closeout now has to prove the original reported symptom was reopened with observable evidence before claiming the release is ready.
24
24
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.35.0",
3
+ "version": "7.37.0",
4
4
  "mcpName": "io.github.wazionapps/nexo",
5
5
  "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
6
6
  "homepage": "https://nexo-brain.com",
package/src/db/_schema.py CHANGED
@@ -2007,6 +2007,7 @@ def _m63_local_context_layer(conn):
2007
2007
  model_revision TEXT NOT NULL DEFAULT '',
2008
2008
  dimension INTEGER NOT NULL,
2009
2009
  vector_json TEXT NOT NULL,
2010
+ vector_blob BLOB,
2010
2011
  created_at REAL NOT NULL
2011
2012
  );
2012
2013
 
@@ -146,6 +146,14 @@ def _prune_db_backups(deep_sleep_dir: Path, report: dict, *, keep: int, apply: b
146
146
  _record_delete(report, backup, reason=f"old-db-backup:{family}", apply=apply)
147
147
  for sidecar in _sidecars(backup):
148
148
  _record_delete(report, sidecar, reason=f"old-db-backup-sidecar:{family}", apply=apply)
149
+ # Orphan sweep: -wal/-shm sidecars whose base .db no longer exists (left by
150
+ # interrupted/legacy deep-sleep processes). The online-backup path produces
151
+ # sidecar-free snapshots, so any sidecar with a missing base is a true
152
+ # orphan. Scoped strictly to this deep-sleep backup dir; never the live DBs.
153
+ for sidecar in list(deep_sleep_dir.glob("*-backup-*.db-wal")) + list(deep_sleep_dir.glob("*-backup-*.db-shm")):
154
+ base = Path(str(sidecar)[: -len("-wal")]) if str(sidecar).endswith("-wal") else Path(str(sidecar)[: -len("-shm")])
155
+ if not base.exists():
156
+ _record_delete(report, sidecar, reason="orphan-db-sidecar", apply=apply)
149
157
 
150
158
 
151
159
  def _prune_contexts(deep_sleep_dir: Path, report: dict, *, keep: int, apply: bool) -> None:
@@ -465,6 +465,7 @@ class HeadlessEnforcer:
465
465
  self.user_message_count = 0
466
466
  self.tool_timestamps: dict[str, float] = {}
467
467
  self.msg_since_tool: dict[str, int] = {}
468
+ self._tool_user_message_index: dict[str, int] = {}
468
469
  self.injection_queue: list[dict] = []
469
470
  self._started_at = time.time()
470
471
  self._injections_done = 0
@@ -551,6 +552,8 @@ class HeadlessEnforcer:
551
552
  # seen, periodic/conditional reminders stay suppressed so cron
552
553
  # runners can reach TURN_END instead of reopening the task loop.
553
554
  self._session_stopped: bool = False
555
+ self._first_visible_startup_gate_fired: bool = False
556
+ self._first_visible_text_allowed: bool = False
554
557
  try:
555
558
  self._post_close_cooldown_seconds = max(
556
559
  0,
@@ -1036,6 +1039,52 @@ class HeadlessEnforcer:
1036
1039
  except Exception:
1037
1040
  pass
1038
1041
 
1042
def should_block_first_visible_text(self) -> bool:
    """Tell the caller whether visible assistant text must be withheld for now.

    Text is released (and the decision latched in
    ``_first_visible_text_allowed``) when no user message has been counted
    yet, or when all three preconditions hold:

    * ``nexo_startup`` appears in ``tools_called``;
    * at least one continuity tool appears in ``tools_called``;
    * ``nexo_heartbeat`` or ``nexo_task_open`` was recorded in
      ``_tool_user_message_index`` at a turn index >= the current
      ``user_message_count``.

    Otherwise the method returns True. The first time that happens it also
    enqueues a single reminder prompt through ``self._enqueue`` and sets
    ``_first_visible_startup_gate_fired`` so the reminder is not queued again.
    """
    if self._first_visible_text_allowed:
        return False
    if self.user_message_count <= 0:
        # Nothing from the user yet: there is no turn to gate.
        self._first_visible_text_allowed = True
        return False

    turn_now = int(self.user_message_count or 0)
    turn_of = self._tool_user_message_index
    latest_beat = max(
        turn_of.get("nexo_heartbeat", -1),
        turn_of.get("nexo_task_open", -1),
    )
    continuity_seen = bool(
        self.tools_called.intersection(
            {
                "nexo_smart_startup",
                "nexo_session_diary_read",
                "nexo_reminders",
                "nexo_checkpoint_read",
            }
        )
    )

    # (satisfied, label-reported-when-missing), in the order they are reported.
    requirements = (
        ("nexo_startup" in self.tools_called, "nexo_startup"),
        (continuity_seen, "continuidad minima"),
        (latest_beat >= turn_now, "nexo_heartbeat"),
    )
    missing = [label for satisfied, label in requirements if not satisfied]

    if not missing:
        self._first_visible_text_allowed = True
        return False

    if not self._first_visible_startup_gate_fired:
        reminder = (
            "Before any visible answer, register the session, load minimal continuity, "
            "and associate the current user message with a heartbeat. Missing: "
            f"{', '.join(missing)}. Execute the required NEXO tool calls now. "
            "Do not produce visible text for this reminder."
        )
        self._enqueue(
            reminder,
            "first-visible-startup-heartbeat-gate",
            rule_id="R38_first_visible_startup_gate",
        )
        self._first_visible_startup_gate_fired = True
    return True
1087
+
1039
1088
  def _check_capability_denial_requires_reality(self, text: str):
1040
1089
  """Block unsupported capability denials until a live source was checked."""
1041
1090
  if not text or not _CAPABILITY_DENIAL_RE.search(text):
@@ -2537,6 +2586,7 @@ class HeadlessEnforcer:
2537
2586
  self.tools_called.add(name)
2538
2587
  self.tool_timestamps[name] = time.time()
2539
2588
  self.msg_since_tool[name] = 0
2589
+ self._tool_user_message_index[name] = int(self.user_message_count or 0)
2540
2590
 
2541
2591
  # v7.6 conditional counter advance. Tools watched by a
2542
2592
  # conditional rule tick a counter on every non-matching call.
@@ -3346,6 +3396,14 @@ def run_with_enforcement(
3346
3396
  msg = event.get("message", {})
3347
3397
  for block in msg.get("content", []):
3348
3398
  if block.get("type") == "text":
3399
+ try:
3400
+ if enforcer.should_block_first_visible_text():
3401
+ item = enforcer.flush()
3402
+ if item:
3403
+ _inject(item["prompt"])
3404
+ return False
3405
+ except Exception as _startup_gate_exc: # noqa: BLE001
3406
+ _logger.warning("first visible startup gate failed: %s", _startup_gate_exc)
3349
3407
  collected_text.append(block["text"])
3350
3408
  # R16 — probe each assistant text block as it arrives
3351
3409
  # so a declared-done line is caught on the same turn
@@ -374,6 +374,118 @@ def _write_json(path: Path, payload: dict) -> None:
374
374
  tmp.replace(path)
375
375
 
376
376
 
377
def _pending_trace_path(sid: str) -> Path:
    """Return the per-session post-change trace file inside the closeout dir.

    Every character of ``sid`` that is not alphanumeric, ``-`` or ``_`` is
    replaced by ``_`` before it is embedded in the file name; an empty ``sid``
    is treated as ``"unknown"``.
    """
    raw_sid = sid or "unknown"
    cleaned = "".join(
        char if (char.isalnum() or char in "-_") else "_"
        for char in raw_sid
    )
    return _production_closeout_dir() / f"post-change-trace-{cleaned}.json"
380
+
381
+
382
+ def _split_files(value: object) -> set[str]:
383
+ if value is None:
384
+ return set()
385
+ if isinstance(value, (list, tuple, set)):
386
+ raw = "\n".join(str(item) for item in value)
387
+ else:
388
+ raw = str(value)
389
+ parts = re.split(r"[\n,;]+", raw)
390
+ return {part.strip() for part in parts if part and part.strip()}
391
+
392
+
393
def _record_post_change_trace(payload: dict, sid: str) -> None:
    """Fold one tool payload into the session's pending post-change trace.

    The trace (loaded from the session's trace file, or started fresh) keeps
    three file sets: files touched by shared mutations or listed on task
    close, files passed to a guard check, and files covered by a change-log
    call. It also keeps a ``production_mutation`` flag and the last mutation
    command, truncated to 500 characters. The trace is written back only when
    at least one set is non-empty or the production flag is set.
    """
    sid = sid or "unknown"
    trace_path = _pending_trace_path(sid)
    trace = _read_json(trace_path)
    if not trace:
        trace = {
            "sid": sid,
            "touched_files": [],
            "guard_files": [],
            "change_log_files": [],
            "production_mutation": False,
            "created_at": time.time(),
        }
    tool = _tool_name(payload)
    args = _tool_input(payload)
    command = _extract_command(payload)

    touched_set = set(trace.get("touched_files") or [])
    guard_set = set(trace.get("guard_files") or [])
    logged_set = set(trace.get("change_log_files") or [])

    if _is_shared_mutation_payload(payload):
        for key in ("file_path", "path", "files", "paths"):
            touched_set.update(_split_files(args.get(key)))
        if command:
            trace["last_mutation_command"] = command[:500]
            if _is_production_mutation_command(command):
                trace["production_mutation"] = True

    if tool in {"nexo_guard_check", "mcp__nexo__nexo_guard_check"}:
        guard_set.update(_split_files(args.get("files")))

    if _is_change_log_tool(tool):
        for key in ("files", "files_changed"):
            logged_set.update(_split_files(args.get(key)))
        # A change-log call that names no files is taken to cover everything
        # touched so far.
        if touched_set and not logged_set:
            logged_set.update(touched_set)

    if _is_task_close_tool(tool):
        touched_set.update(_split_files(args.get("files_changed")))

    trace.update(
        touched_files=sorted(touched_set),
        guard_files=sorted(guard_set),
        change_log_files=sorted(logged_set),
        updated_at=time.time(),
    )

    if any((touched_set, guard_set, logged_set, trace.get("production_mutation"))):
        _write_json(trace_path, trace)
442
+
443
+
444
def _missing_trace_items(payload: dict, sid: str) -> list[str]:
    """List what a task-close payload still lacks versus the pending trace.

    Returns an empty list when the payload is not a task-close call, when no
    trace exists for the session, or when the trace records neither touched
    files nor a production mutation. Otherwise the returned labels name the
    missing pieces: guard runs, a change-log record, or a (complete)
    ``files_changed`` list.
    """
    if not _is_task_close_tool(_tool_name(payload)):
        return []
    trace = _read_json(_pending_trace_path(sid or "unknown"))
    if not trace:
        return []
    args = _tool_input(payload)
    touched_set = set(trace.get("touched_files") or [])
    is_production = trace.get("production_mutation")
    if not (touched_set or is_production):
        return []
    guard_set = set(trace.get("guard_files") or [])
    logged_set = set(trace.get("change_log_files") or [])
    declared = _split_files(args.get("files_changed"))

    gaps: list[str] = []
    if touched_set and not guard_set:
        gaps.append("guardias ejecutados")
    if is_production and not logged_set and not _task_close_payload_has_change_trace(payload):
        gaps.append("registro de cambios")
    if touched_set:
        # The two files_changed complaints are mutually exclusive.
        if not declared:
            gaps.append("files_changed")
        elif not touched_set.issubset(declared):
            gaps.append("files_changed completo")
    return gaps
468
+
469
+
470
def check_post_change_trace_closeout(payload: dict, sid: str) -> str | None:
    """Record the payload in the post-change trace and gate task close on it.

    Every payload is first folded into the session trace. When nothing is
    missing, ``None`` is returned; if the payload is a task-close call the
    trace file is also removed. When items are missing, a blocking message is
    returned (passed through ``append_operator_language_contract``) naming
    the missing items and up to six touched files.

    Args:
        payload: The hook payload for the tool call being inspected.
        sid: Session id; an empty value is treated as ``"unknown"``.

    Returns:
        The blocking message, or ``None`` when the close may proceed.
    """
    if not sid:
        sid = "unknown"
    _record_post_change_trace(payload, sid)
    missing = _missing_trace_items(payload, sid)
    if not missing:
        if _is_task_close_tool(_tool_name(payload)):
            _pending_trace_path(sid).unlink(missing_ok=True)
        return None
    # The trace is re-read here; fall back to an empty mapping if the read
    # yields nothing (file removed or unreadable in the meantime) so the
    # blocking message is still produced instead of raising AttributeError.
    trace = _read_json(_pending_trace_path(sid)) or {}
    files = ", ".join((trace.get("touched_files") or [])[:6]) or "cambio detectado"
    message = (
        "Cierre bloqueado: antes de marcar completado hay que cuadrar archivos tocados, "
        f"guardias y registro de cambios. Falta: {', '.join(missing)}. "
        f"Archivos detectados: {files}."
    )
    return append_operator_language_contract(message)
487
+
488
+
377
489
  def check_production_change_log_closeout(payload: dict, sid: str) -> str | None:
378
490
  if not sid:
379
491
  sid = "unknown"
@@ -551,6 +663,7 @@ def main() -> int:
551
663
  sid = _resolve_sid_from_payload(payload)
552
664
  reminder = check_inbox_and_emit_reminder(sid)
553
665
  change_log_message = check_production_change_log_closeout(payload, sid)
666
+ post_change_trace_message = check_post_change_trace_closeout(payload, sid)
554
667
  shared_scope_message = check_shared_scope_closeout(payload)
555
668
  g1_message: str | None = None
556
669
  try:
@@ -562,6 +675,7 @@ def main() -> int:
562
675
  protocol_message,
563
676
  reminder,
564
677
  change_log_message,
678
+ post_change_trace_message,
565
679
  shared_scope_message,
566
680
  g1_message,
567
681
  )
@@ -7,6 +7,7 @@ import re
7
7
  import shutil
8
8
  import sqlite3
9
9
  import stat
10
+ import struct
10
11
  import subprocess
11
12
  import sys
12
13
  import time
@@ -56,6 +57,16 @@ FTS_BACKFILL_BATCH = int(os.environ.get("NEXO_LOCAL_FTS_BACKFILL_BATCH", "500")
56
57
  FTS_MIGRATION_CURSOR_KEY = "fts_migration_cursor"
57
58
  FTS_MIGRATION_DONE_KEY = "fts_migration_done"
58
59
  FTS_BACKFILL_TOTAL_KEY = "fts_backfill_total"
60
+ # Compact float32 BLOB embedding storage (replaces JSON-text vector_json, which
61
+ # bloated the index ~4-6x). Dual-write both columns, read prefers the BLOB and
62
+ # falls back to JSON, backfill converts old rows incrementally. Feature flags
63
+ # are kill switches that revert to JSON-only with no redeploy.
64
+ EMB_BLOB_WRITE_ENABLED = os.environ.get("NEXO_LOCAL_EMB_BLOB_WRITE", "1") != "0"
65
+ EMB_BLOB_READ_ENABLED = os.environ.get("NEXO_LOCAL_EMB_BLOB_READ", "1") != "0"
66
+ EMB_BLOB_BACKFILL_BATCH = int(os.environ.get("NEXO_LOCAL_EMB_BLOB_BACKFILL_BATCH", "500") or "500")
67
+ EMB_BLOB_CURSOR_KEY = "emb_blob_backfill_cursor"
68
+ EMB_BLOB_DONE_KEY = "emb_blob_backfill_done"
69
+ EMB_BLOB_TOTAL_KEY = "emb_blob_backfill_total"
59
70
  EMBEDDING_REFRESH_JOB = "embedding_refresh"
60
71
  ENTITY_FACTS_JOB = "entity_facts"
61
72
  BACKGROUND_INDEX_JOB_TYPES = {ENTITY_FACTS_JOB}
@@ -2888,6 +2899,47 @@ def _latest_version_id(conn, asset_id: str) -> str:
2888
2899
  return row["version_id"] if row else stable_id("ver", asset_id)
2889
2900
 
2890
2901
 
2902
+ def _encode_embedding_blob(vector) -> bytes | None:
2903
+ """Pack a vector of floats into a little-endian float32 BLOB (dimension*4
2904
+ bytes). Returns None when blob writes are disabled or the vector is empty,
2905
+ so the caller still writes vector_json (the source of truth during the
2906
+ transition). float32 vs the legacy float64 JSON is a deliberate, negligible
2907
+ cosine drift (vectors are L2-normalized / already 8-dp-rounded)."""
2908
+ if not EMB_BLOB_WRITE_ENABLED:
2909
+ return None
2910
+ try:
2911
+ floats = [float(v) for v in (vector or [])]
2912
+ if not floats:
2913
+ return None
2914
+ return struct.pack(f"<{len(floats)}f", *floats)
2915
+ except (TypeError, ValueError, struct.error):
2916
+ return None
2917
+
2918
+
2919
+ def _decode_embedding(row) -> list:
2920
+ """Read a stored embedding, preferring the compact BLOB and falling back to
2921
+ the legacy JSON text. The BLOB is trusted only when its length matches
2922
+ dimension*4 (4 bytes per float32); a short/garbage blob falls through to
2923
+ JSON so it can never reach the cosine loop. Returns a plain Python list so
2924
+ embeddings.cosine() and the `elif vector:` truthiness need no changes."""
2925
+ if EMB_BLOB_READ_ENABLED:
2926
+ try:
2927
+ blob = row["vector_blob"]
2928
+ except (KeyError, IndexError):
2929
+ blob = None
2930
+ if blob:
2931
+ try:
2932
+ dim = int(row["dimension"] or 0)
2933
+ except (KeyError, IndexError, TypeError, ValueError):
2934
+ dim = 0
2935
+ if dim and len(blob) == dim * 4:
2936
+ try:
2937
+ return list(struct.unpack(f"<{dim}f", blob))
2938
+ except struct.error:
2939
+ pass # fall through to JSON
2940
+ return json_loads(row["vector_json"], [])
2941
+
2942
+
2891
2943
  def _insert_chunk_embedding(conn, asset_id: str, chunk_id: str, text: str) -> None:
2892
2944
  record = embeddings.embed_record(text)
2893
2945
  model_id = str(record["model_id"])
@@ -2895,8 +2947,8 @@ def _insert_chunk_embedding(conn, asset_id: str, chunk_id: str, text: str) -> No
2895
2947
  dimension = int(record["dimension"])
2896
2948
  conn.execute(
2897
2949
  """
2898
- INSERT INTO local_embeddings(embedding_id, asset_id, chunk_id, model_id, model_revision, dimension, vector_json, created_at)
2899
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
2950
+ INSERT INTO local_embeddings(embedding_id, asset_id, chunk_id, model_id, model_revision, dimension, vector_json, vector_blob, created_at)
2951
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
2900
2952
  """,
2901
2953
  (
2902
2954
  stable_id("emb", f"{chunk_id}:{model_id}:{model_revision}:{dimension}"),
@@ -2906,6 +2958,7 @@ def _insert_chunk_embedding(conn, asset_id: str, chunk_id: str, text: str) -> No
2906
2958
  model_revision,
2907
2959
  dimension,
2908
2960
  json_dumps(record["vector"]),
2961
+ _encode_embedding_blob(record["vector"]),
2909
2962
  now(),
2910
2963
  ),
2911
2964
  )
@@ -3555,6 +3608,13 @@ def run_once(
3555
3608
  _backfill_fts_rows(conn, batch_limit=FTS_BACKFILL_BATCH)
3556
3609
  except Exception:
3557
3610
  pass
3611
+ # Incremental embedding TEXT->BLOB backfill: same bounded one-batch-per-tick
3612
+ # discipline. Best-effort; skips when disabled or already done.
3613
+ if EMB_BLOB_BACKFILL_BATCH > 0:
3614
+ try:
3615
+ _backfill_embedding_blobs(conn, batch_limit=EMB_BLOB_BACKFILL_BATCH)
3616
+ except Exception:
3617
+ pass
3558
3618
  conn_after = _conn()
3559
3619
  initial_after = _initial_scan_status(conn_after, list_roots(readonly=False))
3560
3620
  blocking_active_after = _active_job_count(conn_after, blocking_only=True)
@@ -4603,6 +4663,76 @@ def _backfill_fts_rows(conn, *, batch_limit: int | None = None) -> dict:
4603
4663
  return _with_sqlite_busy_retry(_run)
4604
4664
 
4605
4665
 
4666
def _backfill_embedding_blobs(conn, *, batch_limit: int | None = None) -> dict:
    """Incrementally convert legacy ``vector_json`` rows to ``vector_blob``.

    One call processes at most ``batch_limit`` rows whose ``vector_blob`` is
    NULL, in rowid order, starting after the cursor stored under
    ``EMB_BLOB_CURSOR_KEY``. The existing JSON is converted in place; nothing
    is re-embedded. The cursor is advanced and committed once per batch, and
    ``EMB_BLOB_DONE_KEY`` is set when no candidate rows remain.

    Rows that cannot be converted are left JSON-only, but the cursor still
    moves past them so a single bad row cannot wedge the backfill. This
    covers a JSON length that differs from ``dimension``, JSON that is not a
    list, a non-numeric dimension, and values that cannot be packed as
    float32. Database errors from the UPDATE are not swallowed here, so the
    busy-retry wrapper still sees them.

    Args:
        conn: Open connection to the local index database.
        batch_limit: Maximum rows per call; defaults to
            ``EMB_BLOB_BACKFILL_BATCH``. A value <= 0 disables the backfill.

    Returns:
        A status dict with ``ok`` and ``done``. Skipped calls add
        ``skipped``; processed batches add ``processed``, ``converted`` and
        ``cursor``.
    """
    if batch_limit is None:
        batch_limit = EMB_BLOB_BACKFILL_BATCH
    batch_limit = int(batch_limit)
    if batch_limit <= 0:
        return {"ok": True, "skipped": "disabled", "done": _get_state_conn(conn, EMB_BLOB_DONE_KEY, "0") == "1"}
    if not EMB_BLOB_WRITE_ENABLED:
        return {"ok": True, "skipped": "blob_write_disabled", "done": False}
    if _get_state_conn(conn, EMB_BLOB_DONE_KEY, "0") == "1":
        return {"ok": True, "skipped": "already_done", "done": True}

    def _run() -> dict:
        try:
            cursor = int(_get_state_conn(conn, EMB_BLOB_CURSOR_KEY, "0") or "0")
        except Exception:
            cursor = 0
        # Record the initial amount of work once, for progress reporting.
        if _get_state_conn(conn, EMB_BLOB_TOTAL_KEY, "") == "":
            try:
                total_row = conn.execute(
                    "SELECT COUNT(*) AS total FROM local_embeddings WHERE vector_blob IS NULL"
                ).fetchone()
                _set_state_conn(conn, EMB_BLOB_TOTAL_KEY, str(int(total_row["total"] or 0)))
            except Exception:
                pass
        rows = conn.execute(
            """
            SELECT rowid AS rid, dimension, vector_json
            FROM local_embeddings
            WHERE rowid > ? AND vector_blob IS NULL
            ORDER BY rowid ASC
            LIMIT ?
            """,
            (cursor, batch_limit),
        ).fetchall()
        if not rows:
            _set_state_conn(conn, EMB_BLOB_DONE_KEY, "1")
            conn.commit()
            return {"ok": True, "done": True, "processed": 0, "cursor": cursor}
        max_rid = cursor
        converted = 0
        for row in rows:
            rid = int(row["rid"])
            if rid > max_rid:
                max_rid = rid
            # Per-row guard: any data problem skips just this row.
            try:
                dim = int(row["dimension"] or 0)
                vec = json_loads(row["vector_json"], [])
                if dim > 0 and isinstance(vec, (list, tuple)) and len(vec) == dim:
                    blob = _encode_embedding_blob(vec)
                else:
                    blob = None
            except (TypeError, ValueError, OverflowError):
                blob = None
            if blob is not None and len(blob) == dim * 4:
                conn.execute("UPDATE local_embeddings SET vector_blob=? WHERE rowid=?", (blob, rid))
                converted += 1
        _set_state_conn(conn, EMB_BLOB_CURSOR_KEY, str(max_rid))
        conn.commit()
        return {"ok": True, "done": False, "processed": len(rows), "converted": converted, "cursor": max_rid}

    return _with_sqlite_busy_retry(_run)
4734
+
4735
+
4606
4736
  def _context_candidate_rows(
4607
4737
  conn,
4608
4738
  entity_asset_ids: list[str],
@@ -4625,7 +4755,7 @@ def _context_candidate_rows(
4625
4755
  prefilter_rows = conn.execute(
4626
4756
  """
4627
4757
  SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary,
4628
- e.vector_json, e.model_id, e.model_revision, e.dimension
4758
+ e.vector_json, e.vector_blob, e.model_id, e.model_revision, e.dimension
4629
4759
  FROM local_chunks_fts f
4630
4760
  JOIN local_chunks c ON c.rowid = f.rowid
4631
4761
  JOIN local_assets a ON a.asset_id = c.asset_id
@@ -4657,7 +4787,7 @@ def _context_candidate_rows(
4657
4787
  prefilter_rows = conn.execute(
4658
4788
  f"""
4659
4789
  SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary,
4660
- e.vector_json, e.model_id, e.model_revision, e.dimension
4790
+ e.vector_json, e.vector_blob, e.model_id, e.model_revision, e.dimension
4661
4791
  FROM local_chunks c
4662
4792
  JOIN local_assets a ON a.asset_id = c.asset_id
4663
4793
  LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
@@ -4686,7 +4816,7 @@ def _context_candidate_rows(
4686
4816
  base_rows = conn.execute(
4687
4817
  """
4688
4818
  SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary,
4689
- e.vector_json, e.model_id, e.model_revision, e.dimension
4819
+ e.vector_json, e.vector_blob, e.model_id, e.model_revision, e.dimension
4690
4820
  FROM local_chunks c
4691
4821
  JOIN local_assets a ON a.asset_id = c.asset_id
4692
4822
  LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
@@ -4713,7 +4843,7 @@ def _context_candidate_rows(
4713
4843
  entity_rows = conn.execute(
4714
4844
  f"""
4715
4845
  SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary,
4716
- e.vector_json, e.model_id, e.model_revision, e.dimension
4846
+ e.vector_json, e.vector_blob, e.model_id, e.model_revision, e.dimension
4717
4847
  FROM local_chunks c
4718
4848
  JOIN local_assets a ON a.asset_id = c.asset_id
4719
4849
  LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
@@ -5200,7 +5330,7 @@ def _context_query_conn(
5200
5330
  for row in rows:
5201
5331
  if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
5202
5332
  continue
5203
- vector = json_loads(row["vector_json"], [])
5333
+ vector = _decode_embedding(row)
5204
5334
  text_score = _search_text_score(search_query, row["text"])
5205
5335
  path_score = _search_text_score(search_query, row["path"] or "")
5206
5336
  summary_score = _search_text_score(search_query, row["summary"] or "")
@@ -5756,6 +5886,14 @@ def purge_asset(asset_id: str) -> dict:
5756
5886
  conn = _conn()
5757
5887
  _purge_asset_ids(conn, [asset_id])
5758
5888
  conn.commit()
5889
+ # Reclaim the just-freed pages. Cheap incremental_vacuum (not a full VACUUM
5890
+ # — this is a frequent single-asset op; a 19GB rewrite per purge would be
5891
+ # catastrophic). No-op unless auto_vacuum=INCREMENTAL is active. Best-effort.
5892
+ try:
5893
+ conn.execute("PRAGMA incremental_vacuum")
5894
+ conn.commit()
5895
+ except Exception:
5896
+ pass
5759
5897
  log_event("info", "asset_purged", "Asset purged", asset_id=asset_id)
5760
5898
  return {"ok": True, "asset_id": asset_id}
5761
5899
 
@@ -5790,6 +5928,18 @@ def clear_index() -> dict:
5790
5928
  )
5791
5929
  _set_initial_index_complete(conn, False)
5792
5930
  conn.commit()
5931
+ # The index is now near-empty, so a full VACUUM rewrites a tiny file and
5932
+ # actually returns the freed disk to the OS (DELETE alone only moves pages
5933
+ # to the free-list). Checkpoint the WAL first so its pages are folded in,
5934
+ # VACUUM, then checkpoint again — in WAL mode VACUUM's rewrite lands in the
5935
+ # WAL, so the main file is only truncated by the trailing checkpoint. Works
5936
+ # regardless of auto_vacuum mode. Best-effort — never fail the clear.
5937
+ try:
5938
+ conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
5939
+ conn.execute("VACUUM")
5940
+ conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
5941
+ except Exception:
5942
+ pass
5793
5943
  log_event("warn", "index_cleared", "Local memory index cleared")
5794
5944
  return {"ok": True}
5795
5945
 
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import os
4
+ import shutil
4
5
  import sqlite3
5
6
  import time
6
7
  from pathlib import Path
@@ -14,6 +15,11 @@ LOCAL_CONTEXT_DB_NAME = "local-context.db"
14
15
  MIGRATION_STATE_KEY = "local_context_db_migrated_from_main"
15
16
  MIGRATION_SKIPPED_KEY = "local_context_db_migration_skipped"
16
17
  MAIN_CLEANUP_STATE_KEY = "local_context_main_tables_drained"
18
+ # One-time conversion flag: auto_vacuum=INCREMENTAL is a no-op on an already
19
+ # populated DB until exactly one full VACUUM runs. We do that conversion once
20
+ # per never-converted DB (guarded by free disk) and record it here so it never
21
+ # re-runs the expensive rewrite. See ensure_local_context_db().
22
+ AUTO_VACUUM_CONVERTED_KEY = "auto_vacuum_converted"
17
23
 
18
24
  LOCAL_CONTEXT_TABLES: tuple[str, ...] = (
19
25
  "local_index_roots",
@@ -77,6 +83,12 @@ def _connect(db_path: Path) -> sqlite3.Connection:
77
83
  conn = sqlite3.connect(str(db_path), timeout=max(_busy_timeout_ms() / 1000.0, 1.0), check_same_thread=False)
78
84
  conn.row_factory = sqlite3.Row
79
85
  conn.execute(f"PRAGMA busy_timeout={_busy_timeout_ms()}")
86
+ # auto_vacuum must be set BEFORE the first table is created to take effect on
87
+ # a brand-new DB (it is a no-op on an already-populated file — those are
88
+ # converted once via a guarded full VACUUM in ensure_local_context_db()).
89
+ # INCREMENTAL lets deletes (privacy purge, reconcile, purge_asset) reclaim
90
+ # pages via `PRAGMA incremental_vacuum` instead of growing the file forever.
91
+ conn.execute("PRAGMA auto_vacuum=INCREMENTAL")
80
92
  conn.execute("PRAGMA journal_mode=WAL")
81
93
  conn.execute("PRAGMA synchronous=NORMAL")
82
94
  conn.execute("PRAGMA temp_store=MEMORY")
@@ -119,10 +131,20 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
119
131
  _ensure_entity_dossier_schema(conn)
120
132
  _ensure_local_context_v2_schema(conn)
121
133
  _m84_local_chunks_fts(conn)
122
- conn.execute("PRAGMA user_version=84")
134
+ _m85_local_embeddings_blob(conn)
135
+ conn.execute("PRAGMA user_version=85")
123
136
  conn.commit()
124
137
 
125
138
 
139
def _m85_local_embeddings_blob(conn: sqlite3.Connection) -> None:
    """Schema v85: add the ``vector_blob`` BLOB column to ``local_embeddings``.

    The column is declared as a bare ``BLOB`` (nullable, no DEFAULT) and is
    only added when it is not already present, so re-running the migration
    is harmless. It sits alongside the legacy ``vector_json`` TEXT column.
    """
    table, column, declaration = "local_embeddings", "vector_blob", "BLOB"
    _add_column_if_missing(conn, table, column, declaration)
146
+
147
+
126
148
  def _table_columns(conn: sqlite3.Connection, table: str) -> set[str]:
127
149
  rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
128
150
  return {str(row["name"] if isinstance(row, sqlite3.Row) else row[1]) for row in rows}
@@ -421,11 +443,49 @@ def ensure_local_context_db() -> None:
421
443
  pass
422
444
  return
423
445
  _ensure_schema(_CONN)
446
+ _convert_auto_vacuum_once(_CONN, db_path)
424
447
  _LAST_MIGRATION_ATTEMPT = now
425
448
  migration = migrate_from_main_if_needed(_CONN)
426
449
  _READY = True
427
450
 
428
451
 
452
def _convert_auto_vacuum_once(conn: sqlite3.Connection, db_path: Path) -> None:
    """Flip an existing DB to ``auto_vacuum=INCREMENTAL``, at most once.

    The pragma only takes effect on a populated database after one full
    ``VACUUM``, which rewrites the whole file. The conversion therefore:

    * returns immediately when the done-flag is already recorded;
    * records the flag without vacuuming when the mode is already INCREMENTAL;
    * is skipped (to be retried on a later start) when free disk space is not
      more than twice the database size, or when sizes cannot be read;
    * records the flag only after the mode is confirmed INCREMENTAL;
    * finishes with a truncating WAL checkpoint, so the pages the ``VACUUM``
      wrote through the WAL do not stay behind as an oversized ``-wal`` file.

    Best-effort throughout: no exception leaves this function, so a failed
    conversion never blocks startup.

    Args:
        conn: The writer connection to the local-context database.
        db_path: Path of the database file, used for the size/free-space guard.
    """
    incremental = 2  # value reported by `PRAGMA auto_vacuum` for INCREMENTAL
    try:
        if _state(conn, AUTO_VACUUM_CONVERTED_KEY) == "1":
            return
        mode = int(conn.execute("PRAGMA auto_vacuum").fetchone()[0])
        if mode == incremental:  # already INCREMENTAL (e.g. freshly created DB)
            _set_state(conn, AUTO_VACUUM_CONVERTED_KEY, "1")
            conn.commit()
            return
        try:
            db_size = db_path.stat().st_size
            free = shutil.disk_usage(db_path.parent).free
        except OSError:
            return
        if free <= db_size * 2:
            # Not enough scratch room: leave the mode as is, retry later.
            return
        conn.execute("PRAGMA auto_vacuum=INCREMENTAL")
        conn.execute("VACUUM")
        new_mode = int(conn.execute("PRAGMA auto_vacuum").fetchone()[0])
        if new_mode == incremental:
            _set_state(conn, AUTO_VACUUM_CONVERTED_KEY, "1")
            conn.commit()
        # Fold the rewrite into the main file and shrink the WAL. Kept in its
        # own guard so a busy checkpoint cannot undo the recorded flag.
        try:
            conn.execute("PRAGMA wal_checkpoint(TRUNCATE)").fetchall()
        except Exception:
            pass
    except Exception:
        # Conversion is an optimization; never break startup over it.
        pass
487
+
488
+
429
489
  def get_local_context_db() -> sqlite3.Connection:
430
490
  ensure_local_context_db()
431
491
  assert _CONN is not None
@@ -1130,15 +1130,128 @@ def prime_process_fingerprint() -> str:
1130
1130
  _DRIFT_AUTOEXIT_SCHEDULED = False
1131
1131
  _DRIFT_EXIT_CODE = 75
1132
1132
  _DRIFT_EXIT_DELAY_SECONDS = 0.5
1133
+ # Anti crash-loop: cap how many times one process-chain may self-heal-reexec
1134
+ # before giving up and falling back to a plain exit. A half-written update or
1135
+ # an unreadable tree must never thrash.
1136
+ _SELFHEAL_MAX_GENERATIONS = 3
1137
+ # Tool calls currently executing: never re-exec mid-request (would desync the
1138
+ # JSON-RPC stream of a sibling call). Incremented/decremented in on_call_tool.
1139
+ _INFLIGHT_TOOL_CALLS = 0
1140
+ _DRIFT_REEXEC_DEFER_MAX = 20
1141
+ _drift_reexec_defers = 0
1133
1142
 
1134
1143
 
1135
- def _request_drift_exit() -> None:
1144
def _selfheal_reexec_disabled() -> bool:
    """Return True when the ``NEXO_DISABLE_SELFHEAL_REEXEC`` kill switch is set.

    The variable counts as set when its value, stripped and lower-cased, is
    one of ``1``, ``true`` or ``yes``. Anything else (including unset or
    empty) means the self-heal re-exec is allowed.
    """
    raw = os.environ.get("NEXO_DISABLE_SELFHEAL_REEXEC", "") or ""
    flag = str(raw).strip().lower()
    return flag in {"1", "true", "yes"}
1146
+
1147
+
1148
def _running_as_resident_service() -> bool:
    """Return True when this process is the resident runtime-service.

    Asks ``runtime_service.is_runtime_service_process()`` first. The import is
    done lazily inside the function to avoid a circular import. If the import
    or the call fails for any reason, the ``NEXO_RUNTIME_SERVICE`` environment
    variable is used as a sentinel instead (truthy values: ``1``, ``true``,
    ``yes``, case-insensitive, surrounding whitespace ignored).
    """
    try:
        from runtime_service import is_runtime_service_process as probe

        resident = bool(probe())
    except Exception:
        sentinel = str(os.environ.get("NEXO_RUNTIME_SERVICE", "") or "")
        resident = sentinel.strip().lower() in {"1", "true", "yes"}
    return resident
1158
+
1159
+
1160
def _selfheal_teardown() -> None:
    """Close the local-context DB and the main DB ahead of a re-exec.

    Each close is attempted independently and every error (including a failed
    import) is ignored, so a teardown problem can never block the heal.
    """

    def _close_local_context() -> None:
        from local_context.db import close_local_context_db as closer

        closer()

    def _close_main() -> None:
        from db import close_db as closer

        closer()

    # Same order as before: local-context first, then the main DB.
    for step in (_close_local_context, _close_main):
        try:
            step()
        except Exception:
            continue
1175
+
1176
+
1177
def _drift_hard_exit() -> None:
    """Terminate the process immediately with the drift exit code.

    This is the fallback used whenever the in-place re-exec cannot run: the
    process exits so that a client which relaunches its server gets a fresh
    process. If exiting with ``_DRIFT_EXIT_CODE`` raises, exit with status 1.
    """
    terminate = os._exit
    try:
        terminate(_DRIFT_EXIT_CODE)
    except Exception:
        terminate(1)
1140
1184
 
1141
1185
 
1186
def _request_drift_exit() -> None:
    """Heal a post-update fingerprint drift by re-executing the process in place.

    ``os.execv`` keeps the same PID and the inherited stdio pipes to the MCP
    client, so the new code on disk is loaded without the client having to
    restart anything. On any obstacle the function falls back to
    ``_drift_hard_exit()``. On the success path it does not return, because the
    process image is replaced.

    Steps:
      0. Kill switch set, non-POSIX platform, or resident service -> hard exit.
      1. While tool calls are in flight, re-schedule itself on the running
         event loop (at most ``_DRIFT_REEXEC_DEFER_MAX`` times).
      2. Anti-loop: hard exit if this process chain already re-exec'd toward
         the same target fingerprint or reached ``_SELFHEAL_MAX_GENERATIONS``.
      3. Resolve the entrypoint: the active runtime root's ``server.py``, else
         ``sys.argv[1]`` when it is an existing file; otherwise hard exit.
      4. Tear down DB handles, stamp the anti-loop environment variables,
         flush stdio and ``execv``.
    """
    global _drift_reexec_defers
    try:
        # 0. Opt-out / non-posix / resident service -> plain exit.
        #    (execv on native Windows spawns+exits, dropping inherited stdio.)
        if _selfheal_reexec_disabled() or os.name != "posix" or _running_as_resident_service():
            _drift_hard_exit()
            return

        # 1. Never re-exec mid tool-call: defer until in-flight calls drain.
        if _INFLIGHT_TOOL_CALLS > 0 and _drift_reexec_defers < _DRIFT_REEXEC_DEFER_MAX:
            _drift_reexec_defers += 1
            try:
                loop = asyncio.get_running_loop()
                loop.call_later(_DRIFT_EXIT_DELAY_SECONDS, _request_drift_exit)
                return
            except RuntimeError:
                pass  # no running loop -> proceed to re-exec now

        # 2. Resolve the target fingerprint + anti-loop guards.
        try:
            target_fp = installed_runtime_fingerprint(use_cache=False) or ""
        except Exception:
            target_fp = ""
        already_healed_target = bool(target_fp) and os.environ.get("NEXO_SELFHEAL_GEN", "") == target_fp[:16]
        try:
            count = int(os.environ.get("NEXO_SELFHEAL_COUNT", "0") or "0")
        except ValueError:
            count = 0
        # We already re-exec'd toward this exact target (or hit the cap) and
        # STILL drift -> the update is broken/unstable. Stop looping and exit,
        # so a relaunching client spawns a clean process.
        if already_healed_target or count >= _SELFHEAL_MAX_GENERATIONS:
            _drift_hard_exit()
            return

        # 3. Resolve the new entrypoint (the active snapshot's server.py).
        server_path = ""
        try:
            candidate = active_runtime_root() / "server.py"
            if candidate.is_file():
                server_path = str(candidate)
        except Exception:
            server_path = ""
        if not server_path and len(sys.argv) > 1 and os.path.isfile(sys.argv[1]):
            server_path = sys.argv[1]
        if not server_path:
            _drift_hard_exit()
            return

        # 4. Best-effort teardown, stamp anti-loop env, re-exec in place.
        _selfheal_teardown()
        os.environ["NEXO_SELFHEAL_COUNT"] = str(count + 1)
        if target_fp:
            os.environ["NEXO_SELFHEAL_GEN"] = target_fp[:16]
        argv_tail = sys.argv[2:] if len(sys.argv) > 2 else []
        # exec* replaces the process without flushing open file objects, so
        # anything still buffered on stdout (the JSON-RPC channel) or stderr
        # would be lost. Flush explicitly; a stream may be None or closed.
        for stream in (sys.stdout, sys.stderr):
            try:
                stream.flush()
            except Exception:
                pass
        os.execv(sys.executable, [sys.executable, server_path, *argv_tail])
    except Exception:
        # Fail-open: any failure (execv raised, teardown, platform) -> plain exit.
        _drift_hard_exit()
1253
+
1254
+
1142
1255
  def _schedule_drift_autoexit() -> None:
1143
1256
  global _DRIFT_AUTOEXIT_SCHEDULED
1144
1257
  if _DRIFT_AUTOEXIT_SCHEDULED:
@@ -1152,6 +1265,25 @@ def _schedule_drift_autoexit() -> None:
1152
1265
  loop.call_later(_DRIFT_EXIT_DELAY_SECONDS, _request_drift_exit)
1153
1266
 
1154
1267
 
1268
def maybe_selfheal_on_boot(client: str = "") -> bool:
    """Pre-serve drift check for a freshly spawned stdio child.

    If ``resolve_restart_required`` reports a restart is required because of a
    ``fingerprint_mismatch`` or ``version_mismatch``, hand over to
    ``_request_drift_exit()`` so the process re-execs into the new code before
    serving the first request. This covers sessions that only ever call
    allowlisted tools, where the per-call middleware would never trip.

    Fail-open: when the in-place re-exec is not available at all (kill switch
    set, non-POSIX platform, resident service) this returns False and the
    caller serves as-is. Delegating in those cases would hard-exit a process
    that has not served anything yet. Note that ``_request_drift_exit()`` can
    still terminate the process through its own internal fallbacks.

    Args:
        client: Client identifier forwarded to ``resolve_restart_required``.

    Returns:
        False when no heal was attempted (no drift, unsupported, or an error).
        True only if ``_request_drift_exit()`` returned, e.g. because it
        deferred; normally a successful heal replaces the process instead.
    """
    try:
        state = resolve_restart_required(client=client)
        if not state.get("restart_required"):
            return False
        if state.get("reason") not in ("fingerprint_mismatch", "version_mismatch"):
            return False
        # Re-exec cannot run here: keep the boot path fail-open instead of
        # letting _request_drift_exit() fall through to a hard exit.
        if _selfheal_reexec_disabled() or os.name != "posix" or _running_as_resident_service():
            return False
        _request_drift_exit()
        return True
    except Exception:
        return False
1285
+
1286
+
1155
1287
  @dataclass
1156
1288
  class RestartRequiredMiddleware(Middleware):
1157
1289
  client: str = ""
@@ -1214,11 +1346,18 @@ class RestartRequiredMiddleware(Middleware):
1214
1346
  )
1215
1347
 
1216
1348
  async def on_call_tool(self, context, call_next):
1349
+ global _INFLIGHT_TOOL_CALLS
1217
1350
  tool_name = str(getattr(context.message, "name", "") or "").strip()
1218
1351
  state = resolve_restart_required(client=self.client)
1219
1352
  state = self._ack_current_client_if_restarted(state)
1220
1353
  if not state["restart_required"] or tool_name in RESTART_ALLOWLIST:
1221
- return await call_next(context)
1354
+ # Track in-flight executions so a drift self-heal re-exec defers until
1355
+ # no tool call is mid-stream (avoids desyncing the JSON-RPC framing).
1356
+ _INFLIGHT_TOOL_CALLS += 1
1357
+ try:
1358
+ return await call_next(context)
1359
+ finally:
1360
+ _INFLIGHT_TOOL_CALLS -= 1
1222
1361
 
1223
1362
  payload = {
1224
1363
  "ok": False,
@@ -1043,19 +1043,63 @@ def check_db_size():
1043
1043
 
1044
1044
  local_ctx = paths_module.memory_dir() / "local-context.db"
1045
1045
  if local_ctx.exists():
1046
- size_gb = local_ctx.stat().st_size / (1024 ** 3)
1047
- if size_gb > 60:
1048
- finding(
1049
- "ERROR",
1050
- "database",
1051
- f"local-context.db is {size_gb:.1f} GB — local index runaway; purge + VACUUM (see roots/exclusions)",
1052
- )
1053
- elif size_gb > 25:
1054
- finding(
1055
- "WARN",
1056
- "database",
1057
- f"local-context.db is {size_gb:.1f} GB — local index growing; review indexed roots/exclusions",
1058
- )
1046
+ def _index_bytes() -> int:
1047
+ # Include the -wal/-shm sidecars: a large orphan WAL was invisible
1048
+ # to a bare stat() and could hide real growth.
1049
+ total = 0
1050
+ for suffix in ("", "-wal", "-shm"):
1051
+ p = local_ctx.with_name(local_ctx.name + suffix)
1052
+ try:
1053
+ total += p.stat().st_size
1054
+ except OSError:
1055
+ pass
1056
+ return total
1057
+
1058
+ # Distinct, stricter audit cap (NOT the 60 GiB runtime soft-pause
1059
+ # NEXO_LOCAL_CONTEXT_MAX_DB_BYTES). Default 25 GiB.
1060
+ try:
1061
+ hard_cap = int(os.environ.get("NEXO_LOCAL_INDEX_MAX_BYTES", str(25 * 1024 ** 3)) or str(25 * 1024 ** 3))
1062
+ except ValueError:
1063
+ hard_cap = 25 * 1024 ** 3
1064
+
1065
+ size_gb = _index_bytes() / (1024 ** 3)
1066
+ if size_gb > 25:
1067
+ # ACT, don't just warn (learning #824: the 268 GB burst went
1068
+ # unseen because this check was advisory-only). Reclaim freed
1069
+ # pages cheaply: checkpoint the WAL + incremental_vacuum (no-op
1070
+ # unless auto_vacuum=INCREMENTAL is active). Best-effort, short
1071
+ # timeout so we never fight the live indexer's write lock.
1072
+ reclaimed_gb = 0.0
1073
+ try:
1074
+ import sqlite3 as _sqlite3
1075
+
1076
+ conn = _sqlite3.connect(str(local_ctx), timeout=5.0)
1077
+ try:
1078
+ conn.execute("PRAGMA busy_timeout=5000")
1079
+ conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
1080
+ conn.execute("PRAGMA incremental_vacuum")
1081
+ conn.commit()
1082
+ finally:
1083
+ conn.close()
1084
+ after_gb = _index_bytes() / (1024 ** 3)
1085
+ reclaimed_gb = max(0.0, size_gb - after_gb)
1086
+ size_gb = after_gb
1087
+ except Exception:
1088
+ pass
1089
+ reclaimed_note = f" (reclaimed {reclaimed_gb:.1f} GB)" if reclaimed_gb > 0.05 else ""
1090
+ if (size_gb * 1024 ** 3) > hard_cap or size_gb > 60:
1091
+ finding(
1092
+ "ERROR",
1093
+ "database",
1094
+ f"local-context.db is {size_gb:.1f} GB{reclaimed_note} — over the local-index cap; "
1095
+ f"review indexed roots/exclusions or run clear_index (operator decision)",
1096
+ )
1097
+ else:
1098
+ finding(
1099
+ "WARN",
1100
+ "database",
1101
+ f"local-context.db is {size_gb:.1f} GB{reclaimed_note} — local index growing; review roots/exclusions",
1102
+ )
1059
1103
  except Exception as exc:
1060
1104
  finding("WARN", "database", f"Could not check local-context.db size: {exc}")
1061
1105
 
@@ -947,6 +947,131 @@ def _reconcile_finished_rows(conn, *, hours=24):
947
947
  return reconciled
948
948
 
949
949
 
950
def _reconcile_replied_zombies(conn):
    """Close 'processing'/'pending' emails that were ALREADY replied to.

    Failure mode: a worker session sends the reply but dies before it flips
    the DB row to a terminal status. Stuck/zombie recovery then resets the row
    to 'pending' and the message is reinjected, producing a DUPLICATE reply.

    For every open row this consults two durable signals and, if either says
    the message was already answered, closes the row as 'processed' and logs a
    'resolution' event instead of leaving it open:

      1. ``email_events`` rows for this message with event 'replied',
         'resolution' or 'action_done'.
      2. ``sent_email_events`` rows (only if that table exists) whose
         ``in_reply_to`` equals the inbound ``message_id`` or whose
         ``references_header`` contains it.

    Matching is strictly per inbound message_id. The References match uses
    ``instr()`` rather than ``LIKE`` so that ``_`` and ``%`` inside a
    Message-ID are compared literally and cannot match a different message.

    Args:
        conn: Open SQLite connection whose rows support access by column name.
            The caller is responsible for committing.

    Returns:
        A list of dicts (``email_id``, ``subject``, ``previous_status``,
        ``signal``), one per row that was closed. Empty if the ``emails``
        table does not exist or nothing matched.
    """
    if not _table_exists(conn, "emails"):
        return []

    cols = _email_table_columns(conn)
    has_sent_ledger = _table_exists(conn, "sent_email_events")

    rows = conn.execute(
        """
        SELECT message_id, subject, status
        FROM emails
        WHERE status IN ('processing', 'pending')
        """
    ).fetchall()

    sanitized = []
    for row in rows:
        mid = row["message_id"]
        if not mid:
            continue

        signal = None
        sent_reference = None

        # Signal 1 — in-DB lifecycle marker keyed to this inbound MID.
        # The aggregate always yields one row; ts is NULL when nothing matched.
        ev = conn.execute(
            """
            SELECT event, MAX(timestamp) AS ts
            FROM email_events
            WHERE email_id = ?
              AND event IN ('replied', 'resolution', 'action_done')
            """,
            (mid,),
        ).fetchone()
        if ev and ev["ts"]:
            signal = f"email_event:{ev['event']}"
            sent_reference = ev["ts"]

        # Signal 2 — durable outbound ledger pointing back at this MID.
        if signal is None and has_sent_ledger:
            sent = conn.execute(
                """
                SELECT message_id AS sent_mid, sent_at
                FROM sent_email_events
                WHERE in_reply_to = ?
                   OR instr(references_header, ?) > 0
                ORDER BY sent_at DESC
                LIMIT 1
                """,
                (mid, mid),
            ).fetchone()
            if sent:
                signal = "sent_email_events"
                sent_reference = sent["sent_at"]

        if signal is None:
            continue

        # Only touch optional columns that this schema actually has.
        updates = ["status = 'processed'"]
        if "completed_at" in cols:
            updates.append(
                "completed_at = COALESCE(completed_at, datetime('now','localtime'))"
            )
        if "error" in cols:
            updates.append("error = NULL")
        conn.execute(
            f"""
            UPDATE emails
            SET {', '.join(updates)}
            WHERE message_id = ?
              AND status IN ('processing', 'pending')
            """,
            (mid,),
        )
        _insert_event(
            conn,
            mid,
            "resolution",
            "Sanitized: reply already sent before BD close (zombie reconcile)",
            {
                "reason": "already_replied_reconciled",
                "previous_status": row["status"],
                "signal": signal,
                "sent_reference": sent_reference,
            },
        )
        log.warning(
            f"Sanitized already-replied zombie email: status={row['status']} "
            f"signal={signal} subj={(row['subject'] or '')[:40]} [{mid}] — "
            f"closed as 'processed', not reinjected"
        )
        sanitized.append(
            {
                "email_id": mid,
                "subject": row["subject"],
                "previous_status": row["status"],
                "signal": signal,
            }
        )

    return sanitized
1073
+
1074
+
950
1075
  def _recent_debt_flagged(conn, email_id, *, hours=6):
951
1076
  row = conn.execute(
952
1077
  """
@@ -1153,6 +1278,9 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
1153
1278
  return ""
1154
1279
  live_reconciled = _reconcile_processing_rows(conn)
1155
1280
  finished_reconciled = _reconcile_finished_rows(conn)
1281
+ # Close already-replied zombies BEFORE the 2h stuck-recovery below resets
1282
+ # them to 'pending', so the daemon never reinjects a MID we already answered.
1283
+ replied_sanitized = _reconcile_replied_zombies(conn)
1156
1284
 
1157
1285
  items = []
1158
1286
  now_label = datetime.now().isoformat(timespec="seconds")
@@ -1278,14 +1406,17 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
1278
1406
  conn.commit()
1279
1407
  conn.close()
1280
1408
 
1281
- if not items:
1409
+ if not items and not replied_sanitized:
1282
1410
  return ""
1283
1411
 
1284
- lines = ["== PENDING EMAIL DEBT DETECTED ==", "Prioritize closing or clarifying these threads before ignoring them:"]
1285
- for item in items[:max_items]:
1286
- lines.append(f"- {item['label']} ({item['detail']})")
1287
- if len(items) > max_items:
1288
- lines.append(f"- ... and {len(items) - max_items} more item(s)")
1412
+ lines = []
1413
+ if items:
1414
+ lines.append("== PENDING EMAIL DEBT DETECTED ==")
1415
+ lines.append("Prioritize closing or clarifying these threads before ignoring them:")
1416
+ for item in items[:max_items]:
1417
+ lines.append(f"- {item['label']} ({item['detail']})")
1418
+ if len(items) > max_items:
1419
+ lines.append(f"- ... and {len(items) - max_items} more item(s)")
1289
1420
  if recovered:
1290
1421
  lines.append("")
1291
1422
  lines.append(f"Auto-recovery applied: {len(recovered)} processing-stuck email(s) were reset to pending.")
@@ -1294,6 +1425,12 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
1294
1425
  lines.append(
1295
1426
  f"Reconciled {len(sent_reconciled)} processing email(s) with already-sent reply events; no re-open applied."
1296
1427
  )
1428
+ if replied_sanitized:
1429
+ lines.append("")
1430
+ lines.append(
1431
+ f"Sanitized {len(replied_sanitized)} already-replied email(s): closed as 'processed' "
1432
+ f"to prevent duplicate operator replies (no reinjection)."
1433
+ )
1297
1434
  total_reconciled = len(live_reconciled) + len(finished_reconciled)
1298
1435
  if total_reconciled:
1299
1436
  lines.append(f"Reconciled {total_reconciled} email(s) with inconsistent lifecycle state.")
package/src/server.py CHANGED
@@ -139,6 +139,7 @@ from tools_api_call import (
139
139
  from runtime_versioning import (
140
140
  RestartRequiredMiddleware,
141
141
  build_mcp_status,
142
+ maybe_selfheal_on_boot,
142
143
  prime_process_fingerprint,
143
144
  prime_process_version,
144
145
  )
@@ -3264,4 +3265,10 @@ if __name__ == "__main__":
3264
3265
  port=port,
3265
3266
  on_exit=lambda: (close_local_context_db(), close_db()),
3266
3267
  )
3268
+ else:
3269
+ # stdio child: if we booted already-stale (spawned right after an
3270
+ # update), re-exec into the new code transparently before serving —
3271
+ # covers the case where only allowlisted tools are called and the
3272
+ # per-call drift middleware would never trip. Fail-open.
3273
+ maybe_selfheal_on_boot(client=str(os.environ.get("NEXO_MCP_CLIENT", "") or "").strip())
3267
3274
  mcp.run(**run_kwargs)