nexo-brain 7.35.0 → 7.37.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +1 -1
- package/package.json +1 -1
- package/src/db/_schema.py +1 -0
- package/src/deep_sleep_retention.py +8 -0
- package/src/enforcement_engine.py +58 -0
- package/src/hooks/post_tool_use.py +114 -0
- package/src/local_context/api.py +157 -7
- package/src/local_context/db.py +61 -1
- package/src/runtime_versioning.py +141 -2
- package/src/scripts/nexo-daily-self-audit.py +57 -13
- package/src/scripts/nexo-email-monitor.py +143 -6
- package/src/server.py +7 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.35.0",
|
|
3
|
+
"version": "7.37.0",
|
|
4
4
|
"description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "NEXO Brain",
|
package/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
|
|
19
19
|
[Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
|
|
20
20
|
|
|
21
|
-
Version `7.
|
|
21
|
+
Version `7.37.0` is the current packaged-runtime line. Minor release - transparent server self-heal: when an update lands while a Brain MCP server is already running, the resident stdio child now re-execs itself in place (same process, same live MCP connection) instead of telling the user to restart, so the updated code runs immediately with nothing visible. Fail-open (non-POSIX, re-exec error, resident service, or `NEXO_DISABLE_SELFHEAL_REEXEC` kill switch all fall back to the prior safe hard-exit), anti-loop (bounded generations + same-target guard), defers past any in-flight tool call, and a boot-time pre-serve heal. Also fixes email-monitor zombie reinjection: an already-replied email left in 'processing' after a crash is closed as terminal 'processed' and never re-sent as a duplicate reply. Builds on v7.36.0 (local index disk reclaim).
|
|
22
22
|
|
|
23
23
|
Previously in `7.31.9`: patch release over v7.31.8 - UI release closeout now has to prove the original reported symptom was reopened with observable evidence before claiming the release is ready.
|
|
24
24
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.35.0",
|
|
3
|
+
"version": "7.37.0",
|
|
4
4
|
"mcpName": "io.github.wazionapps/nexo",
|
|
5
5
|
"description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
|
|
6
6
|
"homepage": "https://nexo-brain.com",
|
package/src/db/_schema.py
CHANGED
|
@@ -146,6 +146,14 @@ def _prune_db_backups(deep_sleep_dir: Path, report: dict, *, keep: int, apply: b
|
|
|
146
146
|
_record_delete(report, backup, reason=f"old-db-backup:{family}", apply=apply)
|
|
147
147
|
for sidecar in _sidecars(backup):
|
|
148
148
|
_record_delete(report, sidecar, reason=f"old-db-backup-sidecar:{family}", apply=apply)
|
|
149
|
+
# Orphan sweep: -wal/-shm sidecars whose base .db no longer exists (left by
|
|
150
|
+
# interrupted/legacy deep-sleep processes). The online-backup path produces
|
|
151
|
+
# sidecar-free snapshots, so any sidecar with a missing base is a true
|
|
152
|
+
# orphan. Scoped strictly to this deep-sleep backup dir; never the live DBs.
|
|
153
|
+
for sidecar in list(deep_sleep_dir.glob("*-backup-*.db-wal")) + list(deep_sleep_dir.glob("*-backup-*.db-shm")):
|
|
154
|
+
base = Path(str(sidecar)[: -len("-wal")]) if str(sidecar).endswith("-wal") else Path(str(sidecar)[: -len("-shm")])
|
|
155
|
+
if not base.exists():
|
|
156
|
+
_record_delete(report, sidecar, reason="orphan-db-sidecar", apply=apply)
|
|
149
157
|
|
|
150
158
|
|
|
151
159
|
def _prune_contexts(deep_sleep_dir: Path, report: dict, *, keep: int, apply: bool) -> None:
|
|
@@ -465,6 +465,7 @@ class HeadlessEnforcer:
|
|
|
465
465
|
self.user_message_count = 0
|
|
466
466
|
self.tool_timestamps: dict[str, float] = {}
|
|
467
467
|
self.msg_since_tool: dict[str, int] = {}
|
|
468
|
+
self._tool_user_message_index: dict[str, int] = {}
|
|
468
469
|
self.injection_queue: list[dict] = []
|
|
469
470
|
self._started_at = time.time()
|
|
470
471
|
self._injections_done = 0
|
|
@@ -551,6 +552,8 @@ class HeadlessEnforcer:
|
|
|
551
552
|
# seen, periodic/conditional reminders stay suppressed so cron
|
|
552
553
|
# runners can reach TURN_END instead of reopening the task loop.
|
|
553
554
|
self._session_stopped: bool = False
|
|
555
|
+
self._first_visible_startup_gate_fired: bool = False
|
|
556
|
+
self._first_visible_text_allowed: bool = False
|
|
554
557
|
try:
|
|
555
558
|
self._post_close_cooldown_seconds = max(
|
|
556
559
|
0,
|
|
@@ -1036,6 +1039,52 @@ class HeadlessEnforcer:
|
|
|
1036
1039
|
except Exception:
|
|
1037
1040
|
pass
|
|
1038
1041
|
|
|
1042
|
+
def should_block_first_visible_text(self) -> bool:
    """Return True while the first visible answer must be held back.

    The gate opens (and stays open via ``_first_visible_text_allowed``) when
    no user message has been counted yet, or when all three requirements
    hold: ``nexo_startup`` was called, at least one continuity tool was
    called, and ``nexo_heartbeat``/``nexo_task_open`` was recorded at or
    after the current user-message index.

    While something is missing the method returns True. The reminder prompt
    is enqueued only once, guarded by ``_first_visible_startup_gate_fired``.
    """
    if self._first_visible_text_allowed:
        return False
    if self.user_message_count <= 0:
        self._first_visible_text_allowed = True
        return False

    turn = int(self.user_message_count or 0)
    called = self.tools_called
    last_turn_for = self._tool_user_message_index.get

    continuity_ok = any(
        name in called
        for name in (
            "nexo_smart_startup",
            "nexo_session_diary_read",
            "nexo_reminders",
            "nexo_checkpoint_read",
        )
    )
    # The heartbeat only counts when it was recorded for the current turn.
    latest_heartbeat = max(
        last_turn_for("nexo_heartbeat", -1),
        last_turn_for("nexo_task_open", -1),
    )

    requirements = (
        ("nexo_startup", "nexo_startup" in called),
        ("continuidad minima", continuity_ok),
        ("nexo_heartbeat", latest_heartbeat >= turn),
    )
    missing = [label for label, satisfied in requirements if not satisfied]

    if not missing:
        self._first_visible_text_allowed = True
        return False

    if not self._first_visible_startup_gate_fired:
        prompt = (
            "Before any visible answer, register the session, load minimal continuity, "
            "and associate the current user message with a heartbeat. Missing: "
            f"{', '.join(missing)}. Execute the required NEXO tool calls now. "
            "Do not produce visible text for this reminder."
        )
        self._enqueue(prompt, "first-visible-startup-heartbeat-gate", rule_id="R38_first_visible_startup_gate")
        # Mark only after a successful enqueue so a failed enqueue is retried.
        self._first_visible_startup_gate_fired = True
    return True
|
|
1087
|
+
|
|
1039
1088
|
def _check_capability_denial_requires_reality(self, text: str):
|
|
1040
1089
|
"""Block unsupported capability denials until a live source was checked."""
|
|
1041
1090
|
if not text or not _CAPABILITY_DENIAL_RE.search(text):
|
|
@@ -2537,6 +2586,7 @@ class HeadlessEnforcer:
|
|
|
2537
2586
|
self.tools_called.add(name)
|
|
2538
2587
|
self.tool_timestamps[name] = time.time()
|
|
2539
2588
|
self.msg_since_tool[name] = 0
|
|
2589
|
+
self._tool_user_message_index[name] = int(self.user_message_count or 0)
|
|
2540
2590
|
|
|
2541
2591
|
# v7.6 conditional counter advance. Tools watched by a
|
|
2542
2592
|
# conditional rule tick a counter on every non-matching call.
|
|
@@ -3346,6 +3396,14 @@ def run_with_enforcement(
|
|
|
3346
3396
|
msg = event.get("message", {})
|
|
3347
3397
|
for block in msg.get("content", []):
|
|
3348
3398
|
if block.get("type") == "text":
|
|
3399
|
+
try:
|
|
3400
|
+
if enforcer.should_block_first_visible_text():
|
|
3401
|
+
item = enforcer.flush()
|
|
3402
|
+
if item:
|
|
3403
|
+
_inject(item["prompt"])
|
|
3404
|
+
return False
|
|
3405
|
+
except Exception as _startup_gate_exc: # noqa: BLE001
|
|
3406
|
+
_logger.warning("first visible startup gate failed: %s", _startup_gate_exc)
|
|
3349
3407
|
collected_text.append(block["text"])
|
|
3350
3408
|
# R16 — probe each assistant text block as it arrives
|
|
3351
3409
|
# so a declared-done line is caught on the same turn
|
|
@@ -374,6 +374,118 @@ def _write_json(path: Path, payload: dict) -> None:
|
|
|
374
374
|
tmp.replace(path)
|
|
375
375
|
|
|
376
376
|
|
|
377
|
+
def _pending_trace_path(sid: str) -> Path:
    """Return the per-session post-change trace file path.

    Every character of ``sid`` that is not alphanumeric, ``-`` or ``_`` is
    replaced by ``_`` so the id is safe to embed in a file name. An empty
    ``sid`` is treated as ``"unknown"``.
    """
    raw_sid = sid or "unknown"
    sanitized = [ch if (ch.isalnum() or ch in "-_") else "_" for ch in raw_sid]
    safe_sid = "".join(sanitized)
    return _production_closeout_dir() / f"post-change-trace-{safe_sid}.json"
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _split_files(value: object) -> set[str]:
|
|
383
|
+
if value is None:
|
|
384
|
+
return set()
|
|
385
|
+
if isinstance(value, (list, tuple, set)):
|
|
386
|
+
raw = "\n".join(str(item) for item in value)
|
|
387
|
+
else:
|
|
388
|
+
raw = str(value)
|
|
389
|
+
parts = re.split(r"[\n,;]+", raw)
|
|
390
|
+
return {part.strip() for part in parts if part and part.strip()}
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _record_post_change_trace(payload: dict, sid: str) -> None:
    """Fold one tool-call payload into the session's pending change trace.

    The trace is a JSON document at ``_pending_trace_path(sid)`` holding three
    file sets (touched, guard-checked, change-logged), a production-mutation
    flag and timestamps. It is written back only when at least one set is
    non-empty or the production-mutation flag is set.
    """
    sid = sid or "unknown"
    path = _pending_trace_path(sid)
    trace = _read_json(path)
    if not trace:
        # No usable trace yet for this session: start a fresh one.
        trace = {
            "sid": sid,
            "touched_files": [],
            "guard_files": [],
            "change_log_files": [],
            "production_mutation": False,
            "created_at": time.time(),
        }
    tool_name = _tool_name(payload)
    tool_input = _tool_input(payload)
    cmd = _extract_command(payload)

    touched, guards, logged = (
        set(trace.get(key) or [])
        for key in ("touched_files", "guard_files", "change_log_files")
    )

    def _files_from(*keys: str) -> set:
        """Union of the parsed file sets found under the given input keys."""
        found: set = set()
        for key in keys:
            found |= _split_files(tool_input.get(key))
        return found

    if _is_shared_mutation_payload(payload):
        touched |= _files_from("file_path", "path", "files", "paths")
        if cmd:
            trace["last_mutation_command"] = cmd[:500]
        if _is_production_mutation_command(cmd):
            trace["production_mutation"] = True

    if tool_name in {"nexo_guard_check", "mcp__nexo__nexo_guard_check"}:
        guards |= _files_from("files")

    if _is_change_log_tool(tool_name):
        logged |= _files_from("files", "files_changed")
        if touched and not logged:
            # A change-log call without explicit files covers what was touched.
            logged |= touched

    if _is_task_close_tool(tool_name):
        touched |= _files_from("files_changed")

    trace["touched_files"] = sorted(touched)
    trace["guard_files"] = sorted(guards)
    trace["change_log_files"] = sorted(logged)
    trace["updated_at"] = time.time()

    if any((touched, guards, logged, trace.get("production_mutation"))):
        _write_json(path, trace)
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _missing_trace_items(payload: dict, sid: str) -> list[str]:
    """List which closeout evidence is missing for a task-close call.

    Returns an empty list when the payload is not a task-close tool, when no
    trace exists for the session, or when the trace records neither touched
    files nor a production mutation. Otherwise returns the labels of the
    missing items, in check order.
    """
    if not _is_task_close_tool(_tool_name(payload)):
        return []
    trace = _read_json(_pending_trace_path(sid or "unknown"))
    if not trace:
        return []

    tool_input = _tool_input(payload)
    touched = set(trace.get("touched_files") or [])
    if not (touched or trace.get("production_mutation")):
        return []

    guards = set(trace.get("guard_files") or [])
    logged = set(trace.get("change_log_files") or [])
    closing_files = _split_files(tool_input.get("files_changed"))

    missing: list[str] = []
    if touched and not guards:
        missing.append("guardias ejecutados")
    if (
        trace.get("production_mutation")
        and not logged
        and not _task_close_payload_has_change_trace(payload)
    ):
        missing.append("registro de cambios")
    if touched:
        if not closing_files:
            missing_label = "files_changed"
        elif not touched.issubset(closing_files):
            missing_label = "files_changed completo"
        else:
            missing_label = None
        if missing_label is not None:
            missing.append(missing_label)
    return missing
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def check_post_change_trace_closeout(payload: dict, sid: str) -> str | None:
    """Record the payload in the session trace and gate task close on it.

    Returns ``None`` when nothing is missing; in that case the pending trace
    file is removed if the payload is a task-close call. Otherwise returns a
    blocking message listing the missing items and up to six touched files,
    passed through ``append_operator_language_contract``.
    """
    if not sid:
        sid = "unknown"
    _record_post_change_trace(payload, sid)
    missing = _missing_trace_items(payload, sid)
    if not missing:
        if _is_task_close_tool(_tool_name(payload)):
            _pending_trace_path(sid).unlink(missing_ok=True)
        return None
    # The trace is re-read here; it may have vanished or become unreadable
    # since _missing_trace_items() looked at it, so never dereference a
    # falsy result.
    trace = _read_json(_pending_trace_path(sid)) or {}
    files = ", ".join((trace.get("touched_files") or [])[:6]) or "cambio detectado"
    message = (
        "Cierre bloqueado: antes de marcar completado hay que cuadrar archivos tocados, "
        f"guardias y registro de cambios. Falta: {', '.join(missing)}. "
        f"Archivos detectados: {files}."
    )
    return append_operator_language_contract(message)
|
|
487
|
+
|
|
488
|
+
|
|
377
489
|
def check_production_change_log_closeout(payload: dict, sid: str) -> str | None:
|
|
378
490
|
if not sid:
|
|
379
491
|
sid = "unknown"
|
|
@@ -551,6 +663,7 @@ def main() -> int:
|
|
|
551
663
|
sid = _resolve_sid_from_payload(payload)
|
|
552
664
|
reminder = check_inbox_and_emit_reminder(sid)
|
|
553
665
|
change_log_message = check_production_change_log_closeout(payload, sid)
|
|
666
|
+
post_change_trace_message = check_post_change_trace_closeout(payload, sid)
|
|
554
667
|
shared_scope_message = check_shared_scope_closeout(payload)
|
|
555
668
|
g1_message: str | None = None
|
|
556
669
|
try:
|
|
@@ -562,6 +675,7 @@ def main() -> int:
|
|
|
562
675
|
protocol_message,
|
|
563
676
|
reminder,
|
|
564
677
|
change_log_message,
|
|
678
|
+
post_change_trace_message,
|
|
565
679
|
shared_scope_message,
|
|
566
680
|
g1_message,
|
|
567
681
|
)
|
package/src/local_context/api.py
CHANGED
|
@@ -7,6 +7,7 @@ import re
|
|
|
7
7
|
import shutil
|
|
8
8
|
import sqlite3
|
|
9
9
|
import stat
|
|
10
|
+
import struct
|
|
10
11
|
import subprocess
|
|
11
12
|
import sys
|
|
12
13
|
import time
|
|
@@ -56,6 +57,16 @@ FTS_BACKFILL_BATCH = int(os.environ.get("NEXO_LOCAL_FTS_BACKFILL_BATCH", "500")
|
|
|
56
57
|
FTS_MIGRATION_CURSOR_KEY = "fts_migration_cursor"
|
|
57
58
|
FTS_MIGRATION_DONE_KEY = "fts_migration_done"
|
|
58
59
|
FTS_BACKFILL_TOTAL_KEY = "fts_backfill_total"
|
|
60
|
+
# Compact float32 BLOB embedding storage (replaces JSON-text vector_json, which
|
|
61
|
+
# bloated the index ~4-6x). Dual-write both columns, read prefers the BLOB and
|
|
62
|
+
# falls back to JSON, backfill converts old rows incrementally. Feature flags
|
|
63
|
+
# are kill switches that revert to JSON-only with no redeploy.
|
|
64
|
+
EMB_BLOB_WRITE_ENABLED = os.environ.get("NEXO_LOCAL_EMB_BLOB_WRITE", "1") != "0"
|
|
65
|
+
EMB_BLOB_READ_ENABLED = os.environ.get("NEXO_LOCAL_EMB_BLOB_READ", "1") != "0"
|
|
66
|
+
EMB_BLOB_BACKFILL_BATCH = int(os.environ.get("NEXO_LOCAL_EMB_BLOB_BACKFILL_BATCH", "500") or "500")
|
|
67
|
+
EMB_BLOB_CURSOR_KEY = "emb_blob_backfill_cursor"
|
|
68
|
+
EMB_BLOB_DONE_KEY = "emb_blob_backfill_done"
|
|
69
|
+
EMB_BLOB_TOTAL_KEY = "emb_blob_backfill_total"
|
|
59
70
|
EMBEDDING_REFRESH_JOB = "embedding_refresh"
|
|
60
71
|
ENTITY_FACTS_JOB = "entity_facts"
|
|
61
72
|
BACKGROUND_INDEX_JOB_TYPES = {ENTITY_FACTS_JOB}
|
|
@@ -2888,6 +2899,47 @@ def _latest_version_id(conn, asset_id: str) -> str:
|
|
|
2888
2899
|
return row["version_id"] if row else stable_id("ver", asset_id)
|
|
2889
2900
|
|
|
2890
2901
|
|
|
2902
|
+
def _encode_embedding_blob(vector) -> bytes | None:
|
|
2903
|
+
"""Pack a vector of floats into a little-endian float32 BLOB (dimension*4
|
|
2904
|
+
bytes). Returns None when blob writes are disabled or the vector is empty,
|
|
2905
|
+
so the caller still writes vector_json (the source of truth during the
|
|
2906
|
+
transition). float32 vs the legacy float64 JSON is a deliberate, negligible
|
|
2907
|
+
cosine drift (vectors are L2-normalized / already 8-dp-rounded)."""
|
|
2908
|
+
if not EMB_BLOB_WRITE_ENABLED:
|
|
2909
|
+
return None
|
|
2910
|
+
try:
|
|
2911
|
+
floats = [float(v) for v in (vector or [])]
|
|
2912
|
+
if not floats:
|
|
2913
|
+
return None
|
|
2914
|
+
return struct.pack(f"<{len(floats)}f", *floats)
|
|
2915
|
+
except (TypeError, ValueError, struct.error):
|
|
2916
|
+
return None
|
|
2917
|
+
|
|
2918
|
+
|
|
2919
|
+
def _decode_embedding(row) -> list:
    """Return the embedding stored in ``row`` as a plain Python list.

    When ``EMB_BLOB_READ_ENABLED`` is set and ``row["vector_blob"]`` is
    non-empty, the blob is unpacked as little-endian float32, but only if its
    length equals ``row["dimension"] * 4``. In every other case (flag off,
    column missing, empty blob, bad dimension, length mismatch, unpack
    error) the legacy ``row["vector_json"]`` text is decoded instead.
    """
    blob = None
    if EMB_BLOB_READ_ENABLED:
        try:
            blob = row["vector_blob"]
        except (KeyError, IndexError):
            # Column not selected or not present on this row.
            blob = None

    if blob:
        try:
            dim = int(row["dimension"] or 0)
        except (KeyError, IndexError, TypeError, ValueError):
            dim = 0
        expected_size = dim * 4  # 4 bytes per float32
        if dim and len(blob) == expected_size:
            try:
                return list(struct.unpack(f"<{dim}f", blob))
            except struct.error:
                pass  # fall through to JSON

    return json_loads(row["vector_json"], [])
|
|
2941
|
+
|
|
2942
|
+
|
|
2891
2943
|
def _insert_chunk_embedding(conn, asset_id: str, chunk_id: str, text: str) -> None:
|
|
2892
2944
|
record = embeddings.embed_record(text)
|
|
2893
2945
|
model_id = str(record["model_id"])
|
|
@@ -2895,8 +2947,8 @@ def _insert_chunk_embedding(conn, asset_id: str, chunk_id: str, text: str) -> No
|
|
|
2895
2947
|
dimension = int(record["dimension"])
|
|
2896
2948
|
conn.execute(
|
|
2897
2949
|
"""
|
|
2898
|
-
INSERT INTO local_embeddings(embedding_id, asset_id, chunk_id, model_id, model_revision, dimension, vector_json, created_at)
|
|
2899
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
2950
|
+
INSERT INTO local_embeddings(embedding_id, asset_id, chunk_id, model_id, model_revision, dimension, vector_json, vector_blob, created_at)
|
|
2951
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
2900
2952
|
""",
|
|
2901
2953
|
(
|
|
2902
2954
|
stable_id("emb", f"{chunk_id}:{model_id}:{model_revision}:{dimension}"),
|
|
@@ -2906,6 +2958,7 @@ def _insert_chunk_embedding(conn, asset_id: str, chunk_id: str, text: str) -> No
|
|
|
2906
2958
|
model_revision,
|
|
2907
2959
|
dimension,
|
|
2908
2960
|
json_dumps(record["vector"]),
|
|
2961
|
+
_encode_embedding_blob(record["vector"]),
|
|
2909
2962
|
now(),
|
|
2910
2963
|
),
|
|
2911
2964
|
)
|
|
@@ -3555,6 +3608,13 @@ def run_once(
|
|
|
3555
3608
|
_backfill_fts_rows(conn, batch_limit=FTS_BACKFILL_BATCH)
|
|
3556
3609
|
except Exception:
|
|
3557
3610
|
pass
|
|
3611
|
+
# Incremental embedding TEXT->BLOB backfill: same bounded one-batch-per-tick
|
|
3612
|
+
# discipline. Best-effort; skips when disabled or already done.
|
|
3613
|
+
if EMB_BLOB_BACKFILL_BATCH > 0:
|
|
3614
|
+
try:
|
|
3615
|
+
_backfill_embedding_blobs(conn, batch_limit=EMB_BLOB_BACKFILL_BATCH)
|
|
3616
|
+
except Exception:
|
|
3617
|
+
pass
|
|
3558
3618
|
conn_after = _conn()
|
|
3559
3619
|
initial_after = _initial_scan_status(conn_after, list_roots(readonly=False))
|
|
3560
3620
|
blocking_active_after = _active_job_count(conn_after, blocking_only=True)
|
|
@@ -4603,6 +4663,76 @@ def _backfill_fts_rows(conn, *, batch_limit: int | None = None) -> dict:
|
|
|
4603
4663
|
return _with_sqlite_busy_retry(_run)
|
|
4604
4664
|
|
|
4605
4665
|
|
|
4666
|
+
def _backfill_embedding_blobs(conn, *, batch_limit: int | None = None) -> dict:
    """Incrementally convert legacy ``vector_json`` rows to ``vector_blob``.

    One call processes at most ``batch_limit`` rows (default
    ``EMB_BLOB_BACKFILL_BATCH``) whose ``vector_blob`` is NULL, resuming from
    a rowid cursor stored under ``EMB_BLOB_CURSOR_KEY`` and committing once
    per batch. The existing JSON is converted in place; nothing is
    re-embedded. When no rows remain, ``EMB_BLOB_DONE_KEY`` is set to "1".

    Rows that cannot be converted are left JSON-only, but the cursor still
    advances past them. This covers a JSON length that does not match
    ``dimension``, a non-sequence JSON value, and any encode failure. A
    single malformed row must never abort the batch, because the cursor is
    only persisted at the end and the same batch would be retried forever.

    Returns a status dict with ``ok`` and ``done``, plus either ``skipped``
    or ``processed``/``converted``/``cursor``.
    """
    if batch_limit is None:
        batch_limit = EMB_BLOB_BACKFILL_BATCH
    batch_limit = int(batch_limit)
    if batch_limit <= 0:
        return {"ok": True, "skipped": "disabled", "done": _get_state_conn(conn, EMB_BLOB_DONE_KEY, "0") == "1"}
    if not EMB_BLOB_WRITE_ENABLED:
        return {"ok": True, "skipped": "blob_write_disabled", "done": False}
    if _get_state_conn(conn, EMB_BLOB_DONE_KEY, "0") == "1":
        return {"ok": True, "skipped": "already_done", "done": True}

    def _row_to_blob(row) -> bytes | None:
        """Return the float32 blob for one row, or None when it must be skipped."""
        try:
            dim = int(row["dimension"] or 0)
        except (TypeError, ValueError):
            return None
        if not dim:
            return None
        try:
            vec = json_loads(row["vector_json"], [])
            if not isinstance(vec, (list, tuple)) or len(vec) != dim:
                return None
            blob = _encode_embedding_blob(vec)
        except (TypeError, ValueError, OverflowError):
            return None
        if blob is None or len(blob) != dim * 4:
            return None
        return blob

    def _run() -> dict:
        try:
            cursor = int(_get_state_conn(conn, EMB_BLOB_CURSOR_KEY, "0") or "0")
        except Exception:
            cursor = 0
        # Record the initial backlog size once, for progress reporting.
        if _get_state_conn(conn, EMB_BLOB_TOTAL_KEY, "") == "":
            try:
                total_row = conn.execute(
                    "SELECT COUNT(*) AS total FROM local_embeddings WHERE vector_blob IS NULL"
                ).fetchone()
                _set_state_conn(conn, EMB_BLOB_TOTAL_KEY, str(int(total_row["total"] or 0)))
            except Exception:
                pass
        rows = conn.execute(
            """
            SELECT rowid AS rid, dimension, vector_json
            FROM local_embeddings
            WHERE rowid > ? AND vector_blob IS NULL
            ORDER BY rowid ASC
            LIMIT ?
            """,
            (cursor, batch_limit),
        ).fetchall()
        if not rows:
            _set_state_conn(conn, EMB_BLOB_DONE_KEY, "1")
            conn.commit()
            return {"ok": True, "done": True, "processed": 0, "cursor": cursor}
        max_rid = cursor
        converted = 0
        for row in rows:
            rid = int(row["rid"])
            max_rid = max(max_rid, rid)
            blob = _row_to_blob(row)
            if blob is not None:
                conn.execute("UPDATE local_embeddings SET vector_blob=? WHERE rowid=?", (blob, rid))
                converted += 1
        _set_state_conn(conn, EMB_BLOB_CURSOR_KEY, str(max_rid))
        conn.commit()
        return {"ok": True, "done": False, "processed": len(rows), "converted": converted, "cursor": max_rid}

    return _with_sqlite_busy_retry(_run)
|
|
4734
|
+
|
|
4735
|
+
|
|
4606
4736
|
def _context_candidate_rows(
|
|
4607
4737
|
conn,
|
|
4608
4738
|
entity_asset_ids: list[str],
|
|
@@ -4625,7 +4755,7 @@ def _context_candidate_rows(
|
|
|
4625
4755
|
prefilter_rows = conn.execute(
|
|
4626
4756
|
"""
|
|
4627
4757
|
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary,
|
|
4628
|
-
e.vector_json, e.model_id, e.model_revision, e.dimension
|
|
4758
|
+
e.vector_json, e.vector_blob, e.model_id, e.model_revision, e.dimension
|
|
4629
4759
|
FROM local_chunks_fts f
|
|
4630
4760
|
JOIN local_chunks c ON c.rowid = f.rowid
|
|
4631
4761
|
JOIN local_assets a ON a.asset_id = c.asset_id
|
|
@@ -4657,7 +4787,7 @@ def _context_candidate_rows(
|
|
|
4657
4787
|
prefilter_rows = conn.execute(
|
|
4658
4788
|
f"""
|
|
4659
4789
|
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary,
|
|
4660
|
-
e.vector_json, e.model_id, e.model_revision, e.dimension
|
|
4790
|
+
e.vector_json, e.vector_blob, e.model_id, e.model_revision, e.dimension
|
|
4661
4791
|
FROM local_chunks c
|
|
4662
4792
|
JOIN local_assets a ON a.asset_id = c.asset_id
|
|
4663
4793
|
LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
|
|
@@ -4686,7 +4816,7 @@ def _context_candidate_rows(
|
|
|
4686
4816
|
base_rows = conn.execute(
|
|
4687
4817
|
"""
|
|
4688
4818
|
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary,
|
|
4689
|
-
e.vector_json, e.model_id, e.model_revision, e.dimension
|
|
4819
|
+
e.vector_json, e.vector_blob, e.model_id, e.model_revision, e.dimension
|
|
4690
4820
|
FROM local_chunks c
|
|
4691
4821
|
JOIN local_assets a ON a.asset_id = c.asset_id
|
|
4692
4822
|
LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
|
|
@@ -4713,7 +4843,7 @@ def _context_candidate_rows(
|
|
|
4713
4843
|
entity_rows = conn.execute(
|
|
4714
4844
|
f"""
|
|
4715
4845
|
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary,
|
|
4716
|
-
e.vector_json, e.model_id, e.model_revision, e.dimension
|
|
4846
|
+
e.vector_json, e.vector_blob, e.model_id, e.model_revision, e.dimension
|
|
4717
4847
|
FROM local_chunks c
|
|
4718
4848
|
JOIN local_assets a ON a.asset_id = c.asset_id
|
|
4719
4849
|
LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
|
|
@@ -5200,7 +5330,7 @@ def _context_query_conn(
|
|
|
5200
5330
|
for row in rows:
|
|
5201
5331
|
if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
|
|
5202
5332
|
continue
|
|
5203
|
-
vector =
|
|
5333
|
+
vector = _decode_embedding(row)
|
|
5204
5334
|
text_score = _search_text_score(search_query, row["text"])
|
|
5205
5335
|
path_score = _search_text_score(search_query, row["path"] or "")
|
|
5206
5336
|
summary_score = _search_text_score(search_query, row["summary"] or "")
|
|
@@ -5756,6 +5886,14 @@ def purge_asset(asset_id: str) -> dict:
|
|
|
5756
5886
|
conn = _conn()
|
|
5757
5887
|
_purge_asset_ids(conn, [asset_id])
|
|
5758
5888
|
conn.commit()
|
|
5889
|
+
# Reclaim the just-freed pages. Cheap incremental_vacuum (not a full VACUUM
|
|
5890
|
+
# — this is a frequent single-asset op; a 19GB rewrite per purge would be
|
|
5891
|
+
# catastrophic). No-op unless auto_vacuum=INCREMENTAL is active. Best-effort.
|
|
5892
|
+
try:
|
|
5893
|
+
conn.execute("PRAGMA incremental_vacuum")
|
|
5894
|
+
conn.commit()
|
|
5895
|
+
except Exception:
|
|
5896
|
+
pass
|
|
5759
5897
|
log_event("info", "asset_purged", "Asset purged", asset_id=asset_id)
|
|
5760
5898
|
return {"ok": True, "asset_id": asset_id}
|
|
5761
5899
|
|
|
@@ -5790,6 +5928,18 @@ def clear_index() -> dict:
|
|
|
5790
5928
|
)
|
|
5791
5929
|
_set_initial_index_complete(conn, False)
|
|
5792
5930
|
conn.commit()
|
|
5931
|
+
# The index is now near-empty, so a full VACUUM rewrites a tiny file and
|
|
5932
|
+
# actually returns the freed disk to the OS (DELETE alone only moves pages
|
|
5933
|
+
# to the free-list). Checkpoint the WAL first so its pages are folded in,
|
|
5934
|
+
# VACUUM, then checkpoint again — in WAL mode VACUUM's rewrite lands in the
|
|
5935
|
+
# WAL, so the main file is only truncated by the trailing checkpoint. Works
|
|
5936
|
+
# regardless of auto_vacuum mode. Best-effort — never fail the clear.
|
|
5937
|
+
try:
|
|
5938
|
+
conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
|
|
5939
|
+
conn.execute("VACUUM")
|
|
5940
|
+
conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
|
|
5941
|
+
except Exception:
|
|
5942
|
+
pass
|
|
5793
5943
|
log_event("warn", "index_cleared", "Local memory index cleared")
|
|
5794
5944
|
return {"ok": True}
|
|
5795
5945
|
|
package/src/local_context/db.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
import shutil
|
|
4
5
|
import sqlite3
|
|
5
6
|
import time
|
|
6
7
|
from pathlib import Path
|
|
@@ -14,6 +15,11 @@ LOCAL_CONTEXT_DB_NAME = "local-context.db"
|
|
|
14
15
|
MIGRATION_STATE_KEY = "local_context_db_migrated_from_main"
|
|
15
16
|
MIGRATION_SKIPPED_KEY = "local_context_db_migration_skipped"
|
|
16
17
|
MAIN_CLEANUP_STATE_KEY = "local_context_main_tables_drained"
|
|
18
|
+
# One-time conversion flag: auto_vacuum=INCREMENTAL is a no-op on an already
|
|
19
|
+
# populated DB until exactly one full VACUUM runs. We do that conversion once
|
|
20
|
+
# per never-converted DB (guarded by free disk) and record it here so it never
|
|
21
|
+
# re-runs the expensive rewrite. See ensure_local_context_db().
|
|
22
|
+
AUTO_VACUUM_CONVERTED_KEY = "auto_vacuum_converted"
|
|
17
23
|
|
|
18
24
|
LOCAL_CONTEXT_TABLES: tuple[str, ...] = (
|
|
19
25
|
"local_index_roots",
|
|
@@ -77,6 +83,12 @@ def _connect(db_path: Path) -> sqlite3.Connection:
|
|
|
77
83
|
conn = sqlite3.connect(str(db_path), timeout=max(_busy_timeout_ms() / 1000.0, 1.0), check_same_thread=False)
|
|
78
84
|
conn.row_factory = sqlite3.Row
|
|
79
85
|
conn.execute(f"PRAGMA busy_timeout={_busy_timeout_ms()}")
|
|
86
|
+
# auto_vacuum must be set BEFORE the first table is created to take effect on
|
|
87
|
+
# a brand-new DB (it is a no-op on an already-populated file — those are
|
|
88
|
+
# converted once via a guarded full VACUUM in ensure_local_context_db()).
|
|
89
|
+
# INCREMENTAL lets deletes (privacy purge, reconcile, purge_asset) reclaim
|
|
90
|
+
# pages via `PRAGMA incremental_vacuum` instead of growing the file forever.
|
|
91
|
+
conn.execute("PRAGMA auto_vacuum=INCREMENTAL")
|
|
80
92
|
conn.execute("PRAGMA journal_mode=WAL")
|
|
81
93
|
conn.execute("PRAGMA synchronous=NORMAL")
|
|
82
94
|
conn.execute("PRAGMA temp_store=MEMORY")
|
|
@@ -119,10 +131,20 @@ def _ensure_schema(conn: sqlite3.Connection) -> None:
|
|
|
119
131
|
_ensure_entity_dossier_schema(conn)
|
|
120
132
|
_ensure_local_context_v2_schema(conn)
|
|
121
133
|
_m84_local_chunks_fts(conn)
|
|
122
|
-
conn
|
|
134
|
+
_m85_local_embeddings_blob(conn)
|
|
135
|
+
conn.execute("PRAGMA user_version=85")
|
|
123
136
|
conn.commit()
|
|
124
137
|
|
|
125
138
|
|
|
139
|
+
def _m85_local_embeddings_blob(conn: sqlite3.Connection) -> None:
    """Schema step v85: add the ``vector_blob`` column to ``local_embeddings``.

    The column is a plain nullable BLOB with no DEFAULT, added only when it
    is missing. It sits alongside the legacy ``vector_json`` TEXT column.
    """
    table = "local_embeddings"
    column = "vector_blob"
    column_type = "BLOB"
    _add_column_if_missing(conn, table, column, column_type)
|
|
146
|
+
|
|
147
|
+
|
|
126
148
|
def _table_columns(conn: sqlite3.Connection, table: str) -> set[str]:
|
|
127
149
|
rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
|
|
128
150
|
return {str(row["name"] if isinstance(row, sqlite3.Row) else row[1]) for row in rows}
|
|
@@ -421,11 +443,49 @@ def ensure_local_context_db() -> None:
|
|
|
421
443
|
pass
|
|
422
444
|
return
|
|
423
445
|
_ensure_schema(_CONN)
|
|
446
|
+
_convert_auto_vacuum_once(_CONN, db_path)
|
|
424
447
|
_LAST_MIGRATION_ATTEMPT = now
|
|
425
448
|
migration = migrate_from_main_if_needed(_CONN)
|
|
426
449
|
_READY = True
|
|
427
450
|
|
|
428
451
|
|
|
452
|
+
def _convert_auto_vacuum_once(conn: sqlite3.Connection, db_path: Path) -> None:
    """Flip an existing DB from auto_vacuum=NONE to INCREMENTAL, at most once.

    Setting the PRAGMA on a populated file only takes effect after one full
    VACUUM that writes the pointer-map pages. That rewrites the whole file, so
    the conversion is guarded on free disk (more than 2x the main DB file must
    be free on the DB's filesystem). The done-flag is stored only once the
    mode really reads back as INCREMENTAL, so a machine that was too full (or
    whose VACUUM failed) retries on a later boot.

    Best-effort: every failure is swallowed because the conversion is an
    optimization and must never block index startup. Intended for the writer
    connection only.

    Args:
        conn: writer connection to the local-context database.
        db_path: path of the main database file (used for the size/free-space
            guard).
    """
    try:
        if _state(conn, AUTO_VACUUM_CONVERTED_KEY) == "1":
            return
        mode = int(conn.execute("PRAGMA auto_vacuum").fetchone()[0])
        if mode == 2:  # already INCREMENTAL (e.g. freshly created DB)
            _set_state(conn, AUTO_VACUUM_CONVERTED_KEY, "1")
            conn.commit()
            return
        try:
            db_size = db_path.stat().st_size
            free = shutil.disk_usage(db_path.parent).free
        except OSError:
            return
        if free <= db_size * 2:
            # Not enough scratch room — leave NONE mode, retry on a later boot.
            return

        # VACUUM refuses to run inside an open transaction; without this a
        # leftover implicit transaction would make the conversion fail
        # silently on every boot.
        if conn.in_transaction:
            conn.commit()

        # This module's _connect() sets temp_store=MEMORY. With that setting
        # SQLite keeps VACUUM's transient copy of the database in RAM, which
        # for a multi-GB index can exhaust memory and makes the disk-space
        # guard above meaningless. Force the transient copy onto disk for the
        # duration of the VACUUM, then restore the previous setting.
        # NOTE(review): changing temp_store drops any TEMP tables on this
        # connection; none are expected this early in startup — confirm.
        previous_temp_store = int(conn.execute("PRAGMA temp_store").fetchone()[0])
        conn.execute("PRAGMA auto_vacuum=INCREMENTAL")
        conn.execute("PRAGMA temp_store=FILE")
        try:
            conn.execute("VACUUM")
        finally:
            conn.execute(f"PRAGMA temp_store={previous_temp_store}")

        new_mode = int(conn.execute("PRAGMA auto_vacuum").fetchone()[0])
        if new_mode == 2:
            _set_state(conn, AUTO_VACUUM_CONVERTED_KEY, "1")
            conn.commit()
    except Exception:
        # Conversion is an optimization; never break startup over it.
        pass
|
|
487
|
+
|
|
488
|
+
|
|
429
489
|
def get_local_context_db() -> sqlite3.Connection:
|
|
430
490
|
ensure_local_context_db()
|
|
431
491
|
assert _CONN is not None
|
|
@@ -1130,15 +1130,128 @@ def prime_process_fingerprint() -> str:
|
|
|
1130
1130
|
_DRIFT_AUTOEXIT_SCHEDULED = False
|
|
1131
1131
|
_DRIFT_EXIT_CODE = 75
|
|
1132
1132
|
_DRIFT_EXIT_DELAY_SECONDS = 0.5
|
|
1133
|
+
# Crash-loop guard: upper bound on how many self-heal re-execs one process
# chain may perform before it gives up and falls back to a plain exit, so a
# half-written update or an unreadable tree can never make the server thrash.
_SELFHEAL_MAX_GENERATIONS = 3
# Number of tool calls currently executing. A re-exec in the middle of a
# request would desync the JSON-RPC stream of a sibling call, so the self-heal
# waits for this to drain. Incremented/decremented in on_call_tool.
_INFLIGHT_TOOL_CALLS = 0
# Maximum number of times a pending re-exec may be postponed while tool calls
# are still in flight; once reached, the heal proceeds regardless.
_DRIFT_REEXEC_DEFER_MAX = 20
# How many postponements have happened so far in this process.
_drift_reexec_defers = 0
|
|
1133
1142
|
|
|
1134
1143
|
|
|
1135
|
-
def
|
|
1144
|
+
def _selfheal_reexec_disabled() -> bool:
|
|
1145
|
+
return str(os.environ.get("NEXO_DISABLE_SELFHEAL_REEXEC", "") or "").strip().lower() in {"1", "true", "yes"}
|
|
1146
|
+
|
|
1147
|
+
|
|
1148
|
+
def _running_as_resident_service() -> bool:
|
|
1149
|
+
# The resident HTTP runtime-service serves multiple clients and has its own
|
|
1150
|
+
# self-retire (start_resident_obsolescence_watch). It must NOT execv. Lazy
|
|
1151
|
+
# import to avoid a circular import; fall back to an env sentinel.
|
|
1152
|
+
try:
|
|
1153
|
+
from runtime_service import is_runtime_service_process
|
|
1154
|
+
|
|
1155
|
+
return bool(is_runtime_service_process())
|
|
1156
|
+
except Exception:
|
|
1157
|
+
return str(os.environ.get("NEXO_RUNTIME_SERVICE", "") or "").strip().lower() in {"1", "true", "yes"}
|
|
1158
|
+
|
|
1159
|
+
|
|
1160
|
+
def _selfheal_teardown() -> None:
|
|
1161
|
+
"""Release SQLite/WAL handles before re-exec so the new image does not fight
|
|
1162
|
+
its own locks. Best-effort: a teardown failure must never block the heal."""
|
|
1163
|
+
try:
|
|
1164
|
+
from local_context.db import close_local_context_db
|
|
1165
|
+
|
|
1166
|
+
close_local_context_db()
|
|
1167
|
+
except Exception:
|
|
1168
|
+
pass
|
|
1169
|
+
try:
|
|
1170
|
+
from db import close_db
|
|
1171
|
+
|
|
1172
|
+
close_db()
|
|
1173
|
+
except Exception:
|
|
1174
|
+
pass
|
|
1175
|
+
|
|
1176
|
+
|
|
1177
|
+
def _drift_hard_exit() -> None:
    """Terminate the process immediately with the drift exit code.

    This is the fallback used whenever the in-place re-exec cannot run: the
    process exits so that a relaunching client (e.g. Claude Code) spawns a
    fresh one on the new code. ``os._exit`` skips interpreter cleanup; if
    reaching the configured code raises for any reason, exit with status 1
    instead so the process still goes away.
    """
    try:
        os._exit(_DRIFT_EXIT_CODE)
    except Exception:
        os._exit(1)
|
|
1140
1184
|
|
|
1141
1185
|
|
|
1186
|
+
def _request_drift_exit() -> None:
    """Heal a post-update fingerprint drift transparently.

    Re-executes the live process in place (``os.execv``: same PID, same
    inherited stdio pipes to the MCP client) so it loads the new code on disk
    without breaking the client session and without the user restarting
    anything. On success this function does not return.

    Fail-open: on any obstacle it falls back to ``_drift_hard_exit()``, so it
    is never worse than the plain-exit behavior.

    Steps:
        0. Kill switch, non-POSIX platform, or resident service -> hard exit.
        1. While tool calls are in flight, reschedule itself on the running
           event loop (bounded by ``_DRIFT_REEXEC_DEFER_MAX``).
        2. Anti-loop guards: same-target stamp and generation cap, both
           carried across exec via environment variables.
        3. Resolve the entrypoint: the active runtime's ``server.py``, else
           the script this process was started with.
        4. Tear down DB handles, stamp the guards, flush stdio, ``execv``.
    """
    global _drift_reexec_defers
    try:
        # 0. Opt-out / non-posix / resident service -> plain exit.
        #    (execv on native Windows spawns+exits, dropping inherited stdio.)
        if _selfheal_reexec_disabled() or os.name != "posix" or _running_as_resident_service():
            _drift_hard_exit()
            return

        # 1. Never re-exec mid tool-call: defer until in-flight calls drain.
        if _INFLIGHT_TOOL_CALLS > 0 and _drift_reexec_defers < _DRIFT_REEXEC_DEFER_MAX:
            _drift_reexec_defers += 1
            try:
                loop = asyncio.get_running_loop()
                loop.call_later(_DRIFT_EXIT_DELAY_SECONDS, _request_drift_exit)
                return
            except RuntimeError:
                pass  # no running loop -> proceed to re-exec now

        # 2. Resolve the target fingerprint + anti-loop guards.
        try:
            target_fp = installed_runtime_fingerprint(use_cache=False) or ""
        except Exception:
            target_fp = ""
        already_healed_target = bool(target_fp) and os.environ.get("NEXO_SELFHEAL_GEN", "") == target_fp[:16]
        try:
            count = int(os.environ.get("NEXO_SELFHEAL_COUNT", "0") or "0")
        except ValueError:
            count = 0
        # We already re-exec'd toward this exact target (or hit the cap) and
        # STILL drift -> the update is broken/unstable; stop looping and exit
        # once so a relaunching client gets a clean process.
        if already_healed_target or count >= _SELFHEAL_MAX_GENERATIONS:
            _drift_hard_exit()
            return

        # 3. Resolve the new entrypoint (the active snapshot's server.py).
        server_path = ""
        try:
            candidate = active_runtime_root() / "server.py"
            if candidate.is_file():
                server_path = str(candidate)
        except Exception:
            server_path = ""
        # Fallback: the script this interpreter was launched with. In Python
        # that is sys.argv[0]; sys.argv[1] is already the first argument TO
        # the script, not the script itself.
        launched_script = sys.argv[0] if sys.argv else ""
        if not server_path and launched_script and os.path.isfile(launched_script):
            server_path = launched_script
        if not server_path:
            _drift_hard_exit()
            return

        # 4. Best-effort teardown, stamp anti-loop env, re-exec in place.
        _selfheal_teardown()
        os.environ["NEXO_SELFHEAL_COUNT"] = str(count + 1)
        if target_fp:
            os.environ["NEXO_SELFHEAL_GEN"] = target_fp[:16]
        # Forward every original script argument (everything after argv[0]).
        argv_tail = sys.argv[1:]
        # execv replaces the process without flushing Python-level buffers;
        # push out anything still buffered on the stdio streams first so no
        # already-written response bytes are lost.
        for stream in (sys.stdout, sys.stderr):
            try:
                if stream is not None:
                    stream.flush()
            except Exception:
                pass
        os.execv(sys.executable, [sys.executable, server_path, *argv_tail])
    except Exception:
        # Fail-open: any failure (execv raised, teardown, platform) -> plain exit.
        _drift_hard_exit()
|
|
1253
|
+
|
|
1254
|
+
|
|
1142
1255
|
def _schedule_drift_autoexit() -> None:
|
|
1143
1256
|
global _DRIFT_AUTOEXIT_SCHEDULED
|
|
1144
1257
|
if _DRIFT_AUTOEXIT_SCHEDULED:
|
|
@@ -1152,6 +1265,25 @@ def _schedule_drift_autoexit() -> None:
|
|
|
1152
1265
|
loop.call_later(_DRIFT_EXIT_DELAY_SECONDS, _request_drift_exit)
|
|
1153
1266
|
|
|
1154
1267
|
|
|
1268
|
+
def maybe_selfheal_on_boot(client: str = "") -> bool:
    """Run the drift check once, before the first request is served.

    Covers a stdio child that was spawned right after an update and already
    loaded stale code: if it only ever receives allowlisted tools, the
    per-call middleware never trips, so the heal is triggered here instead.
    When drift is detected with reason ``fingerprint_mismatch`` or
    ``version_mismatch`` the process normally re-execs and this call does not
    return.

    Intended for stdio-child mode only (the resident HTTP service retires
    itself).

    Args:
        client: client identifier forwarded to ``resolve_restart_required``.

    Returns:
        ``True`` if a heal was requested and the call still returned;
        ``False`` when no heal applies or on any error (fail-open: serve
        as-is).
    """
    healable_reasons = ("fingerprint_mismatch", "version_mismatch")
    try:
        state = resolve_restart_required(client=client)
        if state.get("restart_required") and state.get("reason") in healable_reasons:
            _request_drift_exit()
            return True
        return False
    except Exception:
        return False
|
|
1285
|
+
|
|
1286
|
+
|
|
1155
1287
|
@dataclass
|
|
1156
1288
|
class RestartRequiredMiddleware(Middleware):
|
|
1157
1289
|
client: str = ""
|
|
@@ -1214,11 +1346,18 @@ class RestartRequiredMiddleware(Middleware):
|
|
|
1214
1346
|
)
|
|
1215
1347
|
|
|
1216
1348
|
async def on_call_tool(self, context, call_next):
|
|
1349
|
+
global _INFLIGHT_TOOL_CALLS
|
|
1217
1350
|
tool_name = str(getattr(context.message, "name", "") or "").strip()
|
|
1218
1351
|
state = resolve_restart_required(client=self.client)
|
|
1219
1352
|
state = self._ack_current_client_if_restarted(state)
|
|
1220
1353
|
if not state["restart_required"] or tool_name in RESTART_ALLOWLIST:
|
|
1221
|
-
|
|
1354
|
+
# Track in-flight executions so a drift self-heal re-exec defers until
|
|
1355
|
+
# no tool call is mid-stream (avoids desyncing the JSON-RPC framing).
|
|
1356
|
+
_INFLIGHT_TOOL_CALLS += 1
|
|
1357
|
+
try:
|
|
1358
|
+
return await call_next(context)
|
|
1359
|
+
finally:
|
|
1360
|
+
_INFLIGHT_TOOL_CALLS -= 1
|
|
1222
1361
|
|
|
1223
1362
|
payload = {
|
|
1224
1363
|
"ok": False,
|
|
@@ -1043,19 +1043,63 @@ def check_db_size():
|
|
|
1043
1043
|
|
|
1044
1044
|
local_ctx = paths_module.memory_dir() / "local-context.db"
|
|
1045
1045
|
if local_ctx.exists():
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1046
|
+
def _index_bytes() -> int:
|
|
1047
|
+
# Include the -wal/-shm sidecars: a large orphan WAL was invisible
|
|
1048
|
+
# to a bare stat() and could hide real growth.
|
|
1049
|
+
total = 0
|
|
1050
|
+
for suffix in ("", "-wal", "-shm"):
|
|
1051
|
+
p = local_ctx.with_name(local_ctx.name + suffix)
|
|
1052
|
+
try:
|
|
1053
|
+
total += p.stat().st_size
|
|
1054
|
+
except OSError:
|
|
1055
|
+
pass
|
|
1056
|
+
return total
|
|
1057
|
+
|
|
1058
|
+
# Distinct, stricter audit cap (NOT the 60 GiB runtime soft-pause
|
|
1059
|
+
# NEXO_LOCAL_CONTEXT_MAX_DB_BYTES). Default 25 GiB.
|
|
1060
|
+
try:
|
|
1061
|
+
hard_cap = int(os.environ.get("NEXO_LOCAL_INDEX_MAX_BYTES", str(25 * 1024 ** 3)) or str(25 * 1024 ** 3))
|
|
1062
|
+
except ValueError:
|
|
1063
|
+
hard_cap = 25 * 1024 ** 3
|
|
1064
|
+
|
|
1065
|
+
size_gb = _index_bytes() / (1024 ** 3)
|
|
1066
|
+
if size_gb > 25:
|
|
1067
|
+
# ACT, don't just warn (learning #824: the 268 GB burst went
|
|
1068
|
+
# unseen because this check was advisory-only). Reclaim freed
|
|
1069
|
+
# pages cheaply: checkpoint the WAL + incremental_vacuum (no-op
|
|
1070
|
+
# unless auto_vacuum=INCREMENTAL is active). Best-effort, short
|
|
1071
|
+
# timeout so we never fight the live indexer's write lock.
|
|
1072
|
+
reclaimed_gb = 0.0
|
|
1073
|
+
try:
|
|
1074
|
+
import sqlite3 as _sqlite3
|
|
1075
|
+
|
|
1076
|
+
conn = _sqlite3.connect(str(local_ctx), timeout=5.0)
|
|
1077
|
+
try:
|
|
1078
|
+
conn.execute("PRAGMA busy_timeout=5000")
|
|
1079
|
+
conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
|
|
1080
|
+
conn.execute("PRAGMA incremental_vacuum")
|
|
1081
|
+
conn.commit()
|
|
1082
|
+
finally:
|
|
1083
|
+
conn.close()
|
|
1084
|
+
after_gb = _index_bytes() / (1024 ** 3)
|
|
1085
|
+
reclaimed_gb = max(0.0, size_gb - after_gb)
|
|
1086
|
+
size_gb = after_gb
|
|
1087
|
+
except Exception:
|
|
1088
|
+
pass
|
|
1089
|
+
reclaimed_note = f" (reclaimed {reclaimed_gb:.1f} GB)" if reclaimed_gb > 0.05 else ""
|
|
1090
|
+
if (size_gb * 1024 ** 3) > hard_cap or size_gb > 60:
|
|
1091
|
+
finding(
|
|
1092
|
+
"ERROR",
|
|
1093
|
+
"database",
|
|
1094
|
+
f"local-context.db is {size_gb:.1f} GB{reclaimed_note} — over the local-index cap; "
|
|
1095
|
+
f"review indexed roots/exclusions or run clear_index (operator decision)",
|
|
1096
|
+
)
|
|
1097
|
+
else:
|
|
1098
|
+
finding(
|
|
1099
|
+
"WARN",
|
|
1100
|
+
"database",
|
|
1101
|
+
f"local-context.db is {size_gb:.1f} GB{reclaimed_note} — local index growing; review roots/exclusions",
|
|
1102
|
+
)
|
|
1059
1103
|
except Exception as exc:
|
|
1060
1104
|
finding("WARN", "database", f"Could not check local-context.db size: {exc}")
|
|
1061
1105
|
|
|
@@ -947,6 +947,131 @@ def _reconcile_finished_rows(conn, *, hours=24):
|
|
|
947
947
|
return reconciled
|
|
948
948
|
|
|
949
949
|
|
|
950
|
+
def _reconcile_replied_zombies(conn):
    """Close 'processing'/'pending' emails that were ALREADY replied to before
    the worker session marked them processed.

    Failure mode: a worker session sends the reply through
    ``nexo-send-reply.py`` but dies (exit -9) BEFORE it flips the DB row to a
    terminal status. The stuck/zombie recovery then resets the row to
    'pending' and the daemon reinjects the message, producing a DUPLICATE
    reply to the operator.

    This reconciler consults two durable signals that survive a session crash
    and, if either says the operator was already answered, closes the row as
    terminal ('processed') and logs a 'resolution' event instead of letting it
    be reinjected:

    1. ``email_events`` lifecycle markers ('replied' / 'resolution' /
       'action_done') keyed to the inbound message id.
    2. ``sent_email_events`` rows whose In-Reply-To equals, or whose
       References header contains, the inbound ``message_id``. Consulted only
       when that table exists and signal 1 found nothing.

    Matching is strictly per inbound message_id. The References check is a
    literal, case-sensitive substring test (``instr``) rather than ``LIKE``:
    Message-IDs routinely contain ``_`` (and may contain ``%``), which LIKE
    would treat as wildcards and could match a *different* message, wrongly
    closing an unanswered email.

    Args:
        conn: open connection to the email DB; rows must be addressable by
            column name. The caller is responsible for committing.

    Returns:
        A list of dicts (``email_id``, ``subject``, ``previous_status``,
        ``signal``), one per email that was closed; empty when the ``emails``
        table is missing or nothing qualified.
    """
    if not _table_exists(conn, "emails"):
        return []

    cols = _email_table_columns(conn)
    has_sent_ledger = _table_exists(conn, "sent_email_events")

    rows = conn.execute(
        """
        SELECT message_id, subject, status
        FROM emails
        WHERE status IN ('processing', 'pending')
        """
    ).fetchall()

    sanitized = []
    for row in rows:
        mid = row["message_id"]
        if not mid:
            continue

        signal = None
        sent_reference = None

        # Signal 1 — in-DB lifecycle marker keyed to this inbound MID.
        # The aggregate always yields exactly one row; ts is NULL when no
        # matching event exists.
        ev = conn.execute(
            """
            SELECT event, MAX(timestamp) AS ts
            FROM email_events
            WHERE email_id = ?
              AND event IN ('replied', 'resolution', 'action_done')
            """,
            (mid,),
        ).fetchone()
        if ev and ev["ts"]:
            signal = f"email_event:{ev['event']}"
            sent_reference = ev["ts"]

        # Signal 2 — durable outbound ledger pointing back at this MID.
        if signal is None and has_sent_ledger:
            sent = conn.execute(
                """
                SELECT message_id AS sent_mid, sent_at
                FROM sent_email_events
                WHERE in_reply_to = ?
                   OR instr(COALESCE(references_header, ''), ?) > 0
                ORDER BY sent_at DESC
                LIMIT 1
                """,
                (mid, mid),
            ).fetchone()
            if sent:
                signal = "sent_email_events"
                sent_reference = sent["sent_at"]

        if signal is None:
            continue

        # Only touch optional columns that this schema version actually has.
        updates = ["status = 'processed'"]
        if "completed_at" in cols:
            updates.append(
                "completed_at = COALESCE(completed_at, datetime('now','localtime'))"
            )
        if "error" in cols:
            updates.append("error = NULL")
        conn.execute(
            f"""
            UPDATE emails
            SET {', '.join(updates)}
            WHERE message_id = ?
              AND status IN ('processing', 'pending')
            """,
            (mid,),
        )
        _insert_event(
            conn,
            mid,
            "resolution",
            "Sanitized: reply already sent before BD close (zombie reconcile)",
            {
                "reason": "already_replied_reconciled",
                "previous_status": row["status"],
                "signal": signal,
                "sent_reference": sent_reference,
            },
        )
        log.warning(
            f"Sanitized already-replied zombie email: status={row['status']} "
            f"signal={signal} subj={(row['subject'] or '')[:40]} [{mid}] — "
            f"closed as 'processed', not reinjected"
        )
        sanitized.append(
            {
                "email_id": mid,
                "subject": row["subject"],
                "previous_status": row["status"],
                "signal": signal,
            }
        )

    return sanitized
|
|
1073
|
+
|
|
1074
|
+
|
|
950
1075
|
def _recent_debt_flagged(conn, email_id, *, hours=6):
|
|
951
1076
|
row = conn.execute(
|
|
952
1077
|
"""
|
|
@@ -1153,6 +1278,9 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
|
|
|
1153
1278
|
return ""
|
|
1154
1279
|
live_reconciled = _reconcile_processing_rows(conn)
|
|
1155
1280
|
finished_reconciled = _reconcile_finished_rows(conn)
|
|
1281
|
+
# Close already-replied zombies BEFORE the 2h stuck-recovery below resets
|
|
1282
|
+
# them to 'pending', so the daemon never reinjects a MID we already answered.
|
|
1283
|
+
replied_sanitized = _reconcile_replied_zombies(conn)
|
|
1156
1284
|
|
|
1157
1285
|
items = []
|
|
1158
1286
|
now_label = datetime.now().isoformat(timespec="seconds")
|
|
@@ -1278,14 +1406,17 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
|
|
|
1278
1406
|
conn.commit()
|
|
1279
1407
|
conn.close()
|
|
1280
1408
|
|
|
1281
|
-
if not items:
|
|
1409
|
+
if not items and not replied_sanitized:
|
|
1282
1410
|
return ""
|
|
1283
1411
|
|
|
1284
|
-
lines = [
|
|
1285
|
-
|
|
1286
|
-
lines.append(
|
|
1287
|
-
|
|
1288
|
-
|
|
1412
|
+
lines = []
|
|
1413
|
+
if items:
|
|
1414
|
+
lines.append("== PENDING EMAIL DEBT DETECTED ==")
|
|
1415
|
+
lines.append("Prioritize closing or clarifying these threads before ignoring them:")
|
|
1416
|
+
for item in items[:max_items]:
|
|
1417
|
+
lines.append(f"- {item['label']} ({item['detail']})")
|
|
1418
|
+
if len(items) > max_items:
|
|
1419
|
+
lines.append(f"- ... and {len(items) - max_items} more item(s)")
|
|
1289
1420
|
if recovered:
|
|
1290
1421
|
lines.append("")
|
|
1291
1422
|
lines.append(f"Auto-recovery applied: {len(recovered)} processing-stuck email(s) were reset to pending.")
|
|
@@ -1294,6 +1425,12 @@ def scan_debt(db_path=EMAIL_DB_PATH, *, max_items=5):
|
|
|
1294
1425
|
lines.append(
|
|
1295
1426
|
f"Reconciled {len(sent_reconciled)} processing email(s) with already-sent reply events; no re-open applied."
|
|
1296
1427
|
)
|
|
1428
|
+
if replied_sanitized:
|
|
1429
|
+
lines.append("")
|
|
1430
|
+
lines.append(
|
|
1431
|
+
f"Sanitized {len(replied_sanitized)} already-replied email(s): closed as 'processed' "
|
|
1432
|
+
f"to prevent duplicate operator replies (no reinjection)."
|
|
1433
|
+
)
|
|
1297
1434
|
total_reconciled = len(live_reconciled) + len(finished_reconciled)
|
|
1298
1435
|
if total_reconciled:
|
|
1299
1436
|
lines.append(f"Reconciled {total_reconciled} email(s) with inconsistent lifecycle state.")
|
package/src/server.py
CHANGED
|
@@ -139,6 +139,7 @@ from tools_api_call import (
|
|
|
139
139
|
from runtime_versioning import (
|
|
140
140
|
RestartRequiredMiddleware,
|
|
141
141
|
build_mcp_status,
|
|
142
|
+
maybe_selfheal_on_boot,
|
|
142
143
|
prime_process_fingerprint,
|
|
143
144
|
prime_process_version,
|
|
144
145
|
)
|
|
@@ -3264,4 +3265,10 @@ if __name__ == "__main__":
|
|
|
3264
3265
|
port=port,
|
|
3265
3266
|
on_exit=lambda: (close_local_context_db(), close_db()),
|
|
3266
3267
|
)
|
|
3268
|
+
else:
|
|
3269
|
+
# stdio child: if we booted already-stale (spawned right after an
|
|
3270
|
+
# update), re-exec into the new code transparently before serving —
|
|
3271
|
+
# covers the case where only allowlisted tools are called and the
|
|
3272
|
+
# per-call drift middleware would never trip. Fail-open.
|
|
3273
|
+
maybe_selfheal_on_boot(client=str(os.environ.get("NEXO_MCP_CLIENT", "") or "").strip())
|
|
3267
3274
|
mcp.run(**run_kwargs)
|