nexo-brain 7.20.0 → 7.20.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.20.0",
3
+ "version": "7.20.2",
4
4
  "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
5
5
  "author": {
6
6
  "name": "NEXO Brain",
package/README.md CHANGED
@@ -18,7 +18,11 @@
18
18
 
19
19
  [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
20
20
 
21
- Version `7.20.0` is the current packaged-runtime line. Minor release over v7.19.0 — the Local Context index now reconciles known files and folders on every service cycle, so created, modified, deleted and newly excluded local files are reflected automatically between full scans.
21
+ Version `7.20.2` is the current packaged-runtime line. Patch release over v7.20.1 — Local Context now requeues stalled work, reports real macOS/Windows background-service health, records scan errors and preserves Windows drive roots.
22
+
23
+ Previously in `7.20.1`: patch release over v7.20.0 — the Local Context service now recovers from orphaned locks and mixed-version cycle failures instead of leaving the background index stuck.
24
+
25
+ Previously in `7.20.0`: minor release over v7.19.0 — the Local Context index now reconciles known files and folders on every service cycle, so created, modified, deleted and newly excluded local files are reflected automatically between full scans.
22
26
 
23
27
  Previously in `7.19.0`: minor release over v7.18.1 - bundle-managed installations (NEXO Desktop `brain-bundle/`) can now pin Brain to the host application release cycle via `NEXO_BRAIN_AUTO_UPDATE=false`, and the server auto-exits with code 75 on fingerprint mismatch so MCP clients respawn the server with the new code instead of leaving stale `server.py` processes alive.
24
28
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.20.0",
3
+ "version": "7.20.2",
4
4
  "mcpName": "io.github.wazionapps/nexo",
5
5
  "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
6
6
  "homepage": "https://nexo-brain.com",
@@ -425,6 +425,48 @@ def _mark_dir_subtree_deleted(conn, dir_path: str, deleted_at: float | None = No
425
425
  return len(rows)
426
426
 
427
427
 
428
def _record_index_error(
    conn,
    *,
    asset_id: str = "",
    path: str = "",
    phase: str,
    error_code: str,
    user_message: str,
    technical_detail: str,
    retryable: bool = True,
) -> None:
    """Insert one row describing an indexing failure into ``local_index_errors``.

    All descriptive fields are keyword-only. ``retryable`` is stored as the
    integer 1 or 0 and ``created_at`` is taken from ``now()`` at call time.
    The statement is executed on ``conn``; no commit is issued here.
    """
    retry_flag = 1 if retryable else 0
    row = (
        asset_id,
        path,
        phase,
        error_code,
        user_message,
        technical_detail,
        retry_flag,
        now(),
    )
    insert_sql = """
        INSERT INTO local_index_errors(asset_id, path, phase, error_code, user_message, technical_detail, retryable, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """
    conn.execute(insert_sql, row)
446
+
447
+
448
+ def _record_scan_error(conn, stats: dict | None, path: str, phase: str, exc: Exception) -> None:
449
+ if stats is not None:
450
+ stats["errors"] = int(stats.get("errors", 0) or 0) + 1
451
+ logged = int(stats.get("_errors_logged", 0) or 0)
452
+ if logged >= 20:
453
+ return
454
+ stats["_errors_logged"] = logged + 1
455
+ _record_index_error(
456
+ conn,
457
+ path=path,
458
+ phase=phase,
459
+ error_code=type(exc).__name__,
460
+ user_message="Algunas carpetas o archivos no se pudieron leer",
461
+ technical_detail=str(exc),
462
+ retryable=True,
463
+ )
464
+
465
+
466
+ def _public_stats(stats: dict) -> dict:
467
+ return {key: value for key, value in stats.items() if not str(key).startswith("_")}
468
+
469
+
428
470
  def enqueue_job(conn, asset_id: str, job_type: str, *, priority: int = 50) -> str:
429
471
  job_id = stable_id("job", f"{asset_id}:{job_type}")
430
472
  conn.execute(
@@ -447,6 +489,7 @@ def _iter_files(
447
489
  limit: int | None = None,
448
490
  start_after: str = "",
449
491
  seen_at: float | None = None,
492
+ stats: dict | None = None,
450
493
  ):
451
494
  seen_at = seen_at or now()
452
495
  seen_dirs: set[tuple[int, int]] = set()
@@ -461,7 +504,8 @@ def _iter_files(
461
504
  continue
462
505
  try:
463
506
  st = current.stat()
464
- except Exception:
507
+ except Exception as exc:
508
+ _record_scan_error(conn, stats, str(current), "quick_index", exc)
465
509
  continue
466
510
  key = (getattr(st, "st_dev", 0), getattr(st, "st_ino", 0))
467
511
  if key in seen_dirs:
@@ -470,7 +514,8 @@ def _iter_files(
470
514
  _upsert_dir(conn, root_id, current, seen_at, st)
471
515
  try:
472
516
  entries = sorted(current.iterdir(), key=lambda item: str(item).lower())
473
- except Exception:
517
+ except Exception as exc:
518
+ _record_scan_error(conn, stats, str(current), "quick_index", exc)
474
519
  continue
475
520
  dirs: list[Path] = []
476
521
  for entry in entries:
@@ -577,8 +622,8 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
577
622
  continue
578
623
  st = file_path.stat()
579
624
  fingerprint = quick_fingerprint(file_path, st)
580
- except Exception:
581
- stats["errors"] += 1
625
+ except Exception as exc:
626
+ _record_scan_error(conn, stats, path, "live_reconcile", exc)
582
627
  continue
583
628
  if fingerprint != row["quick_fingerprint"]:
584
629
  _, changed, state = _upsert_asset(conn, int(row["root_id"] or 0), file_path, seen_at, int(row["depth"] or 2))
@@ -647,8 +692,8 @@ def _scan_known_directory(
647
692
  if not current.is_dir():
648
693
  continue
649
694
  entries = sorted(current.iterdir(), key=lambda item: str(item).lower())
650
- except Exception:
651
- stats["errors"] += 1
695
+ except Exception as exc:
696
+ _record_scan_error(conn, stats, str(current), "live_reconcile", exc)
652
697
  continue
653
698
  scanned_dirs += 1
654
699
  stats["dirs_scanned"] += 1
@@ -679,8 +724,8 @@ def _scan_known_directory(
679
724
  stats["files_changed"] += 1
680
725
  if state != "ok":
681
726
  stats["errors"] += 1
682
- except Exception:
683
- stats["errors"] += 1
727
+ except Exception as exc:
728
+ _record_scan_error(conn, stats, str(entry), "live_reconcile", exc)
684
729
  deleted_files, deleted_dirs = _prune_missing_children(conn, current, seen_files, seen_dirs, seen_at)
685
730
  stats["files_deleted"] += deleted_files
686
731
  stats["dirs_deleted"] += deleted_dirs
@@ -731,8 +776,8 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
731
776
  continue
732
777
  st = dir_path.stat()
733
778
  fingerprint = _dir_fingerprint(dir_path, st)
734
- except Exception:
735
- stats["errors"] += 1
779
+ except Exception as exc:
780
+ _record_scan_error(conn, stats, str(dir_path), "live_reconcile", exc)
736
781
  continue
737
782
  if fingerprint != row["quick_fingerprint"]:
738
783
  stats["changed"] += 1
@@ -773,9 +818,18 @@ def reconcile_live_changes(
773
818
  + int(dir_stats.get("dirs_deleted", 0))
774
819
  + int(dir_stats.get("excluded_dirs", 0))
775
820
  )
776
- if changed_total:
777
- log_event("info", "live_reconcile_finished", "Local memory live changes reconciled", assets=asset_stats, dirs=dir_stats)
778
- return {"ok": True, "assets": asset_stats, "dirs": dir_stats}
821
+ error_total = int(asset_stats.get("errors", 0) or 0) + int(dir_stats.get("errors", 0) or 0)
822
+ public_asset_stats = _public_stats(asset_stats)
823
+ public_dir_stats = _public_stats(dir_stats)
824
+ if changed_total or error_total:
825
+ log_event(
826
+ "warn" if error_total else "info",
827
+ "live_reconcile_finished",
828
+ "Local memory live changes reconciled",
829
+ assets=public_asset_stats,
830
+ dirs=public_dir_stats,
831
+ )
832
+ return {"ok": True, "assets": public_asset_stats, "dirs": public_dir_stats}
779
833
 
780
834
 
781
835
  def scan_once(*, limit: int | None = None) -> dict:
@@ -814,6 +868,7 @@ def scan_once(*, limit: int | None = None) -> dict:
814
868
  limit=limit,
815
869
  start_after=str(checkpoint["current_path"] or ""),
816
870
  seen_at=cycle_started_at,
871
+ stats=totals,
817
872
  ):
818
873
  asset_id, changed, state = _upsert_asset(conn, root_id, file_path, cycle_started_at, int(root["depth"] or 2))
819
874
  last_seen_path = norm_path(file_path)
@@ -833,7 +888,7 @@ def scan_once(*, limit: int | None = None) -> dict:
833
888
  path=redact_path(str(root_path)),
834
889
  )
835
890
  if last_seen_path:
836
- _save_checkpoint(conn, root_id, last_seen_path, cycle_started_at=cycle_started_at, totals=totals)
891
+ _save_checkpoint(conn, root_id, last_seen_path, cycle_started_at=cycle_started_at, totals=_public_stats(totals))
837
892
  else:
838
893
  rows = conn.execute(
839
894
  "SELECT asset_id FROM local_assets WHERE root_id=? AND status='active' AND last_seen_at < ?",
@@ -850,8 +905,9 @@ def scan_once(*, limit: int | None = None) -> dict:
850
905
  (now(), now(), root_id),
851
906
  )
852
907
  conn.commit()
853
- log_event("info", "scan_finished", "Local memory scan finished", **totals)
854
- return {"ok": True, **totals}
908
+ public_totals = _public_stats(totals)
909
+ log_event("warn" if public_totals.get("errors") else "info", "scan_finished", "Local memory scan finished", **public_totals)
910
+ return {"ok": True, **public_totals}
855
911
 
856
912
 
857
913
  def _latest_version_id(conn, asset_id: str) -> str:
@@ -913,11 +969,35 @@ def _replace_entities(conn, asset_id: str, version_id: str, values: list[str]) -
913
969
  )
914
970
 
915
971
 
972
def _requeue_due_jobs(conn) -> dict:
    """Move retry-due failed jobs and lease-expired running jobs back to ``pending``.

    Two UPDATEs run against ``local_index_jobs``:

    * ``failed`` rows whose ``next_attempt_at`` is NULL or not later than now;
    * ``running`` rows whose non-NULL ``lease_expires_at`` is not later than now.

    Both clear ``claimed_by`` and ``lease_expires_at`` and stamp ``updated_at``.
    A ``jobs_requeued`` warning event is logged when either statement touched
    rows. Returns ``{"failed": <count>, "expired": <count>}``. No commit is
    issued here.
    """
    current = now()
    bounds = (current, current)

    failed_cursor = conn.execute(
        """
        UPDATE local_index_jobs
        SET status='pending', claimed_by='', lease_expires_at=NULL, updated_at=?
        WHERE status='failed' AND (next_attempt_at IS NULL OR next_attempt_at <= ?)
        """,
        bounds,
    )
    failed = failed_cursor.rowcount

    expired_cursor = conn.execute(
        """
        UPDATE local_index_jobs
        SET status='pending', claimed_by='', lease_expires_at=NULL, updated_at=?
        WHERE status='running' AND lease_expires_at IS NOT NULL AND lease_expires_at <= ?
        """,
        bounds,
    )
    expired = expired_cursor.rowcount

    if failed or expired:
        log_event("warn", "jobs_requeued", "Local memory recovered stalled jobs", failed=failed, expired=expired)
    return {"failed": int(failed or 0), "expired": int(expired or 0)}
993
+
994
+
916
995
  def process_jobs(*, limit: int = 100) -> dict:
917
996
  conn = _conn()
918
997
  if _is_paused():
919
998
  log_event("info", "jobs_skipped_paused", "Local memory jobs skipped because indexing is paused")
920
999
  return {"ok": True, "paused": True, "processed": 0, "failed": 0}
1000
+ recovered = _requeue_due_jobs(conn)
921
1001
  rows = conn.execute(
922
1002
  """
923
1003
  SELECT j.*, a.path, a.depth, a.status AS asset_status
@@ -976,17 +1056,20 @@ def process_jobs(*, limit: int = 100) -> dict:
976
1056
  """,
977
1057
  (now() + 3600, type(exc).__name__, now(), job_id),
978
1058
  )
979
- conn.execute(
980
- """
981
- INSERT INTO local_index_errors(asset_id, path, phase, error_code, user_message, technical_detail, retryable, created_at)
982
- VALUES (?, ?, ?, ?, ?, ?, 1, ?)
983
- """,
984
- (asset_id, row["path"], job_type, type(exc).__name__, "Algunos archivos no se pudieron leer", str(exc), now()),
1059
+ _record_index_error(
1060
+ conn,
1061
+ asset_id=asset_id,
1062
+ path=row["path"],
1063
+ phase=job_type,
1064
+ error_code=type(exc).__name__,
1065
+ user_message="Algunos archivos no se pudieron leer",
1066
+ technical_detail=str(exc),
1067
+ retryable=True,
985
1068
  )
986
1069
  conn.commit()
987
1070
  if processed or failed:
988
1071
  log_event("info", "jobs_processed", "Local memory jobs processed", processed=processed, failed=failed)
989
- return {"ok": True, "processed": processed, "failed": failed}
1072
+ return {"ok": True, "processed": processed, "failed": failed, "recovered": recovered}
990
1073
 
991
1074
 
992
1075
  def run_once(
@@ -1000,6 +1083,12 @@ def run_once(
1000
1083
  ) -> dict:
1001
1084
  if root:
1002
1085
  add_root(root)
1086
+ elif (
1087
+ os.environ.get("NEXO_LOCAL_INDEX_DISABLE_DEFAULT_ROOTS", "").strip() != "1"
1088
+ and os.environ.get("NEXO_SKIP_FS_INDEX", "").strip() != "1"
1089
+ and not list_roots()
1090
+ ):
1091
+ ensure_default_roots()
1003
1092
  live_result = reconcile_live_changes(
1004
1093
  asset_limit=live_asset_limit,
1005
1094
  dir_limit=live_dir_limit,
@@ -1019,7 +1108,7 @@ def _problem_rows(conn) -> list[dict]:
1019
1108
  LIMIT 20
1020
1109
  """
1021
1110
  ).fetchall()
1022
- return [
1111
+ problems = [
1023
1112
  {
1024
1113
  "user_message": row["user_message"],
1025
1114
  "recommended_action": "NEXO lo volvera a intentar mas tarde" if row["retryable"] else "Revisa permisos o archivo",
@@ -1033,6 +1122,35 @@ def _problem_rows(conn) -> list[dict]:
1033
1122
  }
1034
1123
  for row in rows
1035
1124
  ]
1125
+ last_success = conn.execute(
1126
+ "SELECT MAX(created_at) AS created_at FROM local_index_logs WHERE event='service_cycle_finished'"
1127
+ ).fetchone()["created_at"] or 0
1128
+ service_rows = conn.execute(
1129
+ """
1130
+ SELECT created_at, level, event, message, metadata_json
1131
+ FROM local_index_logs
1132
+ WHERE event IN ('service_cycle_failed', 'service_cycle_compat_fallback', 'service_cycle_skipped_lock')
1133
+ AND created_at > ?
1134
+ ORDER BY id DESC
1135
+ LIMIT 5
1136
+ """,
1137
+ (last_success,),
1138
+ ).fetchall()
1139
+ problems.extend(
1140
+ {
1141
+ "user_message": "La memoria local tuvo un problema temporal y NEXO la reintentara automaticamente",
1142
+ "recommended_action": "No tienes que hacer nada. Si se repite, abre soporte y diagnostico para ver el detalle.",
1143
+ "technical_detail": f"{row['event']}: {row['message']} {row['metadata_json']}",
1144
+ "support_code": row["event"],
1145
+ "severity": "warning" if row["level"] == "warn" else "error",
1146
+ "retryable": True,
1147
+ "path": "",
1148
+ "phase": "service",
1149
+ "created_at": row["created_at"],
1150
+ }
1151
+ for row in service_rows
1152
+ )
1153
+ return problems
1036
1154
 
1037
1155
 
1038
1156
  def _command_output(args: list[str], *, timeout: int = 2) -> tuple[int, str, str]:
@@ -1072,6 +1190,7 @@ def _macos_local_index_service_status() -> dict:
1072
1190
  running = False
1073
1191
  active_process = False
1074
1192
  pid = ""
1193
+ launchctl_status = ""
1075
1194
 
1076
1195
  code, stdout, _ = _command_output(["launchctl", "list"], timeout=2)
1077
1196
  if code == 0:
@@ -1080,6 +1199,7 @@ def _macos_local_index_service_status() -> dict:
1080
1199
  if len(parts) >= 3 and parts[-1] == LOCAL_INDEX_SERVICE_LABEL:
1081
1200
  installed = True
1082
1201
  pid = parts[0]
1202
+ launchctl_status = parts[1]
1083
1203
  running = True
1084
1204
  active_process = pid.isdigit() and int(pid) > 0
1085
1205
  break
@@ -1099,6 +1219,7 @@ def _macos_local_index_service_status() -> dict:
1099
1219
  "manager": "launchagent",
1100
1220
  "label": LOCAL_INDEX_SERVICE_LABEL,
1101
1221
  "pid": pid,
1222
+ "last_exit_code": launchctl_status,
1102
1223
  "config_path": str(plist_path),
1103
1224
  }
1104
1225
 
@@ -1106,11 +1227,22 @@ def _macos_local_index_service_status() -> dict:
1106
1227
  def _windows_local_index_service_status() -> dict:
1107
1228
  command = (
1108
1229
  "$task = Get-ScheduledTask -TaskName 'NEXO Local Memory' -ErrorAction SilentlyContinue; "
1109
- "if ($task) { Write-Output $task.State }"
1230
+ "$info = if ($task) { Get-ScheduledTaskInfo -TaskName 'NEXO Local Memory' -ErrorAction SilentlyContinue }; "
1231
+ "if ($task) { "
1232
+ "$lastRun = if ($info -and $info.LastRunTime) { $info.LastRunTime.ToString('o') } else { '' }; "
1233
+ "$nextRun = if ($info -and $info.NextRunTime) { $info.NextRunTime.ToString('o') } else { '' }; "
1234
+ "$lastResult = if ($info) { [string]$info.LastTaskResult } else { '' }; "
1235
+ "Write-Output ($task.State.ToString() + '|' + $lastResult + '|' + $lastRun + '|' + $nextRun) "
1236
+ "}"
1110
1237
  )
1111
1238
  code, stdout, _ = _command_output(["powershell", "-NoProfile", "-Command", command], timeout=4)
1112
- task_state = stdout.strip()
1239
+ raw = stdout.strip()
1240
+ parts = raw.split("|") if "|" in raw else [raw]
1241
+ task_state = parts[0].strip() if parts else ""
1113
1242
  task_state_key = task_state.lower()
1243
+ last_task_result = parts[1].strip() if len(parts) > 1 else ""
1244
+ last_run_time = parts[2].strip() if len(parts) > 2 else ""
1245
+ next_run_time = parts[3].strip() if len(parts) > 3 else ""
1114
1246
  installed = code == 0 and bool(task_state)
1115
1247
  active_process = task_state_key == "running"
1116
1248
  if not active_process:
@@ -1123,6 +1255,9 @@ def _windows_local_index_service_status() -> dict:
1123
1255
  "manager": "scheduled_task",
1124
1256
  "task_name": LOCAL_INDEX_WINDOWS_TASK,
1125
1257
  "task_state": task_state,
1258
+ "last_task_result": last_task_result,
1259
+ "last_run_time": last_run_time,
1260
+ "next_run_time": next_run_time,
1126
1261
  }
1127
1262
 
1128
1263
 
@@ -1165,15 +1300,103 @@ def _local_index_service_status() -> dict:
1165
1300
  return service
1166
1301
 
1167
1302
 
1303
+ def _service_cycle_observation(conn) -> dict:
1304
+ last_success = conn.execute(
1305
+ "SELECT MAX(created_at) AS created_at FROM local_index_logs WHERE event='service_cycle_finished'"
1306
+ ).fetchone()["created_at"] or 0
1307
+ latest = conn.execute(
1308
+ """
1309
+ SELECT created_at, event, level, message, metadata_json
1310
+ FROM local_index_logs
1311
+ WHERE event IN ('service_cycle_finished', 'service_cycle_failed', 'service_cycle_compat_fallback', 'service_cycle_skipped_lock')
1312
+ ORDER BY id DESC
1313
+ LIMIT 1
1314
+ """
1315
+ ).fetchone()
1316
+ latest_error = conn.execute(
1317
+ """
1318
+ SELECT created_at, event, level, message, metadata_json
1319
+ FROM local_index_logs
1320
+ WHERE event IN ('service_cycle_failed', 'service_cycle_compat_fallback', 'service_cycle_skipped_lock')
1321
+ AND created_at > ?
1322
+ ORDER BY id DESC
1323
+ LIMIT 1
1324
+ """,
1325
+ (last_success,),
1326
+ ).fetchone()
1327
+ observation = {
1328
+ "last_success_at": float(last_success or 0),
1329
+ "last_error_at": 0,
1330
+ "last_error_code": "",
1331
+ "last_error_detail": "",
1332
+ "healthy": latest_error is None,
1333
+ }
1334
+ if latest:
1335
+ observation["last_heartbeat_at"] = float(latest["created_at"] or 0)
1336
+ if latest_error:
1337
+ observation["last_error_at"] = float(latest_error["created_at"] or 0)
1338
+ observation["last_error_code"] = latest_error["event"]
1339
+ observation["last_error_detail"] = f"{latest_error['message']} {latest_error['metadata_json']}"
1340
+ return observation
1341
+
1342
+
1343
+ def _service_scheduler_has_error(service: dict) -> bool:
1344
+ if service.get("manager") == "launchagent":
1345
+ code = str(service.get("last_exit_code") or "").strip()
1346
+ return bool(code and code not in {"0", "-"})
1347
+ if service.get("manager") == "scheduled_task":
1348
+ code = str(service.get("last_task_result") or "").strip()
1349
+ return bool(code and code not in {"0"})
1350
+ return False
1351
+
1352
+
1353
+ def _service_problem(service: dict) -> dict | None:
1354
+ if not service.get("installed"):
1355
+ return {
1356
+ "support_code": "local_index_service_not_installed",
1357
+ "user_message": "La memoria local aun no tiene activo el servicio en segundo plano",
1358
+ "recommended_action": "Reabre NEXO Desktop o actualiza a la ultima version para instalarlo automaticamente.",
1359
+ "technical_detail": f"manager={service.get('manager')} platform={service.get('platform')}",
1360
+ }
1361
+ if not service.get("running"):
1362
+ return {
1363
+ "support_code": "local_index_service_not_running",
1364
+ "user_message": "La memoria local no se esta actualizando en segundo plano",
1365
+ "recommended_action": "NEXO intentara recuperarlo automaticamente. Si se repite, abre soporte y diagnostico.",
1366
+ "technical_detail": f"manager={service.get('manager')} platform={service.get('platform')}",
1367
+ }
1368
+ if _service_scheduler_has_error(service):
1369
+ code = service.get("last_exit_code") or service.get("last_task_result") or ""
1370
+ return {
1371
+ "support_code": "local_index_service_last_run_failed",
1372
+ "user_message": "La ultima comprobacion de memoria local no termino correctamente",
1373
+ "recommended_action": "NEXO lo volvera a intentar automaticamente.",
1374
+ "technical_detail": f"last_result={code}",
1375
+ }
1376
+ if not service.get("healthy", True):
1377
+ return {
1378
+ "support_code": service.get("last_error_code") or "local_index_service_failed",
1379
+ "user_message": "La memoria local tuvo un problema temporal y NEXO la reintentara automaticamente",
1380
+ "recommended_action": "No tienes que hacer nada. Si se repite, abre soporte y diagnostico para ver el detalle.",
1381
+ "technical_detail": service.get("last_error_detail") or "",
1382
+ }
1383
+ return None
1384
+
1385
+
1168
1386
  def status() -> dict:
1169
1387
  conn = _conn()
1170
1388
  paused = _is_paused()
1171
1389
  assets = conn.execute(
1172
1390
  "SELECT COUNT(*) AS total, SUM(CASE WHEN status='active' THEN 1 ELSE 0 END) AS active FROM local_assets"
1173
1391
  ).fetchone()
1174
- pending = conn.execute("SELECT COUNT(*) AS total FROM local_index_jobs WHERE status='pending'").fetchone()["total"]
1175
- done = conn.execute("SELECT COUNT(*) AS total FROM local_index_jobs WHERE status='done'").fetchone()["total"]
1176
- total_jobs = pending + done
1392
+ job_rows = conn.execute("SELECT status, COUNT(*) AS total FROM local_index_jobs GROUP BY status").fetchall()
1393
+ job_counts = {row["status"]: int(row["total"] or 0) for row in job_rows}
1394
+ pending = int(job_counts.get("pending", 0) or 0)
1395
+ running_jobs = int(job_counts.get("running", 0) or 0)
1396
+ failed_jobs = int(job_counts.get("failed", 0) or 0)
1397
+ done = int(job_counts.get("done", 0) or 0)
1398
+ active_jobs = pending + running_jobs + failed_jobs
1399
+ total_jobs = active_jobs + done
1177
1400
  percent = 100 if total_jobs == 0 else int((done / max(total_jobs, 1)) * 100)
1178
1401
  roots = list_roots()
1179
1402
  volumes = []
@@ -1183,23 +1406,42 @@ def status() -> dict:
1183
1406
  for row in by_volume:
1184
1407
  volumes.append({"id": row["volume_id"], "label": row["volume_id"] or "Disk", "files": row["files"], "status": "active"})
1185
1408
  service = _local_index_service_status()
1186
- service["state"] = "paused" if paused else ("idle" if pending == 0 else "indexing")
1409
+ service.update(_service_cycle_observation(conn))
1410
+ problem = _service_problem(service)
1411
+ service["healthy"] = problem is None
1412
+ service["state"] = "paused" if paused else ("attention" if problem else ("idle" if active_jobs == 0 else "indexing"))
1413
+ problems = _problem_rows(conn)
1414
+ if problem:
1415
+ problems.insert(0, {
1416
+ "user_message": problem["user_message"],
1417
+ "recommended_action": problem["recommended_action"],
1418
+ "technical_detail": problem["technical_detail"],
1419
+ "support_code": problem["support_code"],
1420
+ "severity": "warning",
1421
+ "retryable": True,
1422
+ "path": "",
1423
+ "phase": "service",
1424
+ "created_at": now(),
1425
+ })
1187
1426
  return {
1188
1427
  "ok": True,
1189
1428
  "service": service,
1190
1429
  "global": {
1191
- "phase": "paused" if paused else ("idle" if pending == 0 else "light_extraction"),
1430
+ "phase": "paused" if paused else ("service_attention" if problem else ("idle" if active_jobs == 0 else "light_extraction")),
1192
1431
  "percent": percent,
1193
1432
  "files_found": int(assets["total"] or 0),
1194
1433
  "files_processed": int(done or 0),
1195
- "changes_pending": int(pending or 0),
1434
+ "changes_pending": int(active_jobs or 0),
1435
+ "jobs_pending": pending,
1436
+ "jobs_running": running_jobs,
1437
+ "jobs_failed": failed_jobs,
1196
1438
  "elapsed_seconds": 0,
1197
1439
  "eta_seconds": None,
1198
1440
  },
1199
1441
  "volumes": volumes,
1200
1442
  "roots": roots,
1201
1443
  "exclusions": list_exclusions(),
1202
- "problems": _problem_rows(conn),
1444
+ "problems": problems,
1203
1445
  "permissions": [],
1204
1446
  "models": model_status()["models"],
1205
1447
  "support_log_available": True,
@@ -15,7 +15,15 @@ def now() -> float:
15
15
 
16
16
 
17
17
  def norm_path(path: str | os.PathLike[str]) -> str:
18
- return str(Path(path).expanduser()).rstrip(os.sep)
18
+ text = str(Path(path).expanduser())
19
+ if re.match(r"^[A-Za-z]:[\\/]*$", text):
20
+ return f"{text[0].upper()}:\\"
21
+ if text in {"/", "\\"}:
22
+ return text
23
+ stripped = text.rstrip("/\\")
24
+ if re.match(r"^[A-Za-z]:$", stripped):
25
+ return f"{stripped[0].upper()}:\\"
26
+ return stripped or text
19
27
 
20
28
 
21
29
  def stable_id(prefix: str, value: str) -> str:
@@ -70,6 +70,57 @@ TRANSIENT_ERROR_KINDS = {
70
70
  }
71
71
  REQUIRED_PROTOCOL_SUMMARY_KEYS = ("guard_check", "heartbeat", "change_log")
72
72
 
73
# Compact few-shot block appended to the prompt when the previous attempt
# failed with the `json_schema` error kind. It spells out the minimum JSON
# shape so the model sees the exact contract that `_is_valid_extraction`
# enforces. The literal `{{SESSION_ID}}` is a placeholder that must be
# substituted before the text is sent. Kept as a plain string to avoid
# pulling in a template engine for a one-shot block.
JSON_SCHEMA_FEWSHOT = (
    "RETRY_HINT: the previous attempt produced JSON that did not match the "
    "Deep Sleep extraction contract. The response must be a SINGLE JSON "
    "object with the following minimum shape (extra keys allowed):\n"
    "{\n"
    ' "session_id": "<exact session id, string>",\n'
    ' "findings": [ { "type": "...", "summary": "...", "evidence": "..." } ],\n'
    ' "protocol_summary": {\n'
    ' "guard_check": { "ran": true|false, "notes": "..." },\n'
    ' "heartbeat": { "count": 0, "notes": "..." },\n'
    ' "change_log": { "entries": 0, "notes": "..." }\n'
    " }\n"
    "}\n"
    "Mandatory: session_id is a non-empty string equal to {{SESSION_ID}}; "
    "findings is a list of objects; protocol_summary contains the three "
    "object keys above. Return ONLY the JSON object, no prose, no fences."
)
94
+
95
+
96
+ def _record_protocol_debt(
97
+ session_id: str,
98
+ *,
99
+ debt_type: str,
100
+ severity: str,
101
+ evidence: str,
102
+ ) -> None:
103
+ """Best-effort registration of an extraction failure as protocol debt.
104
+
105
+ Imported lazily so the extractor still runs in environments where the
106
+ DB layer is unavailable (e.g. partial installs, unit tests). Any error
107
+ inside the debt path is swallowed: we never want a debt-logging issue
108
+ to mask the real extraction failure already being reported.
109
+ """
110
+ try:
111
+ from db._protocol import create_protocol_debt
112
+ except Exception: # pragma: no cover - best effort
113
+ return
114
+ try:
115
+ create_protocol_debt(
116
+ session_id,
117
+ debt_type,
118
+ severity=severity,
119
+ evidence=evidence[:3500],
120
+ )
121
+ except Exception as exc: # pragma: no cover - best effort
122
+ print(f" Warning: could not record protocol_debt: {exc}", file=sys.stderr)
123
+
73
124
 
74
125
  def _classify_cli_result(result) -> tuple[str, str]:
75
126
  """Return (kind, short_message) describing a failed automation backend call.
@@ -211,11 +262,18 @@ def analyze_session(
211
262
  date_dir: Path,
212
263
  shared_context_file: Path | None,
213
264
  session_txt_map: dict[str, str] | None = None,
265
+ *,
266
+ prior_error_kind: str = "",
214
267
  ) -> tuple[dict | None, str | None]:
215
268
  """Send a session to the automation backend for extraction analysis.
216
269
 
217
270
  Returns (parsed_result, error_kind). `error_kind` is only set on failure.
218
271
  See `_classify_cli_result` for possible values.
272
+
273
+ ``prior_error_kind`` is consumed by the retry path: when the previous
274
+ attempt failed validation with ``json_schema`` we append a few-shot of
275
+ the contract so the model sees the exact shape it must produce instead
276
+ of repeating the same structurally wrong payload.
219
277
  """
220
278
  session_file = find_session_file(session_id, date_dir, session_txt_map=session_txt_map)
221
279
  if not session_file:
@@ -236,6 +294,15 @@ def analyze_session(
236
294
  prompt = prompt_template.replace("{{CONTEXT_FILE}}", str(session_file))
237
295
  prompt = prompt.replace("{{SESSION_ID}}", session_id)
238
296
  prompt += shared_ctx_instruction
297
+ if prior_error_kind == "json_schema":
298
+ prompt += "\n\n" + JSON_SCHEMA_FEWSHOT.replace("{{SESSION_ID}}", session_id)
299
+
300
+ # Bootstrap the subagent with the day's deep-sleep dir as cwd so its
301
+ # default Read allowlist already covers the session transcript, the
302
+ # shared context, and the day's working files. Without this, the CLI
303
+ # subprocess inherits the parent's cwd (often "/") and fails with
304
+ # `cannot_comply` the first time it tries to Read the session file.
305
+ subagent_cwd = str(date_dir) if date_dir and Path(date_dir).exists() else None
239
306
 
240
307
  try:
241
308
  json_system_prompt = render_core_prompt(
@@ -246,6 +313,7 @@ def analyze_session(
246
313
  result = run_automation_prompt(
247
314
  prompt,
248
315
  caller="deep-sleep/extract",
316
+ cwd=subagent_cwd,
249
317
  timeout=CLAUDE_TIMEOUT,
250
318
  output_format="text",
251
319
  append_system_prompt=json_system_prompt,
@@ -276,6 +344,7 @@ def analyze_session(
276
344
  convert_result = run_automation_prompt(
277
345
  convert_prompt,
278
346
  caller="deep-sleep/extract",
347
+ cwd=subagent_cwd,
279
348
  timeout=120,
280
349
  output_format="text",
281
350
  append_system_prompt=json_system_prompt,
@@ -471,6 +540,7 @@ def main():
471
540
  date_dir,
472
541
  shared_context_file,
473
542
  session_txt_map=session_txt_map,
543
+ prior_error_kind=last_error_kind,
474
544
  )
475
545
  if result:
476
546
  break
@@ -522,6 +592,21 @@ def main():
522
592
  }
523
593
  all_extractions.append(failed_entry)
524
594
  _save_checkpoint(checkpoint_file, failed_entry)
595
+ # Surface deterministic extractor failures as protocol debt so
596
+ # the aggregate self-audit cannot silently absorb the pattern.
597
+ # Severity escalates once the session is poisoned because by
598
+ # then it stops being a per-run hiccup and becomes a recurring
599
+ # runtime issue worth a louder signal.
600
+ _record_protocol_debt(
601
+ session_id,
602
+ debt_type=f"deep-sleep.extract.{last_error_kind}",
603
+ severity="error" if state == "poisoned" else "warn",
604
+ evidence=(
605
+ f"date={target_date} state={state} attempts={new_count}/"
606
+ f"{MAX_POISON_ATTEMPTS} kind={last_error_kind} "
607
+ f"checkpoint={checkpoint_file}"
608
+ ),
609
+ )
525
610
  if state == "poisoned":
526
611
  poisoned += 1
527
612
 
@@ -51,6 +51,34 @@ def log(message: str) -> None:
51
51
  handle.write(line + "\n")
52
52
 
53
53
 
54
def _log_event_best_effort(level: str, event: str, message: str, **metadata) -> None:
    """Forward to ``log_event``; if that raises, write the failure to the text log instead.

    Exceptions from ``log_event`` never propagate to the caller.
    """
    try:
        log_event(level, event, message, **metadata)
    except Exception as failure:
        reason = f"{type(failure).__name__}: {failure}"
        log(f"ERROR: failed to record local-index event {event}: {reason}")
59
+
60
+
61
+ def _read_lock() -> dict:
62
+ try:
63
+ return json.loads(LOCK_FILE.read_text(encoding="utf-8"))
64
+ except Exception:
65
+ return {}
66
+
67
+
68
+ def _pid_running(pid: int) -> bool:
69
+ if pid <= 0:
70
+ return False
71
+ try:
72
+ os.kill(pid, 0)
73
+ except ProcessLookupError:
74
+ return False
75
+ except PermissionError:
76
+ return True
77
+ except OSError:
78
+ return False
79
+ return True
80
+
81
+
54
82
  def acquire_lock() -> bool:
55
83
  try:
56
84
  fd = os.open(str(LOCK_FILE), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
@@ -59,9 +87,16 @@ def acquire_lock() -> bool:
59
87
  return True
60
88
  except FileExistsError:
61
89
  try:
62
- age = time.time() - LOCK_FILE.stat().st_mtime
90
+ lock = _read_lock()
91
+ pid = int(lock.get("pid") or 0)
92
+ age = time.time() - float(lock.get("created_at") or LOCK_FILE.stat().st_mtime)
93
+ if pid and not _pid_running(pid):
94
+ LOCK_FILE.unlink(missing_ok=True)
95
+ log(f"Removed stale local-index lock for dead pid {pid}.")
96
+ return acquire_lock()
63
97
  if age > LOCK_STALE_SECONDS:
64
98
  LOCK_FILE.unlink(missing_ok=True)
99
+ log(f"Removed stale local-index lock older than {int(age)} seconds.")
65
100
  return acquire_lock()
66
101
  except Exception:
67
102
  pass
@@ -70,31 +105,54 @@ def acquire_lock() -> bool:
70
105
 
71
106
def release_lock() -> None:
    """Delete ``LOCK_FILE`` unless it records a pid other than this process.

    A lock with no recorded pid (or pid 0) is removed. Any error while
    reading or deleting the lock is ignored.
    """
    try:
        owner_pid = int(_read_lock().get("pid") or 0)
        held_by_other = bool(owner_pid) and owner_pid != os.getpid()
        if not held_by_other:
            LOCK_FILE.unlink(missing_ok=True)
    except Exception:
        # Best effort: releasing must never raise into the caller's cleanup path.
        pass
76
115
 
77
116
 
78
- def main() -> int:
79
- if not acquire_lock():
80
- log("Skipped: previous local-index cycle is still running.")
81
- return 0
117
def _run_index_cycle() -> dict:
    """Run one ``api.run_once`` cycle, retrying without live limits on a signature mismatch.

    The first call passes the scan, process and live-reconcile limits. If it
    raises a ``TypeError`` whose message names one of the live-limit keyword
    arguments, a ``service_cycle_compat_fallback`` warning is recorded and
    ``api.run_once`` is called again with only ``limit`` and
    ``process_limit``. Any other ``TypeError`` is re-raised.
    """
    live_limits = {
        "live_asset_limit": LIVE_ASSET_LIMIT,
        "live_dir_limit": LIVE_DIR_LIMIT,
        "live_file_limit": LIVE_FILE_LIMIT,
    }
    try:
        return api.run_once(limit=SCAN_LIMIT, process_limit=PROCESS_LIMIT, **live_limits)
    except TypeError as exc:
        detail = str(exc)
        if all(kwarg not in detail for kwarg in live_limits):
            # Unrelated TypeError: not a signature mismatch, let it surface.
            raise
        _log_event_best_effort(
            "warn",
            "service_cycle_compat_fallback",
            "Local memory service used compatibility fallback",
            error=detail,
        )
        log(f"Compatibility fallback: api.run_once does not accept live reconcile limits ({detail}).")
        return api.run_once(limit=SCAN_LIMIT, process_limit=PROCESS_LIMIT)
139
+
140
+
141
def main() -> int:
    """Run one locked local-index service cycle and return the process exit code.

    Returns 0 when the lock is already held (the skip is logged), 0 when the
    cycle result has a truthy ``"ok"``, and 2 when the result is not ok or any
    exception is raised. The lock is released on every path that acquired it.
    """
    if not acquire_lock():
        log("Skipped: previous local-index cycle is still running.")
        _log_event_best_effort(
            "warn",
            "service_cycle_skipped_lock",
            "Local memory service skipped because a previous cycle is still running",
        )
        return 0

    try:
        roots_disabled = os.environ.get("NEXO_LOCAL_INDEX_DISABLE_DEFAULT_ROOTS", "").strip() == "1"
        if not roots_disabled:
            api.ensure_default_roots()
        result = _run_index_cycle()
        _log_event_best_effort("info", "service_cycle_finished", "Local memory service cycle finished", result=result)
        log(json.dumps(result, ensure_ascii=False, sort_keys=True))
        exit_code = 0 if result.get("ok") else 2
    except Exception as exc:
        error_name = type(exc).__name__
        # The text log gets the full message; the event stores only the type name.
        log(f"ERROR: {error_name}: {exc}")
        _log_event_best_effort("error", "service_cycle_failed", "Local memory service cycle failed", error=error_name)
        exit_code = 2
    finally:
        release_lock()
    return exit_code