nexo-brain 7.20.11 → 7.20.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  import os
5
+ import re
5
6
  import shutil
6
7
  import stat
7
8
  import hashlib
@@ -30,6 +31,12 @@ DEFAULT_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_DEPTH", "24")
30
31
  DEFAULT_EMAIL_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_EMAIL_ROOT_DEPTH", "24") or "24")
31
32
  DEFAULT_MOUNTED_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_MOUNTED_ROOT_DEPTH", "24") or "24")
32
33
  DEFAULT_SYSTEM_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_SYSTEM_ROOT_DEPTH", "24") or "24")
34
+ DEFAULT_CONTEXT_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_MAX_CHARS", "20000") or "20000")
35
+ DEFAULT_ROUTER_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_ROUTER_MAX_CHARS", "6000") or "6000")
36
+ DEFAULT_MAX_JOB_ATTEMPTS = int(os.environ.get("NEXO_LOCAL_INDEX_MAX_JOB_ATTEMPTS", "3") or "3")
37
+ INITIAL_INDEX_COMPLETE_KEY = "initial_index_complete"
38
+ INITIAL_INDEX_STARTED_AT_KEY = "initial_index_started_at"
39
+ VALID_CONTEXT_MODES = {"compact", "full"}
33
40
 
34
41
 
35
42
  def ensure_ready() -> None:
@@ -49,6 +56,7 @@ def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> di
49
56
  log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
50
57
  return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
51
58
  depth_value = 2 if depth is None else int(depth)
59
+ existing = conn.execute("SELECT id, status FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
52
60
  conn.execute(
53
61
  """
54
62
  INSERT INTO local_index_roots(root_path, display_path, mode, depth, status, created_at, updated_at)
@@ -62,6 +70,12 @@ def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> di
62
70
  """,
63
71
  (root_path, path, mode, depth_value, now(), now()),
64
72
  )
73
+ row = conn.execute("SELECT id FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
74
+ existing_status = str(existing["status"] or "") if existing else ""
75
+ if row and (not existing or existing_status in {"removed", "offline"}):
76
+ _set_state_conn(conn, _root_initial_scan_key(int(row["id"])), "0")
77
+ _set_initial_index_complete(conn, False)
78
+ _set_initial_index_started_at(conn, now())
65
79
  conn.commit()
66
80
  log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value)
67
81
  return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value}
@@ -504,8 +518,7 @@ def list_exclusions() -> list[dict]:
504
518
  return [dict(row) for row in rows]
505
519
 
506
520
 
507
- def _set_state(key: str, value: str) -> None:
508
- conn = _conn()
521
+ def _set_state_conn(conn, key: str, value: str) -> None:
509
522
  conn.execute(
510
523
  """
511
524
  INSERT INTO local_index_state(key, value, updated_at)
@@ -514,15 +527,127 @@ def _set_state(key: str, value: str) -> None:
514
527
  """,
515
528
  (key, value, now()),
516
529
  )
517
- conn.commit()
518
530
 
519
531
 
520
- def _get_state(key: str, default: str = "") -> str:
532
def _set_state(key: str, value: str) -> None:
    """Store one ``local_index_state`` entry and commit it immediately.

    Thin convenience wrapper: obtains the module connection via ``_conn()``,
    delegates the write to ``_set_state_conn`` and commits.
    """
    connection = _conn()
    _set_state_conn(connection, key, value)
    connection.commit()
536
+
537
+
538
+ def _get_state_conn(conn, key: str, default: str = "") -> str:
522
539
  row = conn.execute("SELECT value FROM local_index_state WHERE key=?", (key,)).fetchone()
523
540
  return row["value"] if row else default
524
541
 
525
542
 
543
def _get_state(key: str, default: str = "") -> str:
    """Read one state value through the module connection from ``_conn()``.

    Returns *default* when the key is not stored.
    """
    return _get_state_conn(_conn(), key, default)
546
+
547
+
548
+ def _root_initial_scan_key(root_id: int) -> str:
549
+ return f"root:{int(root_id)}:initial_scan_complete"
550
+
551
+
552
def _root_initial_scan_complete(conn, root: dict) -> bool:
    """Tell whether the first full scan of *root* has finished.

    An explicit per-root flag in ``local_index_state`` wins when present
    (``"1"`` means complete). Without a flag the answer is derived: the root
    counts as complete when it has a ``last_scan_at`` value and no
    ``quick_index`` checkpoint row remains for it.
    """
    root_id = int(root["id"])
    flag_row = conn.execute(
        "SELECT value FROM local_index_state WHERE key=?",
        (_root_initial_scan_key(root_id),),
    ).fetchone()
    if flag_row:
        return str(flag_row["value"]) == "1"

    # No explicit flag stored: fall back to checkpoint / last-scan evidence.
    open_checkpoint = conn.execute(
        "SELECT 1 FROM local_index_checkpoints WHERE root_id=? AND phase='quick_index' LIMIT 1",
        (root_id,),
    ).fetchone()
    if open_checkpoint:
        return False
    return bool(root.get("last_scan_at"))
562
+
563
+
564
def _set_root_initial_scan_complete(conn, root_id: int, complete: bool) -> None:
    """Record the per-root initial-scan flag as ``"1"`` or ``"0"``.

    Writes through ``_set_state_conn``; this function does not commit.
    """
    state_key = _root_initial_scan_key(root_id)
    flag = "1" if complete else "0"
    _set_state_conn(conn, state_key, flag)
566
+
567
+
568
def _initial_index_complete(conn) -> bool:
    """Return True when the stored global initial-index flag equals ``"1"``.

    A missing flag is read as ``"0"``.
    """
    stored = _get_state_conn(conn, INITIAL_INDEX_COMPLETE_KEY, "0")
    return stored == "1"
570
+
571
+
572
def _set_initial_index_complete(conn, complete: bool) -> None:
    """Store the global initial-index flag as ``"1"`` or ``"0"``.

    Writes through ``_set_state_conn``; this function does not commit.
    """
    flag = "1" if complete else "0"
    _set_state_conn(conn, INITIAL_INDEX_COMPLETE_KEY, flag)
574
+
575
+
576
def _set_initial_index_started_at(conn, started_at: float) -> None:
    """Store the initial-index start time as a float rendered to a string.

    A falsy *started_at* (``0``, ``None``) is replaced by ``now()``. Writes
    through ``_set_state_conn``; this function does not commit.
    """
    timestamp = started_at if started_at else now()
    _set_state_conn(conn, INITIAL_INDEX_STARTED_AT_KEY, str(float(timestamp)))
578
+
579
+
580
+ def _earliest_index_activity(conn) -> float:
581
+ candidates = []
582
+ for sql in (
583
+ "SELECT MIN(created_at) AS value FROM local_index_roots WHERE status!='removed'",
584
+ "SELECT MIN(first_seen_at) AS value FROM local_assets WHERE status!='deleted'",
585
+ "SELECT MIN(created_at) AS value FROM local_index_jobs",
586
+ "SELECT MIN(created_at) AS value FROM local_index_logs WHERE event IN ('root_added', 'scan_started', 'scan_finished', 'jobs_processed', 'service_cycle_finished')",
587
+ ):
588
+ try:
589
+ value = conn.execute(sql).fetchone()["value"] or 0
590
+ except Exception:
591
+ value = 0
592
+ if value:
593
+ candidates.append(float(value))
594
+ return min(candidates) if candidates else 0.0
595
+
596
+
597
def _ensure_initial_index_started_at(conn) -> float:
    """Return the initial-index start time, creating it when absent.

    A positive stored value is returned unchanged. Otherwise the value is
    backfilled from the earliest recorded index activity (or ``now()`` when
    there is none), persisted, committed and returned.
    """
    stored_text = _get_state_conn(conn, INITIAL_INDEX_STARTED_AT_KEY, "")
    try:
        stored = float(stored_text or 0)
    except Exception:
        # Unparseable state is treated the same as a missing value.
        stored = 0.0
    if stored > 0:
        return stored

    backfilled = _earliest_index_activity(conn) or now()
    _set_initial_index_started_at(conn, backfilled)
    conn.commit()
    return backfilled
609
+
610
+
611
+ def _active_job_count(conn) -> int:
612
+ row = conn.execute(
613
+ """
614
+ SELECT COUNT(*) AS total
615
+ FROM local_index_jobs
616
+ WHERE status IN ('pending', 'running', 'failed')
617
+ """
618
+ ).fetchone()
619
+ return int(row["total"] or 0)
620
+
621
+
622
def _refresh_initial_index_complete(conn, initial_scan: dict | None = None, active_jobs: int | None = None) -> bool:
    """Promote the global initial-index flag to complete when warranted.

    Returns True immediately if the flag is already set. Otherwise the index
    counts as complete only when the scan status reports ``complete`` and no
    active jobs remain; in that case the flag is stored and committed.

    Args:
        conn: Database connection used for reads and the flag write.
        initial_scan: Precomputed scan status; computed here when None.
        active_jobs: Precomputed active job count; queried here when None.
    """
    if _initial_index_complete(conn):
        return True

    if initial_scan is None:
        scan_state = _initial_scan_status(conn)
    else:
        scan_state = initial_scan
    if active_jobs is None:
        remaining = _active_job_count(conn)
    else:
        remaining = int(active_jobs or 0)

    if not scan_state.get("complete") or remaining != 0:
        return False
    _set_initial_index_complete(conn, True)
    conn.commit()
    return True
632
+
633
+
634
def _initial_scan_status(conn, roots: list[dict] | None = None) -> dict:
    """Summarize how far the initial scan has progressed across roots.

    Roots whose status is ``removed`` or ``offline`` are ignored; the rest
    are passed through ``_effective_scan_roots``. The scan is complete only
    when at least one root is tracked and none is still pending.

    Args:
        conn: Database connection used for the per-root and checkpoint reads.
        roots: Root rows to evaluate; ``list_roots()`` is used when None.

    Returns:
        A dict with ``complete``, ``mode``, ``pending_roots``,
        ``total_roots`` and ``checkpoint_count``.
    """
    source_rows = list_roots() if roots is None else roots
    candidates = []
    for row in source_rows:
        if str(row.get("status") or "active") in {"removed", "offline"}:
            continue
        candidates.append(dict(row))
    tracked = _effective_scan_roots(candidates)

    pending_count = sum(1 for row in tracked if not _root_initial_scan_complete(conn, row))
    checkpoint_total = conn.execute(
        "SELECT COUNT(*) AS total FROM local_index_checkpoints WHERE phase='quick_index'"
    ).fetchone()["total"] or 0

    complete = bool(tracked) and pending_count == 0
    mode = "watching_changes" if complete else "initial_indexing"
    return {
        "complete": complete,
        "mode": mode,
        "pending_roots": pending_count,
        "total_roots": len(tracked),
        "checkpoint_count": int(checkpoint_total or 0),
    }
649
+
650
+
526
651
  def pause() -> dict:
527
652
  _set_state("paused", "1")
528
653
  log_event("info", "index_paused", "Local memory indexing paused")
@@ -555,7 +680,12 @@ def _is_excluded(path: str, exclusions: list[str]) -> bool:
555
680
 
556
681
def _path_prefix(path: str) -> str:
    """Return the normalized *path* ending in exactly one trailing separator.

    An empty normalized path yields ``os.sep``; a bare ``/`` or ``\\`` root is
    returned as is. Paths containing a backslash (which includes drive-letter
    paths such as ``C:\\``) get ``\\`` as separator; all others use ``os.sep``.
    """
    normalized = norm_path(path)
    if not normalized:
        return os.sep
    if normalized in ("/", "\\"):
        return normalized
    # A drive-letter prefix like "C:\" always contains a backslash, so the
    # containment test alone selects the Windows separator.
    separator = "\\" if "\\" in normalized else os.sep
    if normalized.endswith(separator):
        return normalized
    return normalized + separator
559
689
 
560
690
 
561
691
  def _is_nested_path(path: str, parent: str) -> bool:
@@ -563,9 +693,20 @@ def _is_nested_path(path: str, parent: str) -> bool:
563
693
  base = norm_path(parent)
564
694
  if not value or not base or value == base:
565
695
  return False
566
- if base == os.sep:
567
- return value.startswith(os.sep)
568
- return value.startswith(_path_prefix(base))
696
+ value_cmp = value.replace("\\", "/")
697
+ base_cmp = base.replace("\\", "/")
698
+ if re.match(r"^[A-Za-z]:/?$", base_cmp):
699
+ base_cmp = f"{base_cmp[0].upper()}:/"
700
+ if re.match(r"^[A-Za-z]:/?$", value_cmp):
701
+ value_cmp = f"{value_cmp[0].upper()}:/"
702
+ if base_cmp != "/":
703
+ base_cmp = base_cmp.rstrip("/")
704
+ if value_cmp != "/":
705
+ value_cmp = value_cmp.rstrip("/")
706
+ if base_cmp == "/":
707
+ return value_cmp.startswith("/")
708
+ prefix = base_cmp if base_cmp.endswith("/") else f"{base_cmp}/"
709
+ return value_cmp.startswith(prefix)
569
710
 
570
711
 
571
712
  def _is_discovered_mount_path(path: str) -> bool:
@@ -614,6 +755,19 @@ def _file_type(path: Path) -> str:
614
755
  return "file"
615
756
 
616
757
 
758
def _volume_id_for_path(path: Path) -> str:
    """Derive a volume identifier for *path*.

    Resolution order, applied to the normalized path with forward slashes:
      1. Drive-letter paths -> upper-cased drive root, e.g. ``C:\\``.
      2. ``/Volumes/<x>``, ``/mnt/<x>``, ``/media/<x>`` -> the first two components.
      3. ``/run/media/<x>`` -> ``/run/media/<x>``.
      4. Otherwise ``path.anchor``, or ``/`` when the anchor is empty.
    """
    unified = norm_path(path).replace("\\", "/")

    drive = re.match(r"^([A-Za-z]):/", unified)
    if drive:
        return f"{drive.group(1).upper()}:\\"

    segments = [segment for segment in unified.split("/") if segment]
    if len(segments) >= 2 and segments[0] in {"Volumes", "mnt", "media"}:
        return f"/{segments[0]}/{segments[1]}"
    if len(segments) >= 3 and segments[0] == "run" and segments[1] == "media":
        return f"/run/media/{segments[2]}"
    return path.anchor or "/"
769
+
770
+
617
771
  def _permission_state(path: Path) -> str:
618
772
  try:
619
773
  path.stat()
@@ -744,7 +898,7 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
744
898
  normalized,
745
899
  raw_path,
746
900
  parent,
747
- path.anchor or "/",
901
+ _volume_id_for_path(path),
748
902
  _file_type(path),
749
903
  path.suffix.lower(),
750
904
  int(st.st_size),
@@ -1259,6 +1413,7 @@ def scan_once(*, limit: int | None = None) -> dict:
1259
1413
  for root in roots:
1260
1414
  root_path = Path(root["root_path"]).expanduser()
1261
1415
  root_id = int(root["id"])
1416
+ root_initial_complete = _root_initial_scan_complete(conn, dict(root))
1262
1417
  if should_skip_tree(str(root_path)) and not _allow_explicit_blocked_root(str(root_path)):
1263
1418
  conn.execute(
1264
1419
  "UPDATE local_index_roots SET status='removed', last_scan_at=?, updated_at=? WHERE id=?",
@@ -1301,6 +1456,8 @@ def scan_once(*, limit: int | None = None) -> dict:
1301
1456
  partial_root = bool(limit and seen_for_root >= limit)
1302
1457
  totals["partial"] = bool(totals["partial"] or partial_root)
1303
1458
  if partial_root:
1459
+ if not root_initial_complete:
1460
+ _set_root_initial_scan_complete(conn, root_id, False)
1304
1461
  log_event(
1305
1462
  "info",
1306
1463
  "scan_partial",
@@ -1315,11 +1472,10 @@ def scan_once(*, limit: int | None = None) -> dict:
1315
1472
  (root_id, cycle_started_at),
1316
1473
  ).fetchall()
1317
1474
  for row in rows:
1318
- conn.execute(
1319
- "UPDATE local_assets SET status='deleted', deleted_at=?, updated_at=? WHERE asset_id=?",
1320
- (now(), now(), row["asset_id"]),
1321
- )
1475
+ _mark_asset_deleted(conn, row["asset_id"])
1322
1476
  _clear_checkpoint(conn, root_id)
1477
+ if not root_initial_complete:
1478
+ _set_root_initial_scan_complete(conn, root_id, True)
1323
1479
  conn.execute(
1324
1480
  "UPDATE local_index_roots SET status='active', last_scan_at=?, updated_at=? WHERE id=?",
1325
1481
  (now(), now(), root_id),
@@ -1391,13 +1547,23 @@ def _replace_entities(conn, asset_id: str, version_id: str, values: list[str]) -
1391
1547
 
1392
1548
  def _requeue_due_jobs(conn) -> dict:
1393
1549
  current = now()
1550
+ exhausted = conn.execute(
1551
+ """
1552
+ UPDATE local_index_jobs
1553
+ SET status='done', next_attempt_at=NULL, claimed_by='', lease_expires_at=NULL, updated_at=?
1554
+ WHERE status='failed' AND attempt_count >= ?
1555
+ """,
1556
+ (current, DEFAULT_MAX_JOB_ATTEMPTS),
1557
+ ).rowcount
1394
1558
  failed = conn.execute(
1395
1559
  """
1396
1560
  UPDATE local_index_jobs
1397
1561
  SET status='pending', claimed_by='', lease_expires_at=NULL, updated_at=?
1398
- WHERE status='failed' AND (next_attempt_at IS NULL OR next_attempt_at <= ?)
1562
+ WHERE status='failed'
1563
+ AND attempt_count < ?
1564
+ AND (next_attempt_at IS NULL OR next_attempt_at <= ?)
1399
1565
  """,
1400
- (current, current),
1566
+ (current, DEFAULT_MAX_JOB_ATTEMPTS, current),
1401
1567
  ).rowcount
1402
1568
  expired = conn.execute(
1403
1569
  """
@@ -1407,9 +1573,9 @@ def _requeue_due_jobs(conn) -> dict:
1407
1573
  """,
1408
1574
  (current, current),
1409
1575
  ).rowcount
1410
- if failed or expired:
1411
- log_event("warn", "jobs_requeued", "Local memory recovered stalled jobs", failed=failed, expired=expired)
1412
- return {"failed": int(failed or 0), "expired": int(expired or 0)}
1576
+ if failed or expired or exhausted:
1577
+ log_event("warn", "jobs_requeued", "Local memory recovered stalled jobs", failed=failed, expired=expired, exhausted=exhausted)
1578
+ return {"failed": int(failed or 0), "expired": int(expired or 0), "exhausted": int(exhausted or 0)}
1413
1579
 
1414
1580
 
1415
1581
  def process_jobs(*, limit: int = 100) -> dict:
@@ -1483,13 +1649,15 @@ def process_jobs(*, limit: int = 100) -> dict:
1483
1649
  processed += 1
1484
1650
  except Exception as exc:
1485
1651
  failed += 1
1652
+ attempts = int(row["attempt_count"] or 0) + 1
1653
+ terminal = attempts >= DEFAULT_MAX_JOB_ATTEMPTS
1486
1654
  conn.execute(
1487
1655
  """
1488
1656
  UPDATE local_index_jobs
1489
- SET status='failed', attempt_count=attempt_count+1, next_attempt_at=?, last_error_code=?, updated_at=?
1657
+ SET status=?, attempt_count=attempt_count+1, next_attempt_at=?, claimed_by='', lease_expires_at=NULL, last_error_code=?, updated_at=?
1490
1658
  WHERE job_id=?
1491
1659
  """,
1492
- (now() + 3600, type(exc).__name__, now(), job_id),
1660
+ ("done" if terminal else "failed", None if terminal else now() + 3600, type(exc).__name__, now(), job_id),
1493
1661
  )
1494
1662
  _record_index_error(
1495
1663
  conn,
@@ -1499,7 +1667,7 @@ def process_jobs(*, limit: int = 100) -> dict:
1499
1667
  error_code=type(exc).__name__,
1500
1668
  user_message="Algunos archivos no se pudieron leer",
1501
1669
  technical_detail=str(exc),
1502
- retryable=True,
1670
+ retryable=not terminal,
1503
1671
  )
1504
1672
  conn.commit()
1505
1673
  if processed or failed:
@@ -1526,14 +1694,37 @@ def run_once(
1526
1694
  ensure_default_roots()
1527
1695
  if root:
1528
1696
  add_root(root)
1529
- live_result = reconcile_live_changes(
1530
- asset_limit=live_asset_limit,
1531
- dir_limit=live_dir_limit,
1532
- file_limit=live_file_limit,
1533
- )
1697
+ conn = _conn()
1698
+ initial_before = _initial_scan_status(conn, list_roots())
1699
+ initial_index_before = _refresh_initial_index_complete(conn, initial_before)
1700
+ if initial_index_before:
1701
+ live_result = reconcile_live_changes(
1702
+ asset_limit=live_asset_limit,
1703
+ dir_limit=live_dir_limit,
1704
+ file_limit=live_file_limit,
1705
+ )
1706
+ else:
1707
+ live_result = {
1708
+ "ok": True,
1709
+ "skipped": True,
1710
+ "reason": "initial_scan_in_progress",
1711
+ "assets": {},
1712
+ "dirs": {},
1713
+ }
1534
1714
  scan_result = scan_once(limit=limit)
1535
1715
  job_result = process_jobs(limit=process_limit)
1536
- return {"ok": True, "live": live_result, "scan": scan_result, "jobs": job_result}
1716
+ conn_after = _conn()
1717
+ initial_after = _initial_scan_status(conn_after, list_roots())
1718
+ active_after = _active_job_count(conn_after)
1719
+ initial_index_after = _refresh_initial_index_complete(conn_after, initial_after, active_after)
1720
+ return {
1721
+ "ok": True,
1722
+ "initial_scan": initial_after,
1723
+ "initial_index_complete": initial_index_after,
1724
+ "live": live_result,
1725
+ "scan": scan_result,
1726
+ "jobs": job_result,
1727
+ }
1537
1728
 
1538
1729
 
1539
1730
  def _problem_rows(conn) -> list[dict]:
@@ -1788,21 +1979,7 @@ def _service_cycle_observation(conn) -> dict:
1788
1979
 
1789
1980
 
1790
1981
  def _index_timing(conn, *, done: int, active_jobs: int, percent: int) -> dict:
1791
- first_seen = conn.execute(
1792
- """
1793
- SELECT MIN(created_at) AS created_at
1794
- FROM local_index_logs
1795
- WHERE event IN ('root_added', 'scan_started', 'scan_finished', 'jobs_processed', 'service_cycle_finished')
1796
- """
1797
- ).fetchone()["created_at"] or 0
1798
- if not first_seen:
1799
- first_seen = conn.execute(
1800
- """
1801
- SELECT MIN(first_seen_at) AS first_seen_at
1802
- FROM local_assets
1803
- WHERE status!='deleted'
1804
- """
1805
- ).fetchone()["first_seen_at"] or 0
1982
+ first_seen = _ensure_initial_index_started_at(conn)
1806
1983
  elapsed_seconds = max(0, int(now() - float(first_seen))) if first_seen else 0
1807
1984
  eta_seconds = None
1808
1985
  if elapsed_seconds > 0 and done > 0 and active_jobs > 0 and 0 < percent < 100:
@@ -1885,6 +2062,8 @@ def status() -> dict:
1885
2062
  percent = 100 if total_jobs == 0 else int((done / max(total_jobs, 1)) * 100)
1886
2063
  timing = _index_timing(conn, done=done, active_jobs=active_jobs, percent=percent)
1887
2064
  roots = list_roots()
2065
+ initial_scan = _initial_scan_status(conn, roots)
2066
+ initial_index_complete = _refresh_initial_index_complete(conn, initial_scan, active_jobs)
1888
2067
  volumes = []
1889
2068
  by_volume = conn.execute(
1890
2069
  """
@@ -1903,7 +2082,7 @@ def status() -> dict:
1903
2082
  service.update(_service_cycle_observation(conn))
1904
2083
  problem = _service_problem(service)
1905
2084
  service["healthy"] = problem is None
1906
- service["state"] = "paused" if paused else ("attention" if problem else ("idle" if active_jobs == 0 else "indexing"))
2085
+ service["state"] = "paused" if paused else ("attention" if problem else ("idle" if active_jobs == 0 and initial_index_complete else "indexing"))
1907
2086
  problems = _problem_rows(conn)
1908
2087
  if problem:
1909
2088
  problems.insert(0, {
@@ -1917,11 +2096,21 @@ def status() -> dict:
1917
2096
  "phase": "service",
1918
2097
  "created_at": now(),
1919
2098
  })
2099
+ if paused:
2100
+ phase = "paused"
2101
+ elif problem:
2102
+ phase = "service_attention"
2103
+ elif not initial_index_complete:
2104
+ phase = "initial_indexing"
2105
+ elif active_jobs == 0:
2106
+ phase = "idle"
2107
+ else:
2108
+ phase = "updating_changes"
1920
2109
  return {
1921
2110
  "ok": True,
1922
2111
  "service": service,
1923
2112
  "global": {
1924
- "phase": "paused" if paused else ("service_attention" if problem else ("idle" if active_jobs == 0 else "light_extraction")),
2113
+ "phase": phase,
1925
2114
  "percent": percent,
1926
2115
  "files_found": int(assets["total"] or 0),
1927
2116
  "files_processed": int(done or 0),
@@ -1931,7 +2120,14 @@ def status() -> dict:
1931
2120
  "jobs_failed": failed_jobs,
1932
2121
  "elapsed_seconds": timing["elapsed_seconds"],
1933
2122
  "eta_seconds": timing["eta_seconds"],
2123
+ "index_started_at": _get_state_conn(conn, INITIAL_INDEX_STARTED_AT_KEY, ""),
2124
+ "initial_scan_complete": bool(initial_index_complete),
2125
+ "initial_discovery_complete": bool(initial_scan["complete"]),
2126
+ "initial_index_complete": bool(initial_index_complete),
2127
+ "index_mode": "watching_changes" if initial_index_complete else "initial_indexing",
1934
2128
  },
2129
+ "initial_scan": initial_scan,
2130
+ "initial_index_complete": bool(initial_index_complete),
1935
2131
  "volumes": volumes,
1936
2132
  "roots": roots,
1937
2133
  "exclusions": list_exclusions(),
@@ -2165,19 +2361,321 @@ def _context_candidate_rows(conn, entity_asset_ids: list[str], *, base_limit: in
2165
2361
  return rows
2166
2362
 
2167
2363
 
2168
- def context_query(query: str, *, intent: str = "answer", limit: int = 12, evidence_required: bool = True, current_context: str = "") -> dict:
2364
+ def _compact_text(value: str, *, max_chars: int) -> str:
2365
+ text = " ".join(str(value or "").split())
2366
+ if max_chars <= 0 or len(text) <= max_chars:
2367
+ return text
2368
+ return text[: max(0, max_chars - 1)].rstrip() + "…"
2369
+
2370
+
2371
+ def _payload_size(payload: dict) -> int:
2372
+ return len(json.dumps(payload, ensure_ascii=False, separators=(",", ":")))
2373
+
2374
+
2375
def _normalize_context_mode(mode: str) -> tuple[str, list[str]]:
    """Validate a context mode and return ``(mode, warnings)``.

    The input is stripped and lower-cased; a falsy input means ``compact``.
    Values outside ``VALID_CONTEXT_MODES`` fall back to ``compact`` and
    produce a single warning naming the rejected value.
    """
    requested = str(mode or "compact").strip().lower()
    if requested not in VALID_CONTEXT_MODES:
        warning = f"Unsupported local context mode '{requested}'. Falling back to compact mode."
        return "compact", [warning]
    return requested, []
2380
+
2381
+
2382
+ def _context_usage_hint(payload: dict) -> dict:
2383
+ current = {
2384
+ "mode": payload.get("mode", "compact"),
2385
+ "limit": payload.get("limit"),
2386
+ "max_chars": payload.get("max_chars"),
2387
+ "include_entities": bool(payload.get("include_entities")),
2388
+ "include_relations": bool(payload.get("include_relations")),
2389
+ }
2390
+ return {
2391
+ "tool": "nexo_local_context",
2392
+ "current_params": current,
2393
+ "recommended_call": "nexo_local_context(query='...', mode='compact', limit=4, max_chars=12000, include_entities=false, include_relations=false)",
2394
+ "recommended_params": {
2395
+ "mode": "compact",
2396
+ "limit": 4,
2397
+ "max_chars": 12000,
2398
+ "include_entities": False,
2399
+ "include_relations": False,
2400
+ },
2401
+ "expand": "Use mode='full' only for debugging, with a specific query and explicit max_chars.",
2402
+ "refine": "Add names, dates, project names, file types, paths, or email subjects to reduce noise.",
2403
+ }
2404
+
2405
+
2406
def _minimal_truncated_context_payload(payload: dict, *, max_chars: int) -> dict:
    """Build the last-resort result used when trimming could not fit the budget.

    Returns a skeleton with empty result lists and a short usage hint. When
    even that skeleton exceeds a truthy *max_chars*, a four-key variant is
    returned instead. The smaller variant is not re-checked against
    *max_chars*.
    """
    ok_flag = bool(payload.get("ok", True))
    mode = str(payload.get("mode") or "compact")
    skeleton = {
        "ok": ok_flag,
        "mode": mode,
        "truncated": True,
        "warnings": ["truncated"],
        "usage_hint": "nexo_local_context(query='...', mode='compact', limit=4, max_chars=12000)",
        "assets": [],
        "chunks": [],
        "entities": [],
        "relations": [],
        "evidence_refs": [],
    }
    if not max_chars or _payload_size(skeleton) <= max_chars:
        return skeleton
    return {
        "ok": ok_flag,
        "mode": mode,
        "truncated": True,
        "usage_hint": "nexo_local_context(mode='compact',limit=4,max_chars=12000)",
    }
2429
+
2430
+
2431
+ def _sync_context_payload_refs(payload: dict) -> None:
2432
+ chunks = payload.get("chunks") or []
2433
+ chunk_ids = {str(chunk.get("chunk_id") or "") for chunk in chunks if chunk.get("chunk_id")}
2434
+ asset_ids = {str(chunk.get("asset_id") or "") for chunk in chunks if chunk.get("asset_id")}
2435
+ if chunk_ids:
2436
+ payload["evidence_refs"] = [
2437
+ ref for ref in (payload.get("evidence_refs") or [])
2438
+ if any(f"#chunk:{chunk_id}" in str(ref) for chunk_id in chunk_ids)
2439
+ ]
2440
+ payload["assets"] = [
2441
+ asset for asset in (payload.get("assets") or [])
2442
+ if str(asset.get("asset_id") or "") in asset_ids
2443
+ ]
2444
+ elif not chunks:
2445
+ payload["evidence_refs"] = []
2446
+
2447
+
2448
def _truncate_context_payload(payload: dict, *, max_chars: int) -> dict:
    """Shrink *payload* in stages until its compact JSON fits *max_chars*.

    The payload is returned untouched when *max_chars* is falsy or
    non-positive, or when it already fits. Otherwise it is mutated in place
    (including the chunk and asset dicts it contains) through these stages,
    each applied only while the payload is still too large:

      1. Mark as truncated, attach a warning and usage hint, and shorten
         query, summary, chunk texts and asset paths/summaries.
      2. Drop entities / relations that were not requested.
      3. Pop trailing chunks down to one.
      4. Pop trailing assets down to one, removing their chunks.
      5. Drop entities and relations unconditionally.
      6. Reduce to one slim chunk, one slim asset and one evidence ref.

    Evidence refs and assets are then re-synchronized with the kept chunks.
    If the result still does not fit, the minimal truncated payload is
    returned instead.
    """
    if not max_chars or max_chars <= 0 or _payload_size(payload) <= max_chars:
        return payload

    def too_big() -> bool:
        return _payload_size(payload) > max_chars

    # Stage 1: flag the truncation and shorten every free-text field.
    warnings = list(payload.get("warnings") or [])
    warnings.append(
        "Local context result was truncated. Use mode='compact', lower limit, raise max_chars, or refine the query with more specific names, dates, paths, projects, or file types."
    )
    payload["warnings"] = warnings
    payload["truncated"] = True
    payload["usage_hint"] = _context_usage_hint(payload)
    payload["query"] = _compact_text(payload.get("query") or "", max_chars=240)
    payload["summary"] = _compact_text(payload.get("summary") or "", max_chars=240)
    for chunk in payload.get("chunks") or []:
        chunk["text"] = _compact_text(chunk.get("text") or "", max_chars=220)
    for asset in payload.get("assets") or []:
        asset["display_path"] = _compact_text(asset.get("display_path") or "", max_chars=240)
        asset["summary"] = _compact_text(asset.get("summary") or "", max_chars=160)

    # Stage 2: optional sections the caller did not ask for.
    if not payload.get("include_entities"):
        payload["entities"] = []
    if not payload.get("include_relations"):
        payload["relations"] = []

    # Stage 3: trailing chunks, keeping at least one.
    while too_big() and len(payload.get("chunks") or []) > 1:
        payload["chunks"].pop()

    # Stage 4: trailing assets, keeping at least one.
    while too_big() and len(payload.get("assets") or []) > 1:
        removed = payload["assets"].pop()
        removed_asset_id = removed.get("asset_id")
        payload["chunks"] = [
            chunk for chunk in payload.get("chunks") or []
            if chunk.get("asset_id") != removed_asset_id
        ]
        # `or []` guards against an explicit None, matching the other accesses here.
        payload["evidence_refs"] = (payload.get("evidence_refs") or [])[: len(payload.get("assets") or [])]

    # Stage 5: entities and relations regardless of the include flags.
    if too_big():
        payload["entities"] = []
        payload["relations"] = []

    # Stage 6: a single slim chunk, asset and evidence ref.
    if too_big():
        payload["chunks"] = [
            {
                "chunk_id": chunk.get("chunk_id", ""),
                "asset_id": chunk.get("asset_id", ""),
                "text": _compact_text(chunk.get("text") or "", max_chars=120),
                "score": chunk.get("score", 0),
            }
            for chunk in (payload.get("chunks") or [])[:1]
        ]
        payload["assets"] = [
            {
                "asset_id": asset.get("asset_id", ""),
                "display_path": asset.get("display_path", ""),
                "file_type": asset.get("file_type", "file"),
                "score": asset.get("score", 0),
            }
            for asset in (payload.get("assets") or [])[:1]
        ]
        payload["evidence_refs"] = (payload.get("evidence_refs") or [])[:1]

    _sync_context_payload_refs(payload)
    if too_big():
        return _minimal_truncated_context_payload(payload, max_chars=max_chars)
    return payload
2503
+
2504
+
2505
def _shape_context_payload(
    payload: dict,
    *,
    mode: str,
    max_chars: int,
    include_entities: bool,
    include_relations: bool,
    snippet_chars: int,
) -> dict:
    """Shape a raw context result for output and enforce the size budget.

    Works on a shallow copy of *payload*. In ``compact`` mode only the first
    chunk per asset is kept and chunks/assets are reduced to a slim set of
    fields; in ``full`` mode chunks keep all fields and only their text is
    shortened. Entities and relations are emptied unless requested. Finally
    refs are synchronized with the kept chunks and the result is passed to
    ``_truncate_context_payload``.

    Args:
        payload: Raw result dict.
        mode: Requested mode; invalid values fall back to ``compact``.
        max_chars: JSON size budget; ``0`` or falsy disables truncation.
        include_entities: Keep the ``entities`` list when true.
        include_relations: Keep the ``relations`` list when true.
        snippet_chars: Per-chunk text cap (floor 80 in compact, 200 in full).
    """
    normalized_mode, mode_warnings = _normalize_context_mode(mode)
    shaped = dict(payload)
    shaped["warnings"] = [*(shaped.get("warnings") or []), *mode_warnings]
    shaped.update(
        {
            "mode": normalized_mode,
            "limit": len(shaped.get("assets") or []),
            "include_entities": bool(include_entities),
            "include_relations": bool(include_relations),
            "truncated": False,
            "max_chars": int(max_chars or 0),
        }
    )

    if normalized_mode == "compact":
        # Keep only the first chunk seen for each asset, in original order.
        first_chunk_by_asset: dict[str, dict] = {}
        for chunk in shaped.get("chunks") or []:
            first_chunk_by_asset.setdefault(str(chunk.get("asset_id") or ""), chunk)
        shaped["chunks"] = [
            {
                "chunk_id": chunk.get("chunk_id", ""),
                "asset_id": asset_id,
                "text": _compact_text(chunk.get("text") or "", max_chars=max(80, int(snippet_chars or 360))),
                "score": chunk.get("score", 0),
            }
            for asset_id, chunk in first_chunk_by_asset.items()
        ]
        slim_assets = []
        for asset in shaped.get("assets") or []:
            slim_assets.append(
                {
                    "asset_id": asset.get("asset_id", ""),
                    "display_path": asset.get("display_path", ""),
                    "file_type": asset.get("file_type", "file"),
                    "score": asset.get("score", 0),
                    "summary": _compact_text(asset.get("summary") or "", max_chars=180),
                }
            )
        shaped["assets"] = slim_assets
    else:
        full_chunks = []
        for chunk in shaped.get("chunks") or []:
            trimmed = dict(chunk)
            trimmed["text"] = _compact_text(chunk.get("text") or "", max_chars=max(200, int(snippet_chars or 1200)))
            full_chunks.append(trimmed)
        shaped["chunks"] = full_chunks

    if not include_entities:
        shaped["entities"] = []
    if not include_relations:
        shaped["relations"] = []
    _sync_context_payload_refs(shaped)
    return _truncate_context_payload(shaped, max_chars=int(max_chars or 0))
2562
+
2563
+
2564
def render_context_evidence(result: dict, *, limit: int = 4, max_chars: int = DEFAULT_ROUTER_MAX_CHARS) -> str:
    """Render a context result as a plain-text evidence block.

    Returns an empty string when the result has no assets. Otherwise emits a
    header, one bullet per asset (up to *limit*, minimum one) with an optional
    excerpt from the first chunk of that asset, the evidence refs, and a note
    when the result was truncated. Output longer than a truthy *max_chars* is
    cut and suffixed with an ellipsis.
    """
    assets = result.get("assets") or []
    if not assets:
        return ""

    # First chunk per asset wins.
    first_chunk = {}
    for chunk in result.get("chunks") or []:
        first_chunk.setdefault(chunk.get("asset_id"), chunk)

    out = [
        "",
        "LOCAL CONTEXT EVIDENCE:",
        "Use this local evidence if it is relevant to the user's request. Do not mention files that are not supported by the evidence.",
    ]
    shown = max(1, int(limit or 4))
    for asset in assets[:shown]:
        display_path = str(asset.get("display_path") or "")
        score = asset.get("score")
        summary = _compact_text(asset.get("summary") or "", max_chars=160)
        suffix = f" — {summary}" if summary else ""
        out.append(f"- {display_path} ({asset.get('file_type', 'file')}, score={score}){suffix}")
        excerpt_source = first_chunk.get(asset.get("asset_id"))
        if excerpt_source and excerpt_source.get("text"):
            # NOTE(review): prefix whitespace copied from a whitespace-collapsed diff — confirm against the released file.
            out.append(f" excerpt: {_compact_text(excerpt_source.get('text') or '', max_chars=320)}")

    refs = result.get("evidence_refs") or []
    if refs:
        out.append(f"Evidence refs: {', '.join(str(ref) for ref in refs[:shown])}")
    if result.get("truncated"):
        out.append("Result was compacted. Refine the query or call nexo_local_context(mode='full', max_chars=...) if deeper inspection is needed.")

    rendered = "\n".join(out)
    if not max_chars or len(rendered) <= max_chars:
        return rendered
    return rendered[: max(0, max_chars - 1)].rstrip() + "…"
2591
+
2592
+
2593
+ def _router_payload_size(payload: dict) -> int:
2594
+ return len(json.dumps(payload, ensure_ascii=False, separators=(",", ":")))
2595
+
2596
+
2597
def context_router(
    query: str,
    *,
    intent: str = "answer",
    limit: int = 4,
    current_context: str = "",
    max_chars: int = DEFAULT_ROUTER_MAX_CHARS,
) -> dict:
    """Run a compact context query and package it for prompt injection.

    The underlying query runs in ``compact`` mode without entities or
    relations, with its limit clamped to 1..8 and an internal budget of three
    times *max_chars* (at least 4000). The rendered evidence text is then
    fitted to *max_chars* in steps: shorten the rendered text, keep a single
    evidence ref with a short usage hint, and finally fall back to a reduced
    payload with a shortened query.

    Returns:
        A dict with ``ok``, ``query``, ``intent``, ``should_inject``,
        ``rendered``, ``evidence_refs``, ``truncated`` and ``usage_hint``.
    """
    output_max_chars = int(max_chars or 0)
    if output_max_chars > 0:
        internal_max_chars = max(output_max_chars * 3, 4000)
    else:
        internal_max_chars = 0

    result = context_query(
        query,
        intent=intent,
        limit=max(1, min(int(limit or 4), 8)),
        evidence_required=False,
        current_context=current_context,
        mode="compact",
        max_chars=internal_max_chars,
        include_entities=False,
        include_relations=False,
        snippet_chars=360,
    )
    rendered = render_context_evidence(result, limit=limit, max_chars=output_max_chars)

    def over_budget(candidate: dict) -> bool:
        return bool(output_max_chars) and _router_payload_size(candidate) > output_max_chars

    hit_render_cap = bool(output_max_chars and len(rendered) >= output_max_chars)
    payload = {
        "ok": True,
        "query": query,
        "intent": intent,
        "should_inject": bool(result.get("evidence_refs")),
        "rendered": rendered,
        "evidence_refs": result.get("evidence_refs") or [],
        "truncated": bool(result.get("truncated") or hit_render_cap),
        "usage_hint": result.get("usage_hint"),
    }

    # Step 1: shorten the rendered text to half the budget (floor 80).
    if over_budget(payload):
        payload["rendered"] = _compact_text(rendered, max_chars=max(80, output_max_chars // 2))
        payload["truncated"] = True
    # Step 2: keep one evidence ref and replace the hint with a short string.
    if over_budget(payload):
        payload["evidence_refs"] = (payload.get("evidence_refs") or [])[:1]
        payload["usage_hint"] = "nexo_local_context(query='...', mode='compact', limit=4, max_chars=12000)"
    # Step 3: reduced fallback payload; its size is not re-checked.
    if over_budget(payload):
        return {
            "ok": True,
            "query": _compact_text(query, max_chars=120),
            "intent": intent,
            "should_inject": bool(payload.get("evidence_refs")),
            "truncated": True,
            "rendered": _compact_text(rendered, max_chars=max(40, output_max_chars // 2)),
            "evidence_refs": (payload.get("evidence_refs") or [])[:1],
            "usage_hint": "nexo_local_context(mode='compact',limit=4,max_chars=12000)",
        }
    return payload
2648
+
2649
+
2650
+ def context_query(
2651
+ query: str,
2652
+ *,
2653
+ intent: str = "answer",
2654
+ limit: int = 12,
2655
+ evidence_required: bool = True,
2656
+ current_context: str = "",
2657
+ mode: str = "full",
2658
+ max_chars: int = DEFAULT_CONTEXT_MAX_CHARS,
2659
+ include_entities: bool = True,
2660
+ include_relations: bool = True,
2661
+ snippet_chars: int = 1200,
2662
+ ) -> dict:
2169
2663
  conn = _conn()
2170
- qvec = embeddings.embed_text(query)
2171
- entities_payload, entity_boosts = _entity_matches_for_query(conn, query, limit=max(int(limit), 1))
2664
+ clean_query = str(query or "").strip()
2665
+ normalized_mode, mode_warnings = _normalize_context_mode(mode)
2666
+ context_tail = _compact_text(current_context or "", max_chars=1000)
2667
+ search_query = clean_query if not context_tail else f"{clean_query}\n{context_tail}"
2668
+ qvec = embeddings.embed_text(search_query)
2669
+ entities_payload, entity_boosts = _entity_matches_for_query(conn, search_query, limit=max(int(limit), 1))
2172
2670
  rows = _context_candidate_rows(conn, list(entity_boosts.keys()), base_limit=5000)
2173
2671
  scored = []
2174
2672
  for row in rows:
2175
2673
  if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
2176
2674
  continue
2177
2675
  vector = json_loads(row["vector_json"], [])
2178
- text_score = _search_text_score(query, row["text"])
2179
- path_score = _search_text_score(query, row["path"] or "")
2180
- summary_score = _search_text_score(query, row["summary"] or "")
2676
+ text_score = _search_text_score(search_query, row["text"])
2677
+ path_score = _search_text_score(search_query, row["path"] or "")
2678
+ summary_score = _search_text_score(search_query, row["summary"] or "")
2181
2679
  entity_score = entity_boosts.get(row["asset_id"], 0.0)
2182
2680
  vector_score = embeddings.cosine(qvec, vector)
2183
2681
  score = max(text_score, path_score, summary_score, vector_score)
@@ -2216,7 +2714,7 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
2216
2714
  evidence_refs.append(f"local_asset:{row['asset_id']}#chunk:{row['chunk_id']}")
2217
2715
  relations_payload: list[dict] = []
2218
2716
  relation_asset_ids = list(dict.fromkeys([*seen_assets, *entity_boosts.keys()]))[: int(limit)]
2219
- if relation_asset_ids:
2717
+ if include_relations and relation_asset_ids:
2220
2718
  asset_ids = relation_asset_ids
2221
2719
  placeholders = ",".join("?" for _ in asset_ids)
2222
2720
  relation_rows = conn.execute(
@@ -2230,19 +2728,19 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
2230
2728
  [*asset_ids, int(limit) * 3],
2231
2729
  ).fetchall()
2232
2730
  relations_payload = [dict(row) for row in relation_rows]
2233
- warnings = []
2731
+ warnings = list(mode_warnings)
2234
2732
  if evidence_required and not evidence_refs:
2235
2733
  warnings.append("No local evidence found for this query.")
2236
2734
  summary = ""
2237
2735
  if assets:
2238
- summary = f"Found {len(assets)} local asset(s) related to '{query}'."
2736
+ summary = f"Found {len(assets)} local asset(s) related to '{clean_query}'."
2239
2737
  conn.execute(
2240
2738
  """
2241
2739
  INSERT INTO local_context_queries(query_hash, intent, result_count, confidence, warnings_json, created_at)
2242
2740
  VALUES (?, ?, ?, ?, ?, ?)
2243
2741
  """,
2244
2742
  (
2245
- hashlib.sha256(query.encode("utf-8", errors="ignore")).hexdigest(),
2743
+ hashlib.sha256(clean_query.encode("utf-8", errors="ignore")).hexdigest(),
2246
2744
  intent,
2247
2745
  len(assets),
2248
2746
  0.75 if evidence_refs else 0.0,
@@ -2251,9 +2749,9 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
2251
2749
  ),
2252
2750
  )
2253
2751
  conn.commit()
2254
- return {
2752
+ payload = {
2255
2753
  "ok": True,
2256
- "query": query,
2754
+ "query": clean_query,
2257
2755
  "intent": intent,
2258
2756
  "confidence": 0.75 if evidence_refs else 0.0,
2259
2757
  "summary": summary,
@@ -2264,6 +2762,14 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
2264
2762
  "warnings": warnings,
2265
2763
  "evidence_refs": evidence_refs,
2266
2764
  }
2765
+ return _shape_context_payload(
2766
+ payload,
2767
+ mode=normalized_mode,
2768
+ max_chars=int(max_chars or 0),
2769
+ include_entities=bool(include_entities),
2770
+ include_relations=bool(include_relations),
2771
+ snippet_chars=int(snippet_chars or 1200),
2772
+ )
2267
2773
 
2268
2774
 
2269
2775
  def get_asset(asset_id: str) -> dict:
@@ -2306,11 +2812,23 @@ def clear_index() -> dict:
2306
2812
  "local_index_dirs",
2307
2813
  "local_index_errors",
2308
2814
  "local_index_jobs",
2815
+ "local_index_checkpoints",
2309
2816
  "local_asset_versions",
2310
2817
  "local_assets",
2311
2818
  "local_context_queries",
2312
2819
  ):
2313
2820
  conn.execute(f"DELETE FROM {table}")
2821
+ conn.execute("DELETE FROM local_index_state WHERE key LIKE 'root:%:initial_scan_complete'")
2822
+ conn.execute("DELETE FROM local_index_state WHERE key=?", (INITIAL_INDEX_COMPLETE_KEY,))
2823
+ conn.execute("DELETE FROM local_index_state WHERE key=?", (INITIAL_INDEX_STARTED_AT_KEY,))
2824
+ rows = conn.execute("SELECT id FROM local_index_roots WHERE status!='removed'").fetchall()
2825
+ for row in rows:
2826
+ _set_root_initial_scan_complete(conn, int(row["id"]), False)
2827
+ conn.execute(
2828
+ "UPDATE local_index_roots SET last_scan_at=NULL, status='active', updated_at=? WHERE status!='removed'",
2829
+ (now(),),
2830
+ )
2831
+ _set_initial_index_complete(conn, False)
2314
2832
  conn.commit()
2315
2833
  log_event("warn", "index_cleared", "Local memory index cleared")
2316
2834
  return {"ok": True}