nexo-brain 7.20.4 → 7.20.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.20.4",
3
+ "version": "7.20.8",
4
4
  "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
5
5
  "author": {
6
6
  "name": "NEXO Brain",
package/README.md CHANGED
@@ -18,7 +18,15 @@
18
18
 
19
19
  [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
20
20
 
21
- Version `7.20.4` is the current packaged-runtime line. Patch release over v7.20.3 — Local Context now blocks private dotfiles, hidden project folders and secret-bearing content before chunks, embeddings, graph relations or agent context are created.
21
+ Version `7.20.8` is the current packaged-runtime line. Patch release over v7.20.7 — Local Context recognises Windows Mail package roots and Outlook Mac profile roots as bounded local-email sources instead of rejecting them as generic AppData / Group Containers.
22
+
23
+ Previously in `7.20.7`: patch release over v7.20.6 — Local Context email-root bootstrap is deterministic across CI, WSL and migrated profiles while preserving macOS Mail.app, Windows Outlook, Thunderbird and NEXO email coverage.
24
+
25
+ Previously in `7.20.6`: patch release over v7.20.5 — Local Context ranks entity matches at chunk level, keeps old entity-matched assets eligible, adds safe local email roots for macOS/Windows/Linux, extracts `.eml`, `.emlx`, `.msg` and NEXO email DB continuity, and exposes local graph relations in pre-action context.
26
+
27
+ Previously in `7.20.5`: patch release over v7.20.4 — Local Context status reports elapsed indexing time and a defensive ETA while background jobs remain pending.
28
+
29
+ Previously in `7.20.4`: patch release over v7.20.3 — Local Context now blocks private dotfiles, hidden project folders and secret-bearing content before chunks, embeddings, graph relations or agent context are created.
22
30
 
23
31
  Previously in `7.20.3`: patch release over v7.20.2 — installer DMG volumes are no longer added as local-memory roots, removed roots purge stale payloads, and doctor can repair removed-root residue.
24
32
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.20.4",
3
+ "version": "7.20.8",
4
4
  "mcpName": "io.github.wazionapps/nexo",
5
5
  "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
6
6
  "homepage": "https://nexo-brain.com",
@@ -123,12 +123,44 @@ def _mounted_volume_roots() -> list[str]:
123
123
  return roots
124
124
 
125
125
 
126
+ def _local_email_roots() -> list[str]:
127
+ home = Path.home()
128
+ roots: list[Path] = [home / ".nexo" / "runtime" / "nexo-email"]
129
+ mac_roots = [
130
+ home / "Library" / "Mail",
131
+ home / "Library" / "Group Containers" / "UBF8T346G9.Office" / "Outlook" / "Outlook 15 Profiles",
132
+ ]
133
+ local_app_data = Path(os.environ.get("LOCALAPPDATA") or home / "AppData" / "Local")
134
+ roaming_app_data = Path(os.environ.get("APPDATA") or home / "AppData" / "Roaming")
135
+ windows_roots = [
136
+ home / "Documents" / "Outlook Files",
137
+ local_app_data / "Microsoft" / "Outlook",
138
+ roaming_app_data / "Microsoft" / "Outlook",
139
+ local_app_data / "Packages" / "microsoft.windowscommunicationsapps_8wekyb3d8bbwe" / "LocalState",
140
+ ]
141
+ linux_roots = [home / ".thunderbird", home / ".mozilla-thunderbird"]
142
+
143
+ if sys.platform == "darwin":
144
+ roots.extend(mac_roots)
145
+ elif sys.platform.startswith("win"):
146
+ roots.extend(windows_roots)
147
+ else:
148
+ roots.extend(linux_roots)
149
+
150
+ # CI and migrated profiles can expose platform-specific mail stores while
151
+ # running on another OS. Include only the stores that actually exist.
152
+ for optional_root in [*mac_roots, *windows_roots, *linux_roots]:
153
+ if optional_root.exists() and optional_root not in roots:
154
+ roots.append(optional_root)
155
+ return [str(root) for root in roots]
156
+
157
+
126
158
def default_roots() -> list[str]:
    """Return the de-duplicated default roots for the local index.

    A non-blank ``NEXO_LOCAL_INDEX_DEFAULT_ROOTS`` environment variable
    (entries separated by ``os.pathsep``) replaces the built-in defaults.
    Otherwise the home directory, the local e-mail roots and the mounted
    volume roots are used, in that order.
    """
    home = Path.home()
    override = os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_ROOTS", "").strip()
    if not override:
        builtin = [str(home), *_local_email_roots(), *_mounted_volume_roots()]
        return _dedupe_roots(builtin)
    # Blank entries (e.g. from a trailing separator) are dropped.
    entries = [entry for entry in override.split(os.pathsep) if entry.strip()]
    return _dedupe_roots(entries)
132
164
 
133
165
 
134
166
  def ensure_default_roots() -> dict:
@@ -471,7 +503,7 @@ def _file_type(path: Path) -> str:
471
503
  return "photo"
472
504
  if suffix in {".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql", ".css", ".html"}:
473
505
  return "code"
474
- if suffix in {".eml"}:
506
+ if suffix in {".eml", ".emlx", ".msg", ".pst", ".ost"}:
475
507
  return "email"
476
508
  if suffix in {".pdf", ".docx", ".pptx", ".xlsx", ".md", ".txt", ".csv", ".tsv"}:
477
509
  return "document"
@@ -1652,6 +1684,29 @@ def _service_cycle_observation(conn) -> dict:
1652
1684
  return observation
1653
1685
 
1654
1686
 
1687
+ def _index_timing(conn, *, done: int, active_jobs: int, percent: int) -> dict:
1688
+ first_seen = conn.execute(
1689
+ """
1690
+ SELECT MIN(created_at) AS created_at
1691
+ FROM local_index_logs
1692
+ WHERE event IN ('root_added', 'scan_started', 'scan_finished', 'jobs_processed', 'service_cycle_finished')
1693
+ """
1694
+ ).fetchone()["created_at"] or 0
1695
+ if not first_seen:
1696
+ first_seen = conn.execute(
1697
+ """
1698
+ SELECT MIN(first_seen_at) AS first_seen_at
1699
+ FROM local_assets
1700
+ WHERE status!='deleted'
1701
+ """
1702
+ ).fetchone()["first_seen_at"] or 0
1703
+ elapsed_seconds = max(0, int(now() - float(first_seen))) if first_seen else 0
1704
+ eta_seconds = None
1705
+ if elapsed_seconds > 0 and done > 0 and active_jobs > 0 and 0 < percent < 100:
1706
+ eta_seconds = max(0, int((elapsed_seconds / max(done, 1)) * active_jobs))
1707
+ return {"elapsed_seconds": elapsed_seconds, "eta_seconds": eta_seconds}
1708
+
1709
+
1655
1710
  def _service_scheduler_has_error(service: dict) -> bool:
1656
1711
  if service.get("manager") == "launchagent":
1657
1712
  code = str(service.get("last_exit_code") or "").strip()
@@ -1725,6 +1780,7 @@ def status() -> dict:
1725
1780
  active_jobs = pending + running_jobs + failed_jobs
1726
1781
  total_jobs = active_jobs + done
1727
1782
  percent = 100 if total_jobs == 0 else int((done / max(total_jobs, 1)) * 100)
1783
+ timing = _index_timing(conn, done=done, active_jobs=active_jobs, percent=percent)
1728
1784
  roots = list_roots()
1729
1785
  volumes = []
1730
1786
  by_volume = conn.execute(
@@ -1770,8 +1826,8 @@ def status() -> dict:
1770
1826
  "jobs_pending": pending,
1771
1827
  "jobs_running": running_jobs,
1772
1828
  "jobs_failed": failed_jobs,
1773
- "elapsed_seconds": 0,
1774
- "eta_seconds": None,
1829
+ "elapsed_seconds": timing["elapsed_seconds"],
1830
+ "eta_seconds": timing["eta_seconds"],
1775
1831
  },
1776
1832
  "volumes": volumes,
1777
1833
  "roots": roots,
@@ -1856,10 +1912,112 @@ def _search_text_score(query: str, text: str) -> float:
1856
1912
  return len(q & tokens) / max(len(q), 1)
1857
1913
 
1858
1914
 
1859
- def context_query(query: str, *, intent: str = "answer", limit: int = 12, evidence_required: bool = True, current_context: str = "") -> dict:
1860
- conn = _conn()
1861
- qvec = embeddings.embed_text(query)
1915
+ _QUERY_STOPWORDS = {
1916
+ "about",
1917
+ "archivos",
1918
+ "con",
1919
+ "context",
1920
+ "contexto",
1921
+ "cuanto",
1922
+ "dame",
1923
+ "del",
1924
+ "desde",
1925
+ "documentos",
1926
+ "donde",
1927
+ "esta",
1928
+ "está",
1929
+ "file",
1930
+ "files",
1931
+ "hay",
1932
+ "los",
1933
+ "para",
1934
+ "que",
1935
+ "qué",
1936
+ "related",
1937
+ "relacionado",
1938
+ "sabes",
1939
+ "sobre",
1940
+ "todo",
1941
+ "what",
1942
+ "where",
1943
+ }
1944
+
1945
+
1946
def _query_terms(query: str) -> list[str]:
    """Return up to ten distinct search terms from ``query``.

    Tokens shorter than three characters and tokens listed in
    ``_QUERY_STOPWORDS`` are discarded; first-seen order is kept.
    """
    # Dict keys de-duplicate while preserving the order of first appearance.
    distinct = dict.fromkeys(
        token
        for token in tokenize(query)
        if len(token) >= 3 and token not in _QUERY_STOPWORDS
    )
    return list(distinct)[:10]
1954
+
1955
+
1956
+ def _entity_match_score(query_lower: str, terms: list[str], name: str) -> float:
1957
+ entity = (name or "").strip().lower()
1958
+ if not entity:
1959
+ return 0.0
1960
+ entity_terms = set(tokenize(entity))
1961
+ if entity and entity in query_lower:
1962
+ return 1.0
1963
+ if not terms:
1964
+ return 0.0
1965
+ term_set = set(terms)
1966
+ overlap = term_set & entity_terms
1967
+ if overlap:
1968
+ return min(0.95, 0.45 + (len(overlap) / max(len(entity_terms), 1)) * 0.5)
1969
+ if any(term in entity for term in terms):
1970
+ return 0.6
1971
+ return 0.0
1972
+
1973
+
1974
def _entity_matches_for_query(conn, query: str, *, limit: int) -> tuple[list[dict], dict[str, float]]:
    """Find stored entities whose names match the query terms.

    Args:
        conn: Database connection whose rows support access by column name.
        query: Free-text query.
        limit: Maximum number of entity matches to return.

    Returns:
        A pair ``(matches, boosts)``. ``matches`` holds at most ``limit``
        dicts (``name``, ``entity_type``, ``asset_id``, ``score``) ordered by
        descending score. ``boosts`` maps every matched asset id to its best
        entity score and is not truncated to ``limit``.
    """
    query_lower = (query or "").strip().lower()
    terms = _query_terms(query)
    if not (query_lower and terms):
        return [], {}

    # Only "?" placeholders are interpolated into the SQL text; the terms
    # themselves are always bound as parameters.
    like_filter = " OR ".join(["lower(e.name) LIKE ?"] * len(terms))
    like_params = ["%" + term + "%" for term in terms]
    # Over-fetch so that rows rejected below still leave enough candidates.
    row_cap = max(int(limit) * 20, 40)
    rows = conn.execute(
        f"""
        SELECT DISTINCT e.name, e.entity_type, e.asset_id, a.path, a.privacy_class
        FROM local_entities e
        JOIN local_assets a ON a.asset_id = e.asset_id
        WHERE a.status='active'
          AND a.privacy_class='normal'
          AND ({like_filter})
        LIMIT ?
        """,
        [*like_params, row_cap],
    ).fetchall()

    by_key: dict[tuple, dict] = {}
    boosts: dict[str, float] = {}
    for row in rows:
        if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
            continue
        score = _entity_match_score(query_lower, terms, str(row["name"] or ""))
        if score <= 0:
            continue
        asset_id = row["asset_id"]
        key = (row["name"], row["entity_type"], asset_id)
        if key not in by_key:
            by_key[key] = {
                "name": row["name"],
                "entity_type": row["entity_type"],
                "asset_id": asset_id,
                "score": round(float(score), 4),
            }
        boosts[asset_id] = max(boosts.get(asset_id, 0.0), float(score))

    # sorted() is stable, so equal scores keep their discovery order.
    ranked = sorted(by_key.values(), key=lambda item: item.get("score", 0), reverse=True)
    return ranked[: int(limit)], boosts
2017
+
2018
+
2019
+ def _context_candidate_rows(conn, entity_asset_ids: list[str], *, base_limit: int = 5000) -> list:
2020
+ base_rows = conn.execute(
1863
2021
  """
1864
2022
  SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
1865
2023
  FROM local_chunks c
@@ -1869,17 +2027,68 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
1869
2027
  WHERE a.status='active'
1870
2028
  AND a.privacy_class='normal'
1871
2029
  ORDER BY c.created_at DESC
1872
- LIMIT 5000
1873
- """
2030
+ LIMIT ?
2031
+ """,
2032
+ (int(base_limit),),
1874
2033
  ).fetchall()
2034
+ if not entity_asset_ids:
2035
+ return base_rows
2036
+
2037
+ placeholders = ",".join("?" for _ in entity_asset_ids)
2038
+ entity_rows = conn.execute(
2039
+ f"""
2040
+ SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
2041
+ FROM local_chunks c
2042
+ JOIN local_assets a ON a.asset_id = c.asset_id
2043
+ LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
2044
+ LEFT JOIN local_embeddings e ON e.chunk_id = c.chunk_id
2045
+ WHERE a.status='active'
2046
+ AND a.privacy_class='normal'
2047
+ AND c.asset_id IN ({placeholders})
2048
+ ORDER BY c.chunk_index ASC
2049
+ LIMIT ?
2050
+ """,
2051
+ [*entity_asset_ids, max(1000, len(entity_asset_ids) * 80)],
2052
+ ).fetchall()
2053
+
2054
+ rows = []
2055
+ seen_chunks = set()
2056
+ for row in [*entity_rows, *base_rows]:
2057
+ chunk_id = row["chunk_id"]
2058
+ if chunk_id in seen_chunks:
2059
+ continue
2060
+ seen_chunks.add(chunk_id)
2061
+ rows.append(row)
2062
+ return rows
2063
+
2064
+
2065
+ def context_query(query: str, *, intent: str = "answer", limit: int = 12, evidence_required: bool = True, current_context: str = "") -> dict:
2066
+ conn = _conn()
2067
+ qvec = embeddings.embed_text(query)
2068
+ entities_payload, entity_boosts = _entity_matches_for_query(conn, query, limit=max(int(limit), 1))
2069
+ rows = _context_candidate_rows(conn, list(entity_boosts.keys()), base_limit=5000)
1875
2070
  scored = []
1876
2071
  for row in rows:
1877
2072
  if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
1878
2073
  continue
1879
2074
  vector = json_loads(row["vector_json"], [])
1880
- score = max(_search_text_score(query, row["text"]), embeddings.cosine(qvec, vector))
2075
+ text_score = _search_text_score(query, row["text"])
2076
+ path_score = _search_text_score(query, row["path"] or "")
2077
+ summary_score = _search_text_score(query, row["summary"] or "")
2078
+ entity_score = entity_boosts.get(row["asset_id"], 0.0)
2079
+ vector_score = embeddings.cosine(qvec, vector)
2080
+ score = max(text_score, path_score, summary_score, vector_score)
2081
+ if entity_score > 0:
2082
+ direct_score = max(text_score, path_score, summary_score)
2083
+ if direct_score > 0:
2084
+ entity_rank = 0.82 + (0.42 * text_score) + (0.18 * path_score) + (0.12 * summary_score)
2085
+ score = max(score, entity_rank + min(0.2, entity_score * 0.2))
2086
+ else:
2087
+ # Entity-level matches keep older assets eligible, but do not let
2088
+ # unrelated chunks from a long document outrank direct evidence.
2089
+ score = max(score, min(0.48, 0.28 + entity_score * 0.2))
1881
2090
  if score > 0:
1882
- scored.append((score, row))
2091
+ scored.append((min(float(score), 1.6), row))
1883
2092
  scored.sort(key=lambda item: item[0], reverse=True)
1884
2093
  assets = []
1885
2094
  chunks = []
@@ -1902,14 +2111,10 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
1902
2111
  "score": round(float(score), 4),
1903
2112
  })
1904
2113
  evidence_refs.append(f"local_asset:{row['asset_id']}#chunk:{row['chunk_id']}")
1905
- entity_rows = conn.execute(
1906
- "SELECT DISTINCT name, entity_type, asset_id FROM local_entities WHERE lower(name) LIKE ? LIMIT ?",
1907
- (f"%{query.lower()}%", int(limit)),
1908
- ).fetchall()
1909
- entities_payload = [dict(row) for row in entity_rows]
1910
2114
  relations_payload: list[dict] = []
1911
- if seen_assets:
1912
- asset_ids = list(seen_assets)[: int(limit)]
2115
+ relation_asset_ids = list(dict.fromkeys([*seen_assets, *entity_boosts.keys()]))[: int(limit)]
2116
+ if relation_asset_ids:
2117
+ asset_ids = relation_asset_ids
1913
2118
  placeholders = ",".join("?" for _ in asset_ids)
1914
2119
  relation_rows = conn.execute(
1915
2120
  f"""
@@ -4,12 +4,15 @@ import csv
4
4
  import html
5
5
  import json
6
6
  import re
7
+ import sqlite3
7
8
  import zipfile
8
9
  from email import policy
9
10
  from email.parser import BytesParser
10
11
  from pathlib import Path
11
12
  from xml.etree import ElementTree
12
13
 
14
+ from .privacy import is_local_email_db
15
+
13
16
  MAX_TEXT_BYTES = 512 * 1024
14
17
  MAX_CHARS = 120_000
15
18
 
@@ -73,8 +76,8 @@ def _extract_csv(path: Path) -> str:
73
76
  return "\n".join(rows)[:MAX_CHARS]
74
77
 
75
78
 
76
- def _extract_eml(path: Path) -> tuple[str, dict]:
77
- msg = BytesParser(policy=policy.default).parsebytes(path.read_bytes()[:MAX_TEXT_BYTES])
79
+ def _extract_email_bytes(data: bytes) -> tuple[str, dict]:
80
+ msg = BytesParser(policy=policy.default).parsebytes(data[:MAX_TEXT_BYTES])
78
81
  meta = {
79
82
  "subject": str(msg.get("subject") or ""),
80
83
  "from": str(msg.get("from") or ""),
@@ -92,6 +95,99 @@ def _extract_eml(path: Path) -> tuple[str, dict]:
92
95
  return "\n".join([meta["subject"], meta["from"], meta["to"], text])[:MAX_CHARS], meta
93
96
 
94
97
 
98
def _extract_eml(path: Path) -> tuple[str, dict]:
    """Parse an ``.eml`` file and return its text and header metadata.

    Only the first ``MAX_TEXT_BYTES`` bytes of the file are handed to the
    e-mail parser.
    """
    raw = path.read_bytes()
    return _extract_email_bytes(raw[:MAX_TEXT_BYTES])
100
+
101
+
102
def _extract_emlx(path: Path) -> tuple[str, dict]:
    """Extract text and metadata from an Apple Mail ``.emlx`` file.

    The file is expected to start with a line holding the byte length of the
    message, followed by the message itself and then a trailing XML block.
    At most ``MAX_TEXT_BYTES`` bytes are read from disk.

    Returns:
        ``(text, metadata)`` from the e-mail parser, with
        ``metadata["apple_mail_message"]`` set to ``True``.
    """
    # Read only what will be parsed instead of loading the whole file.
    with path.open("rb") as handle:
        data = handle.read(MAX_TEXT_BYTES)

    first_line, separator, rest = data.partition(b"\n")
    length_trusted = False
    if separator and first_line.strip().isdigit():
        declared = int(first_line.strip())
        payload = rest[:declared] if declared > 0 else rest
        # The declared length bounds the message exactly only when it fits
        # inside the bytes that were read.
        length_trusted = 0 < declared <= len(rest)
    else:
        payload = data

    # Without a reliable length, cut the trailing XML block heuristically.
    # Skipping this when the length is trusted keeps messages whose own body
    # contains an XML declaration intact.
    if not length_trusted and b"\n<?xml" in payload:
        payload = payload.split(b"\n<?xml", 1)[0]

    text, meta = _extract_email_bytes(payload)
    meta["apple_mail_message"] = True
    return text, meta
115
+
116
+
117
def _printable_binary_text(path: Path) -> str:
    """Return the printable text runs found in a binary file.

    At most ``MAX_TEXT_BYTES`` bytes are read. Runs of four or more word,
    accented-Latin or common punctuation characters are kept, one per line,
    and the result is capped at ``MAX_CHARS`` characters.
    """
    # Read only the bytes that will be scanned instead of the whole file.
    with path.open("rb") as handle:
        data = handle.read(MAX_TEXT_BYTES)

    # A NUL byte near the start is taken as a sign of UTF-16 text; anything
    # else is decoded byte-for-byte as Latin-1.
    encoding = "utf-16" if b"\x00" in data[:2000] else "latin-1"
    decoded = data.decode(encoding, errors="ignore")

    # "[" and "]" are escaped with a single backslash so they remain members
    # of the character class; a doubled backslash in a raw string would close
    # the class early and leave a pattern that practically never matches.
    pieces = re.findall(r"[\wÀ-ÿ@./:=+\- ,;()\[\]{}]{4,}", decoded)
    return "\n".join(piece.strip() for piece in pieces if piece.strip())[:MAX_CHARS]
122
+
123
+
124
def _extract_msg(path: Path) -> tuple[str, dict]:
    """Extract text and metadata from an Outlook ``.msg`` file.

    The optional ``extract_msg`` package is used when it can be imported.
    Any failure (package missing, unreadable file, parser error) falls back
    to scraping printable text, reported with extractor ``"msg_fallback"``.

    Returns:
        ``(text, metadata)``; ``text`` is capped at ``MAX_CHARS`` characters
        on the parsed path.
    """
    try:
        import extract_msg  # type: ignore

        message = extract_msg.Message(str(path))
        try:
            meta = {
                "subject": str(getattr(message, "subject", "") or ""),
                "from": str(getattr(message, "sender", "") or ""),
                "to": str(getattr(message, "to", "") or ""),
                "date": str(getattr(message, "date", "") or ""),
                "extractor": "msg",
            }
            body = str(getattr(message, "body", "") or "")
        finally:
            # Release the underlying file handle even when reading a field
            # fails and the fallback below takes over.
            close = getattr(message, "close", None)
            if callable(close):
                close()
        return "\n".join([meta["subject"], meta["from"], meta["to"], body])[:MAX_CHARS], meta
    except Exception:
        # Deliberate best-effort: extraction must not fail because an
        # optional parser is absent or rejects the file.
        return _printable_binary_text(path), {"extractor": "msg_fallback"}
142
+
143
+
144
+ def _table_names(conn: sqlite3.Connection) -> set[str]:
145
+ rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
146
+ return {str(row[0]) for row in rows}
147
+
148
+
149
+ def _select_existing_columns(conn: sqlite3.Connection, table: str, columns: list[str]) -> list[str]:
150
+ found = {str(row[1]) for row in conn.execute(f"PRAGMA table_info({table})").fetchall()}
151
+ return [column for column in columns if column in found]
152
+
153
+
154
def _extract_nexo_email_db(path: Path) -> tuple[str, dict]:
    """Dump recent rows of a NEXO e-mail SQLite database as searchable text.

    The database is opened read-only. Up to 1000 rows are taken from each of
    the ``emails`` and ``sent_email_events`` tables, newest first when a
    timestamp column exists, restricted to the known columns that are
    present. Every value is cut to 4000 characters and the combined text to
    ``MAX_CHARS``.

    Returns:
        ``(text, metadata)``. ``metadata["extractor"]`` is ``"sqlite_blocked"``
        when the path is not an accepted e-mail database, else
        ``"nexo_email_db"``. ``metadata["state"]`` is
        ``"locked_or_unavailable"`` when the database could not be opened or
        read; whatever text was gathered before the failure is still returned.
    """
    if not is_local_email_db(str(path)):
        return "", {"extractor": "sqlite_blocked"}

    from urllib.parse import quote

    # Percent-encode the path so characters such as "?", "#" or "%" cannot be
    # misread as URI syntax; forward slashes keep the URI valid on Windows.
    uri = f"file:{quote(path.as_posix(), safe='/:')}?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True, timeout=1)
    except Exception:
        return "", {"extractor": "nexo_email_db", "state": "locked_or_unavailable"}

    parts: list[str] = []
    tables: set[str] = set()

    def _dump(table: str, cols: list[str], order: str) -> None:
        """Append the newest rows of ``table`` to ``parts`` as joined text."""
        query = f"SELECT {', '.join(cols)} FROM {table} ORDER BY {order} DESC LIMIT 1000"
        for row in conn.execute(query).fetchall():
            parts.append(" | ".join(str(value or "")[:4000] for value in row))

    try:
        tables = _table_names(conn)
        if "emails" in tables:
            cols = _select_existing_columns(
                conn,
                "emails",
                ["from_addr", "from_name", "subject", "received_at", "status", "body", "response"],
            )
            if not cols:
                return "", {"extractor": "nexo_email_db", "tables": sorted(tables)}
            _dump("emails", cols, "received_at" if "received_at" in cols else "rowid")
        if "sent_email_events" in tables:
            cols = _select_existing_columns(
                conn,
                "sent_email_events",
                ["sender", "to_addrs", "cc_addrs", "subject", "sent_at", "status", "body_text"],
            )
            if cols:
                _dump("sent_email_events", cols, "sent_at" if "sent_at" in cols else "rowid")
    except sqlite3.Error:
        # Locks and corruption surface on the first query rather than on
        # connect, so they are reported here with the same state.
        return "\n".join(parts)[:MAX_CHARS], {
            "extractor": "nexo_email_db",
            "state": "locked_or_unavailable",
            "tables": sorted(tables),
        }
    finally:
        conn.close()
    return "\n".join(parts)[:MAX_CHARS], {"extractor": "nexo_email_db", "tables": sorted(tables)}
189
+
190
+
95
191
  def _zip_xml_text(path: Path, members: list[str]) -> str:
96
192
  pieces: list[str] = []
97
193
  with zipfile.ZipFile(path) as zf:
@@ -176,6 +272,14 @@ def extract_text(path: Path) -> tuple[str, dict]:
176
272
  elif suffix == ".eml":
177
273
  text, metadata = _extract_eml(path)
178
274
  metadata["extractor"] = "eml"
275
+ elif suffix == ".emlx":
276
+ text, metadata = _extract_emlx(path)
277
+ metadata["extractor"] = "emlx"
278
+ elif suffix == ".msg":
279
+ text, metadata = _extract_msg(path)
280
+ metadata["extractor"] = metadata.get("extractor") or "msg"
281
+ elif suffix == ".db" and is_local_email_db(str(path)):
282
+ text, metadata = _extract_nexo_email_db(path)
179
283
  elif suffix == ".pdf":
180
284
  text = _extract_pdf(path)
181
285
  elif suffix == ".docx":
@@ -67,6 +67,29 @@ SENSITIVE_PARTS = {
67
67
  "browser profile",
68
68
  }
69
69
 
70
# File names (lower-case) recognised as e-mail runtime databases.
EMAIL_RUNTIME_DB_NAMES = set(
    "email.db email-tracker.db emails.db monitor.db nexo-email.db".split()
)

# Suffixes accepted for files stored as e-mail attachments.
EMAIL_ATTACHMENT_SUFFIXES = set(
    ".csv .docx .eml .emlx .html .md .pdf .pptx .txt .xlsx".split()
)

# Suffixes of individual message files.
EMAIL_EXTRACTABLE_SUFFIXES = set(".eml .emlx .msg".split())
92
+
70
93
  NOISY_PARTS = {
71
94
  "node_modules",
72
95
  "vendor",
@@ -173,6 +196,77 @@ def _contains_path_marker(lowered: str, markers: set[str]) -> bool:
173
196
  return any(marker in lowered for marker in markers)
174
197
 
175
198
 
199
+ def _is_under_marker(lowered: str, marker: str) -> bool:
200
+ marker = marker.strip("/").lower()
201
+ if not marker:
202
+ return False
203
+ return lowered.endswith("/" + marker) or f"/{marker}/" in lowered
204
+
205
+
206
+ def _is_inside_windows_mail_package(lowered: str) -> bool:
207
+ return "/appdata/local/packages/microsoft.windowscommunicationsapps" in lowered
208
+
209
+
210
+ def _is_inside_outlook_mac_profile(lowered: str) -> bool:
211
+ return "/library/group containers/ubf8t346g9.office/outlook" in lowered
212
+
213
+
214
def is_local_email_tree(path: str) -> bool:
    """Tell whether ``path`` lies inside a recognised local e-mail store.

    Recognised stores are the Windows Mail package, the Outlook for Mac
    container, and the Apple Mail, NEXO e-mail, Outlook for Windows and
    Thunderbird directories.
    """
    lowered = _normalized(path)
    if _is_inside_windows_mail_package(lowered):
        return True
    if _is_inside_outlook_mac_profile(lowered):
        return True

    store_markers = (
        "library/mail",
        ".nexo/runtime/nexo-email",
        "documents/outlook files",
        "appdata/local/microsoft/outlook",
        "appdata/roaming/microsoft/outlook",
        "appdata/local/packages/microsoft.windowscommunicationsapps",
        ".thunderbird",
        ".mozilla-thunderbird",
    )
    for marker in store_markers:
        if _is_under_marker(lowered, marker):
            return True
    return False
231
+
232
+
233
def is_local_email_db(path: str) -> bool:
    """Tell whether ``path`` is a known e-mail database inside an e-mail store.

    Both conditions must hold: the path lies in a recognised local e-mail
    tree and its lower-cased file name is in ``EMAIL_RUNTIME_DB_NAMES``.
    """
    file_name = Path(path).name.lower()
    if not is_local_email_tree(path):
        return False
    return file_name in EMAIL_RUNTIME_DB_NAMES
236
+
237
+
238
def is_allowed_local_email_file(path: str) -> bool:
    """Tell whether a file inside a local e-mail store may be indexed.

    Files outside the recognised stores and files on sensitive paths are
    rejected. Otherwise the decision depends on the store and the suffix:

    - NEXO e-mail runtime: known databases, allow-listed attachment types
      under ``attachments``, and ``.eml`` / ``.emlx`` elsewhere.
    - Apple Mail: ``.eml`` and ``.emlx``.
    - Outlook and Windows Mail: ``.eml``, ``.msg``, ``.pst`` and ``.ost``.
    - Thunderbird: ``.eml``, ``.mbox`` and files without a suffix.
    """
    if not is_local_email_tree(path):
        return False
    lowered = _normalized(path)
    suffix = Path(path).suffix.lower()
    if is_sensitive_path(path):
        return False

    nexo_root = ".nexo/runtime/nexo-email"
    if _is_under_marker(lowered, nexo_root):
        if is_local_email_db(path):
            return True
        if _is_under_marker(lowered, nexo_root + "/attachments"):
            return suffix in EMAIL_ATTACHMENT_SUFFIXES
        return suffix in {".eml", ".emlx"}

    if _is_under_marker(lowered, "library/mail"):
        return suffix in {".eml", ".emlx"}

    outlook_markers = (
        "library/group containers/ubf8t346g9.office/outlook",
        "documents/outlook files",
        "appdata/local/microsoft/outlook",
        "appdata/roaming/microsoft/outlook",
        "appdata/local/packages/microsoft.windowscommunicationsapps",
    )
    in_outlook_store = (
        any(_is_under_marker(lowered, marker) for marker in outlook_markers)
        or _is_inside_windows_mail_package(lowered)
        or _is_inside_outlook_mac_profile(lowered)
    )
    if in_outlook_store:
        return suffix in {".eml", ".msg", ".pst", ".ost"}

    thunderbird_markers = (".thunderbird", ".mozilla-thunderbird")
    if any(_is_under_marker(lowered, marker) for marker in thunderbird_markers):
        # The empty suffix admits extension-less files in the profile.
        return suffix in {".eml", ".mbox", ""}
    return False
268
+
269
+
176
270
  def _has_transient_project_part(path: str) -> bool:
177
271
  parts = list(_normalized(path).replace(":", "/").split("/"))
178
272
  for index, part in enumerate(parts):
@@ -239,6 +333,8 @@ def classify_path(path: str) -> tuple[int, str, str]:
239
333
  lowered = _normalized(path)
240
334
  parts = _parts(path)
241
335
 
336
+ if is_local_email_tree(path) and (Path(path).suffix == "" or is_allowed_local_email_file(path)):
337
+ return 2, "normal", "local_email_path"
242
338
  if is_sensitive_path(path):
243
339
  return 1, "sensitive_inventory_only", "sensitive_path"
244
340
  if is_private_profile_path(path):
@@ -253,6 +349,8 @@ def classify_path(path: str) -> tuple[int, str, str]:
253
349
  def should_skip_tree(path: str) -> bool:
254
350
  lowered = _normalized(path)
255
351
  parts = _parts(path)
352
+ if is_local_email_tree(path):
353
+ return False
256
354
  if any(item in lowered for item in SYSTEM_PARTS):
257
355
  return True
258
356
  if is_sensitive_path(path) or is_private_profile_path(path):
@@ -263,6 +361,8 @@ def should_skip_tree(path: str) -> bool:
263
361
  def should_skip_file(path: str) -> bool:
264
362
  lowered = _normalized(path)
265
363
  parts = _parts(path)
364
+ if is_local_email_tree(path):
365
+ return not is_allowed_local_email_file(path)
266
366
  if any(item in lowered for item in SYSTEM_PARTS):
267
367
  return True
268
368
  if is_sensitive_path(path) or is_private_profile_path(path):
@@ -282,6 +382,8 @@ def should_extract(path: str, depth: int) -> bool:
282
382
  if should_skip_file(path):
283
383
  return False
284
384
  suffix = Path(path).suffix.lower()
385
+ if is_local_email_db(path):
386
+ return True
285
387
  if suffix in {
286
388
  ".txt",
287
389
  ".md",
@@ -302,6 +404,8 @@ def should_extract(path: str, depth: int) -> bool:
302
404
  ".csv",
303
405
  ".tsv",
304
406
  ".eml",
407
+ ".emlx",
408
+ ".msg",
305
409
  ".pdf",
306
410
  ".docx",
307
411
  ".pptx",
@@ -43,6 +43,15 @@ def _format_local_context_evidence(query: str, *, limit: int = 4) -> str:
43
43
  refs = result.get("evidence_refs") or []
44
44
  if refs:
45
45
  lines.append(f"Evidence refs: {', '.join(str(ref) for ref in refs[:limit])}")
46
+ relations = result.get("relations") or []
47
+ if relations:
48
+ lines.append("Local relations:")
49
+ for relation in relations[:limit]:
50
+ relation_type = str(relation.get("relation_type") or "related")
51
+ target = str(relation.get("target_ref") or relation.get("target_asset_id") or "").strip()
52
+ evidence = str(relation.get("evidence") or "").strip()
53
+ suffix = f" — {evidence[:120]}" if evidence else ""
54
+ lines.append(f"- {relation_type}: {target}{suffix}")
46
55
  return "\n".join(lines)
47
56
 
48
57