nexo-brain 7.20.3 → 7.20.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.20.3",
3
+ "version": "7.20.8",
4
4
  "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
5
5
  "author": {
6
6
  "name": "NEXO Brain",
package/README.md CHANGED
@@ -18,7 +18,19 @@
18
18
 
19
19
  [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
20
20
 
21
- Version `7.20.2` is the current packaged-runtime line. Patch release over v7.20.1 — Local Context now requeues stalled work, reports real macOS/Windows background-service health, records scan errors and preserves Windows drive roots.
21
+ Version `7.20.8` is the current packaged-runtime line. Patch release over v7.20.7 — Local Context recognises Windows Mail package roots and Outlook Mac profile roots as bounded local-email sources instead of rejecting them as generic AppData / Group Containers.
22
+
23
+ Previously in `7.20.7`: patch release over v7.20.6 — Local Context email-root bootstrap is deterministic across CI, WSL and migrated profiles while preserving macOS Mail.app, Windows Outlook, Thunderbird and NEXO email coverage.
24
+
25
+ Previously in `7.20.6`: patch release over v7.20.5 — Local Context ranks entity matches at chunk level, keeps old entity-matched assets eligible, adds safe local email roots for macOS/Windows/Linux, extracts `.eml`, `.emlx`, `.msg` and NEXO email DB continuity, and exposes local graph relations in pre-action context.
26
+
27
+ Previously in `7.20.5`: patch release over v7.20.4 — Local Context status reports elapsed indexing time and a defensive ETA while background jobs remain pending.
28
+
29
+ Previously in `7.20.4`: patch release over v7.20.3 — Local Context now blocks private dotfiles, hidden project folders and secret-bearing content before chunks, embeddings, graph relations or agent context are created.
30
+
31
+ Previously in `7.20.3`: patch release over v7.20.2 — installer DMG volumes are no longer added as local-memory roots, removed roots purge stale payloads, and doctor can repair removed-root residue.
32
+
33
+ Previously in `7.20.2`: patch release over v7.20.1 — Local Context now requeues stalled work, reports real macOS/Windows background-service health, records scan errors and preserves Windows drive roots.
22
34
 
23
35
  Previously in `7.20.1`: patch release over v7.20.0 — the Local Context service now recovers from orphaned locks and mixed-version cycle failures instead of leaving the background index stuck.
24
36
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.20.3",
3
+ "version": "7.20.8",
4
4
  "mcpName": "io.github.wazionapps/nexo",
5
5
  "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
6
6
  "homepage": "https://nexo-brain.com",
@@ -3840,16 +3840,23 @@ def check_local_index_hygiene(fix: bool = False) -> DoctorCheck:
3840
3840
  result = local_context_api.local_index_hygiene(fix=fix)
3841
3841
  residue = result.get("residue") or {}
3842
3842
  cleanup = result.get("cleanup") or {}
3843
+ privacy = result.get("privacy") or {}
3844
+ privacy_residue = privacy.get("residue") or {}
3845
+ privacy_cleanup = privacy.get("cleanup") or {}
3843
3846
  suspect_roots = [str(path) for path in result.get("removed_roots") or []]
3844
3847
  residue_total = sum(int(residue.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))
3845
3848
  cleanup_total = sum(int(cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))
3849
+ privacy_residue_total = sum(int(privacy_residue.get(key, 0) or 0) for key in ("assets", "dirs", "content_secret_assets"))
3850
+ privacy_cleanup_total = sum(int(privacy_cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "chunks", "embeddings", "entities", "relations", "versions", "dirs", "content_secret_assets"))
3846
3851
  evidence = [
3847
3852
  "suspect_installer_roots=" + str(len(suspect_roots)),
3848
3853
  "residue=" + json.dumps(residue, sort_keys=True),
3849
3854
  "cleanup=" + json.dumps(cleanup, sort_keys=True),
3855
+ "privacy_residue=" + json.dumps(privacy_residue, sort_keys=True),
3856
+ "privacy_cleanup=" + json.dumps(privacy_cleanup, sort_keys=True),
3850
3857
  ]
3851
3858
  evidence.extend(f"root={path}" for path in suspect_roots[:5])
3852
- if residue_total == 0 and not suspect_roots:
3859
+ if residue_total == 0 and privacy_residue_total == 0 and not suspect_roots:
3853
3860
  return DoctorCheck(
3854
3861
  id="runtime.local_index_hygiene",
3855
3862
  tier="runtime",
@@ -3868,17 +3875,17 @@ def check_local_index_hygiene(fix: bool = False) -> DoctorCheck:
3868
3875
  summary="Local memory index hygiene repaired",
3869
3876
  evidence=evidence,
3870
3877
  repair_plan=[],
3871
- fixed=cleanup_total > 0 or bool(suspect_roots),
3878
+ fixed=cleanup_total > 0 or privacy_cleanup_total > 0 or bool(suspect_roots),
3872
3879
  )
3873
3880
  return DoctorCheck(
3874
3881
  id="runtime.local_index_hygiene",
3875
3882
  tier="runtime",
3876
3883
  status="degraded",
3877
3884
  severity="warn",
3878
- summary="Local memory index has stale removed-root residue",
3885
+ summary="Local memory index has stale or private residue",
3879
3886
  evidence=evidence,
3880
- repair_plan=["Run `nexo doctor --tier runtime --fix` to purge stale local memory roots and installer-volume residue"],
3881
- escalation_prompt="Local memory status may show stale pending or failed jobs from removed roots.",
3887
+ repair_plan=["Run `nexo doctor --tier runtime --fix` to purge stale local memory roots and private local-memory residue"],
3888
+ escalation_prompt="Local memory may contain stale or private index payloads that should be purged before indexing continues.",
3882
3889
  )
3883
3890
  except Exception as exc:
3884
3891
  return DoctorCheck(
@@ -14,9 +14,9 @@ from db import get_db, init_db
14
14
  from db._schema import run_migrations
15
15
 
16
16
  from . import embeddings
17
- from .extractors import chunk_text, entities, extract_text, summarize
17
+ from .extractors import chunk_text, contains_secret, entities, extract_text, summarize
18
18
  from .logging import log_event, tail
19
- from .privacy import classify_path, should_extract, should_skip_tree
19
+ from .privacy import classify_path, is_queryable_path, should_extract, should_skip_file, should_skip_tree
20
20
  from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
21
21
 
22
22
  LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
@@ -41,6 +41,9 @@ def _conn():
41
41
  def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> dict:
42
42
  conn = _conn()
43
43
  root_path = norm_path(path)
44
+ if should_skip_tree(root_path):
45
+ log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
46
+ return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
44
47
  depth_value = 2 if depth is None else int(depth)
45
48
  conn.execute(
46
49
  """
@@ -120,12 +123,44 @@ def _mounted_volume_roots() -> list[str]:
120
123
  return roots
121
124
 
122
125
 
126
+ def _local_email_roots() -> list[str]:
127
+ home = Path.home()
128
+ roots: list[Path] = [home / ".nexo" / "runtime" / "nexo-email"]
129
+ mac_roots = [
130
+ home / "Library" / "Mail",
131
+ home / "Library" / "Group Containers" / "UBF8T346G9.Office" / "Outlook" / "Outlook 15 Profiles",
132
+ ]
133
+ local_app_data = Path(os.environ.get("LOCALAPPDATA") or home / "AppData" / "Local")
134
+ roaming_app_data = Path(os.environ.get("APPDATA") or home / "AppData" / "Roaming")
135
+ windows_roots = [
136
+ home / "Documents" / "Outlook Files",
137
+ local_app_data / "Microsoft" / "Outlook",
138
+ roaming_app_data / "Microsoft" / "Outlook",
139
+ local_app_data / "Packages" / "microsoft.windowscommunicationsapps_8wekyb3d8bbwe" / "LocalState",
140
+ ]
141
+ linux_roots = [home / ".thunderbird", home / ".mozilla-thunderbird"]
142
+
143
+ if sys.platform == "darwin":
144
+ roots.extend(mac_roots)
145
+ elif sys.platform.startswith("win"):
146
+ roots.extend(windows_roots)
147
+ else:
148
+ roots.extend(linux_roots)
149
+
150
+ # CI and migrated profiles can expose platform-specific mail stores while
151
+ # running on another OS. Include only the stores that actually exist.
152
+ for optional_root in [*mac_roots, *windows_roots, *linux_roots]:
153
+ if optional_root.exists() and optional_root not in roots:
154
+ roots.append(optional_root)
155
+ return [str(root) for root in roots]
156
+
157
+
123
158
  def default_roots() -> list[str]:
124
159
  home = Path.home()
125
160
  configured = os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_ROOTS", "").strip()
126
161
  if configured:
127
162
  return _dedupe_roots([item for item in configured.split(os.pathsep) if item.strip()])
128
- return _dedupe_roots([str(home), *_mounted_volume_roots()])
163
+ return _dedupe_roots([str(home), *_local_email_roots(), *_mounted_volume_roots()])
129
164
 
130
165
 
131
166
  def ensure_default_roots() -> dict:
@@ -220,6 +255,7 @@ def _purge_removed_root_payloads(conn, *, root_paths: list[str] | None = None) -
220
255
  for table in ("local_embeddings", "local_chunks", "local_entities", "local_asset_versions"):
221
256
  conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({asset_subquery})", tuple(params))
222
257
  conn.execute(f"DELETE FROM local_relations WHERE source_asset_id IN ({asset_subquery})", tuple(params))
258
+ conn.execute(f"DELETE FROM local_relations WHERE target_asset_id IN ({asset_subquery})", tuple(params))
223
259
  conn.execute(f"DELETE FROM local_relations WHERE target_ref IN ({asset_subquery})", tuple(params))
224
260
  conn.execute(f"DELETE FROM local_index_jobs WHERE asset_id IN ({asset_subquery})", tuple(params))
225
261
  conn.execute(f"DELETE FROM local_index_errors WHERE asset_id IN ({asset_subquery})", tuple(params))
@@ -235,12 +271,136 @@ def _purge_removed_root_payloads(conn, *, root_paths: list[str] | None = None) -
235
271
  return counts
236
272
 
237
273
 
274
def _purge_asset_ids(conn, asset_ids: list[str]) -> dict:
    """Delete the given assets and every index payload that references them.

    Empty ids and duplicates are ignored. Deletes run in batches of 500 ids
    so a single statement never carries an oversized parameter list. The
    caller is responsible for committing.

    Args:
        conn: Open database connection exposing ``execute``.
        asset_ids: Asset identifiers to purge.

    Returns:
        Number of rows actually deleted per category: ``assets``, ``jobs``,
        ``errors``, ``chunks``, ``embeddings``, ``entities``, ``relations``
        and ``versions``.
    """
    unique_ids = [asset_id for asset_id in dict.fromkeys(asset_ids) if asset_id]
    counts = {"assets": 0, "jobs": 0, "errors": 0, "chunks": 0, "embeddings": 0, "entities": 0, "relations": 0, "versions": 0}
    if not unique_ids:
        return counts
    dependent_tables = (
        ("embeddings", "local_embeddings"),
        ("chunks", "local_chunks"),
        ("entities", "local_entities"),
        ("versions", "local_asset_versions"),
        ("jobs", "local_index_jobs"),
        ("errors", "local_index_errors"),
    )
    # A relation can point at an asset through any of these three columns.
    relation_columns = ("source_asset_id", "target_asset_id", "target_ref")
    for start in range(0, len(unique_ids), 500):
        batch = tuple(unique_ids[start:start + 500])
        placeholders = ",".join("?" for _ in batch)
        for key, table in dependent_tables:
            counts[key] += int(conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({placeholders})", batch).rowcount or 0)
        for column in relation_columns:
            counts["relations"] += int(conn.execute(f"DELETE FROM local_relations WHERE {column} IN ({placeholders})", batch).rowcount or 0)
        # Count the asset rows that really existed instead of the number of
        # ids requested, so the report matches what was removed.
        counts["assets"] += int(conn.execute(f"DELETE FROM local_assets WHERE asset_id IN ({placeholders})", batch).rowcount or 0)
    return counts
296
+
297
+
298
def _privacy_unsafe_asset_ids(conn) -> list[str]:
    """List ids of indexed assets that the privacy rules no longer allow.

    An asset qualifies when ``should_skip_file`` rejects its path or when its
    stored privacy class is one of the blocked / inventory-only classes.
    """
    blocked_classes = {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}
    records = conn.execute("SELECT asset_id, path, privacy_class FROM local_assets").fetchall()
    return [
        str(record["asset_id"])
        for record in records
        if should_skip_file(str(record["path"] or ""))
        or str(record["privacy_class"] or "") in blocked_classes
    ]
306
+
307
+
308
def _privacy_unsafe_dir_ids(conn) -> list[str]:
    """List ids of tracked directories that ``should_skip_tree`` rejects."""
    flagged: list[str] = []
    for record in conn.execute("SELECT dir_id, path FROM local_index_dirs").fetchall():
        dir_path = str(record["path"] or "")
        if should_skip_tree(dir_path):
            flagged.append(str(record["dir_id"]))
    return flagged
311
+
312
+
313
def _content_secret_asset_ids(conn) -> list[str]:
    """Find active, normal-privacy assets whose stored chunks contain a secret.

    Every chunk of every active asset with privacy class ``normal`` (or no
    class) is checked with ``contains_secret``; once one chunk of an asset
    matches, its remaining chunks are skipped.

    Returns:
        Sorted list of the matching asset ids.
    """
    cursor = conn.execute(
        """
        SELECT c.asset_id, c.text
        FROM local_chunks c
        JOIN local_assets a ON a.asset_id=c.asset_id
        WHERE a.status='active'
          AND COALESCE(a.privacy_class, 'normal')='normal'
        ORDER BY c.asset_id, c.chunk_index
        """
    )
    secret_ids: set[str] = set()
    # Stream rows from the cursor instead of fetchall(): this query returns
    # the text of every indexed chunk, which should not be held in memory at
    # once. Nothing is written while iterating.
    for row in cursor:
        asset_id = str(row["asset_id"])
        if asset_id in secret_ids:
            continue
        if contains_secret(str(row["text"] or "")):
            secret_ids.add(asset_id)
    return sorted(secret_ids)
332
+
333
+
334
def _mark_content_secret_assets(conn, asset_ids: list[str]) -> int:
    """Strip indexed content from secret-bearing assets and keep them as inventory only.

    For each batch of up to 500 ids this removes embeddings, chunks, entities
    and relations, marks the assets' jobs as done with
    ``content_secret_blocked``, blanks the version summaries, and flags the
    assets as ``content_secret_inventory_only``. The caller commits.

    Returns:
        Number of distinct non-empty ids processed (not rows updated).
    """
    targets = [asset_id for asset_id in dict.fromkeys(asset_ids) if asset_id]
    if not targets:
        return 0
    offset = 0
    while offset < len(targets):
        id_batch = targets[offset:offset + 500]
        offset += 500
        marks = ",".join("?" * len(id_batch))
        bound_ids = tuple(id_batch)
        for payload_table in ("local_embeddings", "local_chunks", "local_entities"):
            conn.execute(f"DELETE FROM {payload_table} WHERE asset_id IN ({marks})", bound_ids)
        for relation_column in ("source_asset_id", "target_asset_id", "target_ref"):
            conn.execute(f"DELETE FROM local_relations WHERE {relation_column} IN ({marks})", bound_ids)
        # Close out any job for these assets so it is not retried.
        conn.execute(
            f"""
            UPDATE local_index_jobs
            SET status='done', last_error_code='content_secret_blocked', updated_at=?
            WHERE asset_id IN ({marks})
            """,
            (now(), *id_batch),
        )
        # Drop the extracted summary and replace metadata with a block marker.
        conn.execute(
            f"""
            UPDATE local_asset_versions
            SET summary='', metadata_json=?
            WHERE asset_id IN ({marks})
            """,
            (json_dumps({"content_blocked": "secret_pattern"}), *id_batch),
        )
        conn.execute(
            f"""
            UPDATE local_assets
            SET privacy_class='content_secret_inventory_only',
                depth=1,
                depth_reason='content_secret',
                phase='privacy_blocked',
                updated_at=?
            WHERE asset_id IN ({marks})
            """,
            (now(), *id_batch),
        )
    return len(targets)
375
+
376
+
377
def local_index_privacy_hygiene(*, fix: bool = False) -> dict:
    """Report, and optionally purge, index payloads that violate privacy rules.

    Args:
        fix: When true, purge unsafe assets and directories, downgrade
            secret-bearing assets to inventory-only, and commit.

    Returns:
        ``{"ok", "fix", "residue", "cleanup"}`` where ``residue`` counts what
        was found before any repair and ``cleanup`` counts what was removed
        (all zeros when ``fix`` is false).
    """
    conn = _conn()
    asset_ids = _privacy_unsafe_asset_ids(conn)
    dir_ids = _privacy_unsafe_dir_ids(conn)
    content_secret_ids = _content_secret_asset_ids(conn)
    residue = {"assets": len(asset_ids), "dirs": len(dir_ids), "content_secret_assets": len(content_secret_ids)}
    cleanup = {"assets": 0, "jobs": 0, "errors": 0, "chunks": 0, "embeddings": 0, "entities": 0, "relations": 0, "versions": 0, "dirs": 0, "content_secret_assets": 0}
    if fix:
        cleanup.update(_purge_asset_ids(conn, asset_ids))
        if dir_ids:
            # Batch the ids so each DELETE carries at most 500 parameters.
            for start in range(0, len(dir_ids), 500):
                batch = dir_ids[start:start + 500]
                placeholders = ",".join("?" for _ in batch)
                cleanup["dirs"] += int(conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({placeholders})", tuple(batch)).rowcount or 0)
        cleanup["content_secret_assets"] = _mark_content_secret_assets(conn, content_secret_ids)
        conn.commit()
        # NOTE(review): indentation was lost in this view; the "repaired" log
        # is assumed to belong to the `if fix:` branch — confirm against the
        # packaged file.
        if asset_ids or dir_ids or content_secret_ids:
            log_event("warn", "privacy_hygiene_repaired", "Local memory privacy hygiene repaired", cleanup=cleanup)
    return {"ok": True, "fix": fix, "residue": residue, "cleanup": cleanup}
396
+
397
+
238
398
  def local_index_hygiene(*, fix: bool = False) -> dict:
239
399
  conn = _conn()
240
400
  removed_paths: list[str] = []
241
401
  for row in conn.execute("SELECT id, root_path FROM local_index_roots").fetchall():
242
402
  path = str(row["root_path"] or "")
243
- if _should_skip_mounted_root(Path(path)):
403
+ if _should_skip_mounted_root(Path(path)) or should_skip_tree(path):
244
404
  removed_paths.append(path)
245
405
  if fix:
246
406
  conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE id=?", (now(), row["id"]))
@@ -249,9 +409,10 @@ def local_index_hygiene(*, fix: bool = False) -> dict:
249
409
  if fix:
250
410
  cleanup = _purge_removed_root_payloads(conn)
251
411
  conn.commit()
412
+ privacy = local_index_privacy_hygiene(fix=fix)
252
413
  if fix and (removed_paths or any(int(cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))):
253
414
  log_event("info", "index_hygiene_repaired", "Local memory index hygiene repaired", roots=[redact_path(path) for path in removed_paths], cleanup=cleanup)
254
- return {"ok": True, "fix": fix, "removed_roots": removed_paths, "residue": before, "cleanup": cleanup}
415
+ return {"ok": True, "fix": fix, "removed_roots": removed_paths, "residue": before, "cleanup": cleanup, "privacy": privacy}
255
416
 
256
417
 
257
418
  def repair_index_hygiene() -> dict:
@@ -342,7 +503,7 @@ def _file_type(path: Path) -> str:
342
503
  return "photo"
343
504
  if suffix in {".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql", ".css", ".html"}:
344
505
  return "code"
345
- if suffix in {".eml"}:
506
+ if suffix in {".eml", ".emlx", ".msg", ".pst", ".ost"}:
346
507
  return "email"
347
508
  if suffix in {".pdf", ".docx", ".pptx", ".xlsx", ".md", ".txt", ".csv", ".tsv"}:
348
509
  return "document"
@@ -424,6 +585,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
424
585
  raw_path = str(path)
425
586
  normalized = norm_path(raw_path)
426
587
  asset_id = stable_id("asset", normalized)
588
+ if should_skip_file(normalized):
589
+ return asset_id, False, "skipped"
427
590
  perm = _permission_state(path)
428
591
  depth, privacy_class, depth_reason = classify_path(normalized)
429
592
  depth = min(depth, root_depth)
@@ -546,6 +709,20 @@ def _mark_dir_subtree_deleted(conn, dir_path: str, deleted_at: float | None = No
546
709
  return len(rows)
547
710
 
548
711
 
712
def _purge_dir_subtree(conn, dir_path: str) -> int:
    """Purge every indexed asset, directory and error at or below a directory.

    Args:
        conn: Open database connection exposing ``execute``.
        dir_path: Directory whose subtree must leave the index.

    Returns:
        Number of asset rows that matched the directory or its subtree.
    """
    normalized = norm_path(dir_path)
    # presumably the directory path plus a trailing separator — see _path_prefix
    prefix = _path_prefix(normalized)
    # "_" and "%" are LIKE wildcards. Left unescaped, a directory such as
    # ".my_cache/" would also match ".my-cache/" and purge a sibling tree, so
    # escape them (and the escape character itself, which occurs in Windows
    # paths) and match the prefix literally.
    like_pattern = prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + "%"
    subtree_filter = "path=? OR path LIKE ? ESCAPE '\\'"
    params = (normalized, like_pattern)
    rows = conn.execute(f"SELECT asset_id FROM local_assets WHERE {subtree_filter}", params).fetchall()
    asset_ids = [str(row["asset_id"]) for row in rows]
    _purge_asset_ids(conn, asset_ids)
    conn.execute(f"DELETE FROM local_index_dirs WHERE {subtree_filter}", params)
    conn.execute(f"DELETE FROM local_index_errors WHERE {subtree_filter}", params)
    return len(asset_ids)
724
+
725
+
549
726
  def _record_index_error(
550
727
  conn,
551
728
  *,
@@ -651,6 +828,8 @@ def _iter_files(
651
828
  continue
652
829
  if entry.is_file():
653
830
  normalized = norm_path(entry)
831
+ if should_skip_file(normalized):
832
+ continue
654
833
  if start_after_norm and normalized <= start_after_norm:
655
834
  continue
656
835
  yield entry
@@ -729,7 +908,11 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
729
908
  path = str(row["path"])
730
909
  root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
731
910
  if _is_excluded(path, exclusions):
732
- _mark_asset_deleted(conn, row["asset_id"], seen_at)
911
+ _purge_asset_ids(conn, [row["asset_id"]])
912
+ stats["excluded"] += 1
913
+ continue
914
+ if should_skip_file(path):
915
+ _purge_asset_ids(conn, [row["asset_id"]])
733
916
  stats["excluded"] += 1
734
917
  continue
735
918
  if root_path is not None and not root_path.exists():
@@ -836,6 +1019,8 @@ def _scan_known_directory(
836
1019
  stack.append(entry)
837
1020
  continue
838
1021
  if entry.is_file():
1022
+ if should_skip_file(str(entry)):
1023
+ continue
839
1024
  seen_files.add(norm_path(entry))
840
1025
  if stats["files_scanned"] >= file_limit:
841
1026
  continue
@@ -843,7 +1028,7 @@ def _scan_known_directory(
843
1028
  stats["files_scanned"] += 1
844
1029
  if changed:
845
1030
  stats["files_changed"] += 1
846
- if state != "ok":
1031
+ if state not in {"ok", "skipped"}:
847
1032
  stats["errors"] += 1
848
1033
  except Exception as exc:
849
1034
  _record_scan_error(conn, stats, str(entry), "live_reconcile", exc)
@@ -887,6 +1072,10 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
887
1072
  stats["files_deleted"] += _mark_dir_subtree_deleted(conn, str(dir_path), seen_at)
888
1073
  stats["excluded_dirs"] += 1
889
1074
  continue
1075
+ if should_skip_tree(str(dir_path)):
1076
+ stats["files_deleted"] += _purge_dir_subtree(conn, str(dir_path))
1077
+ stats["excluded_dirs"] += 1
1078
+ continue
890
1079
  if root_path is not None and not root_path.exists():
891
1080
  stats["offline"] += 1
892
1081
  continue
@@ -966,6 +1155,12 @@ def scan_once(*, limit: int | None = None) -> dict:
966
1155
  for root in roots:
967
1156
  root_path = Path(root["root_path"]).expanduser()
968
1157
  root_id = int(root["id"])
1158
+ if should_skip_tree(str(root_path)):
1159
+ conn.execute(
1160
+ "UPDATE local_index_roots SET status='removed', last_scan_at=?, updated_at=? WHERE id=?",
1161
+ (now(), now(), root_id),
1162
+ )
1163
+ continue
969
1164
  if not root_path.exists():
970
1165
  conn.execute(
971
1166
  "UPDATE local_index_roots SET status='offline', last_scan_at=?, updated_at=? WHERE id=?",
@@ -997,7 +1192,7 @@ def scan_once(*, limit: int | None = None) -> dict:
997
1192
  seen_for_root += 1
998
1193
  if changed:
999
1194
  totals["changed"] += 1
1000
- if state != "ok":
1195
+ if state not in {"ok", "skipped"}:
1001
1196
  totals["errors"] += 1
1002
1197
  partial_root = bool(limit and seen_for_root >= limit)
1003
1198
  totals["partial"] = bool(totals["partial"] or partial_root)
@@ -1121,7 +1316,7 @@ def process_jobs(*, limit: int = 100) -> dict:
1121
1316
  recovered = _requeue_due_jobs(conn)
1122
1317
  rows = conn.execute(
1123
1318
  """
1124
- SELECT j.*, a.path, a.depth, a.status AS asset_status
1319
+ SELECT j.*, a.path, a.depth, a.privacy_class, a.status AS asset_status
1125
1320
  FROM local_index_jobs j
1126
1321
  JOIN local_assets a ON a.asset_id = j.asset_id
1127
1322
  WHERE j.status='pending'
@@ -1143,9 +1338,24 @@ def process_jobs(*, limit: int = 100) -> dict:
1143
1338
  try:
1144
1339
  if row["asset_status"] != "active":
1145
1340
  raise FileNotFoundError(row["path"])
1341
+ if str(row["privacy_class"] or "normal") != "normal":
1342
+ conn.execute(
1343
+ "UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='privacy_blocked' WHERE job_id=?",
1344
+ (now(), job_id),
1345
+ )
1346
+ processed += 1
1347
+ continue
1146
1348
  if job_type == "light_extraction":
1147
1349
  text, metadata = extract_text(Path(row["path"]))
1148
1350
  version_id = _latest_version_id(conn, asset_id)
1351
+ if contains_secret(text):
1352
+ _mark_content_secret_assets(conn, [asset_id])
1353
+ conn.execute(
1354
+ "UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='content_secret_blocked' WHERE job_id=?",
1355
+ (now(), job_id),
1356
+ )
1357
+ processed += 1
1358
+ continue
1149
1359
  summary = summarize(text)
1150
1360
  conn.execute(
1151
1361
  "UPDATE local_asset_versions SET summary=?, metadata_json=? WHERE version_id=?",
@@ -1202,6 +1412,9 @@ def run_once(
1202
1412
  live_dir_limit: int = DEFAULT_LIVE_DIR_LIMIT,
1203
1413
  live_file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
1204
1414
  ) -> dict:
1415
+ if _get_state("privacy_hygiene_v2", "0") != "1":
1416
+ local_index_privacy_hygiene(fix=True)
1417
+ _set_state("privacy_hygiene_v2", "1")
1205
1418
  if root:
1206
1419
  add_root(root)
1207
1420
  elif (
@@ -1471,6 +1684,29 @@ def _service_cycle_observation(conn) -> dict:
1471
1684
  return observation
1472
1685
 
1473
1686
 
1687
def _index_timing(conn, *, done: int, active_jobs: int, percent: int) -> dict:
    """Estimate how long indexing has run and how long it may still take.

    The start time is the earliest indexing-related log entry, falling back
    to the earliest ``first_seen_at`` of any non-deleted asset. The ETA is a
    linear extrapolation of the average time per finished job and is only
    produced while work is partially complete.

    Returns:
        ``{"elapsed_seconds": int, "eta_seconds": int | None}``.
    """
    log_row = conn.execute(
        """
        SELECT MIN(created_at) AS created_at
        FROM local_index_logs
        WHERE event IN ('root_added', 'scan_started', 'scan_finished', 'jobs_processed', 'service_cycle_finished')
        """
    ).fetchone()
    started_at = log_row["created_at"] or 0
    if not started_at:
        asset_row = conn.execute(
            """
            SELECT MIN(first_seen_at) AS first_seen_at
            FROM local_assets
            WHERE status!='deleted'
            """
        ).fetchone()
        started_at = asset_row["first_seen_at"] or 0

    elapsed = 0
    if started_at:
        elapsed = max(0, int(now() - float(started_at)))

    remaining = None
    in_progress = 0 < percent < 100
    if elapsed > 0 and done > 0 and active_jobs > 0 and in_progress:
        seconds_per_job = elapsed / max(done, 1)
        remaining = max(0, int(seconds_per_job * active_jobs))
    return {"elapsed_seconds": elapsed, "eta_seconds": remaining}
1708
+
1709
+
1474
1710
  def _service_scheduler_has_error(service: dict) -> bool:
1475
1711
  if service.get("manager") == "launchagent":
1476
1712
  code = str(service.get("last_exit_code") or "").strip()
@@ -1544,6 +1780,7 @@ def status() -> dict:
1544
1780
  active_jobs = pending + running_jobs + failed_jobs
1545
1781
  total_jobs = active_jobs + done
1546
1782
  percent = 100 if total_jobs == 0 else int((done / max(total_jobs, 1)) * 100)
1783
+ timing = _index_timing(conn, done=done, active_jobs=active_jobs, percent=percent)
1547
1784
  roots = list_roots()
1548
1785
  volumes = []
1549
1786
  by_volume = conn.execute(
@@ -1589,8 +1826,8 @@ def status() -> dict:
1589
1826
  "jobs_pending": pending,
1590
1827
  "jobs_running": running_jobs,
1591
1828
  "jobs_failed": failed_jobs,
1592
- "elapsed_seconds": 0,
1593
- "eta_seconds": None,
1829
+ "elapsed_seconds": timing["elapsed_seconds"],
1830
+ "eta_seconds": timing["eta_seconds"],
1594
1831
  },
1595
1832
  "volumes": volumes,
1596
1833
  "roots": roots,
@@ -1675,26 +1912,183 @@ def _search_text_score(query: str, text: str) -> float:
1675
1912
  return len(q & tokens) / max(len(q), 1)
1676
1913
 
1677
1914
 
1678
- def context_query(query: str, *, intent: str = "answer", limit: int = 12, evidence_required: bool = True, current_context: str = "") -> dict:
1679
- conn = _conn()
1680
- qvec = embeddings.embed_text(query)
1915
# English and Spanish filler words that are dropped from a query before its
# remaining tokens are used as search terms.
_QUERY_STOPWORDS = {
    "about",
    "archivos",
    "con",
    "context",
    "contexto",
    "cuanto",
    "dame",
    "del",
    "desde",
    "documentos",
    "donde",
    "esta",
    "está",
    "file",
    "files",
    "hay",
    "los",
    "para",
    "que",
    "qué",
    "related",
    "relacionado",
    "sabes",
    "sobre",
    "todo",
    "what",
    "where",
}
1944
+
1945
+
1946
def _query_terms(query: str) -> list[str]:
    """Return up to ten distinct search terms from ``query`` in first-seen order.

    Tokens shorter than three characters and tokens listed in
    ``_QUERY_STOPWORDS`` are discarded.
    """
    kept: dict[str, None] = {}
    for word in tokenize(query):
        if len(word) >= 3 and word not in _QUERY_STOPWORDS:
            # setdefault keeps the first occurrence and preserves order.
            kept.setdefault(word, None)
    return list(kept)[:10]
1954
+
1955
+
1956
def _entity_match_score(query_lower: str, terms: list[str], name: str) -> float:
    """Score how well an entity name matches a query, from 0.0 to 1.0.

    Scoring tiers, highest first:
      * 1.0  — the whole entity name appears inside the lowercased query;
      * 0.45–0.95 — query terms overlap the entity's tokens, scaled by the
        share of entity tokens covered;
      * 0.6  — some query term is a substring of the entity name;
      * 0.0  — no match, empty name, or no terms.
    """
    entity = (name or "").strip().lower()
    if not entity:
        return 0.0
    entity_tokens = set(tokenize(entity))
    if entity in query_lower:
        return 1.0
    if not terms:
        return 0.0

    shared = entity_tokens.intersection(terms)
    if shared:
        coverage = len(shared) / max(len(entity_tokens), 1)
        return min(0.95, 0.45 + coverage * 0.5)

    for term in terms:
        if term in entity:
            return 0.6
    return 0.0
1972
+
1973
+
1974
def _entity_matches_for_query(conn, query: str, *, limit: int) -> tuple[list[dict], dict[str, float]]:
    """Find entities whose names match the query, plus a per-asset boost map.

    Candidate entities are pre-filtered in SQL with one ``LIKE`` per query
    term, restricted to active assets of privacy class ``normal``, then
    re-checked with ``is_queryable_path`` and scored by
    ``_entity_match_score``.

    Returns:
        A pair ``(matches, boosts)``: ``matches`` holds at most ``limit``
        entity dicts sorted by descending score, and ``boosts`` maps each
        matching asset id to its best entity score.
    """
    lowered = (query or "").strip().lower()
    terms = _query_terms(query)
    if not (lowered and terms):
        return [], {}

    like_filter = " OR ".join(["lower(e.name) LIKE ?"] * len(terms))
    like_args = ["%" + term + "%" for term in terms]
    row_cap = max(int(limit) * 20, 40)
    rows = conn.execute(
        f"""
        SELECT DISTINCT e.name, e.entity_type, e.asset_id, a.path, a.privacy_class
        FROM local_entities e
        JOIN local_assets a ON a.asset_id = e.asset_id
        WHERE a.status='active'
          AND a.privacy_class='normal'
          AND ({like_filter})
        LIMIT ?
        """,
        [*like_args, row_cap],
    ).fetchall()

    # Insertion-ordered mapping doubles as the "already seen" set.
    found: dict[tuple, dict] = {}
    boosts: dict[str, float] = {}
    for row in rows:
        if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
            continue
        score = _entity_match_score(lowered, terms, str(row["name"] or ""))
        if score <= 0:
            continue
        identity = (row["name"], row["entity_type"], row["asset_id"])
        if identity not in found:
            found[identity] = {
                "name": row["name"],
                "entity_type": row["entity_type"],
                "asset_id": row["asset_id"],
                "score": round(float(score), 4),
            }
        owner = row["asset_id"]
        boosts[owner] = max(boosts.get(owner, 0.0), float(score))

    ranked = sorted(found.values(), key=lambda item: item.get("score", 0), reverse=True)
    return ranked[: int(limit)], boosts
2017
+
2018
+
2019
def _context_candidate_rows(conn, entity_asset_ids: list[str], *, base_limit: int = 5000) -> list:
    """Collect the chunk rows that a context query should score.

    The result combines the chunks of entity-matched assets with the
    ``base_limit`` most recently created chunks. Only active assets with
    privacy class ``normal`` are considered.

    Args:
        conn: Open database connection exposing ``execute``.
        entity_asset_ids: Assets matched at entity level; their chunks are
            listed first so older assets stay eligible.
        base_limit: Maximum number of recent chunks to load.

    Returns:
        Rows with ``chunk_id``, ``asset_id``, ``text``, ``path``,
        ``file_type``, ``privacy_class``, ``summary`` and ``vector_json``,
        entity rows first, each chunk at most once.
    """
    select_sql = """
        SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
        FROM local_chunks c
        JOIN local_assets a ON a.asset_id = c.asset_id
        LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
        LEFT JOIN local_embeddings e ON e.chunk_id = c.chunk_id
        WHERE a.status='active'
          AND a.privacy_class='normal'
    """
    base_rows = conn.execute(
        select_sql + " ORDER BY c.created_at DESC LIMIT ?",
        (int(base_limit),),
    ).fetchall()
    if not entity_asset_ids:
        return base_rows

    # Query entity assets in batches of 500 ids so the IN (...) list never
    # exceeds SQLite's bound-parameter limit; the overall row budget is the
    # same as for a single unbatched query.
    entity_rows: list = []
    remaining = max(1000, len(entity_asset_ids) * 80)
    for start in range(0, len(entity_asset_ids), 500):
        if remaining <= 0:
            break
        batch = entity_asset_ids[start:start + 500]
        placeholders = ",".join("?" for _ in batch)
        fetched = conn.execute(
            select_sql + f" AND c.asset_id IN ({placeholders}) ORDER BY c.chunk_index ASC LIMIT ?",
            [*batch, remaining],
        ).fetchall()
        entity_rows.extend(fetched)
        remaining -= len(fetched)

    rows = []
    seen_chunks = set()
    for row in [*entity_rows, *base_rows]:
        chunk_id = row["chunk_id"]
        if chunk_id in seen_chunks:
            continue
        seen_chunks.add(chunk_id)
        rows.append(row)
    return rows
2063
+
2064
+
2065
+ def context_query(query: str, *, intent: str = "answer", limit: int = 12, evidence_required: bool = True, current_context: str = "") -> dict:
2066
+ conn = _conn()
2067
+ qvec = embeddings.embed_text(query)
2068
+ entities_payload, entity_boosts = _entity_matches_for_query(conn, query, limit=max(int(limit), 1))
2069
+ rows = _context_candidate_rows(conn, list(entity_boosts.keys()), base_limit=5000)
1692
2070
  scored = []
1693
2071
  for row in rows:
2072
+ if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
2073
+ continue
1694
2074
  vector = json_loads(row["vector_json"], [])
1695
- score = max(_search_text_score(query, row["text"]), embeddings.cosine(qvec, vector))
2075
+ text_score = _search_text_score(query, row["text"])
2076
+ path_score = _search_text_score(query, row["path"] or "")
2077
+ summary_score = _search_text_score(query, row["summary"] or "")
2078
+ entity_score = entity_boosts.get(row["asset_id"], 0.0)
2079
+ vector_score = embeddings.cosine(qvec, vector)
2080
+ score = max(text_score, path_score, summary_score, vector_score)
2081
+ if entity_score > 0:
2082
+ direct_score = max(text_score, path_score, summary_score)
2083
+ if direct_score > 0:
2084
+ entity_rank = 0.82 + (0.42 * text_score) + (0.18 * path_score) + (0.12 * summary_score)
2085
+ score = max(score, entity_rank + min(0.2, entity_score * 0.2))
2086
+ else:
2087
+ # Entity-level matches keep older assets eligible, but do not let
2088
+ # unrelated chunks from a long document outrank direct evidence.
2089
+ score = max(score, min(0.48, 0.28 + entity_score * 0.2))
1696
2090
  if score > 0:
1697
- scored.append((score, row))
2091
+ scored.append((min(float(score), 1.6), row))
1698
2092
  scored.sort(key=lambda item: item[0], reverse=True)
1699
2093
  assets = []
1700
2094
  chunks = []
@@ -1704,7 +2098,6 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
1704
2098
  if row["asset_id"] not in seen_assets:
1705
2099
  assets.append({
1706
2100
  "asset_id": row["asset_id"],
1707
- "path": row["path"],
1708
2101
  "display_path": redact_path(row["path"]),
1709
2102
  "file_type": row["file_type"],
1710
2103
  "score": round(float(score), 4),
@@ -1718,14 +2111,10 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
1718
2111
  "score": round(float(score), 4),
1719
2112
  })
1720
2113
  evidence_refs.append(f"local_asset:{row['asset_id']}#chunk:{row['chunk_id']}")
1721
- entity_rows = conn.execute(
1722
- "SELECT DISTINCT name, entity_type, asset_id FROM local_entities WHERE lower(name) LIKE ? LIMIT ?",
1723
- (f"%{query.lower()}%", int(limit)),
1724
- ).fetchall()
1725
- entities_payload = [dict(row) for row in entity_rows]
1726
2114
  relations_payload: list[dict] = []
1727
- if seen_assets:
1728
- asset_ids = list(seen_assets)[: int(limit)]
2115
+ relation_asset_ids = list(dict.fromkeys([*seen_assets, *entity_boosts.keys()]))[: int(limit)]
2116
+ if relation_asset_ids:
2117
+ asset_ids = relation_asset_ids
1729
2118
  placeholders = ",".join("?" for _ in asset_ids)
1730
2119
  relation_rows = conn.execute(
1731
2120
  f"""
@@ -1798,13 +2187,7 @@ def get_neighbors(asset_id: str, *, limit: int = 30) -> dict:
1798
2187
 
1799
2188
  def purge_asset(asset_id: str) -> dict:
1800
2189
  conn = _conn()
1801
- for table in ("local_embeddings", "local_chunks", "local_entities"):
1802
- conn.execute(f"DELETE FROM {table} WHERE asset_id=?", (asset_id,))
1803
- conn.execute("DELETE FROM local_relations WHERE source_asset_id=?", (asset_id,))
1804
- conn.execute("DELETE FROM local_index_errors WHERE asset_id=?", (asset_id,))
1805
- conn.execute("DELETE FROM local_index_jobs WHERE asset_id=?", (asset_id,))
1806
- conn.execute("DELETE FROM local_asset_versions WHERE asset_id=?", (asset_id,))
1807
- conn.execute("DELETE FROM local_assets WHERE asset_id=?", (asset_id,))
2190
+ _purge_asset_ids(conn, [asset_id])
1808
2191
  conn.commit()
1809
2192
  log_event("info", "asset_purged", "Asset purged", asset_id=asset_id)
1810
2193
  return {"ok": True, "asset_id": asset_id}
@@ -4,12 +4,15 @@ import csv
4
4
  import html
5
5
  import json
6
6
  import re
7
+ import sqlite3
7
8
  import zipfile
8
9
  from email import policy
9
10
  from email.parser import BytesParser
10
11
  from pathlib import Path
11
12
  from xml.etree import ElementTree
12
13
 
14
+ from .privacy import is_local_email_db
15
+
13
16
  MAX_TEXT_BYTES = 512 * 1024
14
17
  MAX_CHARS = 120_000
15
18
 
@@ -32,6 +35,26 @@ TEXT_SUFFIXES = {
32
35
  ".css",
33
36
  }
34
37
 
38
+ SECRET_PATTERNS: tuple[re.Pattern, ...] = (
39
+ re.compile(r"\bBearer\s+[A-Za-z0-9._\-~+/]{12,}\b", re.I),
40
+ re.compile(r"\bsk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
41
+ re.compile(r"\bpk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
42
+ re.compile(r"\b(ghp|gho|ghu|ghs|ghr|github_pat|glpat|xoxb|xoxp|shpat)_[A-Za-z0-9_]{16,}\b", re.I),
43
+ re.compile(r"\b(AKIA|ASIA)[A-Z0-9]{16,}\b"),
44
+ re.compile(r"\bey[A-Za-z0-9_-]{10,}\.ey[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
45
+ re.compile(r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?PRIVATE KEY-----", re.I),
46
+ re.compile(r"\b([A-Z][A-Z0-9_]*(?:TOKEN|SECRET|KEY|PASSWORD|PASS)\s*[:=]\s*)['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
47
+ re.compile(r"\b(?:api[_-]?key|secret[_-]?key|auth[_-]?token)\s*[:=]\s*['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
48
+ re.compile(r"\b(?:password|passwd|pwd)\s*[:=]\s*['\"][^'\"]{6,}['\"]", re.I),
49
+ )
50
+
51
+
52
def contains_secret(text: str) -> bool:
    """Report whether the start of *text* matches any known secret pattern.

    Only the first ``MAX_CHARS`` characters are scanned. Empty or otherwise
    falsy input is never treated as containing a secret.
    """
    if not text:
        return False
    head = text[:MAX_CHARS]
    for pattern in SECRET_PATTERNS:
        if pattern.search(head):
            return True
    return False
57
+
35
58
 
36
59
  def _read_text(path: Path) -> str:
37
60
  data = path.read_bytes()[:MAX_TEXT_BYTES]
@@ -53,8 +76,8 @@ def _extract_csv(path: Path) -> str:
53
76
  return "\n".join(rows)[:MAX_CHARS]
54
77
 
55
78
 
56
- def _extract_eml(path: Path) -> tuple[str, dict]:
57
- msg = BytesParser(policy=policy.default).parsebytes(path.read_bytes()[:MAX_TEXT_BYTES])
79
+ def _extract_email_bytes(data: bytes) -> tuple[str, dict]:
80
+ msg = BytesParser(policy=policy.default).parsebytes(data[:MAX_TEXT_BYTES])
58
81
  meta = {
59
82
  "subject": str(msg.get("subject") or ""),
60
83
  "from": str(msg.get("from") or ""),
@@ -72,6 +95,99 @@ def _extract_eml(path: Path) -> tuple[str, dict]:
72
95
  return "\n".join([meta["subject"], meta["from"], meta["to"], text])[:MAX_CHARS], meta
73
96
 
74
97
 
98
def _extract_eml(path: Path) -> tuple[str, dict]:
    """Parse an ``.eml`` file, reading at most ``MAX_TEXT_BYTES`` bytes of it.

    Returns the ``(text, metadata)`` pair produced by ``_extract_email_bytes``.
    """
    raw = path.read_bytes()
    return _extract_email_bytes(raw[:MAX_TEXT_BYTES])
100
+
101
+
102
def _extract_emlx(path: Path) -> tuple[str, dict]:
    """Extract text and metadata from an ``.emlx`` message file.

    At most ``MAX_TEXT_BYTES`` bytes are read. When the first line is purely
    numeric it is taken as the byte length of the message that follows, and
    the payload is cut to that length (a value of zero keeps everything
    after the first line). Anything from a line starting with ``<?xml``
    onward is discarded before parsing.

    Returns the ``(text, metadata)`` pair from ``_extract_email_bytes`` with
    ``metadata["apple_mail_message"]`` set to ``True``.
    """
    raw = path.read_bytes()[:MAX_TEXT_BYTES]
    header, newline, remainder = raw.partition(b"\n")
    count_field = header.strip()

    payload = raw
    if newline and count_field.isdigit():
        byte_count = int(count_field)
        payload = remainder[:byte_count] if byte_count > 0 else remainder

    # ``split`` leaves the payload untouched when the marker is absent.
    payload = payload.split(b"\n<?xml", 1)[0]

    text, meta = _extract_email_bytes(payload)
    meta["apple_mail_message"] = True
    return text, meta
115
+
116
+
117
def _printable_binary_text(path: Path) -> str:
    """Salvage readable text runs from a binary file.

    Reads at most ``MAX_TEXT_BYTES`` bytes. If a NUL byte occurs within the
    first 2000 bytes the data is decoded as UTF-16, otherwise as Latin-1
    (undecodable input is ignored in both cases). Every run of four or more
    characters drawn from word characters, ``À``-``ÿ`` and common
    punctuation is kept; runs are stripped, joined with newlines and
    truncated to ``MAX_CHARS`` characters.
    """
    data = path.read_bytes()[:MAX_TEXT_BYTES]
    if b"\x00" in data[:2000]:
        decoded = data.decode("utf-16", errors="ignore")
    else:
        decoded = data.decode("latin-1", errors="ignore")
    # In a raw string the brackets must be written ``\[`` and ``\]``. The
    # previous ``\\[\\]`` spelling ended the character class at the first
    # ``]``, so the pattern only matched text followed by a literal
    # ``{}]]]]`` and the fallback returned almost nothing.
    pieces = re.findall(r"[\wÀ-ÿ@./:=+\- ,;()\[\]{}]{4,}", decoded)
    return "\n".join(piece.strip() for piece in pieces if piece.strip())[:MAX_CHARS]
122
+
123
+
124
def _extract_msg(path: Path) -> tuple[str, dict]:
    """Extract text and metadata from an Outlook ``.msg`` file.

    Uses the optional ``extract_msg`` package when it can be imported. The
    returned text is subject, sender, recipients and body joined by
    newlines and truncated to ``MAX_CHARS``; the metadata carries those
    header fields plus ``"extractor": "msg"``.

    This is best-effort: on any failure (package missing, parse error,
    error while closing) it falls back to ``_printable_binary_text`` and
    reports ``"extractor": "msg_fallback"``.
    """
    try:
        import extract_msg  # type: ignore

        message = extract_msg.Message(str(path))
        try:
            meta = {
                "subject": str(getattr(message, "subject", "") or ""),
                "from": str(getattr(message, "sender", "") or ""),
                "to": str(getattr(message, "to", "") or ""),
                "date": str(getattr(message, "date", "") or ""),
                "extractor": "msg",
            }
            body = str(getattr(message, "body", "") or "")
        finally:
            # Release the underlying file even when reading an attribute
            # raises; otherwise the handle stays open while the fallback
            # re-reads the same path.
            close = getattr(message, "close", None)
            if callable(close):
                close()
        return "\n".join([meta["subject"], meta["from"], meta["to"], body])[:MAX_CHARS], meta
    except Exception:
        return _printable_binary_text(path), {"extractor": "msg_fallback"}
142
+
143
+
144
def _table_names(conn: sqlite3.Connection) -> set[str]:
    """Return the names of all tables listed in ``sqlite_master`` for *conn*."""
    cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
    names: set[str] = set()
    for record in cursor.fetchall():
        names.add(str(record[0]))
    return names
147
+
148
+
149
def _select_existing_columns(conn: sqlite3.Connection, table: str, columns: list[str]) -> list[str]:
    """Filter *columns* down to those that exist in *table*.

    The order of *columns* is preserved. *table* is interpolated into the
    ``PRAGMA table_info`` statement as-is.
    """
    present: set[str] = set()
    for info in conn.execute(f"PRAGMA table_info({table})").fetchall():
        # Index 1 of each table_info row is the column name.
        present.add(str(info[1]))
    return [name for name in columns if name in present]
152
+
153
+
154
def _extract_nexo_email_db(path: Path) -> tuple[str, dict]:
    """Extract recent message text from a local email SQLite database.

    The database is opened read-only and only if ``is_local_email_db``
    accepts the path. Up to 1000 of the newest rows are read from each of
    the ``emails`` and ``sent_email_events`` tables, using whichever of the
    expected columns exist. Each row becomes one ``" | "``-joined line with
    every value capped at 4000 characters, and the result is truncated to
    ``MAX_CHARS``.

    Returns:
        ``(text, metadata)``. ``metadata["extractor"]`` is
        ``"sqlite_blocked"`` when the path is not an allowed email DB and
        ``"nexo_email_db"`` otherwise. A database that cannot be opened or
        read yields empty text with ``"state": "locked_or_unavailable"``.
    """
    if not is_local_email_db(str(path)):
        return "", {"extractor": "sqlite_blocked"}

    from urllib.parse import quote

    # Escape characters such as "?", "#" and "%" that would otherwise be
    # parsed as URI syntax. "/", ":" and "\" stay literal so ordinary POSIX
    # and Windows paths produce the same URI as before.
    quoted_path = quote(str(path), safe="/:\\")
    uri = f"file:{quoted_path}?mode=ro"
    unavailable = {"extractor": "nexo_email_db", "state": "locked_or_unavailable"}

    try:
        conn = sqlite3.connect(uri, uri=True, timeout=1)
    except Exception:
        # Deliberately broad: extraction is best-effort.
        return "", unavailable

    parts: list[str] = []
    tables: set[str] = set()

    def _collect(table: str, cols: list[str], preferred_order: str) -> None:
        # Identifiers come only from the hard-coded lists below, never from
        # database content, so interpolating them is safe.
        order = preferred_order if preferred_order in cols else "rowid"
        query = f"SELECT {', '.join(cols)} FROM {table} ORDER BY {order} DESC LIMIT 1000"
        for row in conn.execute(query).fetchall():
            parts.append(" | ".join(str(value or "")[:4000] for value in row))

    try:
        tables = _table_names(conn)
        if "emails" in tables:
            cols = _select_existing_columns(
                conn,
                "emails",
                ["from_addr", "from_name", "subject", "received_at", "status", "body", "response"],
            )
            if not cols:
                return "", {"extractor": "nexo_email_db", "tables": sorted(tables)}
            _collect("emails", cols, "received_at")
        if "sent_email_events" in tables:
            cols = _select_existing_columns(
                conn,
                "sent_email_events",
                ["sender", "to_addrs", "cc_addrs", "subject", "sent_at", "status", "body_text"],
            )
            if cols:
                _collect("sent_email_events", cols, "sent_at")
    except sqlite3.Error:
        # sqlite3 opens lazily, so a locked or corrupt database only fails
        # at the first query. Report it the same way as a failed connect.
        return "", dict(unavailable, tables=sorted(tables))
    finally:
        conn.close()
    return "\n".join(parts)[:MAX_CHARS], {"extractor": "nexo_email_db", "tables": sorted(tables)}
189
+
190
+
75
191
  def _zip_xml_text(path: Path, members: list[str]) -> str:
76
192
  pieces: list[str] = []
77
193
  with zipfile.ZipFile(path) as zf:
@@ -156,6 +272,14 @@ def extract_text(path: Path) -> tuple[str, dict]:
156
272
  elif suffix == ".eml":
157
273
  text, metadata = _extract_eml(path)
158
274
  metadata["extractor"] = "eml"
275
+ elif suffix == ".emlx":
276
+ text, metadata = _extract_emlx(path)
277
+ metadata["extractor"] = "emlx"
278
+ elif suffix == ".msg":
279
+ text, metadata = _extract_msg(path)
280
+ metadata["extractor"] = metadata.get("extractor") or "msg"
281
+ elif suffix == ".db" and is_local_email_db(str(path)):
282
+ text, metadata = _extract_nexo_email_db(path)
159
283
  elif suffix == ".pdf":
160
284
  text = _extract_pdf(path)
161
285
  elif suffix == ".docx":
@@ -6,21 +6,58 @@ SENSITIVE_FILE_NAMES = {
6
6
  ".env",
7
7
  ".env.local",
8
8
  ".env.production",
9
+ ".npmrc",
10
+ ".pypirc",
11
+ ".netrc",
12
+ ".boto",
13
+ ".pgpass",
14
+ ".my.cnf",
15
+ ".git-credentials",
16
+ ".mcp_publisher_token",
17
+ ".mcpregistry_github_token",
18
+ ".mcpregistry_registry_token",
9
19
  "id_rsa",
10
20
  "id_dsa",
11
21
  "id_ecdsa",
12
22
  "id_ed25519",
23
+ "known_hosts",
24
+ "authorized_keys",
13
25
  "cookies.sqlite",
14
26
  "login data",
15
27
  "keychain-2.db",
16
28
  }
17
29
 
30
+ SENSITIVE_NAME_MARKERS = {
31
+ "api_key",
32
+ "apikey",
33
+ "auth_token",
34
+ "bearer",
35
+ "client_secret",
36
+ "credential",
37
+ "credentials",
38
+ "oauth",
39
+ "password",
40
+ "passwd",
41
+ "private_key",
42
+ "secret",
43
+ "token",
44
+ }
45
+
46
+ SENSITIVE_SUFFIXES = {
47
+ ".key",
48
+ ".pem",
49
+ ".p12",
50
+ ".pfx",
51
+ ".kdbx",
52
+ }
53
+
18
54
  SENSITIVE_PARTS = {
19
55
  ".ssh",
20
56
  ".gnupg",
21
57
  ".aws",
22
58
  ".azure",
23
59
  ".kube",
60
+ ".docker",
24
61
  "password",
25
62
  "passwords",
26
63
  "1password",
@@ -30,6 +67,29 @@ SENSITIVE_PARTS = {
30
67
  "browser profile",
31
68
  }
32
69
 
70
+ EMAIL_RUNTIME_DB_NAMES = {
71
+ "email.db",
72
+ "email-tracker.db",
73
+ "emails.db",
74
+ "monitor.db",
75
+ "nexo-email.db",
76
+ }
77
+
78
+ EMAIL_ATTACHMENT_SUFFIXES = {
79
+ ".csv",
80
+ ".docx",
81
+ ".eml",
82
+ ".emlx",
83
+ ".html",
84
+ ".md",
85
+ ".pdf",
86
+ ".pptx",
87
+ ".txt",
88
+ ".xlsx",
89
+ }
90
+
91
+ EMAIL_EXTRACTABLE_SUFFIXES = {".eml", ".emlx", ".msg"}
92
+
33
93
  NOISY_PARTS = {
34
94
  "node_modules",
35
95
  "vendor",
@@ -53,9 +113,64 @@ NOISY_PARTS = {
53
113
  ".parcel-cache",
54
114
  ".bun",
55
115
  ".gradle",
116
+ "$tmp",
56
117
  "target",
57
118
  }
58
119
 
120
+ TRANSIENT_PARTS = {"tmp", "temp"}
121
+
122
+ PRIVATE_PROFILE_PARTS = {
123
+ ".nexo",
124
+ ".claude",
125
+ ".codex",
126
+ ".gemini",
127
+ ".cursor",
128
+ ".config",
129
+ ".local",
130
+ ".npm",
131
+ ".yarn",
132
+ ".pnpm-store",
133
+ ".ollama",
134
+ ".docker",
135
+ ".vscode",
136
+ ".idea",
137
+ "appdata",
138
+ "application data",
139
+ "library/application support",
140
+ "library/containers",
141
+ "library/group containers",
142
+ "library/keychains",
143
+ "library/logs",
144
+ "library/mail",
145
+ "library/messages",
146
+ "library/safari",
147
+ "library/saved application state",
148
+ }
149
+
150
+ PROFILE_HIDDEN_FILE_NAMES = {
151
+ ".aider.chat.history.md",
152
+ ".aider.input.history",
153
+ ".bash_history",
154
+ ".bash_profile",
155
+ ".bashrc",
156
+ ".claude.json",
157
+ ".codex.json",
158
+ ".cursorignore",
159
+ ".ds_store",
160
+ ".gitconfig",
161
+ ".gitignore_global",
162
+ ".lesshst",
163
+ ".python_history",
164
+ ".sqlite_history",
165
+ ".viminfo",
166
+ ".wget-hsts",
167
+ ".zprofile",
168
+ ".zsh_history",
169
+ ".zshrc",
170
+ }
171
+
172
+ ALLOWED_HIDDEN_FILE_NAMES = set()
173
+
59
174
  SYSTEM_PARTS = {
60
175
  "system volume information",
61
176
  "$recycle.bin",
@@ -69,35 +184,206 @@ SYSTEM_PARTS = {
69
184
  }
70
185
 
71
186
 
72
- def classify_path(path: str) -> tuple[int, str, str]:
73
- """Return (depth, privacy_class, reason)."""
187
def _normalized(path: str) -> str:
    """Return *path* as a lower-case string with backslashes turned into ``/``."""
    as_text = str(Path(path))
    forward = as_text.replace("\\", "/")
    return forward.lower()
189
+
190
+
191
def _parts(path: str) -> set[str]:
    """Return the set of non-empty segments of the normalized *path*.

    Both ``/`` and ``:`` act as separators, so a drive prefix such as
    ``c:`` contributes the segment ``c``.
    """
    segments = _normalized(path).replace(":", "/").split("/")
    return set(filter(None, segments))
193
+
194
+
195
def _contains_path_marker(lowered: str, markers: set[str]) -> bool:
    """Return True when any of *markers* occurs as a substring of *lowered*."""
    for marker in markers:
        if marker in lowered:
            return True
    return False
197
+
198
+
199
def _is_under_marker(lowered: str, marker: str) -> bool:
    """Check whether *lowered* is the *marker* directory or lies below it.

    *marker* is stripped of surrounding slashes and lower-cased first. It
    must match whole path segments: either as an interior ``/marker/`` run
    or as the trailing ``/marker``. An empty marker never matches.
    """
    needle = marker.strip("/").lower()
    if needle == "":
        return False
    if f"/{needle}/" in lowered:
        return True
    return lowered.endswith(f"/{needle}")
204
+
205
+
206
def _is_inside_windows_mail_package(lowered: str) -> bool:
    """Return True when *lowered* contains the Windows Mail package root.

    This is a plain substring test, so suffixed package directory names
    (for example ``..._8wekyb3d8bbwe``) match as well.
    """
    package_root = "/appdata/local/packages/microsoft.windowscommunicationsapps"
    return lowered.find(package_root) != -1
208
+
209
+
210
def _is_inside_outlook_mac_profile(lowered: str) -> bool:
    """Return True when *lowered* contains the Outlook for Mac group-container root.

    This is a plain substring test against the lower-cased path.
    """
    profile_root = "/library/group containers/ubf8t346g9.office/outlook"
    return lowered.find(profile_root) != -1
212
+
213
+
214
def is_local_email_tree(path: str) -> bool:
    """Return True when *path* is, or lies inside, a recognised local email root.

    Recognised roots are the Windows Mail package, the Outlook for Mac
    profile, and the marker directories listed below.
    """
    lowered = _normalized(path)
    if _is_inside_windows_mail_package(lowered):
        return True
    if _is_inside_outlook_mac_profile(lowered):
        return True
    email_roots = (
        "library/mail",
        ".nexo/runtime/nexo-email",
        "documents/outlook files",
        "appdata/local/microsoft/outlook",
        "appdata/roaming/microsoft/outlook",
        "appdata/local/packages/microsoft.windowscommunicationsapps",
        ".thunderbird",
        ".mozilla-thunderbird",
    )
    for root in email_roots:
        if _is_under_marker(lowered, root):
            return True
    return False
231
+
232
+
233
def is_local_email_db(path: str) -> bool:
    """Return True for a known email runtime DB file inside a local email tree.

    Both conditions are required: the path must satisfy
    ``is_local_email_tree`` and its lower-cased file name must be listed in
    ``EMAIL_RUNTIME_DB_NAMES``.
    """
    if not is_local_email_tree(path):
        return False
    file_name = Path(path).name.lower()
    return file_name in EMAIL_RUNTIME_DB_NAMES
236
+
237
+
238
def is_allowed_local_email_file(path: str) -> bool:
    """Decide whether a file inside a local email tree may be processed.

    Paths outside every email tree, and paths flagged by
    ``is_sensitive_path``, are rejected. Otherwise the verdict depends on
    which email root contains the file and on its suffix:

    * NEXO email runtime: runtime DBs, attachment types under
      ``attachments``, and ``.eml`` / ``.emlx`` elsewhere.
    * ``Library/Mail``: ``.eml`` / ``.emlx``.
    * Outlook and Windows Mail roots: ``.eml`` / ``.msg`` / ``.pst`` / ``.ost``.
    * Thunderbird profiles: ``.eml`` / ``.mbox`` / files with no suffix.
    """
    if not is_local_email_tree(path):
        return False
    lowered = _normalized(path)
    suffix = Path(path).suffix.lower()
    if is_sensitive_path(path):
        return False

    message_suffixes = {".eml", ".emlx"}

    if _is_under_marker(lowered, ".nexo/runtime/nexo-email"):
        if is_local_email_db(path):
            return True
        if _is_under_marker(lowered, ".nexo/runtime/nexo-email/attachments"):
            return suffix in EMAIL_ATTACHMENT_SUFFIXES
        return suffix in message_suffixes

    if _is_under_marker(lowered, "library/mail"):
        return suffix in message_suffixes

    outlook_roots = (
        "library/group containers/ubf8t346g9.office/outlook",
        "documents/outlook files",
        "appdata/local/microsoft/outlook",
        "appdata/roaming/microsoft/outlook",
        "appdata/local/packages/microsoft.windowscommunicationsapps",
    )
    in_outlook_tree = (
        any(_is_under_marker(lowered, root) for root in outlook_roots)
        or _is_inside_windows_mail_package(lowered)
        or _is_inside_outlook_mac_profile(lowered)
    )
    if in_outlook_tree:
        return suffix in {".eml", ".msg", ".pst", ".ost"}

    in_thunderbird = _is_under_marker(lowered, ".thunderbird") or _is_under_marker(lowered, ".mozilla-thunderbird")
    if in_thunderbird:
        return suffix in {".eml", ".mbox", ""}
    return False
268
+
269
+
270
def _has_transient_project_part(path: str) -> bool:
    """Return True when a ``TRANSIENT_PARTS`` segment sits at index 2 or deeper.

    The normalized path is split on ``/`` and ``:`` without dropping empty
    segments, so for an absolute path index 0 is the empty string before
    the leading slash. A top-level ``/tmp`` therefore does not count.
    """
    segments = _normalized(path).replace(":", "/").split("/")
    return any(segment in TRANSIENT_PARTS for segment in segments[2:])
276
+
277
+
278
def _has_hidden_dir_part(path: str) -> bool:
    """Return True when any segment other than the last starts with a dot.

    The segments ``.`` and ``..`` are not treated as hidden. The final
    segment (the file or leaf name) is ignored.
    """
    segments = [piece for piece in _normalized(path).replace(":", "/").split("/") if piece]
    for segment in segments[:-1]:
        if segment.startswith(".") and segment not in {".", ".."}:
            return True
    return False
281
+
282
+
283
def _is_home_hidden_path(path: str) -> bool:
    """Return True when *path* lies under a dot-prefixed entry of the home directory.

    Any failure while resolving the home directory, or a path that is not
    below it, yields False. The home directory itself is not hidden.
    """
    try:
        relative = Path(path).expanduser().relative_to(Path.home().expanduser())
    except Exception:
        return False
    if not relative.parts:
        return False
    return relative.parts[0].startswith(".")
291
+
292
+
293
def is_sensitive_path(path: str) -> bool:
    """Return True when *path* looks like it holds credentials or other secrets.

    A path is sensitive when any of these hold:

    * its file name is listed in ``SENSITIVE_FILE_NAMES``;
    * its file name starts with a dot and is not in ``ALLOWED_HIDDEN_FILE_NAMES``;
    * its file name starts with ``~$`` or ends in ``.tmp`` / ``.swp`` / ``.swo``;
    * its suffix is listed in ``SENSITIVE_SUFFIXES``;
    * a path segment equals, or the whole path contains, a ``SENSITIVE_PARTS`` entry;
    * its file name or stem contains a ``SENSITIVE_NAME_MARKERS`` entry.
    """
    target = Path(path)
    name = target.name.lower()
    stem = target.stem.lower()
    lowered = _normalized(path)
    segments = _parts(path)

    if name in SENSITIVE_FILE_NAMES:
        return True
    is_hidden = name.startswith(".") and name not in ALLOWED_HIDDEN_FILE_NAMES
    if is_hidden or name.startswith("~$") or name.endswith((".tmp", ".swp", ".swo")):
        return True
    if target.suffix.lower() in SENSITIVE_SUFFIXES:
        return True
    if segments & SENSITIVE_PARTS:
        return True
    for marker in SENSITIVE_NAME_MARKERS:
        if marker in name or marker in stem:
            return True
    return _contains_path_marker(lowered, SENSITIVE_PARTS)
314
+
78
315
 
79
- if name in SENSITIVE_FILE_NAMES or parts & SENSITIVE_PARTS:
316
def is_private_profile_path(path: str) -> bool:
    """Return True when *path* belongs to a private user-profile area.

    Matches when a path segment equals, or the normalized path contains, a
    ``PRIVATE_PROFILE_PARTS`` entry; when the file name is listed in
    ``PROFILE_HIDDEN_FILE_NAMES``; or when the path sits under a
    dot-prefixed entry of the home directory.
    """
    lowered = _normalized(path)
    if _parts(path) & PRIVATE_PROFILE_PARTS:
        return True
    if _contains_path_marker(lowered, PRIVATE_PROFILE_PARTS):
        return True
    if Path(path).name.lower() in PROFILE_HIDDEN_FILE_NAMES:
        return True
    return bool(_is_home_hidden_path(path))
329
+
330
+
331
def classify_path(path: str) -> tuple[int, str, str]:
    """Return ``(depth, privacy_class, reason)`` for *path*.

    Checks run in priority order and the first match wins:

    1. local email tree (suffix-less path or allowed email file) ->
       ``(2, "normal", "local_email_path")``
    2. sensitive path -> ``(1, "sensitive_inventory_only", "sensitive_path")``
    3. private profile path -> ``(0, "private_profile_blocked", "private_profile_path")``
    4. system path -> ``(0, "system_blocked", "system_path")``
    5. noisy, transient or hidden tree -> ``(1, "inventory_only", "noisy_tree")``
    6. anything else -> ``(2, "normal", "default")``
    """
    if is_local_email_tree(path):
        if Path(path).suffix == "" or is_allowed_local_email_file(path):
            return 2, "normal", "local_email_path"
    if is_sensitive_path(path):
        return 1, "sensitive_inventory_only", "sensitive_path"
    if is_private_profile_path(path):
        return 0, "private_profile_blocked", "private_profile_path"

    lowered = _normalized(path)
    for item in SYSTEM_PARTS:
        if item in lowered:
            return 0, "system_blocked", "system_path"

    if _parts(path) & NOISY_PARTS or _has_transient_project_part(path) or _has_hidden_dir_part(path):
        return 1, "inventory_only", "noisy_tree"
    return 2, "normal", "default"
86
347
 
87
348
 
88
349
def should_skip_tree(path: str) -> bool:
    """Return True when the directory *path* should not be descended into.

    Local email trees are never skipped. Otherwise a tree is skipped when
    it is a system, sensitive or private-profile path, or when it is a
    noisy, transient or hidden tree.
    """
    if is_local_email_tree(path):
        return False
    lowered = _normalized(path)
    for item in SYSTEM_PARTS:
        if item in lowered:
            return True
    if is_sensitive_path(path):
        return True
    if is_private_profile_path(path):
        return True
    noisy = _parts(path) & NOISY_PARTS or _has_transient_project_part(path) or _has_hidden_dir_part(path)
    return bool(noisy)
359
+
360
+
361
def should_skip_file(path: str) -> bool:
    """Return True when the file *path* should be ignored.

    Inside a local email tree the verdict is delegated entirely to
    ``is_allowed_local_email_file``. Elsewhere a file is skipped when it is
    a system, sensitive or private-profile path, or when it sits in a
    noisy, transient or hidden tree.
    """
    if is_local_email_tree(path):
        return not is_allowed_local_email_file(path)
    lowered = _normalized(path)
    for item in SYSTEM_PARTS:
        if item in lowered:
            return True
    if is_sensitive_path(path):
        return True
    if is_private_profile_path(path):
        return True
    noisy = _parts(path) & NOISY_PARTS or _has_transient_project_part(path) or _has_hidden_dir_part(path)
    return bool(noisy)
371
+
372
+
373
def is_queryable_path(path: str, privacy_class: str = "") -> bool:
    """Return True when *path* may be returned in query results.

    A non-empty *privacy_class* other than ``"normal"`` rules the path out
    immediately; otherwise the path is queryable unless
    ``should_skip_file`` rejects it.
    """
    restricted = bool(privacy_class) and privacy_class != "normal"
    if restricted:
        return False
    return not should_skip_file(path)
95
377
 
96
378
 
97
379
  def should_extract(path: str, depth: int) -> bool:
98
380
  if depth < 2:
99
381
  return False
382
+ if should_skip_file(path):
383
+ return False
100
384
  suffix = Path(path).suffix.lower()
385
+ if is_local_email_db(path):
386
+ return True
101
387
  if suffix in {
102
388
  ".txt",
103
389
  ".md",
@@ -118,6 +404,8 @@ def should_extract(path: str, depth: int) -> bool:
118
404
  ".csv",
119
405
  ".tsv",
120
406
  ".eml",
407
+ ".emlx",
408
+ ".msg",
121
409
  ".pdf",
122
410
  ".docx",
123
411
  ".pptx",
@@ -43,6 +43,15 @@ def _format_local_context_evidence(query: str, *, limit: int = 4) -> str:
43
43
  refs = result.get("evidence_refs") or []
44
44
  if refs:
45
45
  lines.append(f"Evidence refs: {', '.join(str(ref) for ref in refs[:limit])}")
46
+ relations = result.get("relations") or []
47
+ if relations:
48
+ lines.append("Local relations:")
49
+ for relation in relations[:limit]:
50
+ relation_type = str(relation.get("relation_type") or "related")
51
+ target = str(relation.get("target_ref") or relation.get("target_asset_id") or "").strip()
52
+ evidence = str(relation.get("evidence") or "").strip()
53
+ suffix = f" — {evidence[:120]}" if evidence else ""
54
+ lines.append(f"- {relation_type}: {target}{suffix}")
46
55
  return "\n".join(lines)
47
56
 
48
57