nexo-brain 7.25.3 → 7.25.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,10 +17,10 @@ from typing import Any
17
17
 
18
18
  import paths
19
19
  from . import embeddings
20
- from .db import LOCAL_CONTEXT_TABLES, close_local_context_db, connect_local_context_db_readonly, ensure_local_context_db, get_local_context_db
20
+ from .db import LOCAL_CONTEXT_TABLES, close_local_context_db, connect_local_context_db_readonly, ensure_local_context_db, get_local_context_db, local_context_db_path
21
21
  from .extractors import canonical_entity_key, chunk_text, contains_secret, entities, entity_mentions, extract_text, normalize_entity_alias, summarize
22
22
  from .logging import log_event, tail
23
- from .privacy import classify_path, is_local_email_tree, is_queryable_path, should_extract, should_skip_file, should_skip_tree
23
+ from .privacy import classify_path, is_local_email_db, is_local_email_tree, is_queryable_path, should_extract, should_skip_file, should_skip_tree
24
24
  from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
25
25
 
26
26
  LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
@@ -34,6 +34,9 @@ DEFAULT_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_DEPTH", "24")
34
34
  DEFAULT_EMAIL_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_EMAIL_ROOT_DEPTH", "24") or "24")
35
35
  DEFAULT_MOUNTED_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_MOUNTED_ROOT_DEPTH", "24") or "24")
36
36
  DEFAULT_SYSTEM_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_SYSTEM_ROOT_DEPTH", "24") or "24")
37
# Seed version stamped on roots that are created with source "core_default".
DEFAULT_ROOT_SEED_VERSION = 2
# local_index_state key under which the applied roots seed version is stored.
ROOT_SEED_VERSION_KEY = "local_index_roots_seed_version"
# Byte threshold (default 2 GiB); an unset or empty environment variable falls
# back to the default.
# NOTE(review): presumably the DB size at which the roots-v2 migration archives
# and rebuilds instead of migrating in place - the consumer is not visible here.
LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES = int(
    os.environ.get("NEXO_LOCAL_INDEX_V2_REBUILD_THRESHOLD_BYTES", "") or 2 * 1024 ** 3
)
37
40
  DEFAULT_CONTEXT_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_MAX_CHARS", "20000") or "20000")
38
41
  DEFAULT_ROUTER_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_ROUTER_MAX_CHARS", "6000") or "6000")
39
42
  DEFAULT_MAX_JOB_ATTEMPTS = int(os.environ.get("NEXO_LOCAL_INDEX_MAX_JOB_ATTEMPTS", "3") or "3")
@@ -66,7 +69,6 @@ HIGH_VALUE_DOCUMENT_SUFFIXES = {
66
69
  ".pptx",
67
70
  ".pages",
68
71
  ".numbers",
69
- ".key",
70
72
  ".rtf",
71
73
  ".odt",
72
74
  ".ods",
@@ -84,6 +86,65 @@ EMAIL_DOCUMENT_SUFFIXES = {
84
86
  ".emlx",
85
87
  ".msg",
86
88
  }
89
# Source-code and structured-config extensions; seeded as "extract" rules.
CODE_DOCUMENT_SUFFIXES = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql",
    ".json", ".yaml", ".yml", ".toml", ".html", ".css",
}
# Image extensions; seeded as "metadata" rules (no content extraction).
IMAGE_METADATA_SUFFIXES = {
    ".jpg", ".jpeg", ".png", ".gif", ".heic", ".webp",
    ".tif", ".tiff", ".bmp", ".raw", ".dng",
}
# Audio/video extensions; seeded as "metadata" rules.
MEDIA_METADATA_SUFFIXES = {
    ".mp3", ".m4a", ".wav", ".aac", ".flac",
    ".mp4", ".mov", ".avi", ".mkv", ".m4v",
}
# Binary or transient extensions; seeded as "ignore" rules.
IGNORED_BINARY_SUFFIXES = {
    ".app", ".bin", ".class", ".dll", ".dmg", ".dylib", ".exe", ".iso", ".jar",
    ".lock", ".o", ".obj", ".pyc", ".so", ".swp", ".swo", ".tmp",
}
87
148
  HIGH_VALUE_DIRECTORY_NAMES = {
88
149
  "users",
89
150
  "home",
@@ -213,26 +274,250 @@ def _with_sqlite_busy_retry(callback, *, attempts: int | None = None):
213
274
  return None
214
275
 
215
276
 
216
- def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> dict:
277
+ def _normalize_source(source: str | None) -> str:
278
+ value = str(source or "user").strip().lower().replace("-", "_")
279
+ return value or "user"
280
+
281
+
282
def _normalize_extension(extension: str) -> str:
    """Return *extension* stripped, lower-cased and prefixed with a dot.

    Empty or missing input returns ``""``.
    """
    cleaned = str(extension or "").strip().lower()
    if cleaned and not cleaned.startswith("."):
        return "." + cleaned
    return cleaned
289
+
290
+
291
+ def _normalize_file_type_action(action: str | None) -> str:
292
+ value = str(action or "").strip().lower()
293
+ if value in {"include", "extract", "read", "full"}:
294
+ return "extract"
295
+ if value in {"metadata", "inventory", "index"}:
296
+ return "metadata"
297
+ if value in {"exclude", "ignore", "skip", "blocked"}:
298
+ return "ignore"
299
+ return "ignore"
300
+
301
+
302
def _default_file_type_rule_specs() -> list[dict]:
    """Build the built-in file-type rule specs from the module suffix sets.

    Each spec is a dict with ``extension``, ``action``, ``priority`` and
    ``reason``. Groups are emitted in the order listed below, and suffixes are
    sorted within each group.
    """
    groups = (
        (HIGH_VALUE_DOCUMENT_SUFFIXES, "extract", 90, "core_high_value_document"),
        (KNOWN_TEXT_SUFFIXES, "extract", 82, "core_text_document"),
        (EMAIL_DOCUMENT_SUFFIXES, "extract", 70, "core_email_document"),
        (CODE_DOCUMENT_SUFFIXES, "extract", 55, "core_code_document"),
        (IMAGE_METADATA_SUFFIXES, "metadata", 35, "core_photo_metadata"),
        (MEDIA_METADATA_SUFFIXES, "metadata", 25, "core_media_metadata"),
        (IGNORED_BINARY_SUFFIXES, "ignore", 0, "core_binary_or_transient"),
    )
    specs: list[dict] = []
    for suffixes, action, priority, reason in groups:
        specs.extend(
            {"extension": suffix, "action": action, "priority": priority, "reason": reason}
            for suffix in sorted(suffixes)
        )
    return specs
319
+
320
+
321
def seed_core_file_type_rules(conn=None) -> dict:
    """Upsert every built-in file-type rule under source ``'core_default'``.

    Args:
        conn: Connection to write through; ``_conn()`` is used when omitted.

    Returns:
        ``{"ok": True, "rules": <number of specs written>}``.

    This function does not commit; the caller is expected to.
    """
    db = conn or _conn()
    stamp = now()
    upsert_sql = """
        INSERT INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, 'core_default', ?, ?, ?, ?)
        ON CONFLICT(extension, source) DO UPDATE SET
            action=excluded.action,
            priority=excluded.priority,
            reason=excluded.reason,
            updated_at=excluded.updated_at
        """
    specs = _default_file_type_rule_specs()
    for spec in specs:
        # created_at only applies on first insert; the conflict branch leaves it untouched.
        db.execute(
            upsert_sql,
            (spec["extension"], spec["action"], int(spec["priority"]), spec["reason"], stamp, stamp),
        )
    return {"ok": True, "rules": len(specs)}
340
+
341
+
342
def _list_file_type_rules_conn(conn) -> list[dict]:
    """Return all file-type rules as dicts.

    Rows are ordered with ``user`` rules first, then ``core_default``, then
    any other source; within a source group they are ordered by extension.
    """
    cursor = conn.execute(
        """
        SELECT *
        FROM local_index_file_type_rules
        ORDER BY
            CASE source WHEN 'user' THEN 0 WHEN 'core_default' THEN 1 ELSE 2 END,
            extension
        """
    )
    return list(map(dict, cursor.fetchall()))
353
+
354
+
355
def _shape_file_type_rules(rows: list[dict]) -> dict:
    """Wrap rule rows in a response that also lists one effective rule per extension.

    For each extension the first row seen is kept, unless a row whose source
    is ``"user"`` appears, which always replaces the current pick.
    """
    winners: dict[str, dict] = {}
    for rule in rows:
        key = str(rule.get("extension") or "")
        if rule.get("source") == "user" or key not in winners:
            winners[key] = rule
    return {"ok": True, "rules": rows, "effective": list(winners.values())}
362
+
363
+
364
def _effective_file_type_rule(conn, extension: str) -> dict:
    """Resolve the rule that applies to *extension*.

    A stored rule wins, preferring source ``user`` over ``core_default`` over
    anything else. When no rule is stored, an ``implicit`` rule is
    synthesised: ``extract`` if ``is_local_email_tree`` accepts the value,
    otherwise ``ignore``. An empty extension is always ignored.
    """
    ext = _normalize_extension(extension)

    def _implicit(action: str, priority: int, reason: str) -> dict:
        return {"extension": ext, "action": action, "source": "implicit", "priority": priority, "reason": reason}

    if not ext:
        return _implicit("ignore", 0, "missing_extension")
    stored = conn.execute(
        """
        SELECT *
        FROM local_index_file_type_rules
        WHERE extension=?
        ORDER BY CASE source WHEN 'user' THEN 0 WHEN 'core_default' THEN 1 ELSE 2 END
        LIMIT 1
        """,
        (ext,),
    ).fetchone()
    if stored is not None:
        return dict(stored)
    # NOTE(review): an extension (not a path) is passed to is_local_email_tree
    # here - confirm that is what the helper expects.
    if is_local_email_tree(ext):
        return _implicit("extract", 70, "local_email")
    return _implicit("ignore", 0, "unknown_extension")
383
+
384
+
385
def list_file_type_rules(*, readonly: bool = True) -> dict:
    """Return all file-type rules plus the effective rule per extension.

    Args:
        readonly: When true, read through a connection from ``_read_conn()``
            that is always closed afterwards. When false, use ``_conn()``,
            re-seed the core rules and commit before reading.
    """
    if readonly:
        reader = _read_conn()
        try:
            rules = _list_file_type_rules_conn(reader)
        finally:
            _close_read_conn(reader)
        return _shape_file_type_rules(rules)
    writer = _conn()
    seed_core_file_type_rules(writer)
    writer.commit()
    return _shape_file_type_rules(_list_file_type_rules_conn(writer))
398
+
399
+
400
def _purge_assets_by_extension(conn, extension: str) -> dict:
    """Hand every asset whose extension matches to ``_purge_asset_ids``.

    The match is case-insensitive. Returns ``{"assets": 0}`` without touching
    the database when *extension* normalises to an empty string; otherwise
    returns whatever ``_purge_asset_ids`` returns.
    """
    ext = _normalize_extension(extension)
    if not ext:
        return {"assets": 0}
    cursor = conn.execute("SELECT asset_id FROM local_assets WHERE lower(extension)=?", (ext,))
    asset_ids = [str(hit["asset_id"]) for hit in cursor.fetchall()]
    return _purge_asset_ids(conn, asset_ids)
406
+
407
+
408
def set_file_type_rule(extension: str, *, action: str = "extract", source: str = "user", priority: int | None = None, reason: str = "user") -> dict:
    """Create or update the rule for ``(extension, source)``.

    Args:
        extension: File extension, with or without the leading dot.
        action: Alias accepted by ``_normalize_file_type_action``.
        source: Rule owner; normalised by ``_normalize_source``.
        priority: Explicit priority. Defaults to 82 for extract, 20 for
            metadata and 0 for ignore.
        reason: Free-text reason stored with the rule.

    Returns:
        ``{"ok": False, "error": "extension_required"}`` for an empty
        extension, otherwise a summary of the stored rule. ``cleanup`` is the
        purge result when a user rule ignores the extension, else
        ``{"assets": 0}``.
    """
    conn = _conn()
    ext = _normalize_extension(extension)
    if not ext:
        return {"ok": False, "error": "extension_required"}
    normalized_action = _normalize_file_type_action(action)
    source_value = _normalize_source(source)
    if priority is not None:
        priority_value = int(priority)
    else:
        priority_value = {"extract": 82, "metadata": 20}.get(normalized_action, 0)
    stamp = now()
    conn.execute(
        """
        INSERT INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(extension, source) DO UPDATE SET
            action=excluded.action,
            priority=excluded.priority,
            reason=excluded.reason,
            updated_at=excluded.updated_at
        """,
        (ext, normalized_action, source_value, priority_value, reason, stamp, stamp),
    )
    # Only a user-owned "ignore" rule purges what was already indexed for this extension.
    if normalized_action == "ignore" and source_value == "user":
        cleanup = _purge_assets_by_extension(conn, ext)
    else:
        cleanup = {"assets": 0}
    conn.commit()
    log_event("info", "file_type_rule_set", "Local memory file type rule set", extension=ext, action=normalized_action, source=source_value, cleanup=cleanup)
    return {
        "ok": True,
        "extension": ext,
        "action": normalized_action,
        "source": source_value,
        "priority": priority_value,
        "cleanup": cleanup,
    }
433
+
434
+
435
def remove_file_type_rule(extension: str, *, source: str = "user") -> dict:
    """Delete the rule stored for ``(extension, source)``, commit, and log it.

    Always returns ``{"ok": True, ...}`` with the normalised extension and
    source, whether or not a row was actually deleted.
    """
    conn = _conn()
    ext = _normalize_extension(extension)
    source_value = _normalize_source(source)
    conn.execute(
        "DELETE FROM local_index_file_type_rules WHERE extension=? AND source=?",
        (ext, source_value),
    )
    conn.commit()
    log_event("info", "file_type_rule_removed", "Local memory file type rule removed", extension=ext, source=source_value)
    outcome = {"ok": True, "extension": ext, "source": source_value}
    return outcome
443
+
444
+
445
def reset_file_type_rules() -> dict:
    """Drop every ``user`` file-type rule and re-seed the core defaults.

    Returns the number of deleted user rules, the number of core rules
    written, and the resulting rule listing.
    """
    conn = _conn()
    cursor = conn.execute("DELETE FROM local_index_file_type_rules WHERE source='user'")
    deleted = int(cursor.rowcount or 0)
    seeded = seed_core_file_type_rules(conn)
    conn.commit()
    log_event("info", "file_type_rules_reset", "Local memory user file type overrides reset", deleted=deleted)
    core_rules = int(seeded.get("rules") or 0)
    listing = list_file_type_rules(readonly=False)
    return {"ok": True, "deleted": deleted, "core_rules": core_rules, "file_types": listing}
452
+
453
+
454
def _file_type_action(conn, path: str | Path) -> str:
    """Return the action (``extract``/``metadata``/``ignore``) for *path*.

    Paths recognised by ``is_local_email_db`` or ``is_local_email_tree`` are
    always extracted. Everything else is decided by the effective rule for
    the lower-cased file suffix.
    """
    suffix = Path(path).suffix.lower()
    text = str(path)
    if is_local_email_db(text) or is_local_email_tree(text):
        return "extract"
    rule = _effective_file_type_rule(conn, suffix)
    return str(rule.get("action") or "ignore")
459
+
460
+
461
def _should_index_file(conn, path: str | Path, *, allow_default_skip_override: bool = False) -> bool:
    """Tell whether *path* should be indexed at all.

    A file is rejected when ``should_skip_file`` flags it (unless
    *allow_default_skip_override* is set) or when its file-type action is
    ``"ignore"``.
    """
    skip_check_applies = not allow_default_skip_override
    if skip_check_applies and should_skip_file(str(path)):
        return False
    action = _file_type_action(conn, path)
    return action != "ignore"
465
+
466
+
467
def _should_extract_file(conn, path: str | Path, depth: int, *, allow_default_skip_override: bool = False) -> bool:
    """Tell whether the content of *path* should be extracted.

    Extraction requires ``depth >= 2``, a file not flagged by
    ``should_skip_file`` (unless *allow_default_skip_override* is set), and a
    file-type action of ``"extract"``.
    """
    if depth < 2:
        return False
    if not allow_default_skip_override and should_skip_file(str(path)):
        return False
    action = _file_type_action(conn, path)
    return action == "extract"
471
+
472
+
473
+ def add_root(path: str, *, mode: str = "normal", depth: int | None = None, source: str = "user", remote: bool = False, seed_version: int | None = None) -> dict:
217
474
  conn = _conn()
218
475
  root_path = norm_path(path)
219
- if should_skip_tree(root_path) and not _allow_explicit_blocked_root(root_path):
476
+ source_value = _normalize_source(source)
477
+ explicit_user_override = source_value == "user" and (_is_disk_root_path(root_path) or should_skip_tree(root_path))
478
+ if should_skip_tree(root_path) and source_value != "user" and not _allow_explicit_blocked_root(root_path):
220
479
  log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
221
480
  return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
222
481
  depth_value = 2 if depth is None else int(depth)
223
- existing = conn.execute("SELECT id, status FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
482
+ seed_value = int(seed_version if seed_version is not None else (DEFAULT_ROOT_SEED_VERSION if source_value == "core_default" else 0))
483
+ existing = conn.execute("SELECT id, status, source, depth FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
484
+ if existing and str(existing["status"] or "") == "active" and source_value == "user" and str(existing["source"] or "") == "core_default" and not explicit_user_override:
485
+ return {"ok": True, "root_path": root_path, "mode": mode, "depth": int(existing["depth"] or depth_value), "already_included": True, "included_by": "core_default"}
486
+ if source_value == "user":
487
+ parent = conn.execute(
488
+ """
489
+ SELECT root_path, source, depth
490
+ FROM local_index_roots
491
+ WHERE status='active' AND source='core_default'
492
+ ORDER BY length(root_path) DESC
493
+ """
494
+ ).fetchall()
495
+ for row in parent:
496
+ parent_path = str(row["root_path"] or "")
497
+ if _is_nested_path(root_path, parent_path) and not explicit_user_override:
498
+ return {
499
+ "ok": True,
500
+ "root_path": root_path,
501
+ "already_included": True,
502
+ "included_by": "core_default",
503
+ "included_root": parent_path,
504
+ "depth": int(row["depth"] or depth_value),
505
+ }
224
506
  conn.execute(
225
507
  """
226
- INSERT INTO local_index_roots(root_path, display_path, mode, depth, status, created_at, updated_at)
227
- VALUES (?, ?, ?, ?, 'active', ?, ?)
508
+ INSERT INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
509
+ VALUES (?, ?, ?, ?, ?, ?, ?, 'active', ?, ?)
228
510
  ON CONFLICT(root_path) DO UPDATE SET
229
511
  display_path=excluded.display_path,
230
512
  mode=excluded.mode,
231
513
  depth=excluded.depth,
514
+ source=excluded.source,
515
+ remote=excluded.remote,
516
+ seed_version=excluded.seed_version,
232
517
  status='active',
233
518
  updated_at=excluded.updated_at
234
519
  """,
235
- (root_path, path, mode, depth_value, now(), now()),
520
+ (root_path, path, mode, depth_value, source_value, 1 if remote else 0, seed_value, now(), now()),
236
521
  )
237
522
  row = conn.execute("SELECT id FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
238
523
  existing_status = str(existing["status"] or "") if existing else ""
@@ -241,8 +526,8 @@ def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> di
241
526
  _set_initial_index_complete(conn, False)
242
527
  _set_initial_index_started_at(conn, now())
243
528
  conn.commit()
244
- log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value)
245
- return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value}
529
+ log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value, source=source_value, explicit_override=explicit_user_override)
530
+ return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value, "source": source_value, "remote": bool(remote), "explicit_override": explicit_user_override}
246
531
 
247
532
 
248
533
  def remove_root(path: str) -> dict:
@@ -331,15 +616,40 @@ def _mounted_volume_roots() -> list[str]:
331
616
 
332
617
 
333
618
def _system_volume_roots() -> list[str]:
    """Return the whole-disk roots to index; empty unless explicitly enabled.

    System roots are opt-in: ``["/"]`` is returned only when
    ``NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS`` is ``1``/``true``/``yes``
    (case-insensitive) and the platform is not Windows. Windows always yields
    an empty list.

    ``NEXO_LOCAL_INDEX_DISABLE_SYSTEM_ROOTS`` is intentionally not consulted:
    with opt-in behaviour the result is already ``[]`` whenever the enable
    flag is absent, so the former check could never change the outcome.
    """
    flag = os.environ.get("NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS", "").strip().lower()
    if flag in {"1", "true", "yes"} and not sys.platform.startswith("win"):
        # macOS and other POSIX platforms share the single "/" root.
        return ["/"]
    return []
628
+
629
+
630
def _user_content_roots() -> list[str]:
    """Return the user's home folder plus platform cloud-drive folders.

    Candidates are the home directory, on Windows the usual OneDrive folders
    and any path named by the ``OneDrive*`` environment variables, and on
    macOS the iCloud Drive folder. A candidate is kept only if it is an
    existing directory that is either not skipped by ``should_skip_tree`` or
    explicitly allowed by ``_allow_explicit_blocked_root``. Candidates whose
    checks raise are dropped.
    """
    home = Path.home()
    candidates: list[Path] = [home]
    if sys.platform.startswith("win"):
        candidates += [home / name for name in ("OneDrive", "OneDrive - Personal", "OneDrive - Empresa")]
        for variable in ("OneDrive", "OneDriveCommercial", "OneDriveConsumer"):
            configured = os.environ.get(variable, "").strip()
            if configured:
                candidates.append(Path(configured))
    elif sys.platform == "darwin":
        candidates.append(home / "Library" / "Mobile Documents" / "com~apple~CloudDocs")

    def _usable(folder: Path) -> bool:
        # Best-effort probe: any filesystem or helper failure disqualifies the folder.
        try:
            if not (folder.exists() and folder.is_dir()):
                return False
            text = str(folder)
            return bool(not should_skip_tree(text) or _allow_explicit_blocked_root(text))
        except Exception:
            return False

    return _dedupe_roots([str(folder) for folder in candidates if _usable(folder)])
343
653
 
344
654
 
345
655
  def _local_email_roots() -> list[str]:
@@ -379,42 +689,507 @@ def default_roots() -> list[str]:
379
689
 
380
690
 
381
691
def default_root_specs() -> list[tuple[str, int]]:
    """Return the de-duplicated ``(root, depth)`` pairs used for default seeding.

    Order before de-duplication: user content roots, system roots (only with
    ``NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS``), mounted volumes (only with
    ``NEXO_LOCAL_INDEX_INCLUDE_MOUNTED_ROOTS``), roots listed in
    ``NEXO_LOCAL_INDEX_DEFAULT_ROOTS`` (``os.pathsep``-separated), and finally
    local e-mail roots.
    """
    truthy = {"1", "true", "yes"}

    def _flag(name: str) -> bool:
        return os.environ.get(name, "").strip().lower() in truthy

    configured = os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_ROOTS", "").strip()
    system_specs: list[tuple[str, int]] = (
        [(root, DEFAULT_SYSTEM_ROOT_DEPTH) for root in _system_volume_roots()]
        if _flag("NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS")
        else []
    )
    mounted_specs = (
        [(root, DEFAULT_MOUNTED_ROOT_DEPTH) for root in _mounted_volume_roots()]
        if _flag("NEXO_LOCAL_INDEX_INCLUDE_MOUNTED_ROOTS")
        else []
    )
    # Blank entries are dropped, but kept entries are passed through unstripped.
    configured_specs = [(item, DEFAULT_ROOT_DEPTH) for item in configured.split(os.pathsep) if item.strip()]
    user_specs = [(root, DEFAULT_ROOT_DEPTH) for root in _user_content_roots()]
    email_specs = [(root, DEFAULT_EMAIL_ROOT_DEPTH) for root in _local_email_roots()]
    return _dedupe_root_specs(user_specs + system_specs + mounted_specs + configured_specs + email_specs)
394
706
 
395
707
 
396
- def ensure_default_roots() -> dict:
397
- existing = {row["root_path"]: row for row in list_roots(readonly=False)}
708
def _all_roots_by_path_conn(conn) -> dict[str, dict]:
    """Return every row of ``local_index_roots`` as a dict keyed by ``root_path``.

    Rows of every status are included; no status filter is applied.
    """
    by_path: dict[str, dict] = {}
    for record in conn.execute("SELECT * FROM local_index_roots ORDER BY root_path").fetchall():
        by_path[str(record["root_path"])] = dict(record)
    return by_path
711
+
712
+
713
def _seed_default_roots_conn(conn) -> dict:
    """Insert or deepen the default roots through *conn* without committing.

    For every ``(root, depth)`` from ``default_root_specs()`` that is an
    existing directory:

    * unknown roots are inserted as active ``core_default`` roots;
    * roots whose row has status ``removed`` are left alone and reported;
    * known roots with a smaller depth are raised to *depth* and re-stamped
      as ``core_default`` with the current seed version.

    Returns:
        ``{"created": [...], "updated": [...], "skipped_removed": [...]}``.
    """
    known = _all_roots_by_path_conn(conn)
    created = []
    updated = []
    skipped_removed = []
    for root, depth in default_root_specs():
        folder = Path(root).expanduser()
        if not (folder.exists() and folder.is_dir()):
            continue
        root_path = norm_path(str(folder))
        current = known.get(root_path)
        if not current:
            stamp = now()
            conn.execute(
                """
                INSERT INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
                VALUES (?, ?, 'normal', ?, 'core_default', 0, ?, 'active', ?, ?)
                """,
                (root_path, str(folder), int(depth), DEFAULT_ROOT_SEED_VERSION, stamp, stamp),
            )
            created.append({"root_path": root_path, "depth": int(depth)})
            # Remember the new row so a duplicate spec in this pass is not inserted twice.
            known[root_path] = {
                "root_path": root_path,
                "display_path": str(folder),
                "mode": "normal",
                "depth": int(depth),
                "source": "core_default",
                "remote": 0,
                "seed_version": DEFAULT_ROOT_SEED_VERSION,
                "status": "active",
            }
            continue
        if str(current.get("status") or "") == "removed":
            # A root with status "removed" is never re-seeded.
            skipped_removed.append({"root_path": root_path})
            continue
        if int(current.get("depth") or 0) < depth:
            conn.execute(
                "UPDATE local_index_roots SET depth=?, source='core_default', seed_version=?, updated_at=? WHERE root_path=?",
                (depth, DEFAULT_ROOT_SEED_VERSION, now(), root_path),
            )
            updated.append({"root_path": root_path, "depth": depth})
    return {"created": created, "updated": updated, "skipped_removed": skipped_removed}
756
+
757
+
758
def ensure_default_roots() -> dict:
    """Seed core file-type rules and default roots, then run the roots-v2 migration.

    Returns:
        A summary with the counts of created, updated and skipped-removed
        roots, the migration result, and fresh root and file-type listings.
    """
    conn = _conn()
    seed_core_file_type_rules(conn)
    seeded = _seed_default_roots_conn(conn)
    migration = migrate_roots_seed_v2(dry_run=False, _already_seeded=True)
    try:
        conn.commit()
    except sqlite3.ProgrammingError:
        # The migration may have archived and replaced a large legacy DB,
        # leaving this handle unusable for the commit; that is tolerated here.
        pass
    counts = {name: len(seeded[name]) for name in ("created", "updated", "skipped_removed")}
    return {
        "ok": True,
        "created": counts["created"],
        "updated": counts["updated"],
        "skipped_removed": counts["skipped_removed"],
        "migration": migration,
        "roots": list_roots(readonly=False),
        "file_types": list_file_type_rules(readonly=False),
    }
777
+
778
+
779
def _local_context_sidecar_paths(db_path: Path) -> list[Path]:
    """Return *db_path* followed by its ``-wal`` and ``-shm`` sibling paths."""
    siblings = [db_path.with_name(f"{db_path.name}{suffix}") for suffix in ("-wal", "-shm")]
    return [db_path, *siblings]
781
+
782
+
783
def _local_context_db_size_bytes() -> int:
    """Return the combined on-disk size of the local-context DB and its sidecars.

    Missing files count as zero. Files that fail with ``OSError`` while being
    inspected are skipped.
    """
    size = 0
    for file_path in _local_context_sidecar_paths(local_context_db_path()):
        try:
            if not file_path.exists():
                continue
            size += int(file_path.stat().st_size)
        except OSError:
            continue
    return size
792
+
793
+
794
def _capture_roots_v2_config(conn) -> dict:
    """Snapshot the configuration that ``_restore_roots_v2_config`` can replay.

    Captured:

    * ``state``: all ``local_index_state`` rows except per-root initial-scan
      markers, the seed-version key and the initial-index bookkeeping keys;
    * ``roots``: roots that are user-owned, remote, have status ``removed``,
      or are active ``core_default`` roots that are not a disk root;
    * ``exclusions``: every exclusion row;
    * ``file_types``: user file-type rules only.
    """

    def _dict_rows(sql: str, params: tuple = ()) -> list[dict]:
        return [dict(record) for record in conn.execute(sql, params).fetchall()]

    def _keep_root(record: dict) -> bool:
        origin = str(record.get("source") or "legacy")
        state = str(record.get("status") or "")
        location = str(record.get("root_path") or "")
        if origin == "user" or bool(record.get("remote")) or state == "removed":
            return True
        return origin == "core_default" and state == "active" and not _is_disk_root_path(location)

    state_rows = _dict_rows(
        """
        SELECT key, value, updated_at
        FROM local_index_state
        WHERE key NOT LIKE 'root_initial_scan:%'
          AND key NOT IN (?, ?, ?)
        ORDER BY key
        """,
        (ROOT_SEED_VERSION_KEY, INITIAL_INDEX_COMPLETE_KEY, INITIAL_INDEX_STARTED_AT_KEY),
    )
    all_roots = _dict_rows(
        """
        SELECT root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at
        FROM local_index_roots
        ORDER BY root_path
        """
    )
    root_rows = [record for record in all_roots if _keep_root(record)]
    exclusion_rows = _dict_rows(
        """
        SELECT path, display_path, source, kind, reason, created_at
        FROM local_index_exclusions
        ORDER BY path
        """
    )
    file_type_rows = _dict_rows(
        """
        SELECT extension, action, source, priority, reason, created_at, updated_at
        FROM local_index_file_type_rules
        WHERE source='user'
        ORDER BY extension
        """
    )
    return {
        "state": state_rows,
        "roots": root_rows,
        "exclusions": exclusion_rows,
        "file_types": file_type_rows,
    }
855
+
856
+
857
def _restore_roots_v2_config(conn, config: dict) -> dict:
    """Write a snapshot from ``_capture_roots_v2_config`` back through *conn*.

    Rows are written with ``INSERT OR REPLACE``. Root, exclusion and file-type
    rows whose key normalises to an empty string are skipped. Missing
    timestamps default to the current time. No commit is issued here.

    Returns:
        The number of rows written per section: ``state``, ``roots``,
        ``exclusions`` and ``file_types``.
    """
    stamp = now()
    state_count = root_count = exclusion_count = file_type_count = 0

    for entry in config.get("state") or []:
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_state(key, value, updated_at)
            VALUES (?, ?, ?)
            """,
            (entry.get("key"), entry.get("value") or "", float(entry.get("updated_at") or stamp)),
        )
        state_count += 1

    for entry in config.get("roots") or []:
        root_path = norm_path(str(entry.get("root_path") or ""))
        if not root_path:
            continue
        values = (
            root_path,
            entry.get("display_path") or root_path,
            entry.get("mode") or "normal",
            int(entry.get("depth") or DEFAULT_ROOT_DEPTH),
            _normalize_source(entry.get("source") or "user"),
            1 if entry.get("remote") else 0,
            int(entry.get("seed_version") or 0),
            entry.get("status") or "active",
            float(entry.get("created_at") or stamp),
            float(entry.get("updated_at") or stamp),
        )
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            values,
        )
        root_count += 1

    for entry in config.get("exclusions") or []:
        exclusion_path = norm_path(str(entry.get("path") or ""))
        if not exclusion_path:
            continue
        values = (
            exclusion_path,
            entry.get("display_path") or exclusion_path,
            _normalize_source(entry.get("source") or "user"),
            entry.get("kind") or "folder",
            entry.get("reason") or "user",
            float(entry.get("created_at") or stamp),
        )
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_exclusions(path, display_path, source, kind, reason, created_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            values,
        )
        exclusion_count += 1

    for entry in config.get("file_types") or []:
        extension = _normalize_extension(str(entry.get("extension") or ""))
        if not extension:
            continue
        values = (
            extension,
            _normalize_file_type_action(str(entry.get("action") or "ignore")),
            _normalize_source(entry.get("source") or "user"),
            int(entry.get("priority") or 0),
            entry.get("reason") or "user",
            float(entry.get("created_at") or stamp),
            float(entry.get("updated_at") or stamp),
        )
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            values,
        )
        file_type_count += 1

    return {
        "state": state_count,
        "roots": root_count,
        "exclusions": exclusion_count,
        "file_types": file_type_count,
    }
932
+
933
+
934
def _create_roots_v2_sqlite_backup(conn) -> dict:
    """Copy the live DB into a backup file and validate the copy.

    The copy uses ``conn.backup`` and is then reopened to compare the row
    count of ``local_index_roots`` against the source.

    Returns:
        ``{"ok": True, "skipped": True, "reason": "db_missing"}`` when there
        is no DB file; ``ok: False`` with ``backup_validation_failed`` when
        the copy has fewer roots than the source; ``ok: False`` with the
        exception text on any other failure; otherwise ``ok: True`` with the
        backup path, both counts and the prune result.
    """
    db_path = local_context_db_path()
    if not db_path.is_file():
        return {"ok": True, "skipped": True, "reason": "db_missing"}
    conn.commit()
    backup_path = paths.create_backup_path("local-context-roots-v2", ".db")
    target = None
    try:
        target = sqlite3.connect(str(backup_path))
        conn.backup(target)
        target.close()
        # Clearing the handle tells the outer finally there is nothing left to close.
        target = None

        verifier = sqlite3.connect(str(backup_path))
        try:
            count_sql = "SELECT COUNT(*) FROM local_index_roots"
            source_roots = int(conn.execute(count_sql).fetchone()[0] or 0)
            backup_roots = int(verifier.execute(count_sql).fetchone()[0] or 0)
        finally:
            verifier.close()

        if backup_roots < source_roots:
            return {
                "ok": False,
                "error": "backup_validation_failed",
                "path": str(backup_path),
                "source_roots": source_roots,
                "backup_roots": backup_roots,
            }
        prune = paths.finalize_backup_snapshot(backup_path)
        return {
            "ok": True,
            "path": str(backup_path),
            "source_roots": source_roots,
            "backup_roots": backup_roots,
            "prune": prune,
        }
    except Exception as exc:
        return {"ok": False, "error": str(exc), "path": str(backup_path)}
    finally:
        if target is not None:
            try:
                target.close()
            except Exception:
                # Closing is best-effort; the result dict already carries the real error.
                pass
970
+
971
+
972
def _archive_rebuild_local_context_for_roots_v2(conn, summary: dict) -> dict:
    """Move the current DB files into a backup directory and rebuild a fresh DB.

    The configuration snapshot is captured first. After the DB and its
    sidecar files are moved away, a new connection is opened, core file-type
    rules are seeded, the snapshot is restored, default roots are seeded, and
    the seed version plus initial-index state are reset.

    Args:
        conn: Connection to the DB being archived; committed, then closed via
            ``close_local_context_db()``.
        summary: Caller-provided summary, only forwarded to the log event.

    Returns:
        ``ok: True`` with backup and restore details, or ``ok: False`` with
        the error text and the list of files already moved.
    """
    db_path = local_context_db_path()
    config = _capture_roots_v2_config(conn)
    size_bytes = _local_context_db_size_bytes()
    backup_dir = paths.create_backup_dir("local-context-roots-v2")
    conn.commit()
    close_local_context_db()
    moved = []
    try:
        for file_path in _local_context_sidecar_paths(db_path):
            if file_path.exists():
                destination = backup_dir / file_path.name
                shutil.move(str(file_path), str(destination))
                moved.append({"path": str(file_path), "backup_path": str(destination)})
        rebuilt = _conn()
        seed_core_file_type_rules(rebuilt)
        restored = _restore_roots_v2_config(rebuilt, config)
        seeded = _seed_default_roots_conn(rebuilt)
        _set_state_conn(rebuilt, ROOT_SEED_VERSION_KEY, str(DEFAULT_ROOT_SEED_VERSION))
        _set_initial_index_complete(rebuilt, False)
        _set_initial_index_started_at(rebuilt, now())
        rebuilt.commit()
        prune = paths.finalize_backup_snapshot(backup_dir)
        result = {
            "ok": True,
            "strategy": "archive_rebuild",
            "backup_dir": str(backup_dir),
            "size_bytes": size_bytes,
            "moved": moved,
            "preserved": restored,
            "seeded": seeded,
            "prune": prune,
        }
        log_event("info", "roots_seed_v2_archived_rebuilt", "Local memory roots seed v2 archived large DB and rebuilt config", summary=summary, result=result)
        return result
    except Exception as exc:
        # Files already moved are not moved back; they are reported in "moved".
        return {
            "ok": False,
            "strategy": "archive_rebuild",
            "backup_dir": str(backup_dir),
            "size_bytes": size_bytes,
            "moved": moved,
            "error": str(exc),
        }
1017
+
1018
+
1019
def _is_disk_root_path(path: str) -> bool:
    """Tell whether *path* normalises to a filesystem or drive root.

    Recognised forms are ``/``, ``\\`` and a drive letter followed by a colon
    with an optional trailing backslash (``C:`` or ``C:\\``).
    """
    normalized = norm_path(path)
    if normalized == "/" or normalized == "\\":
        return True
    # NOTE(review): a forward-slash drive root such as "C:/" is not matched -
    # confirm norm_path never produces that form.
    return re.match(r"^[A-Za-z]:\\?$", normalized) is not None
1024
+
1025
+
1026
def _path_is_under_any(path: str, prefixes: list[str]) -> bool:
    """Tell whether the normalised *path* equals or lies below any prefix.

    Empty prefixes are ignored. "Below" means the path starts with
    ``_path_prefix(prefix)``.
    """
    value = norm_path(path)
    for prefix in prefixes:
        if not prefix:
            continue
        if value == prefix or value.startswith(_path_prefix(prefix)):
            return True
    return False
1029
+
1030
+
1031
def _best_root_id_for_path(path: str, roots: list[dict]) -> int | None:
    """Return the ``id`` of the longest root that contains *path*.

    A root contains the path when the normalised path equals the root or
    starts with ``_path_prefix(root)``. On equal length the earlier row wins.
    ``None`` is returned when no root matches.
    """
    value = norm_path(path)
    best_length = -1
    best_id: int | None = None
    for row in roots:
        root_path = str(row.get("root_path") or "")
        if not root_path:
            continue
        if value != root_path and not value.startswith(_path_prefix(root_path)):
            continue
        root_id = int(row.get("id") or 0)
        # Strictly-greater keeps the first row among roots of equal length.
        if len(root_path) > best_length:
            best_length = len(root_path)
            best_id = root_id
    return best_id
1042
+
1043
+
1044
def _purge_dir_ids(conn, dir_ids: list[str]) -> int:
    """Delete rows from ``local_index_dirs`` for the given ids and return the row count.

    Duplicate ids are collapsed (first occurrence kept) and falsy ids dropped.
    Deletes are issued in groups of at most 500 ids per statement.
    """
    remaining = [dir_id for dir_id in dict.fromkeys(dir_ids) if dir_id]
    total = 0
    while remaining:
        chunk, remaining = remaining[:500], remaining[500:]
        marks = ",".join("?" * len(chunk))
        cursor = conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({marks})", tuple(chunk))
        total += int(cursor.rowcount or 0)
    return total
1052
+
1053
+
1054
def migrate_roots_seed_v2(*, dry_run: bool = True, _already_seeded: bool = False) -> dict:
    """Move legacy whole-disk roots to curated user roots and purge obvious noise.

    Args:
        dry_run: When true, the plan is computed and returned before any backup,
            remap, purge or root update in this function is attempted.
        _already_seeded: When true, the call that seeds the core file-type rules
            is skipped.

    Returns:
        A summary dict. If the stored seed version already equals
        ``DEFAULT_ROOT_SEED_VERSION`` it only reports ``needed: False``.
        Otherwise it carries the planned counts and, on a real run, the chosen
        ``strategy`` (``"archive_rebuild"`` or ``"in_place"``), the cleanup
        result and any ``error``.
    """
    conn = _conn()
    if not _already_seeded:
        seed_core_file_type_rules(conn)
    target_version = str(DEFAULT_ROOT_SEED_VERSION)
    if str(_get_state_conn(conn, ROOT_SEED_VERSION_KEY, "0")) == target_version:
        return {"ok": True, "dry_run": dry_run, "needed": False, "seed_version": DEFAULT_ROOT_SEED_VERSION}

    disk_root_sources = {"legacy", "core_default", "system_default"}
    ancestor_sources = {"legacy", "system_default"}
    blocked_privacy = {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}

    def _root_path(root: dict) -> str:
        return str(root.get("root_path") or "")

    def _source(root: dict) -> str:
        # A missing source is treated as "legacy".
        return str(root.get("source") or "legacy")

    def _is_legacy_disk_root(root: dict) -> bool:
        return _is_disk_root_path(_root_path(root)) and _source(root) in disk_root_sources

    def _prefixes(rows: list[dict]) -> list[str]:
        return [_root_path(root) for root in rows if root.get("root_path")]

    def _batches(items: list[str]):
        for offset in range(0, len(items), 500):
            yield items[offset:offset + 500]

    # ---- classify the active roots -------------------------------------
    active_roots = [dict(row) for row in conn.execute("SELECT * FROM local_index_roots WHERE status='active'").fetchall()]
    keep_roots = [
        root for root in active_roots
        if str(root.get("status") or "") == "active" and not _is_legacy_disk_root(root)
    ]
    keep_prefixes = _prefixes(keep_roots)
    legacy_disk_roots = []
    for root in active_roots:
        if _is_legacy_disk_root(root):
            legacy_disk_roots.append(root)
            continue
        # A legacy/system_default root is also retired when _is_nested_path reports a
        # provisional keep prefix as nested relative to it.
        if _source(root) in ancestor_sources and any(
            _is_nested_path(prefix, _root_path(root)) for prefix in keep_prefixes
        ):
            legacy_disk_roots.append(root)
    keep_roots = [root for root in keep_roots if root not in legacy_disk_roots]
    keep_prefixes = _prefixes(keep_roots)
    legacy_ids = {int(root.get("id") or 0) for root in legacy_disk_roots}
    legacy_prefixes = _prefixes(legacy_disk_roots)
    override_prefixes = [_root_path(root) for root in keep_roots if _root_allows_default_skip_override(root)]

    # ---- plan asset purges and remaps ----------------------------------
    asset_ids_to_purge: list[str] = []
    asset_remaps: dict[int, list[str]] = {}
    for row in conn.execute("SELECT asset_id, root_id, path, extension, privacy_class FROM local_assets").fetchall():
        path = str(row["path"] or "")
        under_legacy = int(row["root_id"] or 0) in legacy_ids or _path_is_under_any(path, legacy_prefixes)
        action = _file_type_action(conn, path)
        if _path_under_any_prefix(path, override_prefixes):
            unsafe = False
        else:
            unsafe = should_skip_file(path) or str(row["privacy_class"] or "") in blocked_privacy
        if action == "ignore" or unsafe:
            asset_ids_to_purge.append(str(row["asset_id"]))
            continue
        if not under_legacy:
            continue
        if not _path_is_under_any(path, keep_prefixes):
            asset_ids_to_purge.append(str(row["asset_id"]))
            continue
        new_root_id = _best_root_id_for_path(path, keep_roots)
        if new_root_id:
            asset_remaps.setdefault(new_root_id, []).append(str(row["asset_id"]))

    # ---- plan directory purges and remaps ------------------------------
    dir_ids_to_purge: list[str] = []
    dir_remaps: dict[int, list[str]] = {}
    for row in conn.execute("SELECT dir_id, root_id, path FROM local_index_dirs").fetchall():
        path = str(row["path"] or "")
        under_legacy = int(row["root_id"] or 0) in legacy_ids or _path_is_under_any(path, legacy_prefixes)
        explicit_override = _path_under_any_prefix(path, override_prefixes)
        if should_skip_tree(path) and not explicit_override:
            dir_ids_to_purge.append(str(row["dir_id"]))
            continue
        if not under_legacy:
            continue
        if not _path_is_under_any(path, keep_prefixes):
            dir_ids_to_purge.append(str(row["dir_id"]))
            continue
        new_root_id = _best_root_id_for_path(path, keep_roots)
        if new_root_id:
            dir_remaps.setdefault(new_root_id, []).append(str(row["dir_id"]))

    summary = {
        "ok": True,
        "dry_run": dry_run,
        "needed": True,
        "legacy_disk_roots": [_root_path(root) for root in legacy_disk_roots],
        "keep_roots": keep_prefixes,
        "assets_to_purge": len(asset_ids_to_purge),
        "dirs_to_purge": len(dir_ids_to_purge),
        "assets_to_remap": sum(map(len, asset_remaps.values())),
        "dirs_to_remap": sum(map(len, dir_remaps.values())),
        "cleanup": {},
    }
    if dry_run:
        return summary

    # ---- choose a strategy ---------------------------------------------
    destructive = any((asset_ids_to_purge, dir_ids_to_purge, legacy_ids, *asset_remaps.values(), *dir_remaps.values()))
    db_size = _local_context_db_size_bytes()
    summary["db_size_bytes"] = db_size
    over_threshold = LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES > 0 and db_size > LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES
    if destructive and over_threshold:
        # Database is above the rebuild threshold: archive it and start fresh
        # instead of editing rows in place.
        rebuild = _archive_rebuild_local_context_for_roots_v2(conn, summary)
        summary["cleanup"] = rebuild
        summary["strategy"] = "archive_rebuild"
        summary["ok"] = bool(rebuild.get("ok"))
        if not rebuild.get("ok"):
            summary["error"] = str(rebuild.get("error") or "archive_rebuild_failed")
        return summary

    if destructive:
        backup = _create_roots_v2_sqlite_backup(conn)
        summary["backup"] = backup
        if not backup.get("ok"):
            # Nothing has been modified yet; refuse to continue without a backup.
            summary["ok"] = False
            summary["error"] = "migration_backup_failed"
            return summary

    # ---- apply in place ------------------------------------------------
    # NOTE(review): no commit is issued on this path inside this function —
    # confirm that the helpers below or the caller commit the transaction.
    for new_root_id, asset_ids in asset_remaps.items():
        for batch in _batches(asset_ids):
            placeholders = ",".join("?" for _ in batch)
            conn.execute(f"UPDATE local_assets SET root_id=?, updated_at=? WHERE asset_id IN ({placeholders})", (new_root_id, now(), *batch))
    for new_root_id, dir_ids in dir_remaps.items():
        for batch in _batches(dir_ids):
            placeholders = ",".join("?" for _ in batch)
            conn.execute(f"UPDATE local_index_dirs SET root_id=?, updated_at=? WHERE dir_id IN ({placeholders})", (new_root_id, now(), *batch))
    cleanup = _purge_asset_ids(conn, asset_ids_to_purge)
    cleanup["dirs"] = _purge_dir_ids(conn, dir_ids_to_purge)
    if legacy_ids:
        placeholders = ",".join("?" for _ in legacy_ids)
        conn.execute(f"DELETE FROM local_index_checkpoints WHERE root_id IN ({placeholders})", tuple(legacy_ids))
        conn.execute(
            f"UPDATE local_index_roots SET status='removed', source='core_removed', updated_at=? WHERE id IN ({placeholders})",
            (now(), *legacy_ids),
        )
    _set_state_conn(conn, ROOT_SEED_VERSION_KEY, target_version)
    _set_initial_index_complete(conn, False)
    _set_initial_index_started_at(conn, now())
    summary["cleanup"] = cleanup
    summary["strategy"] = "in_place"
    log_event("info", "roots_seed_v2_migrated", "Local memory roots seed v2 applied", summary=summary)
    return summary
418
1193
 
419
1194
 
420
1195
  def _should_skip_mounted_root(candidate: Path) -> bool:
@@ -543,17 +1318,26 @@ def _purge_asset_ids(conn, asset_ids: list[str]) -> dict:
543
1318
 
544
1319
  def _privacy_unsafe_asset_ids(conn) -> list[str]:
545
1320
  rows = conn.execute("SELECT asset_id, path, privacy_class FROM local_assets").fetchall()
1321
+ override_prefixes = _active_user_override_prefixes_conn(conn)
546
1322
  unsafe: list[str] = []
547
1323
  for row in rows:
548
1324
  privacy_class = str(row["privacy_class"] or "")
549
- if should_skip_file(str(row["path"] or "")) or privacy_class in {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}:
1325
+ path = str(row["path"] or "")
1326
+ if _path_under_any_prefix(path, override_prefixes):
1327
+ continue
1328
+ if should_skip_file(path) or privacy_class in {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}:
550
1329
  unsafe.append(str(row["asset_id"]))
551
1330
  return unsafe
552
1331
 
553
1332
 
554
1333
  def _privacy_unsafe_dir_ids(conn) -> list[str]:
555
1334
  rows = conn.execute("SELECT dir_id, path FROM local_index_dirs").fetchall()
556
- return [str(row["dir_id"]) for row in rows if should_skip_tree(str(row["path"] or ""))]
1335
+ override_prefixes = _active_user_override_prefixes_conn(conn)
1336
+ return [
1337
+ str(row["dir_id"])
1338
+ for row in rows
1339
+ if should_skip_tree(str(row["path"] or "")) and not _path_under_any_prefix(str(row["path"] or ""), override_prefixes)
1340
+ ]
557
1341
 
558
1342
 
559
1343
  def _content_secret_asset_ids(conn) -> list[str]:
@@ -646,9 +1430,10 @@ def local_index_privacy_hygiene(*, fix: bool = False) -> dict:
646
1430
  def local_index_hygiene(*, fix: bool = False) -> dict:
647
1431
  conn = _conn()
648
1432
  removed_paths: list[str] = []
649
- for row in conn.execute("SELECT id, root_path FROM local_index_roots").fetchall():
1433
+ for row in conn.execute("SELECT id, root_path, source, status FROM local_index_roots").fetchall():
650
1434
  path = str(row["root_path"] or "")
651
- if _should_skip_mounted_root(Path(path)) or should_skip_tree(path):
1435
+ root = dict(row)
1436
+ if _should_skip_mounted_root(Path(path)) or (should_skip_tree(path) and not _root_allows_default_skip_override(root)):
652
1437
  removed_paths.append(path)
653
1438
  if fix:
654
1439
  conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE id=?", (now(), row["id"]))
@@ -667,20 +1452,26 @@ def repair_index_hygiene() -> dict:
667
1452
  return local_index_hygiene(fix=True)
668
1453
 
669
1454
 
670
- def add_exclusion(path: str, *, reason: str = "user") -> dict:
1455
+ def add_exclusion(path: str, *, reason: str = "user", source: str = "user", kind: str = "folder") -> dict:
671
1456
  conn = _conn()
672
1457
  excluded_path = norm_path(path)
1458
+ source_value = _normalize_source(source)
1459
+ kind_value = str(kind or "folder").strip().lower() or "folder"
673
1460
  conn.execute(
674
1461
  """
675
- INSERT INTO local_index_exclusions(path, display_path, reason, created_at)
676
- VALUES (?, ?, ?, ?)
677
- ON CONFLICT(path) DO UPDATE SET display_path=excluded.display_path, reason=excluded.reason
1462
+ INSERT INTO local_index_exclusions(path, display_path, source, kind, reason, created_at)
1463
+ VALUES (?, ?, ?, ?, ?, ?)
1464
+ ON CONFLICT(path) DO UPDATE SET
1465
+ display_path=excluded.display_path,
1466
+ source=excluded.source,
1467
+ kind=excluded.kind,
1468
+ reason=excluded.reason
678
1469
  """,
679
- (excluded_path, path, reason, now()),
1470
+ (excluded_path, path, source_value, kind_value, reason, now()),
680
1471
  )
681
1472
  conn.commit()
682
- log_event("info", "exclusion_added", "Exclusion added", path=redact_path(excluded_path), reason=reason)
683
- return {"ok": True, "path": excluded_path}
1473
+ log_event("info", "exclusion_added", "Exclusion added", path=redact_path(excluded_path), reason=reason, source=source_value)
1474
+ return {"ok": True, "path": excluded_path, "source": source_value, "kind": kind_value}
684
1475
 
685
1476
 
686
1477
  def remove_exclusion(path: str) -> dict:
@@ -1050,6 +1841,39 @@ def _is_nested_path(path: str, parent: str) -> bool:
1050
1841
  return value_cmp.startswith(prefix)
1051
1842
 
1052
1843
 
1844
+ def _root_allows_default_skip_override(root: dict | None) -> bool:
1845
+ if not root:
1846
+ return False
1847
+ root_path = str(root.get("root_path") or "")
1848
+ return str(root.get("source") or "") == "user" and bool(root_path) and (
1849
+ _is_disk_root_path(root_path) or should_skip_tree(root_path)
1850
+ )
1851
+
1852
+
1853
def _active_user_override_prefixes_conn(conn) -> list[str]:
    """List the paths of active, user-sourced roots that override default skipping.

    A root path is included when it is non-empty and is either a disk root or a
    path that ``should_skip_tree`` reports as skipped.
    """
    cursor = conn.execute(
        """
        SELECT root_path
        FROM local_index_roots
        WHERE status='active' AND source='user'
        """
    )
    prefixes: list[str] = []
    for row in cursor.fetchall():
        raw_path = row["root_path"]
        if not raw_path:
            continue
        root_path = str(raw_path)
        if _is_disk_root_path(root_path) or should_skip_tree(root_path):
            prefixes.append(root_path)
    return prefixes
1866
+
1867
+
1868
def _path_under_any_prefix(path: str, prefixes: list[str]) -> bool:
    """Return True when *path* equals (after ``norm_path``) or is nested in any prefix.

    Falsy prefixes are skipped; an empty *prefixes* list yields False.
    """
    return any(
        norm_path(path) == norm_path(prefix) or _is_nested_path(path, prefix)
        for prefix in prefixes
        if prefix
    )
1875
+
1876
+
1053
1877
  def _is_discovered_mount_path(path: str) -> bool:
1054
1878
  value = norm_path(path).replace("\\", "/").lower()
1055
1879
  if not value:
@@ -1070,6 +1894,9 @@ def _effective_scan_roots(roots: list[dict]) -> list[dict]:
1070
1894
  effective: list[dict] = []
1071
1895
  for root in active_roots:
1072
1896
  root_path = str(root.get("root_path") or "")
1897
+ if _root_allows_default_skip_override(root):
1898
+ effective.append(root)
1899
+ continue
1073
1900
  if _is_discovered_mount_path(root_path):
1074
1901
  effective.append(root)
1075
1902
  continue
@@ -1180,14 +2007,16 @@ def _upsert_dir(
1180
2007
  return changed, fingerprint
1181
2008
 
1182
2009
 
1183
- def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: int) -> tuple[str, bool, str]:
2010
+ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: int, *, allow_default_skip_override: bool = False) -> tuple[str, bool, str]:
1184
2011
  raw_path = str(path)
1185
2012
  normalized = norm_path(raw_path)
1186
2013
  asset_id = stable_id("asset", normalized)
1187
- if should_skip_file(normalized):
2014
+ if not _should_index_file(conn, normalized, allow_default_skip_override=allow_default_skip_override):
1188
2015
  return asset_id, False, "skipped"
1189
2016
  perm = _permission_state(path)
1190
2017
  depth, privacy_class, depth_reason = classify_path(normalized)
2018
+ if allow_default_skip_override and privacy_class in {"private_profile_blocked", "system_blocked", "sensitive_inventory_only", "inventory_only"}:
2019
+ depth, privacy_class, depth_reason = 2, "normal", "explicit_user_include"
1191
2020
  depth = min(depth, root_depth)
1192
2021
  try:
1193
2022
  st = path.stat()
@@ -1197,7 +2026,7 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
1197
2026
  INSERT INTO local_index_errors(asset_id, path, phase, error_code, user_message, technical_detail, retryable, created_at)
1198
2027
  VALUES (?, ?, 'quick_index', ?, ?, ?, 1, ?)
1199
2028
  """,
1200
- (asset_id, normalized, type(exc).__name__, "Algunos archivos no se pudieron leer", str(exc), now()),
2029
+ (asset_id, normalized, type(exc).__name__, "Some files could not be read", str(exc), now()),
1201
2030
  )
1202
2031
  return asset_id, False, "error"
1203
2032
  fingerprint = quick_fingerprint(path, st)
@@ -1265,8 +2094,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
1265
2094
  """,
1266
2095
  (version_id, asset_id, fingerprint, int(st.st_size), float(st.st_mtime), now()),
1267
2096
  )
1268
- if should_extract(normalized, depth):
1269
- enqueue_job(conn, asset_id, "light_extraction", priority=_extraction_priority(path))
2097
+ if _should_extract_file(conn, normalized, depth, allow_default_skip_override=allow_default_skip_override):
2098
+ enqueue_job(conn, asset_id, "light_extraction", priority=_extraction_priority(path, conn=conn))
1270
2099
  enqueue_job(conn, asset_id, "graph", priority=40)
1271
2100
  return asset_id, changed, "ok"
1272
2101
 
@@ -1354,7 +2183,7 @@ def _record_scan_error(conn, stats: dict | None, path: str, phase: str, exc: Exc
1354
2183
  path=path,
1355
2184
  phase=phase,
1356
2185
  error_code=type(exc).__name__,
1357
- user_message="Algunas carpetas o archivos no se pudieron leer",
2186
+ user_message="Some folders or files could not be read",
1358
2187
  technical_detail=str(exc),
1359
2188
  retryable=True,
1360
2189
  )
@@ -1377,7 +2206,15 @@ def enqueue_job(conn, asset_id: str, job_type: str, *, priority: int = 50) -> st
1377
2206
  return job_id
1378
2207
 
1379
2208
 
1380
- def _extraction_priority(path: Path) -> int:
2209
+ def _extraction_priority(path: Path, *, conn=None) -> int:
2210
+ if conn is not None:
2211
+ rule = _effective_file_type_rule(conn, path.suffix.lower())
2212
+ try:
2213
+ priority = int(rule.get("priority") or 0)
2214
+ except Exception:
2215
+ priority = 0
2216
+ if priority > 0:
2217
+ return priority
1381
2218
  suffix = path.suffix.lower()
1382
2219
  if suffix in HIGH_VALUE_DOCUMENT_SUFFIXES:
1383
2220
  return 90
@@ -1385,7 +2222,7 @@ def _extraction_priority(path: Path) -> int:
1385
2222
  return 82
1386
2223
  if suffix in EMAIL_DOCUMENT_SUFFIXES or is_local_email_tree(str(path)):
1387
2224
  return 70
1388
- if suffix in {".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql", ".json", ".yaml", ".yml", ".toml", ".html", ".css"}:
2225
+ if suffix in CODE_DOCUMENT_SUFFIXES:
1389
2226
  return 55
1390
2227
  return 45
1391
2228
 
@@ -1425,6 +2262,7 @@ def _iter_files(
1425
2262
  start_after: str = "",
1426
2263
  seen_at: float | None = None,
1427
2264
  stats: dict | None = None,
2265
+ allow_default_skip_override: bool = False,
1428
2266
  ):
1429
2267
  seen_at = seen_at or now()
1430
2268
  seen_dirs: set[tuple[int, int]] = set()
@@ -1435,7 +2273,7 @@ def _iter_files(
1435
2273
  current = stack.pop()
1436
2274
  if _is_excluded(str(current), exclusions):
1437
2275
  continue
1438
- if current != root and should_skip_tree(str(current)):
2276
+ if current != root and should_skip_tree(str(current)) and not allow_default_skip_override:
1439
2277
  continue
1440
2278
  try:
1441
2279
  st = current.stat()
@@ -1459,13 +2297,13 @@ def _iter_files(
1459
2297
  if entry.is_symlink():
1460
2298
  continue
1461
2299
  if entry.is_dir():
1462
- if should_skip_tree(str(entry)):
2300
+ if should_skip_tree(str(entry)) and not allow_default_skip_override:
1463
2301
  continue
1464
2302
  dirs.append(entry)
1465
2303
  continue
1466
2304
  if entry.is_file():
1467
2305
  normalized = norm_path(entry)
1468
- if should_skip_file(normalized):
2306
+ if not _should_index_file(conn, normalized, allow_default_skip_override=allow_default_skip_override):
1469
2307
  continue
1470
2308
  if start_after_norm and normalized <= start_after_norm:
1471
2309
  continue
@@ -1530,7 +2368,7 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
1530
2368
  return stats
1531
2369
  rows = conn.execute(
1532
2370
  """
1533
- SELECT a.asset_id, a.path, a.root_id, a.quick_fingerprint, a.depth, r.root_path
2371
+ SELECT a.asset_id, a.path, a.root_id, a.quick_fingerprint, a.depth, r.root_path, r.source
1534
2372
  FROM local_assets a
1535
2373
  LEFT JOIN local_index_roots r ON r.id = a.root_id
1536
2374
  WHERE a.status='active'
@@ -1544,11 +2382,12 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
1544
2382
  stats["checked"] += 1
1545
2383
  path = str(row["path"])
1546
2384
  root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
2385
+ allow_default_skip_override = _root_allows_default_skip_override(dict(row))
1547
2386
  if _is_excluded(path, exclusions):
1548
2387
  _purge_asset_ids(conn, [row["asset_id"]])
1549
2388
  stats["excluded"] += 1
1550
2389
  continue
1551
- if should_skip_file(path):
2390
+ if not _should_index_file(conn, path, allow_default_skip_override=allow_default_skip_override):
1552
2391
  _purge_asset_ids(conn, [row["asset_id"]])
1553
2392
  stats["excluded"] += 1
1554
2393
  continue
@@ -1567,7 +2406,7 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
1567
2406
  _record_scan_error(conn, stats, path, "live_reconcile", exc)
1568
2407
  continue
1569
2408
  if fingerprint != row["quick_fingerprint"]:
1570
- _, changed, state = _upsert_asset(conn, int(row["root_id"] or 0), file_path, seen_at, int(row["depth"] or 2))
2409
+ _, changed, state = _upsert_asset(conn, int(row["root_id"] or 0), file_path, seen_at, int(row["depth"] or 2), allow_default_skip_override=allow_default_skip_override)
1571
2410
  if changed:
1572
2411
  stats["modified"] += 1
1573
2412
  if state != "ok":
@@ -1616,6 +2455,7 @@ def _scan_known_directory(
1616
2455
  *,
1617
2456
  file_limit: int,
1618
2457
  dir_limit: int,
2458
+ allow_default_skip_override: bool = False,
1619
2459
  ) -> None:
1620
2460
  stack = [directory]
1621
2461
  seen_at = now()
@@ -1626,7 +2466,7 @@ def _scan_known_directory(
1626
2466
  _mark_dir_subtree_deleted(conn, str(current), seen_at)
1627
2467
  stats["excluded_dirs"] += 1
1628
2468
  continue
1629
- if current != directory and should_skip_tree(str(current)):
2469
+ if current != directory and should_skip_tree(str(current)) and not allow_default_skip_override:
1630
2470
  continue
1631
2471
  try:
1632
2472
  st = current.stat()
@@ -1648,7 +2488,7 @@ def _scan_known_directory(
1648
2488
  if entry.is_symlink():
1649
2489
  continue
1650
2490
  if entry.is_dir():
1651
- if should_skip_tree(str(entry)):
2491
+ if should_skip_tree(str(entry)) and not allow_default_skip_override:
1652
2492
  continue
1653
2493
  changed, _ = _upsert_dir(conn, root_id, entry, seen_at)
1654
2494
  seen_dirs.add(norm_path(entry))
@@ -1656,12 +2496,12 @@ def _scan_known_directory(
1656
2496
  stack.append(entry)
1657
2497
  continue
1658
2498
  if entry.is_file():
1659
- if should_skip_file(str(entry)):
2499
+ if not _should_index_file(conn, entry, allow_default_skip_override=allow_default_skip_override):
1660
2500
  continue
1661
2501
  seen_files.add(norm_path(entry))
1662
2502
  if stats["files_scanned"] >= file_limit:
1663
2503
  continue
1664
- _, changed, state = _upsert_asset(conn, root_id, entry, seen_at, root_depth)
2504
+ _, changed, state = _upsert_asset(conn, root_id, entry, seen_at, root_depth, allow_default_skip_override=allow_default_skip_override)
1665
2505
  stats["files_scanned"] += 1
1666
2506
  if changed:
1667
2507
  stats["files_changed"] += 1
@@ -1691,7 +2531,7 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
1691
2531
  return stats
1692
2532
  rows = conn.execute(
1693
2533
  """
1694
- SELECT d.dir_id, d.path, d.quick_fingerprint, d.root_id, r.root_path, r.depth
2534
+ SELECT d.dir_id, d.path, d.quick_fingerprint, d.root_id, r.root_path, r.depth, r.source
1695
2535
  FROM local_index_dirs d
1696
2536
  LEFT JOIN local_index_roots r ON r.id = d.root_id
1697
2537
  WHERE d.status='active'
@@ -1705,11 +2545,12 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
1705
2545
  stats["checked"] += 1
1706
2546
  dir_path = Path(row["path"])
1707
2547
  root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
2548
+ allow_default_skip_override = _root_allows_default_skip_override(dict(row))
1708
2549
  if _is_excluded(str(dir_path), exclusions):
1709
2550
  stats["files_deleted"] += _mark_dir_subtree_deleted(conn, str(dir_path), seen_at)
1710
2551
  stats["excluded_dirs"] += 1
1711
2552
  continue
1712
- if should_skip_tree(str(dir_path)):
2553
+ if should_skip_tree(str(dir_path)) and not allow_default_skip_override:
1713
2554
  stats["files_deleted"] += _purge_dir_subtree(conn, str(dir_path))
1714
2555
  stats["excluded_dirs"] += 1
1715
2556
  continue
@@ -1737,6 +2578,7 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
1737
2578
  stats,
1738
2579
  file_limit=file_limit,
1739
2580
  dir_limit=dir_limit,
2581
+ allow_default_skip_override=allow_default_skip_override,
1740
2582
  )
1741
2583
  else:
1742
2584
  conn.execute("UPDATE local_index_dirs SET updated_at=? WHERE dir_id=?", (seen_at, row["dir_id"]))
@@ -1750,6 +2592,7 @@ def reconcile_live_changes(
1750
2592
  file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
1751
2593
  ) -> dict:
1752
2594
  conn = _conn()
2595
+ seed_core_file_type_rules(conn)
1753
2596
  if _is_paused():
1754
2597
  return {"ok": True, "paused": True, "assets": {}, "dirs": {}}
1755
2598
  exclusions = [row["path"] for row in list_exclusions(readonly=False)]
@@ -1781,6 +2624,7 @@ def reconcile_live_changes(
1781
2624
 
1782
2625
  def scan_once(*, limit: int | None = None) -> dict:
1783
2626
  conn = _conn()
2627
+ seed_core_file_type_rules(conn)
1784
2628
  if _is_paused():
1785
2629
  log_event("info", "scan_skipped_paused", "Local memory scan skipped because indexing is paused")
1786
2630
  return {"ok": True, "paused": True, "roots": 0, "seen": 0, "changed": 0, "errors": 0, "partial": False}
@@ -1792,8 +2636,9 @@ def scan_once(*, limit: int | None = None) -> dict:
1792
2636
  for root in roots:
1793
2637
  root_path = Path(root["root_path"]).expanduser()
1794
2638
  root_id = int(root["id"])
2639
+ allow_default_skip_override = _root_allows_default_skip_override(dict(root))
1795
2640
  root_initial_complete = _root_initial_scan_complete(conn, dict(root))
1796
- if should_skip_tree(str(root_path)) and not _allow_explicit_blocked_root(str(root_path)):
2641
+ if should_skip_tree(str(root_path)) and not allow_default_skip_override and not _allow_explicit_blocked_root(str(root_path)):
1797
2642
  conn.execute(
1798
2643
  "UPDATE local_index_roots SET status='removed', last_scan_at=?, updated_at=? WHERE id=?",
1799
2644
  (now(), now(), root_id),
@@ -1823,8 +2668,16 @@ def scan_once(*, limit: int | None = None) -> dict:
1823
2668
  start_after=str(checkpoint["current_path"] or ""),
1824
2669
  seen_at=cycle_started_at,
1825
2670
  stats=totals,
2671
+ allow_default_skip_override=allow_default_skip_override,
1826
2672
  ):
1827
- asset_id, changed, state = _upsert_asset(conn, root_id, file_path, cycle_started_at, int(root["depth"] or 2))
2673
+ asset_id, changed, state = _upsert_asset(
2674
+ conn,
2675
+ root_id,
2676
+ file_path,
2677
+ cycle_started_at,
2678
+ int(root["depth"] or 2),
2679
+ allow_default_skip_override=allow_default_skip_override,
2680
+ )
1828
2681
  last_seen_path = norm_path(file_path)
1829
2682
  totals["seen"] += 1
1830
2683
  seen_for_root += 1
@@ -2396,7 +3249,7 @@ def process_jobs(*, limit: int = 100) -> dict:
2396
3249
  path=row["path"],
2397
3250
  phase=job_type,
2398
3251
  error_code=type(exc).__name__,
2399
- user_message="Algunos archivos no se pudieron leer",
3252
+ user_message="Some files could not be read",
2400
3253
  technical_detail=str(exc),
2401
3254
  retryable=not terminal,
2402
3255
  )
@@ -2970,6 +3823,7 @@ def _status_from_conn(conn, *, readonly: bool = False) -> dict:
2970
3823
  "volumes": volumes,
2971
3824
  "roots": roots,
2972
3825
  "exclusions": _list_exclusions_conn(conn),
3826
+ "file_types": _shape_file_type_rules(_list_file_type_rules_conn(conn)),
2973
3827
  "problems": problems,
2974
3828
  "permissions": [],
2975
3829
  "models": model_status()["models"],