nexo-brain 7.25.2 → 7.25.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,10 +17,10 @@ from typing import Any
17
17
 
18
18
  import paths
19
19
  from . import embeddings
20
- from .db import LOCAL_CONTEXT_TABLES, close_local_context_db, connect_local_context_db_readonly, ensure_local_context_db, get_local_context_db
20
+ from .db import LOCAL_CONTEXT_TABLES, close_local_context_db, connect_local_context_db_readonly, ensure_local_context_db, get_local_context_db, local_context_db_path
21
21
  from .extractors import canonical_entity_key, chunk_text, contains_secret, entities, entity_mentions, extract_text, normalize_entity_alias, summarize
22
22
  from .logging import log_event, tail
23
- from .privacy import classify_path, is_local_email_tree, is_queryable_path, should_extract, should_skip_file, should_skip_tree
23
+ from .privacy import classify_path, is_local_email_db, is_local_email_tree, is_queryable_path, should_extract, should_skip_file, should_skip_tree
24
24
  from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
25
25
 
26
26
  LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
@@ -34,6 +34,9 @@ DEFAULT_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_DEPTH", "24")
34
34
  DEFAULT_EMAIL_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_EMAIL_ROOT_DEPTH", "24") or "24")
35
35
  DEFAULT_MOUNTED_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_MOUNTED_ROOT_DEPTH", "24") or "24")
36
36
  DEFAULT_SYSTEM_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_SYSTEM_ROOT_DEPTH", "24") or "24")
37
# Version of the curated default-root seed, and the local_index_state key under
# which the applied seed version is stored.
DEFAULT_ROOT_SEED_VERSION = 2
ROOT_SEED_VERSION_KEY = "local_index_roots_seed_version"
# Byte threshold (default 2 GiB), overridable through the environment.
# NOTE(review): presumably the DB size above which the roots-v2 migration
# archives and rebuilds instead of migrating in place -- confirm at the use site.
LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES = int(
    os.environ.get("NEXO_LOCAL_INDEX_V2_REBUILD_THRESHOLD_BYTES", "") or str(2 * 1024 * 1024 * 1024)
)
37
40
  DEFAULT_CONTEXT_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_MAX_CHARS", "20000") or "20000")
38
41
  DEFAULT_ROUTER_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_ROUTER_MAX_CHARS", "6000") or "6000")
39
42
  DEFAULT_MAX_JOB_ATTEMPTS = int(os.environ.get("NEXO_LOCAL_INDEX_MAX_JOB_ATTEMPTS", "3") or "3")
@@ -66,7 +69,6 @@ HIGH_VALUE_DOCUMENT_SUFFIXES = {
66
69
  ".pptx",
67
70
  ".pages",
68
71
  ".numbers",
69
- ".key",
70
72
  ".rtf",
71
73
  ".odt",
72
74
  ".ods",
@@ -84,6 +86,65 @@ EMAIL_DOCUMENT_SUFFIXES = {
84
86
  ".emlx",
85
87
  ".msg",
86
88
  }
89
# Source code and structured text; seeded as "extract" rules.
CODE_DOCUMENT_SUFFIXES = {
    ".css", ".html", ".js", ".json", ".jsx", ".php", ".py",
    ".sql", ".toml", ".ts", ".tsx", ".yaml", ".yml",
}
# Still images; seeded as "metadata" rules.
IMAGE_METADATA_SUFFIXES = {
    ".bmp", ".dng", ".gif", ".heic", ".jpeg", ".jpg",
    ".png", ".raw", ".tif", ".tiff", ".webp",
}
# Audio and video; seeded as "metadata" rules.
MEDIA_METADATA_SUFFIXES = {
    ".aac", ".avi", ".flac", ".m4a", ".m4v",
    ".mkv", ".mov", ".mp3", ".mp4", ".wav",
}
# Binaries, build artefacts and transient editor/lock files; seeded as "ignore" rules.
IGNORED_BINARY_SUFFIXES = {
    ".app", ".bin", ".class", ".dll", ".dmg", ".dylib", ".exe", ".iso", ".jar",
    ".lock", ".o", ".obj", ".pyc", ".so", ".swo", ".swp", ".tmp",
}
87
148
  HIGH_VALUE_DIRECTORY_NAMES = {
88
149
  "users",
89
150
  "home",
@@ -213,26 +274,249 @@ def _with_sqlite_busy_retry(callback, *, attempts: int | None = None):
213
274
  return None
214
275
 
215
276
 
216
- def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> dict:
277
+ def _normalize_source(source: str | None) -> str:
278
+ value = str(source or "user").strip().lower().replace("-", "_")
279
+ return value or "user"
280
+
281
+
282
+ def _normalize_extension(extension: str) -> str:
283
+ value = str(extension or "").strip().lower()
284
+ if not value:
285
+ return ""
286
+ if not value.startswith("."):
287
+ value = "." + value
288
+ return value
289
+
290
+
291
+ def _normalize_file_type_action(action: str | None) -> str:
292
+ value = str(action or "").strip().lower()
293
+ if value in {"include", "extract", "read", "full"}:
294
+ return "extract"
295
+ if value in {"metadata", "inventory", "index"}:
296
+ return "metadata"
297
+ if value in {"exclude", "ignore", "skip", "blocked"}:
298
+ return "ignore"
299
+ return "ignore"
300
+
301
+
302
def _default_file_type_rule_specs() -> list[dict]:
    """Build the built-in file type rule specs, one dict per suffix.

    Categories are emitted in the order listed below, and suffixes are sorted
    within each category. Each spec carries ``extension``, ``action``,
    ``priority`` and ``reason``.
    """
    # NOTE(review): a suffix present in more than one category yields several
    # specs; seed_core_file_type_rules upserts by extension, so the LAST
    # category listed here wins, not the highest priority -- confirm intended.
    categories = (
        (HIGH_VALUE_DOCUMENT_SUFFIXES, "extract", 90, "core_high_value_document"),
        (KNOWN_TEXT_SUFFIXES, "extract", 82, "core_text_document"),
        (EMAIL_DOCUMENT_SUFFIXES, "extract", 70, "core_email_document"),
        (CODE_DOCUMENT_SUFFIXES, "extract", 55, "core_code_document"),
        (IMAGE_METADATA_SUFFIXES, "metadata", 35, "core_photo_metadata"),
        (MEDIA_METADATA_SUFFIXES, "metadata", 25, "core_media_metadata"),
        (IGNORED_BINARY_SUFFIXES, "ignore", 0, "core_binary_or_transient"),
    )
    return [
        {"extension": suffix, "action": action, "priority": priority, "reason": reason}
        for suffixes, action, priority, reason in categories
        for suffix in sorted(suffixes)
    ]
319
+
320
+
321
def seed_core_file_type_rules(conn=None) -> dict:
    """Upsert every built-in rule under the ``core_default`` source.

    Uses *conn* when given, otherwise the shared connection from ``_conn()``.
    The caller is responsible for committing. Returns ``{"ok": True,
    "rules": <number of specs written>}``.
    """
    db = conn or _conn()
    stamp = now()
    upsert_sql = """
        INSERT INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, 'core_default', ?, ?, ?, ?)
        ON CONFLICT(extension, source) DO UPDATE SET
        action=excluded.action,
        priority=excluded.priority,
        reason=excluded.reason,
        updated_at=excluded.updated_at
        """
    written = 0
    for rule in _default_file_type_rule_specs():
        params = (rule["extension"], rule["action"], int(rule["priority"]), rule["reason"], stamp, stamp)
        db.execute(upsert_sql, params)
        written += 1
    return {"ok": True, "rules": written}
340
+
341
+
342
+ def _list_file_type_rules_conn(conn) -> list[dict]:
343
+ rows = conn.execute(
344
+ """
345
+ SELECT *
346
+ FROM local_index_file_type_rules
347
+ ORDER BY
348
+ CASE source WHEN 'user' THEN 0 WHEN 'core_default' THEN 1 ELSE 2 END,
349
+ extension
350
+ """
351
+ ).fetchall()
352
+ return [dict(row) for row in rows]
353
+
354
+
355
+ def _shape_file_type_rules(rows: list[dict]) -> dict:
356
+ effective: dict[str, dict] = {}
357
+ for row in rows:
358
+ ext = str(row.get("extension") or "")
359
+ if ext not in effective or row.get("source") == "user":
360
+ effective[ext] = row
361
+ return {"ok": True, "rules": rows, "effective": list(effective.values())}
362
+
363
+
364
def _effective_file_type_rule(conn, extension: str) -> dict:
    """Return the rule that applies to *extension*.

    A stored rule is preferred, with ``user`` ahead of ``core_default`` ahead
    of any other source. With no stored rule, an implicit rule is synthesised:
    ``ignore`` for a missing or unknown extension.
    """
    ext = _normalize_extension(extension)

    def implicit(action: str, priority: int, reason: str) -> dict:
        return {"extension": ext, "action": action, "source": "implicit", "priority": priority, "reason": reason}

    if not ext:
        return implicit("ignore", 0, "missing_extension")
    stored = conn.execute(
        """
        SELECT *
        FROM local_index_file_type_rules
        WHERE extension=?
        ORDER BY CASE source WHEN 'user' THEN 0 WHEN 'core_default' THEN 1 ELSE 2 END
        LIMIT 1
        """,
        (ext,),
    ).fetchone()
    if stored is not None:
        return dict(stored)
    # NOTE(review): is_local_email_tree is called with paths elsewhere in this
    # module; here it receives a bare extension, so this branch may never
    # trigger -- confirm against the privacy module.
    if is_local_email_tree(ext):
        return implicit("extract", 70, "local_email")
    return implicit("ignore", 0, "unknown_extension")
383
+
384
+
385
def list_file_type_rules(*, readonly: bool = True) -> dict:
    """List stored and effective file type rules.

    In read-only mode a separate read connection is opened and always closed.
    Otherwise the core default rules are re-seeded and committed on the shared
    connection before listing.
    """
    if readonly:
        reader = _read_conn()
        try:
            rows = _list_file_type_rules_conn(reader)
        finally:
            _close_read_conn(reader)
        return _shape_file_type_rules(rows)
    writer = _conn()
    seed_core_file_type_rules(writer)
    writer.commit()
    return _shape_file_type_rules(_list_file_type_rules_conn(writer))
398
+
399
+
400
def _purge_assets_by_extension(conn, extension: str) -> dict:
    """Purge every asset whose extension matches *extension*, case-insensitively.

    Returns ``{"assets": 0}`` for an empty extension; otherwise returns
    whatever ``_purge_asset_ids`` reports.
    """
    ext = _normalize_extension(extension)
    if ext == "":
        return {"assets": 0}
    cursor = conn.execute("SELECT asset_id FROM local_assets WHERE lower(extension)=?", (ext,))
    asset_ids = [str(record["asset_id"]) for record in cursor.fetchall()]
    return _purge_asset_ids(conn, asset_ids)
406
+
407
+
408
def set_file_type_rule(extension: str, *, action: str = "extract", source: str = "user", priority: int | None = None, reason: str = "user") -> dict:
    """Create or update the rule for *extension* under *source*.

    Args:
        extension: File extension, with or without the leading dot.
        action: An alias understood by ``_normalize_file_type_action``.
        source: Rule owner; normalised with ``_normalize_source``.
        priority: Explicit priority. Defaults to 82 for extract, 20 for
            metadata and 0 for ignore.
        reason: Free-text reason stored with the rule.

    Returns:
        ``{"ok": True, ...}`` describing the stored rule and any cleanup, or
        ``{"ok": False, "error": ...}`` when the extension is empty or the
        action is not a recognised alias.
    """
    ext = _normalize_extension(extension)
    if not ext:
        return {"ok": False, "error": "extension_required"}
    requested_action = str(action or "").strip().lower()
    normalized_action = _normalize_file_type_action(action)
    # _normalize_file_type_action maps every unknown value to "ignore". A user
    # "ignore" rule purges indexed assets, so a typo must be rejected here
    # rather than silently becoming a destructive rule.
    if normalized_action == "ignore" and requested_action not in {"exclude", "ignore", "skip", "blocked"}:
        return {"ok": False, "error": "invalid_action", "extension": ext, "action": requested_action}
    source_value = _normalize_source(source)
    if priority is not None:
        priority_value = int(priority)
    else:
        priority_value = {"extract": 82, "metadata": 20}.get(normalized_action, 0)
    # Open the connection only once the input is known to be valid.
    conn = _conn()
    timestamp = now()
    conn.execute(
        """
        INSERT INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(extension, source) DO UPDATE SET
        action=excluded.action,
        priority=excluded.priority,
        reason=excluded.reason,
        updated_at=excluded.updated_at
        """,
        (ext, normalized_action, source_value, priority_value, reason, timestamp, timestamp),
    )
    # Only an explicit user "ignore" removes already-indexed assets.
    if normalized_action == "ignore" and source_value == "user":
        cleanup = _purge_assets_by_extension(conn, ext)
    else:
        cleanup = {"assets": 0}
    conn.commit()
    log_event("info", "file_type_rule_set", "Local memory file type rule set", extension=ext, action=normalized_action, source=source_value, cleanup=cleanup)
    return {"ok": True, "extension": ext, "action": normalized_action, "source": source_value, "priority": priority_value, "cleanup": cleanup}
433
+
434
+
435
def remove_file_type_rule(extension: str, *, source: str = "user") -> dict:
    """Delete the rule stored for *extension* under *source*.

    Returns ``{"ok": False, "error": "extension_required"}`` for an empty
    extension, matching ``set_file_type_rule``. Otherwise commits the delete
    and returns the normalised extension and source, plus ``removed``: the
    number of rows deleted (0 when no such rule existed).
    """
    ext = _normalize_extension(extension)
    if not ext:
        return {"ok": False, "error": "extension_required"}
    conn = _conn()
    source_value = _normalize_source(source)
    cursor = conn.execute("DELETE FROM local_index_file_type_rules WHERE extension=? AND source=?", (ext, source_value))
    removed = int(cursor.rowcount or 0)
    conn.commit()
    log_event("info", "file_type_rule_removed", "Local memory file type rule removed", extension=ext, source=source_value)
    return {"ok": True, "extension": ext, "source": source_value, "removed": removed}
443
+
444
+
445
def reset_file_type_rules() -> dict:
    """Delete all ``user`` rules, re-seed the core defaults and commit.

    Returns the number of deleted user rules, the number of core rules
    written, and the resulting rule listing.
    """
    conn = _conn()
    cursor = conn.execute("DELETE FROM local_index_file_type_rules WHERE source='user'")
    deleted = int(cursor.rowcount or 0)
    seeded = seed_core_file_type_rules(conn)
    conn.commit()
    log_event("info", "file_type_rules_reset", "Local memory user file type overrides reset", deleted=deleted)
    core_rule_count = int(seeded.get("rules") or 0)
    listing = list_file_type_rules(readonly=False)
    return {"ok": True, "deleted": deleted, "core_rules": core_rule_count, "file_types": listing}
452
+
453
+
454
def _file_type_action(conn, path: str | Path) -> str:
    """Return the action for *path*: ``extract``, ``metadata`` or ``ignore``.

    Local email databases and trees are always extracted. Everything else
    follows the effective rule for the lowercased file suffix.
    """
    file_path = Path(path)
    text = str(path)
    if is_local_email_db(text) or is_local_email_tree(text):
        return "extract"
    rule = _effective_file_type_rule(conn, file_path.suffix.lower())
    return str(rule.get("action") or "ignore")
459
+
460
+
461
def _should_index_file(conn, path: str | Path) -> bool:
    """Return True when *path* is not skipped and its action is not ``ignore``."""
    return not should_skip_file(str(path)) and _file_type_action(conn, path) != "ignore"
465
+
466
+
467
def _should_extract_file(conn, path: str | Path, depth: int) -> bool:
    """Return True when *path* should have its content extracted.

    Requires *depth* of at least 2, a file not skipped by ``should_skip_file``
    and an effective action of ``extract``.
    """
    eligible = depth >= 2 and not should_skip_file(str(path))
    return eligible and _file_type_action(conn, path) == "extract"
471
+
472
+
473
+ def add_root(path: str, *, mode: str = "normal", depth: int | None = None, source: str = "user", remote: bool = False, seed_version: int | None = None) -> dict:
217
474
  conn = _conn()
218
475
  root_path = norm_path(path)
219
476
  if should_skip_tree(root_path) and not _allow_explicit_blocked_root(root_path):
220
477
  log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
221
478
  return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
222
479
  depth_value = 2 if depth is None else int(depth)
223
- existing = conn.execute("SELECT id, status FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
480
+ source_value = _normalize_source(source)
481
+ seed_value = int(seed_version if seed_version is not None else (DEFAULT_ROOT_SEED_VERSION if source_value == "core_default" else 0))
482
+ existing = conn.execute("SELECT id, status, source, depth FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
483
+ if existing and str(existing["status"] or "") == "active" and source_value == "user" and str(existing["source"] or "") == "core_default":
484
+ return {"ok": True, "root_path": root_path, "mode": mode, "depth": int(existing["depth"] or depth_value), "already_included": True, "included_by": "core_default"}
485
+ if source_value == "user":
486
+ parent = conn.execute(
487
+ """
488
+ SELECT root_path, source, depth
489
+ FROM local_index_roots
490
+ WHERE status='active' AND source='core_default'
491
+ ORDER BY length(root_path) DESC
492
+ """
493
+ ).fetchall()
494
+ for row in parent:
495
+ parent_path = str(row["root_path"] or "")
496
+ if _is_nested_path(root_path, parent_path):
497
+ return {
498
+ "ok": True,
499
+ "root_path": root_path,
500
+ "already_included": True,
501
+ "included_by": "core_default",
502
+ "included_root": parent_path,
503
+ "depth": int(row["depth"] or depth_value),
504
+ }
224
505
  conn.execute(
225
506
  """
226
- INSERT INTO local_index_roots(root_path, display_path, mode, depth, status, created_at, updated_at)
227
- VALUES (?, ?, ?, ?, 'active', ?, ?)
507
+ INSERT INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
508
+ VALUES (?, ?, ?, ?, ?, ?, ?, 'active', ?, ?)
228
509
  ON CONFLICT(root_path) DO UPDATE SET
229
510
  display_path=excluded.display_path,
230
511
  mode=excluded.mode,
231
512
  depth=excluded.depth,
513
+ source=excluded.source,
514
+ remote=excluded.remote,
515
+ seed_version=excluded.seed_version,
232
516
  status='active',
233
517
  updated_at=excluded.updated_at
234
518
  """,
235
- (root_path, path, mode, depth_value, now(), now()),
519
+ (root_path, path, mode, depth_value, source_value, 1 if remote else 0, seed_value, now(), now()),
236
520
  )
237
521
  row = conn.execute("SELECT id FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
238
522
  existing_status = str(existing["status"] or "") if existing else ""
@@ -241,8 +525,8 @@ def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> di
241
525
  _set_initial_index_complete(conn, False)
242
526
  _set_initial_index_started_at(conn, now())
243
527
  conn.commit()
244
- log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value)
245
- return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value}
528
+ log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value, source=source_value)
529
+ return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value, "source": source_value, "remote": bool(remote)}
246
530
 
247
531
 
248
532
  def remove_root(path: str) -> dict:
@@ -331,15 +615,40 @@ def _mounted_volume_roots() -> list[str]:
331
615
 
332
616
 
333
617
  def _system_volume_roots() -> list[str]:
618
+ if os.environ.get("NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS", "").strip().lower() in {"1", "true", "yes"}:
619
+ if sys.platform == "darwin":
620
+ return ["/"]
621
+ if sys.platform.startswith("win"):
622
+ return []
623
+ return ["/"]
334
624
  if os.environ.get("NEXO_LOCAL_INDEX_DISABLE_SYSTEM_ROOTS", "").strip() in {"1", "true", "yes"}:
335
625
  return []
336
- if sys.platform == "darwin":
337
- return ["/"]
626
+ return []
627
+
628
+
629
def _user_content_roots() -> list[str]:
    """Return existing user-content directories to seed as default roots.

    Candidates are the home directory plus, on Windows, the usual OneDrive
    folders and OneDrive environment variables, or on macOS the iCloud Drive
    folder. A candidate is kept when it is an existing directory that the
    privacy rules do not block, or that is explicitly allowed. A candidate
    that raises while being checked is dropped.
    """
    home_dir = Path.home()
    platform = sys.platform
    probes: list[Path] = [home_dir]
    if platform.startswith("win"):
        probes += [home_dir / name for name in ("OneDrive", "OneDrive - Personal", "OneDrive - Empresa")]
        for env_name in ("OneDrive", "OneDriveCommercial", "OneDriveConsumer"):
            configured = os.environ.get(env_name, "").strip()
            if configured:
                probes.append(Path(configured))
    elif platform == "darwin":
        probes.append(home_dir.joinpath("Library", "Mobile Documents", "com~apple~CloudDocs"))

    def usable(folder: Path) -> bool:
        if not (folder.exists() and folder.is_dir()):
            return False
        text = str(folder)
        return bool(not should_skip_tree(text) or _allow_explicit_blocked_root(text))

    found: list[str] = []
    for folder in probes:
        try:
            if usable(folder):
                found.append(str(folder))
        except Exception:
            # Best effort: an unreadable candidate is simply not offered.
            continue
    return _dedupe_roots(found)
343
652
 
344
653
 
345
654
  def _local_email_roots() -> list[str]:
@@ -379,42 +688,501 @@ def default_roots() -> list[str]:
379
688
 
380
689
 
def default_root_specs() -> list[tuple[str, int]]:
    """Return de-duplicated ``(root, depth)`` pairs for the default roots.

    Order before de-duplication: user content roots, system roots (only with
    ``NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS``), mounted volumes (only with
    ``NEXO_LOCAL_INDEX_INCLUDE_MOUNTED_ROOTS``), roots listed in
    ``NEXO_LOCAL_INDEX_DEFAULT_ROOTS``, and finally the local email roots.
    """
    def opted_in(variable: str) -> bool:
        return os.environ.get(variable, "").strip().lower() in {"1", "true", "yes"}

    configured = os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_ROOTS", "").strip()
    system_specs: list[tuple[str, int]] = (
        [(root, DEFAULT_SYSTEM_ROOT_DEPTH) for root in _system_volume_roots()]
        if opted_in("NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS")
        else []
    )
    mounted_specs = (
        [(root, DEFAULT_MOUNTED_ROOT_DEPTH) for root in _mounted_volume_roots()]
        if opted_in("NEXO_LOCAL_INDEX_INCLUDE_MOUNTED_ROOTS")
        else []
    )
    configured_specs = [(entry, DEFAULT_ROOT_DEPTH) for entry in configured.split(os.pathsep) if entry.strip()]
    user_specs = [(root, DEFAULT_ROOT_DEPTH) for root in _user_content_roots()]
    email_specs = [(root, DEFAULT_EMAIL_ROOT_DEPTH) for root in _local_email_roots()]
    return _dedupe_root_specs(user_specs + system_specs + mounted_specs + configured_specs + email_specs)
394
705
 
395
706
 
396
- def ensure_default_roots() -> dict:
397
- existing = {row["root_path"]: row for row in list_roots(readonly=False)}
707
+ def _all_roots_by_path_conn(conn) -> dict[str, dict]:
708
+ rows = conn.execute("SELECT * FROM local_index_roots ORDER BY root_path").fetchall()
709
+ return {str(row["root_path"]): dict(row) for row in rows}
710
+
711
+
712
def _seed_default_roots_conn(conn) -> dict:
    """Insert missing default roots and deepen shallow ones on *conn*.

    For each default spec that is an existing directory:
    - with no stored row, insert an active ``core_default`` root;
    - with a row marked ``removed``, leave it alone and report it as skipped;
    - with a stored depth below the default, raise it and stamp the row as
      ``core_default`` with the current seed version.

    Nothing is committed here. Returns the ``created``, ``updated`` and
    ``skipped_removed`` lists.
    """
    known = _all_roots_by_path_conn(conn)
    created: list[dict] = []
    updated: list[dict] = []
    skipped_removed: list[dict] = []
    for root, depth in default_root_specs():
        folder = Path(root).expanduser()
        if not (folder.exists() and folder.is_dir()):
            continue
        root_path = norm_path(str(folder))
        current = known.get(root_path)
        if not current:
            stamp = now()
            conn.execute(
                """
                INSERT INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
                VALUES (?, ?, 'normal', ?, 'core_default', 0, ?, 'active', ?, ?)
                """,
                (root_path, str(folder), int(depth), DEFAULT_ROOT_SEED_VERSION, stamp, stamp),
            )
            created.append({"root_path": root_path, "depth": int(depth)})
            # Record the new row so a repeated spec is not inserted twice.
            known[root_path] = {
                "root_path": root_path,
                "display_path": str(folder),
                "mode": "normal",
                "depth": int(depth),
                "source": "core_default",
                "remote": 0,
                "seed_version": DEFAULT_ROOT_SEED_VERSION,
                "status": "active",
            }
            continue
        if str(current.get("status") or "") == "removed":
            # A removed root stays removed; seeding never re-activates it.
            skipped_removed.append({"root_path": root_path})
            continue
        if int(current.get("depth") or 0) < depth:
            conn.execute(
                "UPDATE local_index_roots SET depth=?, source='core_default', seed_version=?, updated_at=? WHERE root_path=?",
                (depth, DEFAULT_ROOT_SEED_VERSION, now(), root_path),
            )
            updated.append({"root_path": root_path, "depth": depth})
    return {"created": created, "updated": updated, "skipped_removed": skipped_removed}
755
+
756
+
757
def ensure_default_roots() -> dict:
    """Seed core file type rules and default roots, then run the v2 roots migration.

    Returns counts of created, updated and skipped roots, the migration
    result, and fresh listings of roots and file type rules.
    """
    conn = _conn()
    seed_core_file_type_rules(conn)
    seeded = _seed_default_roots_conn(conn)
    migration = migrate_roots_seed_v2(dry_run=False, _already_seeded=True)
    try:
        conn.commit()
    except sqlite3.ProgrammingError:
        # A large legacy DB may have been archived and replaced during migration.
        pass
    counts = {key: len(seeded[key]) for key in ("created", "updated", "skipped_removed")}
    return {
        "ok": True,
        **counts,
        "migration": migration,
        "roots": list_roots(readonly=False),
        "file_types": list_file_type_rules(readonly=False),
    }
776
+
777
+
778
+ def _local_context_sidecar_paths(db_path: Path) -> list[Path]:
779
+ return [db_path, db_path.with_name(db_path.name + "-wal"), db_path.with_name(db_path.name + "-shm")]
780
+
781
+
782
def _local_context_db_size_bytes() -> int:
    """Return the combined size in bytes of the local context DB and its sidecars.

    Files that are missing, or that raise ``OSError`` when inspected, count as 0.
    """
    def size_of(file_path: Path) -> int:
        try:
            return int(file_path.stat().st_size) if file_path.exists() else 0
        except OSError:
            return 0

    return sum(size_of(file_path) for file_path in _local_context_sidecar_paths(local_context_db_path()))
791
+
792
+
793
def _capture_roots_v2_config(conn) -> dict:
    """Snapshot the configuration that must survive a roots-v2 rebuild.

    Captures:
    - state rows, except per-root initial-scan markers, the seed version and
      the initial-index flags;
    - roots that are user-added, remote, removed, or active ``core_default``
      roots that are not a whole-disk root;
    - all exclusions;
    - ``user`` file type rules.
    """
    def query(sql: str, *params) -> list[dict]:
        return [dict(record) for record in conn.execute(sql, *params).fetchall()]

    def worth_keeping(root: dict) -> bool:
        source = str(root.get("source") or "legacy")
        status = str(root.get("status") or "")
        if source == "user" or bool(root.get("remote")) or status == "removed":
            return True
        if source != "core_default" or status != "active":
            return False
        return not _is_disk_root_path(str(root.get("root_path") or ""))

    state = query(
        """
        SELECT key, value, updated_at
        FROM local_index_state
        WHERE key NOT LIKE 'root_initial_scan:%'
        AND key NOT IN (?, ?, ?)
        ORDER BY key
        """,
        (ROOT_SEED_VERSION_KEY, INITIAL_INDEX_COMPLETE_KEY, INITIAL_INDEX_STARTED_AT_KEY),
    )
    all_roots = query(
        """
        SELECT root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at
        FROM local_index_roots
        ORDER BY root_path
        """
    )
    roots = [root for root in all_roots if worth_keeping(root)]
    exclusions = query(
        """
        SELECT path, display_path, source, kind, reason, created_at
        FROM local_index_exclusions
        ORDER BY path
        """
    )
    file_types = query(
        """
        SELECT extension, action, source, priority, reason, created_at, updated_at
        FROM local_index_file_type_rules
        WHERE source='user'
        ORDER BY extension
        """
    )
    return {
        "state": state,
        "roots": roots,
        "exclusions": exclusions,
        "file_types": file_types,
    }
854
+
855
+
856
def _restore_roots_v2_config(conn, config: dict) -> dict:
    """Write a captured configuration snapshot back into *conn*.

    Every section uses ``INSERT OR REPLACE``. Roots and exclusions with an
    empty path, and file type rules with an empty extension, are skipped.
    Missing timestamps fall back to the time of this call. Nothing is
    committed here. Returns the number of rows written per section.
    """
    fallback_time = now()

    def as_time(raw) -> float:
        return float(raw or fallback_time)

    counts = {"state": 0, "roots": 0, "exclusions": 0, "file_types": 0}

    for entry in config.get("state") or []:
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_state(key, value, updated_at)
            VALUES (?, ?, ?)
            """,
            (entry.get("key"), entry.get("value") or "", as_time(entry.get("updated_at"))),
        )
        counts["state"] += 1

    for entry in config.get("roots") or []:
        root_path = norm_path(str(entry.get("root_path") or ""))
        if not root_path:
            continue
        root_params = (
            root_path,
            entry.get("display_path") or root_path,
            entry.get("mode") or "normal",
            int(entry.get("depth") or DEFAULT_ROOT_DEPTH),
            _normalize_source(entry.get("source") or "user"),
            1 if entry.get("remote") else 0,
            int(entry.get("seed_version") or 0),
            entry.get("status") or "active",
            as_time(entry.get("created_at")),
            as_time(entry.get("updated_at")),
        )
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            root_params,
        )
        counts["roots"] += 1

    for entry in config.get("exclusions") or []:
        exclusion_path = norm_path(str(entry.get("path") or ""))
        if not exclusion_path:
            continue
        exclusion_params = (
            exclusion_path,
            entry.get("display_path") or exclusion_path,
            _normalize_source(entry.get("source") or "user"),
            entry.get("kind") or "folder",
            entry.get("reason") or "user",
            as_time(entry.get("created_at")),
        )
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_exclusions(path, display_path, source, kind, reason, created_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            exclusion_params,
        )
        counts["exclusions"] += 1

    for entry in config.get("file_types") or []:
        extension = _normalize_extension(str(entry.get("extension") or ""))
        if not extension:
            continue
        rule_params = (
            extension,
            _normalize_file_type_action(str(entry.get("action") or "ignore")),
            _normalize_source(entry.get("source") or "user"),
            int(entry.get("priority") or 0),
            entry.get("reason") or "user",
            as_time(entry.get("created_at")),
            as_time(entry.get("updated_at")),
        )
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            rule_params,
        )
        counts["file_types"] += 1

    return counts
931
+
932
+
933
def _create_roots_v2_sqlite_backup(conn) -> dict:
    """Copy the live DB to a backup file and check the copy.

    Returns a skipped result when the DB file does not exist. Otherwise
    commits *conn*, backs it up with the SQLite backup API, reopens the copy
    and compares root counts. A copy holding fewer roots than the source is
    reported as ``backup_validation_failed``. Any exception is returned as
    ``{"ok": False, "error": ...}`` rather than raised.
    """
    if not local_context_db_path().is_file():
        return {"ok": True, "skipped": True, "reason": "db_missing"}
    conn.commit()
    snapshot_path = paths.create_backup_path("local-context-roots-v2", ".db")

    def root_count(connection) -> int:
        return int(connection.execute("SELECT COUNT(*) FROM local_index_roots").fetchone()[0] or 0)

    writer = None
    try:
        writer = sqlite3.connect(str(snapshot_path))
        conn.backup(writer)
        writer.close()
        # Cleared so the finally clause does not close it a second time.
        writer = None
        checker = sqlite3.connect(str(snapshot_path))
        try:
            source_roots = root_count(conn)
            backup_roots = root_count(checker)
        finally:
            checker.close()
        if backup_roots < source_roots:
            return {
                "ok": False,
                "error": "backup_validation_failed",
                "path": str(snapshot_path),
                "source_roots": source_roots,
                "backup_roots": backup_roots,
            }
        prune = paths.finalize_backup_snapshot(snapshot_path)
        return {
            "ok": True,
            "path": str(snapshot_path),
            "source_roots": source_roots,
            "backup_roots": backup_roots,
            "prune": prune,
        }
    except Exception as exc:
        return {"ok": False, "error": str(exc), "path": str(snapshot_path)}
    finally:
        if writer is not None:
            try:
                writer.close()
            except Exception:
                pass
969
+
970
+
971
def _archive_rebuild_local_context_for_roots_v2(conn, summary: dict) -> dict:
    """Archive the current DB files and rebuild a fresh DB from saved config.

    Steps: capture the configuration, commit and close the live DB, move the
    DB and its sidecar files into a new backup directory, open a fresh DB,
    re-seed core rules, restore the captured configuration, seed default
    roots, stamp the seed version and reset the initial-index flags.

    Any exception after the DB is closed is returned as ``{"ok": False, ...}``
    with the list of files already moved. No rollback is attempted.
    """
    live_db = local_context_db_path()
    preserved_config = _capture_roots_v2_config(conn)
    total_bytes = _local_context_db_size_bytes()
    archive_dir = paths.create_backup_dir("local-context-roots-v2")
    conn.commit()
    close_local_context_db()
    moved: list[dict] = []

    def report(ok: bool, **extra) -> dict:
        return {
            "ok": ok,
            "strategy": "archive_rebuild",
            "backup_dir": str(archive_dir),
            "size_bytes": total_bytes,
            "moved": moved,
            **extra,
        }

    try:
        for source_file in _local_context_sidecar_paths(live_db):
            if source_file.exists():
                destination = archive_dir / source_file.name
                shutil.move(str(source_file), str(destination))
                moved.append({"path": str(source_file), "backup_path": str(destination)})
        rebuilt = _conn()
        seed_core_file_type_rules(rebuilt)
        restored = _restore_roots_v2_config(rebuilt, preserved_config)
        seeded = _seed_default_roots_conn(rebuilt)
        _set_state_conn(rebuilt, ROOT_SEED_VERSION_KEY, str(DEFAULT_ROOT_SEED_VERSION))
        _set_initial_index_complete(rebuilt, False)
        _set_initial_index_started_at(rebuilt, now())
        rebuilt.commit()
        prune = paths.finalize_backup_snapshot(archive_dir)
        outcome = report(True, preserved=restored, seeded=seeded, prune=prune)
        log_event("info", "roots_seed_v2_archived_rebuilt", "Local memory roots seed v2 archived large DB and rebuilt config", summary=summary, result=outcome)
        return outcome
    except Exception as exc:
        return report(False, error=str(exc))
1016
+
1017
+
1018
def _is_disk_root_path(path: str) -> bool:
    """Return True when *path* is a whole-disk root.

    Recognises the POSIX root ``/``, a bare backslash, and a Windows drive
    root written as ``C:``, ``C:\\`` or ``C:/``.
    """
    normalized = norm_path(path)
    if normalized in {"/", "\\"}:
        return True
    # Accept either separator after the drive letter, so a drive root written
    # with a forward slash is still treated as a disk root.
    return bool(re.match(r"^[A-Za-z]:[\\/]?$", normalized))
1023
+
1024
+
1025
def _path_is_under_any(path: str, prefixes: list[str]) -> bool:
    """Return True when *path* equals, or lies beneath, any non-empty prefix."""
    target = norm_path(path)
    for prefix in prefixes:
        if not prefix:
            continue
        if target == prefix or target.startswith(_path_prefix(prefix)):
            return True
    return False
1028
+
1029
+
1030
def _best_root_id_for_path(path: str, roots: list[dict]) -> int | None:
    """Return the id of the most specific root that contains *path*.

    The most specific root is the one with the longest ``root_path``; the
    first one wins a tie. Returns ``None`` when no root contains the path.
    """
    target = norm_path(path)
    winner_length = -1
    winner_id: int | None = None
    for root in roots:
        root_path = str(root.get("root_path") or "")
        if not root_path:
            continue
        if target != root_path and not target.startswith(_path_prefix(root_path)):
            continue
        root_id = int(root.get("id") or 0)
        if len(root_path) > winner_length:
            winner_length = len(root_path)
            winner_id = root_id
    return winner_id
1041
+
1042
+
1043
+ def _purge_dir_ids(conn, dir_ids: list[str]) -> int:
1044
+ unique_ids = [item for item in dict.fromkeys(dir_ids) if item]
1045
+ deleted = 0
1046
+ for start in range(0, len(unique_ids), 500):
1047
+ batch = unique_ids[start:start + 500]
1048
+ placeholders = ",".join("?" for _ in batch)
1049
+ deleted += int(conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({placeholders})", tuple(batch)).rowcount or 0)
1050
+ return deleted
1051
+
1052
+
1053
def migrate_roots_seed_v2(*, dry_run: bool = True, _already_seeded: bool = False) -> dict:
    """Move legacy whole-disk roots to curated user roots and purge obvious noise.

    The migration is a no-op once the stored seed version equals
    ``DEFAULT_ROOT_SEED_VERSION``. Otherwise it classifies the active roots
    into "keep" and "legacy", plans which asset/dir rows to purge or
    re-parent, and then either reports the plan (``dry_run=True``) or applies
    it. Applying uses an archive-and-rebuild strategy when the database is
    larger than ``LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES`` and an in-place
    strategy (preceded by a backup) otherwise.

    Args:
        dry_run: When True, only compute and return the plan.
        _already_seeded: When True, skip ``seed_core_file_type_rules``.

    Returns:
        A summary dict. It always carries ``ok``, ``dry_run`` and ``needed``;
        when work is needed it also carries the planned counts, and after a
        real run ``db_size_bytes``, ``strategy``, ``cleanup`` and, on failure,
        ``error``.
    """
    conn = _conn()
    if not _already_seeded:
        seed_core_file_type_rules(conn)
    current_seed = _get_state_conn(conn, ROOT_SEED_VERSION_KEY, "0")
    if str(current_seed) == str(DEFAULT_ROOT_SEED_VERSION):
        return {"ok": True, "dry_run": dry_run, "needed": False, "seed_version": DEFAULT_ROOT_SEED_VERSION}

    active_roots = [dict(row) for row in conn.execute("SELECT * FROM local_index_roots WHERE status='active'").fetchall()]
    # First pass: everything that is not a core-seeded whole-disk root.
    keep_roots = [
        row for row in active_roots
        if str(row.get("status") or "") == "active" and not _roots_v2_is_legacy_disk_root(row)
    ]
    keep_prefixes = _roots_v2_root_paths(keep_roots)
    # Second pass: legacy roots are the whole-disk ones plus legacy/system
    # roots that _is_nested_path reports against one of the kept prefixes.
    legacy_disk_roots = [
        row for row in active_roots
        if _roots_v2_is_legacy_disk_root(row)
        or (
            str(row.get("source") or "legacy") in {"legacy", "system_default"}
            and any(_is_nested_path(prefix, str(row.get("root_path") or "")) for prefix in keep_prefixes)
        )
    ]
    keep_roots = [row for row in keep_roots if row not in legacy_disk_roots]
    keep_prefixes = _roots_v2_root_paths(keep_roots)
    legacy_ids = {int(row.get("id") or 0) for row in legacy_disk_roots}
    legacy_prefixes = _roots_v2_root_paths(legacy_disk_roots)

    asset_ids_to_purge, asset_remaps = _roots_v2_plan_assets(conn, legacy_ids, legacy_prefixes, keep_roots, keep_prefixes)
    dir_ids_to_purge, dir_remaps = _roots_v2_plan_dirs(conn, legacy_ids, legacy_prefixes, keep_roots, keep_prefixes)

    summary = {
        "ok": True,
        "dry_run": dry_run,
        "needed": True,
        "legacy_disk_roots": [str(row.get("root_path") or "") for row in legacy_disk_roots],
        "keep_roots": keep_prefixes,
        "assets_to_purge": len(asset_ids_to_purge),
        "dirs_to_purge": len(dir_ids_to_purge),
        "assets_to_remap": sum(len(items) for items in asset_remaps.values()),
        "dirs_to_remap": sum(len(items) for items in dir_remaps.values()),
        "cleanup": {},
    }
    if dry_run:
        return summary

    destructive = bool(
        asset_ids_to_purge
        or dir_ids_to_purge
        or legacy_ids
        or any(asset_remaps.values())
        or any(dir_remaps.values())
    )
    db_size = _local_context_db_size_bytes()
    summary["db_size_bytes"] = db_size
    if destructive and LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES > 0 and db_size > LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES:
        # Large database: archive it and rebuild instead of editing in place.
        rebuild = _archive_rebuild_local_context_for_roots_v2(conn, summary)
        summary["cleanup"] = rebuild
        summary["strategy"] = "archive_rebuild"
        summary["ok"] = bool(rebuild.get("ok"))
        if not rebuild.get("ok"):
            summary["error"] = str(rebuild.get("error") or "archive_rebuild_failed")
        return summary

    if destructive:
        # Never modify rows in place without a successful backup first.
        backup = _create_roots_v2_sqlite_backup(conn)
        summary["backup"] = backup
        if not backup.get("ok"):
            summary["ok"] = False
            summary["error"] = "migration_backup_failed"
            return summary

    _roots_v2_remap_root_ids(conn, "local_assets", "asset_id", asset_remaps)
    _roots_v2_remap_root_ids(conn, "local_index_dirs", "dir_id", dir_remaps)
    cleanup = _purge_asset_ids(conn, asset_ids_to_purge)
    cleanup["dirs"] = _purge_dir_ids(conn, dir_ids_to_purge)
    if legacy_ids:
        placeholders = ",".join("?" for _ in legacy_ids)
        conn.execute(f"DELETE FROM local_index_checkpoints WHERE root_id IN ({placeholders})", tuple(legacy_ids))
        conn.execute(
            f"UPDATE local_index_roots SET status='removed', source='core_removed', updated_at=? WHERE id IN ({placeholders})",
            (now(), *legacy_ids),
        )
    _set_state_conn(conn, ROOT_SEED_VERSION_KEY, str(DEFAULT_ROOT_SEED_VERSION))
    # Mark the initial index as not complete and restart its clock.
    _set_initial_index_complete(conn, False)
    _set_initial_index_started_at(conn, now())
    summary["cleanup"] = cleanup
    summary["strategy"] = "in_place"
    log_event("info", "roots_seed_v2_migrated", "Local memory roots seed v2 applied", summary=summary)
    return summary


def _roots_v2_is_legacy_disk_root(row: dict) -> bool:
    """Return True for a whole-disk root whose source is legacy/core/system seeded."""
    return (
        _is_disk_root_path(str(row.get("root_path") or ""))
        and str(row.get("source") or "legacy") in {"legacy", "core_default", "system_default"}
    )


def _roots_v2_root_paths(rows: list[dict]) -> list[str]:
    """Collect the non-empty ``root_path`` values of *rows* as strings."""
    return [str(row.get("root_path") or "") for row in rows if row.get("root_path")]


def _roots_v2_plan_assets(
    conn,
    legacy_ids: set[int],
    legacy_prefixes: list[str],
    keep_roots: list[dict],
    keep_prefixes: list[str],
) -> tuple[list[str], dict[int, list[str]]]:
    """Decide, per ``local_assets`` row, whether to purge it or re-parent it.

    Returns ``(asset_ids_to_purge, {new_root_id: [asset_id, ...]})``.
    """
    purge: list[str] = []
    remaps: dict[int, list[str]] = {}
    rows = conn.execute("SELECT asset_id, root_id, path, extension, privacy_class FROM local_assets").fetchall()
    for row in rows:
        path = str(row["path"] or "")
        under_legacy = int(row["root_id"] or 0) in legacy_ids or _path_is_under_any(path, legacy_prefixes)
        action = _file_type_action(conn, path)
        unsafe = should_skip_file(path) or str(row["privacy_class"] or "") in {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}
        if action == "ignore" or unsafe or (under_legacy and not _path_is_under_any(path, keep_prefixes)):
            purge.append(str(row["asset_id"]))
            continue
        if under_legacy:
            new_root_id = _best_root_id_for_path(path, keep_roots)
            if new_root_id:
                remaps.setdefault(new_root_id, []).append(str(row["asset_id"]))
    return purge, remaps


def _roots_v2_plan_dirs(
    conn,
    legacy_ids: set[int],
    legacy_prefixes: list[str],
    keep_roots: list[dict],
    keep_prefixes: list[str],
) -> tuple[list[str], dict[int, list[str]]]:
    """Decide, per ``local_index_dirs`` row, whether to purge it or re-parent it.

    Returns ``(dir_ids_to_purge, {new_root_id: [dir_id, ...]})``.
    """
    purge: list[str] = []
    remaps: dict[int, list[str]] = {}
    rows = conn.execute("SELECT dir_id, root_id, path FROM local_index_dirs").fetchall()
    for row in rows:
        path = str(row["path"] or "")
        under_legacy = int(row["root_id"] or 0) in legacy_ids or _path_is_under_any(path, legacy_prefixes)
        if should_skip_tree(path) or (under_legacy and not _path_is_under_any(path, keep_prefixes)):
            purge.append(str(row["dir_id"]))
            continue
        if under_legacy:
            new_root_id = _best_root_id_for_path(path, keep_roots)
            if new_root_id:
                remaps.setdefault(new_root_id, []).append(str(row["dir_id"]))
    return purge, remaps


def _roots_v2_remap_root_ids(conn, table: str, id_column: str, remaps: dict[int, list[str]]) -> None:
    """Point rows of *table* at their new root id, 500 ids per UPDATE.

    *table* and *id_column* are interpolated into the SQL, so they must be
    trusted identifiers supplied by this module, never user input.
    """
    for new_root_id, row_ids in remaps.items():
        for start in range(0, len(row_ids), 500):
            batch = row_ids[start:start + 500]
            placeholders = ",".join("?" for _ in batch)
            conn.execute(
                f"UPDATE {table} SET root_id=?, updated_at=? WHERE {id_column} IN ({placeholders})",
                (new_root_id, now(), *batch),
            )
418
1186
 
419
1187
 
420
1188
  def _should_skip_mounted_root(candidate: Path) -> bool:
@@ -667,20 +1435,26 @@ def repair_index_hygiene() -> dict:
667
1435
  return local_index_hygiene(fix=True)
668
1436
 
669
1437
 
670
def add_exclusion(path: str, *, reason: str = "user", source: str = "user", kind: str = "folder") -> dict:
    """Insert or update an exclusion row for *path* and commit it.

    The row is keyed by the normalized path; on conflict the display path,
    source, kind and reason are overwritten.

    Args:
        path: Path to exclude; stored as given in ``display_path``.
        reason: Free-form reason stored with the exclusion.
        source: Origin of the exclusion, passed through ``_normalize_source``.
        kind: Exclusion kind; trimmed and lower-cased, ``"folder"`` when empty.

    Returns:
        ``{"ok": True, "path": ..., "source": ..., "kind": ...}`` with the
        normalized values.
    """
    db = _conn()
    normalized_path = norm_path(path)
    normalized_source = _normalize_source(source)
    normalized_kind = str(kind or "folder").strip().lower()
    if not normalized_kind:
        normalized_kind = "folder"
    upsert_sql = """
        INSERT INTO local_index_exclusions(path, display_path, source, kind, reason, created_at)
        VALUES (?, ?, ?, ?, ?, ?)
        ON CONFLICT(path) DO UPDATE SET
            display_path=excluded.display_path,
            source=excluded.source,
            kind=excluded.kind,
            reason=excluded.reason
        """
    params = (normalized_path, path, normalized_source, normalized_kind, reason, now())
    db.execute(upsert_sql, params)
    db.commit()
    log_event("info", "exclusion_added", "Exclusion added", path=redact_path(normalized_path), reason=reason, source=normalized_source)
    result = {"ok": True, "path": normalized_path}
    result["source"] = normalized_source
    result["kind"] = normalized_kind
    return result
684
1458
 
685
1459
 
686
1460
  def remove_exclusion(path: str) -> dict:
@@ -1184,7 +1958,7 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
1184
1958
  raw_path = str(path)
1185
1959
  normalized = norm_path(raw_path)
1186
1960
  asset_id = stable_id("asset", normalized)
1187
- if should_skip_file(normalized):
1961
+ if not _should_index_file(conn, normalized):
1188
1962
  return asset_id, False, "skipped"
1189
1963
  perm = _permission_state(path)
1190
1964
  depth, privacy_class, depth_reason = classify_path(normalized)
@@ -1265,8 +2039,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
1265
2039
  """,
1266
2040
  (version_id, asset_id, fingerprint, int(st.st_size), float(st.st_mtime), now()),
1267
2041
  )
1268
- if should_extract(normalized, depth):
1269
- enqueue_job(conn, asset_id, "light_extraction", priority=_extraction_priority(path))
2042
+ if _should_extract_file(conn, normalized, depth):
2043
+ enqueue_job(conn, asset_id, "light_extraction", priority=_extraction_priority(path, conn=conn))
1270
2044
  enqueue_job(conn, asset_id, "graph", priority=40)
1271
2045
  return asset_id, changed, "ok"
1272
2046
 
@@ -1377,7 +2151,15 @@ def enqueue_job(conn, asset_id: str, job_type: str, *, priority: int = 50) -> st
1377
2151
  return job_id
1378
2152
 
1379
2153
 
1380
- def _extraction_priority(path: Path) -> int:
2154
+ def _extraction_priority(path: Path, *, conn=None) -> int:
2155
+ if conn is not None:
2156
+ rule = _effective_file_type_rule(conn, path.suffix.lower())
2157
+ try:
2158
+ priority = int(rule.get("priority") or 0)
2159
+ except Exception:
2160
+ priority = 0
2161
+ if priority > 0:
2162
+ return priority
1381
2163
  suffix = path.suffix.lower()
1382
2164
  if suffix in HIGH_VALUE_DOCUMENT_SUFFIXES:
1383
2165
  return 90
@@ -1385,7 +2167,7 @@ def _extraction_priority(path: Path) -> int:
1385
2167
  return 82
1386
2168
  if suffix in EMAIL_DOCUMENT_SUFFIXES or is_local_email_tree(str(path)):
1387
2169
  return 70
1388
- if suffix in {".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql", ".json", ".yaml", ".yml", ".toml", ".html", ".css"}:
2170
+ if suffix in CODE_DOCUMENT_SUFFIXES:
1389
2171
  return 55
1390
2172
  return 45
1391
2173
 
@@ -1465,7 +2247,7 @@ def _iter_files(
1465
2247
  continue
1466
2248
  if entry.is_file():
1467
2249
  normalized = norm_path(entry)
1468
- if should_skip_file(normalized):
2250
+ if not _should_index_file(conn, normalized):
1469
2251
  continue
1470
2252
  if start_after_norm and normalized <= start_after_norm:
1471
2253
  continue
@@ -1548,7 +2330,7 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
1548
2330
  _purge_asset_ids(conn, [row["asset_id"]])
1549
2331
  stats["excluded"] += 1
1550
2332
  continue
1551
- if should_skip_file(path):
2333
+ if not _should_index_file(conn, path):
1552
2334
  _purge_asset_ids(conn, [row["asset_id"]])
1553
2335
  stats["excluded"] += 1
1554
2336
  continue
@@ -1656,7 +2438,7 @@ def _scan_known_directory(
1656
2438
  stack.append(entry)
1657
2439
  continue
1658
2440
  if entry.is_file():
1659
- if should_skip_file(str(entry)):
2441
+ if not _should_index_file(conn, entry):
1660
2442
  continue
1661
2443
  seen_files.add(norm_path(entry))
1662
2444
  if stats["files_scanned"] >= file_limit:
@@ -1750,6 +2532,7 @@ def reconcile_live_changes(
1750
2532
  file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
1751
2533
  ) -> dict:
1752
2534
  conn = _conn()
2535
+ seed_core_file_type_rules(conn)
1753
2536
  if _is_paused():
1754
2537
  return {"ok": True, "paused": True, "assets": {}, "dirs": {}}
1755
2538
  exclusions = [row["path"] for row in list_exclusions(readonly=False)]
@@ -1781,6 +2564,7 @@ def reconcile_live_changes(
1781
2564
 
1782
2565
  def scan_once(*, limit: int | None = None) -> dict:
1783
2566
  conn = _conn()
2567
+ seed_core_file_type_rules(conn)
1784
2568
  if _is_paused():
1785
2569
  log_event("info", "scan_skipped_paused", "Local memory scan skipped because indexing is paused")
1786
2570
  return {"ok": True, "paused": True, "roots": 0, "seen": 0, "changed": 0, "errors": 0, "partial": False}
@@ -2970,6 +3754,7 @@ def _status_from_conn(conn, *, readonly: bool = False) -> dict:
2970
3754
  "volumes": volumes,
2971
3755
  "roots": roots,
2972
3756
  "exclusions": _list_exclusions_conn(conn),
3757
+ "file_types": _shape_file_type_rules(_list_file_type_rules_conn(conn)),
2973
3758
  "problems": problems,
2974
3759
  "permissions": [],
2975
3760
  "models": model_status()["models"],