nexo-brain 7.25.3 → 7.25.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +3 -1
- package/package.json +1 -1
- package/src/cli.py +66 -0
- package/src/db/_schema.py +23 -0
- package/src/local_context/__init__.py +10 -0
- package/src/local_context/api.py +832 -47
- package/src/local_context/db.py +45 -1
- package/src/local_context/extractors.py +17 -1
- package/src/server.py +26 -0
- package/tool-enforcement-map.json +31 -0
package/src/local_context/api.py
CHANGED
|
@@ -17,10 +17,10 @@ from typing import Any
|
|
|
17
17
|
|
|
18
18
|
import paths
|
|
19
19
|
from . import embeddings
|
|
20
|
-
from .db import LOCAL_CONTEXT_TABLES, close_local_context_db, connect_local_context_db_readonly, ensure_local_context_db, get_local_context_db
|
|
20
|
+
from .db import LOCAL_CONTEXT_TABLES, close_local_context_db, connect_local_context_db_readonly, ensure_local_context_db, get_local_context_db, local_context_db_path
|
|
21
21
|
from .extractors import canonical_entity_key, chunk_text, contains_secret, entities, entity_mentions, extract_text, normalize_entity_alias, summarize
|
|
22
22
|
from .logging import log_event, tail
|
|
23
|
-
from .privacy import classify_path, is_local_email_tree, is_queryable_path, should_extract, should_skip_file, should_skip_tree
|
|
23
|
+
from .privacy import classify_path, is_local_email_db, is_local_email_tree, is_queryable_path, should_extract, should_skip_file, should_skip_tree
|
|
24
24
|
from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
|
|
25
25
|
|
|
26
26
|
LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
|
|
@@ -34,6 +34,9 @@ DEFAULT_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_DEPTH", "24")
|
|
|
34
34
|
DEFAULT_EMAIL_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_EMAIL_ROOT_DEPTH", "24") or "24")
|
|
35
35
|
DEFAULT_MOUNTED_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_MOUNTED_ROOT_DEPTH", "24") or "24")
|
|
36
36
|
DEFAULT_SYSTEM_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_SYSTEM_ROOT_DEPTH", "24") or "24")
|
|
37
|
+
# Seed version stamped on roots created from the core defaults.
DEFAULT_ROOT_SEED_VERSION = 2
# State key under which the applied roots seed version is stored.
ROOT_SEED_VERSION_KEY = "local_index_roots_seed_version"
# Byte threshold, overridable through the environment; an unset or empty
# variable falls back to 2 GiB (2 * 1024**3 = 2147483648).
LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES = int(
    os.environ.get("NEXO_LOCAL_INDEX_V2_REBUILD_THRESHOLD_BYTES", "2147483648") or "2147483648"
)
|
|
37
40
|
DEFAULT_CONTEXT_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_MAX_CHARS", "20000") or "20000")
|
|
38
41
|
DEFAULT_ROUTER_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_ROUTER_MAX_CHARS", "6000") or "6000")
|
|
39
42
|
DEFAULT_MAX_JOB_ATTEMPTS = int(os.environ.get("NEXO_LOCAL_INDEX_MAX_JOB_ATTEMPTS", "3") or "3")
|
|
@@ -66,7 +69,6 @@ HIGH_VALUE_DOCUMENT_SUFFIXES = {
|
|
|
66
69
|
".pptx",
|
|
67
70
|
".pages",
|
|
68
71
|
".numbers",
|
|
69
|
-
".key",
|
|
70
72
|
".rtf",
|
|
71
73
|
".odt",
|
|
72
74
|
".ods",
|
|
@@ -84,6 +86,65 @@ EMAIL_DOCUMENT_SUFFIXES = {
|
|
|
84
86
|
".emlx",
|
|
85
87
|
".msg",
|
|
86
88
|
}
|
|
89
|
+
# Source-code and structured-config suffixes; the core defaults seed these
# with the "extract" action.
CODE_DOCUMENT_SUFFIXES = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql",
    ".json", ".yaml", ".yml", ".toml", ".html", ".css",
}
# Image suffixes; the core defaults seed these with the "metadata" action.
IMAGE_METADATA_SUFFIXES = {
    ".jpg", ".jpeg", ".png", ".gif", ".heic", ".webp",
    ".tif", ".tiff", ".bmp", ".raw", ".dng",
}
# Audio/video suffixes; the core defaults seed these with the "metadata" action.
MEDIA_METADATA_SUFFIXES = {
    ".mp3", ".m4a", ".wav", ".aac", ".flac",
    ".mp4", ".mov", ".avi", ".mkv", ".m4v",
}
# Binary and transient suffixes; the core defaults seed these with "ignore".
IGNORED_BINARY_SUFFIXES = {
    ".app", ".bin", ".class", ".dll", ".dmg", ".dylib", ".exe", ".iso", ".jar",
    ".lock", ".o", ".obj", ".pyc", ".so", ".swp", ".swo", ".tmp",
}
|
|
87
148
|
HIGH_VALUE_DIRECTORY_NAMES = {
|
|
88
149
|
"users",
|
|
89
150
|
"home",
|
|
@@ -213,26 +274,249 @@ def _with_sqlite_busy_retry(callback, *, attempts: int | None = None):
|
|
|
213
274
|
return None
|
|
214
275
|
|
|
215
276
|
|
|
216
|
-
def
|
|
277
|
+
def _normalize_source(source: str | None) -> str:
    """Return a canonical source label.

    The value is stringified, stripped, lower-cased and has ``-`` replaced by
    ``_``. A missing, empty or whitespace-only value becomes ``"user"``.
    """
    raw = str(source) if source else "user"
    cleaned = raw.strip().lower().replace("-", "_")
    if not cleaned:
        return "user"
    return cleaned
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _normalize_extension(extension: str) -> str:
    """Return ``extension`` stripped, lower-cased and with a leading dot.

    A falsy or whitespace-only value yields the empty string.
    """
    cleaned = str(extension or "").strip().lower()
    if cleaned and not cleaned.startswith("."):
        return "." + cleaned
    return cleaned
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _normalize_file_type_action(action: str | None) -> str:
    """Map an action alias onto ``"extract"``, ``"metadata"`` or ``"ignore"``.

    Matching is case-insensitive after stripping whitespace. Any value that is
    not a known extract or metadata alias (including empty or ``None``) maps
    to ``"ignore"``.
    """
    aliases = {
        "include": "extract",
        "extract": "extract",
        "read": "extract",
        "full": "extract",
        "metadata": "metadata",
        "inventory": "metadata",
        "index": "metadata",
    }
    token = str(action or "").strip().lower()
    # The explicit ignore aliases (exclude/ignore/skip/blocked) and every
    # unknown value share the same fallback.
    return aliases.get(token, "ignore")
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _default_file_type_rule_specs() -> list[dict]:
    """Build the built-in file type rule specs.

    Each suffix group contributes one spec per suffix (sorted within the
    group), carrying the group's action, priority and reason. Groups are
    emitted in descending priority order.
    """
    # NOTE(review): a suffix present in more than one group yields several
    # specs; whether the groups overlap is not visible from here -- confirm.
    groups = (
        (HIGH_VALUE_DOCUMENT_SUFFIXES, "extract", 90, "core_high_value_document"),
        (KNOWN_TEXT_SUFFIXES, "extract", 82, "core_text_document"),
        (EMAIL_DOCUMENT_SUFFIXES, "extract", 70, "core_email_document"),
        (CODE_DOCUMENT_SUFFIXES, "extract", 55, "core_code_document"),
        (IMAGE_METADATA_SUFFIXES, "metadata", 35, "core_photo_metadata"),
        (MEDIA_METADATA_SUFFIXES, "metadata", 25, "core_media_metadata"),
        (IGNORED_BINARY_SUFFIXES, "ignore", 0, "core_binary_or_transient"),
    )
    return [
        {"extension": suffix, "action": action, "priority": priority, "reason": reason}
        for suffixes, action, priority, reason in groups
        for suffix in sorted(suffixes)
    ]
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def seed_core_file_type_rules(conn=None) -> dict:
    """Upsert every built-in rule with source ``core_default``.

    Uses ``conn`` when given, otherwise the shared connection from
    ``_conn()``. Does not commit. Returns ``{"ok": True, "rules": <count>}``
    where the count is the number of specs written.
    """
    db = conn or _conn()
    stamp = now()
    upsert_sql = """
        INSERT INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, 'core_default', ?, ?, ?, ?)
        ON CONFLICT(extension, source) DO UPDATE SET
            action=excluded.action,
            priority=excluded.priority,
            reason=excluded.reason,
            updated_at=excluded.updated_at
        """
    written = 0
    for written, spec in enumerate(_default_file_type_rule_specs(), start=1):
        params = (spec["extension"], spec["action"], int(spec["priority"]), spec["reason"], stamp, stamp)
        db.execute(upsert_sql, params)
    return {"ok": True, "rules": written}
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _list_file_type_rules_conn(conn) -> list[dict]:
    """Return all file type rules as dicts.

    Rows are ordered with ``user`` rules first, then ``core_default``, then
    any other source, and by extension within each source bucket.
    """
    cursor = conn.execute(
        """
        SELECT *
        FROM local_index_file_type_rules
        ORDER BY
            CASE source WHEN 'user' THEN 0 WHEN 'core_default' THEN 1 ELSE 2 END,
            extension
        """
    )
    return list(map(dict, cursor.fetchall()))
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def _shape_file_type_rules(rows: list[dict]) -> dict:
    """Wrap rule rows in the response shape with one effective rule per extension.

    For each extension the first row seen is kept, unless a row whose source
    is ``"user"`` appears, which always replaces the current choice.
    """
    winners: dict[str, dict] = {}
    for rule in rows:
        key = str(rule.get("extension") or "")
        if key in winners and rule.get("source") != "user":
            continue
        winners[key] = rule
    return {"ok": True, "rules": rows, "effective": list(winners.values())}
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _effective_file_type_rule(conn, extension: str) -> dict:
    """Resolve the rule that applies to ``extension``.

    A stored rule is preferred in the order ``user``, ``core_default``, other
    sources. When nothing is stored an implicit rule is synthesised:
    ``ignore`` for a missing or unknown extension.
    """
    ext = _normalize_extension(extension)
    if not ext:
        return {"extension": "", "action": "ignore", "source": "implicit", "priority": 0, "reason": "missing_extension"}
    stored = conn.execute(
        """
        SELECT *
        FROM local_index_file_type_rules
        WHERE extension=?
        ORDER BY CASE source WHEN 'user' THEN 0 WHEN 'core_default' THEN 1 ELSE 2 END
        LIMIT 1
        """,
        (ext,),
    ).fetchone()
    if stored is not None:
        return dict(stored)
    # NOTE(review): is_local_email_tree is handed the extension here, while
    # other call sites pass a full path -- confirm this branch can ever match.
    if is_local_email_tree(ext):
        return {"extension": ext, "action": "extract", "source": "implicit", "priority": 70, "reason": "local_email"}
    return {"extension": ext, "action": "ignore", "source": "implicit", "priority": 0, "reason": "unknown_extension"}
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def list_file_type_rules(*, readonly: bool = True) -> dict:
    """Return all file type rules plus the effective rule per extension.

    With ``readonly=True`` (default) a read connection is opened and closed
    around the query. With ``readonly=False`` the shared connection is used,
    and the core default rules are re-seeded and committed before listing.
    """
    if readonly:
        reader = _read_conn()
        try:
            found = _list_file_type_rules_conn(reader)
        finally:
            _close_read_conn(reader)
        return _shape_file_type_rules(found)

    writer = _conn()
    seed_core_file_type_rules(writer)
    writer.commit()
    return _shape_file_type_rules(_list_file_type_rules_conn(writer))
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _purge_assets_by_extension(conn, extension: str) -> dict:
    """Purge every asset whose lower-cased extension equals ``extension``.

    Returns ``{"assets": 0}`` for an empty extension, otherwise whatever
    ``_purge_asset_ids`` returns for the matching asset ids.
    """
    ext = _normalize_extension(extension)
    if not ext:
        return {"assets": 0}
    matches = conn.execute("SELECT asset_id FROM local_assets WHERE lower(extension)=?", (ext,)).fetchall()
    asset_ids = [str(match["asset_id"]) for match in matches]
    return _purge_asset_ids(conn, asset_ids)
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def set_file_type_rule(extension: str, *, action: str = "extract", source: str = "user", priority: int | None = None, reason: str = "user") -> dict:
    """Create or update the rule for ``extension`` under ``source``.

    When ``priority`` is omitted it defaults to 82 for ``extract``, 20 for
    ``metadata`` and 0 otherwise. A ``user`` rule that resolves to ``ignore``
    also purges already-indexed assets with that extension. Commits the
    change and returns the stored values, or
    ``{"ok": False, "error": "extension_required"}`` for an empty extension.
    """
    conn = _conn()
    ext = _normalize_extension(extension)
    if not ext:
        return {"ok": False, "error": "extension_required"}

    # NOTE(review): an unrecognised action normalises to "ignore", which for
    # a user rule triggers the purge below -- confirm that is intended.
    normalized_action = _normalize_file_type_action(action)
    source_value = _normalize_source(source)
    if priority is not None:
        priority_value = int(priority)
    else:
        priority_value = {"extract": 82, "metadata": 20}.get(normalized_action, 0)

    stamp = now()
    conn.execute(
        """
        INSERT INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(extension, source) DO UPDATE SET
            action=excluded.action,
            priority=excluded.priority,
            reason=excluded.reason,
            updated_at=excluded.updated_at
        """,
        (ext, normalized_action, source_value, priority_value, reason, stamp, stamp),
    )
    if normalized_action == "ignore" and source_value == "user":
        cleanup = _purge_assets_by_extension(conn, ext)
    else:
        cleanup = {"assets": 0}
    conn.commit()
    log_event("info", "file_type_rule_set", "Local memory file type rule set", extension=ext, action=normalized_action, source=source_value, cleanup=cleanup)
    return {
        "ok": True,
        "extension": ext,
        "action": normalized_action,
        "source": source_value,
        "priority": priority_value,
        "cleanup": cleanup,
    }
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def remove_file_type_rule(extension: str, *, source: str = "user") -> dict:
    """Delete the rule stored for ``extension`` under ``source``.

    Returns ``{"ok": False, "error": "extension_required"}`` when the
    extension normalises to an empty string, matching ``set_file_type_rule``.
    Otherwise deletes the row, commits, logs the event and returns the
    normalised extension and source together with ``removed``, the number of
    rows actually deleted (0 when no such rule existed).
    """
    ext = _normalize_extension(extension)
    if not ext:
        # Nothing can match an empty extension; report the bad input instead
        # of logging a removal that never happened.
        return {"ok": False, "error": "extension_required"}
    conn = _conn()
    source_value = _normalize_source(source)
    cursor = conn.execute(
        "DELETE FROM local_index_file_type_rules WHERE extension=? AND source=?",
        (ext, source_value),
    )
    removed = int(cursor.rowcount or 0)
    conn.commit()
    log_event("info", "file_type_rule_removed", "Local memory file type rule removed", extension=ext, source=source_value, removed=removed)
    return {"ok": True, "extension": ext, "source": source_value, "removed": removed}
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def reset_file_type_rules() -> dict:
    """Drop all ``user`` rules and re-seed the core defaults.

    Commits the change, logs how many user rules were deleted, and returns
    the deletion count, the number of core rules seeded and the refreshed
    rule listing.
    """
    conn = _conn()
    cursor = conn.execute("DELETE FROM local_index_file_type_rules WHERE source='user'")
    deleted = int(cursor.rowcount or 0)
    seeded = seed_core_file_type_rules(conn)
    conn.commit()
    log_event("info", "file_type_rules_reset", "Local memory user file type overrides reset", deleted=deleted)
    core_rules = int(seeded.get("rules") or 0)
    listing = list_file_type_rules(readonly=False)
    return {"ok": True, "deleted": deleted, "core_rules": core_rules, "file_types": listing}
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _file_type_action(conn, path: str | Path) -> str:
    """Return the indexing action for ``path``.

    Paths recognised as a local email database or email tree are always
    ``"extract"``. Everything else follows the effective rule for the
    lower-cased file suffix, defaulting to ``"ignore"``.
    """
    target = Path(path)
    text = str(path)
    if is_local_email_db(text) or is_local_email_tree(text):
        return "extract"
    rule = _effective_file_type_rule(conn, target.suffix.lower())
    return str(rule.get("action") or "ignore")
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def _should_index_file(conn, path: str | Path) -> bool:
    """Return True when ``path`` is not skipped and its action is not ``"ignore"``."""
    skipped = should_skip_file(str(path))
    return not skipped and _file_type_action(conn, path) != "ignore"
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _should_extract_file(conn, path: str | Path, depth: int) -> bool:
    """Return True when the content of ``path`` should be extracted.

    Extraction requires ``depth`` of at least 2, a path that is not skipped,
    and a file type action of ``"extract"``.
    """
    if depth < 2:
        return False
    if should_skip_file(str(path)):
        return False
    return _file_type_action(conn, path) == "extract"
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def add_root(path: str, *, mode: str = "normal", depth: int | None = None, source: str = "user", remote: bool = False, seed_version: int | None = None) -> dict:
|
|
217
474
|
conn = _conn()
|
|
218
475
|
root_path = norm_path(path)
|
|
219
476
|
if should_skip_tree(root_path) and not _allow_explicit_blocked_root(root_path):
|
|
220
477
|
log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
|
|
221
478
|
return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
|
|
222
479
|
depth_value = 2 if depth is None else int(depth)
|
|
223
|
-
|
|
480
|
+
source_value = _normalize_source(source)
|
|
481
|
+
seed_value = int(seed_version if seed_version is not None else (DEFAULT_ROOT_SEED_VERSION if source_value == "core_default" else 0))
|
|
482
|
+
existing = conn.execute("SELECT id, status, source, depth FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
|
|
483
|
+
if existing and str(existing["status"] or "") == "active" and source_value == "user" and str(existing["source"] or "") == "core_default":
|
|
484
|
+
return {"ok": True, "root_path": root_path, "mode": mode, "depth": int(existing["depth"] or depth_value), "already_included": True, "included_by": "core_default"}
|
|
485
|
+
if source_value == "user":
|
|
486
|
+
parent = conn.execute(
|
|
487
|
+
"""
|
|
488
|
+
SELECT root_path, source, depth
|
|
489
|
+
FROM local_index_roots
|
|
490
|
+
WHERE status='active' AND source='core_default'
|
|
491
|
+
ORDER BY length(root_path) DESC
|
|
492
|
+
"""
|
|
493
|
+
).fetchall()
|
|
494
|
+
for row in parent:
|
|
495
|
+
parent_path = str(row["root_path"] or "")
|
|
496
|
+
if _is_nested_path(root_path, parent_path):
|
|
497
|
+
return {
|
|
498
|
+
"ok": True,
|
|
499
|
+
"root_path": root_path,
|
|
500
|
+
"already_included": True,
|
|
501
|
+
"included_by": "core_default",
|
|
502
|
+
"included_root": parent_path,
|
|
503
|
+
"depth": int(row["depth"] or depth_value),
|
|
504
|
+
}
|
|
224
505
|
conn.execute(
|
|
225
506
|
"""
|
|
226
|
-
INSERT INTO local_index_roots(root_path, display_path, mode, depth, status, created_at, updated_at)
|
|
227
|
-
VALUES (?, ?, ?, ?, 'active', ?, ?)
|
|
507
|
+
INSERT INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
|
|
508
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, 'active', ?, ?)
|
|
228
509
|
ON CONFLICT(root_path) DO UPDATE SET
|
|
229
510
|
display_path=excluded.display_path,
|
|
230
511
|
mode=excluded.mode,
|
|
231
512
|
depth=excluded.depth,
|
|
513
|
+
source=excluded.source,
|
|
514
|
+
remote=excluded.remote,
|
|
515
|
+
seed_version=excluded.seed_version,
|
|
232
516
|
status='active',
|
|
233
517
|
updated_at=excluded.updated_at
|
|
234
518
|
""",
|
|
235
|
-
(root_path, path, mode, depth_value, now(), now()),
|
|
519
|
+
(root_path, path, mode, depth_value, source_value, 1 if remote else 0, seed_value, now(), now()),
|
|
236
520
|
)
|
|
237
521
|
row = conn.execute("SELECT id FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
|
|
238
522
|
existing_status = str(existing["status"] or "") if existing else ""
|
|
@@ -241,8 +525,8 @@ def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> di
|
|
|
241
525
|
_set_initial_index_complete(conn, False)
|
|
242
526
|
_set_initial_index_started_at(conn, now())
|
|
243
527
|
conn.commit()
|
|
244
|
-
log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value)
|
|
245
|
-
return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value}
|
|
528
|
+
log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value, source=source_value)
|
|
529
|
+
return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value, "source": source_value, "remote": bool(remote)}
|
|
246
530
|
|
|
247
531
|
|
|
248
532
|
def remove_root(path: str) -> dict:
|
|
@@ -331,15 +615,40 @@ def _mounted_volume_roots() -> list[str]:
|
|
|
331
615
|
|
|
332
616
|
|
|
333
617
|
def _system_volume_roots() -> list[str]:
    """Return the system volume roots to index; opt-in only.

    Yields ``["/"]`` on non-Windows platforms when
    ``NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS`` is ``1``/``true``/``yes``
    (case-insensitive). In every other case, including Windows, the result
    is empty.
    """
    opted_in = os.environ.get("NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS", "").strip().lower() in {"1", "true", "yes"}
    # Without the opt-in nothing is returned, so the
    # NEXO_LOCAL_INDEX_DISABLE_SYSTEM_ROOTS switch cannot change the outcome.
    if not opted_in or sys.platform.startswith("win"):
        return []
    return ["/"]
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def _user_content_roots() -> list[str]:
    """Return the user content directories that exist and may be indexed.

    Candidates are the home directory, plus OneDrive folders (fixed names and
    the ``OneDrive*`` environment variables) on Windows, or the iCloud
    ``com~apple~CloudDocs`` folder on macOS. A candidate is kept when it is an
    existing directory that is not privacy-skipped, or is explicitly allowed.
    The result is passed through ``_dedupe_roots``.
    """
    home = Path.home()
    candidates: list[Path] = [home]
    if sys.platform.startswith("win"):
        candidates += [home / name for name in ("OneDrive", "OneDrive - Personal", "OneDrive - Empresa")]
        for variable in ("OneDrive", "OneDriveCommercial", "OneDriveConsumer"):
            configured = os.environ.get(variable, "").strip()
            if configured:
                candidates.append(Path(configured))
    elif sys.platform == "darwin":
        candidates.append(home / "Library" / "Mobile Documents" / "com~apple~CloudDocs")

    accepted: list[str] = []
    for candidate in candidates:
        try:
            if not (candidate.exists() and candidate.is_dir()):
                continue
            location = str(candidate)
            if not should_skip_tree(location) or _allow_explicit_blocked_root(location):
                accepted.append(location)
        except Exception:
            # Best effort: a candidate that cannot be inspected is left out.
            continue
    return _dedupe_roots(accepted)
|
|
343
652
|
|
|
344
653
|
|
|
345
654
|
def _local_email_roots() -> list[str]:
|
|
@@ -379,42 +688,501 @@ def default_roots() -> list[str]:
|
|
|
379
688
|
|
|
380
689
|
|
|
381
690
|
def default_root_specs() -> list[tuple[str, int]]:
    """Return the default ``(root, depth)`` specs, de-duplicated.

    Order of contribution: user content roots, system roots (only with
    ``NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS``), mounted volumes (only with
    ``NEXO_LOCAL_INDEX_INCLUDE_MOUNTED_ROOTS``), roots listed in
    ``NEXO_LOCAL_INDEX_DEFAULT_ROOTS`` (``os.pathsep``-separated), and the
    local email roots.
    """

    def _flag(name: str) -> bool:
        return os.environ.get(name, "").strip().lower() in {"1", "true", "yes"}

    configured = os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_ROOTS", "").strip()
    system_specs: list[tuple[str, int]] = (
        [(root, DEFAULT_SYSTEM_ROOT_DEPTH) for root in _system_volume_roots()]
        if _flag("NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS")
        else []
    )
    mounted_specs = (
        [(root, DEFAULT_MOUNTED_ROOT_DEPTH) for root in _mounted_volume_roots()]
        if _flag("NEXO_LOCAL_INDEX_INCLUDE_MOUNTED_ROOTS")
        else []
    )
    configured_specs = [(item, DEFAULT_ROOT_DEPTH) for item in configured.split(os.pathsep) if item.strip()]
    user_specs = [(root, DEFAULT_ROOT_DEPTH) for root in _user_content_roots()]
    combined = user_specs + system_specs + mounted_specs + configured_specs
    email_specs = [(root, DEFAULT_EMAIL_ROOT_DEPTH) for root in _local_email_roots()]
    return _dedupe_root_specs(combined + email_specs)
|
|
394
705
|
|
|
395
706
|
|
|
396
|
-
def
|
|
397
|
-
|
|
707
|
+
def _all_roots_by_path_conn(conn) -> dict[str, dict]:
    """Return every ``local_index_roots`` row as a dict, keyed by ``root_path``."""
    by_path: dict[str, dict] = {}
    for record in conn.execute("SELECT * FROM local_index_roots ORDER BY root_path").fetchall():
        by_path[str(record["root_path"])] = dict(record)
    return by_path
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
def _seed_default_roots_conn(conn) -> dict:
    """Insert or deepen the default roots on ``conn`` without committing.

    For each default spec whose path is an existing directory:

    * a root not yet stored is inserted as an active ``core_default`` root;
    * a stored root with status ``removed`` is left alone and reported;
    * a stored root with a smaller depth is raised to the default depth and
      re-stamped as ``core_default`` with the current seed version.

    Returns the ``created``, ``updated`` and ``skipped_removed`` entries.
    """
    known = _all_roots_by_path_conn(conn)
    report: dict = {"created": [], "updated": [], "skipped_removed": []}
    for root, depth in default_root_specs():
        candidate = Path(root).expanduser()
        if not (candidate.exists() and candidate.is_dir()):
            continue
        root_path = norm_path(str(candidate))
        current = known.get(root_path)

        if not current:
            stamp = now()
            conn.execute(
                """
                INSERT INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
                VALUES (?, ?, 'normal', ?, 'core_default', 0, ?, 'active', ?, ?)
                """,
                (root_path, str(candidate), int(depth), DEFAULT_ROOT_SEED_VERSION, stamp, stamp),
            )
            report["created"].append({"root_path": root_path, "depth": int(depth)})
            # Remember the insert so a repeated spec for the same path is not
            # inserted twice.
            known[root_path] = {
                "root_path": root_path,
                "display_path": str(candidate),
                "mode": "normal",
                "depth": int(depth),
                "source": "core_default",
                "remote": 0,
                "seed_version": DEFAULT_ROOT_SEED_VERSION,
                "status": "active",
            }
            continue

        if str(current.get("status") or "") == "removed":
            report["skipped_removed"].append({"root_path": root_path})
            continue

        if int(current.get("depth") or 0) < depth:
            conn.execute(
                "UPDATE local_index_roots SET depth=?, source='core_default', seed_version=?, updated_at=? WHERE root_path=?",
                (depth, DEFAULT_ROOT_SEED_VERSION, now(), root_path),
            )
            report["updated"].append({"root_path": root_path, "depth": depth})
    return report
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
def ensure_default_roots() -> dict:
    """Seed core file type rules and default roots, then run the v2 migration.

    Returns the number of roots created, updated and skipped because they
    were removed, the migration result, and fresh listings of the roots and
    file type rules.
    """
    conn = _conn()
    seed_core_file_type_rules(conn)
    seeded = _seed_default_roots_conn(conn)
    migration = migrate_roots_seed_v2(dry_run=False, _already_seeded=True)
    try:
        conn.commit()
    except sqlite3.ProgrammingError:
        # A large legacy DB may have been archived and replaced during
        # migration, leaving this connection unusable for the commit.
        pass
    counts = {bucket: len(seeded[bucket]) for bucket in ("created", "updated", "skipped_removed")}
    return {
        "ok": True,
        **counts,
        "migration": migration,
        "roots": list_roots(readonly=False),
        "file_types": list_file_type_rules(readonly=False),
    }
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
def _local_context_sidecar_paths(db_path: Path) -> list[Path]:
    """Return ``db_path`` followed by its ``-wal`` and ``-shm`` sibling paths."""
    siblings = [db_path.with_name(db_path.name + tag) for tag in ("-wal", "-shm")]
    return [db_path, *siblings]
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
def _local_context_db_size_bytes() -> int:
    """Return the combined size in bytes of the local context DB and its sidecars.

    Files that do not exist, or that raise ``OSError`` while being inspected,
    contribute nothing.
    """
    size = 0
    for item in _local_context_sidecar_paths(local_context_db_path()):
        try:
            if not item.exists():
                continue
            size += int(item.stat().st_size)
        except OSError:
            continue
    return size
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
def _capture_roots_v2_config(conn) -> dict:
    """Snapshot the configuration rows read from ``conn``.

    Captured:

    * ``state``: all ``local_index_state`` rows except per-root initial-scan
      markers, the roots seed version and the initial-index flags;
    * ``roots``: user roots, remote roots, removed roots, and active
      ``core_default`` roots that are not a disk root;
    * ``exclusions``: every exclusion row;
    * ``file_types``: ``user`` file type rules only.
    """

    def _rows(sql: str, *args) -> list[dict]:
        return [dict(record) for record in conn.execute(sql, *args).fetchall()]

    def _keep_root(root: dict) -> bool:
        origin = str(root.get("source") or "legacy")
        state = str(root.get("status") or "")
        location = str(root.get("root_path") or "")
        if origin == "user" or bool(root.get("remote")) or state == "removed":
            return True
        return origin == "core_default" and state == "active" and not _is_disk_root_path(location)

    state_rows = _rows(
        """
        SELECT key, value, updated_at
        FROM local_index_state
        WHERE key NOT LIKE 'root_initial_scan:%'
          AND key NOT IN (?, ?, ?)
        ORDER BY key
        """,
        (ROOT_SEED_VERSION_KEY, INITIAL_INDEX_COMPLETE_KEY, INITIAL_INDEX_STARTED_AT_KEY),
    )
    all_roots = _rows(
        """
        SELECT root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at
        FROM local_index_roots
        ORDER BY root_path
        """
    )
    root_rows = [root for root in all_roots if _keep_root(root)]
    exclusion_rows = _rows(
        """
        SELECT path, display_path, source, kind, reason, created_at
        FROM local_index_exclusions
        ORDER BY path
        """
    )
    file_type_rows = _rows(
        """
        SELECT extension, action, source, priority, reason, created_at, updated_at
        FROM local_index_file_type_rules
        WHERE source='user'
        ORDER BY extension
        """
    )
    return {
        "state": state_rows,
        "roots": root_rows,
        "exclusions": exclusion_rows,
        "file_types": file_type_rows,
    }
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
def _restore_roots_v2_config(conn, config: dict) -> dict:
    """Write a captured configuration snapshot into ``conn`` without committing.

    Every section uses ``INSERT OR REPLACE``. Roots and exclusions with an
    empty normalised path, and file type rules with an empty extension, are
    skipped. Missing timestamps fall back to the current time. Returns the
    number of rows written per section.
    """
    stamp = now()
    counts = {"state": 0, "roots": 0, "exclusions": 0, "file_types": 0}

    def _when(value) -> float:
        return float(value or stamp)

    state_sql = """
        INSERT OR REPLACE INTO local_index_state(key, value, updated_at)
        VALUES (?, ?, ?)
        """
    for entry in config.get("state") or []:
        conn.execute(state_sql, (entry.get("key"), entry.get("value") or "", _when(entry.get("updated_at"))))
        counts["state"] += 1

    roots_sql = """
        INSERT OR REPLACE INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    for entry in config.get("roots") or []:
        root_path = norm_path(str(entry.get("root_path") or ""))
        if not root_path:
            continue
        params = (
            root_path,
            entry.get("display_path") or root_path,
            entry.get("mode") or "normal",
            int(entry.get("depth") or DEFAULT_ROOT_DEPTH),
            _normalize_source(entry.get("source") or "user"),
            1 if entry.get("remote") else 0,
            int(entry.get("seed_version") or 0),
            entry.get("status") or "active",
            _when(entry.get("created_at")),
            _when(entry.get("updated_at")),
        )
        conn.execute(roots_sql, params)
        counts["roots"] += 1

    exclusions_sql = """
        INSERT OR REPLACE INTO local_index_exclusions(path, display_path, source, kind, reason, created_at)
        VALUES (?, ?, ?, ?, ?, ?)
        """
    for entry in config.get("exclusions") or []:
        exclusion_path = norm_path(str(entry.get("path") or ""))
        if not exclusion_path:
            continue
        params = (
            exclusion_path,
            entry.get("display_path") or exclusion_path,
            _normalize_source(entry.get("source") or "user"),
            entry.get("kind") or "folder",
            entry.get("reason") or "user",
            _when(entry.get("created_at")),
        )
        conn.execute(exclusions_sql, params)
        counts["exclusions"] += 1

    file_types_sql = """
        INSERT OR REPLACE INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        """
    for entry in config.get("file_types") or []:
        extension = _normalize_extension(str(entry.get("extension") or ""))
        if not extension:
            continue
        params = (
            extension,
            _normalize_file_type_action(str(entry.get("action") or "ignore")),
            _normalize_source(entry.get("source") or "user"),
            int(entry.get("priority") or 0),
            entry.get("reason") or "user",
            _when(entry.get("created_at")),
            _when(entry.get("updated_at")),
        )
        conn.execute(file_types_sql, params)
        counts["file_types"] += 1

    return counts
|
|
931
|
+
|
|
932
|
+
|
|
933
|
+
def _create_roots_v2_sqlite_backup(conn) -> dict:
    """Copy the local context DB to a backup file and validate the copy.

    Skips (``ok`` True, ``skipped`` True) when the DB file is missing.
    Otherwise commits ``conn``, copies it with ``Connection.backup`` and
    compares the ``local_index_roots`` row count of source and copy. Fewer
    rows in the copy is reported as ``backup_validation_failed``. Any
    exception is turned into an ``ok`` False result carrying the error text.
    """
    source_db = local_context_db_path()
    if not source_db.is_file():
        return {"ok": True, "skipped": True, "reason": "db_missing"}
    conn.commit()
    snapshot = paths.create_backup_path("local-context-roots-v2", ".db")
    count_sql = "SELECT COUNT(*) FROM local_index_roots"
    writer = None
    try:
        writer = sqlite3.connect(str(snapshot))
        conn.backup(writer)
        writer.close()
        # Cleared so the outer finally does not close it a second time.
        writer = None

        checker = sqlite3.connect(str(snapshot))
        try:
            source_roots = int(conn.execute(count_sql).fetchone()[0] or 0)
            backup_roots = int(checker.execute(count_sql).fetchone()[0] or 0)
        finally:
            checker.close()

        if backup_roots >= source_roots:
            prune = paths.finalize_backup_snapshot(snapshot)
            return {
                "ok": True,
                "path": str(snapshot),
                "source_roots": source_roots,
                "backup_roots": backup_roots,
                "prune": prune,
            }
        return {
            "ok": False,
            "error": "backup_validation_failed",
            "path": str(snapshot),
            "source_roots": source_roots,
            "backup_roots": backup_roots,
        }
    except Exception as exc:
        return {"ok": False, "error": str(exc), "path": str(snapshot)}
    finally:
        if writer is not None:
            try:
                writer.close()
            except Exception:
                pass
|
|
969
|
+
|
|
970
|
+
|
|
971
|
+
def _archive_rebuild_local_context_for_roots_v2(conn, summary: dict) -> dict:
    """Archive the current local-context database and rebuild a fresh one.

    The configuration captured by ``_capture_roots_v2_config`` is read first.
    The connection is then committed and closed, and every existing path
    yielded by ``_local_context_sidecar_paths`` is moved into a new backup
    directory. A fresh database is opened, file-type rules are seeded, the
    captured configuration is restored, default roots are seeded and the
    seed/initial-index state is reset.

    Args:
        conn: Open connection to the database being archived.
        summary: Migration plan; only forwarded to the log event.

    Returns:
        A dict with ``ok``, ``strategy`` (``"archive_rebuild"``),
        ``backup_dir``, ``size_bytes`` and ``moved``. On success it also has
        ``preserved``, ``seeded`` and ``prune``; on failure it has ``error``.
    """
    db_path = local_context_db_path()
    config = _capture_roots_v2_config(conn)
    size_bytes = _local_context_db_size_bytes()
    backup_dir = paths.create_backup_dir("local-context-roots-v2")
    conn.commit()
    close_local_context_db()

    moved = []
    try:
        for source_file in _local_context_sidecar_paths(db_path):
            if source_file.exists():
                destination = backup_dir / source_file.name
                shutil.move(str(source_file), str(destination))
                moved.append({"path": str(source_file), "backup_path": str(destination)})

        # The old files are gone from the live location at this point.
        fresh = _conn()
        seed_core_file_type_rules(fresh)
        restored = _restore_roots_v2_config(fresh, config)
        seeded = _seed_default_roots_conn(fresh)
        _set_state_conn(fresh, ROOT_SEED_VERSION_KEY, str(DEFAULT_ROOT_SEED_VERSION))
        _set_initial_index_complete(fresh, False)
        _set_initial_index_started_at(fresh, now())
        fresh.commit()
        prune = paths.finalize_backup_snapshot(backup_dir)
    except Exception as exc:
        # NOTE(review): files already moved into the backup directory are not
        # moved back here; callers only learn about them through ``moved``.
        return {
            "ok": False,
            "strategy": "archive_rebuild",
            "backup_dir": str(backup_dir),
            "size_bytes": size_bytes,
            "moved": moved,
            "error": str(exc),
        }
    else:
        result = {
            "ok": True,
            "strategy": "archive_rebuild",
            "backup_dir": str(backup_dir),
            "size_bytes": size_bytes,
            "moved": moved,
            "preserved": restored,
            "seeded": seeded,
            "prune": prune,
        }
        try:
            log_event("info", "roots_seed_v2_archived_rebuilt", "Local memory roots seed v2 archived large DB and rebuilt config", summary=summary, result=result)
        except Exception as exc:
            # Same failure shape as any other error raised during the rebuild.
            return {
                "ok": False,
                "strategy": "archive_rebuild",
                "backup_dir": str(backup_dir),
                "size_bytes": size_bytes,
                "moved": moved,
                "error": str(exc),
            }
        return result
|
|
1016
|
+
|
|
1017
|
+
|
|
1018
|
+
def _is_disk_root_path(path: str) -> bool:
    """Return True when ``path`` normalizes to a filesystem or drive root.

    Accepted forms after ``norm_path``: ``/``, a single backslash, or a drive
    letter followed by ``:`` with an optional trailing backslash.
    """
    candidate = norm_path(path)
    if candidate == "/" or candidate == "\\":
        return True
    drive_match = re.match(r"^[A-Za-z]:\\?$", candidate)
    return drive_match is not None
|
|
1023
|
+
|
|
1024
|
+
|
|
1025
|
+
def _path_is_under_any(path: str, prefixes: list[str]) -> bool:
    """Return True when the normalized ``path`` equals or lies below a prefix.

    Empty prefixes are ignored. Containment is decided by comparing against
    ``_path_prefix(prefix)``.
    """
    target = norm_path(path)
    for base in prefixes:
        if not base:
            continue
        if target == base or target.startswith(_path_prefix(base)):
            return True
    return False
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
def _best_root_id_for_path(path: str, roots: list[dict]) -> int | None:
    """Pick the id of the most specific root that contains ``path``.

    A root matches when the normalized path equals its ``root_path`` or starts
    with ``_path_prefix(root_path)``. The matching root with the longest
    ``root_path`` wins; on equal length the earliest one in ``roots`` is kept.

    Returns:
        The winning root's ``id`` (0 when the row has no id), or ``None`` when
        no root matches.
    """
    target = norm_path(path)
    winner_len = -1
    winner_id = None
    for root in roots:
        candidate_path = str(root.get("root_path") or "")
        if not candidate_path:
            continue
        if target != candidate_path and not target.startswith(_path_prefix(candidate_path)):
            continue
        candidate_id = int(root.get("id") or 0)
        # Strict comparison keeps the first root among equally long matches.
        if len(candidate_path) > winner_len:
            winner_len = len(candidate_path)
            winner_id = candidate_id
    return winner_id
|
|
1041
|
+
|
|
1042
|
+
|
|
1043
|
+
def _purge_dir_ids(conn, dir_ids: list[str]) -> int:
    """Delete the given directory rows from ``local_index_dirs``.

    Falsy ids are dropped and duplicates are removed while keeping the first
    occurrence. Deletes run in chunks of at most 500 bound parameters.

    Returns:
        The sum of the ``rowcount`` values reported by the DELETE statements.
    """
    chunk_size = 500
    pending = list(filter(None, dict.fromkeys(dir_ids)))
    removed = 0
    offset = 0
    while offset < len(pending):
        chunk = pending[offset:offset + chunk_size]
        placeholders = ",".join(["?"] * len(chunk))
        cursor = conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({placeholders})", tuple(chunk))
        removed += int(cursor.rowcount or 0)
        offset += chunk_size
    return removed
|
|
1051
|
+
|
|
1052
|
+
|
|
1053
|
+
def migrate_roots_seed_v2(*, dry_run: bool = True, _already_seeded: bool = False) -> dict:
    """Move legacy whole-disk roots to curated user roots and purge obvious noise.

    The function first builds a plan: which active roots are legacy disk
    roots, which assets/directories must be purged, and which must be
    re-pointed at a kept root. With ``dry_run`` the plan is returned as is.
    Otherwise it is applied either by archiving and rebuilding the database
    (when the plan is destructive and the database is larger than
    ``LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES``) or in place after a SQLite
    backup has been taken and validated.

    Args:
        dry_run: When true, only compute and return the plan; no rows are
            remapped, purged or marked removed.
        _already_seeded: Internal flag; when true the call to
            ``seed_core_file_type_rules`` is skipped.

    Returns:
        A summary dict. ``needed`` is False when the stored seed version
        already equals ``DEFAULT_ROOT_SEED_VERSION``. Otherwise it contains
        the plan counters and, after a real run, ``db_size_bytes``,
        ``strategy``, ``cleanup`` and possibly ``backup`` / ``error``.
    """
    conn = _conn()
    if not _already_seeded:
        seed_core_file_type_rules(conn)
    current_seed = _get_state_conn(conn, ROOT_SEED_VERSION_KEY, "0")
    if str(current_seed) == str(DEFAULT_ROOT_SEED_VERSION):
        return {"ok": True, "dry_run": dry_run, "needed": False, "seed_version": DEFAULT_ROOT_SEED_VERSION}

    def _root_path(row: dict) -> str:
        # Root path of a roots row, or "" when missing.
        return str(row.get("root_path") or "")

    def _source(row: dict) -> str:
        # Source of a roots row; rows without one are treated as legacy.
        return str(row.get("source") or "legacy")

    def _is_default_disk_root(row: dict) -> bool:
        # Whole-disk root that was not added explicitly by the user.
        return _is_disk_root_path(_root_path(row)) and _source(row) in {"legacy", "core_default", "system_default"}

    def _remap_root_ids(table: str, id_column: str, remaps: dict[int, list[str]]) -> None:
        # Re-point rows at their new root in chunks of 500 bound ids.
        # ``table`` and ``id_column`` are internal constants, never user input.
        for new_root_id, ids in remaps.items():
            for start in range(0, len(ids), 500):
                batch = ids[start:start + 500]
                placeholders = ",".join("?" for _ in batch)
                conn.execute(
                    f"UPDATE {table} SET root_id=?, updated_at=? WHERE {id_column} IN ({placeholders})",
                    (new_root_id, now(), *batch),
                )

    # ---- Plan: classify roots -------------------------------------------
    active_roots = [dict(row) for row in conn.execute("SELECT * FROM local_index_roots WHERE status='active'").fetchall()]
    keep_roots = [
        row for row in active_roots
        if str(row.get("status") or "") == "active" and not _is_default_disk_root(row)
    ]
    keep_prefixes = [_root_path(row) for row in keep_roots if row.get("root_path")]
    # Legacy roots are default disk roots plus legacy/system roots that
    # ``_is_nested_path`` reports as nested relative to a kept root.
    legacy_disk_roots = [
        row for row in active_roots
        if _is_default_disk_root(row)
        or (
            _source(row) in {"legacy", "system_default"}
            and any(_is_nested_path(prefix, _root_path(row)) for prefix in keep_prefixes)
        )
    ]
    keep_roots = [row for row in keep_roots if row not in legacy_disk_roots]
    keep_prefixes = [_root_path(row) for row in keep_roots if row.get("root_path")]
    legacy_ids = {int(row.get("id") or 0) for row in legacy_disk_roots}
    legacy_prefixes = [_root_path(row) for row in legacy_disk_roots if row.get("root_path")]

    # ---- Plan: classify assets ------------------------------------------
    asset_ids_to_purge: list[str] = []
    asset_remaps: dict[int, list[str]] = {}
    asset_rows = conn.execute("SELECT asset_id, root_id, path, extension, privacy_class FROM local_assets").fetchall()
    for row in asset_rows:
        path = str(row["path"] or "")
        under_legacy = int(row["root_id"] or 0) in legacy_ids or _path_is_under_any(path, legacy_prefixes)
        action = _file_type_action(conn, path)
        unsafe = should_skip_file(path) or str(row["privacy_class"] or "") in {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}
        if action == "ignore" or unsafe or (under_legacy and not _path_is_under_any(path, keep_prefixes)):
            asset_ids_to_purge.append(str(row["asset_id"]))
            continue
        if under_legacy:
            new_root_id = _best_root_id_for_path(path, keep_roots)
            if new_root_id:
                asset_remaps.setdefault(new_root_id, []).append(str(row["asset_id"]))

    # ---- Plan: classify directories -------------------------------------
    dir_ids_to_purge: list[str] = []
    dir_remaps: dict[int, list[str]] = {}
    dir_rows = conn.execute("SELECT dir_id, root_id, path FROM local_index_dirs").fetchall()
    for row in dir_rows:
        path = str(row["path"] or "")
        under_legacy = int(row["root_id"] or 0) in legacy_ids or _path_is_under_any(path, legacy_prefixes)
        if should_skip_tree(path) or (under_legacy and not _path_is_under_any(path, keep_prefixes)):
            dir_ids_to_purge.append(str(row["dir_id"]))
            continue
        if under_legacy:
            new_root_id = _best_root_id_for_path(path, keep_roots)
            if new_root_id:
                dir_remaps.setdefault(new_root_id, []).append(str(row["dir_id"]))

    summary = {
        "ok": True,
        "dry_run": dry_run,
        "needed": True,
        "legacy_disk_roots": [_root_path(row) for row in legacy_disk_roots],
        "keep_roots": keep_prefixes,
        "assets_to_purge": len(asset_ids_to_purge),
        "dirs_to_purge": len(dir_ids_to_purge),
        "assets_to_remap": sum(len(items) for items in asset_remaps.values()),
        "dirs_to_remap": sum(len(items) for items in dir_remaps.values()),
        "cleanup": {},
    }
    if dry_run:
        return summary

    # ---- Apply -----------------------------------------------------------
    destructive = bool(
        asset_ids_to_purge
        or dir_ids_to_purge
        or legacy_ids
        or any(asset_remaps.values())
        or any(dir_remaps.values())
    )
    db_size = _local_context_db_size_bytes()
    summary["db_size_bytes"] = db_size
    if destructive and LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES > 0 and db_size > LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES:
        # Large database: archive it and start from a fresh one instead of
        # rewriting rows in place.
        rebuild = _archive_rebuild_local_context_for_roots_v2(conn, summary)
        summary["cleanup"] = rebuild
        summary["strategy"] = "archive_rebuild"
        summary["ok"] = bool(rebuild.get("ok"))
        if not rebuild.get("ok"):
            summary["error"] = str(rebuild.get("error") or "archive_rebuild_failed")
        return summary

    if destructive:
        # Never mutate in place without a validated snapshot.
        backup = _create_roots_v2_sqlite_backup(conn)
        summary["backup"] = backup
        if not backup.get("ok"):
            summary["ok"] = False
            summary["error"] = "migration_backup_failed"
            return summary

    _remap_root_ids("local_assets", "asset_id", asset_remaps)
    _remap_root_ids("local_index_dirs", "dir_id", dir_remaps)
    cleanup = _purge_asset_ids(conn, asset_ids_to_purge)
    cleanup["dirs"] = _purge_dir_ids(conn, dir_ids_to_purge)
    if legacy_ids:
        # One ordered list so both statements bind the ids identically.
        legacy_id_list = sorted(legacy_ids)
        placeholders = ",".join("?" for _ in legacy_id_list)
        conn.execute(f"DELETE FROM local_index_checkpoints WHERE root_id IN ({placeholders})", tuple(legacy_id_list))
        conn.execute(
            f"UPDATE local_index_roots SET status='removed', source='core_removed', updated_at=? WHERE id IN ({placeholders})",
            (now(), *legacy_id_list),
        )
    _set_state_conn(conn, ROOT_SEED_VERSION_KEY, str(DEFAULT_ROOT_SEED_VERSION))
    _set_initial_index_complete(conn, False)
    _set_initial_index_started_at(conn, now())
    # Persist the migration explicitly, mirroring the archive/rebuild path,
    # so the result does not depend on a later unrelated commit.
    conn.commit()
    summary["cleanup"] = cleanup
    summary["strategy"] = "in_place"
    log_event("info", "roots_seed_v2_migrated", "Local memory roots seed v2 applied", summary=summary)
    return summary
|
|
418
1186
|
|
|
419
1187
|
|
|
420
1188
|
def _should_skip_mounted_root(candidate: Path) -> bool:
|
|
@@ -667,20 +1435,26 @@ def repair_index_hygiene() -> dict:
|
|
|
667
1435
|
return local_index_hygiene(fix=True)
|
|
668
1436
|
|
|
669
1437
|
|
|
670
|
-
def add_exclusion(path: str, *, reason: str = "user", source: str = "user", kind: str = "folder") -> dict:
    """Insert or update an indexing exclusion for ``path``.

    Args:
        path: Path to exclude; stored normalized, with the raw value kept as
            the display path.
        reason: Free-text reason stored with the exclusion.
        source: Origin of the exclusion, passed through ``_normalize_source``.
        kind: Exclusion kind; stripped and lower-cased, with empty values
            falling back to ``"folder"``.

    Returns:
        ``{"ok": True, "path": ..., "source": ..., "kind": ...}`` holding the
        normalized values that were written.
    """
    conn = _conn()
    excluded_path = norm_path(path)
    source_value = _normalize_source(source)
    kind_value = str(kind or "folder").strip().lower() or "folder"

    # An existing row for the same path keeps its created_at; every other
    # column is overwritten with the new values.
    upsert_sql = """
        INSERT INTO local_index_exclusions(path, display_path, source, kind, reason, created_at)
        VALUES (?, ?, ?, ?, ?, ?)
        ON CONFLICT(path) DO UPDATE SET
            display_path=excluded.display_path,
            source=excluded.source,
            kind=excluded.kind,
            reason=excluded.reason
        """
    parameters = (excluded_path, path, source_value, kind_value, reason, now())
    conn.execute(upsert_sql, parameters)
    conn.commit()

    log_event("info", "exclusion_added", "Exclusion added", path=redact_path(excluded_path), reason=reason, source=source_value)
    outcome = {"ok": True, "path": excluded_path}
    outcome["source"] = source_value
    outcome["kind"] = kind_value
    return outcome
|
|
684
1458
|
|
|
685
1459
|
|
|
686
1460
|
def remove_exclusion(path: str) -> dict:
|
|
@@ -1184,7 +1958,7 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
|
|
|
1184
1958
|
raw_path = str(path)
|
|
1185
1959
|
normalized = norm_path(raw_path)
|
|
1186
1960
|
asset_id = stable_id("asset", normalized)
|
|
1187
|
-
if
|
|
1961
|
+
if not _should_index_file(conn, normalized):
|
|
1188
1962
|
return asset_id, False, "skipped"
|
|
1189
1963
|
perm = _permission_state(path)
|
|
1190
1964
|
depth, privacy_class, depth_reason = classify_path(normalized)
|
|
@@ -1265,8 +2039,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
|
|
|
1265
2039
|
""",
|
|
1266
2040
|
(version_id, asset_id, fingerprint, int(st.st_size), float(st.st_mtime), now()),
|
|
1267
2041
|
)
|
|
1268
|
-
if
|
|
1269
|
-
enqueue_job(conn, asset_id, "light_extraction", priority=_extraction_priority(path))
|
|
2042
|
+
if _should_extract_file(conn, normalized, depth):
|
|
2043
|
+
enqueue_job(conn, asset_id, "light_extraction", priority=_extraction_priority(path, conn=conn))
|
|
1270
2044
|
enqueue_job(conn, asset_id, "graph", priority=40)
|
|
1271
2045
|
return asset_id, changed, "ok"
|
|
1272
2046
|
|
|
@@ -1377,7 +2151,15 @@ def enqueue_job(conn, asset_id: str, job_type: str, *, priority: int = 50) -> st
|
|
|
1377
2151
|
return job_id
|
|
1378
2152
|
|
|
1379
2153
|
|
|
1380
|
-
def _extraction_priority(path: Path) -> int:
|
|
2154
|
+
def _extraction_priority(path: Path, *, conn=None) -> int:
|
|
2155
|
+
if conn is not None:
|
|
2156
|
+
rule = _effective_file_type_rule(conn, path.suffix.lower())
|
|
2157
|
+
try:
|
|
2158
|
+
priority = int(rule.get("priority") or 0)
|
|
2159
|
+
except Exception:
|
|
2160
|
+
priority = 0
|
|
2161
|
+
if priority > 0:
|
|
2162
|
+
return priority
|
|
1381
2163
|
suffix = path.suffix.lower()
|
|
1382
2164
|
if suffix in HIGH_VALUE_DOCUMENT_SUFFIXES:
|
|
1383
2165
|
return 90
|
|
@@ -1385,7 +2167,7 @@ def _extraction_priority(path: Path) -> int:
|
|
|
1385
2167
|
return 82
|
|
1386
2168
|
if suffix in EMAIL_DOCUMENT_SUFFIXES or is_local_email_tree(str(path)):
|
|
1387
2169
|
return 70
|
|
1388
|
-
if suffix in
|
|
2170
|
+
if suffix in CODE_DOCUMENT_SUFFIXES:
|
|
1389
2171
|
return 55
|
|
1390
2172
|
return 45
|
|
1391
2173
|
|
|
@@ -1465,7 +2247,7 @@ def _iter_files(
|
|
|
1465
2247
|
continue
|
|
1466
2248
|
if entry.is_file():
|
|
1467
2249
|
normalized = norm_path(entry)
|
|
1468
|
-
if
|
|
2250
|
+
if not _should_index_file(conn, normalized):
|
|
1469
2251
|
continue
|
|
1470
2252
|
if start_after_norm and normalized <= start_after_norm:
|
|
1471
2253
|
continue
|
|
@@ -1548,7 +2330,7 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
|
|
|
1548
2330
|
_purge_asset_ids(conn, [row["asset_id"]])
|
|
1549
2331
|
stats["excluded"] += 1
|
|
1550
2332
|
continue
|
|
1551
|
-
if
|
|
2333
|
+
if not _should_index_file(conn, path):
|
|
1552
2334
|
_purge_asset_ids(conn, [row["asset_id"]])
|
|
1553
2335
|
stats["excluded"] += 1
|
|
1554
2336
|
continue
|
|
@@ -1656,7 +2438,7 @@ def _scan_known_directory(
|
|
|
1656
2438
|
stack.append(entry)
|
|
1657
2439
|
continue
|
|
1658
2440
|
if entry.is_file():
|
|
1659
|
-
if
|
|
2441
|
+
if not _should_index_file(conn, entry):
|
|
1660
2442
|
continue
|
|
1661
2443
|
seen_files.add(norm_path(entry))
|
|
1662
2444
|
if stats["files_scanned"] >= file_limit:
|
|
@@ -1750,6 +2532,7 @@ def reconcile_live_changes(
|
|
|
1750
2532
|
file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
|
|
1751
2533
|
) -> dict:
|
|
1752
2534
|
conn = _conn()
|
|
2535
|
+
seed_core_file_type_rules(conn)
|
|
1753
2536
|
if _is_paused():
|
|
1754
2537
|
return {"ok": True, "paused": True, "assets": {}, "dirs": {}}
|
|
1755
2538
|
exclusions = [row["path"] for row in list_exclusions(readonly=False)]
|
|
@@ -1781,6 +2564,7 @@ def reconcile_live_changes(
|
|
|
1781
2564
|
|
|
1782
2565
|
def scan_once(*, limit: int | None = None) -> dict:
|
|
1783
2566
|
conn = _conn()
|
|
2567
|
+
seed_core_file_type_rules(conn)
|
|
1784
2568
|
if _is_paused():
|
|
1785
2569
|
log_event("info", "scan_skipped_paused", "Local memory scan skipped because indexing is paused")
|
|
1786
2570
|
return {"ok": True, "paused": True, "roots": 0, "seen": 0, "changed": 0, "errors": 0, "partial": False}
|
|
@@ -2970,6 +3754,7 @@ def _status_from_conn(conn, *, readonly: bool = False) -> dict:
|
|
|
2970
3754
|
"volumes": volumes,
|
|
2971
3755
|
"roots": roots,
|
|
2972
3756
|
"exclusions": _list_exclusions_conn(conn),
|
|
3757
|
+
"file_types": _shape_file_type_rules(_list_file_type_rules_conn(conn)),
|
|
2973
3758
|
"problems": problems,
|
|
2974
3759
|
"permissions": [],
|
|
2975
3760
|
"models": model_status()["models"],
|