nexo-brain 7.25.3 → 7.25.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +5 -1
- package/package.json +1 -1
- package/src/cli.py +66 -0
- package/src/db/_schema.py +23 -0
- package/src/local_context/__init__.py +10 -0
- package/src/local_context/api.py +921 -67
- package/src/local_context/db.py +45 -1
- package/src/local_context/extractors.py +17 -1
- package/src/server.py +26 -0
- package/tool-enforcement-map.json +31 -0
package/src/local_context/api.py
CHANGED
|
@@ -17,10 +17,10 @@ from typing import Any
|
|
|
17
17
|
|
|
18
18
|
import paths
|
|
19
19
|
from . import embeddings
|
|
20
|
-
from .db import LOCAL_CONTEXT_TABLES, close_local_context_db, connect_local_context_db_readonly, ensure_local_context_db, get_local_context_db
|
|
20
|
+
from .db import LOCAL_CONTEXT_TABLES, close_local_context_db, connect_local_context_db_readonly, ensure_local_context_db, get_local_context_db, local_context_db_path
|
|
21
21
|
from .extractors import canonical_entity_key, chunk_text, contains_secret, entities, entity_mentions, extract_text, normalize_entity_alias, summarize
|
|
22
22
|
from .logging import log_event, tail
|
|
23
|
-
from .privacy import classify_path, is_local_email_tree, is_queryable_path, should_extract, should_skip_file, should_skip_tree
|
|
23
|
+
from .privacy import classify_path, is_local_email_db, is_local_email_tree, is_queryable_path, should_extract, should_skip_file, should_skip_tree
|
|
24
24
|
from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
|
|
25
25
|
|
|
26
26
|
LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
|
|
@@ -34,6 +34,9 @@ DEFAULT_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_DEPTH", "24")
|
|
|
34
34
|
DEFAULT_EMAIL_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_EMAIL_ROOT_DEPTH", "24") or "24")
|
|
35
35
|
DEFAULT_MOUNTED_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_MOUNTED_ROOT_DEPTH", "24") or "24")
|
|
36
36
|
DEFAULT_SYSTEM_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_SYSTEM_ROOT_DEPTH", "24") or "24")
|
|
37
|
+
DEFAULT_ROOT_SEED_VERSION = 2
|
|
38
|
+
ROOT_SEED_VERSION_KEY = "local_index_roots_seed_version"
|
|
39
|
+
LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES = int(os.environ.get("NEXO_LOCAL_INDEX_V2_REBUILD_THRESHOLD_BYTES", str(2 * 1024 * 1024 * 1024)) or str(2 * 1024 * 1024 * 1024))
|
|
37
40
|
DEFAULT_CONTEXT_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_MAX_CHARS", "20000") or "20000")
|
|
38
41
|
DEFAULT_ROUTER_MAX_CHARS = int(os.environ.get("NEXO_LOCAL_CONTEXT_ROUTER_MAX_CHARS", "6000") or "6000")
|
|
39
42
|
DEFAULT_MAX_JOB_ATTEMPTS = int(os.environ.get("NEXO_LOCAL_INDEX_MAX_JOB_ATTEMPTS", "3") or "3")
|
|
@@ -66,7 +69,6 @@ HIGH_VALUE_DOCUMENT_SUFFIXES = {
|
|
|
66
69
|
".pptx",
|
|
67
70
|
".pages",
|
|
68
71
|
".numbers",
|
|
69
|
-
".key",
|
|
70
72
|
".rtf",
|
|
71
73
|
".odt",
|
|
72
74
|
".ods",
|
|
@@ -84,6 +86,65 @@ EMAIL_DOCUMENT_SUFFIXES = {
|
|
|
84
86
|
".emlx",
|
|
85
87
|
".msg",
|
|
86
88
|
}
|
|
89
|
+
# Source-code and structured-config extensions; seeded as "extract" rules
# (priority 55) by _default_file_type_rule_specs().
CODE_DOCUMENT_SUFFIXES = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql",
    ".json", ".yaml", ".yml", ".toml", ".html", ".css",
}

# Still-image extensions; seeded as "metadata" rules (priority 35).
IMAGE_METADATA_SUFFIXES = {
    ".jpg", ".jpeg", ".png", ".gif", ".heic", ".webp",
    ".tif", ".tiff", ".bmp", ".raw", ".dng",
}

# Audio/video extensions; seeded as "metadata" rules (priority 25).
MEDIA_METADATA_SUFFIXES = {
    ".mp3", ".m4a", ".wav", ".aac", ".flac",
    ".mp4", ".mov", ".avi", ".mkv", ".m4v",
}

# Binary or transient extensions; seeded as "ignore" rules (priority 0).
IGNORED_BINARY_SUFFIXES = {
    ".app", ".bin", ".class", ".dll", ".dmg", ".dylib", ".exe", ".iso", ".jar",
    ".lock", ".o", ".obj", ".pyc", ".so", ".swp", ".swo", ".tmp",
}
|
|
87
148
|
HIGH_VALUE_DIRECTORY_NAMES = {
|
|
88
149
|
"users",
|
|
89
150
|
"home",
|
|
@@ -213,26 +274,250 @@ def _with_sqlite_busy_retry(callback, *, attempts: int | None = None):
|
|
|
213
274
|
return None
|
|
214
275
|
|
|
215
276
|
|
|
216
|
-
def
|
|
277
|
+
def _normalize_source(source: str | None) -> str:
|
|
278
|
+
value = str(source or "user").strip().lower().replace("-", "_")
|
|
279
|
+
return value or "user"
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _normalize_extension(extension: str) -> str:
|
|
283
|
+
value = str(extension or "").strip().lower()
|
|
284
|
+
if not value:
|
|
285
|
+
return ""
|
|
286
|
+
if not value.startswith("."):
|
|
287
|
+
value = "." + value
|
|
288
|
+
return value
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _normalize_file_type_action(action: str | None) -> str:
|
|
292
|
+
value = str(action or "").strip().lower()
|
|
293
|
+
if value in {"include", "extract", "read", "full"}:
|
|
294
|
+
return "extract"
|
|
295
|
+
if value in {"metadata", "inventory", "index"}:
|
|
296
|
+
return "metadata"
|
|
297
|
+
if value in {"exclude", "ignore", "skip", "blocked"}:
|
|
298
|
+
return "ignore"
|
|
299
|
+
return "ignore"
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _default_file_type_rule_specs() -> list[dict]:
    """Build the built-in file-type rule specs from the module suffix sets.

    Returns one dict per suffix with ``extension``, ``action``, ``priority``
    and ``reason`` keys. Groups are emitted in the order listed below, and
    suffixes are sorted within each group.
    """
    groups = (
        (HIGH_VALUE_DOCUMENT_SUFFIXES, "extract", 90, "core_high_value_document"),
        (KNOWN_TEXT_SUFFIXES, "extract", 82, "core_text_document"),
        (EMAIL_DOCUMENT_SUFFIXES, "extract", 70, "core_email_document"),
        (CODE_DOCUMENT_SUFFIXES, "extract", 55, "core_code_document"),
        (IMAGE_METADATA_SUFFIXES, "metadata", 35, "core_photo_metadata"),
        (MEDIA_METADATA_SUFFIXES, "metadata", 25, "core_media_metadata"),
        (IGNORED_BINARY_SUFFIXES, "ignore", 0, "core_binary_or_transient"),
    )
    return [
        {"extension": suffix, "action": action, "priority": priority, "reason": reason}
        for suffixes, action, priority, reason in groups
        for suffix in sorted(suffixes)
    ]
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def seed_core_file_type_rules(conn=None) -> dict:
    """Upsert every built-in file-type rule under source ``core_default``.

    Args:
        conn: Database connection to use; when falsy, ``_conn()`` supplies one.

    Returns:
        ``{"ok": True, "rules": <number of specs written>}``.

    The caller is responsible for committing; this function does not commit.
    """
    if not conn:
        conn = _conn()
    upsert_sql = """
        INSERT INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, 'core_default', ?, ?, ?, ?)
        ON CONFLICT(extension, source) DO UPDATE SET
            action=excluded.action,
            priority=excluded.priority,
            reason=excluded.reason,
            updated_at=excluded.updated_at
        """
    written = 0
    stamp = now()  # one timestamp shared by every row in this seeding pass
    for rule in _default_file_type_rule_specs():
        params = (rule["extension"], rule["action"], int(rule["priority"]), rule["reason"], stamp, stamp)
        conn.execute(upsert_sql, params)
        written += 1
    return {"ok": True, "rules": written}
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _list_file_type_rules_conn(conn) -> list[dict]:
|
|
343
|
+
rows = conn.execute(
|
|
344
|
+
"""
|
|
345
|
+
SELECT *
|
|
346
|
+
FROM local_index_file_type_rules
|
|
347
|
+
ORDER BY
|
|
348
|
+
CASE source WHEN 'user' THEN 0 WHEN 'core_default' THEN 1 ELSE 2 END,
|
|
349
|
+
extension
|
|
350
|
+
"""
|
|
351
|
+
).fetchall()
|
|
352
|
+
return [dict(row) for row in rows]
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def _shape_file_type_rules(rows: list[dict]) -> dict:
|
|
356
|
+
effective: dict[str, dict] = {}
|
|
357
|
+
for row in rows:
|
|
358
|
+
ext = str(row.get("extension") or "")
|
|
359
|
+
if ext not in effective or row.get("source") == "user":
|
|
360
|
+
effective[ext] = row
|
|
361
|
+
return {"ok": True, "rules": rows, "effective": list(effective.values())}
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _effective_file_type_rule(conn, extension: str) -> dict:
    """Resolve the rule that applies to ``extension``.

    A stored rule wins, preferring source ``user`` over ``core_default`` over
    anything else. With no stored rule an implicit rule dict is returned:
    ``ignore`` for a missing or unknown extension, or ``extract`` when
    ``is_local_email_tree`` accepts the normalized extension.
    """
    ext = _normalize_extension(extension)
    if not ext:
        return {"extension": "", "action": "ignore", "source": "implicit", "priority": 0, "reason": "missing_extension"}
    lookup_sql = """
        SELECT *
        FROM local_index_file_type_rules
        WHERE extension=?
        ORDER BY CASE source WHEN 'user' THEN 0 WHEN 'core_default' THEN 1 ELSE 2 END
        LIMIT 1
        """
    match = conn.execute(lookup_sql, (ext,)).fetchone()
    if match is not None:
        return dict(match)
    # NOTE(review): is_local_email_tree receives a bare extension here, whereas
    # _file_type_action passes it a full path — confirm this check is intended.
    if is_local_email_tree(ext):
        return {"extension": ext, "action": "extract", "source": "implicit", "priority": 70, "reason": "local_email"}
    return {"extension": ext, "action": "ignore", "source": "implicit", "priority": 0, "reason": "unknown_extension"}
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def list_file_type_rules(*, readonly: bool = True) -> dict:
    """Return all file-type rules plus the effective rule per extension.

    Args:
        readonly: When true (default), read through a read connection that is
            closed afterwards. When false, use the writable connection, seed
            the core rules and commit before reading.

    Returns:
        The dict produced by ``_shape_file_type_rules``.
    """
    if readonly:
        reader = _read_conn()
        try:
            rows = _list_file_type_rules_conn(reader)
        finally:
            _close_read_conn(reader)
        return _shape_file_type_rules(rows)

    writer = _conn()
    seed_core_file_type_rules(writer)
    writer.commit()
    return _shape_file_type_rules(_list_file_type_rules_conn(writer))
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _purge_assets_by_extension(conn, extension: str) -> dict:
    """Purge every stored asset whose extension matches ``extension``.

    The match is case-insensitive on the stored value. Returns
    ``{"assets": 0}`` when the extension normalizes to empty; otherwise
    returns the result of ``_purge_asset_ids`` for the matching asset ids.
    """
    ext = _normalize_extension(extension)
    if not ext:
        return {"assets": 0}
    cursor = conn.execute("SELECT asset_id FROM local_assets WHERE lower(extension)=?", (ext,))
    asset_ids = [str(found["asset_id"]) for found in cursor.fetchall()]
    return _purge_asset_ids(conn, asset_ids)
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def set_file_type_rule(extension: str, *, action: str = "extract", source: str = "user", priority: int | None = None, reason: str = "user") -> dict:
    """Create or update the file-type rule for ``(extension, source)``.

    Args:
        extension: File extension, with or without the leading dot.
        action: Action alias; normalized by ``_normalize_file_type_action``.
        source: Rule owner; normalized by ``_normalize_source``.
        priority: Explicit priority. When ``None`` the default is 82 for
            ``extract``, 20 for ``metadata`` and 0 otherwise.
        reason: Free-text reason stored with the rule.

    Returns:
        ``{"ok": False, "error": "extension_required"}`` for an empty
        extension; otherwise a dict describing the stored rule and the
        ``cleanup`` result.

    A ``user`` rule with action ``ignore`` also purges already-indexed assets
    with that extension, in the same transaction.
    """
    conn = _conn()
    ext = _normalize_extension(extension)
    if not ext:
        return {"ok": False, "error": "extension_required"}

    # NOTE(review): unrecognized action strings normalize to "ignore", so a
    # typo in a user rule leads to the purge below — confirm this is acceptable.
    normalized_action = _normalize_file_type_action(action)
    source_value = _normalize_source(source)
    if priority is not None:
        priority_value = int(priority)
    else:
        priority_value = {"extract": 82, "metadata": 20}.get(normalized_action, 0)

    stamp = now()
    upsert_sql = """
        INSERT INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(extension, source) DO UPDATE SET
            action=excluded.action,
            priority=excluded.priority,
            reason=excluded.reason,
            updated_at=excluded.updated_at
        """
    conn.execute(upsert_sql, (ext, normalized_action, source_value, priority_value, reason, stamp, stamp))

    if normalized_action == "ignore" and source_value == "user":
        cleanup = _purge_assets_by_extension(conn, ext)
    else:
        cleanup = {"assets": 0}
    conn.commit()

    log_event("info", "file_type_rule_set", "Local memory file type rule set", extension=ext, action=normalized_action, source=source_value, cleanup=cleanup)
    return {"ok": True, "extension": ext, "action": normalized_action, "source": source_value, "priority": priority_value, "cleanup": cleanup}
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def remove_file_type_rule(extension: str, *, source: str = "user") -> dict:
    """Delete the rule stored for ``(extension, source)`` and commit.

    Both values are normalized first. The call succeeds whether or not a
    matching rule existed.

    Returns:
        ``{"ok": True, "extension": ..., "source": ...}`` with the normalized values.
    """
    conn = _conn()
    ext = _normalize_extension(extension)
    source_value = _normalize_source(source)
    conn.execute(
        "DELETE FROM local_index_file_type_rules WHERE extension=? AND source=?",
        (ext, source_value),
    )
    conn.commit()
    log_event("info", "file_type_rule_removed", "Local memory file type rule removed", extension=ext, source=source_value)
    return {"ok": True, "extension": ext, "source": source_value}
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def reset_file_type_rules() -> dict:
    """Drop all ``user`` file-type rules and re-seed the core defaults.

    Returns:
        A dict with the number of deleted user rules (``deleted``), the number
        of core rules written (``core_rules``) and the refreshed rule listing
        (``file_types``).
    """
    conn = _conn()
    cursor = conn.execute("DELETE FROM local_index_file_type_rules WHERE source='user'")
    deleted = int(cursor.rowcount or 0)
    seeded = seed_core_file_type_rules(conn)
    conn.commit()
    log_event("info", "file_type_rules_reset", "Local memory user file type overrides reset", deleted=deleted)
    core_rules = int(seeded.get("rules") or 0)
    return {
        "ok": True,
        "deleted": deleted,
        "core_rules": core_rules,
        "file_types": list_file_type_rules(readonly=False),
    }
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _file_type_action(conn, path: str | Path) -> str:
    """Return the indexing action (``extract``/``metadata``/``ignore``) for ``path``.

    Paths recognized by ``is_local_email_db`` or ``is_local_email_tree`` are
    always ``extract``. Anything else is decided by the effective rule for
    the path's lower-cased suffix, defaulting to ``ignore``.
    """
    target = Path(path)
    text = str(path)
    if is_local_email_db(text) or is_local_email_tree(text):
        return "extract"
    rule = _effective_file_type_rule(conn, target.suffix.lower())
    return str(rule.get("action") or "ignore")
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def _should_index_file(conn, path: str | Path, *, allow_default_skip_override: bool = False) -> bool:
    """Return whether ``path`` should be indexed at all.

    A file rejected by ``should_skip_file`` is not indexed unless
    ``allow_default_skip_override`` is set. Otherwise the file is indexed
    when its file-type action is anything other than ``ignore``.
    """
    if not allow_default_skip_override:
        if should_skip_file(str(path)):
            return False
    action = _file_type_action(conn, path)
    return action != "ignore"
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _should_extract_file(conn, path: str | Path, depth: int, *, allow_default_skip_override: bool = False) -> bool:
    """Return whether the content of ``path`` should be extracted.

    Extraction requires ``depth`` of at least 2, a file not rejected by
    ``should_skip_file`` (unless ``allow_default_skip_override`` is set), and
    a file-type action of ``extract``.
    """
    if depth < 2:
        return False
    if not allow_default_skip_override and should_skip_file(str(path)):
        return False
    return _file_type_action(conn, path) == "extract"
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def add_root(path: str, *, mode: str = "normal", depth: int | None = None, source: str = "user", remote: bool = False, seed_version: int | None = None) -> dict:
|
|
217
474
|
conn = _conn()
|
|
218
475
|
root_path = norm_path(path)
|
|
219
|
-
|
|
476
|
+
source_value = _normalize_source(source)
|
|
477
|
+
explicit_user_override = source_value == "user" and (_is_disk_root_path(root_path) or should_skip_tree(root_path))
|
|
478
|
+
if should_skip_tree(root_path) and source_value != "user" and not _allow_explicit_blocked_root(root_path):
|
|
220
479
|
log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
|
|
221
480
|
return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
|
|
222
481
|
depth_value = 2 if depth is None else int(depth)
|
|
223
|
-
|
|
482
|
+
seed_value = int(seed_version if seed_version is not None else (DEFAULT_ROOT_SEED_VERSION if source_value == "core_default" else 0))
|
|
483
|
+
existing = conn.execute("SELECT id, status, source, depth FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
|
|
484
|
+
if existing and str(existing["status"] or "") == "active" and source_value == "user" and str(existing["source"] or "") == "core_default" and not explicit_user_override:
|
|
485
|
+
return {"ok": True, "root_path": root_path, "mode": mode, "depth": int(existing["depth"] or depth_value), "already_included": True, "included_by": "core_default"}
|
|
486
|
+
if source_value == "user":
|
|
487
|
+
parent = conn.execute(
|
|
488
|
+
"""
|
|
489
|
+
SELECT root_path, source, depth
|
|
490
|
+
FROM local_index_roots
|
|
491
|
+
WHERE status='active' AND source='core_default'
|
|
492
|
+
ORDER BY length(root_path) DESC
|
|
493
|
+
"""
|
|
494
|
+
).fetchall()
|
|
495
|
+
for row in parent:
|
|
496
|
+
parent_path = str(row["root_path"] or "")
|
|
497
|
+
if _is_nested_path(root_path, parent_path) and not explicit_user_override:
|
|
498
|
+
return {
|
|
499
|
+
"ok": True,
|
|
500
|
+
"root_path": root_path,
|
|
501
|
+
"already_included": True,
|
|
502
|
+
"included_by": "core_default",
|
|
503
|
+
"included_root": parent_path,
|
|
504
|
+
"depth": int(row["depth"] or depth_value),
|
|
505
|
+
}
|
|
224
506
|
conn.execute(
|
|
225
507
|
"""
|
|
226
|
-
INSERT INTO local_index_roots(root_path, display_path, mode, depth, status, created_at, updated_at)
|
|
227
|
-
VALUES (?, ?, ?, ?, 'active', ?, ?)
|
|
508
|
+
INSERT INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
|
|
509
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, 'active', ?, ?)
|
|
228
510
|
ON CONFLICT(root_path) DO UPDATE SET
|
|
229
511
|
display_path=excluded.display_path,
|
|
230
512
|
mode=excluded.mode,
|
|
231
513
|
depth=excluded.depth,
|
|
514
|
+
source=excluded.source,
|
|
515
|
+
remote=excluded.remote,
|
|
516
|
+
seed_version=excluded.seed_version,
|
|
232
517
|
status='active',
|
|
233
518
|
updated_at=excluded.updated_at
|
|
234
519
|
""",
|
|
235
|
-
(root_path, path, mode, depth_value, now(), now()),
|
|
520
|
+
(root_path, path, mode, depth_value, source_value, 1 if remote else 0, seed_value, now(), now()),
|
|
236
521
|
)
|
|
237
522
|
row = conn.execute("SELECT id FROM local_index_roots WHERE root_path=?", (root_path,)).fetchone()
|
|
238
523
|
existing_status = str(existing["status"] or "") if existing else ""
|
|
@@ -241,8 +526,8 @@ def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> di
|
|
|
241
526
|
_set_initial_index_complete(conn, False)
|
|
242
527
|
_set_initial_index_started_at(conn, now())
|
|
243
528
|
conn.commit()
|
|
244
|
-
log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value)
|
|
245
|
-
return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value}
|
|
529
|
+
log_event("info", "root_added", "Root added", path=redact_path(root_path), mode=mode, depth=depth_value, source=source_value, explicit_override=explicit_user_override)
|
|
530
|
+
return {"ok": True, "root_path": root_path, "mode": mode, "depth": depth_value, "source": source_value, "remote": bool(remote), "explicit_override": explicit_user_override}
|
|
246
531
|
|
|
247
532
|
|
|
248
533
|
def remove_root(path: str) -> dict:
|
|
@@ -331,15 +616,40 @@ def _mounted_volume_roots() -> list[str]:
|
|
|
331
616
|
|
|
332
617
|
|
|
333
618
|
def _system_volume_roots() -> list[str]:
|
|
619
|
+
if os.environ.get("NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS", "").strip().lower() in {"1", "true", "yes"}:
|
|
620
|
+
if sys.platform == "darwin":
|
|
621
|
+
return ["/"]
|
|
622
|
+
if sys.platform.startswith("win"):
|
|
623
|
+
return []
|
|
624
|
+
return ["/"]
|
|
334
625
|
if os.environ.get("NEXO_LOCAL_INDEX_DISABLE_SYSTEM_ROOTS", "").strip() in {"1", "true", "yes"}:
|
|
335
626
|
return []
|
|
336
|
-
|
|
337
|
-
|
|
627
|
+
return []
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def _user_content_roots() -> list[str]:
    """Return existing per-user content directories to index by default.

    Candidates are the home directory plus, on Windows, the common OneDrive
    folders under it and any directory named by the ``OneDrive``,
    ``OneDriveCommercial`` or ``OneDriveConsumer`` environment variables; on
    macOS, the iCloud Drive folder under ``~/Library/Mobile Documents``.
    A candidate is kept when it is an existing directory and is either not
    rejected by ``should_skip_tree`` or explicitly allowed by
    ``_allow_explicit_blocked_root``. The result is passed through
    ``_dedupe_roots``.
    """
    home = Path.home()
    candidates: list[Path] = [home]
    platform_name = sys.platform
    if platform_name.startswith("win"):
        candidates += [home / folder for folder in ("OneDrive", "OneDrive - Personal", "OneDrive - Empresa")]
        env_values = (os.environ.get(key, "").strip() for key in ("OneDrive", "OneDriveCommercial", "OneDriveConsumer"))
        candidates += [Path(value) for value in env_values if value]
    elif platform_name == "darwin":
        candidates.append(home / "Library" / "Mobile Documents" / "com~apple~CloudDocs")

    usable: list[str] = []
    for folder in candidates:
        try:
            if not (folder.exists() and folder.is_dir()):
                continue
            text = str(folder)
            if should_skip_tree(text) and not _allow_explicit_blocked_root(text):
                continue
        except Exception:
            # Best-effort probing: a candidate that cannot be inspected is skipped.
            continue
        usable.append(text)
    return _dedupe_roots(usable)
|
|
343
653
|
|
|
344
654
|
|
|
345
655
|
def _local_email_roots() -> list[str]:
|
|
@@ -379,42 +689,507 @@ def default_roots() -> list[str]:
|
|
|
379
689
|
|
|
380
690
|
|
|
381
691
|
def default_root_specs() -> list[tuple[str, int]]:
    """Return the ``(root, depth)`` pairs that default seeding should consider.

    Order before de-duplication: user content roots, system roots (only when
    ``NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS`` is truthy), mounted volumes (only
    when ``NEXO_LOCAL_INDEX_INCLUDE_MOUNTED_ROOTS`` is truthy), the entries of
    ``NEXO_LOCAL_INDEX_DEFAULT_ROOTS`` split on ``os.pathsep``, and finally
    the local e-mail roots. The combined list goes through
    ``_dedupe_root_specs``.
    """
    truthy = {"1", "true", "yes"}

    def _flag(name: str) -> bool:
        return os.environ.get(name, "").strip().lower() in truthy

    configured = os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_ROOTS", "").strip()

    system_specs: list[tuple[str, int]] = []
    if _flag("NEXO_LOCAL_INDEX_ENABLE_SYSTEM_ROOTS"):
        system_specs = [(root, DEFAULT_SYSTEM_ROOT_DEPTH) for root in _system_volume_roots()]

    mounted_specs = []
    if _flag("NEXO_LOCAL_INDEX_INCLUDE_MOUNTED_ROOTS"):
        mounted_specs = [(root, DEFAULT_MOUNTED_ROOT_DEPTH) for root in _mounted_volume_roots()]

    # Blank entries are dropped; kept entries are used exactly as written.
    configured_specs = [(item, DEFAULT_ROOT_DEPTH) for item in configured.split(os.pathsep) if item.strip()]
    user_specs = [(root, DEFAULT_ROOT_DEPTH) for root in _user_content_roots()]

    combined = user_specs + system_specs + mounted_specs + configured_specs
    email_specs = [(root, DEFAULT_EMAIL_ROOT_DEPTH) for root in _local_email_roots()]
    return _dedupe_root_specs(combined + email_specs)
|
|
394
706
|
|
|
395
707
|
|
|
396
|
-
def
|
|
397
|
-
|
|
708
|
+
def _all_roots_by_path_conn(conn) -> dict[str, dict]:
|
|
709
|
+
rows = conn.execute("SELECT * FROM local_index_roots ORDER BY root_path").fetchall()
|
|
710
|
+
return {str(row["root_path"]): dict(row) for row in rows}
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def _seed_default_roots_conn(conn) -> dict:
    """Insert or deepen the default roots using ``conn``; does not commit.

    For each ``(root, depth)`` from ``default_root_specs()`` that exists as a
    directory:

    * a stored root with status ``removed`` is left alone and reported in
      ``skipped_removed``;
    * a stored root with a smaller depth is raised to ``depth`` and reported
      in ``updated``;
    * a root with no stored row is inserted as an active ``core_default``
      root and reported in ``created``.

    Returns:
        ``{"created": [...], "updated": [...], "skipped_removed": [...]}``.
    """
    existing = _all_roots_by_path_conn(conn)
    created = []
    updated = []
    skipped_removed = []
    for root, depth in default_root_specs():
        candidate = Path(root).expanduser()
        if not candidate.exists() or not candidate.is_dir():
            continue
        root_path = norm_path(str(candidate))
        existing_row = existing.get(root_path)
        if existing_row:
            # A root the user removed must not be resurrected by seeding.
            if str(existing_row.get("status") or "") == "removed":
                skipped_removed.append({"root_path": root_path})
                continue
            current_depth = int(existing_row.get("depth") or 0)
            if current_depth < depth:
                # NOTE(review): this also rewrites source to 'core_default'
                # whatever the row's previous source was — confirm this is
                # intended for roots originally added by the user.
                conn.execute(
                    "UPDATE local_index_roots SET depth=?, source='core_default', seed_version=?, updated_at=? WHERE root_path=?",
                    (depth, DEFAULT_ROOT_SEED_VERSION, now(), root_path),
                )
                updated.append({"root_path": root_path, "depth": depth})
            continue
        timestamp = now()
        conn.execute(
            """
            INSERT INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
            VALUES (?, ?, 'normal', ?, 'core_default', 0, ?, 'active', ?, ?)
            """,
            (root_path, str(candidate), int(depth), DEFAULT_ROOT_SEED_VERSION, timestamp, timestamp),
        )
        created.append({"root_path": root_path, "depth": int(depth)})
        # Track the new row locally so a later spec that normalizes to the
        # same path takes the "existing" branch instead of inserting again.
        existing[root_path] = {
            "root_path": root_path,
            "display_path": str(candidate),
            "mode": "normal",
            "depth": int(depth),
            "source": "core_default",
            "remote": 0,
            "seed_version": DEFAULT_ROOT_SEED_VERSION,
            "status": "active",
        }
    return {"created": created, "updated": updated, "skipped_removed": skipped_removed}
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
def ensure_default_roots() -> dict:
    """Seed core file-type rules and default roots, then run the roots v2 migration.

    Returns:
        A dict with counts of ``created``, ``updated`` and ``skipped_removed``
        roots, the ``migration`` result, and fresh ``roots`` and
        ``file_types`` listings read through the writable connection.
    """
    conn = _conn()
    seed_core_file_type_rules(conn)
    seeded = _seed_default_roots_conn(conn)
    # Seeding has just been done on this connection, so the migration is told
    # not to repeat it.
    migration = migrate_roots_seed_v2(dry_run=False, _already_seeded=True)
    try:
        conn.commit()
    except sqlite3.ProgrammingError:
        # A large legacy DB may have been archived and replaced during migration.
        pass
    return {
        "ok": True,
        "created": len(seeded["created"]),
        "updated": len(seeded["updated"]),
        "skipped_removed": len(seeded["skipped_removed"]),
        "migration": migration,
        "roots": list_roots(readonly=False),
        "file_types": list_file_type_rules(readonly=False),
    }
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
def _local_context_sidecar_paths(db_path: Path) -> list[Path]:
|
|
780
|
+
return [db_path, db_path.with_name(db_path.name + "-wal"), db_path.with_name(db_path.name + "-shm")]
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
def _local_context_db_size_bytes() -> int:
    """Return the combined on-disk size of the local-context DB and its sidecars.

    Files that are missing or cannot be inspected contribute nothing.
    """
    size = 0
    for sidecar in _local_context_sidecar_paths(local_context_db_path()):
        try:
            # stat() on a missing file raises FileNotFoundError, an OSError.
            size += int(sidecar.stat().st_size)
        except OSError:
            continue
    return size
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def _capture_roots_v2_config(conn) -> dict:
    """Snapshot the configuration that must survive a roots-v2 rebuild.

    Captured from ``conn``:

    * ``state``: ``local_index_state`` rows, excluding per-root initial-scan
      keys and the seed-version / initial-index bookkeeping keys;
    * ``roots``: roots that are user-sourced, remote, removed, or active
      ``core_default`` roots that are not disk roots;
    * ``exclusions``: every ``local_index_exclusions`` row;
    * ``file_types``: file-type rules with source ``user``.

    Returns:
        A dict of lists of row dicts under those four keys.
    """
    state_cursor = conn.execute(
        """
        SELECT key, value, updated_at
        FROM local_index_state
        WHERE key NOT LIKE 'root_initial_scan:%'
          AND key NOT IN (?, ?, ?)
        ORDER BY key
        """,
        (ROOT_SEED_VERSION_KEY, INITIAL_INDEX_COMPLETE_KEY, INITIAL_INDEX_STARTED_AT_KEY),
    )
    state_rows = list(map(dict, state_cursor.fetchall()))

    roots_cursor = conn.execute(
        """
        SELECT root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at
        FROM local_index_roots
        ORDER BY root_path
        """
    )
    root_rows = []
    for record in roots_cursor.fetchall():
        shaped = dict(record)
        source = str(shaped.get("source") or "legacy")
        status = str(shaped.get("status") or "")
        root_path = str(shaped.get("root_path") or "")
        keep = (
            source == "user"
            or bool(shaped.get("remote"))
            or status == "removed"
            or (source == "core_default" and status == "active" and not _is_disk_root_path(root_path))
        )
        if not keep:
            continue
        root_rows.append(shaped)

    exclusions_cursor = conn.execute(
        """
        SELECT path, display_path, source, kind, reason, created_at
        FROM local_index_exclusions
        ORDER BY path
        """
    )
    exclusion_rows = list(map(dict, exclusions_cursor.fetchall()))

    file_types_cursor = conn.execute(
        """
        SELECT extension, action, source, priority, reason, created_at, updated_at
        FROM local_index_file_type_rules
        WHERE source='user'
        ORDER BY extension
        """
    )
    file_type_rows = list(map(dict, file_types_cursor.fetchall()))

    return {
        "state": state_rows,
        "roots": root_rows,
        "exclusions": exclusion_rows,
        "file_types": file_type_rows,
    }
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
def _restore_roots_v2_config(conn, config: dict) -> dict:
    """Write a captured configuration snapshot back into ``conn``; does not commit.

    Args:
        conn: Connection to the database receiving the configuration.
        config: Dict with optional ``state``, ``roots``, ``exclusions`` and
            ``file_types`` lists of row dicts (the shape returned by
            ``_capture_roots_v2_config``).

    Returns:
        Number of rows written per section, keyed like ``config``.

    Missing timestamps fall back to the time of this call. Rows whose path or
    extension normalizes to empty are skipped and not counted.
    """
    restored = {"state": 0, "roots": 0, "exclusions": 0, "file_types": 0}
    timestamp = now()
    for row in config.get("state") or []:
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_state(key, value, updated_at)
            VALUES (?, ?, ?)
            """,
            (row.get("key"), row.get("value") or "", float(row.get("updated_at") or timestamp)),
        )
        restored["state"] += 1
    for row in config.get("roots") or []:
        root_path = norm_path(str(row.get("root_path") or ""))
        if not root_path:
            continue
        # NOTE(review): INSERT OR REPLACE removes a conflicting row before
        # inserting, so columns not listed here (such as the row's id) are not
        # carried over — confirm nothing depends on them at this point.
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_roots(root_path, display_path, mode, depth, source, remote, seed_version, status, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                root_path,
                row.get("display_path") or root_path,
                row.get("mode") or "normal",
                # A stored depth of 0 is falsy and is replaced by the default depth.
                int(row.get("depth") or DEFAULT_ROOT_DEPTH),
                _normalize_source(row.get("source") or "user"),
                1 if row.get("remote") else 0,
                int(row.get("seed_version") or 0),
                row.get("status") or "active",
                float(row.get("created_at") or timestamp),
                float(row.get("updated_at") or timestamp),
            ),
        )
        restored["roots"] += 1
    for row in config.get("exclusions") or []:
        exclusion_path = norm_path(str(row.get("path") or ""))
        if not exclusion_path:
            continue
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_exclusions(path, display_path, source, kind, reason, created_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (
                exclusion_path,
                row.get("display_path") or exclusion_path,
                _normalize_source(row.get("source") or "user"),
                row.get("kind") or "folder",
                row.get("reason") or "user",
                float(row.get("created_at") or timestamp),
            ),
        )
        restored["exclusions"] += 1
    for row in config.get("file_types") or []:
        extension = _normalize_extension(str(row.get("extension") or ""))
        if not extension:
            continue
        conn.execute(
            """
            INSERT OR REPLACE INTO local_index_file_type_rules(extension, action, source, priority, reason, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                extension,
                _normalize_file_type_action(str(row.get("action") or "ignore")),
                _normalize_source(row.get("source") or "user"),
                int(row.get("priority") or 0),
                row.get("reason") or "user",
                float(row.get("created_at") or timestamp),
                float(row.get("updated_at") or timestamp),
            ),
        )
        restored["file_types"] += 1
    return restored
|
|
932
|
+
|
|
933
|
+
|
|
934
|
+
def _create_roots_v2_sqlite_backup(conn) -> dict:
    """Snapshot the local-context database before the roots-v2 migration.

    The open connection is committed and copied into a new backup file with
    ``Connection.backup``. The copy is then reopened and its
    ``local_index_roots`` row count is compared with the live database.

    Args:
        conn: Open sqlite3 connection to the local-context database.

    Returns:
        ``{"ok": True, "skipped": True, "reason": "db_missing"}`` when the
        database file does not exist; ``{"ok": False, ...}`` when the copy
        holds fewer roots than the source or any step raises; otherwise
        ``{"ok": True, ...}`` with the backup path, both row counts and the
        value returned by ``paths.finalize_backup_snapshot`` under ``prune``.
    """
    if not local_context_db_path().is_file():
        return {"ok": True, "skipped": True, "reason": "db_missing"}
    # Flush pending writes so the snapshot reflects the current state.
    conn.commit()
    backup_path = paths.create_backup_path("local-context-roots-v2", ".db")
    count_sql = "SELECT COUNT(*) FROM local_index_roots"
    snapshot = None
    try:
        snapshot = sqlite3.connect(str(backup_path))
        conn.backup(snapshot)
        snapshot.close()
        snapshot = None
        # Reopen the file on a separate handle to validate what was written.
        verifier = sqlite3.connect(str(backup_path))
        try:
            source_roots = int(conn.execute(count_sql).fetchone()[0] or 0)
            backup_roots = int(verifier.execute(count_sql).fetchone()[0] or 0)
        finally:
            verifier.close()
        details = {
            "path": str(backup_path),
            "source_roots": source_roots,
            "backup_roots": backup_roots,
        }
        if backup_roots < source_roots:
            return {"ok": False, "error": "backup_validation_failed", **details}
        prune = paths.finalize_backup_snapshot(backup_path)
        return {"ok": True, **details, "prune": prune}
    except Exception as exc:
        return {"ok": False, "error": str(exc), "path": str(backup_path)}
    finally:
        # Only still set when the copy itself failed part-way through.
        if snapshot is not None:
            try:
                snapshot.close()
            except Exception:
                pass
|
|
970
|
+
|
|
971
|
+
|
|
972
|
+
def _archive_rebuild_local_context_for_roots_v2(conn, summary: dict) -> dict:
    """Archive the current local-context database and start a fresh one.

    The captured roots/exclusions/file-type configuration is restored into
    the new database, default roots are seeded, the seed version is stamped
    and the initial-index state is reset.

    Args:
        conn: Open connection to the database that is about to be archived.
        summary: Migration plan summary; only forwarded to the log event.

    Returns:
        A report dict with ``ok``, ``strategy``, ``backup_dir``,
        ``size_bytes`` and ``moved``; on success also ``preserved``,
        ``seeded`` and ``prune``, on failure ``error``.

    NOTE(review): the failure branch only reports the files already moved;
    it does not move them back.
    """
    db_path = local_context_db_path()
    config = _capture_roots_v2_config(conn)
    size_bytes = _local_context_db_size_bytes()
    backup_dir = paths.create_backup_dir("local-context-roots-v2")
    conn.commit()
    close_local_context_db()
    moved = []

    def _report(ok: bool, **extra) -> dict:
        # Fields shared by the success and failure reports.
        return {
            "ok": ok,
            "strategy": "archive_rebuild",
            "backup_dir": str(backup_dir),
            "size_bytes": size_bytes,
            "moved": moved,
            **extra,
        }

    try:
        for candidate in _local_context_sidecar_paths(db_path):
            if candidate.exists():
                target = backup_dir / candidate.name
                shutil.move(str(candidate), str(target))
                moved.append({"path": str(candidate), "backup_path": str(target)})
        # The connection opened here is used as the fresh database.
        fresh = _conn()
        seed_core_file_type_rules(fresh)
        restored = _restore_roots_v2_config(fresh, config)
        seeded = _seed_default_roots_conn(fresh)
        _set_state_conn(fresh, ROOT_SEED_VERSION_KEY, str(DEFAULT_ROOT_SEED_VERSION))
        _set_initial_index_complete(fresh, False)
        _set_initial_index_started_at(fresh, now())
        fresh.commit()
        prune = paths.finalize_backup_snapshot(backup_dir)
        result = _report(True, preserved=restored, seeded=seeded, prune=prune)
        log_event("info", "roots_seed_v2_archived_rebuilt", "Local memory roots seed v2 archived large DB and rebuilt config", summary=summary, result=result)
        return result
    except Exception as exc:
        return _report(False, error=str(exc))
|
|
1017
|
+
|
|
1018
|
+
|
|
1019
|
+
def _is_disk_root_path(path: str) -> bool:
    """Return True when *path* normalizes to a filesystem or drive root.

    Recognized forms are a lone ``/`` or ``\\`` and a drive letter followed
    by a colon with an optional trailing backslash (``C:`` or ``C:\\``).
    """
    candidate = norm_path(path)
    return candidate in {"/", "\\"} or re.match(r"^[A-Za-z]:\\?$", candidate) is not None
|
|
1024
|
+
|
|
1025
|
+
|
|
1026
|
+
def _path_is_under_any(path: str, prefixes: list[str]) -> bool:
    """Return True when the normalized *path* equals or lies below a prefix.

    Empty prefixes are ignored. Prefixes are compared as given; only *path*
    is passed through ``norm_path``.
    """
    target = norm_path(path)
    for prefix in prefixes:
        if not prefix:
            continue
        if target == prefix or target.startswith(_path_prefix(prefix)):
            return True
    return False
|
|
1029
|
+
|
|
1030
|
+
|
|
1031
|
+
def _best_root_id_for_path(path: str, roots: list[dict]) -> int | None:
    """Pick the id of the most specific root that contains *path*.

    A root matches when the normalized path equals its ``root_path`` or
    starts with ``_path_prefix(root_path)``. The longest ``root_path`` wins;
    on equal length the root listed first is kept.

    Returns:
        The winning root's id (``0`` when the row has no id), or ``None``
        when no root matches.
    """
    target = norm_path(path)
    matches: list[tuple[int, int]] = []
    for root in roots:
        root_path = str(root.get("root_path") or "")
        if not root_path:
            continue
        if target != root_path and not target.startswith(_path_prefix(root_path)):
            continue
        matches.append((len(root_path), int(root.get("id") or 0)))
    if not matches:
        return None
    # max() returns the first maximal element, so earlier roots win ties.
    return max(matches, key=lambda match: match[0])[1]
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
def _purge_dir_ids(conn, dir_ids: list[str]) -> int:
|
|
1045
|
+
unique_ids = [item for item in dict.fromkeys(dir_ids) if item]
|
|
1046
|
+
deleted = 0
|
|
1047
|
+
for start in range(0, len(unique_ids), 500):
|
|
1048
|
+
batch = unique_ids[start:start + 500]
|
|
1049
|
+
placeholders = ",".join("?" for _ in batch)
|
|
1050
|
+
deleted += int(conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
1051
|
+
return deleted
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
def migrate_roots_seed_v2(*, dry_run: bool = True, _already_seeded: bool = False) -> dict:
    """Move legacy whole-disk roots to curated user roots and purge obvious noise.

    The function first builds a plan: which active roots count as legacy,
    which indexed assets and directories must be purged, and which can be
    re-attached to a surviving root. With ``dry_run`` the plan is returned
    as-is. Otherwise the plan is applied, either by archiving and rebuilding
    the database (when it is larger than
    ``LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES``) or in place after a backup.

    Args:
        dry_run: Return the plan without applying it.
        _already_seeded: Skip the ``seed_core_file_type_rules`` call.

    Returns:
        A summary dict. When the stored seed version already matches it only
        carries ``ok``, ``dry_run``, ``needed=False`` and ``seed_version``.
        Otherwise it carries the plan counters plus, after a real run,
        ``db_size_bytes``, ``strategy``, ``cleanup`` and possibly ``backup``
        and ``error``.

    NOTE(review): the in-place branch does not call ``conn.commit()`` itself;
    confirm that the state setters or the caller commit.
    """
    conn = _conn()
    if not _already_seeded:
        seed_core_file_type_rules(conn)
    if str(_get_state_conn(conn, ROOT_SEED_VERSION_KEY, "0")) == str(DEFAULT_ROOT_SEED_VERSION):
        return {"ok": True, "dry_run": dry_run, "needed": False, "seed_version": DEFAULT_ROOT_SEED_VERSION}

    blocked_classes = {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}

    def _root_path_of(root: dict) -> str:
        return str(root.get("root_path") or "")

    def _source_of(root: dict) -> str:
        # Rows without a source are treated as legacy.
        return str(root.get("source") or "legacy")

    def _is_core_disk_root(root: dict) -> bool:
        return _is_disk_root_path(_root_path_of(root)) and _source_of(root) in {"legacy", "core_default", "system_default"}

    def _prefixes_of(rows: list[dict]) -> list[str]:
        return [_root_path_of(root) for root in rows if root.get("root_path")]

    # ---- classify roots ---------------------------------------------------
    active_roots = [dict(row) for row in conn.execute("SELECT * FROM local_index_roots WHERE status='active'").fetchall()]
    keep_roots = [
        root for root in active_roots
        if str(root.get("status") or "") == "active" and not _is_core_disk_root(root)
    ]
    keep_prefixes = _prefixes_of(keep_roots)
    legacy_disk_roots = [
        root for root in active_roots
        if _is_core_disk_root(root)
        or (
            _source_of(root) in {"legacy", "system_default"}
            and any(_is_nested_path(prefix, _root_path_of(root)) for prefix in keep_prefixes)
        )
    ]
    keep_roots = [root for root in keep_roots if root not in legacy_disk_roots]
    keep_prefixes = _prefixes_of(keep_roots)
    legacy_ids = {int(root.get("id") or 0) for root in legacy_disk_roots}
    legacy_prefixes = _prefixes_of(legacy_disk_roots)
    override_prefixes = [_root_path_of(root) for root in keep_roots if _root_allows_default_skip_override(root)]

    def _belongs_to_legacy(row, path: str) -> bool:
        return int(row["root_id"] or 0) in legacy_ids or _path_is_under_any(path, legacy_prefixes)

    def _orphaned(path: str, under_legacy: bool) -> bool:
        # Under a legacy root and not covered by any surviving root.
        return under_legacy and not _path_is_under_any(path, keep_prefixes)

    def _queue_remap(remaps: dict[int, list[str]], path: str, item_id: str) -> None:
        new_root_id = _best_root_id_for_path(path, keep_roots)
        if new_root_id:
            remaps.setdefault(new_root_id, []).append(item_id)

    # ---- plan assets ------------------------------------------------------
    asset_ids_to_purge: list[str] = []
    asset_remaps: dict[int, list[str]] = {}
    for row in conn.execute("SELECT asset_id, root_id, path, extension, privacy_class FROM local_assets").fetchall():
        path = str(row["path"] or "")
        under_legacy = _belongs_to_legacy(row, path)
        action = _file_type_action(conn, path)
        explicit_override = _path_under_any_prefix(path, override_prefixes)
        unsafe = not explicit_override and (
            should_skip_file(path)
            or str(row["privacy_class"] or "") in blocked_classes
        )
        if action == "ignore" or unsafe or _orphaned(path, under_legacy):
            asset_ids_to_purge.append(str(row["asset_id"]))
        elif under_legacy:
            _queue_remap(asset_remaps, path, str(row["asset_id"]))

    # ---- plan directories -------------------------------------------------
    dir_ids_to_purge: list[str] = []
    dir_remaps: dict[int, list[str]] = {}
    for row in conn.execute("SELECT dir_id, root_id, path FROM local_index_dirs").fetchall():
        path = str(row["path"] or "")
        under_legacy = _belongs_to_legacy(row, path)
        explicit_override = _path_under_any_prefix(path, override_prefixes)
        if (should_skip_tree(path) and not explicit_override) or _orphaned(path, under_legacy):
            dir_ids_to_purge.append(str(row["dir_id"]))
        elif under_legacy:
            _queue_remap(dir_remaps, path, str(row["dir_id"]))

    summary = {
        "ok": True,
        "dry_run": dry_run,
        "needed": True,
        "legacy_disk_roots": [_root_path_of(root) for root in legacy_disk_roots],
        "keep_roots": keep_prefixes,
        "assets_to_purge": len(asset_ids_to_purge),
        "dirs_to_purge": len(dir_ids_to_purge),
        "assets_to_remap": sum(len(items) for items in asset_remaps.values()),
        "dirs_to_remap": sum(len(items) for items in dir_remaps.values()),
        "cleanup": {},
    }
    if dry_run:
        return summary

    # ---- apply ------------------------------------------------------------
    destructive = bool(
        asset_ids_to_purge
        or dir_ids_to_purge
        or legacy_ids
        or any(asset_remaps.values())
        or any(dir_remaps.values())
    )
    db_size = _local_context_db_size_bytes()
    summary["db_size_bytes"] = db_size
    if destructive and LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES > 0 and db_size > LOCAL_CONTEXT_REBUILD_THRESHOLD_BYTES:
        # Above the size threshold the database is archived and rebuilt.
        rebuild = _archive_rebuild_local_context_for_roots_v2(conn, summary)
        summary.update(cleanup=rebuild, strategy="archive_rebuild", ok=bool(rebuild.get("ok")))
        if not rebuild.get("ok"):
            summary["error"] = str(rebuild.get("error") or "archive_rebuild_failed")
        return summary

    if destructive:
        backup = _create_roots_v2_sqlite_backup(conn)
        summary["backup"] = backup
        if not backup.get("ok"):
            # Never touch the index without a validated snapshot.
            summary["ok"] = False
            summary["error"] = "migration_backup_failed"
            return summary

    def _apply_remaps(table: str, id_column: str, remaps: dict[int, list[str]]) -> None:
        # Re-point rows at their new root, 500 ids per statement.
        for new_root_id, item_ids in remaps.items():
            for start in range(0, len(item_ids), 500):
                batch = item_ids[start:start + 500]
                placeholders = ",".join("?" for _ in batch)
                conn.execute(f"UPDATE {table} SET root_id=?, updated_at=? WHERE {id_column} IN ({placeholders})", (new_root_id, now(), *batch))

    _apply_remaps("local_assets", "asset_id", asset_remaps)
    _apply_remaps("local_index_dirs", "dir_id", dir_remaps)
    cleanup = _purge_asset_ids(conn, asset_ids_to_purge)
    cleanup["dirs"] = _purge_dir_ids(conn, dir_ids_to_purge)
    if legacy_ids:
        placeholders = ",".join("?" for _ in legacy_ids)
        conn.execute(f"DELETE FROM local_index_checkpoints WHERE root_id IN ({placeholders})", tuple(legacy_ids))
        conn.execute(
            f"UPDATE local_index_roots SET status='removed', source='core_removed', updated_at=? WHERE id IN ({placeholders})",
            (now(), *legacy_ids),
        )
    _set_state_conn(conn, ROOT_SEED_VERSION_KEY, str(DEFAULT_ROOT_SEED_VERSION))
    _set_initial_index_complete(conn, False)
    _set_initial_index_started_at(conn, now())
    summary["cleanup"] = cleanup
    summary["strategy"] = "in_place"
    log_event("info", "roots_seed_v2_migrated", "Local memory roots seed v2 applied", summary=summary)
    return summary
|
|
418
1193
|
|
|
419
1194
|
|
|
420
1195
|
def _should_skip_mounted_root(candidate: Path) -> bool:
|
|
@@ -543,17 +1318,26 @@ def _purge_asset_ids(conn, asset_ids: list[str]) -> dict:
|
|
|
543
1318
|
|
|
544
1319
|
def _privacy_unsafe_asset_ids(conn) -> list[str]:
|
|
545
1320
|
rows = conn.execute("SELECT asset_id, path, privacy_class FROM local_assets").fetchall()
|
|
1321
|
+
override_prefixes = _active_user_override_prefixes_conn(conn)
|
|
546
1322
|
unsafe: list[str] = []
|
|
547
1323
|
for row in rows:
|
|
548
1324
|
privacy_class = str(row["privacy_class"] or "")
|
|
549
|
-
|
|
1325
|
+
path = str(row["path"] or "")
|
|
1326
|
+
if _path_under_any_prefix(path, override_prefixes):
|
|
1327
|
+
continue
|
|
1328
|
+
if should_skip_file(path) or privacy_class in {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}:
|
|
550
1329
|
unsafe.append(str(row["asset_id"]))
|
|
551
1330
|
return unsafe
|
|
552
1331
|
|
|
553
1332
|
|
|
554
1333
|
def _privacy_unsafe_dir_ids(conn) -> list[str]:
|
|
555
1334
|
rows = conn.execute("SELECT dir_id, path FROM local_index_dirs").fetchall()
|
|
556
|
-
|
|
1335
|
+
override_prefixes = _active_user_override_prefixes_conn(conn)
|
|
1336
|
+
return [
|
|
1337
|
+
str(row["dir_id"])
|
|
1338
|
+
for row in rows
|
|
1339
|
+
if should_skip_tree(str(row["path"] or "")) and not _path_under_any_prefix(str(row["path"] or ""), override_prefixes)
|
|
1340
|
+
]
|
|
557
1341
|
|
|
558
1342
|
|
|
559
1343
|
def _content_secret_asset_ids(conn) -> list[str]:
|
|
@@ -646,9 +1430,10 @@ def local_index_privacy_hygiene(*, fix: bool = False) -> dict:
|
|
|
646
1430
|
def local_index_hygiene(*, fix: bool = False) -> dict:
|
|
647
1431
|
conn = _conn()
|
|
648
1432
|
removed_paths: list[str] = []
|
|
649
|
-
for row in conn.execute("SELECT id, root_path FROM local_index_roots").fetchall():
|
|
1433
|
+
for row in conn.execute("SELECT id, root_path, source, status FROM local_index_roots").fetchall():
|
|
650
1434
|
path = str(row["root_path"] or "")
|
|
651
|
-
|
|
1435
|
+
root = dict(row)
|
|
1436
|
+
if _should_skip_mounted_root(Path(path)) or (should_skip_tree(path) and not _root_allows_default_skip_override(root)):
|
|
652
1437
|
removed_paths.append(path)
|
|
653
1438
|
if fix:
|
|
654
1439
|
conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE id=?", (now(), row["id"]))
|
|
@@ -667,20 +1452,26 @@ def repair_index_hygiene() -> dict:
|
|
|
667
1452
|
return local_index_hygiene(fix=True)
|
|
668
1453
|
|
|
669
1454
|
|
|
670
|
-
def add_exclusion(path: str, *, reason: str = "user") -> dict:
|
|
1455
|
+
def add_exclusion(path: str, *, reason: str = "user", source: str = "user", kind: str = "folder") -> dict:
|
|
671
1456
|
conn = _conn()
|
|
672
1457
|
excluded_path = norm_path(path)
|
|
1458
|
+
source_value = _normalize_source(source)
|
|
1459
|
+
kind_value = str(kind or "folder").strip().lower() or "folder"
|
|
673
1460
|
conn.execute(
|
|
674
1461
|
"""
|
|
675
|
-
INSERT INTO local_index_exclusions(path, display_path, reason, created_at)
|
|
676
|
-
VALUES (?, ?, ?, ?)
|
|
677
|
-
ON CONFLICT(path) DO UPDATE SET
|
|
1462
|
+
INSERT INTO local_index_exclusions(path, display_path, source, kind, reason, created_at)
|
|
1463
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
1464
|
+
ON CONFLICT(path) DO UPDATE SET
|
|
1465
|
+
display_path=excluded.display_path,
|
|
1466
|
+
source=excluded.source,
|
|
1467
|
+
kind=excluded.kind,
|
|
1468
|
+
reason=excluded.reason
|
|
678
1469
|
""",
|
|
679
|
-
(excluded_path, path, reason, now()),
|
|
1470
|
+
(excluded_path, path, source_value, kind_value, reason, now()),
|
|
680
1471
|
)
|
|
681
1472
|
conn.commit()
|
|
682
|
-
log_event("info", "exclusion_added", "Exclusion added", path=redact_path(excluded_path), reason=reason)
|
|
683
|
-
return {"ok": True, "path": excluded_path}
|
|
1473
|
+
log_event("info", "exclusion_added", "Exclusion added", path=redact_path(excluded_path), reason=reason, source=source_value)
|
|
1474
|
+
return {"ok": True, "path": excluded_path, "source": source_value, "kind": kind_value}
|
|
684
1475
|
|
|
685
1476
|
|
|
686
1477
|
def remove_exclusion(path: str) -> dict:
|
|
@@ -1050,6 +1841,39 @@ def _is_nested_path(path: str, parent: str) -> bool:
|
|
|
1050
1841
|
return value_cmp.startswith(prefix)
|
|
1051
1842
|
|
|
1052
1843
|
|
|
1844
|
+
def _root_allows_default_skip_override(root: dict | None) -> bool:
|
|
1845
|
+
if not root:
|
|
1846
|
+
return False
|
|
1847
|
+
root_path = str(root.get("root_path") or "")
|
|
1848
|
+
return str(root.get("source") or "") == "user" and bool(root_path) and (
|
|
1849
|
+
_is_disk_root_path(root_path) or should_skip_tree(root_path)
|
|
1850
|
+
)
|
|
1851
|
+
|
|
1852
|
+
|
|
1853
|
+
def _active_user_override_prefixes_conn(conn) -> list[str]:
|
|
1854
|
+
rows = conn.execute(
|
|
1855
|
+
"""
|
|
1856
|
+
SELECT root_path
|
|
1857
|
+
FROM local_index_roots
|
|
1858
|
+
WHERE status='active' AND source='user'
|
|
1859
|
+
"""
|
|
1860
|
+
).fetchall()
|
|
1861
|
+
return [
|
|
1862
|
+
str(row["root_path"] or "")
|
|
1863
|
+
for row in rows
|
|
1864
|
+
if row["root_path"] and (_is_disk_root_path(str(row["root_path"] or "")) or should_skip_tree(str(row["root_path"] or "")))
|
|
1865
|
+
]
|
|
1866
|
+
|
|
1867
|
+
|
|
1868
|
+
def _path_under_any_prefix(path: str, prefixes: list[str]) -> bool:
|
|
1869
|
+
for prefix in prefixes:
|
|
1870
|
+
if not prefix:
|
|
1871
|
+
continue
|
|
1872
|
+
if norm_path(path) == norm_path(prefix) or _is_nested_path(path, prefix):
|
|
1873
|
+
return True
|
|
1874
|
+
return False
|
|
1875
|
+
|
|
1876
|
+
|
|
1053
1877
|
def _is_discovered_mount_path(path: str) -> bool:
|
|
1054
1878
|
value = norm_path(path).replace("\\", "/").lower()
|
|
1055
1879
|
if not value:
|
|
@@ -1070,6 +1894,9 @@ def _effective_scan_roots(roots: list[dict]) -> list[dict]:
|
|
|
1070
1894
|
effective: list[dict] = []
|
|
1071
1895
|
for root in active_roots:
|
|
1072
1896
|
root_path = str(root.get("root_path") or "")
|
|
1897
|
+
if _root_allows_default_skip_override(root):
|
|
1898
|
+
effective.append(root)
|
|
1899
|
+
continue
|
|
1073
1900
|
if _is_discovered_mount_path(root_path):
|
|
1074
1901
|
effective.append(root)
|
|
1075
1902
|
continue
|
|
@@ -1180,14 +2007,16 @@ def _upsert_dir(
|
|
|
1180
2007
|
return changed, fingerprint
|
|
1181
2008
|
|
|
1182
2009
|
|
|
1183
|
-
def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: int) -> tuple[str, bool, str]:
|
|
2010
|
+
def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: int, *, allow_default_skip_override: bool = False) -> tuple[str, bool, str]:
|
|
1184
2011
|
raw_path = str(path)
|
|
1185
2012
|
normalized = norm_path(raw_path)
|
|
1186
2013
|
asset_id = stable_id("asset", normalized)
|
|
1187
|
-
if
|
|
2014
|
+
if not _should_index_file(conn, normalized, allow_default_skip_override=allow_default_skip_override):
|
|
1188
2015
|
return asset_id, False, "skipped"
|
|
1189
2016
|
perm = _permission_state(path)
|
|
1190
2017
|
depth, privacy_class, depth_reason = classify_path(normalized)
|
|
2018
|
+
if allow_default_skip_override and privacy_class in {"private_profile_blocked", "system_blocked", "sensitive_inventory_only", "inventory_only"}:
|
|
2019
|
+
depth, privacy_class, depth_reason = 2, "normal", "explicit_user_include"
|
|
1191
2020
|
depth = min(depth, root_depth)
|
|
1192
2021
|
try:
|
|
1193
2022
|
st = path.stat()
|
|
@@ -1197,7 +2026,7 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
|
|
|
1197
2026
|
INSERT INTO local_index_errors(asset_id, path, phase, error_code, user_message, technical_detail, retryable, created_at)
|
|
1198
2027
|
VALUES (?, ?, 'quick_index', ?, ?, ?, 1, ?)
|
|
1199
2028
|
""",
|
|
1200
|
-
(asset_id, normalized, type(exc).__name__, "
|
|
2029
|
+
(asset_id, normalized, type(exc).__name__, "Some files could not be read", str(exc), now()),
|
|
1201
2030
|
)
|
|
1202
2031
|
return asset_id, False, "error"
|
|
1203
2032
|
fingerprint = quick_fingerprint(path, st)
|
|
@@ -1265,8 +2094,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
|
|
|
1265
2094
|
""",
|
|
1266
2095
|
(version_id, asset_id, fingerprint, int(st.st_size), float(st.st_mtime), now()),
|
|
1267
2096
|
)
|
|
1268
|
-
if
|
|
1269
|
-
enqueue_job(conn, asset_id, "light_extraction", priority=_extraction_priority(path))
|
|
2097
|
+
if _should_extract_file(conn, normalized, depth, allow_default_skip_override=allow_default_skip_override):
|
|
2098
|
+
enqueue_job(conn, asset_id, "light_extraction", priority=_extraction_priority(path, conn=conn))
|
|
1270
2099
|
enqueue_job(conn, asset_id, "graph", priority=40)
|
|
1271
2100
|
return asset_id, changed, "ok"
|
|
1272
2101
|
|
|
@@ -1354,7 +2183,7 @@ def _record_scan_error(conn, stats: dict | None, path: str, phase: str, exc: Exc
|
|
|
1354
2183
|
path=path,
|
|
1355
2184
|
phase=phase,
|
|
1356
2185
|
error_code=type(exc).__name__,
|
|
1357
|
-
user_message="
|
|
2186
|
+
user_message="Some folders or files could not be read",
|
|
1358
2187
|
technical_detail=str(exc),
|
|
1359
2188
|
retryable=True,
|
|
1360
2189
|
)
|
|
@@ -1377,7 +2206,15 @@ def enqueue_job(conn, asset_id: str, job_type: str, *, priority: int = 50) -> st
|
|
|
1377
2206
|
return job_id
|
|
1378
2207
|
|
|
1379
2208
|
|
|
1380
|
-
def _extraction_priority(path: Path) -> int:
|
|
2209
|
+
def _extraction_priority(path: Path, *, conn=None) -> int:
|
|
2210
|
+
if conn is not None:
|
|
2211
|
+
rule = _effective_file_type_rule(conn, path.suffix.lower())
|
|
2212
|
+
try:
|
|
2213
|
+
priority = int(rule.get("priority") or 0)
|
|
2214
|
+
except Exception:
|
|
2215
|
+
priority = 0
|
|
2216
|
+
if priority > 0:
|
|
2217
|
+
return priority
|
|
1381
2218
|
suffix = path.suffix.lower()
|
|
1382
2219
|
if suffix in HIGH_VALUE_DOCUMENT_SUFFIXES:
|
|
1383
2220
|
return 90
|
|
@@ -1385,7 +2222,7 @@ def _extraction_priority(path: Path) -> int:
|
|
|
1385
2222
|
return 82
|
|
1386
2223
|
if suffix in EMAIL_DOCUMENT_SUFFIXES or is_local_email_tree(str(path)):
|
|
1387
2224
|
return 70
|
|
1388
|
-
if suffix in
|
|
2225
|
+
if suffix in CODE_DOCUMENT_SUFFIXES:
|
|
1389
2226
|
return 55
|
|
1390
2227
|
return 45
|
|
1391
2228
|
|
|
@@ -1425,6 +2262,7 @@ def _iter_files(
|
|
|
1425
2262
|
start_after: str = "",
|
|
1426
2263
|
seen_at: float | None = None,
|
|
1427
2264
|
stats: dict | None = None,
|
|
2265
|
+
allow_default_skip_override: bool = False,
|
|
1428
2266
|
):
|
|
1429
2267
|
seen_at = seen_at or now()
|
|
1430
2268
|
seen_dirs: set[tuple[int, int]] = set()
|
|
@@ -1435,7 +2273,7 @@ def _iter_files(
|
|
|
1435
2273
|
current = stack.pop()
|
|
1436
2274
|
if _is_excluded(str(current), exclusions):
|
|
1437
2275
|
continue
|
|
1438
|
-
if current != root and should_skip_tree(str(current)):
|
|
2276
|
+
if current != root and should_skip_tree(str(current)) and not allow_default_skip_override:
|
|
1439
2277
|
continue
|
|
1440
2278
|
try:
|
|
1441
2279
|
st = current.stat()
|
|
@@ -1459,13 +2297,13 @@ def _iter_files(
|
|
|
1459
2297
|
if entry.is_symlink():
|
|
1460
2298
|
continue
|
|
1461
2299
|
if entry.is_dir():
|
|
1462
|
-
if should_skip_tree(str(entry)):
|
|
2300
|
+
if should_skip_tree(str(entry)) and not allow_default_skip_override:
|
|
1463
2301
|
continue
|
|
1464
2302
|
dirs.append(entry)
|
|
1465
2303
|
continue
|
|
1466
2304
|
if entry.is_file():
|
|
1467
2305
|
normalized = norm_path(entry)
|
|
1468
|
-
if
|
|
2306
|
+
if not _should_index_file(conn, normalized, allow_default_skip_override=allow_default_skip_override):
|
|
1469
2307
|
continue
|
|
1470
2308
|
if start_after_norm and normalized <= start_after_norm:
|
|
1471
2309
|
continue
|
|
@@ -1530,7 +2368,7 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
|
|
|
1530
2368
|
return stats
|
|
1531
2369
|
rows = conn.execute(
|
|
1532
2370
|
"""
|
|
1533
|
-
SELECT a.asset_id, a.path, a.root_id, a.quick_fingerprint, a.depth, r.root_path
|
|
2371
|
+
SELECT a.asset_id, a.path, a.root_id, a.quick_fingerprint, a.depth, r.root_path, r.source
|
|
1534
2372
|
FROM local_assets a
|
|
1535
2373
|
LEFT JOIN local_index_roots r ON r.id = a.root_id
|
|
1536
2374
|
WHERE a.status='active'
|
|
@@ -1544,11 +2382,12 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
|
|
|
1544
2382
|
stats["checked"] += 1
|
|
1545
2383
|
path = str(row["path"])
|
|
1546
2384
|
root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
|
|
2385
|
+
allow_default_skip_override = _root_allows_default_skip_override(dict(row))
|
|
1547
2386
|
if _is_excluded(path, exclusions):
|
|
1548
2387
|
_purge_asset_ids(conn, [row["asset_id"]])
|
|
1549
2388
|
stats["excluded"] += 1
|
|
1550
2389
|
continue
|
|
1551
|
-
if
|
|
2390
|
+
if not _should_index_file(conn, path, allow_default_skip_override=allow_default_skip_override):
|
|
1552
2391
|
_purge_asset_ids(conn, [row["asset_id"]])
|
|
1553
2392
|
stats["excluded"] += 1
|
|
1554
2393
|
continue
|
|
@@ -1567,7 +2406,7 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
|
|
|
1567
2406
|
_record_scan_error(conn, stats, path, "live_reconcile", exc)
|
|
1568
2407
|
continue
|
|
1569
2408
|
if fingerprint != row["quick_fingerprint"]:
|
|
1570
|
-
_, changed, state = _upsert_asset(conn, int(row["root_id"] or 0), file_path, seen_at, int(row["depth"] or 2))
|
|
2409
|
+
_, changed, state = _upsert_asset(conn, int(row["root_id"] or 0), file_path, seen_at, int(row["depth"] or 2), allow_default_skip_override=allow_default_skip_override)
|
|
1571
2410
|
if changed:
|
|
1572
2411
|
stats["modified"] += 1
|
|
1573
2412
|
if state != "ok":
|
|
@@ -1616,6 +2455,7 @@ def _scan_known_directory(
|
|
|
1616
2455
|
*,
|
|
1617
2456
|
file_limit: int,
|
|
1618
2457
|
dir_limit: int,
|
|
2458
|
+
allow_default_skip_override: bool = False,
|
|
1619
2459
|
) -> None:
|
|
1620
2460
|
stack = [directory]
|
|
1621
2461
|
seen_at = now()
|
|
@@ -1626,7 +2466,7 @@ def _scan_known_directory(
|
|
|
1626
2466
|
_mark_dir_subtree_deleted(conn, str(current), seen_at)
|
|
1627
2467
|
stats["excluded_dirs"] += 1
|
|
1628
2468
|
continue
|
|
1629
|
-
if current != directory and should_skip_tree(str(current)):
|
|
2469
|
+
if current != directory and should_skip_tree(str(current)) and not allow_default_skip_override:
|
|
1630
2470
|
continue
|
|
1631
2471
|
try:
|
|
1632
2472
|
st = current.stat()
|
|
@@ -1648,7 +2488,7 @@ def _scan_known_directory(
|
|
|
1648
2488
|
if entry.is_symlink():
|
|
1649
2489
|
continue
|
|
1650
2490
|
if entry.is_dir():
|
|
1651
|
-
if should_skip_tree(str(entry)):
|
|
2491
|
+
if should_skip_tree(str(entry)) and not allow_default_skip_override:
|
|
1652
2492
|
continue
|
|
1653
2493
|
changed, _ = _upsert_dir(conn, root_id, entry, seen_at)
|
|
1654
2494
|
seen_dirs.add(norm_path(entry))
|
|
@@ -1656,12 +2496,12 @@ def _scan_known_directory(
|
|
|
1656
2496
|
stack.append(entry)
|
|
1657
2497
|
continue
|
|
1658
2498
|
if entry.is_file():
|
|
1659
|
-
if
|
|
2499
|
+
if not _should_index_file(conn, entry, allow_default_skip_override=allow_default_skip_override):
|
|
1660
2500
|
continue
|
|
1661
2501
|
seen_files.add(norm_path(entry))
|
|
1662
2502
|
if stats["files_scanned"] >= file_limit:
|
|
1663
2503
|
continue
|
|
1664
|
-
_, changed, state = _upsert_asset(conn, root_id, entry, seen_at, root_depth)
|
|
2504
|
+
_, changed, state = _upsert_asset(conn, root_id, entry, seen_at, root_depth, allow_default_skip_override=allow_default_skip_override)
|
|
1665
2505
|
stats["files_scanned"] += 1
|
|
1666
2506
|
if changed:
|
|
1667
2507
|
stats["files_changed"] += 1
|
|
@@ -1691,7 +2531,7 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
|
|
|
1691
2531
|
return stats
|
|
1692
2532
|
rows = conn.execute(
|
|
1693
2533
|
"""
|
|
1694
|
-
SELECT d.dir_id, d.path, d.quick_fingerprint, d.root_id, r.root_path, r.depth
|
|
2534
|
+
SELECT d.dir_id, d.path, d.quick_fingerprint, d.root_id, r.root_path, r.depth, r.source
|
|
1695
2535
|
FROM local_index_dirs d
|
|
1696
2536
|
LEFT JOIN local_index_roots r ON r.id = d.root_id
|
|
1697
2537
|
WHERE d.status='active'
|
|
@@ -1705,11 +2545,12 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
|
|
|
1705
2545
|
stats["checked"] += 1
|
|
1706
2546
|
dir_path = Path(row["path"])
|
|
1707
2547
|
root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
|
|
2548
|
+
allow_default_skip_override = _root_allows_default_skip_override(dict(row))
|
|
1708
2549
|
if _is_excluded(str(dir_path), exclusions):
|
|
1709
2550
|
stats["files_deleted"] += _mark_dir_subtree_deleted(conn, str(dir_path), seen_at)
|
|
1710
2551
|
stats["excluded_dirs"] += 1
|
|
1711
2552
|
continue
|
|
1712
|
-
if should_skip_tree(str(dir_path)):
|
|
2553
|
+
if should_skip_tree(str(dir_path)) and not allow_default_skip_override:
|
|
1713
2554
|
stats["files_deleted"] += _purge_dir_subtree(conn, str(dir_path))
|
|
1714
2555
|
stats["excluded_dirs"] += 1
|
|
1715
2556
|
continue
|
|
@@ -1737,6 +2578,7 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
|
|
|
1737
2578
|
stats,
|
|
1738
2579
|
file_limit=file_limit,
|
|
1739
2580
|
dir_limit=dir_limit,
|
|
2581
|
+
allow_default_skip_override=allow_default_skip_override,
|
|
1740
2582
|
)
|
|
1741
2583
|
else:
|
|
1742
2584
|
conn.execute("UPDATE local_index_dirs SET updated_at=? WHERE dir_id=?", (seen_at, row["dir_id"]))
|
|
@@ -1750,6 +2592,7 @@ def reconcile_live_changes(
|
|
|
1750
2592
|
file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
|
|
1751
2593
|
) -> dict:
|
|
1752
2594
|
conn = _conn()
|
|
2595
|
+
seed_core_file_type_rules(conn)
|
|
1753
2596
|
if _is_paused():
|
|
1754
2597
|
return {"ok": True, "paused": True, "assets": {}, "dirs": {}}
|
|
1755
2598
|
exclusions = [row["path"] for row in list_exclusions(readonly=False)]
|
|
@@ -1781,6 +2624,7 @@ def reconcile_live_changes(
|
|
|
1781
2624
|
|
|
1782
2625
|
def scan_once(*, limit: int | None = None) -> dict:
|
|
1783
2626
|
conn = _conn()
|
|
2627
|
+
seed_core_file_type_rules(conn)
|
|
1784
2628
|
if _is_paused():
|
|
1785
2629
|
log_event("info", "scan_skipped_paused", "Local memory scan skipped because indexing is paused")
|
|
1786
2630
|
return {"ok": True, "paused": True, "roots": 0, "seen": 0, "changed": 0, "errors": 0, "partial": False}
|
|
@@ -1792,8 +2636,9 @@ def scan_once(*, limit: int | None = None) -> dict:
|
|
|
1792
2636
|
for root in roots:
|
|
1793
2637
|
root_path = Path(root["root_path"]).expanduser()
|
|
1794
2638
|
root_id = int(root["id"])
|
|
2639
|
+
allow_default_skip_override = _root_allows_default_skip_override(dict(root))
|
|
1795
2640
|
root_initial_complete = _root_initial_scan_complete(conn, dict(root))
|
|
1796
|
-
if should_skip_tree(str(root_path)) and not _allow_explicit_blocked_root(str(root_path)):
|
|
2641
|
+
if should_skip_tree(str(root_path)) and not allow_default_skip_override and not _allow_explicit_blocked_root(str(root_path)):
|
|
1797
2642
|
conn.execute(
|
|
1798
2643
|
"UPDATE local_index_roots SET status='removed', last_scan_at=?, updated_at=? WHERE id=?",
|
|
1799
2644
|
(now(), now(), root_id),
|
|
@@ -1823,8 +2668,16 @@ def scan_once(*, limit: int | None = None) -> dict:
|
|
|
1823
2668
|
start_after=str(checkpoint["current_path"] or ""),
|
|
1824
2669
|
seen_at=cycle_started_at,
|
|
1825
2670
|
stats=totals,
|
|
2671
|
+
allow_default_skip_override=allow_default_skip_override,
|
|
1826
2672
|
):
|
|
1827
|
-
asset_id, changed, state = _upsert_asset(
|
|
2673
|
+
asset_id, changed, state = _upsert_asset(
|
|
2674
|
+
conn,
|
|
2675
|
+
root_id,
|
|
2676
|
+
file_path,
|
|
2677
|
+
cycle_started_at,
|
|
2678
|
+
int(root["depth"] or 2),
|
|
2679
|
+
allow_default_skip_override=allow_default_skip_override,
|
|
2680
|
+
)
|
|
1828
2681
|
last_seen_path = norm_path(file_path)
|
|
1829
2682
|
totals["seen"] += 1
|
|
1830
2683
|
seen_for_root += 1
|
|
@@ -2396,7 +3249,7 @@ def process_jobs(*, limit: int = 100) -> dict:
|
|
|
2396
3249
|
path=row["path"],
|
|
2397
3250
|
phase=job_type,
|
|
2398
3251
|
error_code=type(exc).__name__,
|
|
2399
|
-
user_message="
|
|
3252
|
+
user_message="Some files could not be read",
|
|
2400
3253
|
technical_detail=str(exc),
|
|
2401
3254
|
retryable=not terminal,
|
|
2402
3255
|
)
|
|
@@ -2970,6 +3823,7 @@ def _status_from_conn(conn, *, readonly: bool = False) -> dict:
|
|
|
2970
3823
|
"volumes": volumes,
|
|
2971
3824
|
"roots": roots,
|
|
2972
3825
|
"exclusions": _list_exclusions_conn(conn),
|
|
3826
|
+
"file_types": _shape_file_type_rules(_list_file_type_rules_conn(conn)),
|
|
2973
3827
|
"problems": problems,
|
|
2974
3828
|
"permissions": [],
|
|
2975
3829
|
"models": model_status()["models"],
|