nexo-brain 7.20.3 → 7.20.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +5 -1
- package/package.json +1 -1
- package/src/doctor/providers/runtime.py +12 -5
- package/src/local_context/api.py +196 -18
- package/src/local_context/extractors.py +20 -0
- package/src/local_context/privacy.py +194 -10
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.20.
|
|
3
|
+
"version": "7.20.4",
|
|
4
4
|
"description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "NEXO Brain",
|
package/README.md
CHANGED
|
@@ -18,7 +18,11 @@
|
|
|
18
18
|
|
|
19
19
|
[Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
|
|
20
20
|
|
|
21
|
-
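Version `7.20.3` is the current packaged-runtime line. Patch release over v7.20.2 — installer DMG volumes are no longer added as local-memory roots, removed roots purge stale payloads, and doctor can repair removed-root residue.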
|
|
21
|
+
Version `7.20.4` is the current packaged-runtime line. Patch release over v7.20.3 — Local Context now blocks private dotfiles, hidden project folders and secret-bearing content before chunks, embeddings, graph relations or agent context are created.
|
|
22
|
+
|
|
23
|
+
Previously in `7.20.3`: patch release over v7.20.2 — installer DMG volumes are no longer added as local-memory roots, removed roots purge stale payloads, and doctor can repair removed-root residue.
|
|
24
|
+
|
|
25
|
+
Previously in `7.20.2`: patch release over v7.20.1 — Local Context now requeues stalled work, reports real macOS/Windows background-service health, records scan errors and preserves Windows drive roots.
|
|
22
26
|
|
|
23
27
|
Previously in `7.20.1`: patch release over v7.20.0 — the Local Context service now recovers from orphaned locks and mixed-version cycle failures instead of leaving the background index stuck.
|
|
24
28
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.20.
|
|
3
|
+
"version": "7.20.4",
|
|
4
4
|
"mcpName": "io.github.wazionapps/nexo",
|
|
5
5
|
"description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
|
|
6
6
|
"homepage": "https://nexo-brain.com",
|
|
@@ -3840,16 +3840,23 @@ def check_local_index_hygiene(fix: bool = False) -> DoctorCheck:
|
|
|
3840
3840
|
result = local_context_api.local_index_hygiene(fix=fix)
|
|
3841
3841
|
residue = result.get("residue") or {}
|
|
3842
3842
|
cleanup = result.get("cleanup") or {}
|
|
3843
|
+
privacy = result.get("privacy") or {}
|
|
3844
|
+
privacy_residue = privacy.get("residue") or {}
|
|
3845
|
+
privacy_cleanup = privacy.get("cleanup") or {}
|
|
3843
3846
|
suspect_roots = [str(path) for path in result.get("removed_roots") or []]
|
|
3844
3847
|
residue_total = sum(int(residue.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))
|
|
3845
3848
|
cleanup_total = sum(int(cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))
|
|
3849
|
+
privacy_residue_total = sum(int(privacy_residue.get(key, 0) or 0) for key in ("assets", "dirs", "content_secret_assets"))
|
|
3850
|
+
privacy_cleanup_total = sum(int(privacy_cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "chunks", "embeddings", "entities", "relations", "versions", "dirs", "content_secret_assets"))
|
|
3846
3851
|
evidence = [
|
|
3847
3852
|
"suspect_installer_roots=" + str(len(suspect_roots)),
|
|
3848
3853
|
"residue=" + json.dumps(residue, sort_keys=True),
|
|
3849
3854
|
"cleanup=" + json.dumps(cleanup, sort_keys=True),
|
|
3855
|
+
"privacy_residue=" + json.dumps(privacy_residue, sort_keys=True),
|
|
3856
|
+
"privacy_cleanup=" + json.dumps(privacy_cleanup, sort_keys=True),
|
|
3850
3857
|
]
|
|
3851
3858
|
evidence.extend(f"root={path}" for path in suspect_roots[:5])
|
|
3852
|
-
if residue_total == 0 and not suspect_roots:
|
|
3859
|
+
if residue_total == 0 and privacy_residue_total == 0 and not suspect_roots:
|
|
3853
3860
|
return DoctorCheck(
|
|
3854
3861
|
id="runtime.local_index_hygiene",
|
|
3855
3862
|
tier="runtime",
|
|
@@ -3868,17 +3875,17 @@ def check_local_index_hygiene(fix: bool = False) -> DoctorCheck:
|
|
|
3868
3875
|
summary="Local memory index hygiene repaired",
|
|
3869
3876
|
evidence=evidence,
|
|
3870
3877
|
repair_plan=[],
|
|
3871
|
-
fixed=cleanup_total > 0 or bool(suspect_roots),
|
|
3878
|
+
fixed=cleanup_total > 0 or privacy_cleanup_total > 0 or bool(suspect_roots),
|
|
3872
3879
|
)
|
|
3873
3880
|
return DoctorCheck(
|
|
3874
3881
|
id="runtime.local_index_hygiene",
|
|
3875
3882
|
tier="runtime",
|
|
3876
3883
|
status="degraded",
|
|
3877
3884
|
severity="warn",
|
|
3878
|
-
summary="Local memory index has stale
|
|
3885
|
+
summary="Local memory index has stale or private residue",
|
|
3879
3886
|
evidence=evidence,
|
|
3880
|
-
repair_plan=["Run `nexo doctor --tier runtime --fix` to purge stale local memory roots and
|
|
3881
|
-
escalation_prompt="Local memory
|
|
3887
|
+
repair_plan=["Run `nexo doctor --tier runtime --fix` to purge stale local memory roots and private local-memory residue"],
|
|
3888
|
+
escalation_prompt="Local memory may contain stale or private index payloads that should be purged before indexing continues.",
|
|
3882
3889
|
)
|
|
3883
3890
|
except Exception as exc:
|
|
3884
3891
|
return DoctorCheck(
|
package/src/local_context/api.py
CHANGED
|
@@ -14,9 +14,9 @@ from db import get_db, init_db
|
|
|
14
14
|
from db._schema import run_migrations
|
|
15
15
|
|
|
16
16
|
from . import embeddings
|
|
17
|
-
from .extractors import chunk_text, entities, extract_text, summarize
|
|
17
|
+
from .extractors import chunk_text, contains_secret, entities, extract_text, summarize
|
|
18
18
|
from .logging import log_event, tail
|
|
19
|
-
from .privacy import classify_path, should_extract, should_skip_tree
|
|
19
|
+
from .privacy import classify_path, is_queryable_path, should_extract, should_skip_file, should_skip_tree
|
|
20
20
|
from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
|
|
21
21
|
|
|
22
22
|
LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
|
|
@@ -41,6 +41,9 @@ def _conn():
|
|
|
41
41
|
def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> dict:
|
|
42
42
|
conn = _conn()
|
|
43
43
|
root_path = norm_path(path)
|
|
44
|
+
if should_skip_tree(root_path):
|
|
45
|
+
log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
|
|
46
|
+
return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
|
|
44
47
|
depth_value = 2 if depth is None else int(depth)
|
|
45
48
|
conn.execute(
|
|
46
49
|
"""
|
|
@@ -220,6 +223,7 @@ def _purge_removed_root_payloads(conn, *, root_paths: list[str] | None = None) -
|
|
|
220
223
|
for table in ("local_embeddings", "local_chunks", "local_entities", "local_asset_versions"):
|
|
221
224
|
conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({asset_subquery})", tuple(params))
|
|
222
225
|
conn.execute(f"DELETE FROM local_relations WHERE source_asset_id IN ({asset_subquery})", tuple(params))
|
|
226
|
+
conn.execute(f"DELETE FROM local_relations WHERE target_asset_id IN ({asset_subquery})", tuple(params))
|
|
223
227
|
conn.execute(f"DELETE FROM local_relations WHERE target_ref IN ({asset_subquery})", tuple(params))
|
|
224
228
|
conn.execute(f"DELETE FROM local_index_jobs WHERE asset_id IN ({asset_subquery})", tuple(params))
|
|
225
229
|
conn.execute(f"DELETE FROM local_index_errors WHERE asset_id IN ({asset_subquery})", tuple(params))
|
|
@@ -235,12 +239,136 @@ def _purge_removed_root_payloads(conn, *, root_paths: list[str] | None = None) -
|
|
|
235
239
|
return counts
|
|
236
240
|
|
|
237
241
|
|
|
242
|
+
def _purge_asset_ids(conn, asset_ids: list[str]) -> dict:
|
|
243
|
+
unique_ids = [asset_id for asset_id in dict.fromkeys(asset_ids) if asset_id]
|
|
244
|
+
counts = {"assets": len(unique_ids), "jobs": 0, "errors": 0, "chunks": 0, "embeddings": 0, "entities": 0, "relations": 0, "versions": 0}
|
|
245
|
+
if not unique_ids:
|
|
246
|
+
return counts
|
|
247
|
+
for start in range(0, len(unique_ids), 500):
|
|
248
|
+
batch = unique_ids[start:start + 500]
|
|
249
|
+
placeholders = ",".join("?" for _ in batch)
|
|
250
|
+
for key, table in (
|
|
251
|
+
("embeddings", "local_embeddings"),
|
|
252
|
+
("chunks", "local_chunks"),
|
|
253
|
+
("entities", "local_entities"),
|
|
254
|
+
("versions", "local_asset_versions"),
|
|
255
|
+
("jobs", "local_index_jobs"),
|
|
256
|
+
("errors", "local_index_errors"),
|
|
257
|
+
):
|
|
258
|
+
counts[key] += int(conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
259
|
+
counts["relations"] += int(conn.execute(f"DELETE FROM local_relations WHERE source_asset_id IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
260
|
+
counts["relations"] += int(conn.execute(f"DELETE FROM local_relations WHERE target_asset_id IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
261
|
+
counts["relations"] += int(conn.execute(f"DELETE FROM local_relations WHERE target_ref IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
262
|
+
conn.execute(f"DELETE FROM local_assets WHERE asset_id IN ({placeholders})", tuple(batch))
|
|
263
|
+
return counts
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _privacy_unsafe_asset_ids(conn) -> list[str]:
    """Return ids of indexed assets that the privacy rules do not allow.

    An asset is flagged when ``should_skip_file`` rejects its stored path or
    when its stored ``privacy_class`` is one of the blocked classes below.
    Every row of ``local_assets`` is inspected.
    """
    blocked_classes = {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}
    records = conn.execute("SELECT asset_id, path, privacy_class FROM local_assets").fetchall()
    return [
        str(record["asset_id"])
        for record in records
        if should_skip_file(str(record["path"] or ""))
        or str(record["privacy_class"] or "") in blocked_classes
    ]
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _privacy_unsafe_dir_ids(conn) -> list[str]:
    """Return ids of tracked directories whose path ``should_skip_tree`` rejects."""
    flagged: list[str] = []
    for record in conn.execute("SELECT dir_id, path FROM local_index_dirs").fetchall():
        if should_skip_tree(str(record["path"] or "")):
            flagged.append(str(record["dir_id"]))
    return flagged
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _content_secret_asset_ids(conn) -> list[str]:
    """Return sorted ids of active, normal-class assets with a secret-looking chunk.

    Chunks are tested one at a time with ``contains_secret``; once an asset is
    flagged its remaining chunks are not scanned again.
    """
    cursor = conn.execute(
        """
        SELECT c.asset_id, c.text
        FROM local_chunks c
        JOIN local_assets a ON a.asset_id=c.asset_id
        WHERE a.status='active'
          AND COALESCE(a.privacy_class, 'normal')='normal'
        ORDER BY c.asset_id, c.chunk_index
        """
    )
    flagged: set[str] = set()
    for record in cursor.fetchall():
        candidate = str(record["asset_id"])
        if candidate not in flagged and contains_secret(str(record["text"] or "")):
            flagged.add(candidate)
    return sorted(flagged)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _mark_content_secret_assets(conn, asset_ids: list[str]) -> int:
    """Strip extracted content from assets flagged for secrets, keeping the asset rows.

    For each batch of up to 500 distinct non-empty ids this:
      * deletes embeddings, chunks, entities and relations touching the asset,
      * marks all of the asset's index jobs ``done`` with
        ``last_error_code='content_secret_blocked'``,
      * blanks the stored version summaries and replaces their metadata,
      * reclassifies the asset as ``content_secret_inventory_only`` at depth 1.

    The caller is responsible for committing. Returns the number of distinct
    non-empty ids processed.
    """
    targets = [item for item in dict.fromkeys(asset_ids) if item]
    for offset in range(0, len(targets), 500):
        group = tuple(targets[offset:offset + 500])
        marks = ",".join(["?"] * len(group))
        for table in ("local_embeddings", "local_chunks", "local_entities"):
            conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({marks})", group)
        for column in ("source_asset_id", "target_asset_id", "target_ref"):
            conn.execute(f"DELETE FROM local_relations WHERE {column} IN ({marks})", group)
        # Close out the jobs so the worker does not re-extract the file.
        conn.execute(
            f"""
            UPDATE local_index_jobs
            SET status='done', last_error_code='content_secret_blocked', updated_at=?
            WHERE asset_id IN ({marks})
            """,
            (now(), *group),
        )
        conn.execute(
            f"""
            UPDATE local_asset_versions
            SET summary='', metadata_json=?
            WHERE asset_id IN ({marks})
            """,
            (json_dumps({"content_blocked": "secret_pattern"}), *group),
        )
        conn.execute(
            f"""
            UPDATE local_assets
            SET privacy_class='content_secret_inventory_only',
                depth=1,
                depth_reason='content_secret',
                phase='privacy_blocked',
                updated_at=?
            WHERE asset_id IN ({marks})
            """,
            (now(), *group),
        )
    return len(targets)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def local_index_privacy_hygiene(*, fix: bool = False) -> dict:
    """Report, and optionally purge, index rows that violate the privacy rules.

    ``residue`` counts what was found before any change: privacy-unsafe assets,
    privacy-unsafe directories, and assets whose chunks contain secret-looking
    text. With ``fix=True`` unsafe assets and directories are deleted,
    secret-bearing assets are stripped via ``_mark_content_secret_assets``,
    the transaction is committed and, when anything was found, a
    ``privacy_hygiene_repaired`` event is logged.

    Returns ``{"ok": True, "fix": fix, "residue": ..., "cleanup": ...}``.
    """
    conn = _conn()
    unsafe_assets = _privacy_unsafe_asset_ids(conn)
    unsafe_dirs = _privacy_unsafe_dir_ids(conn)
    secret_assets = _content_secret_asset_ids(conn)
    residue = {"assets": len(unsafe_assets), "dirs": len(unsafe_dirs), "content_secret_assets": len(secret_assets)}
    cleanup = dict.fromkeys(
        ("assets", "jobs", "errors", "chunks", "embeddings", "entities", "relations", "versions", "dirs", "content_secret_assets"),
        0,
    )
    if not fix:
        return {"ok": True, "fix": fix, "residue": residue, "cleanup": cleanup}

    cleanup.update(_purge_asset_ids(conn, unsafe_assets))
    for offset in range(0, len(unsafe_dirs), 500):
        group = tuple(unsafe_dirs[offset:offset + 500])
        marks = ",".join(["?"] * len(group))
        cursor = conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({marks})", group)
        cleanup["dirs"] += int(cursor.rowcount or 0)
    cleanup["content_secret_assets"] = _mark_content_secret_assets(conn, secret_assets)
    conn.commit()
    if unsafe_assets or unsafe_dirs or secret_assets:
        log_event("warn", "privacy_hygiene_repaired", "Local memory privacy hygiene repaired", cleanup=cleanup)
    return {"ok": True, "fix": fix, "residue": residue, "cleanup": cleanup}
|
|
364
|
+
|
|
365
|
+
|
|
238
366
|
def local_index_hygiene(*, fix: bool = False) -> dict:
|
|
239
367
|
conn = _conn()
|
|
240
368
|
removed_paths: list[str] = []
|
|
241
369
|
for row in conn.execute("SELECT id, root_path FROM local_index_roots").fetchall():
|
|
242
370
|
path = str(row["root_path"] or "")
|
|
243
|
-
if _should_skip_mounted_root(Path(path)):
|
|
371
|
+
if _should_skip_mounted_root(Path(path)) or should_skip_tree(path):
|
|
244
372
|
removed_paths.append(path)
|
|
245
373
|
if fix:
|
|
246
374
|
conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE id=?", (now(), row["id"]))
|
|
@@ -249,9 +377,10 @@ def local_index_hygiene(*, fix: bool = False) -> dict:
|
|
|
249
377
|
if fix:
|
|
250
378
|
cleanup = _purge_removed_root_payloads(conn)
|
|
251
379
|
conn.commit()
|
|
380
|
+
privacy = local_index_privacy_hygiene(fix=fix)
|
|
252
381
|
if fix and (removed_paths or any(int(cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))):
|
|
253
382
|
log_event("info", "index_hygiene_repaired", "Local memory index hygiene repaired", roots=[redact_path(path) for path in removed_paths], cleanup=cleanup)
|
|
254
|
-
return {"ok": True, "fix": fix, "removed_roots": removed_paths, "residue": before, "cleanup": cleanup}
|
|
383
|
+
return {"ok": True, "fix": fix, "removed_roots": removed_paths, "residue": before, "cleanup": cleanup, "privacy": privacy}
|
|
255
384
|
|
|
256
385
|
|
|
257
386
|
def repair_index_hygiene() -> dict:
|
|
@@ -424,6 +553,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
|
|
|
424
553
|
raw_path = str(path)
|
|
425
554
|
normalized = norm_path(raw_path)
|
|
426
555
|
asset_id = stable_id("asset", normalized)
|
|
556
|
+
if should_skip_file(normalized):
|
|
557
|
+
return asset_id, False, "skipped"
|
|
427
558
|
perm = _permission_state(path)
|
|
428
559
|
depth, privacy_class, depth_reason = classify_path(normalized)
|
|
429
560
|
depth = min(depth, root_depth)
|
|
@@ -546,6 +677,20 @@ def _mark_dir_subtree_deleted(conn, dir_path: str, deleted_at: float | None = No
|
|
|
546
677
|
return len(rows)
|
|
547
678
|
|
|
548
679
|
|
|
680
|
+
def _purge_dir_subtree(conn, dir_path: str) -> int:
    """Hard-delete every indexed asset, directory and error row at or below *dir_path*.

    Rows are selected by exact match on the normalized path or by a ``LIKE``
    match on the prefix produced by ``_path_prefix``. The caller is responsible
    for committing. Returns the number of asset rows that were selected.
    """
    root = norm_path(dir_path)
    # NOTE(review): "%" and "_" inside the path act as LIKE wildcards here
    # unless _path_prefix escapes them -- confirm against that helper.
    match_args = (root, _path_prefix(root) + "%")
    selected = conn.execute(
        "SELECT asset_id FROM local_assets WHERE path=? OR path LIKE ?",
        match_args,
    ).fetchall()
    asset_ids = [str(record["asset_id"]) for record in selected]
    _purge_asset_ids(conn, asset_ids)
    for table in ("local_index_dirs", "local_index_errors"):
        conn.execute(f"DELETE FROM {table} WHERE path=? OR path LIKE ?", match_args)
    return len(asset_ids)
|
|
692
|
+
|
|
693
|
+
|
|
549
694
|
def _record_index_error(
|
|
550
695
|
conn,
|
|
551
696
|
*,
|
|
@@ -651,6 +796,8 @@ def _iter_files(
|
|
|
651
796
|
continue
|
|
652
797
|
if entry.is_file():
|
|
653
798
|
normalized = norm_path(entry)
|
|
799
|
+
if should_skip_file(normalized):
|
|
800
|
+
continue
|
|
654
801
|
if start_after_norm and normalized <= start_after_norm:
|
|
655
802
|
continue
|
|
656
803
|
yield entry
|
|
@@ -729,7 +876,11 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
|
|
|
729
876
|
path = str(row["path"])
|
|
730
877
|
root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
|
|
731
878
|
if _is_excluded(path, exclusions):
|
|
732
|
-
|
|
879
|
+
_purge_asset_ids(conn, [row["asset_id"]])
|
|
880
|
+
stats["excluded"] += 1
|
|
881
|
+
continue
|
|
882
|
+
if should_skip_file(path):
|
|
883
|
+
_purge_asset_ids(conn, [row["asset_id"]])
|
|
733
884
|
stats["excluded"] += 1
|
|
734
885
|
continue
|
|
735
886
|
if root_path is not None and not root_path.exists():
|
|
@@ -836,6 +987,8 @@ def _scan_known_directory(
|
|
|
836
987
|
stack.append(entry)
|
|
837
988
|
continue
|
|
838
989
|
if entry.is_file():
|
|
990
|
+
if should_skip_file(str(entry)):
|
|
991
|
+
continue
|
|
839
992
|
seen_files.add(norm_path(entry))
|
|
840
993
|
if stats["files_scanned"] >= file_limit:
|
|
841
994
|
continue
|
|
@@ -843,7 +996,7 @@ def _scan_known_directory(
|
|
|
843
996
|
stats["files_scanned"] += 1
|
|
844
997
|
if changed:
|
|
845
998
|
stats["files_changed"] += 1
|
|
846
|
-
if state
|
|
999
|
+
if state not in {"ok", "skipped"}:
|
|
847
1000
|
stats["errors"] += 1
|
|
848
1001
|
except Exception as exc:
|
|
849
1002
|
_record_scan_error(conn, stats, str(entry), "live_reconcile", exc)
|
|
@@ -887,6 +1040,10 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
|
|
|
887
1040
|
stats["files_deleted"] += _mark_dir_subtree_deleted(conn, str(dir_path), seen_at)
|
|
888
1041
|
stats["excluded_dirs"] += 1
|
|
889
1042
|
continue
|
|
1043
|
+
if should_skip_tree(str(dir_path)):
|
|
1044
|
+
stats["files_deleted"] += _purge_dir_subtree(conn, str(dir_path))
|
|
1045
|
+
stats["excluded_dirs"] += 1
|
|
1046
|
+
continue
|
|
890
1047
|
if root_path is not None and not root_path.exists():
|
|
891
1048
|
stats["offline"] += 1
|
|
892
1049
|
continue
|
|
@@ -966,6 +1123,12 @@ def scan_once(*, limit: int | None = None) -> dict:
|
|
|
966
1123
|
for root in roots:
|
|
967
1124
|
root_path = Path(root["root_path"]).expanduser()
|
|
968
1125
|
root_id = int(root["id"])
|
|
1126
|
+
if should_skip_tree(str(root_path)):
|
|
1127
|
+
conn.execute(
|
|
1128
|
+
"UPDATE local_index_roots SET status='removed', last_scan_at=?, updated_at=? WHERE id=?",
|
|
1129
|
+
(now(), now(), root_id),
|
|
1130
|
+
)
|
|
1131
|
+
continue
|
|
969
1132
|
if not root_path.exists():
|
|
970
1133
|
conn.execute(
|
|
971
1134
|
"UPDATE local_index_roots SET status='offline', last_scan_at=?, updated_at=? WHERE id=?",
|
|
@@ -997,7 +1160,7 @@ def scan_once(*, limit: int | None = None) -> dict:
|
|
|
997
1160
|
seen_for_root += 1
|
|
998
1161
|
if changed:
|
|
999
1162
|
totals["changed"] += 1
|
|
1000
|
-
if state
|
|
1163
|
+
if state not in {"ok", "skipped"}:
|
|
1001
1164
|
totals["errors"] += 1
|
|
1002
1165
|
partial_root = bool(limit and seen_for_root >= limit)
|
|
1003
1166
|
totals["partial"] = bool(totals["partial"] or partial_root)
|
|
@@ -1121,7 +1284,7 @@ def process_jobs(*, limit: int = 100) -> dict:
|
|
|
1121
1284
|
recovered = _requeue_due_jobs(conn)
|
|
1122
1285
|
rows = conn.execute(
|
|
1123
1286
|
"""
|
|
1124
|
-
SELECT j.*, a.path, a.depth, a.status AS asset_status
|
|
1287
|
+
SELECT j.*, a.path, a.depth, a.privacy_class, a.status AS asset_status
|
|
1125
1288
|
FROM local_index_jobs j
|
|
1126
1289
|
JOIN local_assets a ON a.asset_id = j.asset_id
|
|
1127
1290
|
WHERE j.status='pending'
|
|
@@ -1143,9 +1306,24 @@ def process_jobs(*, limit: int = 100) -> dict:
|
|
|
1143
1306
|
try:
|
|
1144
1307
|
if row["asset_status"] != "active":
|
|
1145
1308
|
raise FileNotFoundError(row["path"])
|
|
1309
|
+
if str(row["privacy_class"] or "normal") != "normal":
|
|
1310
|
+
conn.execute(
|
|
1311
|
+
"UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='privacy_blocked' WHERE job_id=?",
|
|
1312
|
+
(now(), job_id),
|
|
1313
|
+
)
|
|
1314
|
+
processed += 1
|
|
1315
|
+
continue
|
|
1146
1316
|
if job_type == "light_extraction":
|
|
1147
1317
|
text, metadata = extract_text(Path(row["path"]))
|
|
1148
1318
|
version_id = _latest_version_id(conn, asset_id)
|
|
1319
|
+
if contains_secret(text):
|
|
1320
|
+
_mark_content_secret_assets(conn, [asset_id])
|
|
1321
|
+
conn.execute(
|
|
1322
|
+
"UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='content_secret_blocked' WHERE job_id=?",
|
|
1323
|
+
(now(), job_id),
|
|
1324
|
+
)
|
|
1325
|
+
processed += 1
|
|
1326
|
+
continue
|
|
1149
1327
|
summary = summarize(text)
|
|
1150
1328
|
conn.execute(
|
|
1151
1329
|
"UPDATE local_asset_versions SET summary=?, metadata_json=? WHERE version_id=?",
|
|
@@ -1202,6 +1380,9 @@ def run_once(
|
|
|
1202
1380
|
live_dir_limit: int = DEFAULT_LIVE_DIR_LIMIT,
|
|
1203
1381
|
live_file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
|
|
1204
1382
|
) -> dict:
|
|
1383
|
+
if _get_state("privacy_hygiene_v2", "0") != "1":
|
|
1384
|
+
local_index_privacy_hygiene(fix=True)
|
|
1385
|
+
_set_state("privacy_hygiene_v2", "1")
|
|
1205
1386
|
if root:
|
|
1206
1387
|
add_root(root)
|
|
1207
1388
|
elif (
|
|
@@ -1680,17 +1861,21 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
|
|
|
1680
1861
|
qvec = embeddings.embed_text(query)
|
|
1681
1862
|
rows = conn.execute(
|
|
1682
1863
|
"""
|
|
1683
|
-
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, v.summary, e.vector_json
|
|
1864
|
+
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
|
|
1684
1865
|
FROM local_chunks c
|
|
1685
1866
|
JOIN local_assets a ON a.asset_id = c.asset_id
|
|
1686
1867
|
LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
|
|
1687
1868
|
LEFT JOIN local_embeddings e ON e.chunk_id = c.chunk_id
|
|
1688
1869
|
WHERE a.status='active'
|
|
1689
|
-
|
|
1870
|
+
AND a.privacy_class='normal'
|
|
1871
|
+
ORDER BY c.created_at DESC
|
|
1872
|
+
LIMIT 5000
|
|
1690
1873
|
"""
|
|
1691
1874
|
).fetchall()
|
|
1692
1875
|
scored = []
|
|
1693
1876
|
for row in rows:
|
|
1877
|
+
if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
|
|
1878
|
+
continue
|
|
1694
1879
|
vector = json_loads(row["vector_json"], [])
|
|
1695
1880
|
score = max(_search_text_score(query, row["text"]), embeddings.cosine(qvec, vector))
|
|
1696
1881
|
if score > 0:
|
|
@@ -1704,7 +1889,6 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
|
|
|
1704
1889
|
if row["asset_id"] not in seen_assets:
|
|
1705
1890
|
assets.append({
|
|
1706
1891
|
"asset_id": row["asset_id"],
|
|
1707
|
-
"path": row["path"],
|
|
1708
1892
|
"display_path": redact_path(row["path"]),
|
|
1709
1893
|
"file_type": row["file_type"],
|
|
1710
1894
|
"score": round(float(score), 4),
|
|
@@ -1798,13 +1982,7 @@ def get_neighbors(asset_id: str, *, limit: int = 30) -> dict:
|
|
|
1798
1982
|
|
|
1799
1983
|
def purge_asset(asset_id: str) -> dict:
    """Delete one asset and all rows referencing it, commit, and log the purge.

    Returns ``{"ok": True, "asset_id": asset_id}``.
    """
    connection = _conn()
    _purge_asset_ids(connection, [asset_id])
    connection.commit()
    log_event("info", "asset_purged", "Asset purged", asset_id=asset_id)
    return dict(ok=True, asset_id=asset_id)
|
|
@@ -32,6 +32,26 @@ TEXT_SUFFIXES = {
|
|
|
32
32
|
".css",
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
+
# Compiled detectors for credential-looking text. A single match on any
# pattern is enough for ``contains_secret`` to reject the text.
SECRET_PATTERNS: tuple[re.Pattern, ...] = (
    # HTTP "Authorization: Bearer <token>" values.
    re.compile(r"\bBearer\s+[A-Za-z0-9._\-~+/]{12,}\b", re.I),
    # "sk-" / "pk-" prefixed API keys, with an optional lowercase sub-prefix.
    re.compile(r"\bsk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
    re.compile(r"\bpk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
    # Provider tokens that use "<prefix>_<body>".
    re.compile(r"\b(ghp|gho|ghu|ghs|ghr|github_pat|glpat|xoxb|xoxp|shpat)_[A-Za-z0-9_]{16,}\b", re.I),
    # AWS access key ids.
    re.compile(r"\b(AKIA|ASIA)[A-Z0-9]{16,}\b"),
    # JWT-shaped values: three dot-separated segments, the first two starting with "ey".
    re.compile(r"\bey[A-Za-z0-9_-]{10,}\.ey[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
    # PEM private-key headers.
    re.compile(r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?PRIVATE KEY-----", re.I),
    # NAME_TOKEN=value / NAME_SECRET: value style assignments.
    re.compile(r"\b([A-Z][A-Z0-9_]*(?:TOKEN|SECRET|KEY|PASSWORD|PASS)\s*[:=]\s*)['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
    re.compile(r"\b(?:api[_-]?key|secret[_-]?key|auth[_-]?token)\s*[:=]\s*['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
    # Quoted password assignments.
    re.compile(r"\b(?:password|passwd|pwd)\s*[:=]\s*['\"][^'\"]{6,}['\"]", re.I),
    # GitLab personal access tokens are written "glpat-<body>" (hyphen, not
    # underscore), so the "<prefix>_<body>" pattern above never matches them.
    re.compile(r"\bglpat-[A-Za-z0-9_\-]{16,}"),
    # Slack tokens are likewise hyphen-delimited: "xoxb-...", "xoxp-...", etc.
    re.compile(r"\bxox[abeprs]-[A-Za-z0-9\-]{16,}"),
)


def contains_secret(text: str) -> bool:
    """Return True when *text* matches any pattern in ``SECRET_PATTERNS``.

    Empty or ``None`` text is never a match. Only the first ``MAX_CHARS``
    characters are scanned.
    """
    if not text:
        return False
    sample = text[:MAX_CHARS]
    return any(pattern.search(sample) for pattern in SECRET_PATTERNS)
|
|
54
|
+
|
|
35
55
|
|
|
36
56
|
def _read_text(path: Path) -> str:
|
|
37
57
|
data = path.read_bytes()[:MAX_TEXT_BYTES]
|
|
@@ -6,21 +6,58 @@ SENSITIVE_FILE_NAMES = {
|
|
|
6
6
|
".env",
|
|
7
7
|
".env.local",
|
|
8
8
|
".env.production",
|
|
9
|
+
".npmrc",
|
|
10
|
+
".pypirc",
|
|
11
|
+
".netrc",
|
|
12
|
+
".boto",
|
|
13
|
+
".pgpass",
|
|
14
|
+
".my.cnf",
|
|
15
|
+
".git-credentials",
|
|
16
|
+
".mcp_publisher_token",
|
|
17
|
+
".mcpregistry_github_token",
|
|
18
|
+
".mcpregistry_registry_token",
|
|
9
19
|
"id_rsa",
|
|
10
20
|
"id_dsa",
|
|
11
21
|
"id_ecdsa",
|
|
12
22
|
"id_ed25519",
|
|
23
|
+
"known_hosts",
|
|
24
|
+
"authorized_keys",
|
|
13
25
|
"cookies.sqlite",
|
|
14
26
|
"login data",
|
|
15
27
|
"keychain-2.db",
|
|
16
28
|
}
|
|
17
29
|
|
|
30
|
+
# Lower-case fragments; a file whose name contains any of them is treated as
# sensitive by is_sensitive_path.
SENSITIVE_NAME_MARKERS = {
    "api_key", "apikey", "auth_token", "bearer",
    "client_secret", "credential", "credentials",
    "oauth", "password", "passwd", "private_key",
    "secret", "token",
}

# Lower-case file suffixes that is_sensitive_path treats as sensitive.
SENSITIVE_SUFFIXES = {".key", ".pem", ".p12", ".pfx", ".kdbx"}
|
|
53
|
+
|
|
18
54
|
SENSITIVE_PARTS = {
|
|
19
55
|
".ssh",
|
|
20
56
|
".gnupg",
|
|
21
57
|
".aws",
|
|
22
58
|
".azure",
|
|
23
59
|
".kube",
|
|
60
|
+
".docker",
|
|
24
61
|
"password",
|
|
25
62
|
"passwords",
|
|
26
63
|
"1password",
|
|
@@ -53,9 +90,64 @@ NOISY_PARTS = {
|
|
|
53
90
|
".parcel-cache",
|
|
54
91
|
".bun",
|
|
55
92
|
".gradle",
|
|
93
|
+
"$tmp",
|
|
56
94
|
"target",
|
|
57
95
|
}
|
|
58
96
|
|
|
97
|
+
# Directory names treated as scratch space when they appear at segment
# index 2 or deeper (see _has_transient_project_part).
TRANSIENT_PARTS = {"temp", "tmp"}

# Lower-case markers for per-user application/profile areas. Entries without
# a "/" are single path parts; entries with a "/" span two parts.
PRIVATE_PROFILE_PARTS = {
    # Agent / tool profile directories.
    ".nexo", ".claude", ".codex", ".gemini", ".cursor",
    # Generic per-user config, cache and package-manager stores.
    ".config", ".local", ".npm", ".yarn", ".pnpm-store", ".ollama", ".docker",
    # Editor state.
    ".vscode", ".idea",
    # Windows profile data.
    "appdata", "application data",
    # macOS ~/Library areas.
    "library/application support",
    "library/containers",
    "library/group containers",
    "library/keychains",
    "library/logs",
    "library/mail",
    "library/messages",
    "library/safari",
    "library/saved application state",
}

# Lower-case hidden file names that is_private_profile_path rejects by name.
PROFILE_HIDDEN_FILE_NAMES = {
    ".aider.chat.history.md", ".aider.input.history",
    ".bash_history", ".bash_profile", ".bashrc",
    ".claude.json", ".codex.json", ".cursorignore",
    ".ds_store",
    ".gitconfig", ".gitignore_global",
    ".lesshst", ".python_history", ".sqlite_history",
    ".viminfo", ".wget-hsts",
    ".zprofile", ".zsh_history", ".zshrc",
}

# Dot-file names exempt from the "hidden file" rule in is_sensitive_path;
# currently empty.
ALLOWED_HIDDEN_FILE_NAMES = set()
|
|
150
|
+
|
|
59
151
|
SYSTEM_PARTS = {
|
|
60
152
|
"system volume information",
|
|
61
153
|
"$recycle.bin",
|
|
@@ -69,34 +161,126 @@ SYSTEM_PARTS = {
|
|
|
69
161
|
}
|
|
70
162
|
|
|
71
163
|
|
|
72
|
-
def
|
|
73
|
-
|
|
164
|
+
def _normalized(path: str) -> str:
|
|
165
|
+
return str(Path(path)).replace("\\", "/").lower()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _parts(path: str) -> set[str]:
    """Return the set of non-empty, lower-case segments of *path*.

    ":" is treated as a separator in addition to "/".
    """
    segments = _normalized(path).replace(":", "/").split("/")
    return set(filter(None, segments))
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _contains_path_marker(lowered: str, markers: set[str]) -> bool:
|
|
173
|
+
return any(marker in lowered for marker in markers)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _has_transient_project_part(path: str) -> bool:
    """Return True when a ``TRANSIENT_PARTS`` name appears at segment index 2 or later.

    Segments come from splitting the normalized path on "/" and ":" without
    dropping empty strings, so the leading "/" of an absolute path occupies
    index 0 and a first-level directory sits at index 1.
    """
    segments = _normalized(path).replace(":", "/").split("/")
    return any(segment in TRANSIENT_PARTS for segment in segments[2:])
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _has_hidden_dir_part(path: str) -> bool:
    """Return True when any segment before the last one starts with ".".

    The final segment is deliberately not inspected, and the special names
    "." and ".." do not count as hidden.
    """
    segments = [segment for segment in _normalized(path).replace(":", "/").split("/") if segment]
    for segment in segments[:-1]:
        if segment.startswith(".") and segment not in {".", ".."}:
            return True
    return False
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _is_home_hidden_path(path: str) -> bool:
|
|
190
|
+
try:
|
|
191
|
+
p = Path(path).expanduser()
|
|
192
|
+
home = Path.home().expanduser()
|
|
193
|
+
rel = p.relative_to(home)
|
|
194
|
+
except Exception:
|
|
195
|
+
return False
|
|
196
|
+
return bool(rel.parts) and rel.parts[0].startswith(".")
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def is_sensitive_path(path: str) -> bool:
    """Return True when *path* looks like it holds credentials or private state.

    A path is sensitive when any of the following holds (all comparisons are
    made on lower-cased text):
      * the file name is listed in ``SENSITIVE_FILE_NAMES``;
      * the file name starts with "." and is not in ``ALLOWED_HIDDEN_FILE_NAMES``;
      * the file name starts with "~$" or ends with ".tmp", ".swp" or ".swo";
      * the suffix is in ``SENSITIVE_SUFFIXES``;
      * a path segment equals an entry of ``SENSITIVE_PARTS``;
      * the file name contains an entry of ``SENSITIVE_NAME_MARKERS``;
      * the whole path contains an entry of ``SENSITIVE_PARTS`` as a substring.
    """
    target = Path(path)
    name = target.name.lower()
    hidden_file = name.startswith(".") and name not in ALLOWED_HIDDEN_FILE_NAMES
    if name in SENSITIVE_FILE_NAMES or hidden_file:
        return True
    if name.startswith("~$") or name.endswith((".tmp", ".swp", ".swo")):
        return True
    if target.suffix.lower() in SENSITIVE_SUFFIXES:
        return True
    if _parts(path) & SENSITIVE_PARTS:
        return True
    # The stem is always a prefix of the name, so testing the name alone
    # covers both.
    for marker in SENSITIVE_NAME_MARKERS:
        if marker in name:
            return True
    return _contains_path_marker(_normalized(path), SENSITIVE_PARTS)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def is_private_profile_path(path: str) -> bool:
    """Return True when *path* lies inside a per-user application/profile area.

    A path qualifies when:
      * one of its segments equals an entry of ``PRIVATE_PROFILE_PARTS``;
      * it contains a multi-segment entry of ``PRIVATE_PROFILE_PARTS``
        (e.g. "library/mail") aligned on "/" boundaries;
      * its file name is listed in ``PROFILE_HIDDEN_FILE_NAMES``; or
      * it sits under a dot-entry directly inside the home directory.

    Markers are matched on whole segments only. A raw substring test would
    also flag ordinary project files whose names merely contain a marker,
    such as "vite.config.ts" (".config") or "settings.local.json" (".local").
    """
    if _parts(path) & PRIVATE_PROFILE_PARTS:
        return True
    # Multi-segment markers can never equal a single path part, so look for
    # them in the whole path -- framed with "/" so they only match complete
    # segments.
    framed = "/" + _normalized(path).strip("/") + "/"
    if any("/" in marker and f"/{marker}/" in framed for marker in PRIVATE_PROFILE_PARTS):
        return True
    if Path(path).name.lower() in PROFILE_HIDDEN_FILE_NAMES:
        return True
    return _is_home_hidden_path(path)
|
|
235
|
+
|
|
78
236
|
|
|
79
|
-
|
|
237
|
+
def classify_path(path: str) -> tuple[int, str, str]:
    """Return ``(depth, privacy_class, reason)`` for *path*.

    Rules are checked in order and the first match wins:
      1. sensitive path           -> (1, "sensitive_inventory_only", "sensitive_path")
      2. private profile path     -> (0, "private_profile_blocked", "private_profile_path")
      3. contains a SYSTEM_PARTS  -> (0, "system_blocked", "system_path")
      4. noisy / transient / under a hidden directory
                                  -> (1, "inventory_only", "noisy_tree")
      5. anything else            -> (2, "normal", "default")
    """
    if is_sensitive_path(path):
        return 1, "sensitive_inventory_only", "sensitive_path"
    if is_private_profile_path(path):
        return 0, "private_profile_blocked", "private_profile_path"

    lowered = _normalized(path)
    for marker in SYSTEM_PARTS:
        if marker in lowered:
            return 0, "system_blocked", "system_path"

    noisy = (
        bool(_parts(path) & NOISY_PARTS)
        or _has_transient_project_part(path)
        or _has_hidden_dir_part(path)
    )
    if noisy:
        return 1, "inventory_only", "noisy_tree"
    return 2, "normal", "default"
|
|
86
251
|
|
|
87
252
|
|
|
88
253
|
def should_skip_tree(path: str) -> bool:
    """Return True when the directory at *path* must not be indexed.

    A tree is skipped when its path contains a ``SYSTEM_PARTS`` marker, is
    sensitive, is a private profile path, has a ``NOISY_PARTS`` segment, has a
    transient segment, or lies beneath a hidden directory.
    """
    lowered = _normalized(path)
    for marker in SYSTEM_PARTS:
        if marker in lowered:
            return True
    if is_sensitive_path(path):
        return True
    if is_private_profile_path(path):
        return True
    if _parts(path) & NOISY_PARTS:
        return True
    return bool(_has_transient_project_part(path) or _has_hidden_dir_part(path))
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def should_skip_file(path: str) -> bool:
    """Return True when the file at *path* must not be indexed.

    The checks are: a ``SYSTEM_PARTS`` marker anywhere in the path, a
    sensitive path, a private profile path, a ``NOISY_PARTS`` segment, a
    transient segment, or a hidden parent directory.
    """
    normalized_path = _normalized(path)
    in_system_area = any(marker in normalized_path for marker in SYSTEM_PARTS)
    if in_system_area or is_sensitive_path(path) or is_private_profile_path(path):
        return True
    segments = _parts(path)
    return bool(
        segments & NOISY_PARTS
        or _has_transient_project_part(path)
        or _has_hidden_dir_part(path)
    )
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def is_queryable_path(path: str, privacy_class: str = "") -> bool:
    """Return True when content stored for *path* may be returned by queries.

    A non-empty *privacy_class* other than "normal" blocks the path outright;
    otherwise the path is queryable unless ``should_skip_file`` rejects it.
    """
    class_allows = not privacy_class or privacy_class == "normal"
    return class_allows and not should_skip_file(path)
|
|
95
277
|
|
|
96
278
|
|
|
97
279
|
def should_extract(path: str, depth: int) -> bool:
|
|
98
280
|
if depth < 2:
|
|
99
281
|
return False
|
|
282
|
+
if should_skip_file(path):
|
|
283
|
+
return False
|
|
100
284
|
suffix = Path(path).suffix.lower()
|
|
101
285
|
if suffix in {
|
|
102
286
|
".txt",
|