nexo-brain 7.20.2 → 7.20.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.20.2",
3
+ "version": "7.20.4",
4
4
  "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
5
5
  "author": {
6
6
  "name": "NEXO Brain",
package/README.md CHANGED
@@ -18,7 +18,11 @@
18
18
 
19
19
  [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
20
20
 
21
- Version `7.20.2` is the current packaged-runtime line. Patch release over v7.20.1 — Local Context now requeues stalled work, reports real macOS/Windows background-service health, records scan errors and preserves Windows drive roots.
21
+ Version `7.20.4` is the current packaged-runtime line. Patch release over v7.20.3 — Local Context now blocks private dotfiles, hidden project folders and secret-bearing content before chunks, embeddings, graph relations or agent context are created.
22
+
23
+ Previously in `7.20.3`: patch release over v7.20.2 — installer DMG volumes are no longer added as local-memory roots, removed roots purge stale payloads, and doctor can repair removed-root residue.
24
+
25
+ Previously in `7.20.2`: patch release over v7.20.1 — Local Context now requeues stalled work, reports real macOS/Windows background-service health, records scan errors and preserves Windows drive roots.
22
26
 
23
27
  Previously in `7.20.1`: patch release over v7.20.0 — the Local Context service now recovers from orphaned locks and mixed-version cycle failures instead of leaving the background index stuck.
24
28
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.20.2",
3
+ "version": "7.20.4",
4
4
  "mcpName": "io.github.wazionapps/nexo",
5
5
  "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
6
6
  "homepage": "https://nexo-brain.com",
@@ -3833,6 +3833,73 @@ def check_automation_caller_coverage(days: int = 7) -> DoctorCheck:
3833
3833
  )
3834
3834
 
3835
3835
 
3836
def check_local_index_hygiene(fix: bool = False) -> DoctorCheck:
    """Check the local memory index for stale-root and privacy residue.

    Calls ``local_context.api.local_index_hygiene(fix=fix)`` and maps its
    report onto a single runtime-tier ``DoctorCheck``:

    * no residue, no privacy residue and no suspect roots -> healthy, "clean";
    * something was found and ``fix`` is true -> healthy, "repaired", with
      ``fixed`` set when any cleanup counter is non-zero or a suspect root
      was reported;
    * something was found and ``fix`` is false -> degraded, with a repair plan;
    * any exception while importing or evaluating the report -> degraded,
      with the exception text as the only evidence.
    """
    check_id = "runtime.local_index_hygiene"
    root_keys = ("assets", "jobs", "errors", "dirs", "checkpoints")
    privacy_residue_keys = ("assets", "dirs", "content_secret_assets")
    privacy_cleanup_keys = (
        "assets", "jobs", "errors", "chunks", "embeddings",
        "entities", "relations", "versions", "dirs", "content_secret_assets",
    )

    def _tally(counts, keys) -> int:
        # Missing, None and zero counters all contribute 0.
        total = 0
        for key in keys:
            total += int(counts.get(key, 0) or 0)
        return total

    try:
        from local_context import api as local_context_api

        report = local_context_api.local_index_hygiene(fix=fix)
        residue = report.get("residue") or {}
        cleanup = report.get("cleanup") or {}
        privacy = report.get("privacy") or {}
        privacy_residue = privacy.get("residue") or {}
        privacy_cleanup = privacy.get("cleanup") or {}
        suspect_roots = [str(path) for path in report.get("removed_roots") or []]

        residue_total = _tally(residue, root_keys)
        cleanup_total = _tally(cleanup, root_keys)
        privacy_residue_total = _tally(privacy_residue, privacy_residue_keys)
        privacy_cleanup_total = _tally(privacy_cleanup, privacy_cleanup_keys)

        evidence = [
            "suspect_installer_roots=" + str(len(suspect_roots)),
            "residue=" + json.dumps(residue, sort_keys=True),
            "cleanup=" + json.dumps(cleanup, sort_keys=True),
            "privacy_residue=" + json.dumps(privacy_residue, sort_keys=True),
            "privacy_cleanup=" + json.dumps(privacy_cleanup, sort_keys=True),
        ]
        # Only the first five suspect roots are listed to keep evidence short.
        for path in suspect_roots[:5]:
            evidence.append(f"root={path}")

        found_something = bool(residue_total or privacy_residue_total or suspect_roots)
        if not found_something:
            return DoctorCheck(
                id=check_id,
                tier="runtime",
                status="healthy",
                severity="info",
                summary="Local memory index hygiene is clean",
                evidence=evidence,
                repair_plan=[],
            )
        if not fix:
            return DoctorCheck(
                id=check_id,
                tier="runtime",
                status="degraded",
                severity="warn",
                summary="Local memory index has stale or private residue",
                evidence=evidence,
                repair_plan=["Run `nexo doctor --tier runtime --fix` to purge stale local memory roots and private local-memory residue"],
                escalation_prompt="Local memory may contain stale or private index payloads that should be purged before indexing continues.",
            )
        return DoctorCheck(
            id=check_id,
            tier="runtime",
            status="healthy",
            severity="info",
            summary="Local memory index hygiene repaired",
            evidence=evidence,
            repair_plan=[],
            fixed=cleanup_total > 0 or privacy_cleanup_total > 0 or bool(suspect_roots),
        )
    except Exception as exc:
        return DoctorCheck(
            id=check_id,
            tier="runtime",
            status="degraded",
            severity="warn",
            summary="Local memory index hygiene could not be checked",
            evidence=[str(exc)],
            repair_plan=["Inspect local_context.api.local_index_hygiene and runtime DB tables"],
            escalation_prompt="Support cannot verify local memory index residue.",
        )
3901
+
3902
+
3836
3903
  def run_runtime_checks(fix: bool = False) -> list[DoctorCheck]:
3837
3904
  """Run all runtime-tier checks. Read-only by default."""
3838
3905
  return [
@@ -3854,6 +3921,7 @@ def run_runtime_checks(fix: bool = False) -> list[DoctorCheck]:
3854
3921
  safe_check(check_automation_telemetry),
3855
3922
  safe_check(check_automation_caller_coverage),
3856
3923
  safe_check(check_state_watchers),
3924
+ safe_check(check_local_index_hygiene, fix=fix),
3857
3925
  safe_check(check_release_artifact_sync),
3858
3926
  safe_check(check_release_trace_hygiene),
3859
3927
  safe_check(check_launchagent_inventory),
@@ -16,6 +16,7 @@ from .api import (
16
16
  get_neighbors,
17
17
  list_exclusions,
18
18
  list_roots,
19
+ local_index_hygiene,
19
20
  model_status,
20
21
  pause,
21
22
  purge_asset,
@@ -39,6 +40,7 @@ __all__ = [
39
40
  "get_neighbors",
40
41
  "list_exclusions",
41
42
  "list_roots",
43
+ "local_index_hygiene",
42
44
  "model_status",
43
45
  "pause",
44
46
  "purge_asset",
@@ -14,9 +14,9 @@ from db import get_db, init_db
14
14
  from db._schema import run_migrations
15
15
 
16
16
  from . import embeddings
17
- from .extractors import chunk_text, entities, extract_text, summarize
17
+ from .extractors import chunk_text, contains_secret, entities, extract_text, summarize
18
18
  from .logging import log_event, tail
19
- from .privacy import classify_path, should_extract, should_skip_tree
19
+ from .privacy import classify_path, is_queryable_path, should_extract, should_skip_file, should_skip_tree
20
20
  from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
21
21
 
22
22
  LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
@@ -41,6 +41,9 @@ def _conn():
41
41
  def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> dict:
42
42
  conn = _conn()
43
43
  root_path = norm_path(path)
44
+ if should_skip_tree(root_path):
45
+ log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
46
+ return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
44
47
  depth_value = 2 if depth is None else int(depth)
45
48
  conn.execute(
46
49
  """
@@ -64,9 +67,10 @@ def remove_root(path: str) -> dict:
64
67
  conn = _conn()
65
68
  root_path = norm_path(path)
66
69
  conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE root_path=?", (now(), root_path))
70
+ cleanup = _purge_removed_root_payloads(conn, root_paths=[root_path])
67
71
  conn.commit()
68
- log_event("info", "root_removed", "Root removed", path=redact_path(root_path))
69
- return {"ok": True, "root_path": root_path}
72
+ log_event("info", "root_removed", "Root removed", path=redact_path(root_path), cleanup=cleanup)
73
+ return {"ok": True, "root_path": root_path, "cleanup": cleanup}
70
74
 
71
75
 
72
76
  def list_roots() -> list[dict]:
@@ -108,6 +112,8 @@ def _mounted_volume_roots() -> list[str]:
108
112
  try:
109
113
  if candidate.name.startswith(".") or not candidate.is_dir():
110
114
  continue
115
+ if _should_skip_mounted_root(candidate):
116
+ continue
111
117
  resolved = candidate.resolve()
112
118
  if resolved == root_resolved:
113
119
  continue
@@ -137,6 +143,250 @@ def ensure_default_roots() -> dict:
137
143
  return {"ok": True, "created": len(created), "roots": list_roots()}
138
144
 
139
145
 
146
def _should_skip_mounted_root(candidate: Path) -> bool:
    """Return True when *candidate* looks like a NEXO Desktop installer volume.

    A volume is skipped when its own name is "nexo desktop" (case-insensitive,
    optionally followed by a space and a suffix such as "beta"), or when it
    contains a ``.app`` entry whose name starts with "nexo desktop" together
    with at least one of ``.background``, ``Applications`` or ``.DS_Store``.
    If the directory cannot be listed it is treated as containing no bundles.
    """
    volume_name = candidate.name.strip().lower()
    if volume_name == "nexo desktop" or volume_name.startswith("nexo desktop "):
        return True

    try:
        bundle_names = [
            child.name.lower()
            for child in candidate.iterdir()
            if child.suffix.lower() == ".app"
        ]
    except Exception:
        bundle_names = []

    has_nexo_bundle = False
    for bundle in bundle_names:
        if bundle.startswith("nexo desktop"):
            has_nexo_bundle = True
            break
    if not has_nexo_bundle:
        return False

    for marker_name in (".background", "Applications", ".DS_Store"):
        if (candidate / marker_name).exists():
            return True
    return False
163
+
164
+
165
# SQL fragment matching a path itself or anything below it. The second
# parameter must come from _like_subtree_pattern() so that "%" and "_" in the
# path are matched literally instead of acting as LIKE wildcards.
_PATH_SUBTREE_SQL = "(path = ? OR path LIKE ? ESCAPE '\\')"


def _like_subtree_pattern(path: str) -> str:
    """Return a LIKE pattern (for ``ESCAPE '\\'``) matching everything under *path*."""
    escaped = path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return f"{escaped}/%"


def _empty_removed_root_counts() -> dict:
    """Return a fresh all-zero counter dict for removed-root payloads."""
    return {"assets": 0, "jobs": 0, "errors": 0, "dirs": 0, "checkpoints": 0}


def _removed_root_filters(conn, *, root_paths: list[str] | None = None) -> tuple[list[int], list[str]]:
    """Return ``(root_ids, root_paths)`` of roots whose status is 'removed'.

    When *root_paths* is non-empty only those paths are considered; otherwise
    every removed root is returned.
    """
    if root_paths:
        placeholders = ",".join("?" for _ in root_paths)
        rows = conn.execute(
            f"SELECT id, root_path FROM local_index_roots WHERE root_path IN ({placeholders}) AND status='removed'",
            tuple(root_paths),
        ).fetchall()
    else:
        rows = conn.execute("SELECT id, root_path FROM local_index_roots WHERE status='removed'").fetchall()
    return [int(row["id"]) for row in rows], [str(row["root_path"]) for row in rows]


def _removed_root_payload_counts(conn, *, root_paths: list[str] | None = None) -> dict:
    """Count index rows that still belong to removed roots.

    Returns a dict with ``assets``, ``jobs``, ``errors``, ``dirs`` and
    ``checkpoints`` counters. Errors are counted when they reference a
    matching asset, or when they carry no asset id and their path lies at or
    below a removed root.
    """
    root_ids, removed_paths = _removed_root_filters(conn, root_paths=root_paths)
    if not root_ids and not removed_paths:
        return _empty_removed_root_counts()
    asset_filter, params = _removed_root_asset_filter(root_ids, removed_paths)
    if not asset_filter:
        return _empty_removed_root_counts()
    asset_subquery = f"SELECT asset_id FROM local_assets WHERE {asset_filter}"
    assets = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_assets WHERE {asset_filter}", tuple(params)).fetchone()["total"] or 0)
    jobs = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_index_jobs WHERE asset_id IN ({asset_subquery})", tuple(params)).fetchone()["total"] or 0)
    errors = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_index_errors WHERE asset_id IN ({asset_subquery})", tuple(params)).fetchone()["total"] or 0)
    for path in removed_paths:
        # Asset-less errors are matched by path; wildcards in the path are escaped.
        errors += int(
            conn.execute(
                f"SELECT COUNT(*) AS total FROM local_index_errors WHERE asset_id='' AND {_PATH_SUBTREE_SQL}",
                (path, _like_subtree_pattern(path)),
            ).fetchone()["total"]
            or 0
        )
    dirs = 0
    checkpoints = 0
    if root_ids:
        root_placeholders = ",".join("?" for _ in root_ids)
        dirs = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_index_dirs WHERE root_id IN ({root_placeholders})", tuple(root_ids)).fetchone()["total"] or 0)
        checkpoints = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_index_checkpoints WHERE root_id IN ({root_placeholders})", tuple(root_ids)).fetchone()["total"] or 0)
    return {"assets": assets, "jobs": jobs, "errors": errors, "dirs": dirs, "checkpoints": checkpoints}


def _removed_root_asset_filter(root_ids: list[int], removed_paths: list[str]) -> tuple[str, list[Any]]:
    """Build a ``local_assets`` WHERE fragment selecting assets of removed roots.

    Returns ``(sql, params)``; ``sql`` is empty when both inputs are empty.
    Assets match by ``root_id`` or by living at/below one of *removed_paths*.
    Path prefixes are LIKE-escaped so a root such as ``/vol/my_disk`` cannot
    also select ``/vol/myXdisk/...`` through the ``_`` wildcard.
    """
    filters: list[str] = []
    params: list[Any] = []
    if root_ids:
        root_placeholders = ",".join("?" for _ in root_ids)
        filters.append(f"root_id IN ({root_placeholders})")
        params.extend(root_ids)
    for path in removed_paths:
        filters.append(_PATH_SUBTREE_SQL)
        params.extend([path, _like_subtree_pattern(path)])
    return " OR ".join(filters), params


def _purge_removed_root_payloads(conn, *, root_paths: list[str] | None = None) -> dict:
    """Delete every index row that belongs to removed roots.

    Returns the counters computed by ``_removed_root_payload_counts`` just
    before deletion. The caller is responsible for committing.
    """
    root_ids, removed_paths = _removed_root_filters(conn, root_paths=root_paths)
    if not root_ids and not removed_paths:
        return _empty_removed_root_counts()

    asset_filter, params = _removed_root_asset_filter(root_ids, removed_paths)
    if not asset_filter:
        return _empty_removed_root_counts()
    asset_subquery = f"SELECT asset_id FROM local_assets WHERE {asset_filter}"
    counts = _removed_root_payload_counts(conn, root_paths=root_paths)

    # Dependent rows go first; local_assets is deleted last because every
    # subquery above it still needs the asset rows to resolve asset ids.
    for table in ("local_embeddings", "local_chunks", "local_entities", "local_asset_versions"):
        conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({asset_subquery})", tuple(params))
    for column in ("source_asset_id", "target_asset_id", "target_ref"):
        conn.execute(f"DELETE FROM local_relations WHERE {column} IN ({asset_subquery})", tuple(params))
    conn.execute(f"DELETE FROM local_index_jobs WHERE asset_id IN ({asset_subquery})", tuple(params))
    conn.execute(f"DELETE FROM local_index_errors WHERE asset_id IN ({asset_subquery})", tuple(params))

    for path in removed_paths:
        conn.execute(
            f"DELETE FROM local_index_errors WHERE {_PATH_SUBTREE_SQL}",
            (path, _like_subtree_pattern(path)),
        )

    if root_ids:
        root_placeholders = ",".join("?" for _ in root_ids)
        conn.execute(f"DELETE FROM local_index_dirs WHERE root_id IN ({root_placeholders})", tuple(root_ids))
        conn.execute(f"DELETE FROM local_index_checkpoints WHERE root_id IN ({root_placeholders})", tuple(root_ids))
    conn.execute(f"DELETE FROM local_assets WHERE {asset_filter}", tuple(params))
    return counts
240
+
241
+
242
def _purge_asset_ids(conn, asset_ids: list[str]) -> dict:
    """Hard-delete the given assets and every row that references them.

    Duplicate and empty ids are ignored. Returns a counter dict: ``assets``
    is the number of distinct ids requested, the other keys are the row
    counts reported by each DELETE. Nothing is committed here.
    """
    targets = [asset_id for asset_id in dict.fromkeys(asset_ids) if asset_id]
    counts = {
        "assets": len(targets),
        "jobs": 0,
        "errors": 0,
        "chunks": 0,
        "embeddings": 0,
        "entities": 0,
        "relations": 0,
        "versions": 0,
    }
    if not targets:
        return counts

    dependent_tables = (
        ("embeddings", "local_embeddings"),
        ("chunks", "local_chunks"),
        ("entities", "local_entities"),
        ("versions", "local_asset_versions"),
        ("jobs", "local_index_jobs"),
        ("errors", "local_index_errors"),
    )
    relation_columns = ("source_asset_id", "target_asset_id", "target_ref")

    # Work in slices of 500 ids so each statement stays within the
    # bound-parameter budget.
    offset = 0
    while offset < len(targets):
        ids = tuple(targets[offset:offset + 500])
        marks = ",".join("?" for _ in ids)
        for counter, table in dependent_tables:
            cursor = conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({marks})", ids)
            counts[counter] += int(cursor.rowcount or 0)
        for column in relation_columns:
            cursor = conn.execute(f"DELETE FROM local_relations WHERE {column} IN ({marks})", ids)
            counts["relations"] += int(cursor.rowcount or 0)
        conn.execute(f"DELETE FROM local_assets WHERE asset_id IN ({marks})", ids)
        offset += 500
    return counts
264
+
265
+
266
def _privacy_unsafe_asset_ids(conn) -> list[str]:
    """Return ids of indexed assets that the privacy rules no longer allow.

    An asset qualifies when ``should_skip_file`` rejects its path or when its
    stored ``privacy_class`` is one of the blocked / inventory-only classes.
    """
    blocked_classes = {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}
    rows = conn.execute("SELECT asset_id, path, privacy_class FROM local_assets").fetchall()
    return [
        str(row["asset_id"])
        for row in rows
        if should_skip_file(str(row["path"] or ""))
        or str(row["privacy_class"] or "") in blocked_classes
    ]
274
+
275
+
276
def _privacy_unsafe_dir_ids(conn) -> list[str]:
    """Return ids of tracked directories that ``should_skip_tree`` rejects."""
    unsafe: list[str] = []
    for row in conn.execute("SELECT dir_id, path FROM local_index_dirs").fetchall():
        dir_path = str(row["path"] or "")
        if should_skip_tree(dir_path):
            unsafe.append(str(row["dir_id"]))
    return unsafe
279
+
280
+
281
def _content_secret_asset_ids(conn) -> list[str]:
    """Return sorted ids of active, 'normal'-class assets with a secret-bearing chunk.

    Every stored chunk of such assets is run through ``contains_secret``;
    once one chunk of an asset matches, its remaining chunks are not scanned.
    """
    chunk_rows = conn.execute(
        """
        SELECT c.asset_id, c.text
        FROM local_chunks c
        JOIN local_assets a ON a.asset_id=c.asset_id
        WHERE a.status='active'
        AND COALESCE(a.privacy_class, 'normal')='normal'
        ORDER BY c.asset_id, c.chunk_index
        """
    ).fetchall()
    flagged: set[str] = set()
    for chunk in chunk_rows:
        owner = str(chunk["asset_id"])
        # Short-circuit keeps already-flagged assets from being rescanned.
        if owner not in flagged and contains_secret(str(chunk["text"] or "")):
            flagged.add(owner)
    return sorted(flagged)
300
+
301
+
302
def _mark_content_secret_assets(conn, asset_ids: list[str]) -> int:
    """Strip derived content from secret-bearing assets and keep them as inventory only.

    For each distinct, non-empty id this deletes embeddings, chunks, entities
    and relations, closes its jobs with ``content_secret_blocked``, blanks the
    version summary/metadata, and reclassifies the asset as
    ``content_secret_inventory_only``. Returns the number of distinct ids
    processed. Nothing is committed here.
    """
    targets = [asset_id for asset_id in dict.fromkeys(asset_ids) if asset_id]
    if not targets:
        return 0

    for offset in range(0, len(targets), 500):
        ids = tuple(targets[offset:offset + 500])
        marks = ",".join("?" for _ in ids)

        # Derived payloads that could carry the secret text.
        for table in ("local_embeddings", "local_chunks", "local_entities"):
            conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({marks})", ids)
        for column in ("source_asset_id", "target_asset_id", "target_ref"):
            conn.execute(f"DELETE FROM local_relations WHERE {column} IN ({marks})", ids)

        conn.execute(
            f"""
            UPDATE local_index_jobs
            SET status='done', last_error_code='content_secret_blocked', updated_at=?
            WHERE asset_id IN ({marks})
            """,
            (now(),) + ids,
        )
        conn.execute(
            f"""
            UPDATE local_asset_versions
            SET summary='', metadata_json=?
            WHERE asset_id IN ({marks})
            """,
            (json_dumps({"content_blocked": "secret_pattern"}),) + ids,
        )
        conn.execute(
            f"""
            UPDATE local_assets
            SET privacy_class='content_secret_inventory_only',
            depth=1,
            depth_reason='content_secret',
            phase='privacy_blocked',
            updated_at=?
            WHERE asset_id IN ({marks})
            """,
            (now(),) + ids,
        )
    return len(targets)
343
+
344
+
345
def local_index_privacy_hygiene(*, fix: bool = False) -> dict:
    """Report privacy residue in the local index and, with ``fix=True``, remove it.

    ``residue`` counts what was found before any change: privacy-unsafe
    assets, privacy-unsafe directories and assets with secret-bearing chunks.
    ``cleanup`` stays all-zero unless ``fix`` is true, in which case unsafe
    assets are purged, unsafe directory rows deleted, secret-bearing assets
    downgraded to inventory-only, and the transaction committed.
    """
    conn = _conn()
    unsafe_assets = _privacy_unsafe_asset_ids(conn)
    unsafe_dirs = _privacy_unsafe_dir_ids(conn)
    secret_assets = _content_secret_asset_ids(conn)

    residue = {
        "assets": len(unsafe_assets),
        "dirs": len(unsafe_dirs),
        "content_secret_assets": len(secret_assets),
    }
    cleanup = {
        "assets": 0,
        "jobs": 0,
        "errors": 0,
        "chunks": 0,
        "embeddings": 0,
        "entities": 0,
        "relations": 0,
        "versions": 0,
        "dirs": 0,
        "content_secret_assets": 0,
    }
    if not fix:
        return {"ok": True, "fix": fix, "residue": residue, "cleanup": cleanup}

    cleanup.update(_purge_asset_ids(conn, unsafe_assets))
    # An empty directory list simply yields no slices.
    for offset in range(0, len(unsafe_dirs), 500):
        ids = tuple(unsafe_dirs[offset:offset + 500])
        marks = ",".join("?" for _ in ids)
        cursor = conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({marks})", ids)
        cleanup["dirs"] += int(cursor.rowcount or 0)
    cleanup["content_secret_assets"] = _mark_content_secret_assets(conn, secret_assets)
    conn.commit()
    if unsafe_assets or unsafe_dirs or secret_assets:
        log_event("warn", "privacy_hygiene_repaired", "Local memory privacy hygiene repaired", cleanup=cleanup)
    return {"ok": True, "fix": fix, "residue": residue, "cleanup": cleanup}
364
+
365
+
366
def local_index_hygiene(*, fix: bool = False) -> dict:
    """Detect, and with ``fix=True`` purge, stale roots and private residue.

    Steps:

    1. Flag every not-yet-removed root that looks like an installer volume
       (``_should_skip_mounted_root``) or is rejected by the privacy rules
       (``should_skip_tree``); with ``fix`` these roots are marked 'removed'.
    2. Count the payload rows still attached to removed roots (``residue``).
    3. With ``fix``, purge those rows and commit.
    4. Run ``local_index_privacy_hygiene`` with the same ``fix`` flag.

    Returns ``ok``, ``fix``, ``removed_roots`` (paths flagged in step 1),
    ``residue``, ``cleanup`` and the nested ``privacy`` report.
    """
    conn = _conn()
    removed_paths: list[str] = []
    # Roots already marked 'removed' are skipped on purpose: reporting them
    # again on every run would keep the hygiene report dirty after a
    # successful repair. Their leftover payloads are still covered by the
    # residue count below.
    candidate_roots = conn.execute(
        "SELECT id, root_path FROM local_index_roots WHERE COALESCE(status, 'active') != 'removed'"
    ).fetchall()
    for row in candidate_roots:
        path = str(row["root_path"] or "")
        if _should_skip_mounted_root(Path(path)) or should_skip_tree(path):
            removed_paths.append(path)
            if fix:
                conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE id=?", (now(), row["id"]))
    # Measured after the status update, so with fix=True it includes the
    # payloads of the roots flagged just above.
    before = _removed_root_payload_counts(conn)
    cleanup = {"assets": 0, "jobs": 0, "errors": 0, "dirs": 0, "checkpoints": 0}
    if fix:
        cleanup = _purge_removed_root_payloads(conn)
        conn.commit()
    privacy = local_index_privacy_hygiene(fix=fix)
    if fix and (removed_paths or any(int(cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))):
        log_event("info", "index_hygiene_repaired", "Local memory index hygiene repaired", roots=[redact_path(path) for path in removed_paths], cleanup=cleanup)
    return {"ok": True, "fix": fix, "removed_roots": removed_paths, "residue": before, "cleanup": cleanup, "privacy": privacy}
384
+
385
+
386
def repair_index_hygiene() -> dict:
    """Run the local-index hygiene pass in repair mode and return its report."""
    report = local_index_hygiene(fix=True)
    return report
388
+
389
+
140
390
  def add_exclusion(path: str, *, reason: str = "user") -> dict:
141
391
  conn = _conn()
142
392
  excluded_path = norm_path(path)
@@ -303,6 +553,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
303
553
  raw_path = str(path)
304
554
  normalized = norm_path(raw_path)
305
555
  asset_id = stable_id("asset", normalized)
556
+ if should_skip_file(normalized):
557
+ return asset_id, False, "skipped"
306
558
  perm = _permission_state(path)
307
559
  depth, privacy_class, depth_reason = classify_path(normalized)
308
560
  depth = min(depth, root_depth)
@@ -398,7 +650,7 @@ def _mark_asset_deleted(conn, asset_id: str, deleted_at: float | None = None) ->
398
650
  """
399
651
  UPDATE local_index_jobs
400
652
  SET status='done', last_error_code='asset_deleted', updated_at=?
401
- WHERE asset_id=? AND status IN ('pending', 'running')
653
+ WHERE asset_id=? AND status IN ('pending', 'running', 'failed')
402
654
  """,
403
655
  (deleted_at, asset_id),
404
656
  )
@@ -425,6 +677,20 @@ def _mark_dir_subtree_deleted(conn, dir_path: str, deleted_at: float | None = No
425
677
  return len(rows)
426
678
 
427
679
 
680
def _purge_dir_subtree(conn, dir_path: str) -> int:
    """Hard-delete everything indexed at or below *dir_path*.

    Purges the matching assets through ``_purge_asset_ids``, then removes the
    directory and error rows for the same subtree. Returns the number of
    asset rows that matched. Nothing is committed here.
    """
    root = norm_path(dir_path)
    subtree_params = (root, _path_prefix(root) + "%")

    matched = conn.execute(
        "SELECT asset_id FROM local_assets WHERE path=? OR path LIKE ?",
        subtree_params,
    ).fetchall()
    doomed = []
    for row in matched:
        doomed.append(str(row["asset_id"]))
    _purge_asset_ids(conn, doomed)

    for table in ("local_index_dirs", "local_index_errors"):
        conn.execute(f"DELETE FROM {table} WHERE path=? OR path LIKE ?", subtree_params)
    return len(doomed)
692
+
693
+
428
694
  def _record_index_error(
429
695
  conn,
430
696
  *,
@@ -530,6 +796,8 @@ def _iter_files(
530
796
  continue
531
797
  if entry.is_file():
532
798
  normalized = norm_path(entry)
799
+ if should_skip_file(normalized):
800
+ continue
533
801
  if start_after_norm and normalized <= start_after_norm:
534
802
  continue
535
803
  yield entry
@@ -608,7 +876,11 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
608
876
  path = str(row["path"])
609
877
  root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
610
878
  if _is_excluded(path, exclusions):
611
- _mark_asset_deleted(conn, row["asset_id"], seen_at)
879
+ _purge_asset_ids(conn, [row["asset_id"]])
880
+ stats["excluded"] += 1
881
+ continue
882
+ if should_skip_file(path):
883
+ _purge_asset_ids(conn, [row["asset_id"]])
612
884
  stats["excluded"] += 1
613
885
  continue
614
886
  if root_path is not None and not root_path.exists():
@@ -715,6 +987,8 @@ def _scan_known_directory(
715
987
  stack.append(entry)
716
988
  continue
717
989
  if entry.is_file():
990
+ if should_skip_file(str(entry)):
991
+ continue
718
992
  seen_files.add(norm_path(entry))
719
993
  if stats["files_scanned"] >= file_limit:
720
994
  continue
@@ -722,7 +996,7 @@ def _scan_known_directory(
722
996
  stats["files_scanned"] += 1
723
997
  if changed:
724
998
  stats["files_changed"] += 1
725
- if state != "ok":
999
+ if state not in {"ok", "skipped"}:
726
1000
  stats["errors"] += 1
727
1001
  except Exception as exc:
728
1002
  _record_scan_error(conn, stats, str(entry), "live_reconcile", exc)
@@ -766,6 +1040,10 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
766
1040
  stats["files_deleted"] += _mark_dir_subtree_deleted(conn, str(dir_path), seen_at)
767
1041
  stats["excluded_dirs"] += 1
768
1042
  continue
1043
+ if should_skip_tree(str(dir_path)):
1044
+ stats["files_deleted"] += _purge_dir_subtree(conn, str(dir_path))
1045
+ stats["excluded_dirs"] += 1
1046
+ continue
769
1047
  if root_path is not None and not root_path.exists():
770
1048
  stats["offline"] += 1
771
1049
  continue
@@ -845,6 +1123,12 @@ def scan_once(*, limit: int | None = None) -> dict:
845
1123
  for root in roots:
846
1124
  root_path = Path(root["root_path"]).expanduser()
847
1125
  root_id = int(root["id"])
1126
+ if should_skip_tree(str(root_path)):
1127
+ conn.execute(
1128
+ "UPDATE local_index_roots SET status='removed', last_scan_at=?, updated_at=? WHERE id=?",
1129
+ (now(), now(), root_id),
1130
+ )
1131
+ continue
848
1132
  if not root_path.exists():
849
1133
  conn.execute(
850
1134
  "UPDATE local_index_roots SET status='offline', last_scan_at=?, updated_at=? WHERE id=?",
@@ -876,7 +1160,7 @@ def scan_once(*, limit: int | None = None) -> dict:
876
1160
  seen_for_root += 1
877
1161
  if changed:
878
1162
  totals["changed"] += 1
879
- if state != "ok":
1163
+ if state not in {"ok", "skipped"}:
880
1164
  totals["errors"] += 1
881
1165
  partial_root = bool(limit and seen_for_root >= limit)
882
1166
  totals["partial"] = bool(totals["partial"] or partial_root)
@@ -1000,7 +1284,7 @@ def process_jobs(*, limit: int = 100) -> dict:
1000
1284
  recovered = _requeue_due_jobs(conn)
1001
1285
  rows = conn.execute(
1002
1286
  """
1003
- SELECT j.*, a.path, a.depth, a.status AS asset_status
1287
+ SELECT j.*, a.path, a.depth, a.privacy_class, a.status AS asset_status
1004
1288
  FROM local_index_jobs j
1005
1289
  JOIN local_assets a ON a.asset_id = j.asset_id
1006
1290
  WHERE j.status='pending'
@@ -1022,9 +1306,24 @@ def process_jobs(*, limit: int = 100) -> dict:
1022
1306
  try:
1023
1307
  if row["asset_status"] != "active":
1024
1308
  raise FileNotFoundError(row["path"])
1309
+ if str(row["privacy_class"] or "normal") != "normal":
1310
+ conn.execute(
1311
+ "UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='privacy_blocked' WHERE job_id=?",
1312
+ (now(), job_id),
1313
+ )
1314
+ processed += 1
1315
+ continue
1025
1316
  if job_type == "light_extraction":
1026
1317
  text, metadata = extract_text(Path(row["path"]))
1027
1318
  version_id = _latest_version_id(conn, asset_id)
1319
+ if contains_secret(text):
1320
+ _mark_content_secret_assets(conn, [asset_id])
1321
+ conn.execute(
1322
+ "UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='content_secret_blocked' WHERE job_id=?",
1323
+ (now(), job_id),
1324
+ )
1325
+ processed += 1
1326
+ continue
1028
1327
  summary = summarize(text)
1029
1328
  conn.execute(
1030
1329
  "UPDATE local_asset_versions SET summary=?, metadata_json=? WHERE version_id=?",
@@ -1081,6 +1380,9 @@ def run_once(
1081
1380
  live_dir_limit: int = DEFAULT_LIVE_DIR_LIMIT,
1082
1381
  live_file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
1083
1382
  ) -> dict:
1383
+ if _get_state("privacy_hygiene_v2", "0") != "1":
1384
+ local_index_privacy_hygiene(fix=True)
1385
+ _set_state("privacy_hygiene_v2", "1")
1084
1386
  if root:
1085
1387
  add_root(root)
1086
1388
  elif (
@@ -1102,9 +1404,19 @@ def run_once(
1102
1404
  def _problem_rows(conn) -> list[dict]:
1103
1405
  rows = conn.execute(
1104
1406
  """
1105
- SELECT path, phase, error_code, user_message, technical_detail, retryable, created_at
1106
- FROM local_index_errors
1107
- ORDER BY id DESC
1407
+ SELECT e.path, e.phase, e.error_code, e.user_message, e.technical_detail, e.retryable, e.created_at
1408
+ FROM local_index_errors e
1409
+ LEFT JOIN local_assets a ON a.asset_id=e.asset_id
1410
+ LEFT JOIN local_index_roots r ON r.id=a.root_id
1411
+ WHERE COALESCE(r.status, 'active') != 'removed'
1412
+ AND NOT EXISTS (
1413
+ SELECT 1
1414
+ FROM local_index_roots rr
1415
+ WHERE rr.status='removed'
1416
+ AND e.path != ''
1417
+ AND (e.path = rr.root_path OR e.path LIKE rr.root_path || '/%')
1418
+ )
1419
+ ORDER BY e.id DESC
1108
1420
  LIMIT 20
1109
1421
  """
1110
1422
  ).fetchall()
@@ -1387,9 +1699,24 @@ def status() -> dict:
1387
1699
  conn = _conn()
1388
1700
  paused = _is_paused()
1389
1701
  assets = conn.execute(
1390
- "SELECT COUNT(*) AS total, SUM(CASE WHEN status='active' THEN 1 ELSE 0 END) AS active FROM local_assets"
1702
+ """
1703
+ SELECT COUNT(*) AS total, SUM(CASE WHEN a.status='active' THEN 1 ELSE 0 END) AS active
1704
+ FROM local_assets a
1705
+ LEFT JOIN local_index_roots r ON r.id=a.root_id
1706
+ WHERE COALESCE(r.status, 'active') != 'removed'
1707
+ """
1391
1708
  ).fetchone()
1392
- job_rows = conn.execute("SELECT status, COUNT(*) AS total FROM local_index_jobs GROUP BY status").fetchall()
1709
+ job_rows = conn.execute(
1710
+ """
1711
+ SELECT j.status, COUNT(*) AS total
1712
+ FROM local_index_jobs j
1713
+ JOIN local_assets a ON a.asset_id=j.asset_id
1714
+ LEFT JOIN local_index_roots r ON r.id=a.root_id
1715
+ WHERE a.status='active'
1716
+ AND COALESCE(r.status, 'active') != 'removed'
1717
+ GROUP BY j.status
1718
+ """
1719
+ ).fetchall()
1393
1720
  job_counts = {row["status"]: int(row["total"] or 0) for row in job_rows}
1394
1721
  pending = int(job_counts.get("pending", 0) or 0)
1395
1722
  running_jobs = int(job_counts.get("running", 0) or 0)
@@ -1401,7 +1728,15 @@ def status() -> dict:
1401
1728
  roots = list_roots()
1402
1729
  volumes = []
1403
1730
  by_volume = conn.execute(
1404
- "SELECT volume_id, COUNT(*) AS files FROM local_assets WHERE status='active' GROUP BY volume_id ORDER BY volume_id"
1731
+ """
1732
+ SELECT a.volume_id, COUNT(*) AS files
1733
+ FROM local_assets a
1734
+ LEFT JOIN local_index_roots r ON r.id=a.root_id
1735
+ WHERE a.status='active'
1736
+ AND COALESCE(r.status, 'active') != 'removed'
1737
+ GROUP BY a.volume_id
1738
+ ORDER BY a.volume_id
1739
+ """
1405
1740
  ).fetchall()
1406
1741
  for row in by_volume:
1407
1742
  volumes.append({"id": row["volume_id"], "label": row["volume_id"] or "Disk", "files": row["files"], "status": "active"})
@@ -1526,17 +1861,21 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
1526
1861
  qvec = embeddings.embed_text(query)
1527
1862
  rows = conn.execute(
1528
1863
  """
1529
- SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, v.summary, e.vector_json
1864
+ SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
1530
1865
  FROM local_chunks c
1531
1866
  JOIN local_assets a ON a.asset_id = c.asset_id
1532
1867
  LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
1533
1868
  LEFT JOIN local_embeddings e ON e.chunk_id = c.chunk_id
1534
1869
  WHERE a.status='active'
1535
- LIMIT 1000
1870
+ AND a.privacy_class='normal'
1871
+ ORDER BY c.created_at DESC
1872
+ LIMIT 5000
1536
1873
  """
1537
1874
  ).fetchall()
1538
1875
  scored = []
1539
1876
  for row in rows:
1877
+ if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
1878
+ continue
1540
1879
  vector = json_loads(row["vector_json"], [])
1541
1880
  score = max(_search_text_score(query, row["text"]), embeddings.cosine(qvec, vector))
1542
1881
  if score > 0:
@@ -1550,7 +1889,6 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
1550
1889
  if row["asset_id"] not in seen_assets:
1551
1890
  assets.append({
1552
1891
  "asset_id": row["asset_id"],
1553
- "path": row["path"],
1554
1892
  "display_path": redact_path(row["path"]),
1555
1893
  "file_type": row["file_type"],
1556
1894
  "score": round(float(score), 4),
@@ -1644,13 +1982,7 @@ def get_neighbors(asset_id: str, *, limit: int = 30) -> dict:
1644
1982
 
1645
1983
  def purge_asset(asset_id: str) -> dict:
1646
1984
  conn = _conn()
1647
- for table in ("local_embeddings", "local_chunks", "local_entities"):
1648
- conn.execute(f"DELETE FROM {table} WHERE asset_id=?", (asset_id,))
1649
- conn.execute("DELETE FROM local_relations WHERE source_asset_id=?", (asset_id,))
1650
- conn.execute("DELETE FROM local_index_errors WHERE asset_id=?", (asset_id,))
1651
- conn.execute("DELETE FROM local_index_jobs WHERE asset_id=?", (asset_id,))
1652
- conn.execute("DELETE FROM local_asset_versions WHERE asset_id=?", (asset_id,))
1653
- conn.execute("DELETE FROM local_assets WHERE asset_id=?", (asset_id,))
1985
+ _purge_asset_ids(conn, [asset_id])
1654
1986
  conn.commit()
1655
1987
  log_event("info", "asset_purged", "Asset purged", asset_id=asset_id)
1656
1988
  return {"ok": True, "asset_id": asset_id}
@@ -32,6 +32,26 @@ TEXT_SUFFIXES = {
32
32
  ".css",
33
33
  }
34
34
 
35
# Compiled detectors for credential-shaped text. A single hit from any of
# them is enough for ``contains_secret`` to report a secret.
SECRET_PATTERNS: tuple[re.Pattern, ...] = (
    # "Bearer <token>" as found in HTTP Authorization headers.
    re.compile(r"\bBearer\s+[A-Za-z0-9._\-~+/]{12,}\b", re.I),
    # "sk-" / "pk-" prefixed keys, optionally with a lowercase infix (sk-xxx-...).
    re.compile(r"\bsk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
    re.compile(r"\bpk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
    # Vendor-prefixed tokens that use an underscore after the prefix.
    re.compile(r"\b(ghp|gho|ghu|ghs|ghr|github_pat|glpat|xoxb|xoxp|shpat)_[A-Za-z0-9_]{16,}\b", re.I),
    # GitLab ("glpat-") and Slack ("xoxb-", "xoxp-", ...) tokens are delimited
    # with hyphens, which the underscore-only pattern above can never match.
    re.compile(r"\b(?:glpat|xox[abeoprs])-[A-Za-z0-9_\-]{16,}\b", re.I),
    # AKIA/ASIA-prefixed upper-case key identifiers.
    re.compile(r"\b(AKIA|ASIA)[A-Z0-9]{16,}\b"),
    # Three dot-separated base64url-looking segments, the first two starting with "ey".
    re.compile(r"\bey[A-Za-z0-9_-]{10,}\.ey[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
    # PEM private-key header.
    re.compile(r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?PRIVATE KEY-----", re.I),
    # NAME_TOKEN=value / NAME_SECRET: value style assignments.
    re.compile(r"\b([A-Z][A-Z0-9_]*(?:TOKEN|SECRET|KEY|PASSWORD|PASS)\s*[:=]\s*)['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
    # api_key / secret_key / auth_token assignments.
    re.compile(r"\b(?:api[_-]?key|secret[_-]?key|auth[_-]?token)\s*[:=]\s*['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
    # Quoted password assignments.
    re.compile(r"\b(?:password|passwd|pwd)\s*[:=]\s*['\"][^'\"]{6,}['\"]", re.I),
)
47
+
48
+
49
def contains_secret(text: str) -> bool:
    """Tell whether *text* matches any entry of ``SECRET_PATTERNS``.

    Only the first ``MAX_CHARS`` characters are scanned; empty or falsy
    input is reported as containing no secret.
    """
    if not text:
        return False
    window = text[:MAX_CHARS]
    for detector in SECRET_PATTERNS:
        if detector.search(window):
            return True
    return False
54
+
35
55
 
36
56
  def _read_text(path: Path) -> str:
37
57
  data = path.read_bytes()[:MAX_TEXT_BYTES]
@@ -6,21 +6,58 @@ SENSITIVE_FILE_NAMES = {
6
6
  ".env",
7
7
  ".env.local",
8
8
  ".env.production",
9
+ ".npmrc",
10
+ ".pypirc",
11
+ ".netrc",
12
+ ".boto",
13
+ ".pgpass",
14
+ ".my.cnf",
15
+ ".git-credentials",
16
+ ".mcp_publisher_token",
17
+ ".mcpregistry_github_token",
18
+ ".mcpregistry_registry_token",
9
19
  "id_rsa",
10
20
  "id_dsa",
11
21
  "id_ecdsa",
12
22
  "id_ed25519",
23
+ "known_hosts",
24
+ "authorized_keys",
13
25
  "cookies.sqlite",
14
26
  "login data",
15
27
  "keychain-2.db",
16
28
  }
17
29
 
30
+ SENSITIVE_NAME_MARKERS = {
31
+ "api_key",
32
+ "apikey",
33
+ "auth_token",
34
+ "bearer",
35
+ "client_secret",
36
+ "credential",
37
+ "credentials",
38
+ "oauth",
39
+ "password",
40
+ "passwd",
41
+ "private_key",
42
+ "secret",
43
+ "token",
44
+ }
45
+
46
+ SENSITIVE_SUFFIXES = {
47
+ ".key",
48
+ ".pem",
49
+ ".p12",
50
+ ".pfx",
51
+ ".kdbx",
52
+ }
53
+
18
54
  SENSITIVE_PARTS = {
19
55
  ".ssh",
20
56
  ".gnupg",
21
57
  ".aws",
22
58
  ".azure",
23
59
  ".kube",
60
+ ".docker",
24
61
  "password",
25
62
  "passwords",
26
63
  "1password",
@@ -53,9 +90,64 @@ NOISY_PARTS = {
53
90
  ".parcel-cache",
54
91
  ".bun",
55
92
  ".gradle",
93
+ "$tmp",
56
94
  "target",
57
95
  }
58
96
 
97
+ TRANSIENT_PARTS = {"tmp", "temp"}
98
+
99
+ PRIVATE_PROFILE_PARTS = {
100
+ ".nexo",
101
+ ".claude",
102
+ ".codex",
103
+ ".gemini",
104
+ ".cursor",
105
+ ".config",
106
+ ".local",
107
+ ".npm",
108
+ ".yarn",
109
+ ".pnpm-store",
110
+ ".ollama",
111
+ ".docker",
112
+ ".vscode",
113
+ ".idea",
114
+ "appdata",
115
+ "application data",
116
+ "library/application support",
117
+ "library/containers",
118
+ "library/group containers",
119
+ "library/keychains",
120
+ "library/logs",
121
+ "library/mail",
122
+ "library/messages",
123
+ "library/safari",
124
+ "library/saved application state",
125
+ }
126
+
127
+ PROFILE_HIDDEN_FILE_NAMES = {
128
+ ".aider.chat.history.md",
129
+ ".aider.input.history",
130
+ ".bash_history",
131
+ ".bash_profile",
132
+ ".bashrc",
133
+ ".claude.json",
134
+ ".codex.json",
135
+ ".cursorignore",
136
+ ".ds_store",
137
+ ".gitconfig",
138
+ ".gitignore_global",
139
+ ".lesshst",
140
+ ".python_history",
141
+ ".sqlite_history",
142
+ ".viminfo",
143
+ ".wget-hsts",
144
+ ".zprofile",
145
+ ".zsh_history",
146
+ ".zshrc",
147
+ }
148
+
149
+ ALLOWED_HIDDEN_FILE_NAMES = set()
150
+
59
151
  SYSTEM_PARTS = {
60
152
  "system volume information",
61
153
  "$recycle.bin",
@@ -69,34 +161,126 @@ SYSTEM_PARTS = {
69
161
  }
70
162
 
71
163
 
72
- def classify_path(path: str) -> tuple[int, str, str]:
73
- """Return (depth, privacy_class, reason)."""
164
def _normalized(path: str) -> str:
    """Return *path* rendered through ``Path``, with forward slashes and lower-cased."""
    rendered = str(Path(path))
    forward_slashed = rendered.replace("\\", "/")
    return forward_slashed.lower()
166
+
167
+
168
def _parts(path: str) -> set[str]:
    """Return the set of non-empty components of the normalized *path*.

    Colons are treated like separators, so a drive prefix such as ``c:``
    contributes ``c`` as its own component.
    """
    flattened = _normalized(path).replace(":", "/")
    return set(filter(None, flattened.split("/")))
170
+
171
+
172
def _contains_path_marker(lowered: str, markers: set[str]) -> bool:
    """Return True when any of *markers* occurs as a substring of *lowered*."""
    for marker in markers:
        if marker in lowered:
            return True
    return False
174
+
175
+
176
def _has_transient_project_part(path: str) -> bool:
    """Return True when a ``TRANSIENT_PARTS`` name appears at split position 2 or later.

    The normalized path is split on "/" (colons count as separators) without
    dropping empty segments, so the empty slot produced by a leading "/"
    counts as a position: "/tmp/x" has "tmp" at position 1 and is not flagged.
    """
    segments = _normalized(path).replace(":", "/").split("/")
    return any(segment in TRANSIENT_PARTS for segment in segments[2:])
182
+
183
+
184
def _has_hidden_dir_part(path: str) -> bool:
    """Return True when any component before the last one is dot-prefixed.

    The final component is never inspected here, and the special names
    "." and ".." do not count as hidden.
    """
    flattened = _normalized(path).replace(":", "/")
    segments = [segment for segment in flattened.split("/") if segment]
    for directory in segments[:-1]:
        if directory in (".", ".."):
            continue
        if directory.startswith("."):
            return True
    return False
187
+
188
+
189
def _is_home_hidden_path(path: str) -> bool:
    """Return True when *path* sits under a dot-prefixed top-level entry of the home directory.

    The comparison is lexical (the filesystem is not consulted). "." and ".."
    components are collapsed first, so "~/../other/file" is not mistaken for a
    hidden entry just because its first relative part is "..", and
    "~/docs/../.ssh/key" is recognised as living under "~/.ssh".

    Returns False for paths outside the home directory and whenever the home
    directory or the path cannot be evaluated.
    """
    import os  # local import keeps this helper self-contained

    try:
        candidate = Path(os.path.normpath(Path(path).expanduser()))
        home = Path(os.path.normpath(Path.home()))
        rel = candidate.relative_to(home)
    except Exception:
        # Deliberately best-effort: an unresolvable home directory or a path
        # outside it simply means "not a home-hidden path".
        return False
    return bool(rel.parts) and rel.parts[0].startswith(".")
197
+
198
+
199
def is_sensitive_path(path: str) -> bool:
    """Return True when *path* looks like it holds credentials or scratch data.

    Checks, in order: exact sensitive file names, dot-prefixed names not in
    ``ALLOWED_HIDDEN_FILE_NAMES``, "~$" lock-style names and .tmp/.swp/.swo
    endings, sensitive suffixes, sensitive path components, sensitive
    markers inside the file name, and finally any ``SENSITIVE_PARTS`` entry
    occurring as a substring of the normalized path.
    """
    target = Path(path)
    lowered = _normalized(path)
    name = target.name.lower()
    stem = target.stem.lower()

    hidden_name = name.startswith(".") and name not in ALLOWED_HIDDEN_FILE_NAMES
    scratch_name = name.startswith("~$") or name.endswith((".tmp", ".swp", ".swo"))
    if name in SENSITIVE_FILE_NAMES or hidden_name or scratch_name:
        return True

    if target.suffix.lower() in SENSITIVE_SUFFIXES:
        return True
    if _parts(path) & SENSITIVE_PARTS:
        return True

    for marker in SENSITIVE_NAME_MARKERS:
        if marker in name or marker in stem:
            return True
    return _contains_path_marker(lowered, SENSITIVE_PARTS)
220
+
221
+
222
def is_private_profile_path(path: str) -> bool:
    """Return True when *path* belongs to a private user-profile location.

    A path qualifies when one of its components equals a
    ``PRIVATE_PROFILE_PARTS`` entry, when it contains one of the multi-segment
    markers (those with a "/" in them, e.g. "library/mail"), when its file
    name is listed in ``PROFILE_HIDDEN_FILE_NAMES``, or when
    ``_is_home_hidden_path`` reports it.
    """
    lowered = _normalized(path)
    if _parts(path) & PRIVATE_PROFILE_PARTS:
        return True
    # Single-segment markers are fully covered by the component match above.
    # Substring-matching them as well would block ordinary project files such
    # as "webpack.config.js" (".config") or "settings.local.json" (".local"),
    # so only the multi-segment markers, which a component set can never
    # match, are searched as substrings.
    if any(marker in lowered for marker in PRIVATE_PROFILE_PARTS if "/" in marker):
        return True
    if Path(path).name.lower() in PROFILE_HIDDEN_FILE_NAMES:
        return True
    return _is_home_hidden_path(path)
235
+
78
236
 
79
- if name in SENSITIVE_FILE_NAMES or parts & SENSITIVE_PARTS:
237
def classify_path(path: str) -> tuple[int, str, str]:
    """Classify *path* as ``(depth, privacy_class, reason)``.

    Rules are tried in order and the first match wins: sensitive paths,
    private profile paths, system paths, then noisy/transient/hidden trees.
    Anything left over is ``(2, "normal", "default")``.
    """
    if is_sensitive_path(path):
        return 1, "sensitive_inventory_only", "sensitive_path"
    if is_private_profile_path(path):
        return 0, "private_profile_blocked", "private_profile_path"

    lowered = _normalized(path)
    for system_marker in SYSTEM_PARTS:
        if system_marker in lowered:
            return 0, "system_blocked", "system_path"

    noisy = (
        bool(_parts(path) & NOISY_PARTS)
        or _has_transient_project_part(path)
        or _has_hidden_dir_part(path)
    )
    if noisy:
        return 1, "inventory_only", "noisy_tree"
    return 2, "normal", "default"
86
251
 
87
252
 
88
253
  def should_skip_tree(path: str) -> bool:
89
- p = Path(path)
90
- lowered = str(p).replace("\\", "/").lower()
91
- parts = {part.lower() for part in p.parts}
254
+ lowered = _normalized(path)
255
+ parts = _parts(path)
256
+ if any(item in lowered for item in SYSTEM_PARTS):
257
+ return True
258
+ if is_sensitive_path(path) or is_private_profile_path(path):
259
+ return True
260
+ return bool(parts & NOISY_PARTS or _has_transient_project_part(path) or _has_hidden_dir_part(path))
261
+
262
+
263
def should_skip_file(path: str) -> bool:
    """Return True when the file at *path* should be skipped.

    Applies the system-marker, sensitive, private-profile and
    noisy/transient/hidden-directory checks, in that order.
    """
    lowered = _normalized(path)
    in_system_area = any(marker in lowered for marker in SYSTEM_PARTS)
    if in_system_area or is_sensitive_path(path) or is_private_profile_path(path):
        return True
    if _parts(path) & NOISY_PARTS:
        return True
    if _has_transient_project_part(path):
        return True
    return bool(_has_hidden_dir_part(path))
271
+
272
+
273
def is_queryable_path(path: str, privacy_class: str = "") -> bool:
    """Return True when *path* may be returned from queries.

    A non-empty *privacy_class* other than "normal" rejects the path
    outright; otherwise the answer is the negation of ``should_skip_file``.
    """
    class_allows = (not privacy_class) or privacy_class == "normal"
    if not class_allows:
        return False
    skipped = should_skip_file(path)
    return not skipped
95
277
 
96
278
 
97
279
  def should_extract(path: str, depth: int) -> bool:
98
280
  if depth < 2:
99
281
  return False
282
+ if should_skip_file(path):
283
+ return False
100
284
  suffix = Path(path).suffix.lower()
101
285
  if suffix in {
102
286
  ".txt",