nexo-brain 7.20.2 → 7.20.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +5 -1
- package/package.json +1 -1
- package/src/doctor/providers/runtime.py +68 -0
- package/src/local_context/__init__.py +2 -0
- package/src/local_context/api.py +357 -25
- package/src/local_context/extractors.py +20 -0
- package/src/local_context/privacy.py +194 -10
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.20.2",
|
|
3
|
+
"version": "7.20.4",
|
|
4
4
|
"description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "NEXO Brain",
|
package/README.md
CHANGED
|
@@ -18,7 +18,11 @@
|
|
|
18
18
|
|
|
19
19
|
[Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
|
|
20
20
|
|
|
21
|
-
Version `7.20.
|
|
21
|
+
Version `7.20.4` is the current packaged-runtime line. Patch release over v7.20.3 — Local Context now blocks private dotfiles, hidden project folders and secret-bearing content before chunks, embeddings, graph relations or agent context are created.
|
|
22
|
+
|
|
23
|
+
Previously in `7.20.3`: patch release over v7.20.2 — installer DMG volumes are no longer added as local-memory roots, removed roots purge stale payloads, and doctor can repair removed-root residue.
|
|
24
|
+
|
|
25
|
+
Previously in `7.20.2`: patch release over v7.20.1 — Local Context now requeues stalled work, reports real macOS/Windows background-service health, records scan errors and preserves Windows drive roots.
|
|
22
26
|
|
|
23
27
|
Previously in `7.20.1`: patch release over v7.20.0 — the Local Context service now recovers from orphaned locks and mixed-version cycle failures instead of leaving the background index stuck.
|
|
24
28
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.20.2",
|
|
3
|
+
"version": "7.20.4",
|
|
4
4
|
"mcpName": "io.github.wazionapps/nexo",
|
|
5
5
|
"description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
|
|
6
6
|
"homepage": "https://nexo-brain.com",
|
|
@@ -3833,6 +3833,73 @@ def check_automation_caller_coverage(days: int = 7) -> DoctorCheck:
|
|
|
3833
3833
|
)
|
|
3834
3834
|
|
|
3835
3835
|
|
|
3836
|
+
def check_local_index_hygiene(fix: bool = False) -> DoctorCheck:
    """Report, and optionally repair, stale or private residue in the local memory index.

    Calls ``local_context.api.local_index_hygiene(fix=fix)`` and maps the
    returned dict onto a ``DoctorCheck``:

    * healthy ("clean") when no residue totals and no suspect roots are reported;
    * healthy ("repaired") when something was reported and ``fix`` is true;
    * degraded with a repair plan when something was reported and ``fix`` is false;
    * degraded ("could not be checked") when anything in the body raises.
    """
    check_id = "runtime.local_index_hygiene"
    root_keys = ("assets", "jobs", "errors", "dirs", "checkpoints")
    privacy_residue_keys = ("assets", "dirs", "content_secret_assets")
    privacy_cleanup_keys = (
        "assets", "jobs", "errors", "chunks", "embeddings", "entities",
        "relations", "versions", "dirs", "content_secret_assets",
    )
    try:
        from local_context import api as local_context_api

        def _tally(counts, keys):
            # Missing or falsy entries count as zero.
            return sum(int(counts.get(key, 0) or 0) for key in keys)

        result = local_context_api.local_index_hygiene(fix=fix)
        residue = result.get("residue") or {}
        cleanup = result.get("cleanup") or {}
        privacy = result.get("privacy") or {}
        privacy_residue = privacy.get("residue") or {}
        privacy_cleanup = privacy.get("cleanup") or {}
        suspect_roots = [str(path) for path in result.get("removed_roots") or []]

        residue_total = _tally(residue, root_keys)
        cleanup_total = _tally(cleanup, root_keys)
        privacy_residue_total = _tally(privacy_residue, privacy_residue_keys)
        privacy_cleanup_total = _tally(privacy_cleanup, privacy_cleanup_keys)

        evidence = ["suspect_installer_roots=" + str(len(suspect_roots))]
        for label, payload in (
            ("residue=", residue),
            ("cleanup=", cleanup),
            ("privacy_residue=", privacy_residue),
            ("privacy_cleanup=", privacy_cleanup),
        ):
            evidence.append(label + json.dumps(payload, sort_keys=True))
        # Only the first five suspect roots are listed to keep the evidence short.
        for path in suspect_roots[:5]:
            evidence.append(f"root={path}")

        nothing_found = not (residue_total or privacy_residue_total or suspect_roots)
        if nothing_found:
            return DoctorCheck(
                id=check_id,
                tier="runtime",
                status="healthy",
                severity="info",
                summary="Local memory index hygiene is clean",
                evidence=evidence,
                repair_plan=[],
            )
        if not fix:
            return DoctorCheck(
                id=check_id,
                tier="runtime",
                status="degraded",
                severity="warn",
                summary="Local memory index has stale or private residue",
                evidence=evidence,
                repair_plan=["Run `nexo doctor --tier runtime --fix` to purge stale local memory roots and private local-memory residue"],
                escalation_prompt="Local memory may contain stale or private index payloads that should be purged before indexing continues.",
            )
        return DoctorCheck(
            id=check_id,
            tier="runtime",
            status="healthy",
            severity="info",
            summary="Local memory index hygiene repaired",
            evidence=evidence,
            repair_plan=[],
            fixed=cleanup_total > 0 or privacy_cleanup_total > 0 or bool(suspect_roots),
        )
    except Exception as exc:
        return DoctorCheck(
            id=check_id,
            tier="runtime",
            status="degraded",
            severity="warn",
            summary="Local memory index hygiene could not be checked",
            evidence=[str(exc)],
            repair_plan=["Inspect local_context.api.local_index_hygiene and runtime DB tables"],
            escalation_prompt="Support cannot verify local memory index residue.",
        )
|
|
3901
|
+
|
|
3902
|
+
|
|
3836
3903
|
def run_runtime_checks(fix: bool = False) -> list[DoctorCheck]:
|
|
3837
3904
|
"""Run all runtime-tier checks. Read-only by default."""
|
|
3838
3905
|
return [
|
|
@@ -3854,6 +3921,7 @@ def run_runtime_checks(fix: bool = False) -> list[DoctorCheck]:
|
|
|
3854
3921
|
safe_check(check_automation_telemetry),
|
|
3855
3922
|
safe_check(check_automation_caller_coverage),
|
|
3856
3923
|
safe_check(check_state_watchers),
|
|
3924
|
+
safe_check(check_local_index_hygiene, fix=fix),
|
|
3857
3925
|
safe_check(check_release_artifact_sync),
|
|
3858
3926
|
safe_check(check_release_trace_hygiene),
|
|
3859
3927
|
safe_check(check_launchagent_inventory),
|
|
@@ -16,6 +16,7 @@ from .api import (
|
|
|
16
16
|
get_neighbors,
|
|
17
17
|
list_exclusions,
|
|
18
18
|
list_roots,
|
|
19
|
+
local_index_hygiene,
|
|
19
20
|
model_status,
|
|
20
21
|
pause,
|
|
21
22
|
purge_asset,
|
|
@@ -39,6 +40,7 @@ __all__ = [
|
|
|
39
40
|
"get_neighbors",
|
|
40
41
|
"list_exclusions",
|
|
41
42
|
"list_roots",
|
|
43
|
+
"local_index_hygiene",
|
|
42
44
|
"model_status",
|
|
43
45
|
"pause",
|
|
44
46
|
"purge_asset",
|
package/src/local_context/api.py
CHANGED
|
@@ -14,9 +14,9 @@ from db import get_db, init_db
|
|
|
14
14
|
from db._schema import run_migrations
|
|
15
15
|
|
|
16
16
|
from . import embeddings
|
|
17
|
-
from .extractors import chunk_text, entities, extract_text, summarize
|
|
17
|
+
from .extractors import chunk_text, contains_secret, entities, extract_text, summarize
|
|
18
18
|
from .logging import log_event, tail
|
|
19
|
-
from .privacy import classify_path, should_extract, should_skip_tree
|
|
19
|
+
from .privacy import classify_path, is_queryable_path, should_extract, should_skip_file, should_skip_tree
|
|
20
20
|
from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
|
|
21
21
|
|
|
22
22
|
LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
|
|
@@ -41,6 +41,9 @@ def _conn():
|
|
|
41
41
|
def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> dict:
|
|
42
42
|
conn = _conn()
|
|
43
43
|
root_path = norm_path(path)
|
|
44
|
+
if should_skip_tree(root_path):
|
|
45
|
+
log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
|
|
46
|
+
return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
|
|
44
47
|
depth_value = 2 if depth is None else int(depth)
|
|
45
48
|
conn.execute(
|
|
46
49
|
"""
|
|
@@ -64,9 +67,10 @@ def remove_root(path: str) -> dict:
|
|
|
64
67
|
conn = _conn()
|
|
65
68
|
root_path = norm_path(path)
|
|
66
69
|
conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE root_path=?", (now(), root_path))
|
|
70
|
+
cleanup = _purge_removed_root_payloads(conn, root_paths=[root_path])
|
|
67
71
|
conn.commit()
|
|
68
|
-
log_event("info", "root_removed", "Root removed", path=redact_path(root_path))
|
|
69
|
-
return {"ok": True, "root_path": root_path}
|
|
72
|
+
log_event("info", "root_removed", "Root removed", path=redact_path(root_path), cleanup=cleanup)
|
|
73
|
+
return {"ok": True, "root_path": root_path, "cleanup": cleanup}
|
|
70
74
|
|
|
71
75
|
|
|
72
76
|
def list_roots() -> list[dict]:
|
|
@@ -108,6 +112,8 @@ def _mounted_volume_roots() -> list[str]:
|
|
|
108
112
|
try:
|
|
109
113
|
if candidate.name.startswith(".") or not candidate.is_dir():
|
|
110
114
|
continue
|
|
115
|
+
if _should_skip_mounted_root(candidate):
|
|
116
|
+
continue
|
|
111
117
|
resolved = candidate.resolve()
|
|
112
118
|
if resolved == root_resolved:
|
|
113
119
|
continue
|
|
@@ -137,6 +143,250 @@ def ensure_default_roots() -> dict:
|
|
|
137
143
|
return {"ok": True, "created": len(created), "roots": list_roots()}
|
|
138
144
|
|
|
139
145
|
|
|
146
|
+
def _should_skip_mounted_root(candidate: Path) -> bool:
|
|
147
|
+
name = candidate.name.strip().lower()
|
|
148
|
+
if name in {"nexo desktop", "nexo desktop beta"} or name.startswith("nexo desktop "):
|
|
149
|
+
return True
|
|
150
|
+
try:
|
|
151
|
+
app_bundles = [child.name.lower() for child in candidate.iterdir() if child.suffix.lower() == ".app"]
|
|
152
|
+
except Exception:
|
|
153
|
+
app_bundles = []
|
|
154
|
+
if any(name.startswith("nexo desktop") for name in app_bundles):
|
|
155
|
+
installer_markers = (
|
|
156
|
+
candidate / ".background",
|
|
157
|
+
candidate / "Applications",
|
|
158
|
+
candidate / ".DS_Store",
|
|
159
|
+
)
|
|
160
|
+
if any(marker.exists() for marker in installer_markers):
|
|
161
|
+
return True
|
|
162
|
+
return False
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _removed_root_filters(conn, *, root_paths: list[str] | None = None) -> tuple[list[int], list[str]]:
|
|
166
|
+
if root_paths:
|
|
167
|
+
placeholders = ",".join("?" for _ in root_paths)
|
|
168
|
+
rows = conn.execute(
|
|
169
|
+
f"SELECT id, root_path FROM local_index_roots WHERE root_path IN ({placeholders}) AND status='removed'",
|
|
170
|
+
tuple(root_paths),
|
|
171
|
+
).fetchall()
|
|
172
|
+
else:
|
|
173
|
+
rows = conn.execute("SELECT id, root_path FROM local_index_roots WHERE status='removed'").fetchall()
|
|
174
|
+
return [int(row["id"]) for row in rows], [str(row["root_path"]) for row in rows]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _removed_root_payload_counts(conn, *, root_paths: list[str] | None = None) -> dict:
    """Count index rows that still belong to roots marked 'removed'.

    Returns a dict with the keys ``assets``, ``jobs``, ``errors``, ``dirs``
    and ``checkpoints``. ``root_paths`` optionally restricts the check to
    specific removed roots. Nothing is modified.

    Errors are counted in two parts: those linked to a matching asset, plus
    those with an empty ``asset_id`` whose path is the root or lies below it.
    """
    zero = {"assets": 0, "jobs": 0, "errors": 0, "dirs": 0, "checkpoints": 0}
    root_ids, removed_paths = _removed_root_filters(conn, root_paths=root_paths)
    if not root_ids and not removed_paths:
        return zero
    asset_filter, params = _removed_root_asset_filter(root_ids, removed_paths)
    if not asset_filter:
        return zero

    args = tuple(params)
    asset_subquery = f"SELECT asset_id FROM local_assets WHERE {asset_filter}"
    assets = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_assets WHERE {asset_filter}", args).fetchone()["total"] or 0)
    jobs = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_index_jobs WHERE asset_id IN ({asset_subquery})", args).fetchone()["total"] or 0)
    errors = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_index_errors WHERE asset_id IN ({asset_subquery})", args).fetchone()["total"] or 0)
    for path in removed_paths:
        # Escape LIKE wildcards so a root such as "/a/my_dir" does not also
        # match sibling paths like "/a/myXdir/...".
        escaped = path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        errors += int(
            conn.execute(
                "SELECT COUNT(*) AS total FROM local_index_errors WHERE asset_id='' AND (path = ? OR path LIKE ? ESCAPE '\\')",
                (path, f"{escaped}/%"),
            ).fetchone()["total"]
            or 0
        )

    dirs = 0
    checkpoints = 0
    if root_ids:
        root_placeholders = ",".join("?" for _ in root_ids)
        dirs = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_index_dirs WHERE root_id IN ({root_placeholders})", tuple(root_ids)).fetchone()["total"] or 0)
        checkpoints = int(conn.execute(f"SELECT COUNT(*) AS total FROM local_index_checkpoints WHERE root_id IN ({root_placeholders})", tuple(root_ids)).fetchone()["total"] or 0)
    return {"assets": assets, "jobs": jobs, "errors": errors, "dirs": dirs, "checkpoints": checkpoints}
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _removed_root_asset_filter(root_ids: list[int], removed_paths: list[str]) -> tuple[str, list[Any]]:
|
|
200
|
+
filters: list[str] = []
|
|
201
|
+
params: list[Any] = []
|
|
202
|
+
if root_ids:
|
|
203
|
+
root_placeholders = ",".join("?" for _ in root_ids)
|
|
204
|
+
filters.append(f"root_id IN ({root_placeholders})")
|
|
205
|
+
params.extend(root_ids)
|
|
206
|
+
for path in removed_paths:
|
|
207
|
+
filters.append("(path = ? OR path LIKE ?)")
|
|
208
|
+
params.extend([path, f"{path}/%"])
|
|
209
|
+
return " OR ".join(filters), params
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _purge_removed_root_payloads(conn, *, root_paths: list[str] | None = None) -> dict:
    """Delete every index row that belongs to roots marked 'removed'.

    ``root_paths`` optionally restricts the purge to specific removed roots.
    Returns the counts computed by ``_removed_root_payload_counts`` just
    before deletion (keys: assets, jobs, errors, dirs, checkpoints). The
    caller is responsible for committing.
    """
    zero = {"assets": 0, "jobs": 0, "errors": 0, "dirs": 0, "checkpoints": 0}
    root_ids, removed_paths = _removed_root_filters(conn, root_paths=root_paths)
    if not root_ids and not removed_paths:
        return zero

    asset_filter, params = _removed_root_asset_filter(root_ids, removed_paths)
    if not asset_filter:
        return zero
    args = tuple(params)
    asset_subquery = f"SELECT asset_id FROM local_assets WHERE {asset_filter}"
    # Snapshot the totals first; after the deletes they would all be zero.
    counts = _removed_root_payload_counts(conn, root_paths=root_paths)

    # Dependent tables go first: the subquery reads local_assets, which must
    # therefore be deleted last.
    for table in ("local_embeddings", "local_chunks", "local_entities", "local_asset_versions"):
        conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({asset_subquery})", args)
    conn.execute(f"DELETE FROM local_relations WHERE source_asset_id IN ({asset_subquery})", args)
    conn.execute(f"DELETE FROM local_relations WHERE target_asset_id IN ({asset_subquery})", args)
    conn.execute(f"DELETE FROM local_relations WHERE target_ref IN ({asset_subquery})", args)
    conn.execute(f"DELETE FROM local_index_jobs WHERE asset_id IN ({asset_subquery})", args)
    conn.execute(f"DELETE FROM local_index_errors WHERE asset_id IN ({asset_subquery})", args)

    for path in removed_paths:
        # Escape LIKE wildcards so a root such as "/a/my_dir" cannot delete
        # error rows recorded under "/a/myXdir/...".
        escaped = path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        conn.execute(
            "DELETE FROM local_index_errors WHERE path = ? OR path LIKE ? ESCAPE '\\'",
            (path, f"{escaped}/%"),
        )

    if root_ids:
        root_placeholders = ",".join("?" for _ in root_ids)
        conn.execute(f"DELETE FROM local_index_dirs WHERE root_id IN ({root_placeholders})", tuple(root_ids))
        conn.execute(f"DELETE FROM local_index_checkpoints WHERE root_id IN ({root_placeholders})", tuple(root_ids))
    conn.execute(f"DELETE FROM local_assets WHERE {asset_filter}", args)
    return counts
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _purge_asset_ids(conn, asset_ids: list[str]) -> dict:
|
|
243
|
+
unique_ids = [asset_id for asset_id in dict.fromkeys(asset_ids) if asset_id]
|
|
244
|
+
counts = {"assets": len(unique_ids), "jobs": 0, "errors": 0, "chunks": 0, "embeddings": 0, "entities": 0, "relations": 0, "versions": 0}
|
|
245
|
+
if not unique_ids:
|
|
246
|
+
return counts
|
|
247
|
+
for start in range(0, len(unique_ids), 500):
|
|
248
|
+
batch = unique_ids[start:start + 500]
|
|
249
|
+
placeholders = ",".join("?" for _ in batch)
|
|
250
|
+
for key, table in (
|
|
251
|
+
("embeddings", "local_embeddings"),
|
|
252
|
+
("chunks", "local_chunks"),
|
|
253
|
+
("entities", "local_entities"),
|
|
254
|
+
("versions", "local_asset_versions"),
|
|
255
|
+
("jobs", "local_index_jobs"),
|
|
256
|
+
("errors", "local_index_errors"),
|
|
257
|
+
):
|
|
258
|
+
counts[key] += int(conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
259
|
+
counts["relations"] += int(conn.execute(f"DELETE FROM local_relations WHERE source_asset_id IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
260
|
+
counts["relations"] += int(conn.execute(f"DELETE FROM local_relations WHERE target_asset_id IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
261
|
+
counts["relations"] += int(conn.execute(f"DELETE FROM local_relations WHERE target_ref IN ({placeholders})", tuple(batch)).rowcount or 0)
|
|
262
|
+
conn.execute(f"DELETE FROM local_assets WHERE asset_id IN ({placeholders})", tuple(batch))
|
|
263
|
+
return counts
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _privacy_unsafe_asset_ids(conn) -> list[str]:
    """List ids of indexed assets that the privacy rules no longer allow.

    An asset is unsafe when ``should_skip_file`` rejects its path, or when
    its stored ``privacy_class`` is one of the blocked / inventory-only
    classes listed below.
    """
    blocked_classes = {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}
    flagged: list[str] = []
    for record in conn.execute("SELECT asset_id, path, privacy_class FROM local_assets").fetchall():
        stored_class = str(record["privacy_class"] or "")
        # The path rule is evaluated first; the stored class is the fallback.
        if should_skip_file(str(record["path"] or "")):
            flagged.append(str(record["asset_id"]))
        elif stored_class in blocked_classes:
            flagged.append(str(record["asset_id"]))
    return flagged
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _privacy_unsafe_dir_ids(conn) -> list[str]:
    """List ids of indexed directories whose path ``should_skip_tree`` rejects."""
    flagged: list[str] = []
    for record in conn.execute("SELECT dir_id, path FROM local_index_dirs").fetchall():
        if should_skip_tree(str(record["path"] or "")):
            flagged.append(str(record["dir_id"]))
    return flagged
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _content_secret_asset_ids(conn) -> list[str]:
    """Find active, normal-privacy assets whose stored chunks contain a secret.

    Each chunk's text is checked with ``contains_secret``. Once one chunk of
    an asset matches, the remaining chunks of that asset are not checked.
    Returns the matching asset ids in sorted order.
    """
    chunk_rows = conn.execute(
        """
        SELECT c.asset_id, c.text
        FROM local_chunks c
        JOIN local_assets a ON a.asset_id=c.asset_id
        WHERE a.status='active'
          AND COALESCE(a.privacy_class, 'normal')='normal'
        ORDER BY c.asset_id, c.chunk_index
        """
    ).fetchall()
    flagged: set[str] = set()
    for chunk in chunk_rows:
        owner = str(chunk["asset_id"])
        if owner not in flagged and contains_secret(str(chunk["text"] or "")):
            flagged.add(owner)
    return sorted(flagged)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _mark_content_secret_assets(conn, asset_ids: list[str]) -> int:
    """Strip derived content from assets flagged as containing secrets.

    For each distinct, non-empty id, in batches of 500:

    * embeddings, chunks, entities and relations are deleted;
    * jobs are set to 'done' with ``last_error_code='content_secret_blocked'``;
    * versions get an empty summary and a ``content_blocked`` metadata marker;
    * the asset row is reclassified as ``content_secret_inventory_only`` with
      depth 1 and phase ``privacy_blocked``.

    Returns the number of distinct ids processed. The caller commits.
    """
    targets = [candidate for candidate in dict.fromkeys(asset_ids) if candidate]
    if not targets:
        return 0

    payload_tables = ("local_embeddings", "local_chunks", "local_entities")
    relation_columns = ("source_asset_id", "target_asset_id", "target_ref")

    offset = 0
    while offset < len(targets):
        chunk = targets[offset:offset + 500]
        offset += 500
        marks = ",".join("?" for _ in chunk)
        id_args = tuple(chunk)

        for table in payload_tables:
            conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({marks})", id_args)
        for column in relation_columns:
            conn.execute(f"DELETE FROM local_relations WHERE {column} IN ({marks})", id_args)

        conn.execute(
            f"""
            UPDATE local_index_jobs
            SET status='done', last_error_code='content_secret_blocked', updated_at=?
            WHERE asset_id IN ({marks})
            """,
            (now(), *chunk),
        )
        conn.execute(
            f"""
            UPDATE local_asset_versions
            SET summary='', metadata_json=?
            WHERE asset_id IN ({marks})
            """,
            (json_dumps({"content_blocked": "secret_pattern"}), *chunk),
        )
        conn.execute(
            f"""
            UPDATE local_assets
            SET privacy_class='content_secret_inventory_only',
                depth=1,
                depth_reason='content_secret',
                phase='privacy_blocked',
                updated_at=?
            WHERE asset_id IN ({marks})
            """,
            (now(), *chunk),
        )
    return len(targets)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def local_index_privacy_hygiene(*, fix: bool = False) -> dict:
    """Detect, and with ``fix=True`` remove, privacy-unsafe data in the local index.

    ``residue`` reports how many unsafe assets, unsafe directories and
    secret-bearing assets were found. With ``fix`` the unsafe assets are
    purged, unsafe directory rows are deleted, secret-bearing assets are
    stripped and reclassified, the transaction is committed, and a log event
    is written if anything was found. ``cleanup`` reports what was done and
    stays all-zero when ``fix`` is false.
    """
    conn = _conn()
    unsafe_assets = _privacy_unsafe_asset_ids(conn)
    unsafe_dirs = _privacy_unsafe_dir_ids(conn)
    secret_assets = _content_secret_asset_ids(conn)

    residue = {"assets": len(unsafe_assets), "dirs": len(unsafe_dirs), "content_secret_assets": len(secret_assets)}
    cleanup = {"assets": 0, "jobs": 0, "errors": 0, "chunks": 0, "embeddings": 0, "entities": 0, "relations": 0, "versions": 0, "dirs": 0, "content_secret_assets": 0}
    report = {"ok": True, "fix": fix, "residue": residue, "cleanup": cleanup}
    if not fix:
        return report

    cleanup.update(_purge_asset_ids(conn, unsafe_assets))
    offset = 0
    while offset < len(unsafe_dirs):
        chunk = unsafe_dirs[offset:offset + 500]
        offset += 500
        marks = ",".join("?" for _ in chunk)
        cursor = conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({marks})", tuple(chunk))
        cleanup["dirs"] += int(cursor.rowcount or 0)
    cleanup["content_secret_assets"] = _mark_content_secret_assets(conn, secret_assets)
    conn.commit()

    if unsafe_assets or unsafe_dirs or secret_assets:
        log_event("warn", "privacy_hygiene_repaired", "Local memory privacy hygiene repaired", cleanup=cleanup)
    return report
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def local_index_hygiene(*, fix: bool = False) -> dict:
    """Detect, and with ``fix=True`` repair, stale roots and leftover index payloads.

    Steps:

    1. Every root not already marked 'removed' is checked against
       ``_should_skip_mounted_root`` and ``should_skip_tree``. Matches are
       reported in ``removed_roots`` and, with ``fix``, marked 'removed'.
    2. ``residue`` counts the payload rows still attached to removed roots.
    3. With ``fix`` those rows are purged and the change is committed.
    4. ``local_index_privacy_hygiene`` runs with the same ``fix`` flag and its
       result is returned under ``privacy``.

    Roots that already carry status 'removed' are deliberately not reported
    as suspects again. Their rows stay in ``local_index_roots`` after a
    repair, so re-reporting them would make every later check look dirty even
    though nothing is left to clean. Their leftover payload is still covered
    by ``residue``.
    """
    conn = _conn()
    removed_paths: list[str] = []
    candidates = conn.execute(
        "SELECT id, root_path FROM local_index_roots WHERE COALESCE(status, '') != 'removed'"
    ).fetchall()
    for row in candidates:
        path = str(row["root_path"] or "")
        if _should_skip_mounted_root(Path(path)) or should_skip_tree(path):
            removed_paths.append(path)
            if fix:
                conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE id=?", (now(), row["id"]))

    # Counted after the status updates above so newly removed roots are included.
    before = _removed_root_payload_counts(conn)
    cleanup = {"assets": 0, "jobs": 0, "errors": 0, "dirs": 0, "checkpoints": 0}
    if fix:
        cleanup = _purge_removed_root_payloads(conn)
        conn.commit()
    privacy = local_index_privacy_hygiene(fix=fix)
    if fix and (removed_paths or any(int(cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))):
        log_event("info", "index_hygiene_repaired", "Local memory index hygiene repaired", roots=[redact_path(path) for path in removed_paths], cleanup=cleanup)
    return {"ok": True, "fix": fix, "removed_roots": removed_paths, "residue": before, "cleanup": cleanup, "privacy": privacy}
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def repair_index_hygiene() -> dict:
    """Run ``local_index_hygiene`` in repair mode and return its report."""
    report = local_index_hygiene(fix=True)
    return report
|
|
388
|
+
|
|
389
|
+
|
|
140
390
|
def add_exclusion(path: str, *, reason: str = "user") -> dict:
|
|
141
391
|
conn = _conn()
|
|
142
392
|
excluded_path = norm_path(path)
|
|
@@ -303,6 +553,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
|
|
|
303
553
|
raw_path = str(path)
|
|
304
554
|
normalized = norm_path(raw_path)
|
|
305
555
|
asset_id = stable_id("asset", normalized)
|
|
556
|
+
if should_skip_file(normalized):
|
|
557
|
+
return asset_id, False, "skipped"
|
|
306
558
|
perm = _permission_state(path)
|
|
307
559
|
depth, privacy_class, depth_reason = classify_path(normalized)
|
|
308
560
|
depth = min(depth, root_depth)
|
|
@@ -398,7 +650,7 @@ def _mark_asset_deleted(conn, asset_id: str, deleted_at: float | None = None) ->
|
|
|
398
650
|
"""
|
|
399
651
|
UPDATE local_index_jobs
|
|
400
652
|
SET status='done', last_error_code='asset_deleted', updated_at=?
|
|
401
|
-
WHERE asset_id=? AND status IN ('pending', 'running')
|
|
653
|
+
WHERE asset_id=? AND status IN ('pending', 'running', 'failed')
|
|
402
654
|
""",
|
|
403
655
|
(deleted_at, asset_id),
|
|
404
656
|
)
|
|
@@ -425,6 +677,20 @@ def _mark_dir_subtree_deleted(conn, dir_path: str, deleted_at: float | None = No
|
|
|
425
677
|
return len(rows)
|
|
426
678
|
|
|
427
679
|
|
|
680
|
+
def _purge_dir_subtree(conn, dir_path: str) -> int:
    """Delete all index data recorded for ``dir_path`` and everything below it.

    Assets whose path equals the normalized directory or matches its prefix
    are purged via ``_purge_asset_ids``; matching directory and error rows
    are then deleted by path. Returns the number of asset rows selected for
    purging. The caller commits.
    """
    target = norm_path(dir_path)
    subtree_pattern = _path_prefix(target) + "%"
    match_args = (target, subtree_pattern)

    doomed_ids = []
    for record in conn.execute("SELECT asset_id FROM local_assets WHERE path=? OR path LIKE ?", match_args).fetchall():
        doomed_ids.append(str(record["asset_id"]))
    _purge_asset_ids(conn, doomed_ids)

    conn.execute("DELETE FROM local_index_dirs WHERE path=? OR path LIKE ?", match_args)
    conn.execute("DELETE FROM local_index_errors WHERE path=? OR path LIKE ?", match_args)
    return len(doomed_ids)
|
|
692
|
+
|
|
693
|
+
|
|
428
694
|
def _record_index_error(
|
|
429
695
|
conn,
|
|
430
696
|
*,
|
|
@@ -530,6 +796,8 @@ def _iter_files(
|
|
|
530
796
|
continue
|
|
531
797
|
if entry.is_file():
|
|
532
798
|
normalized = norm_path(entry)
|
|
799
|
+
if should_skip_file(normalized):
|
|
800
|
+
continue
|
|
533
801
|
if start_after_norm and normalized <= start_after_norm:
|
|
534
802
|
continue
|
|
535
803
|
yield entry
|
|
@@ -608,7 +876,11 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
|
|
|
608
876
|
path = str(row["path"])
|
|
609
877
|
root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
|
|
610
878
|
if _is_excluded(path, exclusions):
|
|
611
|
-
|
|
879
|
+
_purge_asset_ids(conn, [row["asset_id"]])
|
|
880
|
+
stats["excluded"] += 1
|
|
881
|
+
continue
|
|
882
|
+
if should_skip_file(path):
|
|
883
|
+
_purge_asset_ids(conn, [row["asset_id"]])
|
|
612
884
|
stats["excluded"] += 1
|
|
613
885
|
continue
|
|
614
886
|
if root_path is not None and not root_path.exists():
|
|
@@ -715,6 +987,8 @@ def _scan_known_directory(
|
|
|
715
987
|
stack.append(entry)
|
|
716
988
|
continue
|
|
717
989
|
if entry.is_file():
|
|
990
|
+
if should_skip_file(str(entry)):
|
|
991
|
+
continue
|
|
718
992
|
seen_files.add(norm_path(entry))
|
|
719
993
|
if stats["files_scanned"] >= file_limit:
|
|
720
994
|
continue
|
|
@@ -722,7 +996,7 @@ def _scan_known_directory(
|
|
|
722
996
|
stats["files_scanned"] += 1
|
|
723
997
|
if changed:
|
|
724
998
|
stats["files_changed"] += 1
|
|
725
|
-
if state
|
|
999
|
+
if state not in {"ok", "skipped"}:
|
|
726
1000
|
stats["errors"] += 1
|
|
727
1001
|
except Exception as exc:
|
|
728
1002
|
_record_scan_error(conn, stats, str(entry), "live_reconcile", exc)
|
|
@@ -766,6 +1040,10 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
|
|
|
766
1040
|
stats["files_deleted"] += _mark_dir_subtree_deleted(conn, str(dir_path), seen_at)
|
|
767
1041
|
stats["excluded_dirs"] += 1
|
|
768
1042
|
continue
|
|
1043
|
+
if should_skip_tree(str(dir_path)):
|
|
1044
|
+
stats["files_deleted"] += _purge_dir_subtree(conn, str(dir_path))
|
|
1045
|
+
stats["excluded_dirs"] += 1
|
|
1046
|
+
continue
|
|
769
1047
|
if root_path is not None and not root_path.exists():
|
|
770
1048
|
stats["offline"] += 1
|
|
771
1049
|
continue
|
|
@@ -845,6 +1123,12 @@ def scan_once(*, limit: int | None = None) -> dict:
|
|
|
845
1123
|
for root in roots:
|
|
846
1124
|
root_path = Path(root["root_path"]).expanduser()
|
|
847
1125
|
root_id = int(root["id"])
|
|
1126
|
+
if should_skip_tree(str(root_path)):
|
|
1127
|
+
conn.execute(
|
|
1128
|
+
"UPDATE local_index_roots SET status='removed', last_scan_at=?, updated_at=? WHERE id=?",
|
|
1129
|
+
(now(), now(), root_id),
|
|
1130
|
+
)
|
|
1131
|
+
continue
|
|
848
1132
|
if not root_path.exists():
|
|
849
1133
|
conn.execute(
|
|
850
1134
|
"UPDATE local_index_roots SET status='offline', last_scan_at=?, updated_at=? WHERE id=?",
|
|
@@ -876,7 +1160,7 @@ def scan_once(*, limit: int | None = None) -> dict:
|
|
|
876
1160
|
seen_for_root += 1
|
|
877
1161
|
if changed:
|
|
878
1162
|
totals["changed"] += 1
|
|
879
|
-
if state
|
|
1163
|
+
if state not in {"ok", "skipped"}:
|
|
880
1164
|
totals["errors"] += 1
|
|
881
1165
|
partial_root = bool(limit and seen_for_root >= limit)
|
|
882
1166
|
totals["partial"] = bool(totals["partial"] or partial_root)
|
|
@@ -1000,7 +1284,7 @@ def process_jobs(*, limit: int = 100) -> dict:
|
|
|
1000
1284
|
recovered = _requeue_due_jobs(conn)
|
|
1001
1285
|
rows = conn.execute(
|
|
1002
1286
|
"""
|
|
1003
|
-
SELECT j.*, a.path, a.depth, a.status AS asset_status
|
|
1287
|
+
SELECT j.*, a.path, a.depth, a.privacy_class, a.status AS asset_status
|
|
1004
1288
|
FROM local_index_jobs j
|
|
1005
1289
|
JOIN local_assets a ON a.asset_id = j.asset_id
|
|
1006
1290
|
WHERE j.status='pending'
|
|
@@ -1022,9 +1306,24 @@ def process_jobs(*, limit: int = 100) -> dict:
|
|
|
1022
1306
|
try:
|
|
1023
1307
|
if row["asset_status"] != "active":
|
|
1024
1308
|
raise FileNotFoundError(row["path"])
|
|
1309
|
+
if str(row["privacy_class"] or "normal") != "normal":
|
|
1310
|
+
conn.execute(
|
|
1311
|
+
"UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='privacy_blocked' WHERE job_id=?",
|
|
1312
|
+
(now(), job_id),
|
|
1313
|
+
)
|
|
1314
|
+
processed += 1
|
|
1315
|
+
continue
|
|
1025
1316
|
if job_type == "light_extraction":
|
|
1026
1317
|
text, metadata = extract_text(Path(row["path"]))
|
|
1027
1318
|
version_id = _latest_version_id(conn, asset_id)
|
|
1319
|
+
if contains_secret(text):
|
|
1320
|
+
_mark_content_secret_assets(conn, [asset_id])
|
|
1321
|
+
conn.execute(
|
|
1322
|
+
"UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='content_secret_blocked' WHERE job_id=?",
|
|
1323
|
+
(now(), job_id),
|
|
1324
|
+
)
|
|
1325
|
+
processed += 1
|
|
1326
|
+
continue
|
|
1028
1327
|
summary = summarize(text)
|
|
1029
1328
|
conn.execute(
|
|
1030
1329
|
"UPDATE local_asset_versions SET summary=?, metadata_json=? WHERE version_id=?",
|
|
@@ -1081,6 +1380,9 @@ def run_once(
|
|
|
1081
1380
|
live_dir_limit: int = DEFAULT_LIVE_DIR_LIMIT,
|
|
1082
1381
|
live_file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
|
|
1083
1382
|
) -> dict:
|
|
1383
|
+
if _get_state("privacy_hygiene_v2", "0") != "1":
|
|
1384
|
+
local_index_privacy_hygiene(fix=True)
|
|
1385
|
+
_set_state("privacy_hygiene_v2", "1")
|
|
1084
1386
|
if root:
|
|
1085
1387
|
add_root(root)
|
|
1086
1388
|
elif (
|
|
@@ -1102,9 +1404,19 @@ def run_once(
|
|
|
1102
1404
|
def _problem_rows(conn) -> list[dict]:
|
|
1103
1405
|
rows = conn.execute(
|
|
1104
1406
|
"""
|
|
1105
|
-
SELECT path, phase, error_code, user_message, technical_detail, retryable, created_at
|
|
1106
|
-
FROM local_index_errors
|
|
1107
|
-
|
|
1407
|
+
SELECT e.path, e.phase, e.error_code, e.user_message, e.technical_detail, e.retryable, e.created_at
|
|
1408
|
+
FROM local_index_errors e
|
|
1409
|
+
LEFT JOIN local_assets a ON a.asset_id=e.asset_id
|
|
1410
|
+
LEFT JOIN local_index_roots r ON r.id=a.root_id
|
|
1411
|
+
WHERE COALESCE(r.status, 'active') != 'removed'
|
|
1412
|
+
AND NOT EXISTS (
|
|
1413
|
+
SELECT 1
|
|
1414
|
+
FROM local_index_roots rr
|
|
1415
|
+
WHERE rr.status='removed'
|
|
1416
|
+
AND e.path != ''
|
|
1417
|
+
AND (e.path = rr.root_path OR e.path LIKE rr.root_path || '/%')
|
|
1418
|
+
)
|
|
1419
|
+
ORDER BY e.id DESC
|
|
1108
1420
|
LIMIT 20
|
|
1109
1421
|
"""
|
|
1110
1422
|
).fetchall()
|
|
@@ -1387,9 +1699,24 @@ def status() -> dict:
|
|
|
1387
1699
|
conn = _conn()
|
|
1388
1700
|
paused = _is_paused()
|
|
1389
1701
|
assets = conn.execute(
|
|
1390
|
-
"
|
|
1702
|
+
"""
|
|
1703
|
+
SELECT COUNT(*) AS total, SUM(CASE WHEN a.status='active' THEN 1 ELSE 0 END) AS active
|
|
1704
|
+
FROM local_assets a
|
|
1705
|
+
LEFT JOIN local_index_roots r ON r.id=a.root_id
|
|
1706
|
+
WHERE COALESCE(r.status, 'active') != 'removed'
|
|
1707
|
+
"""
|
|
1391
1708
|
).fetchone()
|
|
1392
|
-
job_rows = conn.execute(
|
|
1709
|
+
job_rows = conn.execute(
|
|
1710
|
+
"""
|
|
1711
|
+
SELECT j.status, COUNT(*) AS total
|
|
1712
|
+
FROM local_index_jobs j
|
|
1713
|
+
JOIN local_assets a ON a.asset_id=j.asset_id
|
|
1714
|
+
LEFT JOIN local_index_roots r ON r.id=a.root_id
|
|
1715
|
+
WHERE a.status='active'
|
|
1716
|
+
AND COALESCE(r.status, 'active') != 'removed'
|
|
1717
|
+
GROUP BY j.status
|
|
1718
|
+
"""
|
|
1719
|
+
).fetchall()
|
|
1393
1720
|
job_counts = {row["status"]: int(row["total"] or 0) for row in job_rows}
|
|
1394
1721
|
pending = int(job_counts.get("pending", 0) or 0)
|
|
1395
1722
|
running_jobs = int(job_counts.get("running", 0) or 0)
|
|
@@ -1401,7 +1728,15 @@ def status() -> dict:
|
|
|
1401
1728
|
roots = list_roots()
|
|
1402
1729
|
volumes = []
|
|
1403
1730
|
by_volume = conn.execute(
|
|
1404
|
-
"
|
|
1731
|
+
"""
|
|
1732
|
+
SELECT a.volume_id, COUNT(*) AS files
|
|
1733
|
+
FROM local_assets a
|
|
1734
|
+
LEFT JOIN local_index_roots r ON r.id=a.root_id
|
|
1735
|
+
WHERE a.status='active'
|
|
1736
|
+
AND COALESCE(r.status, 'active') != 'removed'
|
|
1737
|
+
GROUP BY a.volume_id
|
|
1738
|
+
ORDER BY a.volume_id
|
|
1739
|
+
"""
|
|
1405
1740
|
).fetchall()
|
|
1406
1741
|
for row in by_volume:
|
|
1407
1742
|
volumes.append({"id": row["volume_id"], "label": row["volume_id"] or "Disk", "files": row["files"], "status": "active"})
|
|
@@ -1526,17 +1861,21 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
|
|
|
1526
1861
|
qvec = embeddings.embed_text(query)
|
|
1527
1862
|
rows = conn.execute(
|
|
1528
1863
|
"""
|
|
1529
|
-
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, v.summary, e.vector_json
|
|
1864
|
+
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
|
|
1530
1865
|
FROM local_chunks c
|
|
1531
1866
|
JOIN local_assets a ON a.asset_id = c.asset_id
|
|
1532
1867
|
LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
|
|
1533
1868
|
LEFT JOIN local_embeddings e ON e.chunk_id = c.chunk_id
|
|
1534
1869
|
WHERE a.status='active'
|
|
1535
|
-
|
|
1870
|
+
AND a.privacy_class='normal'
|
|
1871
|
+
ORDER BY c.created_at DESC
|
|
1872
|
+
LIMIT 5000
|
|
1536
1873
|
"""
|
|
1537
1874
|
).fetchall()
|
|
1538
1875
|
scored = []
|
|
1539
1876
|
for row in rows:
|
|
1877
|
+
if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
|
|
1878
|
+
continue
|
|
1540
1879
|
vector = json_loads(row["vector_json"], [])
|
|
1541
1880
|
score = max(_search_text_score(query, row["text"]), embeddings.cosine(qvec, vector))
|
|
1542
1881
|
if score > 0:
|
|
@@ -1550,7 +1889,6 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
|
|
|
1550
1889
|
if row["asset_id"] not in seen_assets:
|
|
1551
1890
|
assets.append({
|
|
1552
1891
|
"asset_id": row["asset_id"],
|
|
1553
|
-
"path": row["path"],
|
|
1554
1892
|
"display_path": redact_path(row["path"]),
|
|
1555
1893
|
"file_type": row["file_type"],
|
|
1556
1894
|
"score": round(float(score), 4),
|
|
@@ -1644,13 +1982,7 @@ def get_neighbors(asset_id: str, *, limit: int = 30) -> dict:
|
|
|
1644
1982
|
|
|
1645
1983
|
def purge_asset(asset_id: str) -> dict:
|
|
1646
1984
|
conn = _conn()
|
|
1647
|
-
|
|
1648
|
-
conn.execute(f"DELETE FROM {table} WHERE asset_id=?", (asset_id,))
|
|
1649
|
-
conn.execute("DELETE FROM local_relations WHERE source_asset_id=?", (asset_id,))
|
|
1650
|
-
conn.execute("DELETE FROM local_index_errors WHERE asset_id=?", (asset_id,))
|
|
1651
|
-
conn.execute("DELETE FROM local_index_jobs WHERE asset_id=?", (asset_id,))
|
|
1652
|
-
conn.execute("DELETE FROM local_asset_versions WHERE asset_id=?", (asset_id,))
|
|
1653
|
-
conn.execute("DELETE FROM local_assets WHERE asset_id=?", (asset_id,))
|
|
1985
|
+
_purge_asset_ids(conn, [asset_id])
|
|
1654
1986
|
conn.commit()
|
|
1655
1987
|
log_event("info", "asset_purged", "Asset purged", asset_id=asset_id)
|
|
1656
1988
|
return {"ok": True, "asset_id": asset_id}
|
|
@@ -32,6 +32,26 @@ TEXT_SUFFIXES = {
|
|
|
32
32
|
".css",
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
+
SECRET_PATTERNS: tuple[re.Pattern, ...] = (
|
|
36
|
+
re.compile(r"\bBearer\s+[A-Za-z0-9._\-~+/]{12,}\b", re.I),
|
|
37
|
+
re.compile(r"\bsk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
|
|
38
|
+
re.compile(r"\bpk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
|
|
39
|
+
re.compile(r"\b(ghp|gho|ghu|ghs|ghr|github_pat|glpat|xoxb|xoxp|shpat)_[A-Za-z0-9_]{16,}\b", re.I),
|
|
40
|
+
re.compile(r"\b(AKIA|ASIA)[A-Z0-9]{16,}\b"),
|
|
41
|
+
re.compile(r"\bey[A-Za-z0-9_-]{10,}\.ey[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
|
|
42
|
+
re.compile(r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?PRIVATE KEY-----", re.I),
|
|
43
|
+
re.compile(r"\b([A-Z][A-Z0-9_]*(?:TOKEN|SECRET|KEY|PASSWORD|PASS)\s*[:=]\s*)['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
|
|
44
|
+
re.compile(r"\b(?:api[_-]?key|secret[_-]?key|auth[_-]?token)\s*[:=]\s*['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
|
|
45
|
+
re.compile(r"\b(?:password|passwd|pwd)\s*[:=]\s*['\"][^'\"]{6,}['\"]", re.I),
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def contains_secret(text: str) -> bool:
|
|
50
|
+
if not text:
|
|
51
|
+
return False
|
|
52
|
+
sample = text[:MAX_CHARS]
|
|
53
|
+
return any(pattern.search(sample) for pattern in SECRET_PATTERNS)
|
|
54
|
+
|
|
35
55
|
|
|
36
56
|
def _read_text(path: Path) -> str:
|
|
37
57
|
data = path.read_bytes()[:MAX_TEXT_BYTES]
|
|
@@ -6,21 +6,58 @@ SENSITIVE_FILE_NAMES = {
|
|
|
6
6
|
".env",
|
|
7
7
|
".env.local",
|
|
8
8
|
".env.production",
|
|
9
|
+
".npmrc",
|
|
10
|
+
".pypirc",
|
|
11
|
+
".netrc",
|
|
12
|
+
".boto",
|
|
13
|
+
".pgpass",
|
|
14
|
+
".my.cnf",
|
|
15
|
+
".git-credentials",
|
|
16
|
+
".mcp_publisher_token",
|
|
17
|
+
".mcpregistry_github_token",
|
|
18
|
+
".mcpregistry_registry_token",
|
|
9
19
|
"id_rsa",
|
|
10
20
|
"id_dsa",
|
|
11
21
|
"id_ecdsa",
|
|
12
22
|
"id_ed25519",
|
|
23
|
+
"known_hosts",
|
|
24
|
+
"authorized_keys",
|
|
13
25
|
"cookies.sqlite",
|
|
14
26
|
"login data",
|
|
15
27
|
"keychain-2.db",
|
|
16
28
|
}
|
|
17
29
|
|
|
30
|
+
SENSITIVE_NAME_MARKERS = {
|
|
31
|
+
"api_key",
|
|
32
|
+
"apikey",
|
|
33
|
+
"auth_token",
|
|
34
|
+
"bearer",
|
|
35
|
+
"client_secret",
|
|
36
|
+
"credential",
|
|
37
|
+
"credentials",
|
|
38
|
+
"oauth",
|
|
39
|
+
"password",
|
|
40
|
+
"passwd",
|
|
41
|
+
"private_key",
|
|
42
|
+
"secret",
|
|
43
|
+
"token",
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
SENSITIVE_SUFFIXES = {
|
|
47
|
+
".key",
|
|
48
|
+
".pem",
|
|
49
|
+
".p12",
|
|
50
|
+
".pfx",
|
|
51
|
+
".kdbx",
|
|
52
|
+
}
|
|
53
|
+
|
|
18
54
|
SENSITIVE_PARTS = {
|
|
19
55
|
".ssh",
|
|
20
56
|
".gnupg",
|
|
21
57
|
".aws",
|
|
22
58
|
".azure",
|
|
23
59
|
".kube",
|
|
60
|
+
".docker",
|
|
24
61
|
"password",
|
|
25
62
|
"passwords",
|
|
26
63
|
"1password",
|
|
@@ -53,9 +90,64 @@ NOISY_PARTS = {
|
|
|
53
90
|
".parcel-cache",
|
|
54
91
|
".bun",
|
|
55
92
|
".gradle",
|
|
93
|
+
"$tmp",
|
|
56
94
|
"target",
|
|
57
95
|
}
|
|
58
96
|
|
|
97
|
+
TRANSIENT_PARTS = {"tmp", "temp"}
|
|
98
|
+
|
|
99
|
+
PRIVATE_PROFILE_PARTS = {
|
|
100
|
+
".nexo",
|
|
101
|
+
".claude",
|
|
102
|
+
".codex",
|
|
103
|
+
".gemini",
|
|
104
|
+
".cursor",
|
|
105
|
+
".config",
|
|
106
|
+
".local",
|
|
107
|
+
".npm",
|
|
108
|
+
".yarn",
|
|
109
|
+
".pnpm-store",
|
|
110
|
+
".ollama",
|
|
111
|
+
".docker",
|
|
112
|
+
".vscode",
|
|
113
|
+
".idea",
|
|
114
|
+
"appdata",
|
|
115
|
+
"application data",
|
|
116
|
+
"library/application support",
|
|
117
|
+
"library/containers",
|
|
118
|
+
"library/group containers",
|
|
119
|
+
"library/keychains",
|
|
120
|
+
"library/logs",
|
|
121
|
+
"library/mail",
|
|
122
|
+
"library/messages",
|
|
123
|
+
"library/safari",
|
|
124
|
+
"library/saved application state",
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
PROFILE_HIDDEN_FILE_NAMES = {
|
|
128
|
+
".aider.chat.history.md",
|
|
129
|
+
".aider.input.history",
|
|
130
|
+
".bash_history",
|
|
131
|
+
".bash_profile",
|
|
132
|
+
".bashrc",
|
|
133
|
+
".claude.json",
|
|
134
|
+
".codex.json",
|
|
135
|
+
".cursorignore",
|
|
136
|
+
".ds_store",
|
|
137
|
+
".gitconfig",
|
|
138
|
+
".gitignore_global",
|
|
139
|
+
".lesshst",
|
|
140
|
+
".python_history",
|
|
141
|
+
".sqlite_history",
|
|
142
|
+
".viminfo",
|
|
143
|
+
".wget-hsts",
|
|
144
|
+
".zprofile",
|
|
145
|
+
".zsh_history",
|
|
146
|
+
".zshrc",
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
ALLOWED_HIDDEN_FILE_NAMES = set()
|
|
150
|
+
|
|
59
151
|
SYSTEM_PARTS = {
|
|
60
152
|
"system volume information",
|
|
61
153
|
"$recycle.bin",
|
|
@@ -69,34 +161,126 @@ SYSTEM_PARTS = {
|
|
|
69
161
|
}
|
|
70
162
|
|
|
71
163
|
|
|
72
|
-
def
|
|
73
|
-
|
|
164
|
+
def _normalized(path: str) -> str:
|
|
165
|
+
return str(Path(path)).replace("\\", "/").lower()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _parts(path: str) -> set[str]:
|
|
169
|
+
return {part for part in _normalized(path).replace(":", "/").split("/") if part}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _contains_path_marker(lowered: str, markers: set[str]) -> bool:
|
|
173
|
+
return any(marker in lowered for marker in markers)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _has_transient_project_part(path: str) -> bool:
|
|
177
|
+
parts = list(_normalized(path).replace(":", "/").split("/"))
|
|
178
|
+
for index, part in enumerate(parts):
|
|
179
|
+
if part in TRANSIENT_PARTS and index >= 2:
|
|
180
|
+
return True
|
|
181
|
+
return False
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _has_hidden_dir_part(path: str) -> bool:
|
|
185
|
+
parts = [part for part in _normalized(path).replace(":", "/").split("/") if part]
|
|
186
|
+
return any(part.startswith(".") and part not in {".", ".."} for part in parts[:-1])
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _is_home_hidden_path(path: str) -> bool:
|
|
190
|
+
try:
|
|
191
|
+
p = Path(path).expanduser()
|
|
192
|
+
home = Path.home().expanduser()
|
|
193
|
+
rel = p.relative_to(home)
|
|
194
|
+
except Exception:
|
|
195
|
+
return False
|
|
196
|
+
return bool(rel.parts) and rel.parts[0].startswith(".")
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def is_sensitive_path(path: str) -> bool:
|
|
74
200
|
p = Path(path)
|
|
75
|
-
lowered =
|
|
201
|
+
lowered = _normalized(path)
|
|
76
202
|
name = p.name.lower()
|
|
77
|
-
|
|
203
|
+
stem = p.stem.lower()
|
|
204
|
+
parts = _parts(path)
|
|
205
|
+
if name in SENSITIVE_FILE_NAMES:
|
|
206
|
+
return True
|
|
207
|
+
if name.startswith(".") and name not in ALLOWED_HIDDEN_FILE_NAMES:
|
|
208
|
+
return True
|
|
209
|
+
if name.startswith("~$"):
|
|
210
|
+
return True
|
|
211
|
+
if name.endswith((".tmp", ".swp", ".swo")):
|
|
212
|
+
return True
|
|
213
|
+
if p.suffix.lower() in SENSITIVE_SUFFIXES:
|
|
214
|
+
return True
|
|
215
|
+
if parts & SENSITIVE_PARTS:
|
|
216
|
+
return True
|
|
217
|
+
if any(marker in name or marker in stem for marker in SENSITIVE_NAME_MARKERS):
|
|
218
|
+
return True
|
|
219
|
+
return _contains_path_marker(lowered, SENSITIVE_PARTS)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def is_private_profile_path(path: str) -> bool:
|
|
223
|
+
lowered = _normalized(path)
|
|
224
|
+
parts = _parts(path)
|
|
225
|
+
if parts & PRIVATE_PROFILE_PARTS:
|
|
226
|
+
return True
|
|
227
|
+
if _contains_path_marker(lowered, PRIVATE_PROFILE_PARTS):
|
|
228
|
+
return True
|
|
229
|
+
name = Path(path).name.lower()
|
|
230
|
+
if name in PROFILE_HIDDEN_FILE_NAMES:
|
|
231
|
+
return True
|
|
232
|
+
if _is_home_hidden_path(path):
|
|
233
|
+
return True
|
|
234
|
+
return False
|
|
235
|
+
|
|
78
236
|
|
|
79
|
-
|
|
237
|
+
def classify_path(path: str) -> tuple[int, str, str]:
|
|
238
|
+
"""Return (depth, privacy_class, reason)."""
|
|
239
|
+
lowered = _normalized(path)
|
|
240
|
+
parts = _parts(path)
|
|
241
|
+
|
|
242
|
+
if is_sensitive_path(path):
|
|
80
243
|
return 1, "sensitive_inventory_only", "sensitive_path"
|
|
244
|
+
if is_private_profile_path(path):
|
|
245
|
+
return 0, "private_profile_blocked", "private_profile_path"
|
|
81
246
|
if any(item in lowered for item in SYSTEM_PARTS):
|
|
82
247
|
return 0, "system_blocked", "system_path"
|
|
83
|
-
if parts & NOISY_PARTS:
|
|
248
|
+
if parts & NOISY_PARTS or _has_transient_project_part(path) or _has_hidden_dir_part(path):
|
|
84
249
|
return 1, "inventory_only", "noisy_tree"
|
|
85
250
|
return 2, "normal", "default"
|
|
86
251
|
|
|
87
252
|
|
|
88
253
|
def should_skip_tree(path: str) -> bool:
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
254
|
+
lowered = _normalized(path)
|
|
255
|
+
parts = _parts(path)
|
|
256
|
+
if any(item in lowered for item in SYSTEM_PARTS):
|
|
257
|
+
return True
|
|
258
|
+
if is_sensitive_path(path) or is_private_profile_path(path):
|
|
259
|
+
return True
|
|
260
|
+
return bool(parts & NOISY_PARTS or _has_transient_project_part(path) or _has_hidden_dir_part(path))
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def should_skip_file(path: str) -> bool:
|
|
264
|
+
lowered = _normalized(path)
|
|
265
|
+
parts = _parts(path)
|
|
92
266
|
if any(item in lowered for item in SYSTEM_PARTS):
|
|
93
267
|
return True
|
|
94
|
-
|
|
268
|
+
if is_sensitive_path(path) or is_private_profile_path(path):
|
|
269
|
+
return True
|
|
270
|
+
return bool(parts & NOISY_PARTS or _has_transient_project_part(path) or _has_hidden_dir_part(path))
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def is_queryable_path(path: str, privacy_class: str = "") -> bool:
|
|
274
|
+
if privacy_class and privacy_class != "normal":
|
|
275
|
+
return False
|
|
276
|
+
return not should_skip_file(path)
|
|
95
277
|
|
|
96
278
|
|
|
97
279
|
def should_extract(path: str, depth: int) -> bool:
|
|
98
280
|
if depth < 2:
|
|
99
281
|
return False
|
|
282
|
+
if should_skip_file(path):
|
|
283
|
+
return False
|
|
100
284
|
suffix = Path(path).suffix.lower()
|
|
101
285
|
if suffix in {
|
|
102
286
|
".txt",
|