nexo-brain 7.20.3 → 7.20.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +13 -1
- package/package.json +1 -1
- package/src/doctor/providers/runtime.py +12 -5
- package/src/local_context/api.py +418 -35
- package/src/local_context/extractors.py +126 -2
- package/src/local_context/privacy.py +298 -10
- package/src/tools_hot_context.py +9 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.20.
|
|
3
|
+
"version": "7.20.8",
|
|
4
4
|
"description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "NEXO Brain",
|
package/README.md
CHANGED
|
@@ -18,7 +18,19 @@
|
|
|
18
18
|
|
|
19
19
|
[Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
|
|
20
20
|
|
|
21
|
-
Version `7.20.
|
|
21
|
+
Version `7.20.8` is the current packaged-runtime line. Patch release over v7.20.7 — Local Context recognises Windows Mail package roots and Outlook Mac profile roots as bounded local-email sources instead of rejecting them as generic AppData / Group Containers.
|
|
22
|
+
|
|
23
|
+
Previously in `7.20.7`: patch release over v7.20.6 — Local Context email-root bootstrap is deterministic across CI, WSL and migrated profiles while preserving macOS Mail.app, Windows Outlook, Thunderbird and NEXO email coverage.
|
|
24
|
+
|
|
25
|
+
Previously in `7.20.6`: patch release over v7.20.5 — Local Context ranks entity matches at chunk level, keeps old entity-matched assets eligible, adds safe local email roots for macOS/Windows/Linux, extracts `.eml`, `.emlx`, `.msg` and NEXO email DB continuity, and exposes local graph relations in pre-action context.
|
|
26
|
+
|
|
27
|
+
Previously in `7.20.5`: patch release over v7.20.4 — Local Context status reports elapsed indexing time and a defensive ETA while background jobs remain pending.
|
|
28
|
+
|
|
29
|
+
Previously in `7.20.4`: patch release over v7.20.3 — Local Context now blocks private dotfiles, hidden project folders and secret-bearing content before chunks, embeddings, graph relations or agent context are created.
|
|
30
|
+
|
|
31
|
+
Previously in `7.20.3`: patch release over v7.20.2 — installer DMG volumes are no longer added as local-memory roots, removed roots purge stale payloads, and doctor can repair removed-root residue.
|
|
32
|
+
|
|
33
|
+
Previously in `7.20.2`: patch release over v7.20.1 — Local Context now requeues stalled work, reports real macOS/Windows background-service health, records scan errors and preserves Windows drive roots.
|
|
22
34
|
|
|
23
35
|
Previously in `7.20.1`: patch release over v7.20.0 — the Local Context service now recovers from orphaned locks and mixed-version cycle failures instead of leaving the background index stuck.
|
|
24
36
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.20.
|
|
3
|
+
"version": "7.20.8",
|
|
4
4
|
"mcpName": "io.github.wazionapps/nexo",
|
|
5
5
|
"description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
|
|
6
6
|
"homepage": "https://nexo-brain.com",
|
|
@@ -3840,16 +3840,23 @@ def check_local_index_hygiene(fix: bool = False) -> DoctorCheck:
|
|
|
3840
3840
|
result = local_context_api.local_index_hygiene(fix=fix)
|
|
3841
3841
|
residue = result.get("residue") or {}
|
|
3842
3842
|
cleanup = result.get("cleanup") or {}
|
|
3843
|
+
privacy = result.get("privacy") or {}
|
|
3844
|
+
privacy_residue = privacy.get("residue") or {}
|
|
3845
|
+
privacy_cleanup = privacy.get("cleanup") or {}
|
|
3843
3846
|
suspect_roots = [str(path) for path in result.get("removed_roots") or []]
|
|
3844
3847
|
residue_total = sum(int(residue.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))
|
|
3845
3848
|
cleanup_total = sum(int(cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))
|
|
3849
|
+
privacy_residue_total = sum(int(privacy_residue.get(key, 0) or 0) for key in ("assets", "dirs", "content_secret_assets"))
|
|
3850
|
+
privacy_cleanup_total = sum(int(privacy_cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "chunks", "embeddings", "entities", "relations", "versions", "dirs", "content_secret_assets"))
|
|
3846
3851
|
evidence = [
|
|
3847
3852
|
"suspect_installer_roots=" + str(len(suspect_roots)),
|
|
3848
3853
|
"residue=" + json.dumps(residue, sort_keys=True),
|
|
3849
3854
|
"cleanup=" + json.dumps(cleanup, sort_keys=True),
|
|
3855
|
+
"privacy_residue=" + json.dumps(privacy_residue, sort_keys=True),
|
|
3856
|
+
"privacy_cleanup=" + json.dumps(privacy_cleanup, sort_keys=True),
|
|
3850
3857
|
]
|
|
3851
3858
|
evidence.extend(f"root={path}" for path in suspect_roots[:5])
|
|
3852
|
-
if residue_total == 0 and not suspect_roots:
|
|
3859
|
+
if residue_total == 0 and privacy_residue_total == 0 and not suspect_roots:
|
|
3853
3860
|
return DoctorCheck(
|
|
3854
3861
|
id="runtime.local_index_hygiene",
|
|
3855
3862
|
tier="runtime",
|
|
@@ -3868,17 +3875,17 @@ def check_local_index_hygiene(fix: bool = False) -> DoctorCheck:
|
|
|
3868
3875
|
summary="Local memory index hygiene repaired",
|
|
3869
3876
|
evidence=evidence,
|
|
3870
3877
|
repair_plan=[],
|
|
3871
|
-
fixed=cleanup_total > 0 or bool(suspect_roots),
|
|
3878
|
+
fixed=cleanup_total > 0 or privacy_cleanup_total > 0 or bool(suspect_roots),
|
|
3872
3879
|
)
|
|
3873
3880
|
return DoctorCheck(
|
|
3874
3881
|
id="runtime.local_index_hygiene",
|
|
3875
3882
|
tier="runtime",
|
|
3876
3883
|
status="degraded",
|
|
3877
3884
|
severity="warn",
|
|
3878
|
-
summary="Local memory index has stale
|
|
3885
|
+
summary="Local memory index has stale or private residue",
|
|
3879
3886
|
evidence=evidence,
|
|
3880
|
-
repair_plan=["Run `nexo doctor --tier runtime --fix` to purge stale local memory roots and
|
|
3881
|
-
escalation_prompt="Local memory
|
|
3887
|
+
repair_plan=["Run `nexo doctor --tier runtime --fix` to purge stale local memory roots and private local-memory residue"],
|
|
3888
|
+
escalation_prompt="Local memory may contain stale or private index payloads that should be purged before indexing continues.",
|
|
3882
3889
|
)
|
|
3883
3890
|
except Exception as exc:
|
|
3884
3891
|
return DoctorCheck(
|
package/src/local_context/api.py
CHANGED
|
@@ -14,9 +14,9 @@ from db import get_db, init_db
|
|
|
14
14
|
from db._schema import run_migrations
|
|
15
15
|
|
|
16
16
|
from . import embeddings
|
|
17
|
-
from .extractors import chunk_text, entities, extract_text, summarize
|
|
17
|
+
from .extractors import chunk_text, contains_secret, entities, extract_text, summarize
|
|
18
18
|
from .logging import log_event, tail
|
|
19
|
-
from .privacy import classify_path, should_extract, should_skip_tree
|
|
19
|
+
from .privacy import classify_path, is_queryable_path, should_extract, should_skip_file, should_skip_tree
|
|
20
20
|
from .util import content_hash, json_dumps, json_loads, norm_path, now, quick_fingerprint, redact_path, stable_id, system_label, tokenize
|
|
21
21
|
|
|
22
22
|
LOCAL_INDEX_SERVICE_LABEL = "com.nexo.local-index"
|
|
@@ -41,6 +41,9 @@ def _conn():
|
|
|
41
41
|
def add_root(path: str, *, mode: str = "normal", depth: int | None = None) -> dict:
|
|
42
42
|
conn = _conn()
|
|
43
43
|
root_path = norm_path(path)
|
|
44
|
+
if should_skip_tree(root_path):
|
|
45
|
+
log_event("warn", "root_rejected_private", "Root rejected by local memory privacy rules", path=redact_path(root_path))
|
|
46
|
+
return {"ok": False, "error": "root_blocked_by_privacy", "root_path": root_path}
|
|
44
47
|
depth_value = 2 if depth is None else int(depth)
|
|
45
48
|
conn.execute(
|
|
46
49
|
"""
|
|
@@ -120,12 +123,44 @@ def _mounted_volume_roots() -> list[str]:
|
|
|
120
123
|
return roots
|
|
121
124
|
|
|
122
125
|
|
|
126
|
+
def _local_email_roots() -> list[str]:
    """Return candidate local e-mail store directories as strings.

    The NEXO e-mail runtime directory always comes first, followed by the
    stores that belong to the current platform (listed whether or not they
    exist). Stores that belong to *other* platforms are appended only when
    they are actually present on disk.

    Returns:
        Path strings in discovery order. Entries are not normalised or
        de-duplicated beyond skipping cross-platform stores already listed.
    """
    home = Path.home()
    mac_roots = [
        home / "Library" / "Mail",
        home / "Library" / "Group Containers" / "UBF8T346G9.Office" / "Outlook" / "Outlook 15 Profiles",
    ]
    local_app_data = Path(os.environ.get("LOCALAPPDATA") or home / "AppData" / "Local")
    roaming_app_data = Path(os.environ.get("APPDATA") or home / "AppData" / "Roaming")
    windows_roots = [
        home / "Documents" / "Outlook Files",
        local_app_data / "Microsoft" / "Outlook",
        roaming_app_data / "Microsoft" / "Outlook",
        local_app_data / "Packages" / "microsoft.windowscommunicationsapps_8wekyb3d8bbwe" / "LocalState",
    ]
    linux_roots = [home / ".thunderbird", home / ".mozilla-thunderbird"]

    if sys.platform == "darwin":
        native_roots = mac_roots
    elif sys.platform.startswith("win"):
        native_roots = windows_roots
    else:
        native_roots = linux_roots
    roots: list[Path] = [home / ".nexo" / "runtime" / "nexo-email", *native_roots]

    # CI and migrated profiles can expose platform-specific mail stores while
    # running on another OS. Include only the stores that actually exist.
    for optional_root in (*mac_roots, *windows_roots, *linux_roots):
        if optional_root in roots:
            continue
        try:
            present = optional_root.exists()
        except OSError:
            # Path.exists() re-raises errors such as EACCES; an unreadable
            # profile must not abort root discovery, so treat it as absent.
            present = False
        if present:
            roots.append(optional_root)
    return [str(root) for root in roots]
|
|
156
|
+
|
|
157
|
+
|
|
123
158
|
def default_roots() -> list[str]:
|
|
124
159
|
home = Path.home()
|
|
125
160
|
configured = os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_ROOTS", "").strip()
|
|
126
161
|
if configured:
|
|
127
162
|
return _dedupe_roots([item for item in configured.split(os.pathsep) if item.strip()])
|
|
128
|
-
return _dedupe_roots([str(home), *_mounted_volume_roots()])
|
|
163
|
+
return _dedupe_roots([str(home), *_local_email_roots(), *_mounted_volume_roots()])
|
|
129
164
|
|
|
130
165
|
|
|
131
166
|
def ensure_default_roots() -> dict:
|
|
@@ -220,6 +255,7 @@ def _purge_removed_root_payloads(conn, *, root_paths: list[str] | None = None) -
|
|
|
220
255
|
for table in ("local_embeddings", "local_chunks", "local_entities", "local_asset_versions"):
|
|
221
256
|
conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({asset_subquery})", tuple(params))
|
|
222
257
|
conn.execute(f"DELETE FROM local_relations WHERE source_asset_id IN ({asset_subquery})", tuple(params))
|
|
258
|
+
conn.execute(f"DELETE FROM local_relations WHERE target_asset_id IN ({asset_subquery})", tuple(params))
|
|
223
259
|
conn.execute(f"DELETE FROM local_relations WHERE target_ref IN ({asset_subquery})", tuple(params))
|
|
224
260
|
conn.execute(f"DELETE FROM local_index_jobs WHERE asset_id IN ({asset_subquery})", tuple(params))
|
|
225
261
|
conn.execute(f"DELETE FROM local_index_errors WHERE asset_id IN ({asset_subquery})", tuple(params))
|
|
@@ -235,12 +271,136 @@ def _purge_removed_root_payloads(conn, *, root_paths: list[str] | None = None) -
|
|
|
235
271
|
return counts
|
|
236
272
|
|
|
237
273
|
|
|
274
|
+
def _purge_asset_ids(conn, asset_ids: list[str]) -> dict:
    """Delete the given assets and every payload row that references them.

    Args:
        conn: Database connection whose ``execute`` returns a cursor exposing
            ``rowcount``.
        asset_ids: Asset identifiers to purge. Duplicates and falsy entries
            are ignored.

    Returns:
        Per-category counts of rows actually deleted, keyed by ``assets``,
        ``jobs``, ``errors``, ``chunks``, ``embeddings``, ``entities``,
        ``relations`` and ``versions``. The caller is responsible for
        committing.
    """
    unique_ids = [asset_id for asset_id in dict.fromkeys(asset_ids) if asset_id]
    counts = {"assets": 0, "jobs": 0, "errors": 0, "chunks": 0, "embeddings": 0, "entities": 0, "relations": 0, "versions": 0}
    if not unique_ids:
        return counts

    payload_tables = (
        ("embeddings", "local_embeddings"),
        ("chunks", "local_chunks"),
        ("entities", "local_entities"),
        ("versions", "local_asset_versions"),
        ("jobs", "local_index_jobs"),
        ("errors", "local_index_errors"),
    )
    relation_columns = ("source_asset_id", "target_asset_id", "target_ref")

    # Work in batches of 500 ids to keep each statement's bound-parameter
    # count small.
    for start in range(0, len(unique_ids), 500):
        batch = tuple(unique_ids[start:start + 500])
        placeholders = ",".join("?" * len(batch))
        for key, table in payload_tables:
            cursor = conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({placeholders})", batch)
            counts[key] += int(cursor.rowcount or 0)
        # A relation may point at a purged asset from any of three columns;
        # rows removed by an earlier pass cannot be counted twice.
        for column in relation_columns:
            cursor = conn.execute(f"DELETE FROM local_relations WHERE {column} IN ({placeholders})", batch)
            counts["relations"] += int(cursor.rowcount or 0)
        # Count asset rows really removed instead of the number of ids asked
        # for, so the figure is consistent with the other counters.
        cursor = conn.execute(f"DELETE FROM local_assets WHERE asset_id IN ({placeholders})", batch)
        counts["assets"] += int(cursor.rowcount or 0)
    return counts
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _privacy_unsafe_asset_ids(conn) -> list[str]:
    """List the ids of indexed assets that privacy rules no longer allow.

    An asset is reported when ``should_skip_file`` rejects its path or when
    its stored ``privacy_class`` is one of the blocked / inventory-only
    classes named below. Rows are read from ``local_assets`` and ids are
    returned in query order.
    """
    blocked_classes = {"private_profile_blocked", "system_blocked", "sensitive_inventory_only"}

    def _is_unsafe(record) -> bool:
        # The path rule is consulted first, then the stored class.
        if should_skip_file(str(record["path"] or "")):
            return True
        return str(record["privacy_class"] or "") in blocked_classes

    records = conn.execute("SELECT asset_id, path, privacy_class FROM local_assets").fetchall()
    return [str(record["asset_id"]) for record in records if _is_unsafe(record)]
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _privacy_unsafe_dir_ids(conn) -> list[str]:
    """List the ids of tracked directories that ``should_skip_tree`` rejects.

    Every row of ``local_index_dirs`` is checked; ids are returned in query
    order as strings.
    """
    flagged: list[str] = []
    for record in conn.execute("SELECT dir_id, path FROM local_index_dirs").fetchall():
        if should_skip_tree(str(record["path"] or "")):
            flagged.append(str(record["dir_id"]))
    return flagged
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _content_secret_asset_ids(conn) -> list[str]:
    """Find active, normal-privacy assets whose stored chunks contain a secret.

    Each chunk text of every active asset whose ``privacy_class`` is
    ``'normal'`` (or NULL) is passed to ``contains_secret``. Once one chunk of
    an asset matches, the remaining chunks of that asset are not scanned.

    Returns:
        The matching asset ids, sorted.
    """
    cursor = conn.execute(
        """
        SELECT c.asset_id, c.text
        FROM local_chunks c
        JOIN local_assets a ON a.asset_id=c.asset_id
        WHERE a.status='active'
          AND COALESCE(a.privacy_class, 'normal')='normal'
        ORDER BY c.asset_id, c.chunk_index
        """
    )
    secret_ids: set[str] = set()
    # Stream rows from the cursor rather than fetchall(): this walks the text
    # of every indexed chunk, which should not be held in memory all at once.
    for row in cursor:
        asset_id = str(row["asset_id"])
        if asset_id in secret_ids:
            continue
        if contains_secret(str(row["text"] or "")):
            secret_ids.add(asset_id)
    return sorted(secret_ids)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _mark_content_secret_assets(conn, asset_ids: list[str]) -> int:
    """Strip derived content from secret-bearing assets and flag them.

    For each distinct, non-empty id this removes the asset's embeddings,
    chunks, entities and relations, marks its index jobs as done with
    ``content_secret_blocked``, blanks its version summaries, and re-labels
    the asset row as ``content_secret_inventory_only``. The asset row itself
    is kept. The caller is responsible for committing.

    Returns:
        The number of distinct ids processed.
    """
    targets = [candidate for candidate in dict.fromkeys(asset_ids) if candidate]
    if not targets:
        return 0

    derived_tables = ("local_embeddings", "local_chunks", "local_entities")
    relation_columns = ("source_asset_id", "target_asset_id", "target_ref")

    offset = 0
    while offset < len(targets):
        batch = targets[offset:offset + 500]
        offset += 500
        placeholders = ",".join("?" for _ in batch)
        bound_ids = tuple(batch)

        # 1. Drop everything derived from the file's content.
        for table in derived_tables:
            conn.execute(f"DELETE FROM {table} WHERE asset_id IN ({placeholders})", bound_ids)
        for column in relation_columns:
            conn.execute(f"DELETE FROM local_relations WHERE {column} IN ({placeholders})", bound_ids)

        # 2. Close the asset's jobs so no further extraction is attempted.
        conn.execute(
            f"""
            UPDATE local_index_jobs
            SET status='done', last_error_code='content_secret_blocked', updated_at=?
            WHERE asset_id IN ({placeholders})
            """,
            (now(), *batch),
        )

        # 3. Blank stored summaries and record why the content was withheld.
        conn.execute(
            f"""
            UPDATE local_asset_versions
            SET summary='', metadata_json=?
            WHERE asset_id IN ({placeholders})
            """,
            (json_dumps({"content_blocked": "secret_pattern"}), *batch),
        )

        # 4. Re-classify the asset row itself.
        conn.execute(
            f"""
            UPDATE local_assets
            SET privacy_class='content_secret_inventory_only',
                depth=1,
                depth_reason='content_secret',
                phase='privacy_blocked',
                updated_at=?
            WHERE asset_id IN ({placeholders})
            """,
            (now(), *batch),
        )
    return len(targets)
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def local_index_privacy_hygiene(*, fix: bool = False) -> dict:
    """Report, and optionally repair, privacy-unsafe residue in the index.

    Args:
        fix: When true, purge unsafe assets, delete unsafe directory rows,
            strip content from secret-bearing assets, commit, and log the
            repair if anything was found. When false, only report.

    Returns:
        ``{"ok": True, "fix": fix, "residue": ..., "cleanup": ...}`` where
        ``residue`` counts what was found before any repair and ``cleanup``
        counts what was removed (all zeros when ``fix`` is false).
    """
    conn = _conn()
    unsafe_assets = _privacy_unsafe_asset_ids(conn)
    unsafe_dirs = _privacy_unsafe_dir_ids(conn)
    secret_assets = _content_secret_asset_ids(conn)

    residue = {
        "assets": len(unsafe_assets),
        "dirs": len(unsafe_dirs),
        "content_secret_assets": len(secret_assets),
    }
    cleanup = {"assets": 0, "jobs": 0, "errors": 0, "chunks": 0, "embeddings": 0, "entities": 0, "relations": 0, "versions": 0, "dirs": 0, "content_secret_assets": 0}

    if not fix:
        return {"ok": True, "fix": fix, "residue": residue, "cleanup": cleanup}

    cleanup.update(_purge_asset_ids(conn, unsafe_assets))
    # Directory rows are deleted in batches of 500 ids per statement.
    for offset in range(0, len(unsafe_dirs), 500):
        chunk = unsafe_dirs[offset:offset + 500]
        marks = ",".join("?" for _ in chunk)
        removed = conn.execute(f"DELETE FROM local_index_dirs WHERE dir_id IN ({marks})", tuple(chunk)).rowcount
        cleanup["dirs"] += int(removed or 0)
    cleanup["content_secret_assets"] = _mark_content_secret_assets(conn, secret_assets)
    conn.commit()

    found_anything = bool(unsafe_assets or unsafe_dirs or secret_assets)
    if found_anything:
        log_event("warn", "privacy_hygiene_repaired", "Local memory privacy hygiene repaired", cleanup=cleanup)
    return {"ok": True, "fix": fix, "residue": residue, "cleanup": cleanup}
|
|
396
|
+
|
|
397
|
+
|
|
238
398
|
def local_index_hygiene(*, fix: bool = False) -> dict:
|
|
239
399
|
conn = _conn()
|
|
240
400
|
removed_paths: list[str] = []
|
|
241
401
|
for row in conn.execute("SELECT id, root_path FROM local_index_roots").fetchall():
|
|
242
402
|
path = str(row["root_path"] or "")
|
|
243
|
-
if _should_skip_mounted_root(Path(path)):
|
|
403
|
+
if _should_skip_mounted_root(Path(path)) or should_skip_tree(path):
|
|
244
404
|
removed_paths.append(path)
|
|
245
405
|
if fix:
|
|
246
406
|
conn.execute("UPDATE local_index_roots SET status='removed', updated_at=? WHERE id=?", (now(), row["id"]))
|
|
@@ -249,9 +409,10 @@ def local_index_hygiene(*, fix: bool = False) -> dict:
|
|
|
249
409
|
if fix:
|
|
250
410
|
cleanup = _purge_removed_root_payloads(conn)
|
|
251
411
|
conn.commit()
|
|
412
|
+
privacy = local_index_privacy_hygiene(fix=fix)
|
|
252
413
|
if fix and (removed_paths or any(int(cleanup.get(key, 0) or 0) for key in ("assets", "jobs", "errors", "dirs", "checkpoints"))):
|
|
253
414
|
log_event("info", "index_hygiene_repaired", "Local memory index hygiene repaired", roots=[redact_path(path) for path in removed_paths], cleanup=cleanup)
|
|
254
|
-
return {"ok": True, "fix": fix, "removed_roots": removed_paths, "residue": before, "cleanup": cleanup}
|
|
415
|
+
return {"ok": True, "fix": fix, "removed_roots": removed_paths, "residue": before, "cleanup": cleanup, "privacy": privacy}
|
|
255
416
|
|
|
256
417
|
|
|
257
418
|
def repair_index_hygiene() -> dict:
|
|
@@ -342,7 +503,7 @@ def _file_type(path: Path) -> str:
|
|
|
342
503
|
return "photo"
|
|
343
504
|
if suffix in {".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql", ".css", ".html"}:
|
|
344
505
|
return "code"
|
|
345
|
-
if suffix in {".eml"}:
|
|
506
|
+
if suffix in {".eml", ".emlx", ".msg", ".pst", ".ost"}:
|
|
346
507
|
return "email"
|
|
347
508
|
if suffix in {".pdf", ".docx", ".pptx", ".xlsx", ".md", ".txt", ".csv", ".tsv"}:
|
|
348
509
|
return "document"
|
|
@@ -424,6 +585,8 @@ def _upsert_asset(conn, root_id: int, path: Path, seen_at: float, root_depth: in
|
|
|
424
585
|
raw_path = str(path)
|
|
425
586
|
normalized = norm_path(raw_path)
|
|
426
587
|
asset_id = stable_id("asset", normalized)
|
|
588
|
+
if should_skip_file(normalized):
|
|
589
|
+
return asset_id, False, "skipped"
|
|
427
590
|
perm = _permission_state(path)
|
|
428
591
|
depth, privacy_class, depth_reason = classify_path(normalized)
|
|
429
592
|
depth = min(depth, root_depth)
|
|
@@ -546,6 +709,20 @@ def _mark_dir_subtree_deleted(conn, dir_path: str, deleted_at: float | None = No
|
|
|
546
709
|
return len(rows)
|
|
547
710
|
|
|
548
711
|
|
|
712
|
+
def _purge_dir_subtree(conn, dir_path: str) -> int:
    """Purge every indexed asset, directory row and error row under a directory.

    Args:
        conn: Database connection.
        dir_path: Directory whose subtree (and the path itself) is purged.

    Returns:
        The number of asset ids selected for purging. The caller is
        responsible for committing.
    """
    normalized = norm_path(dir_path)
    prefix = _path_prefix(normalized)
    # "_" and "%" are LIKE wildcards. Unescaped, a directory such as "a_b/"
    # would also match "aXb/..." and this destructive purge would reach into
    # unrelated sibling trees. Escape them (and the escape character itself).
    escaped_prefix = prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    subtree_pattern = escaped_prefix + "%"
    where = "path=? OR path LIKE ? ESCAPE '\\'"
    params = (normalized, subtree_pattern)

    rows = conn.execute(f"SELECT asset_id FROM local_assets WHERE {where}", params).fetchall()
    asset_ids = [str(row["asset_id"]) for row in rows]
    _purge_asset_ids(conn, asset_ids)
    conn.execute(f"DELETE FROM local_index_dirs WHERE {where}", params)
    conn.execute(f"DELETE FROM local_index_errors WHERE {where}", params)
    return len(asset_ids)
|
|
724
|
+
|
|
725
|
+
|
|
549
726
|
def _record_index_error(
|
|
550
727
|
conn,
|
|
551
728
|
*,
|
|
@@ -651,6 +828,8 @@ def _iter_files(
|
|
|
651
828
|
continue
|
|
652
829
|
if entry.is_file():
|
|
653
830
|
normalized = norm_path(entry)
|
|
831
|
+
if should_skip_file(normalized):
|
|
832
|
+
continue
|
|
654
833
|
if start_after_norm and normalized <= start_after_norm:
|
|
655
834
|
continue
|
|
656
835
|
yield entry
|
|
@@ -729,7 +908,11 @@ def _reconcile_known_assets(conn, exclusions: list[str], *, limit: int) -> dict:
|
|
|
729
908
|
path = str(row["path"])
|
|
730
909
|
root_path = Path(row["root_path"]).expanduser() if row["root_path"] else None
|
|
731
910
|
if _is_excluded(path, exclusions):
|
|
732
|
-
|
|
911
|
+
_purge_asset_ids(conn, [row["asset_id"]])
|
|
912
|
+
stats["excluded"] += 1
|
|
913
|
+
continue
|
|
914
|
+
if should_skip_file(path):
|
|
915
|
+
_purge_asset_ids(conn, [row["asset_id"]])
|
|
733
916
|
stats["excluded"] += 1
|
|
734
917
|
continue
|
|
735
918
|
if root_path is not None and not root_path.exists():
|
|
@@ -836,6 +1019,8 @@ def _scan_known_directory(
|
|
|
836
1019
|
stack.append(entry)
|
|
837
1020
|
continue
|
|
838
1021
|
if entry.is_file():
|
|
1022
|
+
if should_skip_file(str(entry)):
|
|
1023
|
+
continue
|
|
839
1024
|
seen_files.add(norm_path(entry))
|
|
840
1025
|
if stats["files_scanned"] >= file_limit:
|
|
841
1026
|
continue
|
|
@@ -843,7 +1028,7 @@ def _scan_known_directory(
|
|
|
843
1028
|
stats["files_scanned"] += 1
|
|
844
1029
|
if changed:
|
|
845
1030
|
stats["files_changed"] += 1
|
|
846
|
-
if state
|
|
1031
|
+
if state not in {"ok", "skipped"}:
|
|
847
1032
|
stats["errors"] += 1
|
|
848
1033
|
except Exception as exc:
|
|
849
1034
|
_record_scan_error(conn, stats, str(entry), "live_reconcile", exc)
|
|
@@ -887,6 +1072,10 @@ def _reconcile_known_dirs(conn, exclusions: list[str], *, dir_limit: int, file_l
|
|
|
887
1072
|
stats["files_deleted"] += _mark_dir_subtree_deleted(conn, str(dir_path), seen_at)
|
|
888
1073
|
stats["excluded_dirs"] += 1
|
|
889
1074
|
continue
|
|
1075
|
+
if should_skip_tree(str(dir_path)):
|
|
1076
|
+
stats["files_deleted"] += _purge_dir_subtree(conn, str(dir_path))
|
|
1077
|
+
stats["excluded_dirs"] += 1
|
|
1078
|
+
continue
|
|
890
1079
|
if root_path is not None and not root_path.exists():
|
|
891
1080
|
stats["offline"] += 1
|
|
892
1081
|
continue
|
|
@@ -966,6 +1155,12 @@ def scan_once(*, limit: int | None = None) -> dict:
|
|
|
966
1155
|
for root in roots:
|
|
967
1156
|
root_path = Path(root["root_path"]).expanduser()
|
|
968
1157
|
root_id = int(root["id"])
|
|
1158
|
+
if should_skip_tree(str(root_path)):
|
|
1159
|
+
conn.execute(
|
|
1160
|
+
"UPDATE local_index_roots SET status='removed', last_scan_at=?, updated_at=? WHERE id=?",
|
|
1161
|
+
(now(), now(), root_id),
|
|
1162
|
+
)
|
|
1163
|
+
continue
|
|
969
1164
|
if not root_path.exists():
|
|
970
1165
|
conn.execute(
|
|
971
1166
|
"UPDATE local_index_roots SET status='offline', last_scan_at=?, updated_at=? WHERE id=?",
|
|
@@ -997,7 +1192,7 @@ def scan_once(*, limit: int | None = None) -> dict:
|
|
|
997
1192
|
seen_for_root += 1
|
|
998
1193
|
if changed:
|
|
999
1194
|
totals["changed"] += 1
|
|
1000
|
-
if state
|
|
1195
|
+
if state not in {"ok", "skipped"}:
|
|
1001
1196
|
totals["errors"] += 1
|
|
1002
1197
|
partial_root = bool(limit and seen_for_root >= limit)
|
|
1003
1198
|
totals["partial"] = bool(totals["partial"] or partial_root)
|
|
@@ -1121,7 +1316,7 @@ def process_jobs(*, limit: int = 100) -> dict:
|
|
|
1121
1316
|
recovered = _requeue_due_jobs(conn)
|
|
1122
1317
|
rows = conn.execute(
|
|
1123
1318
|
"""
|
|
1124
|
-
SELECT j.*, a.path, a.depth, a.status AS asset_status
|
|
1319
|
+
SELECT j.*, a.path, a.depth, a.privacy_class, a.status AS asset_status
|
|
1125
1320
|
FROM local_index_jobs j
|
|
1126
1321
|
JOIN local_assets a ON a.asset_id = j.asset_id
|
|
1127
1322
|
WHERE j.status='pending'
|
|
@@ -1143,9 +1338,24 @@ def process_jobs(*, limit: int = 100) -> dict:
|
|
|
1143
1338
|
try:
|
|
1144
1339
|
if row["asset_status"] != "active":
|
|
1145
1340
|
raise FileNotFoundError(row["path"])
|
|
1341
|
+
if str(row["privacy_class"] or "normal") != "normal":
|
|
1342
|
+
conn.execute(
|
|
1343
|
+
"UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='privacy_blocked' WHERE job_id=?",
|
|
1344
|
+
(now(), job_id),
|
|
1345
|
+
)
|
|
1346
|
+
processed += 1
|
|
1347
|
+
continue
|
|
1146
1348
|
if job_type == "light_extraction":
|
|
1147
1349
|
text, metadata = extract_text(Path(row["path"]))
|
|
1148
1350
|
version_id = _latest_version_id(conn, asset_id)
|
|
1351
|
+
if contains_secret(text):
|
|
1352
|
+
_mark_content_secret_assets(conn, [asset_id])
|
|
1353
|
+
conn.execute(
|
|
1354
|
+
"UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='content_secret_blocked' WHERE job_id=?",
|
|
1355
|
+
(now(), job_id),
|
|
1356
|
+
)
|
|
1357
|
+
processed += 1
|
|
1358
|
+
continue
|
|
1149
1359
|
summary = summarize(text)
|
|
1150
1360
|
conn.execute(
|
|
1151
1361
|
"UPDATE local_asset_versions SET summary=?, metadata_json=? WHERE version_id=?",
|
|
@@ -1202,6 +1412,9 @@ def run_once(
|
|
|
1202
1412
|
live_dir_limit: int = DEFAULT_LIVE_DIR_LIMIT,
|
|
1203
1413
|
live_file_limit: int = DEFAULT_LIVE_FILE_LIMIT,
|
|
1204
1414
|
) -> dict:
|
|
1415
|
+
if _get_state("privacy_hygiene_v2", "0") != "1":
|
|
1416
|
+
local_index_privacy_hygiene(fix=True)
|
|
1417
|
+
_set_state("privacy_hygiene_v2", "1")
|
|
1205
1418
|
if root:
|
|
1206
1419
|
add_root(root)
|
|
1207
1420
|
elif (
|
|
@@ -1471,6 +1684,29 @@ def _service_cycle_observation(conn) -> dict:
|
|
|
1471
1684
|
return observation
|
|
1472
1685
|
|
|
1473
1686
|
|
|
1687
|
+
def _index_timing(conn, *, done: int, active_jobs: int, percent: int) -> dict:
    """Compute elapsed indexing time and a rough ETA for the status report.

    The start time is the earliest ``created_at`` among the listed indexing
    log events; when there is none, the earliest ``first_seen_at`` of a
    non-deleted asset is used instead.

    Args:
        done: Number of finished jobs.
        active_jobs: Number of jobs still outstanding.
        percent: Overall completion percentage.

    Returns:
        ``{"elapsed_seconds": int, "eta_seconds": int | None}``. The ETA is
        ``None`` unless time has elapsed, some jobs are done, some remain,
        and ``percent`` is strictly between 0 and 100.
    """
    log_row = conn.execute(
        """
        SELECT MIN(created_at) AS created_at
        FROM local_index_logs
        WHERE event IN ('root_added', 'scan_started', 'scan_finished', 'jobs_processed', 'service_cycle_finished')
        """
    ).fetchone()
    started_at = log_row["created_at"] or 0

    if not started_at:
        # No indexing log yet: fall back to the oldest live asset.
        asset_row = conn.execute(
            """
            SELECT MIN(first_seen_at) AS first_seen_at
            FROM local_assets
            WHERE status!='deleted'
            """
        ).fetchone()
        started_at = asset_row["first_seen_at"] or 0

    if started_at:
        elapsed_seconds = max(0, int(now() - float(started_at)))
    else:
        elapsed_seconds = 0

    eta_seconds = None
    can_estimate = elapsed_seconds > 0 and done > 0 and active_jobs > 0 and 0 < percent < 100
    if can_estimate:
        # Average seconds per finished job, projected over the remaining jobs.
        eta_seconds = max(0, int((elapsed_seconds / max(done, 1)) * active_jobs))
    return {"elapsed_seconds": elapsed_seconds, "eta_seconds": eta_seconds}
|
|
1708
|
+
|
|
1709
|
+
|
|
1474
1710
|
def _service_scheduler_has_error(service: dict) -> bool:
|
|
1475
1711
|
if service.get("manager") == "launchagent":
|
|
1476
1712
|
code = str(service.get("last_exit_code") or "").strip()
|
|
@@ -1544,6 +1780,7 @@ def status() -> dict:
|
|
|
1544
1780
|
active_jobs = pending + running_jobs + failed_jobs
|
|
1545
1781
|
total_jobs = active_jobs + done
|
|
1546
1782
|
percent = 100 if total_jobs == 0 else int((done / max(total_jobs, 1)) * 100)
|
|
1783
|
+
timing = _index_timing(conn, done=done, active_jobs=active_jobs, percent=percent)
|
|
1547
1784
|
roots = list_roots()
|
|
1548
1785
|
volumes = []
|
|
1549
1786
|
by_volume = conn.execute(
|
|
@@ -1589,8 +1826,8 @@ def status() -> dict:
|
|
|
1589
1826
|
"jobs_pending": pending,
|
|
1590
1827
|
"jobs_running": running_jobs,
|
|
1591
1828
|
"jobs_failed": failed_jobs,
|
|
1592
|
-
"elapsed_seconds":
|
|
1593
|
-
"eta_seconds":
|
|
1829
|
+
"elapsed_seconds": timing["elapsed_seconds"],
|
|
1830
|
+
"eta_seconds": timing["eta_seconds"],
|
|
1594
1831
|
},
|
|
1595
1832
|
"volumes": volumes,
|
|
1596
1833
|
"roots": roots,
|
|
@@ -1675,26 +1912,183 @@ def _search_text_score(query: str, text: str) -> float:
|
|
|
1675
1912
|
return len(q & tokens) / max(len(q), 1)
|
|
1676
1913
|
|
|
1677
1914
|
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1915
|
+
# English and Spanish filler words that are dropped from a query before its
# remaining terms are matched against entity names. Spanish words are listed
# both with and without accents, because interrogatives are normally written
# accented ("qué", "dónde", "cuánto").
_QUERY_STOPWORDS = {
    "about",
    "archivos",
    "con",
    "context",
    "contexto",
    "cuanto",
    "cuánto",
    "dame",
    "del",
    "desde",
    "documentos",
    "donde",
    "dónde",
    "esta",
    "está",
    "file",
    "files",
    "hay",
    "los",
    "para",
    "que",
    "qué",
    "related",
    "relacionado",
    "sabes",
    "sobre",
    "todo",
    "what",
    "where",
}
|
|
1944
|
+
|
|
1945
|
+
|
|
1946
|
+
def _query_terms(query: str) -> list[str]:
    """Reduce a query to at most ten distinct, meaningful search terms.

    Tokens shorter than three characters and tokens listed in
    ``_QUERY_STOPWORDS`` are dropped; the first occurrence of each remaining
    token is kept, in order.
    """
    kept: list[str] = []
    for word in tokenize(query):
        meaningful = len(word) >= 3 and word not in _QUERY_STOPWORDS
        if meaningful and word not in kept:
            kept.append(word)
    return kept[:10]
|
|
1954
|
+
|
|
1955
|
+
|
|
1956
|
+
def _entity_match_score(query_lower: str, terms: list[str], name: str) -> float:
    """Score how well an entity name matches a query, from 0.0 to 1.0.

    Args:
        query_lower: The lower-cased query text.
        terms: Search terms extracted from the query.
        name: The entity name to score.

    Returns:
        1.0 when the whole name occurs in the query; between 0.45 and 0.95
        when query terms overlap the name's tokens (scaled by the share of
        name tokens covered); 0.6 when a term is merely a substring of the
        name; otherwise 0.0.
    """
    normalized = (name or "").strip().lower()
    if not normalized:
        return 0.0

    name_tokens = set(tokenize(normalized))
    if normalized in query_lower:
        return 1.0
    if not terms:
        return 0.0

    shared = set(terms) & name_tokens
    if shared:
        coverage = len(shared) / max(len(name_tokens), 1)
        return min(0.95, 0.45 + coverage * 0.5)

    for term in terms:
        if term in normalized:
            return 0.6
    return 0.0
|
|
1972
|
+
|
|
1973
|
+
|
|
1974
|
+
def _entity_matches_for_query(conn, query: str, *, limit: int) -> tuple[list[dict], dict[str, float]]:
    """Find stored entities whose names match *query*.

    Returns ``(matches, boosts)``:

    * ``matches`` - dicts with ``name``, ``entity_type``, ``asset_id`` and
      ``score``, sorted by score (highest first) and cut to *limit*.
    * ``boosts`` - maps each matching asset id to its best entity score; it
      is not truncated to *limit*.

    Both are empty when the query is blank or yields no usable terms.
    """
    query_lower = (query or "").strip().lower()
    terms = _query_terms(query)
    if not query_lower or not terms:
        return [], {}

    # One LIKE clause per term; the term values are bound as parameters.
    clauses = " OR ".join("lower(e.name) LIKE ?" for _ in terms)
    params = [f"%{term}%" for term in terms]
    # NOTE(review): there is no ORDER BY, so which rows survive the LIMIT is
    # decided by the database - confirm that is acceptable.
    rows = conn.execute(
        f"""
        SELECT DISTINCT e.name, e.entity_type, e.asset_id, a.path, a.privacy_class
        FROM local_entities e
        JOIN local_assets a ON a.asset_id = e.asset_id
        WHERE a.status='active'
          AND a.privacy_class='normal'
          AND ({clauses})
        LIMIT ?
        """,
        # Over-fetch (20x limit, at least 40): rows are filtered and scored below.
        [*params, max(int(limit) * 20, 40)],
    ).fetchall()

    matches = []
    boosts: dict[str, float] = {}
    seen = set()
    for row in rows:
        # Re-check the path against the privacy rules before using the row.
        if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
            continue
        score = _entity_match_score(query_lower, terms, str(row["name"] or ""))
        if score <= 0:
            continue
        key = (row["name"], row["entity_type"], row["asset_id"])
        if key not in seen:
            matches.append({
                "name": row["name"],
                "entity_type": row["entity_type"],
                "asset_id": row["asset_id"],
                "score": round(float(score), 4),
            })
            seen.add(key)
        # Keep the strongest entity score seen for each asset.
        boosts[row["asset_id"]] = max(boosts.get(row["asset_id"], 0.0), float(score))

    matches.sort(key=lambda item: item.get("score", 0), reverse=True)
    return matches[: int(limit)], boosts
|
|
2017
|
+
|
|
2018
|
+
|
|
2019
|
+
def _context_candidate_rows(conn, entity_asset_ids: list[str], *, base_limit: int = 5000) -> list:
    """Collect the chunk rows that context ranking should score.

    The base set is the ``base_limit`` most recently created chunks of
    active, ``normal``-privacy assets. When *entity_asset_ids* is non-empty,
    chunks of those assets are fetched as well and placed first; a chunk
    present in both sets is returned once.
    """
    base_rows = conn.execute(
        """
        SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
        FROM local_chunks c
        JOIN local_assets a ON a.asset_id = c.asset_id
        LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
        LEFT JOIN local_embeddings e ON e.chunk_id = c.chunk_id
        WHERE a.status='active'
          AND a.privacy_class='normal'
        ORDER BY c.created_at DESC
        LIMIT ?
        """,
        (int(base_limit),),
    ).fetchall()
    if not entity_asset_ids:
        return base_rows

    # One bound placeholder per asset id for the IN (...) list.
    placeholders = ",".join("?" for _ in entity_asset_ids)
    entity_rows = conn.execute(
        f"""
        SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
        FROM local_chunks c
        JOIN local_assets a ON a.asset_id = c.asset_id
        LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
        LEFT JOIN local_embeddings e ON e.chunk_id = c.chunk_id
        WHERE a.status='active'
          AND a.privacy_class='normal'
          AND c.asset_id IN ({placeholders})
        ORDER BY c.chunk_index ASC
        LIMIT ?
        """,
        # Row cap scales with the number of assets: 80 per asset, at least 1000.
        [*entity_asset_ids, max(1000, len(entity_asset_ids) * 80)],
    ).fetchall()

    rows = []
    seen_chunks = set()
    # Entity rows come first so they win when a chunk appears in both lists.
    for row in [*entity_rows, *base_rows]:
        chunk_id = row["chunk_id"]
        if chunk_id in seen_chunks:
            continue
        seen_chunks.add(chunk_id)
        rows.append(row)
    return rows
|
|
2063
|
+
|
|
2064
|
+
|
|
2065
|
+
def context_query(query: str, *, intent: str = "answer", limit: int = 12, evidence_required: bool = True, current_context: str = "") -> dict:
|
|
2066
|
+
conn = _conn()
|
|
2067
|
+
qvec = embeddings.embed_text(query)
|
|
2068
|
+
entities_payload, entity_boosts = _entity_matches_for_query(conn, query, limit=max(int(limit), 1))
|
|
2069
|
+
rows = _context_candidate_rows(conn, list(entity_boosts.keys()), base_limit=5000)
|
|
1692
2070
|
scored = []
|
|
1693
2071
|
for row in rows:
|
|
2072
|
+
if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
|
|
2073
|
+
continue
|
|
1694
2074
|
vector = json_loads(row["vector_json"], [])
|
|
1695
|
-
|
|
2075
|
+
text_score = _search_text_score(query, row["text"])
|
|
2076
|
+
path_score = _search_text_score(query, row["path"] or "")
|
|
2077
|
+
summary_score = _search_text_score(query, row["summary"] or "")
|
|
2078
|
+
entity_score = entity_boosts.get(row["asset_id"], 0.0)
|
|
2079
|
+
vector_score = embeddings.cosine(qvec, vector)
|
|
2080
|
+
score = max(text_score, path_score, summary_score, vector_score)
|
|
2081
|
+
if entity_score > 0:
|
|
2082
|
+
direct_score = max(text_score, path_score, summary_score)
|
|
2083
|
+
if direct_score > 0:
|
|
2084
|
+
entity_rank = 0.82 + (0.42 * text_score) + (0.18 * path_score) + (0.12 * summary_score)
|
|
2085
|
+
score = max(score, entity_rank + min(0.2, entity_score * 0.2))
|
|
2086
|
+
else:
|
|
2087
|
+
# Entity-level matches keep older assets eligible, but do not let
|
|
2088
|
+
# unrelated chunks from a long document outrank direct evidence.
|
|
2089
|
+
score = max(score, min(0.48, 0.28 + entity_score * 0.2))
|
|
1696
2090
|
if score > 0:
|
|
1697
|
-
scored.append((score, row))
|
|
2091
|
+
scored.append((min(float(score), 1.6), row))
|
|
1698
2092
|
scored.sort(key=lambda item: item[0], reverse=True)
|
|
1699
2093
|
assets = []
|
|
1700
2094
|
chunks = []
|
|
@@ -1704,7 +2098,6 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
|
|
|
1704
2098
|
if row["asset_id"] not in seen_assets:
|
|
1705
2099
|
assets.append({
|
|
1706
2100
|
"asset_id": row["asset_id"],
|
|
1707
|
-
"path": row["path"],
|
|
1708
2101
|
"display_path": redact_path(row["path"]),
|
|
1709
2102
|
"file_type": row["file_type"],
|
|
1710
2103
|
"score": round(float(score), 4),
|
|
@@ -1718,14 +2111,10 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
|
|
|
1718
2111
|
"score": round(float(score), 4),
|
|
1719
2112
|
})
|
|
1720
2113
|
evidence_refs.append(f"local_asset:{row['asset_id']}#chunk:{row['chunk_id']}")
|
|
1721
|
-
entity_rows = conn.execute(
|
|
1722
|
-
"SELECT DISTINCT name, entity_type, asset_id FROM local_entities WHERE lower(name) LIKE ? LIMIT ?",
|
|
1723
|
-
(f"%{query.lower()}%", int(limit)),
|
|
1724
|
-
).fetchall()
|
|
1725
|
-
entities_payload = [dict(row) for row in entity_rows]
|
|
1726
2114
|
relations_payload: list[dict] = []
|
|
1727
|
-
|
|
1728
|
-
|
|
2115
|
+
relation_asset_ids = list(dict.fromkeys([*seen_assets, *entity_boosts.keys()]))[: int(limit)]
|
|
2116
|
+
if relation_asset_ids:
|
|
2117
|
+
asset_ids = relation_asset_ids
|
|
1729
2118
|
placeholders = ",".join("?" for _ in asset_ids)
|
|
1730
2119
|
relation_rows = conn.execute(
|
|
1731
2120
|
f"""
|
|
@@ -1798,13 +2187,7 @@ def get_neighbors(asset_id: str, *, limit: int = 30) -> dict:
|
|
|
1798
2187
|
|
|
1799
2188
|
def purge_asset(asset_id: str) -> dict:
    """Purge a single asset and commit the change.

    Delegates the deletion to ``_purge_asset_ids``, commits, logs an
    ``asset_purged`` event and returns ``{"ok": True, "asset_id": asset_id}``.
    """
    conn = _conn()
    _purge_asset_ids(conn, [asset_id])
    conn.commit()
    log_event("info", "asset_purged", "Asset purged", asset_id=asset_id)
    return {"ok": True, "asset_id": asset_id}
|
|
@@ -4,12 +4,15 @@ import csv
|
|
|
4
4
|
import html
|
|
5
5
|
import json
|
|
6
6
|
import re
|
|
7
|
+
import sqlite3
|
|
7
8
|
import zipfile
|
|
8
9
|
from email import policy
|
|
9
10
|
from email.parser import BytesParser
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
from xml.etree import ElementTree
|
|
12
13
|
|
|
14
|
+
from .privacy import is_local_email_db
|
|
15
|
+
|
|
13
16
|
MAX_TEXT_BYTES = 512 * 1024
|
|
14
17
|
MAX_CHARS = 120_000
|
|
15
18
|
|
|
@@ -32,6 +35,26 @@ TEXT_SUFFIXES = {
|
|
|
32
35
|
".css",
|
|
33
36
|
}
|
|
34
37
|
|
|
38
|
+
# Compiled detectors for credential-looking text. contains_secret() reports a
# hit as soon as any one of them matches.
SECRET_PATTERNS: tuple[re.Pattern, ...] = (
    # "Bearer <token>" authorization values.
    re.compile(r"\bBearer\s+[A-Za-z0-9._\-~+/]{12,}\b", re.I),
    # "sk-" / "pk-" prefixed keys, optionally with a lowercase infix segment.
    re.compile(r"\bsk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
    re.compile(r"\bpk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
    # Vendor prefixes followed by an underscore (ghp_, github_pat_, shpat_, ...).
    re.compile(r"\b(ghp|gho|ghu|ghs|ghr|github_pat|glpat|xoxb|xoxp|shpat)_[A-Za-z0-9_]{16,}\b", re.I),
    # Vendor prefixes followed by a hyphen (glpat-..., xoxb-..., xoxp-...).
    # The underscore-only pattern above cannot match these.
    re.compile(r"\b(?:glpat|xox[abeprs])-[A-Za-z0-9_\-]{16,}", re.I),
    # AKIA / ASIA prefixed access key identifiers.
    re.compile(r"\b(AKIA|ASIA)[A-Z0-9]{16,}\b"),
    # Three dot-separated base64url segments, the first two starting with "ey"
    # (the shape of a JSON Web Token).
    re.compile(r"\bey[A-Za-z0-9_-]{10,}\.ey[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
    # PEM private key headers.
    re.compile(r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?PRIVATE KEY-----", re.I),
    # NAME_TOKEN= / NAME_SECRET= / NAME_KEY= style assignments with a long value.
    re.compile(r"\b([A-Z][A-Z0-9_]*(?:TOKEN|SECRET|KEY|PASSWORD|PASS)\s*[:=]\s*)['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
    re.compile(r"\b(?:api[_-]?key|secret[_-]?key|auth[_-]?token)\s*[:=]\s*['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
    # Quoted password assignments.
    re.compile(r"\b(?:password|passwd|pwd)\s*[:=]\s*['\"][^'\"]{6,}['\"]", re.I),
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def contains_secret(text: str) -> bool:
    """Report whether the first ``MAX_CHARS`` characters of *text* match any
    entry of ``SECRET_PATTERNS``. Empty input is never a match.
    """
    if text:
        head = text[:MAX_CHARS]
        for detector in SECRET_PATTERNS:
            if detector.search(head):
                return True
    return False
|
|
57
|
+
|
|
35
58
|
|
|
36
59
|
def _read_text(path: Path) -> str:
|
|
37
60
|
data = path.read_bytes()[:MAX_TEXT_BYTES]
|
|
@@ -53,8 +76,8 @@ def _extract_csv(path: Path) -> str:
|
|
|
53
76
|
return "\n".join(rows)[:MAX_CHARS]
|
|
54
77
|
|
|
55
78
|
|
|
56
|
-
def
|
|
57
|
-
msg = BytesParser(policy=policy.default).parsebytes(
|
|
79
|
+
def _extract_email_bytes(data: bytes) -> tuple[str, dict]:
|
|
80
|
+
msg = BytesParser(policy=policy.default).parsebytes(data[:MAX_TEXT_BYTES])
|
|
58
81
|
meta = {
|
|
59
82
|
"subject": str(msg.get("subject") or ""),
|
|
60
83
|
"from": str(msg.get("from") or ""),
|
|
@@ -72,6 +95,99 @@ def _extract_eml(path: Path) -> tuple[str, dict]:
|
|
|
72
95
|
return "\n".join([meta["subject"], meta["from"], meta["to"], text])[:MAX_CHARS], meta
|
|
73
96
|
|
|
74
97
|
|
|
98
|
+
def _extract_eml(path: Path) -> tuple[str, dict]:
    """Read at most ``MAX_TEXT_BYTES`` of the file at *path* and parse it as an
    email message via ``_extract_email_bytes``.
    """
    raw = path.read_bytes()
    return _extract_email_bytes(raw[:MAX_TEXT_BYTES])
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _extract_emlx(path: Path) -> tuple[str, dict]:
    """Extract text and metadata from an ``.emlx`` file.

    When the first line is a decimal number it is treated as the byte length
    of the message that follows. Anything from a ``"\\n<?xml"`` marker onward
    is dropped before parsing. The returned metadata carries
    ``apple_mail_message = True``.
    """
    raw = path.read_bytes()[:MAX_TEXT_BYTES]
    header, newline, remainder = raw.partition(b"\n")
    count = header.strip()
    if newline and count.isdigit():
        length = int(count)
        message = remainder[:length] if length > 0 else remainder
    else:
        # No usable length line: treat the whole buffer as the message.
        message = raw
    # partition() returns the full input when the marker is absent.
    message = message.partition(b"\n<?xml")[0]
    text, meta = _extract_email_bytes(message)
    meta["apple_mail_message"] = True
    return text, meta
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _printable_binary_text(path: Path) -> str:
    """Salvage readable text runs from a binary file.

    Reads at most ``MAX_TEXT_BYTES``. The data is decoded as UTF-16 when a NUL
    byte appears in the first 2000 bytes, otherwise as Latin-1. Runs of four
    or more word/punctuation characters are kept, stripped, joined with
    newlines and capped at ``MAX_CHARS``.
    """
    data = path.read_bytes()[:MAX_TEXT_BYTES]
    if b"\x00" in data[:2000]:
        decoded = data.decode("utf-16", errors="ignore")
    else:
        decoded = data.decode("latin-1", errors="ignore")
    # The brackets are escaped once (\[ and \]). The previous double escape
    # ("\\[\\]" in a raw string) closed the character class early, so the
    # pattern demanded a literal "{}" plus "]]]]" and matched almost nothing.
    pieces = re.findall(r"[\wÀ-ÿ@./:=+\- ,;()\[\]{}]{4,}", decoded)
    return "\n".join(piece.strip() for piece in pieces if piece.strip())[:MAX_CHARS]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _extract_msg(path: Path) -> tuple[str, dict]:
    """Extract text and metadata from a ``.msg`` file.

    Uses the optional ``extract_msg`` package. If it is missing, or anything
    fails while reading the message, falls back to
    ``_printable_binary_text`` with ``{"extractor": "msg_fallback"}``.
    """
    try:
        import extract_msg  # type: ignore
        message = extract_msg.Message(str(path))
        try:
            meta = {
                "subject": str(getattr(message, "subject", "") or ""),
                "from": str(getattr(message, "sender", "") or ""),
                "to": str(getattr(message, "to", "") or ""),
                "date": str(getattr(message, "date", "") or ""),
                "extractor": "msg",
            }
            body = str(getattr(message, "body", "") or "")
        finally:
            # Close on the error path too, so a failing attribute read does
            # not leak the underlying handle.
            close = getattr(message, "close", None)
            if callable(close):
                close()
        return "\n".join([meta["subject"], meta["from"], meta["to"], body])[:MAX_CHARS], meta
    except Exception:
        # Deliberate best effort: any failure degrades to the raw-text fallback.
        return _printable_binary_text(path), {"extractor": "msg_fallback"}
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _table_names(conn: sqlite3.Connection) -> set[str]:
|
|
145
|
+
rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
|
|
146
|
+
return {str(row[0]) for row in rows}
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _select_existing_columns(conn: sqlite3.Connection, table: str, columns: list[str]) -> list[str]:
|
|
150
|
+
found = {str(row[1]) for row in conn.execute(f"PRAGMA table_info({table})").fetchall()}
|
|
151
|
+
return [column for column in columns if column in found]
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _extract_nexo_email_db(path: Path) -> tuple[str, dict]:
    """Extract recent rows from an allowed local email SQLite database.

    Returns ``(text, metadata)``:

    * Paths rejected by ``is_local_email_db`` give
      ``("", {"extractor": "sqlite_blocked"})``.
    * A database that cannot be opened or read gives ``""`` and a metadata
      dict whose ``state`` is ``"locked_or_unavailable"``.
    * Otherwise up to 1000 newest rows from each of ``emails`` and
      ``sent_email_events`` are rendered one per line (values joined with
      ``" | "``, each value cut to 4000 characters), capped at ``MAX_CHARS``.
    """
    if not is_local_email_db(str(path)):
        return "", {"extractor": "sqlite_blocked"}

    # as_uri() percent-encodes characters such as "?", "#" and "%", which
    # would otherwise corrupt the URI; absolute() is needed because as_uri()
    # rejects relative paths.
    uri = f"{path.absolute().as_uri()}?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True, timeout=1)
    except Exception:
        return "", {"extractor": "nexo_email_db", "state": "locked_or_unavailable"}

    # (table, wanted columns, preferred ordering column)
    sources = (
        (
            "emails",
            ["from_addr", "from_name", "subject", "received_at", "status", "body", "response"],
            "received_at",
        ),
        (
            "sent_email_events",
            ["sender", "to_addrs", "cc_addrs", "subject", "sent_at", "status", "body_text"],
            "sent_at",
        ),
    )
    parts: list[str] = []
    try:
        tables = _table_names(conn)
        for table, wanted, order_column in sources:
            if table not in tables:
                continue
            cols = _select_existing_columns(conn, table, wanted)
            if not cols:
                # Nothing readable in this table; still try the next one.
                continue
            order = order_column if order_column in cols else "rowid"
            statement = f"SELECT {', '.join(cols)} FROM {table} ORDER BY {order} DESC LIMIT 1000"
            parts.extend(
                " | ".join(str(value or "")[:4000] for value in row)
                for row in conn.execute(statement).fetchall()
            )
    except sqlite3.Error:
        # A locked, corrupt or non-SQLite file may only fail on the first
        # query, so report it the same way as a failed open.
        return "", {"extractor": "nexo_email_db", "state": "locked_or_unavailable"}
    finally:
        conn.close()
    return "\n".join(parts)[:MAX_CHARS], {"extractor": "nexo_email_db", "tables": sorted(tables)}
|
|
189
|
+
|
|
190
|
+
|
|
75
191
|
def _zip_xml_text(path: Path, members: list[str]) -> str:
|
|
76
192
|
pieces: list[str] = []
|
|
77
193
|
with zipfile.ZipFile(path) as zf:
|
|
@@ -156,6 +272,14 @@ def extract_text(path: Path) -> tuple[str, dict]:
|
|
|
156
272
|
elif suffix == ".eml":
|
|
157
273
|
text, metadata = _extract_eml(path)
|
|
158
274
|
metadata["extractor"] = "eml"
|
|
275
|
+
elif suffix == ".emlx":
|
|
276
|
+
text, metadata = _extract_emlx(path)
|
|
277
|
+
metadata["extractor"] = "emlx"
|
|
278
|
+
elif suffix == ".msg":
|
|
279
|
+
text, metadata = _extract_msg(path)
|
|
280
|
+
metadata["extractor"] = metadata.get("extractor") or "msg"
|
|
281
|
+
elif suffix == ".db" and is_local_email_db(str(path)):
|
|
282
|
+
text, metadata = _extract_nexo_email_db(path)
|
|
159
283
|
elif suffix == ".pdf":
|
|
160
284
|
text = _extract_pdf(path)
|
|
161
285
|
elif suffix == ".docx":
|
|
@@ -6,21 +6,58 @@ SENSITIVE_FILE_NAMES = {
|
|
|
6
6
|
".env",
|
|
7
7
|
".env.local",
|
|
8
8
|
".env.production",
|
|
9
|
+
".npmrc",
|
|
10
|
+
".pypirc",
|
|
11
|
+
".netrc",
|
|
12
|
+
".boto",
|
|
13
|
+
".pgpass",
|
|
14
|
+
".my.cnf",
|
|
15
|
+
".git-credentials",
|
|
16
|
+
".mcp_publisher_token",
|
|
17
|
+
".mcpregistry_github_token",
|
|
18
|
+
".mcpregistry_registry_token",
|
|
9
19
|
"id_rsa",
|
|
10
20
|
"id_dsa",
|
|
11
21
|
"id_ecdsa",
|
|
12
22
|
"id_ed25519",
|
|
23
|
+
"known_hosts",
|
|
24
|
+
"authorized_keys",
|
|
13
25
|
"cookies.sqlite",
|
|
14
26
|
"login data",
|
|
15
27
|
"keychain-2.db",
|
|
16
28
|
}
|
|
17
29
|
|
|
30
|
+
# Substrings looked for in the lowercased file name and stem by
# is_sensitive_path(). Matching is by containment, so e.g. "secret" also
# covers every name that "client_secret" would match.
SENSITIVE_NAME_MARKERS = {
    "api_key",
    "apikey",
    "auth_token",
    "bearer",
    "client_secret",
    "credential",
    "credentials",
    "oauth",
    "password",
    "passwd",
    "private_key",
    "secret",
    "token",
}

# Lowercased file suffixes that is_sensitive_path() treats as sensitive.
SENSITIVE_SUFFIXES = {
    ".key",
    ".pem",
    ".p12",
    ".pfx",
    ".kdbx",
}
|
|
53
|
+
|
|
18
54
|
SENSITIVE_PARTS = {
|
|
19
55
|
".ssh",
|
|
20
56
|
".gnupg",
|
|
21
57
|
".aws",
|
|
22
58
|
".azure",
|
|
23
59
|
".kube",
|
|
60
|
+
".docker",
|
|
24
61
|
"password",
|
|
25
62
|
"passwords",
|
|
26
63
|
"1password",
|
|
@@ -30,6 +67,29 @@ SENSITIVE_PARTS = {
|
|
|
30
67
|
"browser profile",
|
|
31
68
|
}
|
|
32
69
|
|
|
70
|
+
# Lowercase database file names that is_local_email_db() accepts when the
# file sits inside a local email tree.
EMAIL_RUNTIME_DB_NAMES = {
    "email.db",
    "email-tracker.db",
    "emails.db",
    "monitor.db",
    "nexo-email.db",
}

# Suffixes is_allowed_local_email_file() admits under the
# ".nexo/runtime/nexo-email/attachments" directory.
EMAIL_ATTACHMENT_SUFFIXES = {
    ".csv",
    ".docx",
    ".eml",
    ".emlx",
    ".html",
    ".md",
    ".pdf",
    ".pptx",
    ".txt",
    ".xlsx",
}

# NOTE(review): presumably the message formats with a dedicated extractor;
# no use of this set is visible in this part of the file - confirm.
EMAIL_EXTRACTABLE_SUFFIXES = {".eml", ".emlx", ".msg"}
|
|
92
|
+
|
|
33
93
|
NOISY_PARTS = {
|
|
34
94
|
"node_modules",
|
|
35
95
|
"vendor",
|
|
@@ -53,9 +113,64 @@ NOISY_PARTS = {
|
|
|
53
113
|
".parcel-cache",
|
|
54
114
|
".bun",
|
|
55
115
|
".gradle",
|
|
116
|
+
"$tmp",
|
|
56
117
|
"target",
|
|
57
118
|
}
|
|
58
119
|
|
|
120
|
+
# Directory names that _has_transient_project_part() treats as scratch space
# when they appear at split index 2 or deeper in a path.
TRANSIENT_PARTS = {"tmp", "temp"}

# Markers of per-user tool/profile storage. is_private_profile_path() checks
# them both as whole path components and as substrings of the normalized
# path; the entries containing "/" can only match through the substring check.
PRIVATE_PROFILE_PARTS = {
    ".nexo",
    ".claude",
    ".codex",
    ".gemini",
    ".cursor",
    ".config",
    ".local",
    ".npm",
    ".yarn",
    ".pnpm-store",
    ".ollama",
    ".docker",
    ".vscode",
    ".idea",
    "appdata",
    "application data",
    "library/application support",
    "library/containers",
    "library/group containers",
    "library/keychains",
    "library/logs",
    "library/mail",
    "library/messages",
    "library/safari",
    "library/saved application state",
}

# Lowercase dotfile names that is_private_profile_path() blocks by exact name.
PROFILE_HIDDEN_FILE_NAMES = {
    ".aider.chat.history.md",
    ".aider.input.history",
    ".bash_history",
    ".bash_profile",
    ".bashrc",
    ".claude.json",
    ".codex.json",
    ".cursorignore",
    ".ds_store",
    ".gitconfig",
    ".gitignore_global",
    ".lesshst",
    ".python_history",
    ".sqlite_history",
    ".viminfo",
    ".wget-hsts",
    ".zprofile",
    ".zsh_history",
    ".zshrc",
}

# Dot-prefixed names exempt from the hidden-file rule in is_sensitive_path().
# Empty, so every name starting with "." is currently treated as sensitive.
ALLOWED_HIDDEN_FILE_NAMES = set()
|
|
173
|
+
|
|
59
174
|
SYSTEM_PARTS = {
|
|
60
175
|
"system volume information",
|
|
61
176
|
"$recycle.bin",
|
|
@@ -69,35 +184,206 @@ SYSTEM_PARTS = {
|
|
|
69
184
|
}
|
|
70
185
|
|
|
71
186
|
|
|
72
|
-
def
|
|
73
|
-
|
|
187
|
+
def _normalized(path: str) -> str:
|
|
188
|
+
return str(Path(path)).replace("\\", "/").lower()
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _parts(path: str) -> set[str]:
    """Return the non-empty components of the normalized *path*; both ``/``
    and ``:`` act as separators.
    """
    flattened = _normalized(path).replace(":", "/")
    return set(filter(None, flattened.split("/")))
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _contains_path_marker(lowered: str, markers: set[str]) -> bool:
|
|
196
|
+
return any(marker in lowered for marker in markers)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _is_under_marker(lowered: str, marker: str) -> bool:
|
|
200
|
+
marker = marker.strip("/").lower()
|
|
201
|
+
if not marker:
|
|
202
|
+
return False
|
|
203
|
+
return lowered.endswith("/" + marker) or f"/{marker}/" in lowered
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _is_inside_windows_mail_package(lowered: str) -> bool:
|
|
207
|
+
return "/appdata/local/packages/microsoft.windowscommunicationsapps" in lowered
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _is_inside_outlook_mac_profile(lowered: str) -> bool:
|
|
211
|
+
return "/library/group containers/ubf8t346g9.office/outlook" in lowered
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def is_local_email_tree(path: str) -> bool:
    """Return True when *path* lies in one of the recognised local mail
    locations: the Windows Mail package, the Outlook-for-Mac profile, or any
    of the marker directories listed below.
    """
    lowered = _normalized(path)
    if _is_inside_windows_mail_package(lowered):
        return True
    if _is_inside_outlook_mac_profile(lowered):
        return True
    roots = (
        "library/mail",
        ".nexo/runtime/nexo-email",
        "documents/outlook files",
        "appdata/local/microsoft/outlook",
        "appdata/roaming/microsoft/outlook",
        "appdata/local/packages/microsoft.windowscommunicationsapps",
        ".thunderbird",
        ".mozilla-thunderbird",
    )
    for root in roots:
        if _is_under_marker(lowered, root):
            return True
    return False
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def is_local_email_db(path: str) -> bool:
    """Return True when *path* is inside a local email tree and its lowercased
    file name is listed in ``EMAIL_RUNTIME_DB_NAMES``.
    """
    candidate = Path(path)
    if not is_local_email_tree(path):
        return False
    return candidate.name.lower() in EMAIL_RUNTIME_DB_NAMES
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def is_allowed_local_email_file(path: str) -> bool:
    """Decide whether a file inside a local email tree may be indexed.

    Files outside every email tree, and files flagged by
    ``is_sensitive_path``, are rejected. Otherwise the allowed suffixes
    depend on which mail location the file belongs to.
    """
    if not is_local_email_tree(path):
        return False
    candidate = Path(path)
    lowered = _normalized(path)
    suffix = candidate.suffix.lower()
    if is_sensitive_path(path):
        return False

    message_suffixes = {".eml", ".emlx"}

    # NEXO's own email runtime: known databases, attachments, raw messages.
    if _is_under_marker(lowered, ".nexo/runtime/nexo-email"):
        if is_local_email_db(path):
            return True
        if _is_under_marker(lowered, ".nexo/runtime/nexo-email/attachments"):
            return suffix in EMAIL_ATTACHMENT_SUFFIXES
        return suffix in message_suffixes

    if _is_under_marker(lowered, "library/mail"):
        return suffix in message_suffixes

    outlook_roots = (
        "library/group containers/ubf8t346g9.office/outlook",
        "documents/outlook files",
        "appdata/local/microsoft/outlook",
        "appdata/roaming/microsoft/outlook",
        "appdata/local/packages/microsoft.windowscommunicationsapps",
    )
    in_outlook = (
        any(_is_under_marker(lowered, root) for root in outlook_roots)
        or _is_inside_windows_mail_package(lowered)
        or _is_inside_outlook_mac_profile(lowered)
    )
    if in_outlook:
        return suffix in {".eml", ".msg", ".pst", ".ost"}

    in_thunderbird = (
        _is_under_marker(lowered, ".thunderbird")
        or _is_under_marker(lowered, ".mozilla-thunderbird")
    )
    if in_thunderbird:
        # The empty suffix admits extensionless files in these trees.
        return suffix in {".eml", ".mbox", ""}
    return False
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _has_transient_project_part(path: str) -> bool:
    """Return True when a ``TRANSIENT_PARTS`` name appears at split index 2 or
    later in the normalized *path* (``/`` and ``:`` are separators; empty
    segments count toward the index).
    """
    segments = _normalized(path).replace(":", "/").split("/")
    return any(segment in TRANSIENT_PARTS for segment in segments[2:])
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _has_hidden_dir_part(path: str) -> bool:
    """Return True when any component other than the last starts with a dot
    (``.`` and ``..`` themselves are ignored).
    """
    flattened = _normalized(path).replace(":", "/")
    segments = [segment for segment in flattened.split("/") if segment]
    for segment in segments[:-1]:
        if segment.startswith(".") and segment not in {".", ".."}:
            return True
    return False
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _is_home_hidden_path(path: str) -> bool:
|
|
284
|
+
try:
|
|
285
|
+
p = Path(path).expanduser()
|
|
286
|
+
home = Path.home().expanduser()
|
|
287
|
+
rel = p.relative_to(home)
|
|
288
|
+
except Exception:
|
|
289
|
+
return False
|
|
290
|
+
return bool(rel.parts) and rel.parts[0].startswith(".")
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def is_sensitive_path(path: str) -> bool:
    """Return True when *path* looks like it holds credentials or scratch data.

    A path is sensitive when any of these holds:

    * its name is in ``SENSITIVE_FILE_NAMES``;
    * its name starts with a dot and is not in ``ALLOWED_HIDDEN_FILE_NAMES``;
    * its name starts with ``~$`` or ends in ``.tmp``/``.swp``/``.swo``;
    * its suffix is in ``SENSITIVE_SUFFIXES``;
    * a path component equals, or the normalized path contains, a
      ``SENSITIVE_PARTS`` entry;
    * its name or stem contains a ``SENSITIVE_NAME_MARKERS`` entry.
    """
    candidate = Path(path)
    lowered = _normalized(path)
    name = candidate.name.lower()
    stem = candidate.stem.lower()
    components = _parts(path)

    hidden = name.startswith(".") and name not in ALLOWED_HIDDEN_FILE_NAMES
    scratch = name.startswith("~$") or name.endswith((".tmp", ".swp", ".swo"))
    if name in SENSITIVE_FILE_NAMES or hidden or scratch:
        return True
    if candidate.suffix.lower() in SENSITIVE_SUFFIXES or components & SENSITIVE_PARTS:
        return True
    for marker in SENSITIVE_NAME_MARKERS:
        if marker in name or marker in stem:
            return True
    return _contains_path_marker(lowered, SENSITIVE_PARTS)
|
|
314
|
+
|
|
78
315
|
|
|
79
|
-
|
|
316
|
+
def is_private_profile_path(path: str) -> bool:
    """Return True when *path* belongs to private per-user profile storage.

    That is the case when a ``PRIVATE_PROFILE_PARTS`` entry matches a path
    component or occurs in the normalized path, when the file name is in
    ``PROFILE_HIDDEN_FILE_NAMES``, or when the path sits under a dot-prefixed
    entry of the home directory.
    """
    lowered = _normalized(path)
    components = _parts(path)
    return bool(
        components & PRIVATE_PROFILE_PARTS
        or _contains_path_marker(lowered, PRIVATE_PROFILE_PARTS)
        or Path(path).name.lower() in PROFILE_HIDDEN_FILE_NAMES
        or _is_home_hidden_path(path)
    )
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def classify_path(path: str) -> tuple[int, str, str]:
    """Return ``(depth, privacy_class, reason)`` for *path*.

    Rules are tried in order; the first match wins:

    1. local email tree, with no suffix or an allowed email file -> depth 2, ``normal``
    2. sensitive path          -> depth 1, ``sensitive_inventory_only``
    3. private profile path    -> depth 0, ``private_profile_blocked``
    4. system path             -> depth 0, ``system_blocked``
    5. noisy/transient/hidden  -> depth 1, ``inventory_only``
    6. anything else           -> depth 2, ``normal``
    """
    lowered = _normalized(path)
    components = _parts(path)

    if is_local_email_tree(path):
        if Path(path).suffix == "" or is_allowed_local_email_file(path):
            return 2, "normal", "local_email_path"
    if is_sensitive_path(path):
        return 1, "sensitive_inventory_only", "sensitive_path"
    if is_private_profile_path(path):
        return 0, "private_profile_blocked", "private_profile_path"
    for item in SYSTEM_PARTS:
        if item in lowered:
            return 0, "system_blocked", "system_path"
    noisy = (
        components & NOISY_PARTS
        or _has_transient_project_part(path)
        or _has_hidden_dir_part(path)
    )
    if noisy:
        return 1, "inventory_only", "noisy_tree"
    return 2, "normal", "default"
|
|
86
347
|
|
|
87
348
|
|
|
88
349
|
def should_skip_tree(path: str) -> bool:
    """Return True when the directory *path* should not be walked.

    Local email trees are never skipped. Otherwise system, sensitive,
    private-profile, noisy, transient and hidden directories are skipped.
    """
    lowered = _normalized(path)
    components = _parts(path)
    if is_local_email_tree(path):
        return False
    for item in SYSTEM_PARTS:
        if item in lowered:
            return True
    if is_sensitive_path(path):
        return True
    if is_private_profile_path(path):
        return True
    noisy = (
        components & NOISY_PARTS
        or _has_transient_project_part(path)
        or _has_hidden_dir_part(path)
    )
    return bool(noisy)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def should_skip_file(path: str) -> bool:
    """Return True when the file *path* should not be indexed.

    Inside a local email tree the answer depends solely on
    ``is_allowed_local_email_file``. Elsewhere system, sensitive,
    private-profile, noisy, transient and hidden locations are skipped.
    """
    lowered = _normalized(path)
    components = _parts(path)
    if is_local_email_tree(path):
        return not is_allowed_local_email_file(path)
    for item in SYSTEM_PARTS:
        if item in lowered:
            return True
    if is_sensitive_path(path):
        return True
    if is_private_profile_path(path):
        return True
    noisy = (
        components & NOISY_PARTS
        or _has_transient_project_part(path)
        or _has_hidden_dir_part(path)
    )
    return bool(noisy)
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def is_queryable_path(path: str, privacy_class: str = "") -> bool:
    """Return True when content stored for *path* may appear in query results.

    A non-empty *privacy_class* other than ``"normal"`` blocks the path
    outright; otherwise the path must pass ``should_skip_file``.
    """
    if not privacy_class or privacy_class == "normal":
        return not should_skip_file(path)
    return False
|
|
95
377
|
|
|
96
378
|
|
|
97
379
|
def should_extract(path: str, depth: int) -> bool:
|
|
98
380
|
if depth < 2:
|
|
99
381
|
return False
|
|
382
|
+
if should_skip_file(path):
|
|
383
|
+
return False
|
|
100
384
|
suffix = Path(path).suffix.lower()
|
|
385
|
+
if is_local_email_db(path):
|
|
386
|
+
return True
|
|
101
387
|
if suffix in {
|
|
102
388
|
".txt",
|
|
103
389
|
".md",
|
|
@@ -118,6 +404,8 @@ def should_extract(path: str, depth: int) -> bool:
|
|
|
118
404
|
".csv",
|
|
119
405
|
".tsv",
|
|
120
406
|
".eml",
|
|
407
|
+
".emlx",
|
|
408
|
+
".msg",
|
|
121
409
|
".pdf",
|
|
122
410
|
".docx",
|
|
123
411
|
".pptx",
|
package/src/tools_hot_context.py
CHANGED
|
@@ -43,6 +43,15 @@ def _format_local_context_evidence(query: str, *, limit: int = 4) -> str:
|
|
|
43
43
|
refs = result.get("evidence_refs") or []
|
|
44
44
|
if refs:
|
|
45
45
|
lines.append(f"Evidence refs: {', '.join(str(ref) for ref in refs[:limit])}")
|
|
46
|
+
relations = result.get("relations") or []
|
|
47
|
+
if relations:
|
|
48
|
+
lines.append("Local relations:")
|
|
49
|
+
for relation in relations[:limit]:
|
|
50
|
+
relation_type = str(relation.get("relation_type") or "related")
|
|
51
|
+
target = str(relation.get("target_ref") or relation.get("target_asset_id") or "").strip()
|
|
52
|
+
evidence = str(relation.get("evidence") or "").strip()
|
|
53
|
+
suffix = f" — {evidence[:120]}" if evidence else ""
|
|
54
|
+
lines.append(f"- {relation_type}: {target}{suffix}")
|
|
46
55
|
return "\n".join(lines)
|
|
47
56
|
|
|
48
57
|
|