nexo-brain 7.20.4 → 7.20.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +11 -1
- package/bin/windows-wsl-bridge.js +20 -0
- package/package.json +1 -1
- package/src/crons/sync.py +125 -7
- package/src/local_context/api.py +272 -27
- package/src/local_context/extractors.py +109 -2
- package/src/local_context/privacy.py +111 -0
- package/src/tools_hot_context.py +9 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.20.4",
|
|
3
|
+
"version": "7.20.9",
|
|
4
4
|
"description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "NEXO Brain",
|
package/README.md
CHANGED
|
@@ -18,7 +18,17 @@
|
|
|
18
18
|
|
|
19
19
|
[Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
|
|
20
20
|
|
|
21
|
-
Version `7.20.
|
|
21
|
+
Version `7.20.9` is the current packaged-runtime line. Patch release over v7.20.8 — Local Context scans automatic roots at full operational depth, falls back to crontab when Linux/WSL systemd user timers fail, passes Windows AppData email roots into WSL, and blocks Google API keys before HTML cleaning.
|
|
22
|
+
|
|
23
|
+
Previously in `7.20.8`: patch release over v7.20.7 — Local Context recognises Windows Mail package roots and Outlook Mac profile roots as bounded local-email sources instead of rejecting them as generic AppData / Group Containers.
|
|
24
|
+
|
|
25
|
+
Previously in `7.20.7`: patch release over v7.20.6 — Local Context email-root bootstrap is deterministic across CI, WSL and migrated profiles while preserving macOS Mail.app, Windows Outlook, Thunderbird and NEXO email coverage.
|
|
26
|
+
|
|
27
|
+
Previously in `7.20.6`: patch release over v7.20.5 — Local Context ranks entity matches at chunk level, keeps old entity-matched assets eligible, adds safe local email roots for macOS/Windows/Linux, extracts `.eml`, `.emlx`, `.msg` and NEXO email DB continuity, and exposes local graph relations in pre-action context.
|
|
28
|
+
|
|
29
|
+
Previously in `7.20.5`: patch release over v7.20.4 — Local Context status reports elapsed indexing time and a defensive ETA while background jobs remain pending.
|
|
30
|
+
|
|
31
|
+
Previously in `7.20.4`: patch release over v7.20.3 — Local Context now blocks private dotfiles, hidden project folders and secret-bearing content before chunks, embeddings, graph relations or agent context are created.
|
|
22
32
|
|
|
23
33
|
Previously in `7.20.3`: patch release over v7.20.2 — installer DMG volumes are no longer added as local-memory roots, removed roots purge stale payloads, and doctor can repair removed-root residue.
|
|
24
34
|
|
|
@@ -93,6 +93,20 @@ function resolveLinuxEnv(env = process.env) {
|
|
|
93
93
|
return linuxEnv;
|
|
94
94
|
}
|
|
95
95
|
|
|
96
|
+
/**
 * Build the LOCALAPPDATA / APPDATA overrides that are forwarded into WSL.
 *
 * Each variable is read from `env`, trimmed, and then:
 *   - translated with `toWslPath` when `isWindowsStylePath` accepts it,
 *   - passed through unchanged when it already starts with "/",
 *   - omitted when it is empty or has any other shape.
 *
 * @param {object} env Environment map to read from (defaults to process.env).
 * @returns {object} Map containing only the variables that could be resolved.
 */
function resolveWindowsHostPathEnv(env = process.env) {
  const hostKeys = ["LOCALAPPDATA", "APPDATA"];
  return hostKeys.reduce((translated, name) => {
    const raw = String(env[name] || "").trim();
    if (raw && isWindowsStylePath(raw)) {
      translated[name] = toWslPath(raw);
    } else if (raw && raw.startsWith("/")) {
      translated[name] = raw;
    }
    return translated;
  }, {});
}
|
|
109
|
+
|
|
96
110
|
function uniqueValues(values = []) {
|
|
97
111
|
const seen = new Set();
|
|
98
112
|
return values.filter((value) => {
|
|
@@ -242,6 +256,10 @@ function buildWslExecSpec({
|
|
|
242
256
|
for (const [key, value] of Object.entries(linuxEnv)) {
|
|
243
257
|
wslArgs.push(`${key}=${value}`);
|
|
244
258
|
}
|
|
259
|
+
const windowsHostPathEnv = resolveWindowsHostPathEnv(env);
|
|
260
|
+
for (const [key, value] of Object.entries(windowsHostPathEnv)) {
|
|
261
|
+
wslArgs.push(`${key}=${value}`);
|
|
262
|
+
}
|
|
245
263
|
|
|
246
264
|
// Build the staging shell script. Stages the bundle from /mnt/c (DrvFs/9P)
|
|
247
265
|
// to /tmp (native ext4) BEFORE invoking node. Without staging, node hangs
|
|
@@ -296,6 +314,7 @@ function buildWslExecSpec({
|
|
|
296
314
|
command: "wsl.exe",
|
|
297
315
|
args: wslArgs,
|
|
298
316
|
linuxEnv,
|
|
317
|
+
windowsHostPathEnv,
|
|
299
318
|
managedLinuxPath,
|
|
300
319
|
translatedScriptPath,
|
|
301
320
|
};
|
|
@@ -338,6 +357,7 @@ module.exports = {
|
|
|
338
357
|
probeWslUserHome,
|
|
339
358
|
resolveLinuxEnv,
|
|
340
359
|
resolveLinuxUserHome,
|
|
360
|
+
resolveWindowsHostPathEnv,
|
|
341
361
|
runViaWsl,
|
|
342
362
|
toWslPath,
|
|
343
363
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.20.4",
|
|
3
|
+
"version": "7.20.9",
|
|
4
4
|
"mcpName": "io.github.wazionapps/nexo",
|
|
5
5
|
"description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
|
|
6
6
|
"homepage": "https://nexo-brain.com",
|
package/src/crons/sync.py
CHANGED
|
@@ -20,6 +20,7 @@ import json
|
|
|
20
20
|
import os
|
|
21
21
|
import platform
|
|
22
22
|
import plistlib
|
|
23
|
+
import shlex
|
|
23
24
|
import shutil
|
|
24
25
|
import subprocess
|
|
25
26
|
import sys
|
|
@@ -133,6 +134,8 @@ SCHEDULE_FILE = paths.config_dir() / "schedule.json"
|
|
|
133
134
|
CORE_CRON_MANAGED_ENV = "NEXO_MANAGED_CORE_CRON"
|
|
134
135
|
PERSONAL_CRON_MANAGED_ENV = "NEXO_MANAGED_PERSONAL_CRON"
|
|
135
136
|
PERSONAL_CRON_ID_ENV = "NEXO_PERSONAL_CRON_ID"
|
|
137
|
+
CRONTAB_BEGIN = "# >>> NEXO managed core crons >>>"
|
|
138
|
+
CRONTAB_END = "# <<< NEXO managed core crons <<<"
|
|
136
139
|
RETIRED_CORE_FILES = (
|
|
137
140
|
Path("core") / "scripts" / "nexo-day-orchestrator.sh",
|
|
138
141
|
Path("scripts") / "nexo-day-orchestrator.sh",
|
|
@@ -457,6 +460,106 @@ def build_plist(cron: dict) -> dict:
|
|
|
457
460
|
return plist
|
|
458
461
|
|
|
459
462
|
|
|
463
|
+
def _shell_join(args: list[str | Path]) -> str:
|
|
464
|
+
return " ".join(shlex.quote(str(arg)) for arg in args)
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _cron_schedule(cron: dict) -> str | None:
|
|
468
|
+
if cron.get("keep_alive"):
|
|
469
|
+
return None
|
|
470
|
+
if "interval_seconds" in cron:
|
|
471
|
+
try:
|
|
472
|
+
seconds = int(cron["interval_seconds"])
|
|
473
|
+
except Exception:
|
|
474
|
+
return None
|
|
475
|
+
if seconds <= 0 or seconds % 60 != 0:
|
|
476
|
+
return None
|
|
477
|
+
minutes = max(1, seconds // 60)
|
|
478
|
+
return "* * * * *" if minutes == 1 else f"*/{minutes} * * * *"
|
|
479
|
+
if "schedule" in cron:
|
|
480
|
+
s = resolve_declared_schedule(cron)
|
|
481
|
+
hour, minute = int(s.get("hour", 0)), int(s.get("minute", 0))
|
|
482
|
+
weekday = "*"
|
|
483
|
+
if "weekday" in s:
|
|
484
|
+
raw_weekday = int(s["weekday"])
|
|
485
|
+
weekday = "0" if raw_weekday == 7 else str(raw_weekday)
|
|
486
|
+
return f"{minute} {hour} * * {weekday}"
|
|
487
|
+
return None
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def _linux_crontab_entry(cron: dict, exec_cmd: str, stdout_log: Path, stderr_log: Path) -> str | None:
    """Compose one crontab line for *cron*, or ``None`` when it has no schedule.

    The line consists of the schedule, inline ``HOME`` / ``NEXO_HOME`` /
    ``NEXO_CODE`` / ``PYTHONUNBUFFERED`` assignments, the already-quoted
    *exec_cmd*, and append-redirections to the two log files.
    """
    timing = _cron_schedule(cron)
    if not timing:
        return None
    environment = {
        "HOME": Path.home(),
        "NEXO_HOME": NEXO_HOME,
        "NEXO_CODE": _runtime_code_dir(),
        "PYTHONUNBUFFERED": "1",
    }
    assignments = [f"{name}={shlex.quote(str(setting))}" for name, setting in environment.items()]
    out_target = shlex.quote(str(stdout_log))
    err_target = shlex.quote(str(stderr_log))
    return f"{timing} {' '.join(assignments)} {exec_cmd} >> {out_target} 2>> {err_target}"
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _strip_managed_crontab_block(body: str) -> str:
    """Return *body* without the lines between the NEXO begin/end markers.

    The marker lines themselves are removed as well, and trailing whitespace
    of the result is stripped. Lines following a begin marker that has no
    matching end marker are dropped too.
    """
    survivors: list[str] = []
    inside_block = False
    for raw in body.splitlines():
        marker = raw.strip()
        if marker == CRONTAB_BEGIN:
            inside_block = True
        elif marker == CRONTAB_END:
            inside_block = False
        elif not inside_block:
            survivors.append(raw)
    return "\n".join(survivors).rstrip()
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def _install_linux_crontab_fallback(entries: list[str]) -> dict:
    """Install *entries* as the managed block of the current user's crontab.

    The existing crontab is read with ``crontab -l`` (a non-zero exit is
    treated as an empty crontab), any previous managed block is removed, and
    the new block is appended before the result is loaded via a temporary file.

    Returns:
        ``{"ok": True, "entries": n}`` on success, otherwise
        ``{"ok": False, "error": ...}``.
    """
    if not entries:
        return {"ok": False, "error": "no_crontab_entries"}
    if not shutil.which("crontab"):
        return {"ok": False, "error": "crontab_missing"}

    listing = subprocess.run(["crontab", "-l"], capture_output=True, text=True)
    preserved = _strip_managed_crontab_block(listing.stdout if listing.returncode == 0 else "")
    managed = "\n".join([CRONTAB_BEGIN, *entries, CRONTAB_END])
    if preserved:
        payload = f"{preserved}\n\n{managed}\n"
    else:
        payload = f"{managed}\n"

    staged = None
    try:
        with tempfile.NamedTemporaryFile("w", encoding="utf-8", delete=False) as handle:
            staged = handle.name
            handle.write(payload)
        install = subprocess.run(["crontab", staged], capture_output=True, text=True)
    finally:
        # Best-effort cleanup of the staging file; failures here are ignored.
        if staged:
            try:
                Path(staged).unlink(missing_ok=True)
            except Exception:
                pass
    if install.returncode != 0:
        return {"ok": False, "error": install.stderr or install.stdout or "crontab_install_failed"}
    return {"ok": True, "entries": len(entries)}
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def _enable_systemd_user_units(units: list[str]) -> dict:
|
|
552
|
+
errors: list[str] = []
|
|
553
|
+
daemon = subprocess.run(["systemctl", "--user", "daemon-reload"], capture_output=True, text=True)
|
|
554
|
+
if daemon.returncode != 0:
|
|
555
|
+
errors.append(daemon.stderr or daemon.stdout or "systemctl daemon-reload failed")
|
|
556
|
+
for unit in units:
|
|
557
|
+
proc = subprocess.run(["systemctl", "--user", "enable", "--now", unit], capture_output=True, text=True)
|
|
558
|
+
if proc.returncode != 0:
|
|
559
|
+
errors.append(f"{unit}: {proc.stderr or proc.stdout or 'enable failed'}")
|
|
560
|
+
return {"ok": not errors, "errors": errors}
|
|
561
|
+
|
|
562
|
+
|
|
460
563
|
def get_installed_nexo_crons() -> dict[str, Path]:
|
|
461
564
|
"""Return dict of cron_id → plist_path for installed NEXO crons."""
|
|
462
565
|
installed = {}
|
|
@@ -670,6 +773,9 @@ def sync_linux(dry_run: bool = False):
|
|
|
670
773
|
python_bin = p
|
|
671
774
|
break
|
|
672
775
|
|
|
776
|
+
enable_units: list[str] = []
|
|
777
|
+
crontab_entries: list[str] = []
|
|
778
|
+
|
|
673
779
|
for cron in manifest_crons:
|
|
674
780
|
cron_id = cron["id"]
|
|
675
781
|
script_src = _resolve_source_artifact(cron["script"])
|
|
@@ -683,9 +789,9 @@ def sync_linux(dry_run: bool = False):
|
|
|
683
789
|
_copy_into_runtime(subdir_src)
|
|
684
790
|
|
|
685
791
|
if script_type == "shell":
|
|
686
|
-
exec_cmd =
|
|
792
|
+
exec_cmd = _shell_join(["/bin/bash", wrapper_dest, cron_id, "/bin/bash", script_dest])
|
|
687
793
|
else:
|
|
688
|
-
exec_cmd =
|
|
794
|
+
exec_cmd = _shell_join(["/bin/bash", wrapper_dest, cron_id, python_bin, script_dest])
|
|
689
795
|
|
|
690
796
|
service_path = unit_dir / f"nexo-{cron_id}.service"
|
|
691
797
|
timer_path = unit_dir / f"nexo-{cron_id}.timer"
|
|
@@ -734,6 +840,7 @@ StandardError=append:{stderr_log}
|
|
|
734
840
|
|
|
735
841
|
service_path.write_text(service_content)
|
|
736
842
|
if cron.get("keep_alive"):
|
|
843
|
+
enable_units.append(f"nexo-{cron_id}.service")
|
|
737
844
|
log(f" Installed keep_alive service: {cron_id}")
|
|
738
845
|
continue
|
|
739
846
|
|
|
@@ -748,14 +855,25 @@ Persistent=true
|
|
|
748
855
|
WantedBy=timers.target
|
|
749
856
|
"""
|
|
750
857
|
timer_path.write_text(timer_content)
|
|
858
|
+
enable_units.append(f"nexo-{cron_id}.timer")
|
|
859
|
+
crontab_entry = _linux_crontab_entry(cron, exec_cmd, stdout_log, stderr_log)
|
|
860
|
+
if crontab_entry:
|
|
861
|
+
crontab_entries.append(crontab_entry)
|
|
751
862
|
log(f" Installed: {cron_id}")
|
|
752
863
|
|
|
753
864
|
if not dry_run:
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
865
|
+
systemd_result = _enable_systemd_user_units(enable_units)
|
|
866
|
+
if systemd_result.get("ok"):
|
|
867
|
+
log("systemd units enabled.")
|
|
868
|
+
else:
|
|
869
|
+
log(f"WARNING: systemd user timers failed; installing crontab fallback: {systemd_result.get('errors')}")
|
|
870
|
+
fallback = _install_linux_crontab_fallback(crontab_entries)
|
|
871
|
+
if not fallback.get("ok"):
|
|
872
|
+
raise RuntimeError(
|
|
873
|
+
"Linux cron activation failed: "
|
|
874
|
+
f"systemd={systemd_result.get('errors')} crontab={fallback.get('error')}"
|
|
875
|
+
)
|
|
876
|
+
log(f"crontab fallback installed ({fallback.get('entries')} entries).")
|
|
759
877
|
|
|
760
878
|
log("Sync complete.")
|
|
761
879
|
|
package/src/local_context/api.py
CHANGED
|
@@ -26,6 +26,9 @@ LOCAL_INDEX_LINUX_UNIT = "nexo-local-index.service"
|
|
|
26
26
|
DEFAULT_LIVE_ASSET_LIMIT = int(os.environ.get("NEXO_LOCAL_INDEX_LIVE_ASSET_LIMIT", "2000") or "2000")
|
|
27
27
|
DEFAULT_LIVE_DIR_LIMIT = int(os.environ.get("NEXO_LOCAL_INDEX_LIVE_DIR_LIMIT", "300") or "300")
|
|
28
28
|
DEFAULT_LIVE_FILE_LIMIT = int(os.environ.get("NEXO_LOCAL_INDEX_LIVE_FILE_LIMIT", "1000") or "1000")
|
|
29
|
+
DEFAULT_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_DEPTH", "24") or "24")
|
|
30
|
+
DEFAULT_EMAIL_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_EMAIL_ROOT_DEPTH", "24") or "24")
|
|
31
|
+
DEFAULT_MOUNTED_ROOT_DEPTH = int(os.environ.get("NEXO_LOCAL_INDEX_MOUNTED_ROOT_DEPTH", "24") or "24")
|
|
29
32
|
|
|
30
33
|
|
|
31
34
|
def ensure_ready() -> None:
|
|
@@ -91,6 +94,21 @@ def _dedupe_roots(roots: list[str]) -> list[str]:
|
|
|
91
94
|
return result
|
|
92
95
|
|
|
93
96
|
|
|
97
|
+
def _dedupe_root_specs(specs: list[tuple[str, int]]) -> list[tuple[str, int]]:
    """Collapse duplicate roots, keeping first-seen order and the largest depth.

    Roots are compared after ``norm_path``; entries that normalise to an empty
    value are dropped.
    """
    deepest: dict[str, int] = {}
    for root, depth in specs:
        key = norm_path(root)
        if not key:
            continue
        wanted = int(depth)
        known = deepest.get(key)
        # Re-assigning an existing key keeps its original position in the dict.
        deepest[key] = wanted if known is None else max(known, wanted)
    return list(deepest.items())
|
|
110
|
+
|
|
111
|
+
|
|
94
112
|
def _mounted_volume_roots() -> list[str]:
|
|
95
113
|
candidates: list[Path] = []
|
|
96
114
|
if sys.platform == "darwin":
|
|
@@ -123,24 +141,78 @@ def _mounted_volume_roots() -> list[str]:
|
|
|
123
141
|
return roots
|
|
124
142
|
|
|
125
143
|
|
|
144
|
+
def _local_email_roots() -> list[str]:
|
|
145
|
+
home = Path.home()
|
|
146
|
+
roots: list[Path] = [home / ".nexo" / "runtime" / "nexo-email"]
|
|
147
|
+
mac_roots = [
|
|
148
|
+
home / "Library" / "Mail",
|
|
149
|
+
home / "Library" / "Group Containers" / "UBF8T346G9.Office" / "Outlook" / "Outlook 15 Profiles",
|
|
150
|
+
]
|
|
151
|
+
local_app_data = Path(os.environ.get("LOCALAPPDATA") or home / "AppData" / "Local")
|
|
152
|
+
roaming_app_data = Path(os.environ.get("APPDATA") or home / "AppData" / "Roaming")
|
|
153
|
+
windows_roots = [
|
|
154
|
+
home / "Documents" / "Outlook Files",
|
|
155
|
+
local_app_data / "Microsoft" / "Outlook",
|
|
156
|
+
roaming_app_data / "Microsoft" / "Outlook",
|
|
157
|
+
local_app_data / "Packages" / "microsoft.windowscommunicationsapps_8wekyb3d8bbwe" / "LocalState",
|
|
158
|
+
]
|
|
159
|
+
linux_roots = [home / ".thunderbird", home / ".mozilla-thunderbird"]
|
|
160
|
+
|
|
161
|
+
if sys.platform == "darwin":
|
|
162
|
+
roots.extend(mac_roots)
|
|
163
|
+
elif sys.platform.startswith("win"):
|
|
164
|
+
roots.extend(windows_roots)
|
|
165
|
+
else:
|
|
166
|
+
roots.extend(linux_roots)
|
|
167
|
+
|
|
168
|
+
# CI and migrated profiles can expose platform-specific mail stores while
|
|
169
|
+
# running on another OS. Include only the stores that actually exist.
|
|
170
|
+
for optional_root in [*mac_roots, *windows_roots, *linux_roots]:
|
|
171
|
+
if optional_root.exists() and optional_root not in roots:
|
|
172
|
+
roots.append(optional_root)
|
|
173
|
+
return [str(root) for root in roots]
|
|
174
|
+
|
|
175
|
+
|
|
126
176
|
def default_roots() -> list[str]:
    """Return only the root paths from ``default_root_specs()``."""
    paths: list[str] = []
    for root, _depth in default_root_specs():
        paths.append(root)
    return paths
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def default_root_specs() -> list[tuple[str, int]]:
    """Return the deduplicated ``(root, depth)`` pairs indexed by default.

    When ``NEXO_LOCAL_INDEX_DEFAULT_ROOTS`` is set, its ``os.pathsep``-separated
    entries are used with ``DEFAULT_ROOT_DEPTH``. Otherwise the home directory,
    the local e-mail roots and the mounted volume roots are combined, each
    group with its own default depth.
    """
    home = Path.home()
    override = os.environ.get("NEXO_LOCAL_INDEX_DEFAULT_ROOTS", "").strip()
    if override:
        explicit = []
        for entry in override.split(os.pathsep):
            if entry.strip():
                explicit.append((entry, DEFAULT_ROOT_DEPTH))
        return _dedupe_root_specs(explicit)

    specs = [(str(home), DEFAULT_ROOT_DEPTH)]
    specs.extend((root, DEFAULT_EMAIL_ROOT_DEPTH) for root in _local_email_roots())
    specs.extend((root, DEFAULT_MOUNTED_ROOT_DEPTH) for root in _mounted_volume_roots())
    return _dedupe_root_specs(specs)
|
|
132
192
|
|
|
133
193
|
|
|
134
194
|
def ensure_default_roots() -> dict:
    """Register missing default roots and deepen existing ones when needed.

    For each default spec whose path is an existing directory: an already
    registered root has its stored depth raised when it is lower than the
    default, and an unregistered root is added in ``normal`` mode.

    Returns:
        A dict with ``ok``, the ``created`` and ``updated`` counts, and the
        refreshed ``roots`` listing.
    """
    known_rows = {}
    for row in list_roots():
        known_rows[row["root_path"]] = row

    created = []
    updated = []
    for root, depth in default_root_specs():
        candidate = Path(root).expanduser()
        if not (candidate.exists() and candidate.is_dir()):
            continue
        registered = known_rows.get(norm_path(str(candidate)))
        if not registered:
            created.append(add_root(str(candidate), mode="normal", depth=depth))
            continue
        stored_depth = int(registered.get("depth") or 0)
        if stored_depth >= depth:
            continue
        # Depth is only ever raised here, never lowered.
        conn = _conn()
        conn.execute(
            "UPDATE local_index_roots SET depth=?, updated_at=? WHERE root_path=?",
            (depth, now(), registered["root_path"]),
        )
        conn.commit()
        updated.append({"root_path": registered["root_path"], "depth": depth})
    return {"ok": True, "created": len(created), "updated": len(updated), "roots": list_roots()}
|
|
144
216
|
|
|
145
217
|
|
|
146
218
|
def _should_skip_mounted_root(candidate: Path) -> bool:
|
|
@@ -471,7 +543,7 @@ def _file_type(path: Path) -> str:
|
|
|
471
543
|
return "photo"
|
|
472
544
|
if suffix in {".py", ".js", ".ts", ".tsx", ".jsx", ".php", ".sql", ".css", ".html"}:
|
|
473
545
|
return "code"
|
|
474
|
-
if suffix in {".eml"}:
|
|
546
|
+
if suffix in {".eml", ".emlx", ".msg", ".pst", ".ost"}:
|
|
475
547
|
return "email"
|
|
476
548
|
if suffix in {".pdf", ".docx", ".pptx", ".xlsx", ".md", ".txt", ".csv", ".tsv"}:
|
|
477
549
|
return "document"
|
|
@@ -1316,7 +1388,7 @@ def process_jobs(*, limit: int = 100) -> dict:
|
|
|
1316
1388
|
if job_type == "light_extraction":
|
|
1317
1389
|
text, metadata = extract_text(Path(row["path"]))
|
|
1318
1390
|
version_id = _latest_version_id(conn, asset_id)
|
|
1319
|
-
if contains_secret(text):
|
|
1391
|
+
if metadata.get("content_secret_detected") or contains_secret(text):
|
|
1320
1392
|
_mark_content_secret_assets(conn, [asset_id])
|
|
1321
1393
|
conn.execute(
|
|
1322
1394
|
"UPDATE local_index_jobs SET status='done', updated_at=?, last_error_code='content_secret_blocked' WHERE job_id=?",
|
|
@@ -1652,6 +1724,29 @@ def _service_cycle_observation(conn) -> dict:
|
|
|
1652
1724
|
return observation
|
|
1653
1725
|
|
|
1654
1726
|
|
|
1727
|
+
def _index_timing(conn, *, done: int, active_jobs: int, percent: int) -> dict:
    """Compute elapsed indexing time and a rough ETA.

    The start time is the earliest matching ``local_index_logs`` event, or,
    when there is none, the earliest ``first_seen_at`` of a non-deleted asset.
    The ETA is the average elapsed time per finished job multiplied by the
    number of active jobs. It is ``None`` unless work is both done and pending.

    Returns:
        ``{"elapsed_seconds": int, "eta_seconds": int | None}``.
    """
    log_row = conn.execute(
        """
        SELECT MIN(created_at) AS created_at
        FROM local_index_logs
        WHERE event IN ('root_added', 'scan_started', 'scan_finished', 'jobs_processed', 'service_cycle_finished')
        """
    ).fetchone()
    started_at = log_row["created_at"] or 0
    if not started_at:
        asset_row = conn.execute(
            """
            SELECT MIN(first_seen_at) AS first_seen_at
            FROM local_assets
            WHERE status!='deleted'
            """
        ).fetchone()
        started_at = asset_row["first_seen_at"] or 0

    elapsed = 0
    if started_at:
        elapsed = max(0, int(now() - float(started_at)))

    eta = None
    can_estimate = elapsed > 0 and done > 0 and active_jobs > 0 and 0 < percent < 100
    if can_estimate:
        eta = max(0, int((elapsed / max(done, 1)) * active_jobs))
    return {"elapsed_seconds": elapsed, "eta_seconds": eta}
|
|
1748
|
+
|
|
1749
|
+
|
|
1655
1750
|
def _service_scheduler_has_error(service: dict) -> bool:
|
|
1656
1751
|
if service.get("manager") == "launchagent":
|
|
1657
1752
|
code = str(service.get("last_exit_code") or "").strip()
|
|
@@ -1725,6 +1820,7 @@ def status() -> dict:
|
|
|
1725
1820
|
active_jobs = pending + running_jobs + failed_jobs
|
|
1726
1821
|
total_jobs = active_jobs + done
|
|
1727
1822
|
percent = 100 if total_jobs == 0 else int((done / max(total_jobs, 1)) * 100)
|
|
1823
|
+
timing = _index_timing(conn, done=done, active_jobs=active_jobs, percent=percent)
|
|
1728
1824
|
roots = list_roots()
|
|
1729
1825
|
volumes = []
|
|
1730
1826
|
by_volume = conn.execute(
|
|
@@ -1770,8 +1866,8 @@ def status() -> dict:
|
|
|
1770
1866
|
"jobs_pending": pending,
|
|
1771
1867
|
"jobs_running": running_jobs,
|
|
1772
1868
|
"jobs_failed": failed_jobs,
|
|
1773
|
-
"elapsed_seconds":
|
|
1774
|
-
"eta_seconds":
|
|
1869
|
+
"elapsed_seconds": timing["elapsed_seconds"],
|
|
1870
|
+
"eta_seconds": timing["eta_seconds"],
|
|
1775
1871
|
},
|
|
1776
1872
|
"volumes": volumes,
|
|
1777
1873
|
"roots": roots,
|
|
@@ -1856,10 +1952,112 @@ def _search_text_score(query: str, text: str) -> float:
|
|
|
1856
1952
|
return len(q & tokens) / max(len(q), 1)
|
|
1857
1953
|
|
|
1858
1954
|
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1955
|
+
_QUERY_STOPWORDS = {
|
|
1956
|
+
"about",
|
|
1957
|
+
"archivos",
|
|
1958
|
+
"con",
|
|
1959
|
+
"context",
|
|
1960
|
+
"contexto",
|
|
1961
|
+
"cuanto",
|
|
1962
|
+
"dame",
|
|
1963
|
+
"del",
|
|
1964
|
+
"desde",
|
|
1965
|
+
"documentos",
|
|
1966
|
+
"donde",
|
|
1967
|
+
"esta",
|
|
1968
|
+
"está",
|
|
1969
|
+
"file",
|
|
1970
|
+
"files",
|
|
1971
|
+
"hay",
|
|
1972
|
+
"los",
|
|
1973
|
+
"para",
|
|
1974
|
+
"que",
|
|
1975
|
+
"qué",
|
|
1976
|
+
"related",
|
|
1977
|
+
"relacionado",
|
|
1978
|
+
"sabes",
|
|
1979
|
+
"sobre",
|
|
1980
|
+
"todo",
|
|
1981
|
+
"what",
|
|
1982
|
+
"where",
|
|
1983
|
+
}
|
|
1984
|
+
|
|
1985
|
+
|
|
1986
|
+
def _query_terms(query: str) -> list[str]:
    """Return up to ten distinct query tokens, in first-seen order.

    Tokens shorter than three characters and tokens listed in
    ``_QUERY_STOPWORDS`` are skipped.
    """
    picked: dict[str, None] = {}
    for word in tokenize(query):
        if len(word) >= 3 and word not in _QUERY_STOPWORDS:
            # A dict is used as an insertion-ordered set.
            picked.setdefault(word, None)
    return list(picked)[:10]
|
|
1994
|
+
|
|
1995
|
+
|
|
1996
|
+
def _entity_match_score(query_lower: str, terms: list[str], name: str) -> float:
    """Score how well the entity *name* matches the query, from 0.0 to 1.0.

    Scoring, in order: 1.0 when the lower-cased name occurs in *query_lower*;
    0.45-0.95 when query terms overlap the name's tokens, scaled by the share
    of name tokens covered; 0.6 when any term is a substring of the name;
    otherwise 0.0.
    """
    entity = (name or "").strip().lower()
    if not entity:
        return 0.0
    entity_tokens = set(tokenize(entity))
    if entity in query_lower:
        return 1.0
    if not terms:
        return 0.0
    shared = entity_tokens.intersection(terms)
    if shared:
        coverage = len(shared) / max(len(entity_tokens), 1)
        return min(0.95, 0.45 + coverage * 0.5)
    for term in terms:
        if term in entity:
            return 0.6
    return 0.0
|
|
2012
|
+
|
|
2013
|
+
|
|
2014
|
+
def _entity_matches_for_query(conn, query: str, *, limit: int) -> tuple[list[dict], dict[str, float]]:
    """Find entities whose names match the query terms.

    Returns:
        A pair ``(matches, boosts)``. ``matches`` holds at most *limit* entity
        dicts sorted by descending score. ``boosts`` maps each matching asset
        id to the best score seen for that asset.
    """
    lowered = (query or "").strip().lower()
    terms = _query_terms(query)
    if not lowered or not terms:
        return [], {}

    like_filter = " OR ".join(["lower(e.name) LIKE ?"] * len(terms))
    bindings: list = [f"%{term}%" for term in terms]
    # Over-fetch so that rows rejected below still leave enough candidates.
    bindings.append(max(int(limit) * 20, 40))
    rows = conn.execute(
        f"""
        SELECT DISTINCT e.name, e.entity_type, e.asset_id, a.path, a.privacy_class
        FROM local_entities e
        JOIN local_assets a ON a.asset_id = e.asset_id
        WHERE a.status='active'
        AND a.privacy_class='normal'
        AND ({like_filter})
        LIMIT ?
        """,
        bindings,
    ).fetchall()

    ranked = []
    boosts: dict[str, float] = {}
    recorded = set()
    for row in rows:
        if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
            continue
        score = _entity_match_score(lowered, terms, str(row["name"] or ""))
        if score <= 0:
            continue
        asset_id = row["asset_id"]
        identity = (row["name"], row["entity_type"], asset_id)
        if identity not in recorded:
            recorded.add(identity)
            ranked.append({
                "name": row["name"],
                "entity_type": row["entity_type"],
                "asset_id": asset_id,
                "score": round(float(score), 4),
            })
        boosts[asset_id] = max(boosts.get(asset_id, 0.0), float(score))

    ranked = sorted(ranked, key=lambda item: item.get("score", 0), reverse=True)
    return ranked[: int(limit)], boosts
|
|
2057
|
+
|
|
2058
|
+
|
|
2059
|
+
def _context_candidate_rows(conn, entity_asset_ids: list[str], *, base_limit: int = 5000) -> list:
|
|
2060
|
+
base_rows = conn.execute(
|
|
1863
2061
|
"""
|
|
1864
2062
|
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
|
|
1865
2063
|
FROM local_chunks c
|
|
@@ -1869,17 +2067,68 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
|
|
|
1869
2067
|
WHERE a.status='active'
|
|
1870
2068
|
AND a.privacy_class='normal'
|
|
1871
2069
|
ORDER BY c.created_at DESC
|
|
1872
|
-
LIMIT
|
|
1873
|
-
"""
|
|
2070
|
+
LIMIT ?
|
|
2071
|
+
""",
|
|
2072
|
+
(int(base_limit),),
|
|
2073
|
+
).fetchall()
|
|
2074
|
+
if not entity_asset_ids:
|
|
2075
|
+
return base_rows
|
|
2076
|
+
|
|
2077
|
+
placeholders = ",".join("?" for _ in entity_asset_ids)
|
|
2078
|
+
entity_rows = conn.execute(
|
|
2079
|
+
f"""
|
|
2080
|
+
SELECT c.chunk_id, c.asset_id, c.text, a.path, a.file_type, a.privacy_class, v.summary, e.vector_json
|
|
2081
|
+
FROM local_chunks c
|
|
2082
|
+
JOIN local_assets a ON a.asset_id = c.asset_id
|
|
2083
|
+
LEFT JOIN local_asset_versions v ON v.version_id = c.version_id
|
|
2084
|
+
LEFT JOIN local_embeddings e ON e.chunk_id = c.chunk_id
|
|
2085
|
+
WHERE a.status='active'
|
|
2086
|
+
AND a.privacy_class='normal'
|
|
2087
|
+
AND c.asset_id IN ({placeholders})
|
|
2088
|
+
ORDER BY c.chunk_index ASC
|
|
2089
|
+
LIMIT ?
|
|
2090
|
+
""",
|
|
2091
|
+
[*entity_asset_ids, max(1000, len(entity_asset_ids) * 80)],
|
|
1874
2092
|
).fetchall()
|
|
2093
|
+
|
|
2094
|
+
rows = []
|
|
2095
|
+
seen_chunks = set()
|
|
2096
|
+
for row in [*entity_rows, *base_rows]:
|
|
2097
|
+
chunk_id = row["chunk_id"]
|
|
2098
|
+
if chunk_id in seen_chunks:
|
|
2099
|
+
continue
|
|
2100
|
+
seen_chunks.add(chunk_id)
|
|
2101
|
+
rows.append(row)
|
|
2102
|
+
return rows
|
|
2103
|
+
|
|
2104
|
+
|
|
2105
|
+
def context_query(query: str, *, intent: str = "answer", limit: int = 12, evidence_required: bool = True, current_context: str = "") -> dict:
|
|
2106
|
+
conn = _conn()
|
|
2107
|
+
qvec = embeddings.embed_text(query)
|
|
2108
|
+
entities_payload, entity_boosts = _entity_matches_for_query(conn, query, limit=max(int(limit), 1))
|
|
2109
|
+
rows = _context_candidate_rows(conn, list(entity_boosts.keys()), base_limit=5000)
|
|
1875
2110
|
scored = []
|
|
1876
2111
|
for row in rows:
|
|
1877
2112
|
if not is_queryable_path(str(row["path"] or ""), str(row["privacy_class"] or "")):
|
|
1878
2113
|
continue
|
|
1879
2114
|
vector = json_loads(row["vector_json"], [])
|
|
1880
|
-
|
|
2115
|
+
text_score = _search_text_score(query, row["text"])
|
|
2116
|
+
path_score = _search_text_score(query, row["path"] or "")
|
|
2117
|
+
summary_score = _search_text_score(query, row["summary"] or "")
|
|
2118
|
+
entity_score = entity_boosts.get(row["asset_id"], 0.0)
|
|
2119
|
+
vector_score = embeddings.cosine(qvec, vector)
|
|
2120
|
+
score = max(text_score, path_score, summary_score, vector_score)
|
|
2121
|
+
if entity_score > 0:
|
|
2122
|
+
direct_score = max(text_score, path_score, summary_score)
|
|
2123
|
+
if direct_score > 0:
|
|
2124
|
+
entity_rank = 0.82 + (0.42 * text_score) + (0.18 * path_score) + (0.12 * summary_score)
|
|
2125
|
+
score = max(score, entity_rank + min(0.2, entity_score * 0.2))
|
|
2126
|
+
else:
|
|
2127
|
+
# Entity-level matches keep older assets eligible, but do not let
|
|
2128
|
+
# unrelated chunks from a long document outrank direct evidence.
|
|
2129
|
+
score = max(score, min(0.48, 0.28 + entity_score * 0.2))
|
|
1881
2130
|
if score > 0:
|
|
1882
|
-
scored.append((score, row))
|
|
2131
|
+
scored.append((min(float(score), 1.6), row))
|
|
1883
2132
|
scored.sort(key=lambda item: item[0], reverse=True)
|
|
1884
2133
|
assets = []
|
|
1885
2134
|
chunks = []
|
|
@@ -1902,14 +2151,10 @@ def context_query(query: str, *, intent: str = "answer", limit: int = 12, eviden
|
|
|
1902
2151
|
"score": round(float(score), 4),
|
|
1903
2152
|
})
|
|
1904
2153
|
evidence_refs.append(f"local_asset:{row['asset_id']}#chunk:{row['chunk_id']}")
|
|
1905
|
-
entity_rows = conn.execute(
|
|
1906
|
-
"SELECT DISTINCT name, entity_type, asset_id FROM local_entities WHERE lower(name) LIKE ? LIMIT ?",
|
|
1907
|
-
(f"%{query.lower()}%", int(limit)),
|
|
1908
|
-
).fetchall()
|
|
1909
|
-
entities_payload = [dict(row) for row in entity_rows]
|
|
1910
2154
|
relations_payload: list[dict] = []
|
|
1911
|
-
|
|
1912
|
-
|
|
2155
|
+
relation_asset_ids = list(dict.fromkeys([*seen_assets, *entity_boosts.keys()]))[: int(limit)]
|
|
2156
|
+
if relation_asset_ids:
|
|
2157
|
+
asset_ids = relation_asset_ids
|
|
1913
2158
|
placeholders = ",".join("?" for _ in asset_ids)
|
|
1914
2159
|
relation_rows = conn.execute(
|
|
1915
2160
|
f"""
|
|
@@ -4,12 +4,15 @@ import csv
|
|
|
4
4
|
import html
|
|
5
5
|
import json
|
|
6
6
|
import re
|
|
7
|
+
import sqlite3
|
|
7
8
|
import zipfile
|
|
8
9
|
from email import policy
|
|
9
10
|
from email.parser import BytesParser
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
from xml.etree import ElementTree
|
|
12
13
|
|
|
14
|
+
from .privacy import is_local_email_db
|
|
15
|
+
|
|
13
16
|
MAX_TEXT_BYTES = 512 * 1024
|
|
14
17
|
MAX_CHARS = 120_000
|
|
15
18
|
|
|
@@ -38,6 +41,7 @@ SECRET_PATTERNS: tuple[re.Pattern, ...] = (
|
|
|
38
41
|
re.compile(r"\bpk-(?:[a-z]+-)?[A-Za-z0-9_\-]{20,}\b"),
|
|
39
42
|
re.compile(r"\b(ghp|gho|ghu|ghs|ghr|github_pat|glpat|xoxb|xoxp|shpat)_[A-Za-z0-9_]{16,}\b", re.I),
|
|
40
43
|
re.compile(r"\b(AKIA|ASIA)[A-Z0-9]{16,}\b"),
|
|
44
|
+
re.compile(r"\bAIza[0-9A-Za-z_-]{30,}\b"),
|
|
41
45
|
re.compile(r"\bey[A-Za-z0-9_-]{10,}\.ey[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
|
|
42
46
|
re.compile(r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP )?PRIVATE KEY-----", re.I),
|
|
43
47
|
re.compile(r"\b([A-Z][A-Z0-9_]*(?:TOKEN|SECRET|KEY|PASSWORD|PASS)\s*[:=]\s*)['\"]?[A-Za-z0-9._/+=\-]{12,}", re.I),
|
|
@@ -73,8 +77,8 @@ def _extract_csv(path: Path) -> str:
|
|
|
73
77
|
return "\n".join(rows)[:MAX_CHARS]
|
|
74
78
|
|
|
75
79
|
|
|
76
|
-
def
|
|
77
|
-
msg = BytesParser(policy=policy.default).parsebytes(
|
|
80
|
+
def _extract_email_bytes(data: bytes) -> tuple[str, dict]:
|
|
81
|
+
msg = BytesParser(policy=policy.default).parsebytes(data[:MAX_TEXT_BYTES])
|
|
78
82
|
meta = {
|
|
79
83
|
"subject": str(msg.get("subject") or ""),
|
|
80
84
|
"from": str(msg.get("from") or ""),
|
|
@@ -92,6 +96,99 @@ def _extract_eml(path: Path) -> tuple[str, dict]:
|
|
|
92
96
|
return "\n".join([meta["subject"], meta["from"], meta["to"], text])[:MAX_CHARS], meta
|
|
93
97
|
|
|
94
98
|
|
|
99
|
+
def _extract_eml(path: Path) -> tuple[str, dict]:
    """Extract text and header metadata from an ``.eml`` file.

    Only the first ``MAX_TEXT_BYTES`` bytes are read from disk, so a message
    carrying large attachments is never loaded into memory in full.

    Args:
        path: Location of the ``.eml`` file.

    Returns:
        The ``(text, metadata)`` pair produced by ``_extract_email_bytes``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with path.open("rb") as handle:
        head = handle.read(MAX_TEXT_BYTES)
    return _extract_email_bytes(head)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _extract_emlx(path: Path) -> tuple[str, dict]:
    """Extract text and header metadata from an ``.emlx`` message file.

    The file is expected to start with a line holding the message length in
    bytes, followed by the message itself and then an XML trailer.  At most
    ``MAX_TEXT_BYTES`` bytes are read from disk.

    When the declared length is positive and fits inside the data that was
    read, it is trusted and exactly that many bytes are parsed.  Otherwise
    (missing, zero or oversized prefix) the payload is cut at the first line
    starting with ``<?xml`` as a best-effort way to drop the trailer.  The
    heuristic is deliberately *not* applied after a trusted length, because it
    would truncate messages whose own content contains an XML prolog.

    Args:
        path: Location of the ``.emlx`` file.

    Returns:
        ``(text, metadata)`` from ``_extract_email_bytes`` with
        ``metadata["apple_mail_message"]`` set to ``True``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with path.open("rb") as handle:
        data = handle.read(MAX_TEXT_BYTES)

    first_line, separator, rest = data.partition(b"\n")
    prefix = first_line.strip()
    has_prefix = bool(separator) and prefix.isdigit()
    declared = int(prefix) if has_prefix else 0

    if 0 < declared <= len(rest):
        # The length prefix is verifiable: it already excludes the trailer.
        payload = rest[:declared]
    else:
        # No usable length: fall back to stripping at the first XML prolog.
        payload = rest if has_prefix else data
        payload = payload.split(b"\n<?xml", 1)[0]

    text, meta = _extract_email_bytes(payload)
    meta["apple_mail_message"] = True
    return text, meta
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _printable_binary_text(path: Path) -> str:
    """Return runs of printable text found in a binary file.

    At most ``MAX_TEXT_BYTES`` bytes are read.  If a NUL byte appears in the
    first 2000 bytes the data is decoded as UTF-16, otherwise as Latin-1;
    undecodable bytes are ignored.  Runs of at least four word, accented or
    common punctuation characters are collected, stripped, and joined with
    newlines.

    Args:
        path: File to scan.

    Returns:
        The joined runs, truncated to ``MAX_CHARS`` characters (may be empty).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with path.open("rb") as handle:
        data = handle.read(MAX_TEXT_BYTES)

    if b"\x00" in data[:2000]:
        decoded = data.decode("utf-16", errors="ignore")
    else:
        decoded = data.decode("latin-1", errors="ignore")

    # In a raw string the brackets must be escaped with a single backslash.
    # The previous ``\\[\\]`` closed the character class early, so the pattern
    # only matched text shaped like ``x{}]]]]`` and the result was almost
    # always empty.
    pieces = re.findall(r"[\wÀ-ÿ@./:=+\- ,;()\[\]{}]{4,}", decoded)
    return "\n".join(piece.strip() for piece in pieces if piece.strip())[:MAX_CHARS]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _extract_msg(path: Path) -> tuple[str, dict]:
    """Extract text and metadata from an Outlook ``.msg`` file.

    Uses the optional ``extract_msg`` package when it can be imported.  Any
    failure (package missing, unreadable file, parser error) falls back to
    ``_printable_binary_text`` so the caller still gets best-effort text.

    Args:
        path: Location of the ``.msg`` file.

    Returns:
        ``(text, metadata)``.  On success the text is subject, sender,
        recipients and body joined by newlines and truncated to ``MAX_CHARS``,
        and ``metadata["extractor"]`` is ``"msg"``.  On fallback the metadata
        is ``{"extractor": "msg_fallback"}``.
    """
    try:
        import extract_msg  # type: ignore

        message = extract_msg.Message(str(path))
        try:
            meta = {
                "subject": str(getattr(message, "subject", "") or ""),
                "from": str(getattr(message, "sender", "") or ""),
                "to": str(getattr(message, "to", "") or ""),
                "date": str(getattr(message, "date", "") or ""),
                "extractor": "msg",
            }
            body = str(getattr(message, "body", "") or "")
        finally:
            # Close even when reading an attribute raises, so the underlying
            # file handle is not leaked before the fallback runs.
            close = getattr(message, "close", None)
            if callable(close):
                close()
        return "\n".join([meta["subject"], meta["from"], meta["to"], body])[:MAX_CHARS], meta
    except Exception:
        # Deliberate best-effort: any parser problem degrades to raw text.
        return _printable_binary_text(path), {"extractor": "msg_fallback"}
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _table_names(conn: sqlite3.Connection) -> set[str]:
|
|
146
|
+
rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
|
|
147
|
+
return {str(row[0]) for row in rows}
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _select_existing_columns(conn: sqlite3.Connection, table: str, columns: list[str]) -> list[str]:
|
|
151
|
+
found = {str(row[1]) for row in conn.execute(f"PRAGMA table_info({table})").fetchall()}
|
|
152
|
+
return [column for column in columns if column in found]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _extract_nexo_email_db(path: Path) -> tuple[str, dict]:
    """Dump recent rows from a NEXO email runtime database as plain text.

    The database is opened read-only.  Up to 1000 rows are read from each of
    the ``emails`` and ``sent_email_events`` tables (newest first when the
    timestamp column exists, otherwise by ``rowid``); each value is cut to
    4000 characters and the values of a row are joined with ``" | "``.

    Args:
        path: Location of the SQLite file.  It must satisfy
            ``is_local_email_db``; anything else is refused.

    Returns:
        ``(text, metadata)``.  ``text`` is truncated to ``MAX_CHARS``.
        ``metadata["extractor"]`` is ``"sqlite_blocked"`` when the path is
        refused, otherwise ``"nexo_email_db"``.  ``metadata["state"]`` is
        ``"locked_or_unavailable"`` when the database cannot be opened or a
        query fails; rows read before the failure are still returned.
    """
    from urllib.parse import quote

    if not is_local_email_db(str(path)):
        return "", {"extractor": "sqlite_blocked"}

    # Percent-encode the path so "?", "#" or "%" inside a directory name are
    # not parsed as URI syntax; SQLite decodes the escapes back to the path.
    uri = f"file:{quote(str(path))}?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True, timeout=1)
    except Exception:
        return "", {"extractor": "nexo_email_db", "state": "locked_or_unavailable"}

    # (table, wanted columns, preferred ordering column)
    sources = (
        (
            "emails",
            ["from_addr", "from_name", "subject", "received_at", "status", "body", "response"],
            "received_at",
        ),
        (
            "sent_email_events",
            ["sender", "to_addrs", "cc_addrs", "subject", "sent_at", "status", "body_text"],
            "sent_at",
        ),
    )
    parts: list[str] = []
    tables: set[str] = set()
    meta: dict = {"extractor": "nexo_email_db"}
    try:
        tables = _table_names(conn)
        for table, wanted, order_column in sources:
            if table not in tables:
                continue
            cols = _select_existing_columns(conn, table, wanted)
            if not cols:
                # An unusable table must not stop the other one being read.
                continue
            order = order_column if order_column in cols else "rowid"
            # Identifiers come from the fixed lists above, never from input.
            query = f"SELECT {', '.join(cols)} FROM {table} ORDER BY {order} DESC LIMIT 1000"
            for row in conn.execute(query).fetchall():
                parts.append(" | ".join(str(value or "")[:4000] for value in row))
    except sqlite3.Error:
        # Locks and corruption only surface at the first query, not at connect.
        meta["state"] = "locked_or_unavailable"
    finally:
        conn.close()
    meta["tables"] = sorted(tables)
    return "\n".join(parts)[:MAX_CHARS], meta
|
|
190
|
+
|
|
191
|
+
|
|
95
192
|
def _zip_xml_text(path: Path, members: list[str]) -> str:
|
|
96
193
|
pieces: list[str] = []
|
|
97
194
|
with zipfile.ZipFile(path) as zf:
|
|
@@ -176,6 +273,14 @@ def extract_text(path: Path) -> tuple[str, dict]:
|
|
|
176
273
|
elif suffix == ".eml":
|
|
177
274
|
text, metadata = _extract_eml(path)
|
|
178
275
|
metadata["extractor"] = "eml"
|
|
276
|
+
elif suffix == ".emlx":
|
|
277
|
+
text, metadata = _extract_emlx(path)
|
|
278
|
+
metadata["extractor"] = "emlx"
|
|
279
|
+
elif suffix == ".msg":
|
|
280
|
+
text, metadata = _extract_msg(path)
|
|
281
|
+
metadata["extractor"] = metadata.get("extractor") or "msg"
|
|
282
|
+
elif suffix == ".db" and is_local_email_db(str(path)):
|
|
283
|
+
text, metadata = _extract_nexo_email_db(path)
|
|
179
284
|
elif suffix == ".pdf":
|
|
180
285
|
text = _extract_pdf(path)
|
|
181
286
|
elif suffix == ".docx":
|
|
@@ -186,6 +291,8 @@ def extract_text(path: Path) -> tuple[str, dict]:
|
|
|
186
291
|
text = _extract_xlsx(path)
|
|
187
292
|
else:
|
|
188
293
|
text = ""
|
|
294
|
+
if contains_secret(text):
|
|
295
|
+
metadata["content_secret_detected"] = True
|
|
189
296
|
return clean_text(text), metadata
|
|
190
297
|
|
|
191
298
|
|
|
@@ -67,6 +67,36 @@ SENSITIVE_PARTS = {
|
|
|
67
67
|
"browser profile",
|
|
68
68
|
}
|
|
69
69
|
|
|
70
|
+
# File names accepted as NEXO email runtime databases; is_local_email_db
# compares the lower-cased file name against this set.
EMAIL_RUNTIME_DB_NAMES = {
    "email.db", "email-tracker.db", "emails.db", "monitor.db", "nexo-email.db",
}

# Suffixes allowed for files under the nexo-email attachments directory.
EMAIL_ATTACHMENT_SUFFIXES = {
    ".csv", ".docx", ".eml", ".emlx", ".html",
    ".md", ".pdf", ".pptx", ".txt", ".xlsx",
}

# Message formats with a dedicated extractor.
# NOTE(review): not referenced in the visible code -- confirm where it is used.
EMAIL_EXTRACTABLE_SUFFIXES = {".eml", ".emlx", ".msg"}

# Outlook for Mac item suffixes, allowed alongside .eml/.msg/.pst/.ost under
# the Outlook and Windows Mail roots.
OUTLOOK_MAC_INVENTORY_SUFFIXES = {
    ".olk15message", ".olk15msgsource", ".olk15msgattach",
    ".olk15event", ".olk15contact",
}
|
|
99
|
+
|
|
70
100
|
NOISY_PARTS = {
|
|
71
101
|
"node_modules",
|
|
72
102
|
"vendor",
|
|
@@ -173,6 +203,77 @@ def _contains_path_marker(lowered: str, markers: set[str]) -> bool:
|
|
|
173
203
|
return any(marker in lowered for marker in markers)
|
|
174
204
|
|
|
175
205
|
|
|
206
|
+
def _is_under_marker(lowered: str, marker: str) -> bool:
|
|
207
|
+
marker = marker.strip("/").lower()
|
|
208
|
+
if not marker:
|
|
209
|
+
return False
|
|
210
|
+
return lowered.endswith("/" + marker) or f"/{marker}/" in lowered
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _is_inside_windows_mail_package(lowered: str) -> bool:
|
|
214
|
+
return "/appdata/local/packages/microsoft.windowscommunicationsapps" in lowered
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _is_inside_outlook_mac_profile(lowered: str) -> bool:
|
|
218
|
+
return "/library/group containers/ubf8t346g9.office/outlook" in lowered
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def is_local_email_tree(path: str) -> bool:
    """Tell whether ``path`` lies inside a recognised local email store.

    The path is normalised first.  It qualifies when it falls within the
    Windows Mail package or the Outlook for Mac profile, or when it sits at or
    below one of the known mail directory markers.

    Args:
        path: Filesystem path of a file or directory.

    Returns:
        ``True`` for paths inside a recognised email location.
    """
    lowered = _normalized(path)
    if _is_inside_windows_mail_package(lowered):
        return True
    if _is_inside_outlook_mac_profile(lowered):
        return True

    email_roots = (
        "library/mail",
        ".nexo/runtime/nexo-email",
        "documents/outlook files",
        "appdata/local/microsoft/outlook",
        "appdata/roaming/microsoft/outlook",
        "appdata/local/packages/microsoft.windowscommunicationsapps",
        ".thunderbird",
        ".mozilla-thunderbird",
    )
    for root in email_roots:
        if _is_under_marker(lowered, root):
            return True
    return False
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def is_local_email_db(path: str) -> bool:
    """Tell whether ``path`` is an email runtime database in an email tree.

    Args:
        path: Filesystem path of the candidate file.

    Returns:
        ``True`` only when the path is inside a recognised local email tree
        and its lower-cased file name is in ``EMAIL_RUNTIME_DB_NAMES``.
    """
    filename = Path(path).name.lower()
    if not is_local_email_tree(path):
        return False
    return filename in EMAIL_RUNTIME_DB_NAMES
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def is_allowed_local_email_file(path: str) -> bool:
    """Decide whether a file inside a local email tree may be processed.

    Files outside every email tree, and files on a sensitive path, are always
    rejected.  Otherwise the allowed suffixes depend on which store the file
    belongs to:

    * NEXO email runtime: the runtime databases, attachment files with a
      suffix in ``EMAIL_ATTACHMENT_SUFFIXES``, and ``.eml``/``.emlx`` elsewhere.
    * ``library/mail``: ``.eml`` and ``.emlx``.
    * Outlook / Windows Mail roots: ``.eml``, ``.msg``, ``.pst``, ``.ost`` and
      the suffixes in ``OUTLOOK_MAC_INVENTORY_SUFFIXES``.
    * Thunderbird profiles: ``.eml``, ``.mbox`` and suffix-less files.

    Args:
        path: Filesystem path of the candidate file.

    Returns:
        ``True`` when the file is allowed, ``False`` otherwise.
    """
    if not is_local_email_tree(path):
        return False
    candidate = Path(path)
    lowered = _normalized(path)
    suffix = candidate.suffix.lower()
    if is_sensitive_path(path):
        return False

    def under(marker: str) -> bool:
        return _is_under_marker(lowered, marker)

    message_suffixes = {".eml", ".emlx"}

    if under(".nexo/runtime/nexo-email"):
        if is_local_email_db(path):
            return True
        if under(".nexo/runtime/nexo-email/attachments"):
            return suffix in EMAIL_ATTACHMENT_SUFFIXES
        return suffix in message_suffixes

    if under("library/mail"):
        return suffix in message_suffixes

    outlook_roots = (
        "library/group containers/ubf8t346g9.office/outlook",
        "documents/outlook files",
        "appdata/local/microsoft/outlook",
        "appdata/roaming/microsoft/outlook",
        "appdata/local/packages/microsoft.windowscommunicationsapps",
    )
    in_outlook_store = (
        any(under(root) for root in outlook_roots)
        or _is_inside_windows_mail_package(lowered)
        or _is_inside_outlook_mac_profile(lowered)
    )
    if in_outlook_store:
        return suffix in {".eml", ".msg", ".pst", ".ost"} | OUTLOOK_MAC_INVENTORY_SUFFIXES

    if under(".thunderbird") or under(".mozilla-thunderbird"):
        return suffix in {".eml", ".mbox", ""}
    return False
|
|
275
|
+
|
|
276
|
+
|
|
176
277
|
def _has_transient_project_part(path: str) -> bool:
|
|
177
278
|
parts = list(_normalized(path).replace(":", "/").split("/"))
|
|
178
279
|
for index, part in enumerate(parts):
|
|
@@ -239,6 +340,8 @@ def classify_path(path: str) -> tuple[int, str, str]:
|
|
|
239
340
|
lowered = _normalized(path)
|
|
240
341
|
parts = _parts(path)
|
|
241
342
|
|
|
343
|
+
if is_local_email_tree(path) and (Path(path).suffix == "" or is_allowed_local_email_file(path)):
|
|
344
|
+
return 2, "normal", "local_email_path"
|
|
242
345
|
if is_sensitive_path(path):
|
|
243
346
|
return 1, "sensitive_inventory_only", "sensitive_path"
|
|
244
347
|
if is_private_profile_path(path):
|
|
@@ -253,6 +356,8 @@ def classify_path(path: str) -> tuple[int, str, str]:
|
|
|
253
356
|
def should_skip_tree(path: str) -> bool:
|
|
254
357
|
lowered = _normalized(path)
|
|
255
358
|
parts = _parts(path)
|
|
359
|
+
if is_local_email_tree(path):
|
|
360
|
+
return False
|
|
256
361
|
if any(item in lowered for item in SYSTEM_PARTS):
|
|
257
362
|
return True
|
|
258
363
|
if is_sensitive_path(path) or is_private_profile_path(path):
|
|
@@ -263,6 +368,8 @@ def should_skip_tree(path: str) -> bool:
|
|
|
263
368
|
def should_skip_file(path: str) -> bool:
|
|
264
369
|
lowered = _normalized(path)
|
|
265
370
|
parts = _parts(path)
|
|
371
|
+
if is_local_email_tree(path):
|
|
372
|
+
return not is_allowed_local_email_file(path)
|
|
266
373
|
if any(item in lowered for item in SYSTEM_PARTS):
|
|
267
374
|
return True
|
|
268
375
|
if is_sensitive_path(path) or is_private_profile_path(path):
|
|
@@ -282,6 +389,8 @@ def should_extract(path: str, depth: int) -> bool:
|
|
|
282
389
|
if should_skip_file(path):
|
|
283
390
|
return False
|
|
284
391
|
suffix = Path(path).suffix.lower()
|
|
392
|
+
if is_local_email_db(path):
|
|
393
|
+
return True
|
|
285
394
|
if suffix in {
|
|
286
395
|
".txt",
|
|
287
396
|
".md",
|
|
@@ -302,6 +411,8 @@ def should_extract(path: str, depth: int) -> bool:
|
|
|
302
411
|
".csv",
|
|
303
412
|
".tsv",
|
|
304
413
|
".eml",
|
|
414
|
+
".emlx",
|
|
415
|
+
".msg",
|
|
305
416
|
".pdf",
|
|
306
417
|
".docx",
|
|
307
418
|
".pptx",
|
package/src/tools_hot_context.py
CHANGED
|
@@ -43,6 +43,15 @@ def _format_local_context_evidence(query: str, *, limit: int = 4) -> str:
|
|
|
43
43
|
refs = result.get("evidence_refs") or []
|
|
44
44
|
if refs:
|
|
45
45
|
lines.append(f"Evidence refs: {', '.join(str(ref) for ref in refs[:limit])}")
|
|
46
|
+
relations = result.get("relations") or []
|
|
47
|
+
if relations:
|
|
48
|
+
lines.append("Local relations:")
|
|
49
|
+
for relation in relations[:limit]:
|
|
50
|
+
relation_type = str(relation.get("relation_type") or "related")
|
|
51
|
+
target = str(relation.get("target_ref") or relation.get("target_asset_id") or "").strip()
|
|
52
|
+
evidence = str(relation.get("evidence") or "").strip()
|
|
53
|
+
suffix = f" — {evidence[:120]}" if evidence else ""
|
|
54
|
+
lines.append(f"- {relation_type}: {target}{suffix}")
|
|
46
55
|
return "\n".join(lines)
|
|
47
56
|
|
|
48
57
|
|