engram-vault 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: engram-vault
3
+ Version: 0.1.0
4
+ Summary: Engram — local-first personal memory layer for AI agents: markdown vault + hybrid retrieval, exposed over MCP.
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: mcp[cli]>=1.27.1
7
+ Requires-Dist: python-ulid>=3.1.0
8
+ Requires-Dist: pyyaml>=6.0.3
9
+ Requires-Dist: fastembed>=0.4.0
10
+ Requires-Dist: hnswlib>=0.8.0
11
+ Requires-Dist: numpy>=2.4.6
@@ -0,0 +1,142 @@
1
+ """One-time backfill: embed the vault and build the FastKB index.
2
+
3
+ Root cause of ticket #1: add() embedded a query vector to FIND neighbors but
4
+ never PERSISTED the new fact's own vector, so the index only ever grew via the
5
+ UI's transformers.js path (16 of 43.7k facts). This rebuilds from scratch by
6
+ embedding every fact with the server's model (BAAI/bge-small-en-v1.5).
7
+
8
+ Two phases, deliberately separated:
9
+ PHASE 1 — embed + append (streaming, resumable, O(N)). Vectors are appended to
10
+ vectors.f32 and metadata to manifest.jsonl in batches. A kill/crash keeps
11
+ progress; a re-run truncates vectors.f32 to the manifest row count (repairing
12
+ any half-written batch) and continues. NO hnsw here — building it per-batch
13
+ is what made the old version go O(N^2) (re-mmap + re-save the growing index
14
+ every batch, rate collapsing 72->11/s).
15
+ PHASE 2 — build hnsw.bin ONCE from the full vectors.f32 (a few seconds). It is
16
+ fully rebuildable from vectors+manifest, so it is fine to drop and redo.
17
+
18
+ TEXT_CAP: bge-small only consumes the first 512 tokens, so we truncate each fact
19
+ before embedding — lossless for the model, but huge scraped raw/ pages no longer
20
+ cost seconds each to tokenize.
21
+
22
+ Usage:
23
+ KB_DIR="$HOME/Library/Application Support/KB" \
24
+ uv run --project server python server/backfill.py [--limit N]
25
+ [--dirs knowledge,events,...] [--fresh] [--batch 512]
26
+ """
27
+ from __future__ import annotations
28
+ import argparse, json, sys, time
29
+ from pathlib import Path
30
+
31
+ import numpy as np
32
+
33
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
34
+ import server # noqa
35
+ import fastkb # noqa
36
+
37
+ TEXT_CAP = 2000 # chars; ~512 tokens — the model truncates past this anyway
38
+
39
+
40
def fact_text(f: dict) -> str:
    """Return the text to embed for fact *f*: content, a space, then its tags.

    The result is cut to the first ``TEXT_CAP`` characters.
    """
    content = f["content"]
    tag_blob = " ".join(f.get("tags") or [])
    combined = content + " " + tag_blob
    return combined[:TEXT_CAP]
42
+
43
+
44
+ def _read_manifest(fk) -> list[dict]:
45
+ if not fk.manifest_path.exists():
46
+ return []
47
+ out = []
48
+ with fk.manifest_path.open() as f:
49
+ for line in f:
50
+ line = line.strip()
51
+ if line:
52
+ out.append(json.loads(line))
53
+ return out
54
+
55
+
56
def main():
    """Embed the vault's facts and rebuild the HNSW index (CLI entry point).

    Flags:
      --limit N   only consider the first N walked facts (0 = no limit)
      --dirs a,b  only consider facts whose ``dir`` is in the list
      --batch N   facts per embedding call (must be >= 1)
      --fresh     empty vectors.f32 / manifest.jsonl and delete hnsw.bin first

    Phase 1 appends each batch's vectors to vectors.f32 and then its manifest
    rows; phase 2 builds hnsw.bin once from the complete vector file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=0)
    ap.add_argument("--dirs", type=str, default="")
    ap.add_argument("--batch", type=int, default=512)
    ap.add_argument("--fresh", action="store_true")
    args = ap.parse_args()
    if args.batch < 1:
        # range(..., 0) raises and a negative step silently embeds nothing.
        ap.error("--batch must be >= 1")
    allow = {d.strip() for d in args.dirs.split(",") if d.strip()} or None

    print(f"== backfill KB={server.KB} cap={TEXT_CAP} batch={args.batch} ==", flush=True)
    t0 = time.perf_counter()
    facts = [f for f in server._iter_facts() if (allow is None or f.get("dir") in allow)]
    if args.limit:
        facts = facts[: args.limit]
    print(f"walked {len(facts)} facts in {time.perf_counter()-t0:.1f}s", flush=True)

    fk = fastkb.FastKB(server.KB)
    dim = fk.dim
    if args.fresh:
        fk.vec_path.write_bytes(b"")
        fk.manifest_path.write_text("")
        if fk.hnsw_path.exists():
            fk.hnsw_path.unlink()

    # ---- resume: manifest is source of truth; repair vectors.f32 to match ----
    manifest = _read_manifest(fk)
    m_rows = len(manifest)
    good_bytes = m_rows * dim * 4  # float32 = 4 bytes per component
    have_bytes = fk.vec_path.stat().st_size if fk.vec_path.exists() else 0
    if have_bytes < good_bytes:
        # truncate() would EXTEND the file with zero bytes here, i.e. index
        # all-zero vectors for real facts. Refuse instead of corrupting.
        sys.exit(
            f"backfill: vectors.f32 holds {have_bytes} bytes but the manifest needs "
            f"{good_bytes} ({m_rows} rows); refusing to zero-pad — re-run with --fresh"
        )
    if have_bytes > good_bytes:
        # A batch's vectors were written but its manifest rows were not: drop them.
        with open(fk.vec_path, "r+b") as vf:
            vf.truncate(good_bytes)
        print(f"repaired vectors.f32 to {m_rows} rows", flush=True)
    done = {m["id"] for m in manifest}
    todo = [f for f in facts if f["id"] not in done]
    print(f"phase1: indexed={m_rows} todo={len(todo)}", flush=True)

    # ---- phase 1: embed + append ----
    if todo:
        from fastembed import TextEmbedding
        emb = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
        row = m_rows
        n_done = 0
        t_emb = time.perf_counter()
        with open(fk.vec_path, "ab") as vf, fk.manifest_path.open("a") as mf:
            for i in range(0, len(todo), args.batch):
                batch = todo[i : i + args.batch]
                vecs = np.asarray(
                    list(emb.embed([fact_text(f) for f in batch], batch_size=args.batch)),
                    dtype=np.float32,
                )
                if vecs.shape != (len(batch), dim):
                    # Appending a mis-shaped batch would shift every later row
                    # out of alignment with the manifest.
                    raise RuntimeError(
                        f"embedder returned shape {vecs.shape}, expected {(len(batch), dim)}"
                    )
                # Vectors first, manifest second: a crash in between leaves
                # surplus vector bytes, which the resume step above trims.
                vf.write(np.ascontiguousarray(vecs).tobytes())
                vf.flush()
                for f in batch:
                    mf.write(json.dumps(fastkb.FastKB._meta_from_fact(f, row)) + "\n")
                    row += 1
                mf.flush()
                n_done += len(batch)
                elapsed = max(time.perf_counter() - t_emb, 1e-9)  # never divide by zero
                rate = n_done / elapsed
                print(f" embedded {n_done}/{len(todo)} ({rate:.0f}/s, "
                      f"{(len(todo)-n_done)/max(rate,1):.0f}s left)", flush=True)

    # ---- phase 2: build hnsw once from the full vector store ----
    manifest = _read_manifest(fk)
    n = len(manifest)
    if n == 0:
        # np.memmap cannot map an empty (or missing) file, and there is
        # nothing to index anyway.
        print("phase2: no vectors to index; skipping hnsw build", flush=True)
        print(f"\nDONE: 0 vectors in {time.perf_counter()-t0:.1f}s total")
        return

    print("phase2: building hnsw index...", flush=True)
    t_idx = time.perf_counter()
    import hnswlib
    vectors = np.memmap(fk.vec_path, dtype=np.float32, mode="r", shape=(n, dim))
    idx = hnswlib.Index(space="cosine", dim=dim)
    idx.init_index(max_elements=n + 8192,
                   ef_construction=fastkb.HNSW_EF_CONSTRUCTION, M=fastkb.HNSW_M)
    idx.add_items(np.asarray(vectors), np.arange(n))
    idx.set_ef(fastkb.HNSW_EF_QUERY)
    idx.save_index(str(fk.hnsw_path))
    print(f"phase2: hnsw built ({n} elements) in {time.perf_counter()-t_idx:.1f}s", flush=True)

    by_dir: dict[str, int] = {}
    for m in manifest:
        by_dir[m["dir"]] = by_dir.get(m["dir"], 0) + 1
    print(f"\nDONE: {n} vectors in {time.perf_counter()-t0:.1f}s total")
    print(f"by dir: {by_dir}")
    print(f"artifacts: vectors.f32={fk.vec_path.stat().st_size//1024}KB "
          f"manifest={fk.manifest_path.stat().st_size//1024}KB "
          f"hnsw={fk.hnsw_path.stat().st_size//1024}KB")
139
+
140
+
141
+ if __name__ == "__main__":
142
+ main()
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env python3
2
+ """kb — command-line doorway to the personal KB.
3
+
4
+ The vault is exposed two ways, both calling the same logic in server.py:
5
+ - MCP (server.py) — for agents, over JSON-RPC stdio.
6
+ - CLI (this file) — for shell scripts / connectors, over stdin/stdout.
7
+
8
+ A bash connector can't speak MCP mid-pipe, so this is how scripts write to
9
+ the brain:
10
+
11
+ CANVAS_TOKEN=... ./connectors/canvas.sh | kb ingest --source canvas
12
+
13
+ Connector contract (see connectors/README.md): a connector prints JSONL to
14
+ stdout, one record per line:
15
+
16
+ {"id": "<stable external id>", "title": "...", "body": "...",
17
+ "tags": ["..."], "dir": "knowledge|events|...", "scope": null}
18
+
19
+ `kb ingest` is the only trusted writer — connectors just fetch+emit, never
20
+ touch the vault directly. `id` makes re-runs idempotent (a per-source
21
+ manifest in <KB>/.connectors/<source>.json tracks what's been ingested).
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import argparse
26
+ import json
27
+ import sys
28
+ from pathlib import Path
29
+
30
+ import server as kb # same vault logic as the MCP server
31
+
32
+
33
+ def _emit(obj) -> None:
34
+ print(json.dumps(obj, ensure_ascii=False))
35
+
36
+
37
def cmd_add(args) -> None:
    """Handle ``kb add``: store one fact and print the result as JSON.

    Content comes from ``--content`` when given, otherwise from stdin.
    Blank content prints an error to stderr and exits with status 1.
    """
    content = args.content
    if content is None:
        content = sys.stdin.read()
    if not content.strip():
        print("kb add: empty content", file=sys.stderr)
        sys.exit(1)

    # "--tags a, b,,c" -> ["a", "b", "c"]; an empty list is passed on as None.
    tag_list = []
    for piece in (args.tags or "").split(","):
        piece = piece.strip()
        if piece:
            tag_list.append(piece)

    result = kb.add(
        content=content,
        tags=tag_list or None,
        source=args.source,
        dir=args.dir,
        scope=args.scope,
    )
    _emit(result)
51
+
52
+
53
def cmd_search(args) -> None:
    """Handle ``kb search``: run the query and print the result as JSON."""
    _emit(kb.search(query=args.query, limit=args.limit, scope=args.scope))
56
+
57
+
58
def _manifest_path(source: str) -> Path:
    """Return ``<KB>/.connectors/<source>.json``, creating the directory if needed."""
    connectors_dir = kb.KB / ".connectors"
    connectors_dir.mkdir(parents=True, exist_ok=True)
    return connectors_dir.joinpath(f"{source}.json")
62
+
63
+
64
+ def _record_to_content(rec: dict) -> str:
65
+ title = (rec.get("title") or "").strip()
66
+ body = (rec.get("body") or "").strip()
67
+ if title and not body.lstrip().startswith("#"):
68
+ return f"# {title}\n\n{body}".rstrip()
69
+ return (body or title).rstrip()
70
+
71
+
72
def cmd_ingest(args) -> None:
    """Read JSONL from stdin and add each new record. Idempotent per-source
    via a manifest keyed on each record's `id` (falls back to title).

    The manifest is saved even when ingestion is interrupted, so facts that
    were already added are not re-added on the next run. Bad lines, non-object
    records, empty records and failed adds are counted as errors and reported
    on stderr; they never abort the run. A final JSON summary
    ``{source, new, skipped, errors}`` is printed to stdout.
    """
    mpath = _manifest_path(args.source)
    seen: dict = {}
    if mpath.exists():
        try:
            loaded = json.loads(mpath.read_text())
        except (OSError, ValueError) as e:
            # Best-effort: carry on with an empty manifest, but say so —
            # everything will be treated as new.
            print(f"kb ingest: unreadable manifest {mpath}, starting empty: {e}",
                  file=sys.stderr)
        else:
            if isinstance(loaded, dict):
                seen = loaded
            else:
                print(f"kb ingest: manifest {mpath} is not a JSON object, starting empty",
                      file=sys.stderr)

    new = skipped = errors = 0
    try:
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except (ValueError, RecursionError) as e:
                errors += 1
                print(f"kb ingest: bad JSON line: {e}", file=sys.stderr)
                continue
            if not isinstance(rec, dict):
                # Valid JSON but not a record (e.g. a bare list or string).
                errors += 1
                print(f"kb ingest: skipping non-object record ({type(rec).__name__})",
                      file=sys.stderr)
                continue
            ext_id = str(rec.get("id") or rec.get("title") or "").strip()
            if ext_id and ext_id in seen:
                skipped += 1
                continue
            try:
                content = _record_to_content(rec)
                if not content:
                    errors += 1
                    print(f"kb ingest: empty record skipped: {ext_id or '<no id>'}",
                          file=sys.stderr)
                    continue
                tags = rec.get("tags") or []
                if isinstance(tags, str):
                    tags = [t.strip() for t in tags.split(",") if t.strip()]
                res = kb.add(
                    content=content,
                    tags=tags or None,
                    source=args.source,
                    dir=rec.get("dir"),
                    scope=args.scope if args.scope is not None else rec.get("scope"),
                    link=args.link,
                )
                if ext_id:
                    seen[ext_id] = res.get("id")
                new += 1
            except Exception as e:
                # Per-record boundary: one bad record must not stop the batch.
                errors += 1
                print(f"kb ingest: add failed: {e}", file=sys.stderr)
    finally:
        # Persist whatever was ingested, even on Ctrl-C or an unexpected
        # error. Write to a sibling temp file and rename so a crash mid-write
        # cannot leave a truncated manifest behind.
        tmp = mpath.with_name(mpath.name + ".tmp")
        tmp.write_text(json.dumps(seen, indent=2))
        tmp.replace(mpath)

    _emit({"source": args.source, "new": new, "skipped": skipped, "errors": errors})
123
+
124
+
125
def main() -> None:
    """Parse the ``kb`` command line and run the chosen subcommand handler."""
    parser = argparse.ArgumentParser(prog="kb", description="Personal KB CLI")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # kb add
    add_cmd = commands.add_parser("add", help="add one fact (content via --content or stdin)")
    add_cmd.add_argument("--content")
    add_cmd.add_argument("--tags", help="comma-separated")
    add_cmd.add_argument("--source")
    add_cmd.add_argument("--dir", help="raw|events|knowledge|skills|thoughts")
    add_cmd.add_argument("--scope")
    add_cmd.set_defaults(func=cmd_add)

    # kb search
    search_cmd = commands.add_parser("search", help="substring search, JSON out")
    search_cmd.add_argument("query")
    search_cmd.add_argument("--limit", type=int, default=10)
    search_cmd.add_argument("--scope")
    search_cmd.set_defaults(func=cmd_search)

    # kb ingest
    ingest_cmd = commands.add_parser(
        "ingest", help="ingest JSONL records from stdin (connector runner)"
    )
    ingest_cmd.add_argument("--source", required=True)
    ingest_cmd.add_argument("--scope")
    ingest_cmd.add_argument(
        "--link",
        action="store_true",
        help="compute auto_related per fact (O(N)/fact — slow for bulk; off by default)",
    )
    ingest_cmd.set_defaults(func=cmd_ingest)

    parsed = parser.parse_args()
    # Each subparser stored its handler under ``func``.
    parsed.func(parsed)
152
+
153
+
154
+ if __name__ == "__main__":
155
+ main()
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: engram-vault
3
+ Version: 0.1.0
4
+ Summary: Engram — local-first personal memory layer for AI agents: markdown vault + hybrid retrieval, exposed over MCP.
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: mcp[cli]>=1.27.1
7
+ Requires-Dist: python-ulid>=3.1.0
8
+ Requires-Dist: pyyaml>=6.0.3
9
+ Requires-Dist: fastembed>=0.4.0
10
+ Requires-Dist: hnswlib>=0.8.0
11
+ Requires-Dist: numpy>=2.4.6
@@ -0,0 +1,15 @@
1
+ backfill.py
2
+ cli.py
3
+ entities.py
4
+ fastkb.py
5
+ hybrid.py
6
+ notifyd.py
7
+ pyproject.toml
8
+ reranker.py
9
+ server.py
10
+ engram_vault.egg-info/PKG-INFO
11
+ engram_vault.egg-info/SOURCES.txt
12
+ engram_vault.egg-info/dependency_links.txt
13
+ engram_vault.egg-info/entry_points.txt
14
+ engram_vault.egg-info/requires.txt
15
+ engram_vault.egg-info/top_level.txt
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ engram = cli:main
3
+ engram-vault = server:main
@@ -0,0 +1,6 @@
1
+ mcp[cli]>=1.27.1
2
+ python-ulid>=3.1.0
3
+ pyyaml>=6.0.3
4
+ fastembed>=0.4.0
5
+ hnswlib>=0.8.0
6
+ numpy>=2.4.6
@@ -0,0 +1,8 @@
1
+ backfill
2
+ cli
3
+ entities
4
+ fastkb
5
+ hybrid
6
+ notifyd
7
+ reranker
8
+ server
@@ -0,0 +1,183 @@
1
+ """Entity linking — the "knows-you" layer.
2
+
3
+ Extracts person/project entities from a fact's structured tags + project scope,
4
+ maintains a sidecar entity index (entity -> fact ids), and resolves a free-text
5
+ query to entities so retrieval can boost facts about the people/projects a query
6
+ names ("what do I know about Priya", "nouva graph bug").
7
+
8
+ Design: entities come from the HIGH-PRECISION signal already in the data —
9
+ `person:`/`contact:` and `project:`/`area:` tags + the project `scope` — not from
10
+ free-text NER over bodies (noisy; a deliberate non-goal here, LLM-upgradable
11
+ later). So this whole layer is deterministic, needs no embedder/LLM, and is
12
+ unit-testable offline.
13
+
14
+ All functions are pure over a plain dict (`index`), shaped:
15
+
16
+ { "person:priya": ["<fact_id>", ...], "project:nouva-desktop": [...], ... }
17
+
18
+ server.py owns persistence (atomic + file-locked writes beside the vault) so
19
+ concurrent adds don't lose updates; this module never touches the filesystem.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import re
24
+
25
+ # tag prefix -> entity type. Only these become entities.
26
+ TAG_ENTITY_TYPES = {
27
+ "person": "person", # named individuals (explicitly tagged)
28
+ "contact": "contact", # chat / conversation identities (iMessage import:
29
+ # group chats, phone numbers) — kept SEPARATE from
30
+ # person so a few real people aren't buried under
31
+ # high-volume chat slugs (mavs-vs-refs, brothaaassss).
32
+ "project": "project",
33
+ "area": "project", # project areas/components (kb-server, embeddings, ...)
34
+ }
35
+
36
+ # prefixes that LOOK entity-ish but are not: dates, confidence levels, browsing-
37
+ # history domains, status flags. Kept as an explicit denylist so a future tag
38
+ # prefix doesn't silently leak in.
39
+ NON_ENTITY_PREFIXES = frozenset({
40
+ "day", "conf", "host", "type", "status", "severity", "browser", "source",
41
+ "topic", "relationship", "group", "benchmark", "tool", "class", "course",
42
+ })
43
+
44
+ # entity name values that carry no identity — never index them.
45
+ _JUNK_VALUES = frozenset({"", "untitled", "unknown", "none", "null", "na", "n-a"})
46
+
47
+ _TOK = re.compile(r"[a-z0-9]+")
48
+ # query tokens shorter than this don't resolve entities (avoid "vs"/"the" noise)
49
+ _MIN_NAME_TOKEN = 4
50
+
51
+
52
+ def _norm(val: str) -> str:
53
+ return (val or "").strip().lower()
54
+
55
+
56
def entity_name(entity: str) -> str:
    """Return the part after the first ':' ('person:graham-neubig' -> 'graham-neubig').

    An id without ':' is returned unchanged.
    """
    _kind, sep, name = entity.partition(":")
    return name if sep else entity
59
+
60
+
61
def entity_type(entity: str) -> str:
    """Return the part before the first ':' ('person:priya' -> 'person').

    An id without ':' has no type and yields ''.
    """
    kind, sep, _name = entity.partition(":")
    return kind if sep else ""
63
+
64
+
65
def extract_entities(tags, scope=None) -> list[str]:
    """Return the canonical 'type:name' entity ids for a fact's tags and scope.

    Only tags of the form 'prefix:value' whose prefix maps to an entity type
    (and is not denylisted) contribute; junk values are dropped. A truthy
    scope adds a 'project:<scope>' entity. First-seen order is kept and
    duplicates are removed.
    """
    # dict keys double as an insertion-ordered set.
    collected: dict[str, None] = {}
    for tag in tags or []:
        if ":" not in tag:
            continue
        prefix, raw_value = tag.split(":", 1)
        prefix = prefix.strip().lower()
        kind = None if prefix in NON_ENTITY_PREFIXES else TAG_ENTITY_TYPES.get(prefix)
        if not kind:
            continue
        name = _norm(raw_value)
        if name not in _JUNK_VALUES:
            collected.setdefault(f"{kind}:{name}")
    if scope:
        scope_name = _norm(str(scope))
        if scope_name not in _JUNK_VALUES:
            collected.setdefault(f"project:{scope_name}")
    return list(collected)
94
+
95
+
96
+ # ---------------- index mutation (pure over a dict) ----------------
97
+
98
def index_add(index: dict, fact_id: str, entities) -> dict:
    """Record *fact_id* under each entity in *index* (in place) and return it.

    A fact id already listed for an entity is not added again.
    """
    for entity in entities:
        if entity not in index:
            index[entity] = [fact_id]
        elif fact_id not in index[entity]:
            index[entity].append(fact_id)
    return index
104
+
105
+
106
def index_remove(index: dict, fact_id: str, entities=None) -> dict:
    """Remove *fact_id* from the given entities, or from every entity if None.

    Entities left with no facts are deleted. Mutates and returns *index*.
    """
    # Snapshot the keys: entries may be deleted while we walk them.
    targets = list(index) if entities is None else list(entities)
    for entity in targets:
        current = index.get(entity)
        if not current:
            continue  # unknown entity, or an already-empty list: leave as is
        remaining = [fid for fid in current if fid != fact_id]
        if remaining:
            index[entity] = remaining
        else:
            del index[entity]
    return index
117
+
118
+
119
def rebuild(records) -> dict:
    """Build a fresh entity index from records carrying `id`, `tags`, `scope`.

    Records that yield no entities are skipped.
    """
    fresh: dict = {}
    for record in records:
        entities = extract_entities(record.get("tags") or [], record.get("scope"))
        if not entities:
            continue
        index_add(fresh, record["id"], entities)
    return fresh
128
+
129
+
130
+ # ---------------- query-time resolution ----------------
131
+
132
def _token_map(index: dict) -> dict:
    """Map each name-token to the set of entity ids whose name contains it.

    Tokens shorter than ``_MIN_NAME_TOKEN`` are left out.
    """
    mapping: dict[str, set] = {}
    for entity in index:
        long_tokens = (
            token
            for token in _TOK.findall(entity_name(entity))
            if len(token) >= _MIN_NAME_TOKEN
        )
        for token in long_tokens:
            mapping.setdefault(token, set()).add(entity)
    return mapping
140
+
141
+
142
def resolve(index: dict, query: str, token_map: dict | None = None) -> list[str]:
    """Return the entities named by *query*, best match first.

    An entity matches when one of its name-tokens (length >= 4) occurs in the
    query. Ordering: more matching query tokens first, then more indexed
    facts, then entity id. Pass a prebuilt *token_map* to avoid rebuilding it.
    """
    lowered = (query or "").lower()
    wanted = {tok for tok in _TOK.findall(lowered) if len(tok) >= _MIN_NAME_TOKEN}
    if not wanted:
        return []

    lookup = _token_map(index) if token_map is None else token_map
    match_counts: dict[str, int] = {}
    for tok in wanted:
        for entity in lookup.get(tok, ()):  # type: ignore[union-attr]
            match_counts[entity] = match_counts.get(entity, 0) + 1

    def rank(entity: str):
        return (-match_counts[entity], -len(index.get(entity, [])), entity)

    return sorted(match_counts, key=rank)
155
+
156
+
157
+ def facts_for(index: dict, entities, cap: int | None = None) -> list[str]:
158
+ """Union of fact ids across the given entities, de-duplicated (first seen)."""
159
+ seen: set[str] = set()
160
+ out: list[str] = []
161
+ for e in entities:
162
+ for fid in index.get(e, []):
163
+ if fid not in seen:
164
+ seen.add(fid)
165
+ out.append(fid)
166
+ if cap and len(out) >= cap:
167
+ return out
168
+ return out
169
+
170
+
171
def summarize(index: dict, type: str | None = None, query: str | None = None, limit: int = 50) -> list[dict]:
    """Return up to *limit* rows of {entity, type, name, count}, most-referenced first.

    *type* keeps only entities of that type; *query* keeps only entities that
    ``resolve`` matches for it. Ties on count are ordered by entity id.
    """
    matched = set(resolve(index, query)) if query else None
    rows = [
        {
            "entity": entity,
            "type": entity_type(entity),
            "name": entity_name(entity),
            "count": len(fact_ids),
        }
        for entity, fact_ids in index.items()
        if (not type or entity_type(entity) == type)
        and (matched is None or entity in matched)
    ]
    ranked = sorted(rows, key=lambda row: (-row["count"], row["entity"]))
    return ranked[:limit]