engram-vault 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- engram_vault-0.1.0/PKG-INFO +11 -0
- engram_vault-0.1.0/backfill.py +142 -0
- engram_vault-0.1.0/cli.py +155 -0
- engram_vault-0.1.0/engram_vault.egg-info/PKG-INFO +11 -0
- engram_vault-0.1.0/engram_vault.egg-info/SOURCES.txt +15 -0
- engram_vault-0.1.0/engram_vault.egg-info/dependency_links.txt +1 -0
- engram_vault-0.1.0/engram_vault.egg-info/entry_points.txt +3 -0
- engram_vault-0.1.0/engram_vault.egg-info/requires.txt +6 -0
- engram_vault-0.1.0/engram_vault.egg-info/top_level.txt +8 -0
- engram_vault-0.1.0/entities.py +183 -0
- engram_vault-0.1.0/fastkb.py +523 -0
- engram_vault-0.1.0/hybrid.py +154 -0
- engram_vault-0.1.0/notifyd.py +140 -0
- engram_vault-0.1.0/pyproject.toml +29 -0
- engram_vault-0.1.0/reranker.py +85 -0
- engram_vault-0.1.0/server.py +2088 -0
- engram_vault-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: engram-vault
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Engram — local-first personal memory layer for AI agents: markdown vault + hybrid retrieval, exposed over MCP.
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: mcp[cli]>=1.27.1
|
|
7
|
+
Requires-Dist: python-ulid>=3.1.0
|
|
8
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
9
|
+
Requires-Dist: fastembed>=0.4.0
|
|
10
|
+
Requires-Dist: hnswlib>=0.8.0
|
|
11
|
+
Requires-Dist: numpy>=2.4.6
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""One-time backfill: embed the vault and build the FastKB index.
|
|
2
|
+
|
|
3
|
+
Root cause of ticket #1: add() embedded a query vector to FIND neighbors but
|
|
4
|
+
never PERSISTED the new fact's own vector, so the index only ever grew via the
|
|
5
|
+
UI's transformers.js path (16 of 43.7k facts). This rebuilds from scratch by
|
|
6
|
+
embedding every fact with the server's model (BAAI/bge-small-en-v1.5).
|
|
7
|
+
|
|
8
|
+
Two phases, deliberately separated:
|
|
9
|
+
PHASE 1 — embed + append (streaming, resumable, O(N)). Vectors are appended to
|
|
10
|
+
vectors.f32 and metadata to manifest.jsonl in batches. A kill/crash keeps
|
|
11
|
+
progress; a re-run truncates vectors.f32 to the manifest row count (repairing
|
|
12
|
+
any half-written batch) and continues. NO hnsw here — building it per-batch
|
|
13
|
+
is what made the old version go O(N^2) (re-mmap + re-save the growing index
|
|
14
|
+
every batch, rate collapsing 72->11/s).
|
|
15
|
+
PHASE 2 — build hnsw.bin ONCE from the full vectors.f32 (a few seconds). It is
|
|
16
|
+
fully rebuildable from vectors+manifest, so it is fine to drop and redo.
|
|
17
|
+
|
|
18
|
+
TEXT_CAP: bge-small only consumes the first 512 tokens, so we truncate each fact
|
|
19
|
+
before embedding — lossless for the model, but huge scraped raw/ pages no longer
|
|
20
|
+
cost seconds each to tokenize.
|
|
21
|
+
|
|
22
|
+
Usage:
|
|
23
|
+
KB_DIR="$HOME/Library/Application Support/KB" \
|
|
24
|
+
uv run --project server python server/backfill.py [--limit N]
|
|
25
|
+
[--dirs knowledge,events,...] [--fresh] [--batch 512]
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
import argparse, json, sys, time
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
|
|
31
|
+
import numpy as np
|
|
32
|
+
|
|
33
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
34
|
+
import server # noqa
|
|
35
|
+
import fastkb # noqa
|
|
36
|
+
|
|
37
|
+
# Character cap applied to each fact's text before it is embedded (see
# fact_text). The module docstring equates this with the model's ~512-token
# window; NOTE(review): chars-per-token varies with the text, so this is an
# approximation that bounds tokenizer cost rather than a guaranteed-lossless cut.
TEXT_CAP = 2000 # chars; ~512 tokens — the model truncates past this anyway
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def fact_text(f: dict) -> str:
    """Return the text that gets embedded for fact *f*.

    The text is the fact's ``content``, one space, then its tags joined by
    spaces (a missing or empty ``tags`` contributes nothing), cut to the
    first ``TEXT_CAP`` characters.
    """
    tag_part = " ".join(f.get("tags") or [])
    combined = " ".join((f["content"], tag_part))
    return combined[:TEXT_CAP]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _read_manifest(fk) -> list[dict]:
|
|
45
|
+
if not fk.manifest_path.exists():
|
|
46
|
+
return []
|
|
47
|
+
out = []
|
|
48
|
+
with fk.manifest_path.open() as f:
|
|
49
|
+
for line in f:
|
|
50
|
+
line = line.strip()
|
|
51
|
+
if line:
|
|
52
|
+
out.append(json.loads(line))
|
|
53
|
+
return out
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def main():
    """Embed every (selected) fact and rebuild the HNSW index from scratch.

    Command-line flags:
      --limit N    only consider the first N walked facts (0 = no limit)
      --dirs a,b   only consider facts whose ``dir`` is in this list
      --batch N    facts embedded per batch (must be >= 1)
      --fresh      wipe vectors.f32 / manifest.jsonl / hnsw.bin first

    Phase 1 appends vectors and manifest rows batch by batch so that an
    interrupted run can resume. Phase 2 builds the HNSW index once from the
    complete vector file. When there is nothing to index, phase 2 is skipped
    rather than attempting to memory-map an empty or missing vector file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=0)
    ap.add_argument("--dirs", type=str, default="")
    ap.add_argument("--batch", type=int, default=512)
    ap.add_argument("--fresh", action="store_true")
    args = ap.parse_args()
    if args.batch < 1:
        # range(0, len(todo), 0) would raise ValueError; a negative step would
        # silently embed nothing. Reject both up front.
        ap.error("--batch must be a positive integer")
    allow = set(d.strip() for d in args.dirs.split(",") if d.strip()) or None

    print(f"== backfill KB={server.KB} cap={TEXT_CAP} batch={args.batch} ==", flush=True)
    t0 = time.perf_counter()
    facts = [f for f in server._iter_facts() if (allow is None or f.get("dir") in allow)]
    if args.limit:
        facts = facts[: args.limit]
    print(f"walked {len(facts)} facts in {time.perf_counter()-t0:.1f}s", flush=True)

    fk = fastkb.FastKB(server.KB)
    dim = fk.dim
    if args.fresh:
        fk.vec_path.write_bytes(b"")
        fk.manifest_path.write_text("")
        if fk.hnsw_path.exists():
            fk.hnsw_path.unlink()

    # ---- resume: manifest is source of truth; repair vectors.f32 to match ----
    manifest = _read_manifest(fk)
    m_rows = len(manifest)
    if fk.vec_path.exists():
        good_bytes = m_rows * dim * 4  # float32 = 4 bytes per component
        if fk.vec_path.stat().st_size != good_bytes:
            # Drops a half-written trailing batch (vectors are written before
            # their manifest rows in phase 1).
            with open(fk.vec_path, "r+b") as vf:
                vf.truncate(good_bytes)
            print(f"repaired vectors.f32 to {m_rows} rows", flush=True)
    done = {m["id"] for m in manifest}
    todo = [f for f in facts if f["id"] not in done]
    print(f"phase1: indexed={m_rows} todo={len(todo)}", flush=True)

    # ---- phase 1: embed + append ----
    if todo:
        from fastembed import TextEmbedding
        emb = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
        row = m_rows
        n_done = 0
        t_emb = time.perf_counter()
        with open(fk.vec_path, "ab") as vf, fk.manifest_path.open("a") as mf:
            for i in range(0, len(todo), args.batch):
                batch = todo[i : i + args.batch]
                vecs = np.asarray(
                    list(emb.embed([fact_text(f) for f in batch], batch_size=args.batch)),
                    dtype=np.float32,
                )
                # Vectors first, then their manifest rows: a crash in between
                # leaves surplus vector bytes, which the resume step trims.
                vf.write(np.ascontiguousarray(vecs).tobytes())
                vf.flush()
                for f in batch:
                    mf.write(json.dumps(fastkb.FastKB._meta_from_fact(f, row)) + "\n")
                    row += 1
                mf.flush()
                n_done += len(batch)
                rate = n_done / (time.perf_counter() - t_emb)
                print(f" embedded {n_done}/{len(todo)} ({rate:.0f}/s, "
                      f"{(len(todo)-n_done)/max(rate,1):.0f}s left)", flush=True)

    # ---- phase 2: build hnsw once from the full vector store ----
    print("phase2: building hnsw index...", flush=True)
    t_idx = time.perf_counter()
    import hnswlib
    manifest = _read_manifest(fk)
    n = len(manifest)
    if n == 0:
        # vectors.f32 is empty or absent here; np.memmap cannot map it.
        print("phase2: nothing to index (0 manifest rows); skipping hnsw build", flush=True)
        return
    vectors = np.memmap(fk.vec_path, dtype=np.float32, mode="r", shape=(n, dim))
    idx = hnswlib.Index(space="cosine", dim=dim)
    idx.init_index(max_elements=n + 8192,
                   ef_construction=fastkb.HNSW_EF_CONSTRUCTION, M=fastkb.HNSW_M)
    idx.add_items(np.asarray(vectors), np.arange(n))
    idx.set_ef(fastkb.HNSW_EF_QUERY)
    idx.save_index(str(fk.hnsw_path))
    print(f"phase2: hnsw built ({n} elements) in {time.perf_counter()-t_idx:.1f}s", flush=True)

    by_dir: dict[str, int] = {}
    for m in manifest:
        by_dir[m["dir"]] = by_dir.get(m["dir"], 0) + 1
    print(f"\nDONE: {n} vectors in {time.perf_counter()-t0:.1f}s total")
    print(f"by dir: {by_dir}")
    print(f"artifacts: vectors.f32={fk.vec_path.stat().st_size//1024}KB "
          f"manifest={fk.manifest_path.stat().st_size//1024}KB "
          f"hnsw={fk.hnsw_path.stat().st_size//1024}KB")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
if __name__ == "__main__":
|
|
142
|
+
main()
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""kb — command-line doorway to the personal KB.
|
|
3
|
+
|
|
4
|
+
The vault is exposed two ways, both calling the same logic in server.py:
|
|
5
|
+
- MCP (server.py) — for agents, over JSON-RPC stdio.
|
|
6
|
+
- CLI (this file) — for shell scripts / connectors, over stdin/stdout.
|
|
7
|
+
|
|
8
|
+
A bash connector can't speak MCP mid-pipe, so this is how scripts write to
|
|
9
|
+
the brain:
|
|
10
|
+
|
|
11
|
+
CANVAS_TOKEN=... ./connectors/canvas.sh | kb ingest --source canvas
|
|
12
|
+
|
|
13
|
+
Connector contract (see connectors/README.md): a connector prints JSONL to
|
|
14
|
+
stdout, one record per line:
|
|
15
|
+
|
|
16
|
+
{"id": "<stable external id>", "title": "...", "body": "...",
|
|
17
|
+
"tags": ["..."], "dir": "knowledge|events|...", "scope": null}
|
|
18
|
+
|
|
19
|
+
`kb ingest` is the only trusted writer — connectors just fetch+emit, never
|
|
20
|
+
touch the vault directly. `id` makes re-runs idempotent (a per-source
|
|
21
|
+
manifest in <KB>/.connectors/<source>.json tracks what's been ingested).
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import argparse
|
|
26
|
+
import json
|
|
27
|
+
import sys
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
|
|
30
|
+
import server as kb # same vault logic as the MCP server
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _emit(obj) -> None:
|
|
34
|
+
print(json.dumps(obj, ensure_ascii=False))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def cmd_add(args) -> None:
    """Handle ``kb add``: store one fact and print the result as JSON.

    Content comes from ``--content`` when given, otherwise from stdin.
    Blank content prints an error to stderr and exits with status 1.
    """
    text = sys.stdin.read() if args.content is None else args.content
    if not text.strip():
        print("kb add: empty content", file=sys.stderr)
        sys.exit(1)

    # --tags is a comma-separated string; empty pieces are dropped.
    tag_list = [piece.strip() for piece in (args.tags or "").split(",") if piece.strip()]
    result = kb.add(
        content=text,
        tags=tag_list or None,
        source=args.source,
        dir=args.dir,
        scope=args.scope,
    )
    _emit(result)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def cmd_search(args) -> None:
    """Handle ``kb search``: run the query and print the result as JSON."""
    _emit(kb.search(query=args.query, limit=args.limit, scope=args.scope))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _manifest_path(source: str) -> Path:
    """Return ``<KB>/.connectors/<source>.json``, creating the directory if needed."""
    connectors_dir = kb.KB / ".connectors"
    connectors_dir.mkdir(parents=True, exist_ok=True)
    return connectors_dir.joinpath(f"{source}.json")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _record_to_content(rec: dict) -> str:
|
|
65
|
+
title = (rec.get("title") or "").strip()
|
|
66
|
+
body = (rec.get("body") or "").strip()
|
|
67
|
+
if title and not body.lstrip().startswith("#"):
|
|
68
|
+
return f"# {title}\n\n{body}".rstrip()
|
|
69
|
+
return (body or title).rstrip()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def cmd_ingest(args) -> None:
    """Read JSONL from stdin and add each new record. Idempotent per-source
    via a manifest keyed on each record's `id` (falls back to title).

    Per-record problems (bad JSON, a line that is not a JSON object, a
    malformed title/body, empty content, a failing add) are counted in
    ``errors`` and reported on stderr; they never abort the run. The manifest
    is written even when the loop is interrupted, so records that were
    already added are not ingested a second time on the next run.
    """
    mpath = _manifest_path(args.source)
    seen: dict = {}
    if mpath.exists():
        try:
            loaded = json.loads(mpath.read_text())
        except Exception as e:
            # Best effort: keep going with an empty manifest, but say so —
            # everything will be treated as new.
            print(f"kb ingest: unreadable manifest {mpath}: {e}; starting empty",
                  file=sys.stderr)
        else:
            if isinstance(loaded, dict):
                seen = loaded
            else:
                print(f"kb ingest: manifest {mpath} is not a JSON object; starting empty",
                      file=sys.stderr)

    new = skipped = errors = 0
    try:
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except Exception as e:
                errors += 1
                print(f"kb ingest: bad JSON line: {e}", file=sys.stderr)
                continue
            if not isinstance(rec, dict):
                # Valid JSON but not a record (e.g. a list or a bare string).
                errors += 1
                print(f"kb ingest: record is not a JSON object ({type(rec).__name__})",
                      file=sys.stderr)
                continue
            ext_id = str(rec.get("id") or rec.get("title") or "").strip()
            if ext_id and ext_id in seen:
                skipped += 1
                continue
            try:
                content = _record_to_content(rec)
            except Exception as e:
                # e.g. a non-string title/body.
                errors += 1
                print(f"kb ingest: bad record: {e}", file=sys.stderr)
                continue
            if not content:
                errors += 1
                continue
            tags = rec.get("tags") or []
            if isinstance(tags, str):
                tags = [t.strip() for t in tags.split(",") if t.strip()]
            try:
                res = kb.add(
                    content=content,
                    tags=tags or None,
                    source=args.source,
                    dir=rec.get("dir"),
                    # An explicit --scope overrides the record's own scope.
                    scope=args.scope if args.scope is not None else rec.get("scope"),
                    link=args.link,
                )
                if ext_id:
                    seen[ext_id] = res.get("id")
                new += 1
            except Exception as e:
                errors += 1
                print(f"kb ingest: add failed: {e}", file=sys.stderr)
    finally:
        # Persist progress even if the loop was interrupted.
        mpath.write_text(json.dumps(seen, indent=2))

    _emit({"source": args.source, "new": new, "skipped": skipped, "errors": errors})
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def main() -> None:
    """Parse the ``kb`` command line and run the chosen subcommand.

    Subcommands: ``add`` (one fact), ``search`` (query, JSON out) and
    ``ingest`` (JSONL records from stdin). A subcommand is required.
    """
    parser = argparse.ArgumentParser(prog="kb", description="Personal KB CLI")
    commands = parser.add_subparsers(dest="cmd", required=True)

    add_parser = commands.add_parser("add", help="add one fact (content via --content or stdin)")
    add_parser.add_argument("--content")
    add_parser.add_argument("--tags", help="comma-separated")
    add_parser.add_argument("--source")
    add_parser.add_argument("--dir", help="raw|events|knowledge|skills|thoughts")
    add_parser.add_argument("--scope")
    add_parser.set_defaults(func=cmd_add)

    search_parser = commands.add_parser("search", help="substring search, JSON out")
    search_parser.add_argument("query")
    search_parser.add_argument("--limit", type=int, default=10)
    search_parser.add_argument("--scope")
    search_parser.set_defaults(func=cmd_search)

    ingest_parser = commands.add_parser(
        "ingest", help="ingest JSONL records from stdin (connector runner)"
    )
    ingest_parser.add_argument("--source", required=True)
    ingest_parser.add_argument("--scope")
    ingest_parser.add_argument(
        "--link",
        action="store_true",
        help="compute auto_related per fact (O(N)/fact — slow for bulk; off by default)",
    )
    ingest_parser.set_defaults(func=cmd_ingest)

    parsed = parser.parse_args()
    # Each subparser stored its handler under ``func``.
    parsed.func(parsed)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
if __name__ == "__main__":
|
|
155
|
+
main()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: engram-vault
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Engram — local-first personal memory layer for AI agents: markdown vault + hybrid retrieval, exposed over MCP.
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: mcp[cli]>=1.27.1
|
|
7
|
+
Requires-Dist: python-ulid>=3.1.0
|
|
8
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
9
|
+
Requires-Dist: fastembed>=0.4.0
|
|
10
|
+
Requires-Dist: hnswlib>=0.8.0
|
|
11
|
+
Requires-Dist: numpy>=2.4.6
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
backfill.py
|
|
2
|
+
cli.py
|
|
3
|
+
entities.py
|
|
4
|
+
fastkb.py
|
|
5
|
+
hybrid.py
|
|
6
|
+
notifyd.py
|
|
7
|
+
pyproject.toml
|
|
8
|
+
reranker.py
|
|
9
|
+
server.py
|
|
10
|
+
engram_vault.egg-info/PKG-INFO
|
|
11
|
+
engram_vault.egg-info/SOURCES.txt
|
|
12
|
+
engram_vault.egg-info/dependency_links.txt
|
|
13
|
+
engram_vault.egg-info/entry_points.txt
|
|
14
|
+
engram_vault.egg-info/requires.txt
|
|
15
|
+
engram_vault.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Entity linking — the "knows-you" layer.
|
|
2
|
+
|
|
3
|
+
Extracts person/project entities from a fact's structured tags + project scope,
|
|
4
|
+
maintains a sidecar entity index (entity -> fact ids), and resolves a free-text
|
|
5
|
+
query to entities so retrieval can boost facts about the people/projects a query
|
|
6
|
+
names ("what do I know about Priya", "nouva graph bug").
|
|
7
|
+
|
|
8
|
+
Design: entities come from the HIGH-PRECISION signal already in the data —
|
|
9
|
+
`person:`/`contact:` and `project:`/`area:` tags + the project `scope` — not from
|
|
10
|
+
free-text NER over bodies (noisy; a deliberate non-goal here, LLM-upgradable
|
|
11
|
+
later). So this whole layer is deterministic, needs no embedder/LLM, and is
|
|
12
|
+
unit-testable offline.
|
|
13
|
+
|
|
14
|
+
All functions are pure over a plain dict (`index`), shaped:
|
|
15
|
+
|
|
16
|
+
{ "person:priya": ["<fact_id>", ...], "project:nouva-desktop": [...], ... }
|
|
17
|
+
|
|
18
|
+
server.py owns persistence (atomic + file-locked writes beside the vault) so
|
|
19
|
+
concurrent adds don't lose updates; this module never touches the filesystem.
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import re
|
|
24
|
+
|
|
25
|
+
# tag prefix -> entity type. Only these become entities.
# Note that two prefixes ("project" and "area") map to the same "project" type.
TAG_ENTITY_TYPES = {
    "person": "person", # named individuals (explicitly tagged)
    "contact": "contact", # chat / conversation identities (iMessage import:
                          # group chats, phone numbers) — kept SEPARATE from
                          # person so a few real people aren't buried under
                          # high-volume chat slugs (mavs-vs-refs, brothaaassss).
    "project": "project",
    "area": "project", # project areas/components (kb-server, embeddings, ...)
}

# prefixes that LOOK entity-ish but are not: dates, confidence levels, browsing-
# history domains, status flags. Kept as an explicit denylist so a future tag
# prefix doesn't silently leak in.
NON_ENTITY_PREFIXES = frozenset({
    "day", "conf", "host", "type", "status", "severity", "browser", "source",
    "topic", "relationship", "group", "benchmark", "tool", "class", "course",
})

# entity name values that carry no identity — never index them.
# Compared against the already-normalized (stripped, lower-cased) value.
_JUNK_VALUES = frozenset({"", "untitled", "unknown", "none", "null", "na", "n-a"})

# A name token is a run of lowercase ASCII letters/digits, so hyphens and other
# punctuation act as separators.
_TOK = re.compile(r"[a-z0-9]+")
# query tokens shorter than this don't resolve entities (avoid "vs"/"the" noise)
_MIN_NAME_TOKEN = 4
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _norm(val: str) -> str:
|
|
53
|
+
return (val or "").strip().lower()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def entity_name(entity: str) -> str:
    """Return the name part of an entity id.

    'person:graham-neubig' -> 'graham-neubig'. Only the first colon splits;
    an id without a colon is returned unchanged.
    """
    _, colon, name = entity.partition(":")
    return name if colon else entity
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def entity_type(entity: str) -> str:
    """Return the type part of an entity id ('person:priya' -> 'person').

    An id without a colon has no type and yields ``""``.
    """
    kind, colon, _ = entity.partition(":")
    return kind if colon else ""
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def extract_entities(tags, scope=None) -> list[str]:
    """Canonical entity ids ('type:name') for a fact, from its tags + scope.

    A tag contributes an entity only when it is a string of the form
    ``prefix:value``, its prefix is listed in ``TAG_ENTITY_TYPES`` (and not in
    ``NON_ENTITY_PREFIXES``), and its normalized value is not a junk value.
    A truthy ``scope`` contributes ``project:<scope>`` under the same junk
    filter. Tags that are not strings are ignored.

    Order-preserving and de-duplicated.
    """
    found: list[str] = []
    for tag in tags or []:
        # Non-string tags (presumably possible when tags come from parsed
        # front matter, e.g. a bare number) cannot carry a prefix; skipping
        # them avoids a TypeError on the ":" membership test.
        if not isinstance(tag, str) or ":" not in tag:
            continue
        prefix, _, value = tag.partition(":")
        prefix = prefix.strip().lower()
        if prefix in NON_ENTITY_PREFIXES:
            continue
        kind = TAG_ENTITY_TYPES.get(prefix)
        if not kind:
            continue
        value = _norm(value)
        if value in _JUNK_VALUES:
            continue
        found.append(f"{kind}:{value}")

    if scope:
        scope_name = _norm(str(scope))
        if scope_name not in _JUNK_VALUES:
            found.append(f"project:{scope_name}")

    # dict keys keep first-seen order, which gives an ordered de-dup.
    return list(dict.fromkeys(found))
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ---------------- index mutation (pure over a dict) ----------------
|
|
97
|
+
|
|
98
|
+
def index_add(index: dict, fact_id: str, entities) -> dict:
    """Record *fact_id* under each of *entities*, in place.

    A fact id is appended at most once per entity. Returns the same
    ``index`` object that was passed in.
    """
    for ent in entities:
        if ent not in index:
            index[ent] = []
        bucket = index[ent]
        if fact_id not in bucket:
            bucket.append(fact_id)
    return index
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def index_remove(index: dict, fact_id: str, entities=None) -> dict:
    """Remove *fact_id* from the listed entities, in place.

    With ``entities=None`` every entity in the index is visited. An entity
    whose list becomes empty is deleted; entities that are missing or
    already empty are left untouched. Returns the same ``index`` object.
    """
    keys = list(index.keys()) if entities is None else list(entities)
    for key in keys:
        current = index.get(key)
        if current:
            remaining = [fid for fid in current if fid != fact_id]
            if remaining:
                index[key] = remaining
            else:
                del index[key]
    return index
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def rebuild(records) -> dict:
    """Construct a fresh entity index from manifest-shaped records.

    Each record supplies ``id`` plus optional ``tags`` and ``scope``.
    Records that yield no entities are not added to the index.
    """
    built: dict = {}
    for rec in records:
        found = extract_entities(rec.get("tags") or [], rec.get("scope"))
        if not found:
            continue
        index_add(built, rec["id"], found)
    return built
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# ---------------- query-time resolution ----------------
|
|
131
|
+
|
|
132
|
+
def _token_map(index: dict) -> dict:
    """Map each name token to the set of entity ids whose name contains it.

    Only tokens of at least ``_MIN_NAME_TOKEN`` characters are kept.
    """
    mapping: dict[str, set] = {}
    for ent in index:
        long_tokens = (
            tok for tok in _TOK.findall(entity_name(ent)) if len(tok) >= _MIN_NAME_TOKEN
        )
        for tok in long_tokens:
            mapping.setdefault(tok, set()).add(ent)
    return mapping
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def resolve(index: dict, query: str, token_map: dict | None = None) -> list[str]:
    """Return the entities named by *query*, best match first.

    An entity matches when one of its name tokens (length >= 4) appears in
    the lower-cased query. Results are ordered by number of matching query
    tokens (descending), then by number of indexed facts (descending), then
    by entity id. A precomputed ``token_map`` may be passed in; otherwise
    one is built from ``index``.
    """
    wanted = {
        tok for tok in _TOK.findall((query or "").lower()) if len(tok) >= _MIN_NAME_TOKEN
    }
    if not wanted:
        return []

    lookup = _token_map(index) if token_map is None else token_map
    match_counts: dict[str, int] = {}
    for tok in wanted:
        for ent in lookup.get(tok, ()):  # type: ignore[union-attr]
            match_counts[ent] = match_counts.get(ent, 0) + 1

    def rank(ent: str):
        return (-match_counts[ent], -len(index.get(ent, [])), ent)

    return sorted(match_counts, key=rank)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def facts_for(index: dict, entities, cap: int | None = None) -> list[str]:
|
|
158
|
+
"""Union of fact ids across the given entities, de-duplicated (first seen)."""
|
|
159
|
+
seen: set[str] = set()
|
|
160
|
+
out: list[str] = []
|
|
161
|
+
for e in entities:
|
|
162
|
+
for fid in index.get(e, []):
|
|
163
|
+
if fid not in seen:
|
|
164
|
+
seen.add(fid)
|
|
165
|
+
out.append(fid)
|
|
166
|
+
if cap and len(out) >= cap:
|
|
167
|
+
return out
|
|
168
|
+
return out
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def summarize(index: dict, type: str | None = None, query: str | None = None, limit: int = 50) -> list[dict]:
    """List entities as ``{entity, type, name, count}`` dicts for the entities() tool.

    Rows are sorted by fact count (descending), then by entity id, and cut to
    the first ``limit``. ``type`` keeps only entities of that type; ``query``
    keeps only entities that ``resolve`` matches for it.
    """
    allowed = set(resolve(index, query)) if query else None
    rows = [
        {
            "entity": ent,
            "type": entity_type(ent),
            "name": entity_name(ent),
            "count": len(fact_ids),
        }
        for ent, fact_ids in index.items()
        if not (type and entity_type(ent) != type)
        and (allowed is None or ent in allowed)
    ]
    rows.sort(key=lambda row: (-row["count"], row["entity"]))
    return rows[:limit]
|