nodalkb 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nodalkb-1.0.0/PKG-INFO +134 -0
- nodalkb-1.0.0/README.md +116 -0
- nodalkb-1.0.0/backfill.py +147 -0
- nodalkb-1.0.0/cli.py +827 -0
- nodalkb-1.0.0/engram_assets.py +756 -0
- nodalkb-1.0.0/entities.py +220 -0
- nodalkb-1.0.0/fastkb.py +667 -0
- nodalkb-1.0.0/hybrid.py +154 -0
- nodalkb-1.0.0/mission.py +220 -0
- nodalkb-1.0.0/mission_html.py +4 -0
- nodalkb-1.0.0/nodalkb.egg-info/PKG-INFO +134 -0
- nodalkb-1.0.0/nodalkb.egg-info/SOURCES.txt +21 -0
- nodalkb-1.0.0/nodalkb.egg-info/dependency_links.txt +1 -0
- nodalkb-1.0.0/nodalkb.egg-info/entry_points.txt +4 -0
- nodalkb-1.0.0/nodalkb.egg-info/requires.txt +12 -0
- nodalkb-1.0.0/nodalkb.egg-info/top_level.txt +13 -0
- nodalkb-1.0.0/notifyd.py +161 -0
- nodalkb-1.0.0/profanity.py +63 -0
- nodalkb-1.0.0/pyproject.toml +58 -0
- nodalkb-1.0.0/reranker.py +85 -0
- nodalkb-1.0.0/server.py +4672 -0
- nodalkb-1.0.0/setup.cfg +4 -0
- nodalkb-1.0.0/top.py +1294 -0
nodalkb-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nodalkb
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Nodal — local-first personal memory layer for AI agents: markdown vault + hybrid retrieval, exposed over MCP.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: mcp[cli]>=1.27.1
|
|
9
|
+
Requires-Dist: python-ulid>=3.1.0
|
|
10
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
11
|
+
Requires-Dist: fastembed>=0.4.0
|
|
12
|
+
Requires-Dist: numpy>=2.4.6
|
|
13
|
+
Requires-Dist: textual>=3.0
|
|
14
|
+
Requires-Dist: langsmith>=0.1.0
|
|
15
|
+
Provides-Extra: ann
|
|
16
|
+
Requires-Dist: hnswlib>=0.8.0; extra == "ann"
|
|
17
|
+
Provides-Extra: trace
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<img alt="Nodal" src="https://engram-site-xi.vercel.app/icon.png" width="120" height="120">
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
<p align="center"><strong>A local-first shared brain for AI agents.</strong><br>
|
|
24
|
+
One markdown file per fact, on your disk — written and recalled by every agent
|
|
25
|
+
on your machine over MCP, and by you with <code>grep</code>.</p>
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<a href="https://pypi.org/project/nodalkb/"><img alt="PyPI" src="https://img.shields.io/pypi/v/nodalkb?style=flat-square&color=177a44&label=pypi"></a>
|
|
29
|
+
<a href="https://pypi.org/project/nodalkb/"><img alt="Python" src="https://img.shields.io/pypi/pyversions/nodalkb?style=flat-square&color=14171b"></a>
|
|
30
|
+
<img alt="MCP tools" src="https://img.shields.io/badge/MCP-44_tools-6fdd8b?style=flat-square&labelColor=14171b">
|
|
31
|
+
<a href="https://github.com/arjan-experiments/kb/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/license-Apache--2.0-14171b?style=flat-square"></a>
|
|
32
|
+
</p>
|
|
33
|
+
|
|
34
|
+
One command installs uv (if missing), registers the server, plugs in the skill,
|
|
35
|
+
and starts the ping monitor:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
curl -fsSL https://engram-site-xi.vercel.app/install.sh | sh
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or do it by hand:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
claude mcp add nodal --scope user -- uvx nodalkb@latest # Claude Code
|
|
45
|
+
uvx --from nodalkb@latest nodal setup # skill + ping monitor + auto-update
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Any other MCP client — add to its config, then restart:
|
|
49
|
+
|
|
50
|
+
```json
|
|
51
|
+
{ "mcpServers": { "nodal": { "command": "uvx", "args": ["nodalkb@latest"] } } }
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
The `@latest` suffix makes a relaunch auto-pull new releases (uvx otherwise pins the
|
|
55
|
+
first-installed version). `nodal setup` auto-plugs the skill into
|
|
56
|
+
`~/.claude/skills/`, installs a desktop wake script, and runs `notifyd` as a
|
|
57
|
+
kept-alive monitor so teammate pings reach you even when you're idle. Check it
|
|
58
|
+
any time with `nodal doctor`.
|
|
59
|
+
|
|
60
|
+
## Teach your agent to remember
|
|
61
|
+
|
|
62
|
+
Copy this straight into your agent's system prompt (`CLAUDE.md`, custom
|
|
63
|
+
instructions, anywhere it reads on boot):
|
|
64
|
+
|
|
65
|
+
```text
|
|
66
|
+
You have Nodal, a local-first shared memory, available over MCP
|
|
67
|
+
(tools: add, search, recall, register_agent, send_message, inbox, ...).
|
|
68
|
+
|
|
69
|
+
Recall — at the start of any substantive task, and whenever prior context
|
|
70
|
+
would help (past decisions, projects, people, preferences), call `search`
|
|
71
|
+
with a few keywords before answering.
|
|
72
|
+
|
|
73
|
+
Capture — when you produce knowledge worth keeping (facts about the user,
|
|
74
|
+
decisions, learnings, debugging breakthroughs), call `add` without asking.
|
|
75
|
+
One atomic fact per add. Tag consistently: project:<name>, person:<name>,
|
|
76
|
+
pref, decision, learning. Set source to your client name.
|
|
77
|
+
|
|
78
|
+
Fleet (optional) — call `register_agent` once per session to join the team
|
|
79
|
+
brain under a stable name; check `inbox` for teammate messages and reply
|
|
80
|
+
with `send_message`. Never claim another agent's name.
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
That's the whole onboarding: recall before answering, capture liberally,
|
|
84
|
+
identify honestly.
|
|
85
|
+
|
|
86
|
+
## Why
|
|
87
|
+
|
|
88
|
+
Agents forget everything between sessions, and every framework wants to own
|
|
89
|
+
your data in a database you can't read. Nodal inverts both: **memory is plain
|
|
90
|
+
markdown in a folder you own** (`~/Library/Application Support/KB`, or
|
|
91
|
+
`NODAL_DIR`), with retrieval and coordination layered on top — no lock-in, no
|
|
92
|
+
cloud, greppable forever.
|
|
93
|
+
|
|
94
|
+
## What you get
|
|
95
|
+
|
|
96
|
+
**Memory** — `add` / `update` / `supersede` / `pin` facts with tags, scopes,
|
|
97
|
+
and entity links. Confidence decays unless reinforced; superseded facts keep
|
|
98
|
+
their history.
|
|
99
|
+
|
|
100
|
+
**Retrieval that's actually fast** — hybrid search (semantic ⊕ BM25 ⊕ entity)
|
|
101
|
+
fused with reciprocal-rank fusion and MMR diversification: ~140 ms warm on a
|
|
102
|
+
9k-fact vault. `recall` is the "ask your memory" verb.
|
|
103
|
+
|
|
104
|
+
**A real multi-agent layer** — multiple agents (Claude Code sessions, apps,
|
|
105
|
+
workers) share one vault as a team brain:
|
|
106
|
+
|
|
107
|
+
- **Authenticated identity (TOFU)** — first `register_agent` mints a per-name
|
|
108
|
+
token (sha256-only at rest); every message is stamped `via:` with the
|
|
109
|
+
authenticated sender, and presence shows *verified* only when proven.
|
|
110
|
+
Impersonation can't hide.
|
|
111
|
+
- **Messaging, handoffs, rooms** — DMs, broadcasts, threads, ticket-style
|
|
112
|
+
handoffs with read/done status, membership-gated rooms.
|
|
113
|
+
- **Wake on send, not polling** — a message fires the recipient's alarm
|
|
114
|
+
(desktop banner, SMS, in-app queue, or re-invoked agent session). Pollers
|
|
115
|
+
that remain use `inbox(since=…)` cursors: an idle tick reads zero files.
|
|
116
|
+
- **Fleet views** — `nodal` CLI (`nodal agents`, `nodal feed`) and `nodal
|
|
117
|
+
mission`, a full terminal mission control (fleet, ticket kanban, live comms).
|
|
118
|
+
|
|
119
|
+
**Contracts, not conventions** — `COORD.md` documents the wire format any
|
|
120
|
+
second implementation must honor; a bundled agent skill teaches the full
|
|
121
|
+
protocol so a new agent can join the fleet cold.
|
|
122
|
+
|
|
123
|
+
## The dogfood loop
|
|
124
|
+
|
|
125
|
+
Nodal is built by the agent fleet that runs on it — the same vault
|
|
126
|
+
coordinating its own development surfaced and fixed a thread-loading bug, an
|
|
127
|
+
unauthenticated-sender gap, and a GC regression within hours of each shipping.
|
|
128
|
+
The traces are the QA.
|
|
129
|
+
|
|
130
|
+
## Links
|
|
131
|
+
|
|
132
|
+
- Source & docs: <https://github.com/arjan-experiments/kb>
|
|
133
|
+
- Changelog: <https://github.com/arjan-experiments/kb/blob/main/CHANGELOG.md>
|
|
134
|
+
- License: Apache-2.0
|
nodalkb-1.0.0/README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img alt="Nodal" src="https://engram-site-xi.vercel.app/icon.png" width="120" height="120">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center"><strong>A local-first shared brain for AI agents.</strong><br>
|
|
6
|
+
One markdown file per fact, on your disk — written and recalled by every agent
|
|
7
|
+
on your machine over MCP, and by you with <code>grep</code>.</p>
|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
<a href="https://pypi.org/project/nodalkb/"><img alt="PyPI" src="https://img.shields.io/pypi/v/nodalkb?style=flat-square&color=177a44&label=pypi"></a>
|
|
11
|
+
<a href="https://pypi.org/project/nodalkb/"><img alt="Python" src="https://img.shields.io/pypi/pyversions/nodalkb?style=flat-square&color=14171b"></a>
|
|
12
|
+
<img alt="MCP tools" src="https://img.shields.io/badge/MCP-44_tools-6fdd8b?style=flat-square&labelColor=14171b">
|
|
13
|
+
<a href="https://github.com/arjan-experiments/kb/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/license-Apache--2.0-14171b?style=flat-square"></a>
|
|
14
|
+
</p>
|
|
15
|
+
|
|
16
|
+
One command installs uv (if missing), registers the server, plugs in the skill,
|
|
17
|
+
and starts the ping monitor:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
curl -fsSL https://engram-site-xi.vercel.app/install.sh | sh
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Or do it by hand:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
claude mcp add nodal --scope user -- uvx nodalkb@latest # Claude Code
|
|
27
|
+
uvx --from nodalkb@latest nodal setup # skill + ping monitor + auto-update
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Any other MCP client — add to its config, then restart:
|
|
31
|
+
|
|
32
|
+
```json
|
|
33
|
+
{ "mcpServers": { "nodal": { "command": "uvx", "args": ["nodalkb@latest"] } } }
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The `@latest` suffix makes a relaunch auto-pull new releases (uvx otherwise pins the
|
|
37
|
+
first-installed version). `nodal setup` auto-plugs the skill into
|
|
38
|
+
`~/.claude/skills/`, installs a desktop wake script, and runs `notifyd` as a
|
|
39
|
+
kept-alive monitor so teammate pings reach you even when you're idle. Check it
|
|
40
|
+
any time with `nodal doctor`.
|
|
41
|
+
|
|
42
|
+
## Teach your agent to remember
|
|
43
|
+
|
|
44
|
+
Copy this straight into your agent's system prompt (`CLAUDE.md`, custom
|
|
45
|
+
instructions, anywhere it reads on boot):
|
|
46
|
+
|
|
47
|
+
```text
|
|
48
|
+
You have Nodal, a local-first shared memory, available over MCP
|
|
49
|
+
(tools: add, search, recall, register_agent, send_message, inbox, ...).
|
|
50
|
+
|
|
51
|
+
Recall — at the start of any substantive task, and whenever prior context
|
|
52
|
+
would help (past decisions, projects, people, preferences), call `search`
|
|
53
|
+
with a few keywords before answering.
|
|
54
|
+
|
|
55
|
+
Capture — when you produce knowledge worth keeping (facts about the user,
|
|
56
|
+
decisions, learnings, debugging breakthroughs), call `add` without asking.
|
|
57
|
+
One atomic fact per add. Tag consistently: project:<name>, person:<name>,
|
|
58
|
+
pref, decision, learning. Set source to your client name.
|
|
59
|
+
|
|
60
|
+
Fleet (optional) — call `register_agent` once per session to join the team
|
|
61
|
+
brain under a stable name; check `inbox` for teammate messages and reply
|
|
62
|
+
with `send_message`. Never claim another agent's name.
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
That's the whole onboarding: recall before answering, capture liberally,
|
|
66
|
+
identify honestly.
|
|
67
|
+
|
|
68
|
+
## Why
|
|
69
|
+
|
|
70
|
+
Agents forget everything between sessions, and every framework wants to own
|
|
71
|
+
your data in a database you can't read. Nodal inverts both: **memory is plain
|
|
72
|
+
markdown in a folder you own** (`~/Library/Application Support/KB`, or
|
|
73
|
+
`NODAL_DIR`), with retrieval and coordination layered on top — no lock-in, no
|
|
74
|
+
cloud, greppable forever.
|
|
75
|
+
|
|
76
|
+
## What you get
|
|
77
|
+
|
|
78
|
+
**Memory** — `add` / `update` / `supersede` / `pin` facts with tags, scopes,
|
|
79
|
+
and entity links. Confidence decays unless reinforced; superseded facts keep
|
|
80
|
+
their history.
|
|
81
|
+
|
|
82
|
+
**Retrieval that's actually fast** — hybrid search (semantic ⊕ BM25 ⊕ entity)
|
|
83
|
+
fused with reciprocal-rank fusion and MMR diversification: ~140 ms warm on a
|
|
84
|
+
9k-fact vault. `recall` is the "ask your memory" verb.
|
|
85
|
+
|
|
86
|
+
**A real multi-agent layer** — multiple agents (Claude Code sessions, apps,
|
|
87
|
+
workers) share one vault as a team brain:
|
|
88
|
+
|
|
89
|
+
- **Authenticated identity (TOFU)** — first `register_agent` mints a per-name
|
|
90
|
+
token (sha256-only at rest); every message is stamped `via:` with the
|
|
91
|
+
authenticated sender, and presence shows *verified* only when proven.
|
|
92
|
+
Impersonation can't hide.
|
|
93
|
+
- **Messaging, handoffs, rooms** — DMs, broadcasts, threads, ticket-style
|
|
94
|
+
handoffs with read/done status, membership-gated rooms.
|
|
95
|
+
- **Wake on send, not polling** — a message fires the recipient's alarm
|
|
96
|
+
(desktop banner, SMS, in-app queue, or re-invoked agent session). Pollers
|
|
97
|
+
that remain use `inbox(since=…)` cursors: an idle tick reads zero files.
|
|
98
|
+
- **Fleet views** — `nodal` CLI (`nodal agents`, `nodal feed`) and `nodal
|
|
99
|
+
mission`, a full terminal mission control (fleet, ticket kanban, live comms).
|
|
100
|
+
|
|
101
|
+
**Contracts, not conventions** — `COORD.md` documents the wire format any
|
|
102
|
+
second implementation must honor; a bundled agent skill teaches the full
|
|
103
|
+
protocol so a new agent can join the fleet cold.
|
|
104
|
+
|
|
105
|
+
## The dogfood loop
|
|
106
|
+
|
|
107
|
+
Nodal is built by the agent fleet that runs on it — the same vault
|
|
108
|
+
coordinating its own development surfaced and fixed a thread-loading bug, an
|
|
109
|
+
unauthenticated-sender gap, and a GC regression within hours of each shipping.
|
|
110
|
+
The traces are the QA.
|
|
111
|
+
|
|
112
|
+
## Links
|
|
113
|
+
|
|
114
|
+
- Source & docs: <https://github.com/arjan-experiments/kb>
|
|
115
|
+
- Changelog: <https://github.com/arjan-experiments/kb/blob/main/CHANGELOG.md>
|
|
116
|
+
- License: Apache-2.0
|
nodalkb-1.0.0/backfill.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""One-time backfill: embed the vault and build the FastKB index.
|
|
2
|
+
|
|
3
|
+
Root cause of ticket #1: add() embedded a query vector to FIND neighbors but
|
|
4
|
+
never PERSISTED the new fact's own vector, so the index only ever grew via the
|
|
5
|
+
UI's transformers.js path (16 of 43.7k facts). This rebuilds from scratch by
|
|
6
|
+
embedding every fact with the server's model (BAAI/bge-small-en-v1.5).
|
|
7
|
+
|
|
8
|
+
Two phases, deliberately separated:
|
|
9
|
+
PHASE 1 — embed + append (streaming, resumable, O(N)). Vectors are appended to
|
|
10
|
+
vectors.f32 and metadata to manifest.jsonl in batches. A kill/crash keeps
|
|
11
|
+
progress; a re-run truncates vectors.f32 to the manifest row count (repairing
|
|
12
|
+
any half-written batch) and continues. NO hnsw here — building it per-batch
|
|
13
|
+
is what made the old version go O(N^2) (re-mmap + re-save the growing index
|
|
14
|
+
every batch, rate collapsing 72->11/s).
|
|
15
|
+
PHASE 2 — build hnsw.bin ONCE from the full vectors.f32 (a few seconds). It is
|
|
16
|
+
fully rebuildable from vectors+manifest, so it is fine to drop and redo.
|
|
17
|
+
|
|
18
|
+
TEXT_CAP: bge-small only consumes the first 512 tokens, so we truncate each fact
|
|
19
|
+
before embedding — near-lossless for the model (the char cap approximates its 512-token window), but huge scraped raw/ pages no longer
|
|
20
|
+
cost seconds each to tokenize.
|
|
21
|
+
|
|
22
|
+
Usage:
|
|
23
|
+
KB_DIR="$HOME/Library/Application Support/KB" \
|
|
24
|
+
uv run --project server python server/backfill.py [--limit N]
|
|
25
|
+
[--dirs knowledge,events,...] [--fresh] [--batch 512]
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
import argparse
|
|
29
|
+
import json
|
|
30
|
+
import sys
|
|
31
|
+
import time
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
|
|
34
|
+
import numpy as np
|
|
35
|
+
|
|
36
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
37
|
+
import server # noqa
|
|
38
|
+
import fastkb # noqa
|
|
39
|
+
|
|
40
|
+
TEXT_CAP = 2000 # chars; ~512 tokens — the model truncates past this anyway


def fact_text(f: dict) -> str:
    """Build the string that gets embedded for one fact.

    The fact's ``content`` is followed by a single space and its tags joined
    with spaces (a missing or empty ``tags`` entry contributes nothing after
    the space). The result is cut to at most ``TEXT_CAP`` characters.
    """
    body = f["content"]
    tag_words = f.get("tags") or []
    combined = body + " " + " ".join(tag_words)
    # Cap the length so oversized facts stay cheap to tokenize.
    return combined[:TEXT_CAP]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _read_manifest(fk) -> list[dict]:
|
|
48
|
+
if not fk.manifest_path.exists():
|
|
49
|
+
return []
|
|
50
|
+
out = []
|
|
51
|
+
with fk.manifest_path.open() as f:
|
|
52
|
+
for line in f:
|
|
53
|
+
line = line.strip()
|
|
54
|
+
if line:
|
|
55
|
+
out.append(json.loads(line))
|
|
56
|
+
return out
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def main():
    """Rebuild the FastKB vector index for the vault in two phases.

    Command-line flags (read from ``sys.argv`` by argparse):
      --limit N    keep only the first N walked facts (0 = no limit)
      --dirs a,b   keep only facts whose ``dir`` is in this comma-separated list
      --batch N    facts embedded and appended per batch (must be >= 1)
      --fresh      empty vectors.f32 / manifest.jsonl and delete hnsw.bin first

    Phase 1 embeds every walked fact whose id is not yet in the manifest and
    appends its float32 vector and metadata row, flushing after each batch.
    Phase 2 rebuilds the hnsw index once from the full vector store.

    Raises:
        SystemExit: when ``--batch`` is below 1 (argparse usage error), or
            when vectors.f32 holds fewer rows than the manifest lists.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=0)
    ap.add_argument("--dirs", type=str, default="")
    ap.add_argument("--batch", type=int, default=512)
    ap.add_argument("--fresh", action="store_true")
    args = ap.parse_args()
    if args.batch < 1:
        # range() rejects a zero step, and a negative step would silently
        # skip phase 1 altogether — fail fast with a usage error instead.
        ap.error("--batch must be >= 1")
    allow = set(d.strip() for d in args.dirs.split(",") if d.strip()) or None

    print(f"== backfill KB={server.KB} cap={TEXT_CAP} batch={args.batch} ==", flush=True)
    t0 = time.perf_counter()
    facts = [f for f in server._iter_facts() if (allow is None or f.get("dir") in allow)]
    if args.limit:
        facts = facts[: args.limit]
    print(f"walked {len(facts)} facts in {time.perf_counter()-t0:.1f}s", flush=True)

    fk = fastkb.FastKB(server.KB)
    dim = fk.dim
    if args.fresh:
        fk.vec_path.write_bytes(b"")
        fk.manifest_path.write_text("")
        if fk.hnsw_path.exists():
            fk.hnsw_path.unlink()

    # ---- resume: manifest is source of truth; repair vectors.f32 to match ----
    manifest = _read_manifest(fk)
    m_rows = len(manifest)
    good_bytes = m_rows * dim * 4  # 4 bytes per float32 component
    have_bytes = fk.vec_path.stat().st_size if fk.vec_path.exists() else 0
    if have_bytes < good_bytes:
        # truncate() would GROW the file here (typically zero-filled), and a
        # missing file would make appended rows land at the wrong offsets.
        # Either way the index would be silently wrong, so refuse to go on.
        raise SystemExit(
            f"vectors.f32 holds {have_bytes // (dim * 4)} rows but manifest.jsonl "
            f"lists {m_rows}; the vector store is incomplete — re-run with --fresh"
        )
    if have_bytes > good_bytes:
        # Extra bytes are a batch whose manifest rows never reached disk.
        with open(fk.vec_path, "r+b") as vf:
            vf.truncate(good_bytes)
        print(f"repaired vectors.f32 to {m_rows} rows", flush=True)
    done = {m["id"] for m in manifest}
    todo = [f for f in facts if f["id"] not in done]
    print(f"phase1: indexed={m_rows} todo={len(todo)}", flush=True)

    # ---- phase 1: embed + append ----
    if todo:
        # Imported lazily so a run with nothing to embed never loads the model.
        from fastembed import TextEmbedding
        emb = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
        row = m_rows
        n_done = 0
        t_emb = time.perf_counter()
        with open(fk.vec_path, "ab") as vf, fk.manifest_path.open("a") as mf:
            for i in range(0, len(todo), args.batch):
                batch = todo[i : i + args.batch]
                vecs = np.asarray(
                    list(emb.embed([fact_text(f) for f in batch], batch_size=args.batch)),
                    dtype=np.float32,
                )
                # Vectors are flushed before their manifest rows, so after a
                # crash the vector file can only be ahead of the manifest —
                # exactly the case the resume step above truncates away.
                vf.write(np.ascontiguousarray(vecs).tobytes())
                vf.flush()
                for f in batch:
                    mf.write(json.dumps(fastkb.FastKB._meta_from_fact(f, row)) + "\n")
                    row += 1
                mf.flush()
                n_done += len(batch)
                rate = n_done / (time.perf_counter() - t_emb)
                print(f" embedded {n_done}/{len(todo)} ({rate:.0f}/s, "
                      f"{(len(todo)-n_done)/max(rate,1):.0f}s left)", flush=True)

    # ---- phase 2: build hnsw once from the full vector store ----
    manifest = _read_manifest(fk)
    n = len(manifest)
    if n == 0:
        # An empty (or absent) vectors.f32 cannot be memory-mapped, so there
        # is nothing to build; stop cleanly instead of crashing below.
        print("phase2: nothing indexed, skipping hnsw build", flush=True)
        return
    print("phase2: building hnsw index...", flush=True)
    t_idx = time.perf_counter()
    import hnswlib
    vectors = np.memmap(fk.vec_path, dtype=np.float32, mode="r", shape=(n, dim))
    idx = hnswlib.Index(space="cosine", dim=dim)
    # Capacity is allocated with 8192 slots beyond the current row count.
    idx.init_index(max_elements=n + 8192,
                   ef_construction=fastkb.HNSW_EF_CONSTRUCTION, M=fastkb.HNSW_M)
    idx.add_items(np.asarray(vectors), np.arange(n))
    idx.set_ef(fastkb.HNSW_EF_QUERY)
    idx.save_index(str(fk.hnsw_path))
    print(f"phase2: hnsw built ({n} elements) in {time.perf_counter()-t_idx:.1f}s", flush=True)

    # ---- summary: rows per vault directory and artifact sizes ----
    by_dir: dict[str, int] = {}
    for m in manifest:
        by_dir[m["dir"]] = by_dir.get(m["dir"], 0) + 1
    print(f"\nDONE: {n} vectors in {time.perf_counter()-t0:.1f}s total")
    print(f"by dir: {by_dir}")
    print(f"artifacts: vectors.f32={fk.vec_path.stat().st_size//1024}KB "
          f"manifest={fk.manifest_path.stat().st_size//1024}KB "
          f"hnsw={fk.hnsw_path.stat().st_size//1024}KB")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|