alvin-bot 4.19.2 → 4.20.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/bin/cli.js +47 -1
- package/dist/index.js +11 -0
- package/dist/paths.js +4 -1
- package/dist/services/embeddings-migration.js +193 -0
- package/dist/services/embeddings.js +260 -164
- package/package.json +4 -2
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,35 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [4.20.1] β 2026-05-03
|
|
6
|
+
|
|
7
|
+
### π‘οΈ Hardening for the v4.20.0 SQLite migration
|
|
8
|
+
|
|
9
|
+
The v4.20 migration is fully automatic on first start, but a few things could go wrong on user installations that the maintainer instance never hits. v4.20.1 plugs each of them.
|
|
10
|
+
|
|
11
|
+
- **Lazy native binary load.** `better-sqlite3` is now `require()`-d inside `embeddings.ts`, not at module import time. If the prebuilt isn't available for the user's platform and a build-from-source fails (exotic Node version, missing toolchain, glibc mismatch), the bot logs a single clear warning with the exact rebuild command, and **keeps running** β only semantic memory search is disabled until the user fixes their install. Previously this would have crashed bot startup.
|
|
12
|
+
- **Pre-flight disk-space check.** Migration refuses to start unless the volume holding `~/.alvin-bot/memory/` has at least 2× the source JSON's size free (covers source + target + WAL during the transaction). Skipped migration leaves the JSON intact for retry on the next boot once space is free.
|
|
13
|
+
- **Progress logging.** On indexes larger than ~5 000 entries, the migration logs `β¦migrated N / M entries (P %)` every 5 000 rows so the user can see it isn't stuck.
|
|
14
|
+
- **Corrupt JSON recovery.** If `JSON.parse` of `.embeddings.json` throws, the file is moved aside to `.embeddings.json.broken.<timestamp>` and the next bot start treats this as a fresh install (rebuild-from-source on first search). No more boot-loop on a damaged index.
|
|
15
|
+
- **`alvin-bot doctor` shows memory health.** New "Memory:" section reports: native binary loadable, vector-store entry count + size, or β for not-yet-migrated installs β the legacy JSON's size and a hint that the next start will migrate.
|
|
16
|
+
- **Cleanup on failed migration.** WAL/SHM sidecars are removed alongside the half-written `.embeddings.db` so the next attempt starts from a clean slate.
|
|
17
|
+
|
|
18
|
+
No schema or API changes β drop-in over v4.20.0.
|
|
19
|
+
|
|
20
|
+
## [4.20.0] β 2026-05-03
|
|
21
|
+
|
|
22
|
+
### π Embeddings: JSON → SQLite
|
|
23
|
+
|
|
24
|
+
**Why.** The vector index `~/.alvin-bot/memory/.embeddings.json` had grown to **146 MB**. Every bot start parsed the whole file (slow boot, large heap), and every reindex iteration rewrote the entire 146 MB blob to disk. With ~3 800 entries the corpus is still small enough that linear-scan cosine similarity is fine, but the JSON serialisation overhead and per-write full-file rewrite were the real cost.
|
|
25
|
+
|
|
26
|
+
**Change.** New SQLite-backed store at `~/.alvin-bot/memory/.embeddings.db` (table `entries(id, source, text, vector BLOB, indexed_at)` + index on `source`). Vectors live as raw `Float32Array` BLOBs (4 B × 3072 dims = 12 KB each) instead of JSON-encoded Float64 arrays (≈ 24 KB each). Reindexing is per-chunk INSERT/UPDATE inside a single transaction — no full-file rewrite. WAL mode + 256 MB mmap, `synchronous = NORMAL`.
|
|
27
|
+
|
|
28
|
+
**Migration.** `src/services/embeddings-migration.ts` runs once on boot if `.embeddings.json` exists but `.embeddings.db` does not. Source JSON is renamed to `.embeddings.json.bak-pre-sqlite` after a successful entry-count match (idempotent, safe to re-run). On the maintainer's instance: 146 MB → 49 MB, 3 799 entries copied in 660 ms.
|
|
29
|
+
|
|
30
|
+
**Files touched.** `src/paths.ts` (new `EMBEDDINGS_DB`), `src/services/embeddings.ts` (full rewrite, drop-in same public surface), `src/services/embeddings-migration.ts` (new), `src/index.ts` (boot hook), `package.json` (deps `better-sqlite3@^12`, `@types/better-sqlite3` dev). Public API unchanged: `searchMemory`, `reindexMemory`, `initEmbeddings`, `getIndexStats` keep their signatures so callers in `engine.ts`, `web-server.ts` etc. don't change.
|
|
31
|
+
|
|
32
|
+
**Wins.** ~66 % smaller on disk. Bot boot no longer parses a 146 MB JSON. Reindex of a single file is O(log n) DELETE-by-source + transactional INSERTs instead of `JSON.stringify` + `writeFileSync` of the whole index.
|
|
33
|
+
|
|
5
34
|
## [4.19.2] β 2026-04-24
|
|
6
35
|
|
|
7
36
|
### π Fix: workspace switch produced "(no response)" format-kaskade; added empty-stream diagnostics
|
package/bin/cli.js
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
19
|
import { createInterface } from "readline";
|
|
20
|
-
import { existsSync, writeFileSync, readFileSync, mkdirSync, copyFileSync, readdirSync } from "fs";
|
|
20
|
+
import { existsSync, writeFileSync, readFileSync, mkdirSync, copyFileSync, readdirSync, statSync } from "fs";
|
|
21
21
|
import { resolve, join } from "path";
|
|
22
22
|
import { homedir } from "os";
|
|
23
23
|
import { execSync } from "child_process";
|
|
@@ -1361,6 +1361,52 @@ async function doctor() {
|
|
|
1361
1361
|
console.log(` β ALLOWED_USERS not set (nobody can message the bot)`);
|
|
1362
1362
|
}
|
|
1363
1363
|
|
|
1364
|
+
// ββ Memory (semantic search backend) ββ
|
|
1365
|
+
console.log("\n Memory:");
|
|
1366
|
+
const embJson = resolve(DATA_DIR, "memory", ".embeddings.json");
|
|
1367
|
+
const embDb = resolve(DATA_DIR, "memory", ".embeddings.db");
|
|
1368
|
+
const embBakSqlite = resolve(DATA_DIR, "memory", ".embeddings.json.bak-pre-sqlite");
|
|
1369
|
+
|
|
1370
|
+
// better-sqlite3 native binary loadable?
|
|
1371
|
+
let sqliteOk = false;
|
|
1372
|
+
let sqliteErr = "";
|
|
1373
|
+
try {
|
|
1374
|
+
const req = (await import("module")).createRequire(import.meta.url);
|
|
1375
|
+
req("better-sqlite3");
|
|
1376
|
+
sqliteOk = true;
|
|
1377
|
+
} catch (err) {
|
|
1378
|
+
sqliteErr = err instanceof Error ? err.message : String(err);
|
|
1379
|
+
}
|
|
1380
|
+
if (sqliteOk) {
|
|
1381
|
+
console.log(` β
better-sqlite3 native binary loadable`);
|
|
1382
|
+
} else {
|
|
1383
|
+
console.log(` β better-sqlite3 native binary not loadable β semantic search disabled`);
|
|
1384
|
+
console.log(` Fix: cd $(npm root -g)/alvin-bot && npm rebuild better-sqlite3`);
|
|
1385
|
+
console.log(` Detail: ${sqliteErr.split("\n")[0]}`);
|
|
1386
|
+
}
|
|
1387
|
+
|
|
1388
|
+
if (sqliteOk && existsSync(embDb)) {
|
|
1389
|
+
try {
|
|
1390
|
+
const req = (await import("module")).createRequire(import.meta.url);
|
|
1391
|
+
const Database = req("better-sqlite3");
|
|
1392
|
+
const db = new Database(embDb, { readonly: true });
|
|
1393
|
+
const entries = db.prepare("SELECT COUNT(*) AS c FROM entries").get().c;
|
|
1394
|
+
const files = db.prepare("SELECT COUNT(*) AS c FROM file_mtimes").get().c;
|
|
1395
|
+
const sizeMb = (statSync(embDb).size / 1024 / 1024).toFixed(0);
|
|
1396
|
+
db.close();
|
|
1397
|
+
console.log(` β
Vector store: ${entries} entries across ${files} sources (${sizeMb} MB SQLite)`);
|
|
1398
|
+
} catch (err) {
|
|
1399
|
+
console.log(` β οΈ Vector store exists but unreadable: ${err.message}`);
|
|
1400
|
+
}
|
|
1401
|
+
} else if (existsSync(embJson)) {
|
|
1402
|
+
const sizeMb = (statSync(embJson).size / 1024 / 1024).toFixed(0);
|
|
1403
|
+
console.log(` β οΈ Legacy JSON index found (${sizeMb} MB) β will auto-migrate to SQLite on next bot start`);
|
|
1404
|
+
} else if (existsSync(embBakSqlite)) {
|
|
1405
|
+
console.log(` β
Migration to SQLite already done (legacy JSON kept as .bak-pre-sqlite)`);
|
|
1406
|
+
} else {
|
|
1407
|
+
console.log(` βΉοΈ No vector store yet β will be built on first message`);
|
|
1408
|
+
}
|
|
1409
|
+
|
|
1364
1410
|
// ββ Extras ββ
|
|
1365
1411
|
console.log("\n Extras:");
|
|
1366
1412
|
|
package/dist/index.js
CHANGED
|
@@ -20,6 +20,17 @@ if (hasLegacyData()) {
|
|
|
20
20
|
}
|
|
21
21
|
// 3. Seed defaults for any files that don't exist yet (fresh install)
|
|
22
22
|
seedDefaults();
|
|
23
|
+
// 3b. v4.20 β One-shot migration of legacy .embeddings.json β SQLite (.embeddings.db).
|
|
24
|
+
// Idempotent and safe: source JSON is renamed to .bak-pre-sqlite after success.
|
|
25
|
+
import { shouldMigrateEmbeddingsToSqlite, migrateEmbeddingsToSqlite } from "./services/embeddings-migration.js";
|
|
26
|
+
if (shouldMigrateEmbeddingsToSqlite()) {
|
|
27
|
+
try {
|
|
28
|
+
migrateEmbeddingsToSqlite();
|
|
29
|
+
}
|
|
30
|
+
catch (err) {
|
|
31
|
+
console.error("β Embeddings migration failed β bot will continue with empty SQLite store, JSON kept:", err);
|
|
32
|
+
}
|
|
33
|
+
}
|
|
23
34
|
// 3a. v4.12.2 β Audit + repair permissions on sensitive files. On multi-user
|
|
24
35
|
// systems, files written pre-v4.12.2 may have 0o644 / 0o666 mode β i.e.
|
|
25
36
|
// readable by other users on the same machine. This routine chmod-repairs
|
package/dist/paths.js
CHANGED
|
@@ -55,8 +55,11 @@ export const PROJECTS_MEMORY_DIR = resolve(DATA_DIR, "memory", "projects");
|
|
|
55
55
|
* name, purpose, cwd, color, emoji, and an optional system prompt body.
|
|
56
56
|
* See src/services/workspaces.ts for the loader and matcher. */
|
|
57
57
|
export const WORKSPACES_DIR = resolve(DATA_DIR, "workspaces");
|
|
58
|
-
/** memory/.embeddings.json β
|
|
58
|
+
/** memory/.embeddings.json β Legacy JSON vector index. Read on first SQLite migration only;
|
|
59
|
+
* active code path is EMBEDDINGS_DB. */
|
|
59
60
|
export const EMBEDDINGS_IDX = resolve(DATA_DIR, "memory", ".embeddings.json");
|
|
61
|
+
/** memory/.embeddings.db β SQLite vector store (replaces .embeddings.json since v4.20). */
|
|
62
|
+
export const EMBEDDINGS_DB = resolve(DATA_DIR, "memory", ".embeddings.db");
|
|
60
63
|
/** users/ β User profiles and per-user memory */
|
|
61
64
|
export const USERS_DIR = resolve(DATA_DIR, "users");
|
|
62
65
|
/** data/ β Runtime control data */
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* One-shot migration from legacy .embeddings.json β SQLite .embeddings.db.
|
|
3
|
+
*
|
|
4
|
+
* Triggered on startup if .embeddings.json exists but .embeddings.db does not.
|
|
5
|
+
* Idempotent: skips silently if the DB is already populated.
|
|
6
|
+
*
|
|
7
|
+
* Hardening (v4.20.1):
|
|
8
|
+
* - Lazy require of better-sqlite3 β missing native binary degrades to a clear
|
|
9
|
+
 * warning + skip (bot keeps running with semantic search disabled until
|
|
10
|
+
* the user fixes their install).
|
|
11
|
+
* - Pre-flight disk-space check: refuses to start if free space < 2Γ source.
|
|
12
|
+
 * - Progress logging every 5 000 entries on large indexes.
|
|
13
|
+
* - Corrupt source JSON is renamed to `.broken.<timestamp>` so the next run
|
|
14
|
+
* doesn't loop on the same parse error.
|
|
15
|
+
*
|
|
16
|
+
* Safety:
|
|
17
|
+
* - Source JSON is renamed to .embeddings.json.bak-pre-sqlite (kept on disk).
|
|
18
|
+
* - Entry counts are compared after import; mismatch β throw, leaving the
|
|
19
|
+
* half-written DB removed and the source JSON untouched.
|
|
20
|
+
*/
|
|
21
|
+
import fs from "fs";
|
|
22
|
+
import path from "path";
|
|
23
|
+
import { createRequire } from "module";
|
|
24
|
+
import { EMBEDDINGS_IDX, EMBEDDINGS_DB } from "../paths.js";
|
|
25
|
+
const cjsRequire = createRequire(import.meta.url);
|
|
26
|
+
/** Pack a numeric vector into a Buffer of little-endian Float32 values. */
function vectorToBlob(v) {
    // Re-encode (possibly Float64) numbers as Float32, then wrap the same
    // underlying bytes in a Buffer without copying a second time.
    const packed = Float32Array.from(v);
    return Buffer.from(packed.buffer, packed.byteOffset, packed.byteLength);
}
|
|
30
|
+
/** True when the legacy JSON index exists and no SQLite store does yet. */
export function shouldMigrateEmbeddingsToSqlite() {
    const hasLegacyJson = fs.existsSync(EMBEDDINGS_IDX);
    const hasSqliteStore = fs.existsSync(EMBEDDINGS_DB);
    return hasLegacyJson && !hasSqliteStore;
}
|
|
33
|
+
/**
|
|
34
|
+
* Best-effort free-space probe. Returns Infinity if the platform has no
|
|
35
|
+
* statfs (which means we'll proceed without the safety check rather than
|
|
36
|
+
* blocking the migration). Node 18.15+ ships statfsSync on all major platforms.
|
|
37
|
+
*/
|
|
38
|
+
/**
 * Best-effort free-space probe for the volume containing `forPath`.
 * Returns Infinity when statfs is unavailable or fails, so the caller
 * proceeds without the safety check instead of blocking the migration.
 * (Node 18.15+ ships statfsSync on all major platforms.)
 */
function freeBytesOnVolume(forPath) {
    try {
        // Older Node builds lack statfsSync entirely; treat as "unknown".
        if (typeof fs.statfsSync !== "function") {
            return Number.POSITIVE_INFINITY;
        }
        const { bavail, bsize } = fs.statfsSync(forPath);
        // Fields may be BigInt depending on platform/Node version.
        const availBlocks = typeof bavail === "bigint" ? Number(bavail) : bavail;
        const blockSize = typeof bsize === "bigint" ? Number(bsize) : bsize;
        return availBlocks * blockSize;
    }
    catch {
        return Number.POSITIVE_INFINITY;
    }
}
|
|
52
|
+
/**
|
|
53
|
+
* Run the migration. Returns the entry count migrated, or null if skipped.
|
|
54
|
+
*/
|
|
55
|
+
/**
 * Run the one-shot JSON -> SQLite migration.
 *
 * Flow: pre-flight checks (native module loadable, enough free disk) ->
 * parse source JSON -> bulk-insert into a fresh DB -> verify row count ->
 * rename the JSON aside.
 *
 * Returns a stats object ({ entries, sourceMb, targetMb }) on success,
 * null when skipped (nothing to do / pre-flight failed / corrupt source),
 * and throws after cleanup when the DB write itself failed.
 */
export function migrateEmbeddingsToSqlite() {
    // Re-check the trigger so the function is safe to call unconditionally.
    if (!shouldMigrateEmbeddingsToSqlite())
        return null;
    // -- Pre-flight: better-sqlite3 loadable? --------------------------------
    // Lazy require: a missing/unbuildable native binary must not crash boot.
    let Database;
    try {
        Database = cjsRequire("better-sqlite3");
    }
    catch (err) {
        console.warn("β οΈ Embeddings migration skipped: better-sqlite3 native binary unavailable. " +
            "Bot continues with legacy JSON index. Fix: `npm rebuild better-sqlite3` " +
            "or reinstall alvin-bot. Underlying error:", err instanceof Error ? err.message : err);
        return null;
    }
    const sourceSize = fs.statSync(EMBEDDINGS_IDX).size;
    // -- Pre-flight: enough free space? --------------------------------------
    const targetDir = path.dirname(EMBEDDINGS_DB);
    fs.mkdirSync(targetDir, { recursive: true });
    const free = freeBytesOnVolume(targetDir);
    // We need source + about half of source for the SQLite file, plus headroom
    // for WAL during the transaction. Demand 2x source size to be comfortable.
    const required = sourceSize * 2;
    if (free < required) {
        console.warn(`β οΈ Embeddings migration skipped: insufficient free disk space on ${targetDir}. ` +
            `Need ~${(required / 1024 / 1024).toFixed(0)} MB, have ${(free / 1024 / 1024).toFixed(0)} MB. ` +
            `Free up some space and restart the bot to retry.`);
        return null;
    }
    // -- Read & parse source -------------------------------------------------
    const t0 = Date.now();
    console.log(`π¦ Migrating embeddings JSON (${(sourceSize / 1024 / 1024).toFixed(0)} MB) β SQLite...`);
    const raw = fs.readFileSync(EMBEDDINGS_IDX, "utf-8");
    let legacy;
    try {
        legacy = JSON.parse(raw);
    }
    catch (err) {
        // Move the broken JSON aside so we don't try to migrate it again next boot.
        const broken = `${EMBEDDINGS_IDX}.broken.${Date.now()}`;
        try {
            fs.renameSync(EMBEDDINGS_IDX, broken);
            console.error(`β Embeddings migration: source JSON is corrupt β renamed to ${path.basename(broken)} ` +
                `and skipped. The bot will rebuild the index from scratch on first search ` +
                `(this may incur Google API calls). Underlying parse error:`, err);
        }
        catch (renameErr) {
            console.error("β Embeddings migration: source JSON is corrupt AND could not be renamed:", err, "Rename error:", renameErr);
        }
        return null;
    }
    // Entries without a non-empty vector can never match a search; drop them.
    const validEntries = (legacy.entries ?? []).filter(e => Array.isArray(e.vector) && e.vector.length > 0);
    // -- Write DB ------------------------------------------------------------
    const db = new Database(EMBEDDINGS_DB);
    try {
        // WAL + relaxed fsync: big throughput win for the bulk insert.
        db.pragma("journal_mode = WAL");
        db.pragma("synchronous = NORMAL");
        // Same schema that embeddings.js creates; IF NOT EXISTS keeps it idempotent.
        db.exec(`
      CREATE TABLE IF NOT EXISTS meta (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
      );
      CREATE TABLE IF NOT EXISTS file_mtimes (
        source TEXT PRIMARY KEY,
        mtime_ms REAL NOT NULL
      );
      CREATE TABLE IF NOT EXISTS entries (
        id TEXT PRIMARY KEY,
        source TEXT NOT NULL,
        text TEXT NOT NULL,
        vector BLOB NOT NULL,
        indexed_at INTEGER NOT NULL
      );
      CREATE INDEX IF NOT EXISTS idx_entries_source ON entries(source);
    `);
        // Carry legacy metadata over; upsert so re-runs don't fail on conflicts.
        const setMeta = db.prepare("INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value");
        setMeta.run("model", legacy.model || "gemini-embedding-001");
        setMeta.run("schemaVersion", "1");
        setMeta.run("lastReindex", String(legacy.lastReindex || 0));
        setMeta.run("migratedFromJson", String(Date.now()));
        const insMtime = db.prepare("INSERT INTO file_mtimes (source, mtime_ms) VALUES (?, ?) ON CONFLICT(source) DO UPDATE SET mtime_ms = excluded.mtime_ms");
        const writeMtimes = db.transaction((rows) => {
            for (const [s, m] of rows)
                insMtime.run(s, m);
        });
        writeMtimes(Object.entries(legacy.fileMtimes ?? {}));
        const insEntry = db.prepare("INSERT INTO entries (id, source, text, vector, indexed_at) VALUES (?, ?, ?, ?, ?)");
        // Write entries in chunks of 1 000 so we can log progress on huge indexes.
        const CHUNK = 1000;
        const total = validEntries.length;
        let written = 0;
        const writeChunk = db.transaction((rows) => {
            for (const e of rows) {
                // Vectors are stored as raw Float32 BLOBs (half the JSON footprint).
                insEntry.run(e.id, e.source, e.text, vectorToBlob(e.vector), e.indexedAt);
            }
        });
        for (let i = 0; i < total; i += CHUNK) {
            const slice = validEntries.slice(i, i + CHUNK);
            writeChunk(slice);
            written += slice.length;
            // Progress line every 5 000 rows (and at the end) on big indexes only.
            if (total > 5000 && (written === total || written % 5000 === 0)) {
                console.log(` β¦migrated ${written} / ${total} entries (${Math.round((written / total) * 100)} %)`);
            }
        }
        // Verify: DB row count must match exactly what we intended to write.
        const writtenCount = db.prepare("SELECT COUNT(*) AS c FROM entries").get().c;
        if (writtenCount !== validEntries.length) {
            throw new Error(`Entry-count mismatch after migration: expected ${validEntries.length}, got ${writtenCount}`);
        }
        db.close();
        // -- Move source JSON aside so we never re-migrate -----------------------
        const bak = `${EMBEDDINGS_IDX}.bak-pre-sqlite`;
        try {
            fs.renameSync(EMBEDDINGS_IDX, bak);
        }
        catch (err) {
            // Best-effort: a failed rename means we may retry needlessly, but the
            // migrated DB is already complete and valid.
            console.warn("β οΈ Could not rename source JSON (migration still succeeded):", err);
        }
        const targetSize = fs.statSync(EMBEDDINGS_DB).size;
        const dt = Date.now() - t0;
        console.log(`β
Embeddings migrated: ${writtenCount} entries, ${(sourceSize / 1024 / 1024).toFixed(0)} MB JSON β ${(targetSize / 1024 / 1024).toFixed(0)} MB SQLite in ${dt} ms`);
        return { entries: writtenCount, sourceMb: sourceSize / 1024 / 1024, targetMb: targetSize / 1024 / 1024 };
    }
    catch (err) {
        db.close();
        // Remove half-written DB so the next boot retries cleanly with the original JSON intact.
        try {
            fs.unlinkSync(EMBEDDINGS_DB);
            // also unlink WAL/SHM if present
            for (const ext of ["-wal", "-shm"]) {
                const p = `${EMBEDDINGS_DB}${ext}`;
                if (fs.existsSync(p))
                    fs.unlinkSync(p);
            }
        }
        catch {
            /* nothing to clean */
        }
        throw err;
    }
}
|
|
@@ -1,31 +1,159 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Embeddings Service β Vector-based semantic memory search.
|
|
3
3
|
*
|
|
4
|
-
* Uses Google's
|
|
5
|
-
* Stores embeddings in a
|
|
4
|
+
* Uses Google's gemini-embedding-001 model for generating embeddings.
|
|
5
|
+
* Stores embeddings in a SQLite database (.embeddings.db) β replaces the
|
|
6
|
+
* older .embeddings.json index since v4.20. The migration runs once
|
|
7
|
+
 * automatically on startup (see src/services/embeddings-migration.ts).
|
|
6
8
|
*
|
|
7
9
|
* Architecture:
|
|
8
|
-
* - Each memory entry (paragraph/section) gets
|
|
9
|
-
* - Vectors are stored
|
|
10
|
-
*
|
|
11
|
-
* -
|
|
10
|
+
* - Each memory entry (paragraph/section) gets a 3072-dim Float32 vector.
|
|
11
|
+
* - Vectors are stored as raw BLOB (4 bytes Γ 3072 = 12 KB each) instead of
|
|
12
|
+
* JSON-encoded Float64 arrays (~24 KB each) β halves disk footprint.
|
|
13
|
+
* - Cosine similarity runs in-memory: SQLite has no native vector ops, but
|
|
14
|
+
* reading the BLOBs is mmap-cheap and JS does the dot product fast enough
|
|
15
|
+
* for the current corpus (a few thousand entries).
|
|
16
|
+
* - Reindexing is per-chunk INSERT/UPDATE β no full-file rewrite.
|
|
12
17
|
*/
|
|
13
18
|
import fs from "fs";
|
|
14
19
|
import path from "path";
|
|
15
20
|
import { resolve } from "path";
|
|
16
|
-
import { config } from "../config.js";
|
|
17
21
|
import os from "os";
|
|
18
|
-
import {
|
|
22
|
+
import { createRequire } from "module";
|
|
23
|
+
import { config } from "../config.js";
|
|
24
|
+
import { MEMORY_DIR, MEMORY_FILE, EMBEDDINGS_DB } from "../paths.js";
|
|
19
25
|
import { ASSETS_DIR, ASSETS_INDEX_MD } from "../paths.js";
|
|
26
|
+
// Lazily-loaded better-sqlite3 constructor; stays null until loadSqlite() succeeds.
let SqliteClass = null;
// Guards against repeated require() attempts (and repeated warning spam).
let sqliteLoadAttempted = false;
// Last load failure, surfaced to callers via getEmbeddingsBackendStatus().
let sqliteLoadError = null;
// ESM has no require(); createRequire gives us one for the CJS native module.
const cjsRequire = createRequire(import.meta.url);
/**
 * Attempt to load better-sqlite3 exactly once per process.
 * Returns the Database constructor, or null when the native binary is
 * unavailable — in that case a single warning with the rebuild command is
 * logged and the bot keeps running with semantic search disabled.
 */
function loadSqlite() {
    if (sqliteLoadAttempted)
        return SqliteClass;
    sqliteLoadAttempted = true;
    try {
        SqliteClass = cjsRequire("better-sqlite3");
        return SqliteClass;
    }
    catch (err) {
        // Normalize non-Error throwables so .message is always safe below.
        sqliteLoadError = err instanceof Error ? err : new Error(String(err));
        console.warn("β οΈ better-sqlite3 native binary unavailable β embeddings disabled. " +
            "Bot continues without semantic memory search. Fix: rebuild deps with " +
            "`cd $(npm root -g)/alvin-bot && npm rebuild better-sqlite3` or reinstall " +
            "alvin-bot. Underlying error: " +
            sqliteLoadError.message);
        return null;
    }
}
|
|
48
|
+
/** Report whether the SQLite backend is usable, plus the load error if not. */
export function getEmbeddingsBackendStatus() {
    loadSqlite(); // ensure a load attempt happened (memoized, logs at most once)
    const available = SqliteClass !== null;
    const error = sqliteLoadError === null ? null : sqliteLoadError.message ?? null;
    return { available, error };
}
|
|
20
52
|
// Hub memory directory (Claude Hub β read-only, additional context)
|
|
21
53
|
const HUB_MEMORY_DIR = resolve(os.homedir(), ".claude", "hub", "MEMORY");
|
|
22
|
-
// ββ
|
|
54
|
+
// ββ Constants βββββββββββββββββββββββββββββββββββββββββββ
|
|
23
55
|
const EMBEDDING_MODEL = "gemini-embedding-001";
|
|
24
56
|
const EMBEDDING_DIMENSION = 3072;
|
|
57
|
+
const SCHEMA_VERSION = "1";
|
|
58
|
+
// ββ Vector encoding (Float32Array β Buffer) βββββββββββββ
|
|
59
|
+
/** Encode a numeric vector as a Buffer of raw little-endian Float32 bytes. */
function vectorToBlob(v) {
    const asF32 = new Float32Array(v);
    // Wrapping the typed array's memory in a Buffer avoids a second copy.
    return Buffer.from(asF32.buffer, asF32.byteOffset, asF32.byteLength);
}
|
|
64
|
+
/**
 * Decode a BLOB of little-endian Float32 bytes back into a Float32Array.
 * Reads value-by-value because Buffers returned by better-sqlite3 are not
 * guaranteed to be 4-byte aligned, so a zero-copy Float32Array view is unsafe.
 */
function blobToVector(b) {
    const count = b.byteLength / 4;
    const out = new Float32Array(count);
    for (let i = 0; i < count; i++) {
        out[i] = b.readFloatLE(i * 4);
    }
    return out;
}
|
|
74
|
+
// ββ DB lifecycle ββββββββββββββββββββββββββββββββββββββββ
|
|
75
|
+
let dbInstance = null;
|
|
25
76
|
/**
|
|
26
|
-
*
|
|
27
|
-
*
|
|
77
|
+
* Returns the live DB handle, or null when better-sqlite3 isn't loadable.
|
|
78
|
+
* Callers must handle the null case (treat as "search unavailable").
|
|
28
79
|
*/
|
|
80
|
+
function db() {
    // Memoized: one handle per process; reused across all queries.
    if (dbInstance)
        return dbInstance;
    const Database = loadSqlite();
    if (!Database)
        return null; // native module missing -> semantic search unavailable
    // Ensure directory exists (handles fresh installs).
    fs.mkdirSync(path.dirname(EMBEDDINGS_DB), { recursive: true });
    dbInstance = new Database(EMBEDDINGS_DB);
    // WAL + relaxed fsync + in-memory temp tables + mmap: tuned for a local,
    // single-process, rebuildable cache rather than strict durability.
    dbInstance.pragma("journal_mode = WAL");
    dbInstance.pragma("synchronous = NORMAL");
    dbInstance.pragma("temp_store = MEMORY");
    dbInstance.pragma("mmap_size = 268435456"); // 256 MB
    // Schema is created idempotently so a fresh install needs no separate setup.
    dbInstance.exec(`
    CREATE TABLE IF NOT EXISTS meta (
      key TEXT PRIMARY KEY,
      value TEXT NOT NULL
    );
    CREATE TABLE IF NOT EXISTS file_mtimes (
      source TEXT PRIMARY KEY,
      mtime_ms REAL NOT NULL
    );
    CREATE TABLE IF NOT EXISTS entries (
      id TEXT PRIMARY KEY,
      source TEXT NOT NULL,
      text TEXT NOT NULL,
      vector BLOB NOT NULL,
      indexed_at INTEGER NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_entries_source ON entries(source);
  `);
    // Initialise meta if absent (DO NOTHING preserves values written by the
    // JSON migration or a previous run).
    const set = dbInstance.prepare("INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO NOTHING");
    set.run("model", EMBEDDING_MODEL);
    set.run("schemaVersion", SCHEMA_VERSION);
    return dbInstance;
}
|
|
117
|
+
/** Close handle (used by tests / shutdown). */
|
|
118
|
+
/** Close and forget the cached DB handle (used by tests / shutdown). */
export function closeEmbeddingsDb() {
    if (!dbInstance)
        return;
    dbInstance.close();
    dbInstance = null;
}
|
|
124
|
+
/** Sharper assertion for use inside helpers that require an open DB. */
|
|
125
|
+
/** Like db(), but raises instead of returning null — for helpers that cannot proceed without a store. */
function dbOrThrow() {
    const handle = db();
    if (handle === null) {
        throw new Error("Embeddings DB unavailable β better-sqlite3 native module not loaded");
    }
    return handle;
}
|
|
132
|
+
// ββ Meta helpers ββββββββββββββββββββββββββββββββββββββββ
|
|
133
|
+
/** Read one value from the meta table; null when the key is absent. */
function getMeta(key) {
    const stmt = dbOrThrow().prepare("SELECT value FROM meta WHERE key = ?");
    const row = stmt.get(key);
    if (row === undefined)
        return null;
    return row.value ?? null;
}
|
|
137
|
+
/** Upsert one key/value pair into the meta table. */
function setMeta(key, value) {
    const sql = "INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value";
    dbOrThrow().prepare(sql).run(key, value);
}
|
|
142
|
+
/** Load the full source -> mtime map used to detect files needing reindex. */
function getFileMtimes() {
    const mtimes = {};
    const rows = dbOrThrow().prepare("SELECT source, mtime_ms FROM file_mtimes").all();
    for (const row of rows) {
        mtimes[row.source] = row.mtime_ms;
    }
    return mtimes;
}
|
|
151
|
+
/** Record (upsert) the last-indexed mtime for one source file. */
function setFileMtime(source, mtimeMs) {
    const sql = "INSERT INTO file_mtimes (source, mtime_ms) VALUES (?, ?) ON CONFLICT(source) DO UPDATE SET mtime_ms = excluded.mtime_ms";
    dbOrThrow().prepare(sql).run(source, mtimeMs);
}
|
|
156
|
+
// ββ Google Embeddings API βββββββββββββββββββββββββββββββ
|
|
29
157
|
async function getEmbeddings(texts) {
|
|
30
158
|
const apiKey = config.apiKeys.google;
|
|
31
159
|
if (!apiKey) {
|
|
@@ -50,16 +178,13 @@ async function getEmbeddings(texts) {
|
|
|
50
178
|
const err = await response.text();
|
|
51
179
|
throw new Error(`Embedding API error: ${response.status} β ${err}`);
|
|
52
180
|
}
|
|
53
|
-
const data = await response.json();
|
|
181
|
+
const data = (await response.json());
|
|
54
182
|
for (const emb of data.embeddings) {
|
|
55
183
|
results.push(emb.values);
|
|
56
184
|
}
|
|
57
185
|
}
|
|
58
186
|
return results;
|
|
59
187
|
}
|
|
60
|
-
/**
|
|
61
|
-
* Get embedding for a single query text.
|
|
62
|
-
*/
|
|
63
188
|
async function getQueryEmbedding(text) {
|
|
64
189
|
const apiKey = config.apiKeys.google;
|
|
65
190
|
if (!apiKey) {
|
|
@@ -78,11 +203,11 @@ async function getQueryEmbedding(text) {
|
|
|
78
203
|
const err = await response.text();
|
|
79
204
|
throw new Error(`Embedding API error: ${response.status} β ${err}`);
|
|
80
205
|
}
|
|
81
|
-
const data = await response.json();
|
|
206
|
+
const data = (await response.json());
|
|
82
207
|
return data.embedding.values;
|
|
83
208
|
}
|
|
84
209
|
// ββ Vector Math βββββββββββββββββββββββββββββββββββββββββ
|
|
85
|
-
function
|
|
210
|
+
function cosineSimilarityF32(a, b) {
|
|
86
211
|
if (a.length !== b.length)
|
|
87
212
|
return 0;
|
|
88
213
|
let dotProduct = 0;
|
|
@@ -97,20 +222,13 @@ function cosineSimilarity(a, b) {
|
|
|
97
222
|
return denom === 0 ? 0 : dotProduct / denom;
|
|
98
223
|
}
|
|
99
224
|
// ββ Text Chunking βββββββββββββββββββββββββββββββββββββββ
|
|
100
|
-
/**
|
|
101
|
-
* Split a markdown file into meaningful chunks.
|
|
102
|
-
* Splits on ## headers, keeping each section as a chunk.
|
|
103
|
-
* Falls back to paragraph splitting for files without headers.
|
|
104
|
-
*/
|
|
105
225
|
function chunkMarkdown(content, source) {
|
|
106
226
|
const chunks = [];
|
|
107
|
-
// Split on ## headers
|
|
108
227
|
const sections = content.split(/^(?=## )/gm);
|
|
109
228
|
for (let i = 0; i < sections.length; i++) {
|
|
110
229
|
const section = sections[i].trim();
|
|
111
230
|
if (!section || section.length < 20)
|
|
112
|
-
continue;
|
|
113
|
-
// If section is too long (>1000 chars), split into paragraphs
|
|
231
|
+
continue;
|
|
114
232
|
if (section.length > 1000) {
|
|
115
233
|
const paragraphs = section.split(/\n\n+/);
|
|
116
234
|
let currentChunk = "";
|
|
@@ -142,51 +260,7 @@ function chunkMarkdown(content, source) {
|
|
|
142
260
|
}
|
|
143
261
|
return chunks;
|
|
144
262
|
}
|
|
145
|
-
// ββ
|
|
146
|
-
// In-memory cache for the embedding index. Without this, every query would
|
|
147
|
-
// re-read and re-parse the on-disk index (can be 100+ MB, making searchMemory
|
|
148
|
-
// the slowest step in a message turn). We keep the parsed object and invalidate
|
|
149
|
-
// via mtime check β so external reindexers are still picked up.
|
|
150
|
-
let indexCache = null;
|
|
151
|
-
let indexCacheMtime = 0;
|
|
152
|
-
function loadIndex() {
|
|
153
|
-
try {
|
|
154
|
-
const st = fs.statSync(INDEX_FILE);
|
|
155
|
-
if (indexCache && st.mtimeMs === indexCacheMtime) {
|
|
156
|
-
return indexCache;
|
|
157
|
-
}
|
|
158
|
-
const raw = fs.readFileSync(INDEX_FILE, "utf-8");
|
|
159
|
-
indexCache = JSON.parse(raw);
|
|
160
|
-
indexCacheMtime = st.mtimeMs;
|
|
161
|
-
return indexCache;
|
|
162
|
-
}
|
|
163
|
-
catch {
|
|
164
|
-
// File missing or unparseable β return an empty index and don't cache it
|
|
165
|
-
// (next call will retry, so a freshly-written index gets picked up).
|
|
166
|
-
return {
|
|
167
|
-
model: EMBEDDING_MODEL,
|
|
168
|
-
lastReindex: 0,
|
|
169
|
-
fileMtimes: {},
|
|
170
|
-
entries: [],
|
|
171
|
-
};
|
|
172
|
-
}
|
|
173
|
-
}
|
|
174
|
-
function saveIndex(index) {
|
|
175
|
-
fs.writeFileSync(INDEX_FILE, JSON.stringify(index));
|
|
176
|
-
// Refresh cache immediately so the next loadIndex() sees the new state
|
|
177
|
-
// without a disk round-trip.
|
|
178
|
-
indexCache = index;
|
|
179
|
-
try {
|
|
180
|
-
indexCacheMtime = fs.statSync(INDEX_FILE).mtimeMs;
|
|
181
|
-
}
|
|
182
|
-
catch {
|
|
183
|
-
indexCacheMtime = Date.now();
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
/**
|
|
187
|
-
* Recursively walk a directory, returning file paths.
|
|
188
|
-
* Skips INDEX.json and INDEX.md at the directory root.
|
|
189
|
-
*/
|
|
263
|
+
// ββ Indexable file discovery ββββββββββββββββββββββββββββ
|
|
190
264
|
function walkAssetDir(dir) {
|
|
191
265
|
const results = [];
|
|
192
266
|
function walk(currentDir) {
|
|
@@ -213,17 +287,11 @@ function walkAssetDir(dir) {
|
|
|
213
287
|
return results;
|
|
214
288
|
}
|
|
215
289
|
const TEXT_EXTENSIONS = new Set([".md", ".html", ".txt", ".css", ".ts"]);
|
|
216
|
-
/**
|
|
217
|
-
* Get all files that should be indexed β memories + text-based assets.
|
|
218
|
-
*/
|
|
219
290
|
function getIndexableFiles() {
|
|
220
291
|
const files = [];
|
|
221
|
-
// ββ Memories (existing) βββββββββββββββββββββββββββββββ
|
|
222
|
-
// Alvin-Bot MEMORY.md
|
|
223
292
|
if (fs.existsSync(MEMORY_FILE)) {
|
|
224
293
|
files.push({ path: MEMORY_FILE, relativePath: "MEMORY.md" });
|
|
225
294
|
}
|
|
226
|
-
// Alvin-Bot daily logs
|
|
227
295
|
if (fs.existsSync(MEMORY_DIR)) {
|
|
228
296
|
const entries = fs.readdirSync(MEMORY_DIR);
|
|
229
297
|
for (const entry of entries) {
|
|
@@ -235,7 +303,6 @@ function getIndexableFiles() {
|
|
|
235
303
|
}
|
|
236
304
|
}
|
|
237
305
|
}
|
|
238
|
-
// Hub memories (~/.claude/hub/MEMORY/) β Claude Hub knowledge base
|
|
239
306
|
if (fs.existsSync(HUB_MEMORY_DIR)) {
|
|
240
307
|
try {
|
|
241
308
|
const entries = fs.readdirSync(HUB_MEMORY_DIR);
|
|
@@ -248,14 +315,13 @@ function getIndexableFiles() {
|
|
|
248
315
|
}
|
|
249
316
|
}
|
|
250
317
|
}
|
|
251
|
-
catch {
|
|
318
|
+
catch {
|
|
319
|
+
/* Hub not available β skip */
|
|
320
|
+
}
|
|
252
321
|
}
|
|
253
|
-
// ββ Assets (new) ββββββββββββββββββββββββββββββββββββββ
|
|
254
|
-
// Asset INDEX.md β compact summary of all assets
|
|
255
322
|
if (fs.existsSync(ASSETS_INDEX_MD)) {
|
|
256
323
|
files.push({ path: ASSETS_INDEX_MD, relativePath: "assets/INDEX.md" });
|
|
257
324
|
}
|
|
258
|
-
// Text-based asset files (HTML, MD, TXT, CSS, TS)
|
|
259
325
|
if (fs.existsSync(ASSETS_DIR)) {
|
|
260
326
|
for (const entry of walkAssetDir(ASSETS_DIR)) {
|
|
261
327
|
if (TEXT_EXTENSIONS.has(path.extname(entry.name))) {
|
|
@@ -268,120 +334,142 @@ function getIndexableFiles() {
|
|
|
268
334
|
}
|
|
269
335
|
return files;
|
|
270
336
|
}
|
|
271
|
-
|
|
272
|
-
* Check which files need reindexing (new or modified).
|
|
273
|
-
*/
|
|
274
|
-
function getStaleFiles(index) {
|
|
337
|
+
function getStaleFiles() {
|
|
275
338
|
const allFiles = getIndexableFiles();
|
|
339
|
+
const known = getFileMtimes();
|
|
276
340
|
const stale = [];
|
|
277
341
|
for (const file of allFiles) {
|
|
278
342
|
try {
|
|
279
|
-
const
|
|
280
|
-
|
|
281
|
-
if (!index.fileMtimes[file.relativePath] || index.fileMtimes[file.relativePath] < mtime) {
|
|
343
|
+
const mtime = fs.statSync(file.path).mtimeMs;
|
|
344
|
+
if (!known[file.relativePath] || known[file.relativePath] < mtime) {
|
|
282
345
|
stale.push(file);
|
|
283
346
|
}
|
|
284
347
|
}
|
|
285
348
|
catch {
|
|
286
|
-
|
|
349
|
+
/* file disappeared */
|
|
287
350
|
}
|
|
288
351
|
}
|
|
289
352
|
return stale;
|
|
290
353
|
}
|
|
291
354
|
// ββ Public API ββββββββββββββββββββββββββββββββββββββββββ
|
|
292
|
-
/**
|
|
293
|
-
* Reindex all memory files (or just stale ones).
|
|
294
|
-
* Returns number of chunks indexed.
|
|
295
|
-
*/
|
|
296
355
|
export async function reindexMemory(force = false) {
|
|
297
|
-
|
|
298
|
-
|
|
356
|
+
if (!loadSqlite()) {
|
|
357
|
+
return { indexed: 0, total: 0 };
|
|
358
|
+
}
|
|
359
|
+
const filesToIndex = force ? getIndexableFiles() : getStaleFiles();
|
|
299
360
|
if (filesToIndex.length === 0) {
|
|
300
|
-
|
|
361
|
+
const total = dbOrThrow().prepare("SELECT COUNT(*) AS c FROM entries").get().c;
|
|
362
|
+
return { indexed: 0, total };
|
|
301
363
|
}
|
|
302
|
-
//
|
|
303
|
-
const
|
|
304
|
-
|
|
305
|
-
|
|
364
|
+
// Drop existing entries for files being reindexed (per-source DELETE is O(log n) thanks to idx).
|
|
365
|
+
const delStmt = dbOrThrow().prepare("DELETE FROM entries WHERE source = ?");
|
|
366
|
+
const dropOld = dbOrThrow().transaction((sources) => {
|
|
367
|
+
for (const s of sources)
|
|
368
|
+
delStmt.run(s);
|
|
369
|
+
});
|
|
370
|
+
dropOld(filesToIndex.map(f => f.relativePath));
|
|
371
|
+
// Chunk all files.
|
|
306
372
|
const allChunks = [];
|
|
307
373
|
for (const file of filesToIndex) {
|
|
308
374
|
try {
|
|
309
375
|
const content = fs.readFileSync(file.path, "utf-8");
|
|
310
376
|
const chunks = chunkMarkdown(content, file.relativePath);
|
|
377
|
+
const mtime = fs.statSync(file.path).mtimeMs;
|
|
311
378
|
for (const chunk of chunks) {
|
|
312
|
-
allChunks.push({ ...chunk, source: file.relativePath });
|
|
379
|
+
allChunks.push({ ...chunk, source: file.relativePath, mtime });
|
|
313
380
|
}
|
|
314
|
-
// Update mtime
|
|
315
|
-
const stat = fs.statSync(file.path);
|
|
316
|
-
index.fileMtimes[file.relativePath] = stat.mtimeMs;
|
|
317
381
|
}
|
|
318
382
|
catch (err) {
|
|
319
383
|
console.error(`Failed to chunk ${file.relativePath}:`, err);
|
|
320
384
|
}
|
|
321
385
|
}
|
|
322
386
|
if (allChunks.length === 0) {
|
|
323
|
-
|
|
324
|
-
|
|
387
|
+
// Even with zero chunks, keep mtimes in sync so we don't re-walk on next run.
|
|
388
|
+
const updMtime = dbOrThrow().transaction((files) => {
|
|
389
|
+
for (const f of files) {
|
|
390
|
+
try {
|
|
391
|
+
setFileMtime(f.relativePath, fs.statSync(f.path).mtimeMs);
|
|
392
|
+
}
|
|
393
|
+
catch {
|
|
394
|
+
/* file disappeared */
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
});
|
|
398
|
+
updMtime(filesToIndex);
|
|
399
|
+
const total = dbOrThrow().prepare("SELECT COUNT(*) AS c FROM entries").get().c;
|
|
400
|
+
return { indexed: 0, total };
|
|
325
401
|
}
|
|
326
|
-
// Get embeddings for all chunks
|
|
402
|
+
// Get embeddings for all chunks (network).
|
|
327
403
|
const texts = allChunks.map(c => c.text);
|
|
328
404
|
const vectors = await getEmbeddings(texts);
|
|
329
|
-
//
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
405
|
+
// Single transaction for all writes.
|
|
406
|
+
const insertStmt = dbOrThrow().prepare("INSERT INTO entries (id, source, text, vector, indexed_at) VALUES (?, ?, ?, ?, ?) " +
|
|
407
|
+
"ON CONFLICT(id) DO UPDATE SET source=excluded.source, text=excluded.text, vector=excluded.vector, indexed_at=excluded.indexed_at");
|
|
408
|
+
const writeAll = dbOrThrow().transaction((rows) => {
|
|
409
|
+
for (const r of rows) {
|
|
410
|
+
insertStmt.run(r.id, r.source, r.text, r.vector, r.indexedAt);
|
|
411
|
+
}
|
|
412
|
+
});
|
|
413
|
+
const now = Date.now();
|
|
414
|
+
writeAll(allChunks.map((c, i) => ({
|
|
415
|
+
id: c.id,
|
|
416
|
+
source: c.source,
|
|
417
|
+
text: c.text,
|
|
418
|
+
vector: vectorToBlob(vectors[i]),
|
|
419
|
+
indexedAt: now,
|
|
420
|
+
})));
|
|
421
|
+
// Update mtimes for the files we just (re-)indexed.
|
|
422
|
+
const updMtime = dbOrThrow().transaction((files) => {
|
|
423
|
+
for (const f of files) {
|
|
424
|
+
try {
|
|
425
|
+
setFileMtime(f.relativePath, fs.statSync(f.path).mtimeMs);
|
|
426
|
+
}
|
|
427
|
+
catch {
|
|
428
|
+
/* file disappeared */
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
});
|
|
432
|
+
updMtime(filesToIndex);
|
|
433
|
+
setMeta("lastReindex", String(now));
|
|
434
|
+
const total = dbOrThrow().prepare("SELECT COUNT(*) AS c FROM entries").get().c;
|
|
435
|
+
return { indexed: allChunks.length, total };
|
|
342
436
|
}
|
|
343
|
-
/**
|
|
344
|
-
* Semantic search across all indexed memory.
|
|
345
|
-
* Returns top-K results sorted by similarity.
|
|
346
|
-
*/
|
|
347
437
|
export async function searchMemory(query, topK = 5, minScore = 0.3) {
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
438
|
+
if (!loadSqlite()) {
|
|
439
|
+
return [];
|
|
440
|
+
}
|
|
441
|
+
// Auto-index if empty.
|
|
442
|
+
const total = dbOrThrow().prepare("SELECT COUNT(*) AS c FROM entries").get().c;
|
|
443
|
+
if (total === 0) {
|
|
351
444
|
await reindexMemory();
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
if (reloaded.entries.length === 0)
|
|
445
|
+
const after = dbOrThrow().prepare("SELECT COUNT(*) AS c FROM entries").get().c;
|
|
446
|
+
if (after === 0)
|
|
355
447
|
return [];
|
|
356
448
|
}
|
|
357
|
-
|
|
358
|
-
const
|
|
359
|
-
|
|
360
|
-
const
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
.slice(0, topK);
|
|
449
|
+
const queryVector = Float32Array.from(await getQueryEmbedding(query));
|
|
450
|
+
const rows = dbOrThrow().prepare("SELECT id, source, text, vector FROM entries").all();
|
|
451
|
+
const scored = [];
|
|
452
|
+
for (const row of rows) {
|
|
453
|
+
const v = blobToVector(row.vector);
|
|
454
|
+
const score = cosineSimilarityF32(queryVector, v);
|
|
455
|
+
if (score >= minScore) {
|
|
456
|
+
scored.push({ text: row.text, source: row.source, score });
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
scored.sort((a, b) => b.score - a.score);
|
|
460
|
+
return scored.slice(0, topK);
|
|
370
461
|
}
|
|
371
|
-
/**
|
|
372
|
-
* Get index stats for /status.
|
|
373
|
-
*/
|
|
374
|
-
/**
|
|
375
|
-
* Auto-reindex on startup. Indexes only stale/new files (incremental).
|
|
376
|
-
* Runs in background β does not block bot startup.
|
|
377
|
-
*/
|
|
378
462
|
export async function initEmbeddings() {
|
|
463
|
+
if (!loadSqlite()) {
|
|
464
|
+
return; // already warned via loadSqlite
|
|
465
|
+
}
|
|
379
466
|
try {
|
|
380
|
-
|
|
467
|
+
db(); // Open & migrate schema.
|
|
468
|
+
const stale = getStaleFiles();
|
|
381
469
|
if (stale.length === 0) {
|
|
382
|
-
const
|
|
383
|
-
if (
|
|
384
|
-
return;
|
|
470
|
+
const total = dbOrThrow().prepare("SELECT COUNT(*) AS c FROM entries").get().c;
|
|
471
|
+
if (total > 0)
|
|
472
|
+
return;
|
|
385
473
|
}
|
|
386
474
|
const result = await reindexMemory();
|
|
387
475
|
if (result.indexed > 0) {
|
|
@@ -389,21 +477,29 @@ export async function initEmbeddings() {
|
|
|
389
477
|
}
|
|
390
478
|
}
|
|
391
479
|
catch (err) {
|
|
392
|
-
// Non-fatal β bot works without embeddings
|
|
393
480
|
console.warn("β οΈ Embeddings init failed:", err instanceof Error ? err.message : err);
|
|
394
481
|
}
|
|
395
482
|
}
|
|
396
483
|
export function getIndexStats() {
|
|
397
|
-
|
|
484
|
+
let entries = 0;
|
|
485
|
+
let files = 0;
|
|
486
|
+
let lastReindex = 0;
|
|
398
487
|
let sizeBytes = 0;
|
|
488
|
+
if (!loadSqlite()) {
|
|
489
|
+
return { entries, files, lastReindex, sizeBytes };
|
|
490
|
+
}
|
|
399
491
|
try {
|
|
400
|
-
|
|
492
|
+
entries = dbOrThrow().prepare("SELECT COUNT(*) AS c FROM entries").get().c;
|
|
493
|
+
files = dbOrThrow().prepare("SELECT COUNT(*) AS c FROM file_mtimes").get().c;
|
|
494
|
+
const meta = getMeta("lastReindex");
|
|
495
|
+
if (meta)
|
|
496
|
+
lastReindex = Number(meta);
|
|
497
|
+
sizeBytes = fs.statSync(EMBEDDINGS_DB).size;
|
|
498
|
+
}
|
|
499
|
+
catch {
|
|
500
|
+
/* DB not yet initialised */
|
|
401
501
|
}
|
|
402
|
-
|
|
403
|
-
return {
|
|
404
|
-
entries: index.entries.length,
|
|
405
|
-
files: Object.keys(index.fileMtimes).length,
|
|
406
|
-
lastReindex: index.lastReindex,
|
|
407
|
-
sizeBytes,
|
|
408
|
-
};
|
|
502
|
+
return { entries, files, lastReindex, sizeBytes };
|
|
409
503
|
}
|
|
504
|
+
// ββ Re-export embedding dim for tests / debugging ββββββ
|
|
505
|
+
export { EMBEDDING_DIMENSION, EMBEDDING_MODEL };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "alvin-bot",
|
|
3
|
-
"version": "4.
|
|
4
|
-
"description": "Alvin Bot
|
|
3
|
+
"version": "4.20.1",
|
|
4
|
+
"description": "Alvin Bot β Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
7
7
|
"bin": {
|
|
@@ -170,6 +170,7 @@
|
|
|
170
170
|
"@types/node": "^22.0.0",
|
|
171
171
|
"@types/ws": "^8.18.1",
|
|
172
172
|
"@whiskeysockets/baileys": "^6.7.21",
|
|
173
|
+
"better-sqlite3": "^12.9.0",
|
|
173
174
|
"dotenv": "^16.4.0",
|
|
174
175
|
"electron-updater": "^6.8.3",
|
|
175
176
|
"grammy": "^1.30.0",
|
|
@@ -181,6 +182,7 @@
|
|
|
181
182
|
"ws": "^8.19.0"
|
|
182
183
|
},
|
|
183
184
|
"devDependencies": {
|
|
185
|
+
"@types/better-sqlite3": "^7.6.13",
|
|
184
186
|
"@vitest/ui": "^4.1.4",
|
|
185
187
|
"electron": "^35.7.5",
|
|
186
188
|
"electron-builder": "^26.8.1",
|