memwarden 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +402 -0
  3. package/dist/bundle/bundle.d.ts +28 -0
  4. package/dist/bundle/bundle.js +85 -0
  5. package/dist/cli/bin.d.ts +2 -0
  6. package/dist/cli/bin.js +593 -0
  7. package/dist/cli/connect.d.ts +63 -0
  8. package/dist/cli/connect.js +121 -0
  9. package/dist/cli/hook.d.ts +24 -0
  10. package/dist/cli/hook.js +186 -0
  11. package/dist/cli/tools.d.ts +47 -0
  12. package/dist/cli/tools.js +246 -0
  13. package/dist/daemon/ensure.d.ts +12 -0
  14. package/dist/daemon/ensure.js +54 -0
  15. package/dist/daemon/service.d.ts +15 -0
  16. package/dist/daemon/service.js +210 -0
  17. package/dist/embedding/index.d.ts +10 -0
  18. package/dist/embedding/index.js +33 -0
  19. package/dist/embedding/local-embedding.d.ts +14 -0
  20. package/dist/embedding/local-embedding.js +80 -0
  21. package/dist/functions/access-tracker.d.ts +13 -0
  22. package/dist/functions/access-tracker.js +92 -0
  23. package/dist/functions/audit.d.ts +46 -0
  24. package/dist/functions/audit.js +0 -0
  25. package/dist/functions/cjk-segmenter.d.ts +6 -0
  26. package/dist/functions/cjk-segmenter.js +120 -0
  27. package/dist/functions/compress-synthetic.d.ts +2 -0
  28. package/dist/functions/compress-synthetic.js +104 -0
  29. package/dist/functions/config.d.ts +68 -0
  30. package/dist/functions/config.js +231 -0
  31. package/dist/functions/conflicts.d.ts +19 -0
  32. package/dist/functions/conflicts.js +328 -0
  33. package/dist/functions/context.d.ts +3 -0
  34. package/dist/functions/context.js +155 -0
  35. package/dist/functions/dedup.d.ts +11 -0
  36. package/dist/functions/dedup.js +51 -0
  37. package/dist/functions/dejafix.d.ts +96 -0
  38. package/dist/functions/dejafix.js +356 -0
  39. package/dist/functions/doctor.d.ts +29 -0
  40. package/dist/functions/doctor.js +137 -0
  41. package/dist/functions/forget.d.ts +3 -0
  42. package/dist/functions/forget.js +87 -0
  43. package/dist/functions/hybrid-search.d.ts +17 -0
  44. package/dist/functions/hybrid-search.js +205 -0
  45. package/dist/functions/index.d.ts +32 -0
  46. package/dist/functions/index.js +44 -0
  47. package/dist/functions/keyed-mutex.d.ts +1 -0
  48. package/dist/functions/keyed-mutex.js +21 -0
  49. package/dist/functions/logger.d.ts +6 -0
  50. package/dist/functions/logger.js +37 -0
  51. package/dist/functions/memory-utils.d.ts +2 -0
  52. package/dist/functions/memory-utils.js +29 -0
  53. package/dist/functions/observe.d.ts +5 -0
  54. package/dist/functions/observe.js +326 -0
  55. package/dist/functions/paths.d.ts +1 -0
  56. package/dist/functions/paths.js +38 -0
  57. package/dist/functions/privacy.d.ts +1 -0
  58. package/dist/functions/privacy.js +30 -0
  59. package/dist/functions/provenance.d.ts +9 -0
  60. package/dist/functions/provenance.js +57 -0
  61. package/dist/functions/quantized-vector-index.d.ts +60 -0
  62. package/dist/functions/quantized-vector-index.js +275 -0
  63. package/dist/functions/receipt.d.ts +31 -0
  64. package/dist/functions/receipt.js +95 -0
  65. package/dist/functions/search-index.d.ts +27 -0
  66. package/dist/functions/search-index.js +217 -0
  67. package/dist/functions/search.d.ts +25 -0
  68. package/dist/functions/search.js +523 -0
  69. package/dist/functions/stemmer.d.ts +1 -0
  70. package/dist/functions/stemmer.js +110 -0
  71. package/dist/functions/synonyms.d.ts +1 -0
  72. package/dist/functions/synonyms.js +69 -0
  73. package/dist/functions/turboquant.d.ts +53 -0
  74. package/dist/functions/turboquant.js +278 -0
  75. package/dist/functions/types.d.ts +217 -0
  76. package/dist/functions/types.js +8 -0
  77. package/dist/functions/vector-index.d.ts +25 -0
  78. package/dist/functions/vector-index.js +125 -0
  79. package/dist/functions/vector-persistence.d.ts +14 -0
  80. package/dist/functions/vector-persistence.js +75 -0
  81. package/dist/functions/verify.d.ts +13 -0
  82. package/dist/functions/verify.js +104 -0
  83. package/dist/index.d.ts +1 -0
  84. package/dist/index.js +219 -0
  85. package/dist/kernel/http.d.ts +24 -0
  86. package/dist/kernel/http.js +261 -0
  87. package/dist/kernel/index.d.ts +19 -0
  88. package/dist/kernel/index.js +21 -0
  89. package/dist/kernel/kernel.d.ts +80 -0
  90. package/dist/kernel/kernel.js +297 -0
  91. package/dist/kernel/pubsub.d.ts +21 -0
  92. package/dist/kernel/pubsub.js +38 -0
  93. package/dist/kernel/types.d.ts +139 -0
  94. package/dist/kernel/types.js +20 -0
  95. package/dist/mcp/bin.d.ts +2 -0
  96. package/dist/mcp/bin.js +27 -0
  97. package/dist/mcp/server.d.ts +34 -0
  98. package/dist/mcp/server.js +377 -0
  99. package/dist/observability/metrics.d.ts +26 -0
  100. package/dist/observability/metrics.js +104 -0
  101. package/dist/proxy/server.d.ts +30 -0
  102. package/dist/proxy/server.js +331 -0
  103. package/dist/state/kv.d.ts +41 -0
  104. package/dist/state/kv.js +50 -0
  105. package/dist/state/oplog.d.ts +25 -0
  106. package/dist/state/oplog.js +57 -0
  107. package/dist/state/schema.d.ts +60 -0
  108. package/dist/state/schema.js +88 -0
  109. package/dist/state/store-libsql.d.ts +46 -0
  110. package/dist/state/store-libsql.js +263 -0
  111. package/dist/state/store-memory.d.ts +23 -0
  112. package/dist/state/store-memory.js +121 -0
  113. package/dist/state/store.d.ts +87 -0
  114. package/dist/state/store.js +58 -0
  115. package/dist/triggers/api.d.ts +14 -0
  116. package/dist/triggers/api.js +510 -0
  117. package/dist/triggers/auth.d.ts +1 -0
  118. package/dist/triggers/auth.js +13 -0
  119. package/package.json +58 -0
@@ -0,0 +1,275 @@
1
+ //
2
+ // TurboQuant-backed vector index with VectorIndex API parity. Stores packed
3
+ // 2/4-bit codes + a per-vector norm instead of full Float32 embeddings
4
+ // (~8-16x smaller). Search is asymmetric: the query stays full precision.
5
+ // When `rescoreDepth > 0` the full vectors are retained too and the top
6
+ // candidates are re-ranked with exact cosine (best recall, no memory
7
+ // saving); with rescoreDepth 0 only codes are kept (max compression).
8
+ //
9
+ // Drop-in for the VectorIndexLike surface consumed by search.ts and
10
+ // hybrid-search.ts. Default OFF: constructed only when
11
+ // MEMWARDEN_QUANT_VECTOR=true (see makeVectorIndex in search.ts).
12
+ import { TURBOQUANT_VERSION, ROTATION_ROUNDS, nextPow2, seedFromString, mulberry32, buildSignFlips, rotate, lloydMaxLevels, levelTableHash, encodeRotated, asymmetricDot, } from "./turboquant.js";
13
+ // Same byteOffset/byteLength-guarded base64 round-trip as vector-index.ts,
14
+ // duplicated on purpose for Uint8Array: the helpers in vector-index.ts are
15
+ // load-bearing bug fixes (#455/#469/#584/#587) and stay untouched there.
16
// Base64-encodes exactly the bytes visible through `arr`. The view's own
// byteOffset/byteLength are passed along so a subarray never drags in the
// neighbouring bytes of its backing ArrayBuffer.
function uint8ToBase64(arr) {
    const { buffer, byteOffset, byteLength } = arr;
    const window = Buffer.from(buffer, byteOffset, byteLength);
    return window.toString("base64");
}
19
// Decodes a base64 string into a Uint8Array that spans only the decoded
// bytes (offset and length are taken from the decoding Buffer, not from its
// backing ArrayBuffer as a whole).
function base64ToUint8(b64) {
    const { buffer, byteOffset, byteLength } = Buffer.from(b64, "base64");
    return new Uint8Array(buffer, byteOffset, byteLength);
}
23
// Base64-encodes the raw bytes behind a Float32Array view, honouring the
// view's byteOffset/byteLength so subarrays serialize only their own floats.
function float32ToBase64(arr) {
    const { buffer, byteOffset, byteLength } = arr;
    const rawBytes = Buffer.from(buffer, byteOffset, byteLength);
    return rawBytes.toString("base64");
}
26
// Decodes base64 into a Float32Array backed by its OWN freshly allocated
// buffer.
//
// Why copy instead of viewing the Buffer's memory: small Buffers produced by
// Buffer.from(string) may live inside Node's shared allocation pool, so a
// Float32Array view over `buf.buffer` (a) depends on `buf.byteOffset`
// happening to be a multiple of 4 — the typed-array constructor throws a
// RangeError otherwise — and (b) aliases, and keeps alive, the whole pool
// slab. Copying the bytes removes both hazards.
//
// Trailing bytes that do not form a complete 4-byte float are ignored.
// Byte order is the platform's, matching float32ToBase64.
function base64ToFloat32(b64) {
    const bytes = Buffer.from(b64, "base64");
    const width = Float32Array.BYTES_PER_ELEMENT;
    const count = Math.floor(bytes.byteLength / width);
    const out = new Float32Array(count);
    new Uint8Array(out.buffer).set(bytes.subarray(0, count * width));
    return out;
}
30
// Exact cosine similarity of two equal-length numeric vectors. Vectors of
// different length, or a pair whose norm product is zero, score 0.
function cosineSimilarity(a, b) {
    const n = a.length;
    if (n !== b.length) {
        return 0;
    }
    let dot = 0;
    let sumSqA = 0;
    let sumSqB = 0;
    let i = 0;
    while (i < n) {
        const x = a[i];
        const y = b[i];
        dot += x * y;
        sumSqA += x * x;
        sumSqB += y * y;
        i += 1;
    }
    const denom = Math.sqrt(sumSqA) * Math.sqrt(sumSqB);
    if (denom === 0) {
        return 0;
    }
    return dot / denom;
}
46
// Normalises a rescore depth to a non-negative integer. NaN / undefined /
// negative values collapse to 0 (rescoring off). Without this a NaN depth
// would flow into `Math.max(limit, depth)` in search(), making the pool size
// NaN and silently shrinking every result set to a single hit.
function normalizeRescoreDepth(depth) {
    const floored = Math.floor(Number(depth));
    return floored > 0 ? floored : 0;
}
/**
 * Vector index that stores quantized codes plus a per-vector norm, and
 * optionally the full-precision vector for an exact-cosine rescore pass.
 *
 * Entries live in `vectors` (obsId -> { codes, norm, sessionId, full? }).
 * `params` records everything needed to decide whether a persisted blob was
 * produced by the same codec configuration (see deserialize()).
 */
export class QuantizedVectorIndex {
    params;
    vectors = new Map();
    signFlips;
    scratch;
    queryScratch;
    /**
     * @param opts `{ dims, bits, seed, rescoreDepth }`. `rescoreDepth` is
     *   floored and clamped to >= 0; a non-numeric value means 0.
     */
    constructor(opts) {
        const paddedDims = nextPow2(opts.dims);
        this.params = {
            version: TURBOQUANT_VERSION,
            bits: opts.bits,
            dims: opts.dims,
            paddedDims,
            seed: opts.seed,
            rounds: ROTATION_ROUNDS,
            levelHash: levelTableHash(opts.bits),
            rescoreDepth: normalizeRescoreDepth(opts.rescoreDepth),
        };
        this.signFlips = buildSignFlips(mulberry32(seedFromString(opts.seed)), paddedDims, ROTATION_ROUNDS);
        this.scratch = new Float32Array(paddedDims);
        this.queryScratch = new Float32Array(paddedDims);
        // Warm the level table so first add/search doesn't pay the Lloyd cost.
        lloydMaxLevels(opts.bits);
    }
    /**
     * Encodes and stores `embedding` under `obsId`, replacing any previous
     * entry. Embeddings whose length differs from `params.dims` are silently
     * skipped (callers are expected to guard dimensions upstream).
     */
    add(obsId, sessionId, embedding) {
        if (embedding.length !== this.params.dims)
            return; // soft-skip, guarded upstream
        const rotated = rotate(embedding, this.params.paddedDims, this.signFlips, this.scratch);
        const { codes, norm } = encodeRotated(rotated, this.params.paddedDims, this.params.bits);
        const entry = { codes, norm, sessionId };
        if (this.params.rescoreDepth > 0) {
            // Private copy: the caller may reuse or mutate its array.
            entry.full = new Float32Array(embedding);
        }
        this.vectors.set(obsId, entry);
    }
    /** Drops the entry for `obsId`; a no-op when it is not indexed. */
    remove(obsId) {
        this.vectors.delete(obsId);
    }
    /** True when `obsId` currently has an entry. */
    has(obsId) {
        return this.vectors.has(obsId);
    }
    /** Snapshot array of all indexed observation ids. */
    ids() {
        return Array.from(this.vectors.keys());
    }
    /**
     * Aligns the rescore setting with the current configuration after a
     * restore: the persisted blob carries the rescoreDepth it was built
     * with, which may no longer match the environment. Lowering to 0 drops
     * the retained full vectors (reclaiming memory); raising it keeps
     * working with whatever full vectors the blob had (entries without one
     * simply keep their asymmetric score — the rescore pass guards on
     * presence).
     */
    reconcileRescoreDepth(depth) {
        const clamped = normalizeRescoreDepth(depth);
        if (clamped === this.params.rescoreDepth)
            return;
        this.params.rescoreDepth = clamped;
        if (clamped === 0) {
            for (const entry of this.vectors.values()) {
                delete entry.full;
            }
        }
    }
    /**
     * Returns up to `limit` hits `{ obsId, sessionId, score }`, best first.
     *
     * `limit` is floored; a non-positive or NaN limit yields []. A query whose
     * length differs from `params.dims`, an empty index, or a zero-norm query
     * also yield [].
     */
    search(query, limit = 20) {
        // Integer, positive K only: a fractional K would never hit the
        // "pool is full -> sort" branch below (so the wrong element could be
        // evicted), and a negative K would make slice() trim from the end.
        const k = Math.floor(limit);
        if (!(k > 0))
            return [];
        if (query.length !== this.params.dims || this.vectors.size === 0) {
            return [];
        }
        const D = this.params.paddedDims;
        // Dedicated query scratch: search is synchronous, so nothing else can
        // touch it before the scan below completes; `this.scratch` stays
        // reserved for add().
        const rotatedQuery = rotate(query, D, this.signFlips, this.queryScratch);
        const table = lloydMaxLevels(this.params.bits); // hoisted out of the scan
        let qNormSq = 0;
        for (let i = 0; i < D; i++) {
            const v = rotatedQuery[i];
            qNormSq += v * v;
        }
        const qNorm = Math.sqrt(qNormSq);
        if (qNorm === 0)
            return [];
        const invScale = 1 / (Math.sqrt(D) * qNorm);
        // First pass: asymmetric scores with the same bounded top-K pattern as
        // VectorIndex.search. Pool size widens to rescoreDepth when rescoring.
        const poolSize = Math.max(k, this.params.rescoreDepth);
        const results = [];
        let minScore = -Infinity;
        for (const [obsId, entry] of this.vectors) {
            const score = entry.norm === 0
                ? 0
                : asymmetricDot(rotatedQuery, entry.codes, D, this.params.bits, table) *
                    invScale;
            if (results.length < poolSize) {
                results.push({ obsId, sessionId: entry.sessionId, score });
                if (results.length === poolSize) {
                    // Pool just filled: keep it ascending so results[0] is the
                    // weakest candidate from here on.
                    results.sort((a, b) => a.score - b.score);
                    minScore = results[0].score;
                }
            }
            else if (score > minScore) {
                // Evict the current weakest, then restore ascending order.
                results[0] = { obsId, sessionId: entry.sessionId, score };
                results.sort((a, b) => a.score - b.score);
                minScore = results[0].score;
            }
        }
        // Optional second pass: exact cosine on the retained full vectors.
        // Entries without a retained vector keep their asymmetric score.
        if (this.params.rescoreDepth > 0) {
            for (const r of results) {
                const full = this.vectors.get(r.obsId)?.full;
                if (full)
                    r.score = cosineSimilarity(query, full);
            }
        }
        results.sort((a, b) => b.score - a.score);
        return results.slice(0, k);
    }
    /** Number of indexed vectors. */
    get size() {
        return this.vectors.size;
    }
    /**
     * Reports dimension mismatches against `expected`. Every entry shares
     * `params.dims`, so either no entry mismatches or all of them do.
     */
    validateDimensions(expected) {
        const mismatches = [];
        const seenDimensions = new Set();
        if (this.vectors.size > 0) {
            seenDimensions.add(this.params.dims);
            if (this.params.dims !== expected) {
                for (const obsId of this.vectors.keys()) {
                    mismatches.push({ obsId, dim: this.params.dims });
                }
            }
        }
        return { mismatches, seenDimensions };
    }
    /** Removes every entry; params are left untouched. */
    clear() {
        this.vectors.clear();
    }
    /**
     * Replaces this index's entries with deep copies of `other`'s entries.
     *
     * NOTE(review): only the entries are copied, not `other.params` or its
     * sign flips. Codes produced under a different seed/bits/dims would be
     * decoded with this instance's parameters — confirm callers only restore
     * from an index built with identical params.
     */
    restoreFrom(other) {
        this.vectors = new Map();
        for (const [obsId, entry] of other.vectors) {
            const copy = {
                codes: new Uint8Array(entry.codes),
                norm: entry.norm,
                sessionId: entry.sessionId,
            };
            if (entry.full)
                copy.full = new Float32Array(entry.full);
            this.vectors.set(obsId, copy);
        }
    }
    /**
     * Serializes params plus one `[obsId, { c, n, s, f? }]` row per entry
     * (c = base64 codes, n = norm, s = sessionId, f = base64 full vector).
     */
    serialize() {
        const vectors = [];
        for (const [obsId, entry] of this.vectors) {
            const row = {
                c: uint8ToBase64(entry.codes),
                n: entry.norm,
                s: entry.sessionId,
            };
            if (entry.full)
                row.f = float32ToBase64(entry.full);
            vectors.push([obsId, row]);
        }
        return JSON.stringify({ params: this.params, vectors });
    }
    /**
     * Returns null when the payload's params don't reproduce the current
     * algorithm (version, bits, seed, dims, rounds or level-table hash
     * mismatch) — the caller is expected to fall back to a full rebuild.
     *
     * Never throws: unparseable JSON, non-integer or non-positive dims, or
     * params the constructor rejects (e.g. an unallocatable size) also yield
     * null. Individual malformed rows are skipped.
     */
    static deserialize(json) {
        let data;
        try {
            data = JSON.parse(json);
        }
        catch {
            return null;
        }
        const obj = data;
        const p = obj?.params;
        if (!p ||
            p.version !== TURBOQUANT_VERSION ||
            (p.bits !== 2 && p.bits !== 4) ||
            !Number.isInteger(p.dims) ||
            p.dims <= 0 ||
            typeof p.paddedDims !== "number" ||
            typeof p.seed !== "string" ||
            p.rounds !== ROTATION_ROUNDS ||
            p.paddedDims !== nextPow2(p.dims) ||
            p.levelHash !== levelTableHash(p.bits)) {
            return null;
        }
        let idx;
        try {
            idx = new QuantizedVectorIndex({
                dims: p.dims,
                bits: p.bits,
                seed: p.seed,
                rescoreDepth: typeof p.rescoreDepth === "number" ? p.rescoreDepth : 0,
            });
        }
        catch {
            // e.g. RangeError allocating scratch buffers for an absurd `dims`.
            return null;
        }
        if (!Array.isArray(obj.vectors))
            return idx;
        // Packed code length: two 4-bit codes, or four 2-bit codes, per byte.
        const codesLen = p.bits === 4 ? Math.ceil(p.paddedDims / 2) : Math.ceil(p.paddedDims / 4);
        for (const row of obj.vectors) {
            try {
                if (!Array.isArray(row) || row.length < 2)
                    continue;
                const [obsId, entry] = row;
                if (typeof obsId !== "string" ||
                    typeof entry?.c !== "string" ||
                    typeof entry?.n !== "number" ||
                    !Number.isFinite(entry.n) ||
                    typeof entry?.s !== "string")
                    continue;
                const codes = base64ToUint8(entry.c);
                if (codes.length !== codesLen)
                    continue;
                const stored = {
                    codes,
                    norm: entry.n,
                    sessionId: entry.s,
                };
                if (typeof entry.f === "string") {
                    const full = base64ToFloat32(entry.f);
                    // A full vector of the wrong length is dropped; the codes
                    // are still usable without it.
                    if (full.length === p.dims)
                        stored.full = full;
                }
                idx.vectors.set(obsId, stored);
            }
            catch {
                continue;
            }
        }
        return idx;
    }
}
@@ -0,0 +1,31 @@
1
+ import type { ISdk } from "../kernel/index.js";
2
+ import type { StateKV } from "../state/kv.js";
3
/**
 * One oplog entry as cited inside a delete receipt. The receipt builder
 * reads `op` (compared against "delete"), `ts` (reused as the receipt's
 * `deletedAt`) and `id` (logged); the remaining fields are carried through
 * unchanged.
 */
+ interface ChainEntry {
4
+ id: number;
5
+ ts: string;
6
+ op: string;
7
+ scope: string;
8
+ key: string;
// NOTE(review): hash / prev_hash presumably link this entry to its
// predecessor in the hash-chained oplog — confirm against state/oplog.
9
+ hash: string;
10
+ prev_hash: string;
11
+ }
12
/**
 * Receipt returned by a successful `mem::forget`. It cites oplog entries
 * rather than payloads, so it does not re-disclose the deleted content
 * beyond the observation's title.
 */
+ export interface DeleteReceipt {
13
+ obsId: string;
// Falls back to "(untitled)" when the deleted observation had no title.
14
+ title: string;
// Timestamp of `deleteEntry` when one was found, otherwise the time the
// receipt was built.
15
+ deletedAt: string;
16
+ /** The oplog entry that recorded this deletion. */
17
+ deleteEntry: ChainEntry | null;
18
+ /** The oplog entry that recorded the original write, when still present. */
19
+ createEntry: ChainEntry | null;
20
+ /** Whole-chain verification at receipt time. */
21
+ chainIntact: boolean;
22
+ /** SHA-256 over the canonical receipt fields above — offline-checkable. */
23
+ receiptHash: string;
24
+ }
25
/**
 * Outcome of `mem::forget`. When `deleted` is false, `reason` says why
 * (missing id, store unavailable, or no observation with that id) and no
 * receipt is attached. When `deleted` is true, `receipt` is present.
 */
+ export interface ForgetResult {
26
+ deleted: boolean;
27
+ reason?: string;
28
+ receipt?: DeleteReceipt;
29
+ }
30
/** Registers the `mem::forget` function on `sdk`, backed by the `kv` store. */
+ export declare function registerReceiptFunction(sdk: ISdk, kv: StateKV): void;
31
+ export {};
@@ -0,0 +1,95 @@
1
+ //
2
+ // mem::forget — user-initiated deletion with a RECEIPT. The pain this
3
+ // answers is real and documented across competitor trackers: deletes that
4
+ // return success while the data stays on disk, with no way to prove
5
+ // otherwise. memwarden's delete is verifiable two ways:
6
+ //
7
+ // 1. The observation is removed from KV and every index in lockstep, and
8
+ // the response reports what was actually removed (deleted: false when
9
+ // the id wasn't found — never a fake success).
10
+ // 2. The deletion lands in the hash-chained oplog like every mutation, so
11
+ // the receipt cites the chain: the entry that recorded the delete, the
12
+ // entry that recorded the original write, and whether the whole chain
13
+ // verifies. Anyone with the store can recompute both hashes. Payloads
14
+ // are never included — a receipt proves the delete without
15
+ // re-disclosing what was deleted.
16
+ //
17
+ // The receipt's own hash covers its fields, so a receipt file can be
18
+ // checked for integrity on its own, offline.
19
+ import { createHash } from "node:crypto";
20
+ import { KV } from "../state/schema.js";
21
+ import { getSearchIndex, vectorIndexRemove } from "./search.js";
22
+ import { deleteAccessLog } from "./access-tracker.js";
23
+ import { logger } from "./logger.js";
24
+ function receiptHash(fields) {
25
+ // Stable key order via explicit construction — this is the contract a
26
+ // receipt verifier recomputes.
27
+ const canonical = JSON.stringify({
28
+ obsId: fields.obsId,
29
+ title: fields.title,
30
+ deletedAt: fields.deletedAt,
31
+ deleteEntry: fields.deleteEntry,
32
+ createEntry: fields.createEntry,
33
+ chainIntact: fields.chainIntact,
34
+ });
35
+ return createHash("sha256").update(canonical).digest("hex");
36
+ }
37
+ export function registerReceiptFunction(sdk, kv) {
38
+ sdk.registerFunction("mem::forget", async (data) => {
39
+ const obsId = (data?.observationId ?? "").trim();
40
+ if (!obsId)
41
+ return { deleted: false, reason: "observationId is required" };
42
+ // Find the session holding this observation (ids are globally unique;
43
+ // storage is scoped per session).
44
+ let sessions;
45
+ try {
46
+ sessions = await kv.list(KV.sessions);
47
+ }
48
+ catch {
49
+ return { deleted: false, reason: "store unavailable" };
50
+ }
51
+ let found;
52
+ for (const session of sessions) {
53
+ const obs = await kv
54
+ .get(KV.observations(session.id), obsId)
55
+ .catch(() => null);
56
+ if (obs) {
57
+ found = { sessionId: session.id, obs };
58
+ break;
59
+ }
60
+ }
61
+ if (!found) {
62
+ // The honest failure: nothing pretended, nothing "succeeded".
63
+ return { deleted: false, reason: `no observation with id ${obsId}` };
64
+ }
65
+ // Remove from KV and every index in lockstep (same discipline as the
66
+ // auto-forget sweep).
67
+ await kv.delete(KV.observations(found.sessionId), obsId);
68
+ getSearchIndex().remove(obsId);
69
+ vectorIndexRemove(obsId);
70
+ await deleteAccessLog(kv, obsId);
71
+ // Build the receipt from the chain.
72
+ const { entries } = await sdk.trigger({
73
+ function_id: "state::oplog-find",
74
+ payload: { key: obsId },
75
+ });
76
+ const deleteEntry = [...entries].reverse().find((e) => e.op === "delete") ?? null;
77
+ const createEntry = entries.find((e) => e.op !== "delete") ?? null;
78
+ const verdict = await sdk.trigger({ function_id: "state::verify", payload: {} });
79
+ const base = {
80
+ obsId,
81
+ title: found.obs.title ?? "(untitled)",
82
+ deletedAt: deleteEntry?.ts ?? new Date().toISOString(),
83
+ deleteEntry,
84
+ createEntry,
85
+ chainIntact: verdict.ok === true,
86
+ };
87
+ const receipt = { ...base, receiptHash: receiptHash(base) };
88
+ logger.info("memory forgotten with receipt", {
89
+ obsId,
90
+ oplogDeleteId: deleteEntry?.id,
91
+ chainIntact: receipt.chainIntact,
92
+ });
93
+ return { deleted: true, receipt };
94
+ });
95
+ }
@@ -0,0 +1,27 @@
1
+ import type { CompressedObservation } from "./types.js";
2
/** One ranked result of `SearchIndex.search`; a higher `score` ranks first. */
+ export interface Bm25Hit {
3
+ obsId: string;
4
+ sessionId: string;
5
+ score: number;
6
+ }
7
/**
 * In-memory BM25 inverted index over observations. Documents are added and
 * removed by observation id; `search` returns hits sorted by descending
 * score, `limit` at most.
 */
+ export declare class SearchIndex {
8
+ private docs;
9
+ private postings;
10
+ private termFreqs;
11
+ private totalLength;
12
+ private sortedTermsCache;
13
+ add(obs: CompressedObservation): void;
14
+ has(id: string): boolean;
15
+ remove(id: string): void;
16
+ get size(): number;
17
+ private contribution;
18
+ search(query: string, limit?: number): Bm25Hit[];
19
+ clear(): void;
// Replaces this index's contents with copies of `other`'s.
20
+ restoreFrom(other: SearchIndex): void;
21
+ serialize(): string;
// Returns an empty index (does not throw) when `json` cannot be parsed or
// lacks the expected sections.
22
+ static deserialize(json: string): SearchIndex;
23
+ private extractTerms;
24
+ private tokenize;
25
+ private sortedTerms;
26
+ private lowerBound;
27
+ }
@@ -0,0 +1,217 @@
1
+ //
2
+ // Okapi BM25 inverted index. Pure and engine-independent. BM25 is a published
3
+ // ranking function; the standard constants (k1 = 1.2, b = 0.75) and the
4
+ // idf = log((N - df + 0.5)/(df + 0.5) + 1) form are used as published. On top
5
+ // of exact terms, query terms also match by prefix (binary-searched over the
6
+ // sorted term list, contribution halved) and via synonym expansion (synonym
7
+ // terms enter at weight 0.7).
8
+ import { stem } from "./stemmer.js";
9
+ import { getSynonyms } from "./synonyms.js";
10
+ import { segmentCjk, hasCjk } from "./cjk-segmenter.js";
11
+ const K1 = 1.2;
12
+ const B = 0.75;
13
/**
 * Okapi BM25 inverted index over observations.
 *
 * State:
 *   docs        obsId -> { obsId, sessionId, termCount }
 *   postings    term  -> Set of obsIds containing the term
 *   termFreqs   obsId -> (term -> occurrences in that document)
 *   totalLength sum of termCount over all docs (for the average length)
 */
export class SearchIndex {
    docs = new Map(); // obsId -> entry
    postings = new Map(); // term -> obsIds
    termFreqs = new Map(); // obsId -> term -> tf
    totalLength = 0;
    sortedTermsCache = null;
    /**
     * Indexes `obs`. Re-adding an id that is already indexed replaces the
     * previous version; without the up-front remove() the old term counts
     * would stay in `totalLength` and stale postings would keep matching.
     */
    add(obs) {
        if (this.docs.has(obs.id))
            this.remove(obs.id);
        const terms = this.extractTerms(obs);
        const tf = new Map();
        for (const term of terms)
            tf.set(term, (tf.get(term) ?? 0) + 1);
        this.docs.set(obs.id, {
            obsId: obs.id,
            sessionId: obs.sessionId,
            termCount: terms.length,
        });
        this.termFreqs.set(obs.id, tf);
        this.totalLength += terms.length;
        for (const term of tf.keys()) {
            let posting = this.postings.get(term);
            if (!posting)
                this.postings.set(term, (posting = new Set()));
            posting.add(obs.id);
        }
        this.sortedTermsCache = null;
    }
    /** True when `id` is indexed. */
    has(id) {
        return this.docs.has(id);
    }
    /** Removes `id` from docs, postings and term frequencies; no-op if absent. */
    remove(id) {
        const entry = this.docs.get(id);
        if (!entry)
            return;
        const tf = this.termFreqs.get(id);
        if (tf) {
            for (const term of tf.keys()) {
                const posting = this.postings.get(term);
                if (posting) {
                    posting.delete(id);
                    // Drop empty postings so the sorted term list stays tight.
                    if (posting.size === 0)
                        this.postings.delete(term);
                }
            }
            this.termFreqs.delete(id);
        }
        this.totalLength = Math.max(0, this.totalLength - entry.termCount);
        this.docs.delete(id);
        this.sortedTermsCache = null;
    }
    /** Number of indexed documents. */
    get size() {
        return this.docs.size;
    }
    // BM25 contribution of a single term to a single document (no query weight).
    contribution(tf, df, n, docLen, avgLen) {
        const idf = Math.log((n - df + 0.5) / (df + 0.5) + 1);
        const num = tf * (K1 + 1);
        // A zero average length (empty or inconsistent restored state) would
        // turn the length ratio into NaN/Infinity; treat it as "average".
        const lengthRatio = avgLen > 0 ? docLen / avgLen : 1;
        const den = tf + K1 * (1 - B + B * lengthRatio);
        return idf * (num / den);
    }
    /**
     * Ranks documents for `query`. Exact terms count at full weight, synonym
     * terms at 0.7, and any indexed term that merely starts with a query term
     * contributes at half strength. Returns at most `limit` hits, best first;
     * a non-positive or NaN `limit` returns [].
     */
    search(query, limit = 20) {
        if (!(limit > 0))
            return [];
        const rawTerms = this.tokenize(query.toLowerCase());
        if (rawTerms.length === 0)
            return [];
        const n = this.docs.size;
        if (n === 0)
            return [];
        const avgLen = this.totalLength / n;
        // exact terms at full weight, synonyms at 0.7, de-duplicated
        const queryTerms = [];
        const seen = new Set();
        for (const term of rawTerms) {
            if (!seen.has(term)) {
                seen.add(term);
                queryTerms.push({ term, weight: 1 });
            }
            for (const syn of getSynonyms(term)) {
                if (!seen.has(syn)) {
                    seen.add(syn);
                    queryTerms.push({ term: syn, weight: 0.7 });
                }
            }
        }
        const scores = new Map();
        const accrue = (term, weight, factor) => {
            const posting = this.postings.get(term);
            if (!posting)
                return;
            const df = posting.size;
            for (const obsId of posting) {
                const doc = this.docs.get(obsId);
                // Postings can outlive their doc in a damaged restored index;
                // skip instead of dereferencing undefined.
                if (!doc)
                    continue;
                const tf = this.termFreqs.get(obsId)?.get(term) ?? 0;
                const add = this.contribution(tf, df, n, doc.termCount, avgLen) * weight * factor;
                scores.set(obsId, (scores.get(obsId) ?? 0) + add);
            }
        };
        const sorted = this.sortedTerms();
        for (const { term, weight } of queryTerms) {
            accrue(term, weight, 1); // exact match
            // prefix matches (term*) excluding the exact term, contribution halved
            for (let i = this.lowerBound(sorted, term); i < sorted.length; i++) {
                const candidate = sorted[i];
                if (candidate === undefined || !candidate.startsWith(term))
                    break;
                if (candidate === term)
                    continue;
                accrue(candidate, weight, 0.5);
            }
        }
        return [...scores.entries()]
            .map(([obsId, score]) => ({
            obsId,
            // accrue() only scores ids that have a doc entry.
            sessionId: this.docs.get(obsId).sessionId,
            score,
        }))
            .sort((a, b) => b.score - a.score)
            .slice(0, limit);
    }
    /** Empties the index. */
    clear() {
        this.docs.clear();
        this.postings.clear();
        this.termFreqs.clear();
        this.totalLength = 0;
        this.sortedTermsCache = null;
    }
    /** Replaces this index's contents with independent copies of `other`'s. */
    restoreFrom(other) {
        this.docs = new Map([...other.docs].map(([k, v]) => [k, { ...v }]));
        this.postings = new Map([...other.postings].map(([k, v]) => [k, new Set(v)]));
        this.termFreqs = new Map([...other.termFreqs].map(([k, v]) => [k, new Map(v)]));
        this.totalLength = other.totalLength;
        this.sortedTermsCache = null;
    }
    /** JSON snapshot of the full index state (format marker `v: 2`). */
    serialize() {
        return JSON.stringify({
            v: 2,
            entries: [...this.docs.entries()],
            inverted: [...this.postings.entries()].map(([term, ids]) => [term, [...ids]]),
            docTerms: [...this.termFreqs.entries()].map(([id, counts]) => [id, [...counts.entries()]]),
            totalDocLength: this.totalLength,
        });
    }
    /**
     * Rebuilds an index from serialize() output. Never throws: unparseable,
     * incomplete or structurally malformed input yields an EMPTY index
     * (all-or-nothing, so a half-loaded index is never returned).
     */
    static deserialize(json) {
        const idx = new SearchIndex();
        try {
            const data = JSON.parse(json);
            if (!data?.entries || !data?.inverted || !data?.docTerms)
                return idx;
            for (const [id, entry] of data.entries) {
                // search() reads entry.termCount numerically; a malformed doc
                // entry would poison every score it takes part in.
                if (typeof id !== "string" ||
                    entry === null ||
                    typeof entry !== "object" ||
                    !Number.isFinite(entry.termCount) ||
                    entry.termCount < 0) {
                    throw new TypeError("malformed doc entry in serialized index");
                }
                idx.docs.set(id, entry);
            }
            for (const [term, ids] of data.inverted)
                idx.postings.set(term, new Set(ids));
            for (const [id, counts] of data.docTerms)
                idx.termFreqs.set(id, new Map(counts));
            const len = Number(data.totalDocLength);
            idx.totalLength = Number.isFinite(len) && len >= 0 ? Math.floor(len) : 0;
        }
        catch {
            return new SearchIndex();
        }
        return idx;
    }
    // Concatenates every searchable field of an observation and tokenizes it.
    extractTerms(obs) {
        const parts = [
            obs.title,
            obs.subtitle ?? "",
            obs.narrative,
            ...obs.facts,
            ...obs.concepts,
            ...obs.files,
            obs.type,
        ];
        return this.tokenize(parts.join(" ").toLowerCase());
    }
    // Splits text into index terms. Letters, digits, whitespace and the
    // path-ish punctuation `/ . \ _ -` are kept; everything else becomes a
    // separator. Words shorter than 2 characters are dropped; CJK words are
    // segmented, all other words are stemmed.
    //
    // The hyphen is deliberately LAST in the character class: written as
    // `\\-_` it formed the range U+005C..U+005F, which stripped "-" while
    // keeping "]" and "^". Indexes persisted with the old tokenization hold
    // hyphen-split terms and should be rebuilt.
    tokenize(text) {
        const cleaned = text.replace(/[^\p{L}\p{N}\s/.\\_-]/gu, " ");
        const tokens = [];
        for (const word of cleaned.split(/\s+/)) {
            if (word.length < 2)
                continue;
            if (hasCjk(word)) {
                for (const seg of segmentCjk(word))
                    if (seg.length >= 1)
                        tokens.push(seg);
            }
            else {
                tokens.push(stem(word));
            }
        }
        return tokens;
    }
    // Sorted list of all indexed terms, cached until the next mutation.
    sortedTerms() {
        return (this.sortedTermsCache ??= [...this.postings.keys()].sort());
    }
    // First index in ascending `arr` whose element is >= target (binary search).
    lowerBound(arr, target) {
        let lo = 0;
        let hi = arr.length;
        while (lo < hi) {
            const mid = (lo + hi) >>> 1;
            if ((arr[mid] ?? "") < target)
                lo = mid + 1;
            else
                hi = mid;
        }
        return lo;
    }
}
@@ -0,0 +1,25 @@
1
+ import type { ISdk } from "../kernel/index.js";
2
+ import type { EmbeddingProvider, VectorIndexLike } from "./types.js";
3
+ import type { StateKV } from "../state/kv.js";
4
+ import { SearchIndex } from "./search-index.js";
5
// Accessors for the BM25 index and the (optional) vector index. The vector
// index may be null, i.e. none is currently set.
+ export declare function getSearchIndex(): SearchIndex;
6
+ export declare function setVectorIndex(idx: VectorIndexLike | null): void;
7
+ export declare function getVectorIndex(): VectorIndexLike | null;
8
+ /**
9
+ * Constructs the configured vector index: TurboQuant-backed when
10
+ * MEMWARDEN_QUANT_VECTOR=true, the full-precision VectorIndex otherwise.
11
+ * `dims` comes from the embedding provider that will feed the index.
12
+ */
13
+ export declare function makeVectorIndex(dims: number): VectorIndexLike;
14
+ export declare function setEmbeddingProvider(provider: EmbeddingProvider | null): void;
15
+ export declare function getEmbeddingProvider(): EmbeddingProvider | null;
// Called with an observation id when that observation is deleted.
16
+ export declare function vectorIndexRemove(id: string): void;
17
+ export declare function clipEmbedInput(text: string): string;
// NOTE(review): presumably embeds `text` and adds it to the vector index,
// resolving to whether the add took place — confirm against search.js.
18
+ export declare function vectorIndexAddGuarded(id: string, sessionId: string, text: string, context: {
19
+ kind: "memory" | "observation" | "synthetic";
20
+ logId: string;
21
+ }): Promise<boolean>;
// NOTE(review): the resolved number is presumably a count of indexed
// items — confirm against search.js.
22
+ export declare function rebuildIndex(kv: StateKV, opts?: {
23
+ preserveVectorIndex?: boolean;
24
+ }): Promise<number>;
25
+ export declare function registerSearchFunction(sdk: ISdk, kv: StateKV): void;