vecito 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ import { stemmer } from 'stemmer';
2
+
/**
 * Common English function words. They are never used as highlight terms, and
 * {@link Highlighter.highlight} bridges over them when they sit between two
 * matched words.
 * @type {Set<string>}
 */
const STOPWORDS = new Set(
  [
    'a an the in on at to for of and or is are was',
    'were it its by be as do how what which with that',
    'this from have has had not but can will just about',
    'than also more some such into over after they them',
    'their would could should been being our your his her',
    'we you he she did does may might must shall get',
    'got let few all any each both very only even own',
  ].flatMap((row) => row.split(' ')),
);
12
+
/**
 * Reduce a lowercase word to its stem.
 *
 * Words shorter than 4 characters are returned unchanged; anything longer is
 * handed to `stemmer`. No stopword check happens here — callers that care
 * about stopwords must filter them before or after stemming.
 * @param {string} word Already lowercase.
 * @returns {string} The stem, or `word` itself when it is too short to stem.
 */
function stem(word) {
  if (word.length < 4) {
    return word;
  }
  return stemmer(word);
}
22
+
/**
 * Build a Set of stems from query terms.
 *
 * Each term is lowercased and split into runs of letters before filtering,
 * because the text matchers in this file only ever compare alphabetic words
 * (`/[A-Za-z]+/`). Without the split, a term such as "covid-19" or
 * "state-of-the-art" could never match anything. ASCII digits and punctuation
 * act as separators; non-ASCII characters stay attached to their neighbours so
 * accented words are not broken into misleading fragments.
 *
 * Parts shorter than 3 characters and stopwords are dropped; the remaining
 * parts are stemmed with `stem`. Purely alphabetic terms behave exactly as if
 * no splitting took place.
 * @param {string[]|Set<string>} terms Query terms (any iterable of strings).
 * @returns {Set<string>} Unique stems in first-seen order.
 */
function stemSet(terms) {
  const out = new Set();
  for (const term of terms) {
    // Input is lowercased first, so `a-z` covers every ASCII letter.
    const parts = term.toLowerCase().match(/[a-z\u0080-\uFFFF]+/g) ?? [];
    for (const part of parts) {
      if (part.length >= 3 && !STOPWORDS.has(part)) out.add(stem(part));
    }
  }
  return out;
}
37
+
/**
 * Text highlighting utilities for search result display.
 *
 * {@link Highlighter.highlight} wraps matched words in `<mark>` tags using
 * stem-aware matching: a word in the text matches when its stem equals the
 * stem of one of the query terms, so inflected forms that share a stem
 * highlight each other. {@link Highlighter.snippet} extracts a relevant
 * excerpt using the same stem-aware search to pick its centre.
 */
export class Highlighter {
  /**
   * Escape HTML special characters in a plain-text string.
   *
   * Both quote characters are escaped so the result is safe in element
   * content as well as in single- or double-quoted attribute values.
   * @param {string} s
   * @returns {string}
   */
  static escape(s) {
    return s
      .replace(/&/g, '&amp;') // must run first so later entities are not re-escaped
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  /**
   * Tokenize a query string for use as highlight terms. Lowercases, treats
   * every run of characters outside `a-z`, `0-9`, `-` and `_` as a separator,
   * and drops stopwords and tokens shorter than 3 chars.
   * Used as a fallback when BM25 matched terms are not available (dense mode).
   * @param {string} text
   * @returns {string[]} Unique tokens, longest first (ties keep first-seen order).
   */
  static tokenize(text) {
    const unique = new Set();
    const candidates = text
      .toLowerCase()
      .replace(/[^a-z0-9\-_]+/g, ' ')
      .split(/\s+/);
    for (const token of candidates) {
      if (token.length >= 3 && !STOPWORDS.has(token)) unique.add(token);
    }
    return [...unique].sort((a, b) => b.length - a.length);
  }

  /**
   * Wrap occurrences of `terms` in `text` with `<mark>` tags, HTML-escaping
   * everything (matched or not). Matching is stem-aware and case-insensitive:
   * a word is marked when its stem is in the stem set built from `terms`, so
   * regular inflections that share a stem match each other. Irregular forms
   * whose stems differ (e.g. "ran" for "run") do not match.
   *
   * Only runs of ASCII letters are considered words. Gaps between two matched
   * words are bridged into one `<mark>` span when every word in the gap is a
   * stopword (e.g. "Tales of Mystery" → single highlight); whitespace and
   * punctuation in the gap never block bridging.
   * @param {string} text Plain text to highlight.
   * @param {string[]|Set<string>} terms Terms to highlight.
   * @returns {string} HTML string with `<mark>…</mark>` around matches.
   */
  static highlight(text, terms) {
    const stems = stemSet(terms);
    if (!stems.size) return Highlighter.escape(text);

    // Pass 1: split into alternating word / non-word segments and flag the
    // words whose stem is one of the query stems.
    const segments = [];
    for (const match of text.matchAll(/([A-Za-z]+)|([^A-Za-z]+)/g)) {
      const isWord = match[1] !== undefined;
      const raw = isWord ? match[1] : match[2];
      segments.push({
        raw,
        isWord,
        marked: isWord && stems.has(stem(raw.toLowerCase())),
      });
    }

    // Pass 2 (bridge): walk from each marked segment to the next one; when
    // everything in between is either non-word text or a stopword, mark the
    // gap as well so the two matches render as a single span.
    let i = 0;
    while (i < segments.length) {
      if (!segments[i].marked) {
        i++;
        continue;
      }
      let next = i + 1;
      while (next < segments.length && !segments[next].marked) next++;
      if (next < segments.length) {
        const gap = segments.slice(i + 1, next);
        const bridgeable = gap.every(
          (seg) => !seg.isWord || STOPWORDS.has(seg.raw.toLowerCase()),
        );
        if (bridgeable) {
          for (const seg of gap) seg.marked = true;
        }
      }
      // Continue from the next marked segment (or past the end).
      i = next;
    }

    // Pass 3 (render): open a <mark> when entering a marked run, close it when
    // leaving, and escape every segment's text.
    const parts = [];
    let inMark = false;
    for (const seg of segments) {
      if (seg.marked !== inMark) {
        parts.push(seg.marked ? '<mark>' : '</mark>');
        inMark = seg.marked;
      }
      parts.push(Highlighter.escape(seg.raw));
    }
    if (inMark) parts.push('</mark>');
    return parts.join('');
  }

  /**
   * Extract an excerpt of `maxLen` characters positioned around the first
   * stem match. Texts no longer than `maxLen` are returned in full. When no
   * word matches (or `terms` yields no stems) the window starts at the
   * beginning of the text. An ellipsis (`…`) is added at each truncated edge,
   * on top of the `maxLen` characters taken from the text. Pass the result to
   * {@link Highlighter.highlight} for markup.
   * @param {string} text Plain text.
   * @param {string[]|Set<string>} terms Terms to centre the window on.
   * @param {number} [maxLen=220] Number of characters taken from `text`.
   * @returns {string} Plain-text excerpt (not HTML-escaped).
   */
  static snippet(text, terms, maxLen = 220) {
    if (text.length <= maxLen) return text;

    const stems = stemSet(terms);
    let center = 0;

    if (stems.size) {
      // Offset of the first word whose stem matches any query stem.
      for (const match of text.matchAll(/[A-Za-z]+/g)) {
        if (stems.has(stem(match[0].toLowerCase()))) {
          center = match.index;
          break;
        }
      }
    }

    // Put the match roughly in the middle, then clamp the window so it stays
    // inside the text.
    const half = Math.floor(maxLen / 2);
    const start = Math.max(0, Math.min(center - half, text.length - maxLen));
    const end = Math.min(text.length, start + maxLen);
    const prefix = start > 0 ? '…' : '';
    const suffix = end < text.length ? '…' : '';
    return prefix + text.slice(start, end) + suffix;
  }
}
@@ -0,0 +1,365 @@
// True when running under Node.js (as opposed to a browser or worker).
const isNode = typeof globalThis.process?.versions?.node === 'string';

// Module-level WASM state shared by every VecStore instance.
let _wasmInitialized = false;
let _WasmSearchEngine;
// In-flight initialization, shared so concurrent ensureWasm() callers do not
// each import and instantiate the WASM module.
let _wasmInitPromise = null;

/**
 * Import altor-vec and instantiate its WASM binary for the current runtime.
 *
 * In Node the package entry is resolved to a URL, the module is imported from
 * that URL, and the `.wasm` file next to it is read from disk and passed to
 * `initSync`. Elsewhere the package is imported by name and its default export
 * is awaited to load the binary.
 * @returns {Promise<object>} The initialized altor-vec module namespace.
 */
async function loadAltorVec() {
  if (!isNode) {
    const mod = await import('altor-vec');
    await mod.default(); // async WASM fetch
    return mod;
  }

  const { readFileSync } = await import('fs');
  const { fileURLToPath, pathToFileURL } = await import('url');
  let altorUrl;
  if (typeof import.meta.resolve === 'function') {
    altorUrl = import.meta.resolve('altor-vec');
  } else {
    // Runtimes without import.meta.resolve: fall back to CommonJS resolution.
    const { createRequire } = await import('module');
    altorUrl = pathToFileURL(createRequire(import.meta.url).resolve('altor-vec')).href;
  }
  const mod = await import(/* @vite-ignore */ altorUrl);
  const wasmUrl = new URL('altor_vec_wasm_bg.wasm', altorUrl);
  mod.initSync({ module: readFileSync(fileURLToPath(wasmUrl)) });
  return mod;
}

/**
 * Initialize the altor-vec WASM module and cache the engine class.
 *
 * Idempotent, including under concurrency: calls made while an
 * initialization is already running await that same attempt instead of
 * starting another. If the attempt fails, the cached promise is cleared so a
 * later call can retry.
 * @returns {Promise<void>}
 */
async function ensureWasm() {
  if (_wasmInitialized) return;

  if (!_wasmInitPromise) {
    _wasmInitPromise = loadAltorVec().then(
      (mod) => {
        _WasmSearchEngine = mod.WasmSearchEngine;
        _wasmInitialized = true;
      },
      (err) => {
        _wasmInitPromise = null;
        throw err;
      },
    );
  }
  await _wasmInitPromise;
}
35
+
/**
 * HNSW build parameters, handed to `WasmSearchEngine.from_vectors` in this
 * order when the first vector is inserted. ef_construction trades build speed
 * for recall quality.
 */
/** @type {number} */
const HNSW_M = 16;
/** @type {number} */
const HNSW_EF_CONSTRUCTION = 200;
/** @type {number} */
const HNSW_EF_SEARCH = 50;

/**
 * RRF fusion constant (standard value): a result at 0-based position `rank`
 * contributes `1 / (RRF_K + rank + 1)` to its fused score.
 * @type {number}
 */
const RRF_K = 60;
43
+
/**
 * Dense + sparse vector store backed by altor-vec (WASM HNSW).
 *
 * The HNSW graph serializes to bytes via `engine.to_bytes()` and is restored
 * via `new WasmSearchEngine(bytes)` without re-inserting vectors. Sparse
 * (BM25) vectors are stored as plain arrays and searched with brute-force dot
 * products, so the sparse side is exact.
 *
 * Ids returned to callers are 1-based; node ids inside the engine and the
 * metadata array are 0-based (`id === nodeId + 1`).
 */
export class VecStore {
  /** WasmSearchEngine instance; null until the first insert or a load. */
  #engine;
  /** Dense vector dimensionality given to the constructor (0 for legacy blobs). */
  #dimensions;
  /** Number of dense vectors inserted or restored. */
  #count;
  /** Metadata objects, indexed by 0-based insert order. */
  #metadata;
  /** Sparse vectors, or null when sparse storage is off; index 0 is an alignment placeholder. */
  #sparseVecs;

  /**
   * @param {object} opts
   * @param {number} opts.dimensions Dense vector dimensionality.
   */
  constructor({ dimensions }) {
    this.#dimensions = dimensions;
    this.#engine = null;
    this.#count = 0;
    this.#metadata = [];
    this.#sparseVecs = null;
  }

  /**
   * Initialize the WASM module. Must be called before insert / search.
   * @returns {Promise<void>}
   */
  async init() {
    await ensureWasm();
  }

  /**
   * Insert a single dense vector with optional metadata.
   * @param {Float32Array|number[]} vector Exactly `dimensions` values.
   * @param {Record<string, any>} [metadata] Stored as `{}` when omitted.
   * @returns {number} 1-based id for backward compatibility.
   * @throws {Error} If the WASM module has not been initialized yet.
   * @throws {RangeError} If the vector length does not match `dimensions`.
   */
  insert(vector, metadata) {
    if (!this.#engine && !_WasmSearchEngine) {
      throw new Error('VecStore: WASM engine not initialized — call init() before insert()');
    }
    const vec = vector instanceof Float32Array ? vector : new Float32Array(vector);
    // The engine receives a flat buffer plus the dimensionality, so a
    // wrong-length buffer could desynchronise ids and metadata. Stores
    // restored from legacy blobs have dimensions === 0 and skip the check.
    if (this.#dimensions > 0 && vec.length !== this.#dimensions) {
      throw new RangeError(
        `VecStore: expected a vector of length ${this.#dimensions}, got ${vec.length}`,
      );
    }

    if (!this.#engine) {
      // from_vectors requires ≥1 vector; subsequent adds use add_vectors
      this.#engine = _WasmSearchEngine.from_vectors(
        vec, this.#dimensions, HNSW_M, HNSW_EF_CONSTRUCTION, HNSW_EF_SEARCH,
      );
    } else {
      this.#engine.add_vectors(vec, this.#dimensions);
    }
    this.#metadata.push(metadata || {});
    this.#count++;
    return this.#count; // 1-based
  }

  /**
   * Enable sparse storage (resetting any sparse vectors stored so far).
   * Inserts a placeholder at index 0 so the position of each sparse vector in
   * the internal array equals the 1-based id of its dense counterpart.
   */
  initSparse() {
    this.#sparseVecs = [{ indices: [0], values: [1e-10], dim: 1 }];
  }

  /**
   * Insert a sparse BM25 vector, paired positionally with the prior dense
   * insert. Silently does nothing when {@link VecStore#initSparse} has not
   * been called.
   * @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
   */
  insertSparse(sparse) {
    if (!this.#sparseVecs) return;
    // Copy into plain arrays so the vectors survive JSON serialization.
    this.#sparseVecs.push({
      indices: Array.from(sparse.indices),
      values: Array.from(sparse.values),
      dim: sparse.dim,
    });
  }

  /**
   * Nearest-neighbor dense search.
   * @param {Float32Array} query
   * @param {number} [k=10]
   * @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
   */
  async search(query, k = 10) {
    return this.#denseScan(query, k);
  }

  /**
   * Dense search with a JS predicate post-filter. Fetches `k * 5` candidates
   * from the HNSW, then applies the filter to metadata objects, so fewer than
   * `k` results may come back when the filter rejects most candidates. A
   * non-function `filter` is ignored.
   * @param {Float32Array} query
   * @param {(meta: Record<string,any>) => boolean} filter JS predicate.
   * @param {number} [k=10]
   * @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
   */
  async searchWithFilter(query, filter, k = 10) {
    const candidates = this.#denseScan(query, k * 5);
    const kept = typeof filter === 'function'
      ? candidates.filter((r) => filter(r.metadata))
      : candidates;
    return kept.slice(0, k);
  }

  /**
   * Hybrid dense + sparse search with RRF rank fusion.
   * Dense side uses the HNSW; sparse side uses brute-force dot products.
   *
   * `fusion: {type: 'linear', alpha}` is honoured only at its extremes:
   * `alpha === 0` returns sparse results only and `alpha === 1` dense results
   * only. Any other value (and any other `fusion` setting) uses RRF.
   * @param {Float32Array} denseQuery Not used when `alpha === 0`.
   * @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
   *   Not used when `alpha === 1`.
   * @param {number} [k=10]
   * @param {object} [opts]
   * @param {number} [opts.dense_k] Dense candidates to fuse (default `k * 3`).
   * @param {number} [opts.sparse_k] Sparse candidates to fuse (default `k * 3`).
   * @param {string|{type:string,alpha:number}} [opts.fusion='rrf']
   * @returns {Array<{id:number,score:number,dense_rank?:number,sparse_rank?:number,metadata:Record<string,any>}>}
   */
  hybridSearch(denseQuery, sparse, k = 10, { dense_k, sparse_k, fusion } = {}) {
    const dk = dense_k ?? k * 3;
    const sk = sparse_k ?? k * 3;

    // Linear alpha fusion: alpha=0 → sparse only, alpha=1 → dense only
    if (fusion && typeof fusion === 'object' && fusion.type === 'linear') {
      const alpha = fusion.alpha ?? 0.5;
      if (alpha === 0) {
        // Sparse only: the dense index is not consulted at all.
        return this.#sparseScan(sparse, k)
          .map((r, rank) => ({ ...r, sparse_rank: rank + 1 }));
      }
      if (alpha === 1) {
        return this.#denseScan(denseQuery, dk)
          .slice(0, k)
          .map((r, rank) => ({ ...r, dense_rank: rank + 1 }));
      }
    }

    const denseResults = this.#denseScan(denseQuery, dk);
    const sparseResults = this.#sparseScan(sparse, sk);
    return this.#rrf(denseResults, sparseResults, k);
  }

  // ── Private search helpers ───────────────────────────────────────────────

  /**
   * Scan the HNSW for the k nearest neighbors.
   * altor-vec returns [[nodeId, distance], ...] with cosine distance.
   * We expose score = 1 - distance (higher = more similar).
   * @param {Float32Array|number[]} query
   * @param {number} k Capped at the number of stored vectors.
   * @returns {Array<{id: number, score: number, metadata: Record<string,any>}>}
   */
  #denseScan(query, k) {
    if (!this.#engine) return [];
    const q = query instanceof Float32Array ? query : new Float32Array(query);
    // The engine hands results back as a JSON string.
    const raw = JSON.parse(this.#engine.search(q, Math.min(k, this.#count)));
    return raw.map(([nodeId, dist]) => ({
      id: nodeId + 1,
      score: 1 - dist,
      metadata: this.#metadata[nodeId] || {},
    }));
  }

  /**
   * Brute-force sparse dot product search over stored BM25 vectors.
   * O(docs × avg_nnz) — exact.
   *
   * Only documents sharing at least one index with the query are returned.
   * Documents with no overlap are not matches at all; ranking them would give
   * arbitrary documents a reciprocal-rank contribution during fusion.
   * @param {{indices: ArrayLike<number>, values: ArrayLike<number>}} query
   * @param {number} k
   * @returns {Array<{id: number, score: number, metadata: Record<string,any>}>}
   */
  #sparseScan(query, k) {
    if (!this.#sparseVecs || this.#sparseVecs.length <= 1) return [];

    const queryWeights = new Map();
    for (let t = 0; t < query.indices.length; t++) {
      queryWeights.set(query.indices[t], query.values[t]);
    }

    const hits = [];
    // #sparseVecs[0] is the alignment placeholder; docs start at index 1
    for (let i = 1; i < this.#sparseVecs.length; i++) {
      const { indices, values } = this.#sparseVecs[i];
      let score = 0;
      let matched = false;
      for (let t = 0; t < indices.length; t++) {
        const weight = queryWeights.get(indices[t]);
        if (weight !== undefined) {
          score += weight * values[t];
          matched = true;
        }
      }
      if (matched) hits.push({ nodeId: i - 1, score });
    }

    hits.sort((a, b) => b.score - a.score);
    return hits.slice(0, k).map(({ nodeId, score }) => ({
      id: nodeId + 1,
      score,
      metadata: this.#metadata[nodeId] || {},
    }));
  }

  /**
   * Reciprocal Rank Fusion of dense and sparse result lists. Each list
   * contributes `1 / (RRF_K + rank)` (rank is 1-based) for every id it
   * contains; ids found in both lists get the sum.
   * @param {Array<{id: number}>} dense Dense results, best first.
   * @param {Array<{id: number}>} sparse Sparse results, best first.
   * @param {number} k Number of fused results to return.
   * @returns {Array<{id:number,score:number,dense_rank:?number,sparse_rank:?number,metadata:Record<string,any>}>}
   */
  #rrf(dense, sparse, k) {
    const acc = new Map(); // id → {rrfScore, dense_rank, sparse_rank}

    dense.forEach(({ id }, rank) => {
      acc.set(id, { rrfScore: 1 / (RRF_K + rank + 1), dense_rank: rank + 1, sparse_rank: null });
    });
    sparse.forEach(({ id }, rank) => {
      const contrib = 1 / (RRF_K + rank + 1);
      const entry = acc.get(id);
      if (entry) {
        entry.rrfScore += contrib;
        entry.sparse_rank = rank + 1;
      } else {
        acc.set(id, { rrfScore: contrib, dense_rank: null, sparse_rank: rank + 1 });
      }
    });

    return [...acc.entries()]
      .sort((a, b) => b[1].rrfScore - a[1].rrfScore)
      .slice(0, k)
      .map(([id, { rrfScore, dense_rank, sparse_rank }]) => ({
        id,
        score: rrfScore,
        dense_rank,
        sparse_rank,
        metadata: this.#metadata[id - 1] || {},
      }));
  }

  // ── Serialization ────────────────────────────────────────────────────────

  /**
   * Serialize to a self-contained Uint8Array.
   * Layout: [uint32 LE hnswLen][HNSW bytes][UTF-8 JSON: {dims, metadata, sparse?}]
   * @returns {Uint8Array}
   * @throws {Error} If nothing has been inserted or loaded yet.
   */
  exportBytes() {
    if (!this.#engine) throw new Error('VecStore: nothing to export — insert documents first');
    const hnswBytes = this.#engine.to_bytes();
    const payload = { dims: this.#dimensions, metadata: this.#metadata };
    if (this.#sparseVecs) payload.sparse = this.#sparseVecs;
    const jsonBytes = new TextEncoder().encode(JSON.stringify(payload));

    const out = new Uint8Array(4 + hnswBytes.length + jsonBytes.length);
    new DataView(out.buffer).setUint32(0, hnswBytes.length, true);
    out.set(hnswBytes, 4);
    out.set(jsonBytes, 4 + hnswBytes.length);
    return out;
  }

  /**
   * Write to a file (Node only).
   * @param {string} filePath
   * @returns {Promise<void>}
   */
  async save(filePath) {
    const { writeFileSync } = await import('fs');
    writeFileSync(filePath, this.exportBytes());
  }

  /** @alias save */
  async exportToFile(filePath) {
    await this.save(filePath);
  }

  // ── Loaders ──────────────────────────────────────────────────────────────

  /**
   * Load from a file (Node only).
   * @param {string} filePath
   * @returns {Promise<VecStore>}
   */
  static async load(filePath) {
    await ensureWasm();
    const { readFileSync } = await import('fs');
    const buf = readFileSync(filePath);
    // View the Buffer's bytes in place rather than copying them.
    return VecStore.#fromBytes(new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength));
  }

  /**
   * Rebuild from bytes produced by {@link VecStore#exportBytes}.
   * @param {Uint8Array} bytes
   * @returns {Promise<VecStore>}
   */
  static async loadFromBytes(bytes) {
    await ensureWasm();
    return VecStore.#fromBytes(bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes));
  }

  /**
   * Fetch an exported blob over HTTP and rebuild it.
   * @param {string} url
   * @returns {Promise<VecStore>}
   * @throws {Error} If the server answers with a non-2xx status.
   */
  static async loadFromUrl(url) {
    const response = await fetch(url);
    if (!response.ok) {
      // Without this check an HTML error page would be parsed as index bytes.
      throw new Error(
        `VecStore: failed to fetch ${url} (HTTP ${response.status} ${response.statusText})`,
      );
    }
    const buf = await response.arrayBuffer();
    return VecStore.loadFromBytes(new Uint8Array(buf));
  }

  /**
   * Deserialize bytes produced by exportBytes.
   * The HNSW graph is restored directly — no rebuild loop.
   * @param {Uint8Array} bytes
   * @returns {VecStore}
   * @throws {Error} If the buffer is too short for its header or HNSW section.
   */
  static #fromBytes(bytes) {
    if (bytes.byteLength < 4) {
      throw new Error('VecStore: invalid data — shorter than the 4-byte header');
    }
    const hnswLen = new DataView(bytes.buffer, bytes.byteOffset, 4).getUint32(0, true);
    if (4 + hnswLen > bytes.byteLength) {
      throw new Error('VecStore: invalid data — HNSW section is truncated');
    }
    const hnswBytes = bytes.subarray(4, 4 + hnswLen);
    const jsonBytes = bytes.subarray(4 + hnswLen);
    const parsed = JSON.parse(new TextDecoder().decode(jsonBytes));

    // Support old edgevec format (plain array = metadata only) and new format
    let dims = 0;
    let metadata;
    let sparseVecs = null;
    if (Array.isArray(parsed)) {
      metadata = parsed;
    } else {
      dims = parsed.dims ?? 0;
      // Fall back to an empty list so result mapping never indexes undefined.
      metadata = parsed.metadata ?? [];
      sparseVecs = parsed.sparse ?? null;
    }

    const store = new VecStore({ dimensions: dims });
    store.#engine = new _WasmSearchEngine(hnswBytes);
    store.#count = store.#engine.len();
    store.#metadata = metadata;
    store.#sparseVecs = sparseVecs;
    return store;
  }

  // ── Properties ───────────────────────────────────────────────────────────

  /** Number of indexed vectors. */
  get count() { return this.#count; }

  /** Dimensionality of the dense vectors. */
  get dimensions() { return this.#dimensions; }
}