vecito 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/vecito.js ADDED
@@ -0,0 +1,352 @@
1
+ import { Embedder } from './embedder.js';
2
+ import { BM25 } from './bm25.js';
3
+ import { VecStore } from './vec-store.js';
4
+ import { defaultText, defaultMetadata } from './extract.js';
5
+ import { Highlighter } from './highlight.js';
6
+
/**
 * Whether the current runtime is Node.js, detected by the presence of a
 * string `process.versions.node`. Gates the file-based `save`/`load` helpers.
 */
const isNode = typeof globalThis.process?.versions?.node === 'string';

/**
 * Build the empty-sparse placeholder kept in sync with the index-build path —
 * edgevec requires a non-empty sparse vector to keep dense/sparse ids aligned.
 *
 * The placeholder holds a single entry at index 0 with a near-zero weight.
 * `dim` is clamped to at least 1 so that index 0 is always in range; a missing
 * or non-numeric `dim` (e.g. `undefined`, `NaN`) also falls back to 1 instead
 * of leaking `NaN` into the vector handed to the store.
 * @param {number} dim Vocabulary size.
 * @returns {{indices: Uint32Array, values: Float32Array, dim: number}}
 */
function placeholderSparse(dim) {
  const requested = Number(dim);
  const size = Number.isFinite(requested) ? Math.max(requested, 1) : 1;
  return {
    indices: new Uint32Array([0]),
    values: new Float32Array([1e-10]),
    dim: size,
  };
}
18
+
19
+ /**
20
+ * @typedef {object} AddOptions
21
+ * @property {(item: any) => string} [text] Extract the searchable text from an item.
22
+ * Defaults to {@link defaultText} (strings pass through, objects are flattened
23
+ * to their string leaves).
24
+ * @property {(item: any) => Record<string, any>} [metadata] Extract metadata
25
+ * returned with hits. Defaults to {@link defaultMetadata} (objects are kept
26
+ * as-is, so hits carry the original data back).
27
+ */
28
+
29
+ /**
30
+ * @typedef {object} SearchResult
31
+ * @property {number} [id] Internal vector id.
32
+ * @property {number} [score] Relevance score (interpretation depends on mode).
33
+ * @property {number} [dense_rank] Rank on the dense side (hybrid mode).
34
+ * @property {number} [sparse_rank] Rank on the sparse side (hybrid mode).
35
+ * @property {Record<string, any>} metadata The document's metadata.
36
+ */
37
+
/**
 * Vecito — isomorphic (Node + browser) hybrid semantic search.
 *
 * Orchestrates an Embedder (dense), BM25 (sparse) and a VecStore (edgevec) so
 * callers get a one-liner instead of wiring the three primitives by hand.
 * Universal methods (addDocuments / search / exportBytes / loadFromBytes /
 * loadFromUrl) run anywhere; save/load are Node-only.
 */
export class Vecito {
  #embedder;
  #bm25;
  #store;
  #model;
  #fitted;
  #mode; // 'hybrid' | 'dense'

  /**
   * @param {object} [opts]
   * @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Embedding model id.
   * @param {string} [opts.dtype='q8'] Weight precision (`'q8'` ≈ 4× smaller
   *   download by default, `'fp32'` for full precision).
   * @param {Embedder} [opts.embedder] A pre-built (optionally pre-loaded)
   *   {@link Embedder} to use instead of constructing one — handy for reusing a
   *   loaded model across indexes or for timing model load separately. Takes
   *   precedence over `model`/`dtype`.
   * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode. `'hybrid'` stores
   *   both dense vectors and BM25 sparse data; `'dense'` omits sparse/BM25 for a
   *   smaller snapshot. Any value other than `'dense'` is treated as `'hybrid'`.
   *   The mode is embedded in the snapshot and respected on load.
   * @param {number} [opts.k1] BM25 term-frequency saturation (hybrid mode only).
   * @param {number} [opts.b] BM25 length-normalization factor (hybrid mode only).
   */
  constructor({ model, dtype, embedder, mode, k1, b } = {}) {
    if (embedder) {
      this.#embedder = embedder;
      this.#model = embedder.model;
    } else {
      this.#model = model || 'Xenova/all-MiniLM-L6-v2';
      this.#embedder = new Embedder({ model: this.#model, dtype });
    }
    this.#mode = mode === 'dense' ? 'dense' : 'hybrid';
    this.#bm25 = this.#mode === 'hybrid' ? new BM25({ k1, b }) : null;
    this.#store = null;
    this.#fitted = false;
  }

  /**
   * Lazily create and initialize the backing vector store on first use. Sparse
   * storage is enabled only in hybrid mode.
   * @returns {Promise<VecStore>}
   */
  async #ensureStore() {
    if (!this.#store) {
      // Load the model first so the store is sized to its real width.
      await this.#embedder.init();
      this.#store = new VecStore({ dimensions: this.#embedder.dimensions });
      await this.#store.init();
      if (this.#mode === 'hybrid') this.#store.initSparse();
    }
    return this.#store;
  }

  /**
   * Ensure the embedder is loaded and its width matches the index. Guards
   * against searching/extending an index with a model of a different
   * dimensionality than the one it was built with.
   * @returns {Promise<void>}
   * @throws {Error} If the embedder's width differs from the store's.
   */
  async #assertModelMatchesStore() {
    await this.#embedder.init();
    if (this.#store && this.#embedder.dimensions !== this.#store.dimensions) {
      throw new Error(
        `Model mismatch: this index holds ${this.#store.dimensions}-dim vectors, ` +
        `but model "${this.#model}" produces ${this.#embedder.dimensions}-dim vectors. ` +
        `Use the model the index was built with.`
      );
    }
  }

  /**
   * Index arbitrary data. Items may be strings, plain JSON objects, or anything
   * else — the `text`/`metadata` extractors (with smart defaults) decide what to
   * embed and what to return with hits, so raw objects work with zero config.
   *
   * BM25 is fit on the **first** call (its global df/idf statistics need a
   * corpus), then frozen. Subsequent calls — including adding documents to an
   * instance restored via {@link Vecito.load}/{@link Vecito.loadFromBytes} —
   * append documents scored against that existing model, keeping the index
   * consistent. Dense (semantic) search covers new documents fully; sparse
   * scoring only sees terms already in the frozen vocabulary. For best lexical
   * recall, pass your whole corpus in the first call.
   * @param {any|any[]} items Item(s) to index. An empty array is a no-op.
   * @param {AddOptions} [opts]
   * @returns {Promise<this>}
   * @throws {Error} If the index already exists and the embedder's width does
   *   not match it.
   */
  async addDocuments(items, { text = defaultText, metadata = defaultMetadata } = {}) {
    const list = Array.isArray(items) ? items : [items];
    if (list.length === 0) return this;

    const extending = Boolean(this.#store);
    const store = await this.#ensureStore();
    // A pre-existing (e.g. restored) index may have been built with a model of
    // a different width; refuse to append mismatched vectors to it.
    if (extending) await this.#assertModelMatchesStore();

    // Call the extractor with the item only, matching the AddOptions contract
    // and the way `metadata` is invoked below.
    const texts = list.map((item) => text(item));

    // Fit BM25 only the first time (hybrid mode only); later calls score new
    // documents against the existing frozen model so ids/vocab stay aligned.
    if (this.#mode === 'hybrid' && !this.#fitted) {
      this.#bm25.fit(texts);
      this.#fitted = true;
    }

    for (let i = 0; i < list.length; i++) {
      const docText = texts[i];
      const vector = await this.#embedder.embed(docText);
      store.insert(vector, metadata(list[i]));

      if (this.#mode === 'hybrid') {
        // Every dense insert gets exactly one sparse insert so both sides keep
        // the same ids; documents with no in-vocab terms get a placeholder.
        const sparse = this.#bm25.score(docText);
        if (sparse.indices.length > 0) {
          store.insertSparse(sparse);
        } else {
          store.insertSparse(placeholderSparse(this.#bm25.vocabSize));
        }
      }
    }
    return this;
  }

  /**
   * Search the index.
   * @param {string} query Natural-language query.
   * @param {object} [opts]
   * @param {'hybrid'|'dense'|'sparse'} [opts.mode='hybrid'] Ranking strategy.
   *   'hybrid' fuses dense + BM25 via RRF; 'dense' is vectors-only; 'sparse' is
   *   BM25-weighted. Falls back to dense if the query has no in-vocab terms.
   *   Ignored (always dense) when the index itself was built in dense mode.
   * @param {number} [opts.top=10] Maximum number of results.
   * @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate
   *   over metadata — post-filters results in any mode, over-fetching (5× `top`)
   *   to preserve the requested count.
   * @param {boolean} [opts.matchedTerms=false] When true, every hit carries a
   *   `matchedTerms` array describing the query: the query's in-vocabulary BM25
   *   terms when it produced a sparse vector, otherwise the tokens returned by
   *   `Highlighter.tokenize(query)`. The same array is attached to each hit.
   * @returns {Promise<SearchResult[]>}
   * @throws {Error} If nothing has been indexed or loaded yet, or if the
   *   embedder's width does not match the index.
   */
  async search(query, { mode = 'hybrid', top = 10, filter, matchedTerms: includeTerms = false } = {}) {
    if (!this.#store) throw new Error('Vecito: nothing indexed — call addDocuments() or load() first');
    await this.#assertModelMatchesStore();

    // Dense-only snapshots have no sparse data — force dense regardless of request.
    const effectiveMode = this.#mode === 'dense' ? 'dense' : mode;

    const queryVec = await this.#embedder.embed(query);

    let results;
    let querySparse = null;

    // Over-fetch when post-filtering so enough hits survive the predicate.
    const fetchK = filter ? top * 5 : top;

    if (effectiveMode === 'dense') {
      results = await this.#store.search(queryVec, fetchK);
    } else {
      querySparse = this.#bm25.querySparse(query);
      const hasSparse = querySparse.indices.length > 0;

      if (!hasSparse) {
        // No query term is in the BM25 vocabulary: only the dense side can rank.
        results = await this.#store.search(queryVec, fetchK);
      } else if (effectiveMode === 'sparse') {
        results = this.#store.hybridSearch(queryVec, querySparse, fetchK, {
          fusion: { type: 'linear', alpha: 0.0 },
        });
      } else {
        results = this.#store.hybridSearch(queryVec, querySparse, fetchK);
      }
    }

    if (filter) results = results.filter((r) => filter(r.metadata)).slice(0, top);

    if (includeTerms) {
      const terms = querySparse && querySparse.indices.length > 0
        ? this.#bm25.termsForIndices(querySparse.indices)
        : Highlighter.tokenize(query);
      results = results.map((r) => ({ ...r, matchedTerms: terms }));
    }

    return results;
  }

  /**
   * Number of indexed documents.
   * @returns {number}
   */
  get count() {
    return this.#store ? this.#store.count : 0;
  }

  /**
   * The embedding model id this instance uses.
   * @returns {string}
   */
  get model() {
    return this.#model;
  }

  /**
   * Weight precision the embedder loads (e.g. `'q8'`, `'fp32'`).
   * @returns {string}
   */
  get dtype() {
    return this.#embedder.dtype;
  }

  /**
   * Width of the dense vectors in the index, or null before anything is indexed.
   * @returns {number|null}
   */
  get dimensions() {
    return this.#store ? this.#store.dimensions : null;
  }

  /**
   * Index mode this instance was built with (`'hybrid'` or `'dense'`).
   * @returns {'hybrid'|'dense'}
   */
  get indexMode() {
    return this.#mode;
  }

  // --- Persistence (universal, single self-contained blob) ---

  /**
   * Serialize everything (vectors + metadata + sparse + BM25 + model name +
   * dtype) into one Uint8Array. Layout: [uint32 metaLen, little-endian]
   * [meta JSON, UTF-8][VecStore bytes].
   * @returns {Uint8Array}
   * @throws {Error} If nothing has been indexed yet.
   */
  exportBytes() {
    if (!this.#store) throw new Error('Vecito: nothing to export — call addDocuments() first');
    const storeBytes = this.#store.exportBytes();
    const meta = {
      model: this.#model,
      dtype: this.#embedder.dtype,
      mode: this.#mode,
      bm25: this.#mode === 'hybrid' && this.#fitted ? this.#bm25.toJSON() : null,
    };
    const metaBytes = new TextEncoder().encode(JSON.stringify(meta));

    const out = new Uint8Array(4 + metaBytes.length + storeBytes.length);
    new DataView(out.buffer).setUint32(0, metaBytes.length, true);
    out.set(metaBytes, 4);
    out.set(storeBytes, 4 + metaBytes.length);
    return out;
  }

  /**
   * Parse a container blob produced by {@link Vecito#exportBytes} back into a
   * ready-to-search instance. The header is validated up front so that a
   * truncated download or an unrelated file fails with a descriptive error
   * rather than a low-level RangeError/SyntaxError.
   * @param {Uint8Array} bytes
   * @returns {Promise<Vecito>}
   * @throws {Error} If the blob is too short, its declared metadata length
   *   exceeds the available bytes, or the metadata is not a JSON object.
   */
  static async #fromContainer(bytes) {
    const HEADER_BYTES = 4;
    if (bytes.byteLength < HEADER_BYTES) {
      throw new Error(
        `Vecito: invalid snapshot — too short to contain a header (${bytes.byteLength} bytes)`
      );
    }

    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    const metaLen = view.getUint32(0, true);
    const available = bytes.byteLength - HEADER_BYTES;
    if (metaLen > available) {
      throw new Error(
        `Vecito: invalid snapshot — header declares ${metaLen} metadata bytes ` +
        `but only ${available} are available`
      );
    }

    let meta;
    try {
      meta = JSON.parse(new TextDecoder().decode(bytes.subarray(HEADER_BYTES, HEADER_BYTES + metaLen)));
    } catch (err) {
      throw new Error('Vecito: invalid snapshot — metadata is not valid JSON', { cause: err });
    }
    if (meta === null || typeof meta !== 'object') {
      throw new Error('Vecito: invalid snapshot — metadata is not a JSON object');
    }
    const storeBytes = bytes.subarray(HEADER_BYTES + metaLen);

    // Snapshots without a recorded mode are treated as hybrid.
    const v = new Vecito({ model: meta.model, dtype: meta.dtype, mode: meta.mode ?? 'hybrid' });
    if (meta.bm25) {
      v.#bm25 = BM25.fromJSON(meta.bm25);
      v.#fitted = true;
    }
    v.#store = await VecStore.loadFromBytes(storeBytes);
    return v;
  }

  /**
   * Rebuild from bytes produced by {@link Vecito#exportBytes} (universal).
   * Other typed-array/DataView inputs are reinterpreted as raw bytes over the
   * same memory (not element-converted); ArrayBuffers are wrapped.
   * @param {Uint8Array|ArrayBuffer|ArrayBufferView} bytes
   * @returns {Promise<Vecito>}
   * @throws {Error} If the bytes are not a valid snapshot container.
   */
  static async loadFromBytes(bytes) {
    let raw;
    if (bytes instanceof Uint8Array) {
      raw = bytes;
    } else if (ArrayBuffer.isView(bytes)) {
      raw = new Uint8Array(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    } else {
      raw = new Uint8Array(bytes);
    }
    return Vecito.#fromContainer(raw);
  }

  /**
   * Fetch an exported blob over HTTP and rebuild it (browser + Node fetch).
   * @param {string} url
   * @returns {Promise<Vecito>}
   * @throws {Error} If the response status is not OK, or the body is not a
   *   valid snapshot container.
   */
  static async loadFromUrl(url) {
    const res = await fetch(url);
    // Without this check an error page body would be parsed as a snapshot.
    if (!res.ok) {
      throw new Error(
        `Vecito: failed to fetch snapshot from ${url} (HTTP ${res.status} ${res.statusText})`
      );
    }
    const buf = await res.arrayBuffer();
    return Vecito.loadFromBytes(new Uint8Array(buf));
  }

  // --- Node-only file persistence ---

  /**
   * Write the exported blob to a file. Node only.
   * @param {string} path
   * @returns {Promise<void>}
   * @throws {Error} In non-Node environments, or if nothing has been indexed.
   */
  async save(path) {
    if (!isNode) throw new Error('Vecito.save() is Node-only; use exportBytes()/persist() in the browser');
    // Imported lazily so the module still loads where `fs` does not exist.
    const { writeFileSync } = await import('fs');
    writeFileSync(path, this.exportBytes());
  }

  /**
   * Read an exported blob from a file. Node only.
   * @param {string} path
   * @returns {Promise<Vecito>}
   * @throws {Error} In non-Node environments, or if the file is not a valid
   *   snapshot container.
   */
  static async load(path) {
    if (!isNode) throw new Error('Vecito.load() is Node-only; use loadFromBytes()/loadFromUrl() in the browser');
    // Imported lazily so the module still loads where `fs` does not exist.
    const { readFileSync } = await import('fs');
    const buf = readFileSync(path);
    // View the Buffer's bytes in place rather than copying them.
    return Vecito.loadFromBytes(new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength));
  }
}
package/package.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "name": "vecito",
3
+ "version": "0.1.0",
4
+ "description": "Tiny hybrid (dense + BM25) semantic search for Node and the browser",
5
+ "type": "module",
6
+ "author": "Jeka Kiselyov",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "git+https://github.com/jeka-kiselyov/vecito.git"
10
+ },
11
+ "homepage": "https://github.com/jeka-kiselyov/vecito#readme",
12
+ "bugs": {
13
+ "url": "https://github.com/jeka-kiselyov/vecito/issues"
14
+ },
15
+ "main": "index.js",
16
+ "types": "index.d.ts",
17
+ "publishConfig": {
18
+ "access": "public"
19
+ },
20
+ "bin": {
21
+ "vecito": "bin/cli.js"
22
+ },
23
+ "exports": {
24
+ ".": {
25
+ "types": "./index.d.ts",
26
+ "default": "./index.js"
27
+ },
28
+ "./file": {
29
+ "types": "./file.d.ts",
30
+ "default": "./lib/file-index.js"
31
+ }
32
+ },
33
+ "files": [
34
+ "index.js",
35
+ "index.d.ts",
36
+ "file.d.ts",
37
+ "lib/",
38
+ "bin/"
39
+ ],
40
+ "scripts": {
41
+ "test": "vitest run",
42
+ "dev:browser": "vite"
43
+ },
44
+ "keywords": [
45
+ "semantic-search",
46
+ "vector-search",
47
+ "embeddings",
48
+ "bm25",
49
+ "hybrid-search",
50
+ "rag"
51
+ ],
52
+ "dependencies": {
53
+ "@huggingface/transformers": "^3.5.1",
54
+ "altor-vec": "^0.1.3",
55
+ "stemmer": "^2.0.1"
56
+ },
57
+ "devDependencies": {
58
+ "vite": "^6.0.0",
59
+ "vitest": "^3.2.0"
60
+ },
61
+ "license": "MIT"
62
+ }