npm - vecito - Versions diffs - 0.1.0 - Mend

vecito 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/lib/vecito.js ADDED Viewed

@@ -0,0 +1,352 @@
+import { Embedder } from './embedder.js';
+import { BM25 } from './bm25.js';
+import { VecStore } from './vec-store.js';
+import { defaultText, defaultMetadata } from './extract.js';
+import { Highlighter } from './highlight.js';
+const isNode = typeof globalThis.process?.versions?.node === 'string';
+/**
+ * Build the empty-sparse placeholder kept in sync with the index-build path —
+ * edgevec requires a non-empty sparse vector to keep dense/sparse ids aligned.
+ * @param {number} dim Vocabulary size.
+ * @returns {{indices: Uint32Array, values: Float32Array, dim: number}}
+ */
+function placeholderSparse(dim) {
+	return { indices: new Uint32Array([0]), values: new Float32Array([1e-10]), dim: Math.max(dim, 1) };
+}
+/**
+ * @typedef {object} AddOptions
+ * @property {(item: any) => string} [text] Extract the searchable text from an item.
+ *   Defaults to {@link defaultText} (strings pass through, objects are flattened
+ *   to their string leaves).
+ * @property {(item: any) => Record<string, any>} [metadata] Extract metadata
+ *   returned with hits. Defaults to {@link defaultMetadata} (objects are kept
+ *   as-is, so hits carry the original data back).
+ */
+/**
+ * @typedef {object} SearchResult
+ * @property {number} [id] Internal vector id.
+ * @property {number} [score] Relevance score (interpretation depends on mode).
+ * @property {number} [dense_rank] Rank on the dense side (hybrid mode).
+ * @property {number} [sparse_rank] Rank on the sparse side (hybrid mode).
+ * @property {Record<string, any>} metadata The document's metadata.
+ */
+/**
+ * Vecito — isomorphic (Node + browser) hybrid semantic search.
+ *
+ * Orchestrates an Embedder (dense), BM25 (sparse) and a VecStore (edgevec) so
+ * callers get a one-liner instead of wiring the three primitives by hand.
+ * Universal methods (addDocuments / search / exportBytes / loadFromBytes /
+ * loadFromUrl) run anywhere; save/load are Node-only.
+ */
+export class Vecito {
+	#embedder;
+	#bm25;
+	#store;
+	#model;
+	#fitted;
+	#mode; // 'hybrid' | 'dense'
+	/**
+	 * @param {object} [opts]
+	 * @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Embedding model id.
+	 * @param {string} [opts.dtype='q8'] Weight precision (`'q8'` ≈ 4× smaller
+	 *   download by default, `'fp32'` for full precision).
+	 * @param {Embedder} [opts.embedder] A pre-built (optionally pre-loaded)
+	 *   {@link Embedder} to use instead of constructing one — handy for reusing a
+	 *   loaded model across indexes or for timing model load separately. Takes
+	 *   precedence over `model`/`dtype`.
+	 * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode. `'hybrid'` stores
+	 *   both dense vectors and BM25 sparse data; `'dense'` omits sparse/BM25 for a
+	 *   smaller snapshot. The mode is embedded in the snapshot and respected on load.
+	 * @param {number} [opts.k1] BM25 term-frequency saturation (hybrid mode only).
+	 * @param {number} [opts.b] BM25 length-normalization factor (hybrid mode only).
+	 */
+	constructor({ model, dtype, embedder, mode, k1, b } = {}) {
+		if (embedder) {
+			this.#embedder = embedder;
+			this.#model = embedder.model;
+		} else {
+			this.#model = model || 'Xenova/all-MiniLM-L6-v2';
+			this.#embedder = new Embedder({ model: this.#model, dtype });
+		}
+		this.#mode = mode === 'dense' ? 'dense' : 'hybrid';
+		this.#bm25 = this.#mode === 'hybrid' ? new BM25({ k1, b }) : null;
+		this.#store = null;
+		this.#fitted = false;
+	}
+	/**
+	 * Lazily create and initialize the backing vector store (with sparse storage
+	 * enabled) on first use.
+	 * @returns {Promise<VecStore>}
+	 */
+	async #ensureStore() {
+		if (!this.#store) {
+			// Load the model first so the store is sized to its real width.
+			await this.#embedder.init();
+			this.#store = new VecStore({ dimensions: this.#embedder.dimensions });
+			await this.#store.init();
+			if (this.#mode === 'hybrid') this.#store.initSparse();
+		}
+		return this.#store;
+	}
+	/**
+	 * Ensure the embedder is loaded and its width matches the index. Guards
+	 * against searching/extending an index with a model of a different
+	 * dimensionality than the one it was built with.
+	 * @returns {Promise<void>}
+	 */
+	async #assertModelMatchesStore() {
+		await this.#embedder.init();
+		if (this.#store && this.#embedder.dimensions !== this.#store.dimensions) {
+			throw new Error(
+				`Model mismatch: this index holds ${this.#store.dimensions}-dim vectors, ` +
+				`but model "${this.#model}" produces ${this.#embedder.dimensions}-dim vectors. ` +
+				`Use the model the index was built with.`
+			);
+		}
+	}
+	/**
+	 * Index arbitrary data. Items may be strings, plain JSON objects, or anything
+	 * else — the `text`/`metadata` extractors (with smart defaults) decide what to
+	 * embed and what to return with hits, so raw objects work with zero config.
+	 *
+	 * BM25 is fit on the **first** call (its global df/idf statistics need a
+	 * corpus), then frozen. Subsequent calls — including adding documents to an
+	 * instance restored via {@link Vecito.load}/{@link Vecito.loadFromBytes} —
+	 * append documents scored against that existing model, keeping the index
+	 * consistent. Dense (semantic) search covers new documents fully; sparse
+	 * scoring only sees terms already in the frozen vocabulary. For best lexical
+	 * recall, pass your whole corpus in the first call.
+	 * @param {any|any[]} items Item(s) to index.
+	 * @param {AddOptions} [opts]
+	 * @returns {Promise<this>}
+	 */
+	async addDocuments(items, { text = defaultText, metadata = defaultMetadata } = {}) {
+		const list = Array.isArray(items) ? items : [items];
+		if (list.length === 0) return this;
+		const store = await this.#ensureStore();
+		const texts = list.map(text);
+		// Fit BM25 only the first time (hybrid mode only); later calls score new
+		// documents against the existing frozen model so ids/vocab stay aligned.
+		if (this.#mode === 'hybrid' && !this.#fitted) {
+			this.#bm25.fit(texts);
+			this.#fitted = true;
+		}
+		for (let i = 0; i < list.length; i++) {
+			const docText = texts[i];
+			const vector = await this.#embedder.embed(docText);
+			store.insert(vector, metadata(list[i]));
+			if (this.#mode === 'hybrid') {
+				const sparse = this.#bm25.score(docText);
+				if (sparse.indices.length > 0) {
+					store.insertSparse(sparse);
+				} else {
+					store.insertSparse(placeholderSparse(this.#bm25.vocabSize));
+				}
+			}
+		}
+		return this;
+	}
+	/**
+	 * Search the index.
+	 * @param {string} query Natural-language query.
+	 * @param {object} [opts]
+	 * @param {'hybrid'|'dense'|'sparse'} [opts.mode='hybrid'] Ranking strategy.
+	 *   'hybrid' fuses dense + BM25 via RRF; 'dense' is vectors-only; 'sparse' is
+	 *   BM25-weighted. Falls back to dense if the query has no in-vocab terms.
+	 * @param {number} [opts.top=10] Maximum number of results.
+	 * @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode, over-fetching to preserve the requested count.
+	 * @returns {Promise<SearchResult[]>}
+	 * @throws {Error} If nothing has been indexed or loaded yet.
+	 */
+	async search(query, { mode = 'hybrid', top = 10, filter, matchedTerms: includeTerms = false } = {}) {
+		if (!this.#store) throw new Error('Vecito: nothing indexed — call addDocuments() or load() first');
+		await this.#assertModelMatchesStore();
+		// Dense-only snapshots have no sparse data — force dense regardless of request.
+		const effectiveMode = this.#mode === 'dense' ? 'dense' : mode;
+		const queryVec = await this.#embedder.embed(query);
+		let results;
+		let querySparse = null;
+		const fetchK = filter ? top * 5 : top;
+		if (effectiveMode === 'dense') {
+			results = await this.#store.search(queryVec, fetchK);
+		} else {
+			querySparse = this.#bm25.querySparse(query);
+			const hasSparse = querySparse.indices.length > 0;
+			if (!hasSparse) {
+				results = await this.#store.search(queryVec, fetchK);
+			} else if (effectiveMode === 'sparse') {
+				results = this.#store.hybridSearch(queryVec, querySparse, fetchK, {
+					fusion: { type: 'linear', alpha: 0.0 },
+				});
+			} else {
+				results = this.#store.hybridSearch(queryVec, querySparse, fetchK);
+			}
+		}
+		if (filter) results = results.filter(r => filter(r.metadata)).slice(0, top);
+		if (includeTerms) {
+			const terms = querySparse && querySparse.indices.length > 0
+				? this.#bm25.termsForIndices(querySparse.indices)
+				: Highlighter.tokenize(query);
+			results = results.map(r => ({ ...r, matchedTerms: terms }));
+		}
+		return results;
+	}
+	/**
+	 * Number of indexed documents.
+	 * @returns {number}
+	 */
+	get count() {
+		return this.#store ? this.#store.count : 0;
+	}
+	/**
+	 * The embedding model id this instance uses.
+	 * @returns {string}
+	 */
+	get model() {
+		return this.#model;
+	}
+	/**
+	 * Weight precision the embedder loads (e.g. `'q8'`, `'fp32'`).
+	 * @returns {string}
+	 */
+	get dtype() {
+		return this.#embedder.dtype;
+	}
+	/**
+	 * Width of the dense vectors in the index, or null before anything is indexed.
+	 * @returns {number|null}
+	 */
+	get dimensions() {
+		return this.#store ? this.#store.dimensions : null;
+	}
+	/**
+	 * Index mode this instance was built with (`'hybrid'` or `'dense'`).
+	 * @returns {'hybrid'|'dense'}
+	 */
+	get indexMode() {
+		return this.#mode;
+	}
+	// --- Persistence (universal, single self-contained blob) ---
+	/**
+	 * Serialize everything (vectors + metadata + sparse + BM25 + model name +
+	 * dtype) into one Uint8Array. Layout: [uint32 metaLen][meta JSON][VecStore
+	 * bytes].
+	 * @returns {Uint8Array}
+	 * @throws {Error} If nothing has been indexed yet.
+	 */
+	exportBytes() {
+		if (!this.#store) throw new Error('Vecito: nothing to export — call addDocuments() first');
+		const storeBytes = this.#store.exportBytes();
+		const meta = {
+			model: this.#model,
+			dtype: this.#embedder.dtype,
+			mode: this.#mode,
+			bm25: this.#mode === 'hybrid' && this.#fitted ? this.#bm25.toJSON() : null,
+		};
+		const metaBytes = new TextEncoder().encode(JSON.stringify(meta));
+		const out = new Uint8Array(4 + metaBytes.length + storeBytes.length);
+		new DataView(out.buffer).setUint32(0, metaBytes.length, true);
+		out.set(metaBytes, 4);
+		out.set(storeBytes, 4 + metaBytes.length);
+		return out;
+	}
+	/**
+	 * Parse a container blob produced by {@link Vecito#exportBytes} back into a
+	 * ready-to-search instance.
+	 * @param {Uint8Array} bytes
+	 * @returns {Promise<Vecito>}
+	 */
+	static async #fromContainer(bytes) {
+		const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+		const metaLen = view.getUint32(0, true);
+		const meta = JSON.parse(new TextDecoder().decode(bytes.subarray(4, 4 + metaLen)));
+		const storeBytes = bytes.subarray(4 + metaLen);
+		const v = new Vecito({ model: meta.model, dtype: meta.dtype, mode: meta.mode ?? 'hybrid' });
+		if (meta.bm25) {
+			v.#bm25 = BM25.fromJSON(meta.bm25);
+			v.#fitted = true;
+		}
+		v.#store = await VecStore.loadFromBytes(storeBytes);
+		return v;
+	}
+	/**
+	 * Rebuild from bytes produced by {@link Vecito#exportBytes} (universal).
+	 * @param {Uint8Array|ArrayBuffer} bytes
+	 * @returns {Promise<Vecito>}
+	 */
+	static async loadFromBytes(bytes) {
+		return Vecito.#fromContainer(bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes));
+	}
+	/**
+	 * Fetch an exported blob over HTTP and rebuild it (browser + Node fetch).
+	 * @param {string} url
+	 * @returns {Promise<Vecito>}
+	 */
+	static async loadFromUrl(url) {
+		const res = await fetch(url);
+		const buf = await res.arrayBuffer();
+		return Vecito.loadFromBytes(new Uint8Array(buf));
+	}
+	// --- Node-only file persistence ---
+	/**
+	 * Write the exported blob to a file. Node only.
+	 * @param {string} path
+	 * @returns {Promise<void>}
+	 * @throws {Error} In non-Node environments.
+	 */
+	async save(path) {
+		if (!isNode) throw new Error('Vecito.save() is Node-only; use exportBytes()/persist() in the browser');
+		const { writeFileSync } = await import('fs');
+		writeFileSync(path, this.exportBytes());
+	}
+	/**
+	 * Read an exported blob from a file. Node only.
+	 * @param {string} path
+	 * @returns {Promise<Vecito>}
+	 * @throws {Error} In non-Node environments.
+	 */
+	static async load(path) {
+		if (!isNode) throw new Error('Vecito.load() is Node-only; use loadFromBytes()/loadFromUrl() in the browser');
+		const { readFileSync } = await import('fs');
+		const buf = readFileSync(path);
+		return Vecito.loadFromBytes(new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength));
+	}
+}

package/package.json ADDED Viewed

@@ -0,0 +1,62 @@
+{
+	"name": "vecito",
+	"version": "0.1.0",
+	"description": "Tiny hybrid (dense + BM25) semantic search for Node and the browser",
+	"type": "module",
+	"author": "Jeka Kiselyov",
+	"repository": {
+		"type": "git",
+		"url": "git+https://github.com/jeka-kiselyov/vecito.git"
+	},
+	"homepage": "https://github.com/jeka-kiselyov/vecito#readme",
+	"bugs": {
+		"url": "https://github.com/jeka-kiselyov/vecito/issues"
+	},
+	"main": "index.js",
+	"types": "index.d.ts",
+	"publishConfig": {
+		"access": "public"
+	},
+	"bin": {
+		"vecito": "bin/cli.js"
+	},
+	"exports": {
+		".": {
+			"types": "./index.d.ts",
+			"default": "./index.js"
+		},
+		"./file": {
+			"types": "./file.d.ts",
+			"default": "./lib/file-index.js"
+		}
+	},
+	"files": [
+		"index.js",
+		"index.d.ts",
+		"file.d.ts",
+		"lib/",
+		"bin/"
+	],
+	"scripts": {
+		"test": "vitest run",
+		"dev:browser": "vite"
+	},
+	"keywords": [
+		"semantic-search",
+		"vector-search",
+		"embeddings",
+		"bm25",
+		"hybrid-search",
+		"rag"
+	],
+	"dependencies": {
+		"@huggingface/transformers": "^3.5.1",
+		"altor-vec": "^0.1.3",
+		"stemmer": "^2.0.1"
+	},
+	"devDependencies": {
+		"vite": "^6.0.0",
+		"vitest": "^3.2.0"
+	},
+	"license": "MIT"
+}