npm - vecito - Versions diffs - 0.1.0 - Mend

vecito 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/lib/highlight.js ADDED Viewed

@@ -0,0 +1,166 @@
+import { stemmer } from 'stemmer';
+const STOPWORDS = new Set([ // lowercase words skipped by stemSet and Highlighter.tokenize, and treated as bridgeable gap words by Highlighter.highlight
+	'a','an','the','in','on','at','to','for','of','and','or','is','are','was',
+	'were','it','its','by','be','as','do','how','what','which','with','that',
+	'this','from','have','has','had','not','but','can','will','just','about',
+	'than','also','more','some','such','into','over','after','they','them',
+	'their','would','could','should','been','being','our','your','his','her',
+	'we','you','he','she','did','does','may','might','must','shall','get',
+	'got','let','few','all','any','each','both','very','only','even','own',
+]);
+/**
+ * Stem a lowercase word. Short or stop words are returned as-is so they can
+ * still be excluded by the caller's own stopword check.
+ * @param {string} word Already lowercase.
+ * @returns {string}
+ */
+function stem(word) {
+	return word.length >= 4 ? stemmer(word) : word;
+}
+/**
+ * Build a Set of stems from an array of query terms, excluding stopwords and
+ * tokens shorter than 3 characters.
+ * @param {string[]|Set<string>} terms
+ * @returns {Set<string>}
+ */
+function stemSet(terms) {
+	const out = new Set();
+	for (const t of terms) {
+		const lo = t.toLowerCase();
+		if (lo.length >= 3 && !STOPWORDS.has(lo)) out.add(stem(lo));
+	}
+	return out;
+}
+/**
+ * Text highlighting utilities for search result display.
+ *
+ * {@link Highlighter.highlight} wraps matched terms in `<mark>` tags using
+ * stem-aware matching: "running" matches the term "run", "adventure" matches
+ * "adventurous", etc. {@link Highlighter.snippet} extracts a relevant excerpt
+ * with the same stem-aware center-finding.
+ */
+export class Highlighter {
+	/**
+	 * Escape HTML special characters in a plain-text string.
+	 * @param {string} s
+	 * @returns {string}
+	 */
+	static escape(s) {
+		return s
+			.replace(/&/g, '&amp;')
+			.replace(/</g, '&lt;')
+			.replace(/>/g, '&gt;')
+			.replace(/"/g, '&quot;');
+	}
+	/**
+	 * Tokenize a query string for use as highlight terms. Splits on non-word
+	 * characters, lowercases, drops stopwords and tokens shorter than 3 chars.
+	 * Used as a fallback when BM25 matched terms are not available (dense mode).
+	 * @param {string} text
+	 * @returns {string[]} Unique tokens, longest first.
+	 */
+	static tokenize(text) {
+		const seen = new Set();
+		const tokens = text
+			.toLowerCase()
+			.replace(/[^a-z0-9\-_]+/g, ' ')
+			.split(/\s+/)
+			.filter(t => t.length >= 3 && !STOPWORDS.has(t) && !seen.has(t) && seen.add(t));
+		return tokens.sort((a, b) => b.length - a.length);
+	}
+	/**
+	 * Wrap occurrences of `terms` in `text` with `<mark>` tags, HTML-escaping
+	 * everything else. Matching is stem-aware and case-insensitive: the term
+	 * "run" will match "running", "runs", "ran"; "adventure" matches "adventures".
+	 * Gaps between matched words are bridged into one `<mark>` span when every
+	 * word in the gap is a stopword (e.g. "Tales of Mystery" → single highlight).
+	 * @param {string} text Plain text to highlight.
+	 * @param {string[]|Set<string>} terms Terms to highlight.
+	 * @returns {string} HTML string with `<mark>…</mark>` around matches.
+	 */
+	static highlight(text, terms) {
+		const stems = stemSet(terms);
+		if (!stems.size) return Highlighter.escape(text);
+		// Tokenize into segments {raw, isWord, marked}
+		const tokenRe = /([A-Za-z]+)|([^A-Za-z]+)/g;
+		const segs = [];
+		let m;
+		while ((m = tokenRe.exec(text)) !== null) {
+			const raw = m[1] ?? m[2];
+			const isWord = !!m[1];
+			const marked = isWord && stems.has(stem(raw.toLowerCase()));
+			segs.push({ raw, isWord, marked });
+		}
+		// Bridge pass: if the gap between two marked segments contains only
+		// stopwords (and non-word chars like spaces/punctuation), mark it too.
+		let i = 0;
+		while (i < segs.length) {
+			if (!segs[i].marked) { i++; continue; }
+			// Find the next marked segment
+			let j = i + 1;
+			while (j < segs.length && !segs[j].marked) j++;
+			if (j < segs.length) {
+				// Check every word in the gap
+				const gapAllStop = segs.slice(i + 1, j).every(
+					s => !s.isWord || STOPWORDS.has(s.raw.toLowerCase())
+				);
+				if (gapAllStop) {
+					for (let k = i + 1; k < j; k++) segs[k].marked = true;
+				}
+			}
+			i = j;
+		}
+		// Render: merge consecutive marked segments into one <mark> span.
+		const parts = [];
+		let inMark = false;
+		for (const seg of segs) {
+			if (seg.marked && !inMark) { parts.push('<mark>'); inMark = true; }
+			if (!seg.marked && inMark) { parts.push('</mark>'); inMark = false; }
+			parts.push(Highlighter.escape(seg.raw));
+		}
+		if (inMark) parts.push('</mark>');
+		return parts.join('');
+	}
+	/**
+	 * Extract a snippet of at most `maxLen` characters centred on the first
+	 * stem match. Short texts are returned in full. Ellipsis (`…`) is added at
+	 * truncated edges. Pass the result to {@link Highlighter.highlight} for markup.
+	 * @param {string} text Plain text.
+	 * @param {string[]|Set<string>} terms Terms to centre the window on.
+	 * @param {number} [maxLen=220] Maximum character count of the returned snippet.
+	 * @returns {string} Plain-text excerpt.
+	 */
+	static snippet(text, terms, maxLen = 220) {
+		if (text.length <= maxLen) return text;
+		const stems = stemSet(terms);
+		let center = 0;
+		if (stems.size) {
+			// Find the first word whose stem matches any query stem.
+			const wordRe = /[A-Za-z]+/g;
+			let wm;
+			while ((wm = wordRe.exec(text)) !== null) {
+				if (stems.has(stem(wm[0].toLowerCase()))) {
+					center = wm.index;
+					break;
+				}
+			}
+		}
+		const half = Math.floor(maxLen / 2);
+		const start = Math.max(0, Math.min(center - half, text.length - maxLen));
+		const end = Math.min(text.length, start + maxLen);
+		return (start > 0 ? '…' : '') + text.slice(start, end) + (end < text.length ? '…' : '');
+	}
+}

package/lib/vec-store.js ADDED Viewed

@@ -0,0 +1,365 @@
+const isNode = typeof globalThis.process?.versions?.node === 'string'; // true when process.versions.node is a string; selects the Node loading path in ensureWasm
+let _wasmInitialized = false; // set to true at the end of a successful ensureWasm() call
+let _WasmSearchEngine; // WasmSearchEngine class from the altor-vec module; assigned by ensureWasm()
+/**
+ * Initialize the altor-vec WASM module and cache the engine class. Idempotent.
+ * @returns {Promise<void>}
+ */
+async function ensureWasm() {
+	if (_wasmInitialized) return;
+	let mod;
+	if (isNode) {
+		const { readFileSync } = await import('fs');
+		const { fileURLToPath, pathToFileURL } = await import('url');
+		let altorUrl;
+		if (typeof import.meta.resolve === 'function') {
+			altorUrl = import.meta.resolve('altor-vec');
+		} else {
+			const { createRequire } = await import('module');
+			altorUrl = pathToFileURL(createRequire(import.meta.url).resolve('altor-vec')).href;
+		}
+		mod = await import(/* @vite-ignore */ altorUrl);
+		const wasmUrl = new URL('altor_vec_wasm_bg.wasm', altorUrl);
+		mod.initSync({ module: readFileSync(fileURLToPath(wasmUrl)) });
+	} else {
+		mod = await import('altor-vec');
+		await mod.default(); // async WASM fetch
+	}
+	_WasmSearchEngine = mod.WasmSearchEngine;
+	_wasmInitialized = true;
+}
+// HNSW build parameters. ef_construction trades build speed for recall quality.
+const HNSW_M = 16; // third argument to WasmSearchEngine.from_vectors (presumably max links per node — confirm against altor-vec docs)
+const HNSW_EF_CONSTRUCTION = 200; // fourth argument to from_vectors
+const HNSW_EF_SEARCH = 50; // fifth argument to from_vectors
+// RRF fusion constant (standard value).
+const RRF_K = 60; // a result at 1-based rank r contributes 1 / (RRF_K + r) to its fused score
+/**
+ * Dense + sparse vector store backed by altor-vec (WASM HNSW).
+ *
+ * The HNSW graph serializes to bytes via `engine.to_bytes()` and restores in
+ * milliseconds via `new WasmSearchEngine(bytes)` — eliminating the long rebuild
+ * that edgevec required. Sparse (BM25) vectors are stored as plain arrays and
+ * searched with brute-force dot products, which gives 100% recall for ≤1M docs.
+ */
+export class VecStore {
+	#engine;      // WasmSearchEngine — null until first insert or load
+	#dimensions;
+	#count;
+	#metadata;    // plain array of metadata objects, indexed by 0-based insert order
+	#sparseVecs;  // sparse vectors stored alongside dense; index 0 is the alignment placeholder
+	/**
+	 * @param {object} opts
+	 * @param {number} opts.dimensions Dense vector dimensionality.
+	 */
+	constructor({ dimensions }) {
+		this.#dimensions = dimensions;
+		this.#engine = null;
+		this.#count = 0;
+		this.#metadata = [];
+		this.#sparseVecs = null;
+	}
+	/**
+	 * Initialize the WASM module. Must be called before insert / search.
+	 * @returns {Promise<void>}
+	 */
+	async init() {
+		await ensureWasm();
+	}
+	/**
+	 * Insert a dense vector with optional metadata.
+	 * @param {Float32Array|number[]} vector
+	 * @param {Record<string, any>} [metadata]
+	 * @returns {number} 1-based id for backward compatibility.
+	 */
+	insert(vector, metadata) {
+		const vec = vector instanceof Float32Array ? vector : new Float32Array(vector);
+		if (!this.#engine) {
+			// from_vectors requires ≥1 vector; subsequent adds use add_vectors
+			this.#engine = _WasmSearchEngine.from_vectors(
+				vec, this.#dimensions, HNSW_M, HNSW_EF_CONSTRUCTION, HNSW_EF_SEARCH
+			);
+		} else {
+			this.#engine.add_vectors(vec, this.#dimensions);
+		}
+		this.#metadata.push(metadata || {});
+		this.#count++;
+		return this.#count; // 1-based
+	}
+	/**
+	 * Enable sparse storage. Inserts a placeholder at index 0 so sparse ids
+	 * (0-based) align with dense ids (1-based).
+	 */
+	initSparse() {
+		this.#sparseVecs = [{ indices: [0], values: [1e-10], dim: 1 }];
+	}
+	/**
+	 * Insert a sparse BM25 vector, paired positionally with the prior dense insert.
+	 * @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
+	 */
+	insertSparse(sparse) {
+		if (this.#sparseVecs) {
+			this.#sparseVecs.push({
+				indices: Array.from(sparse.indices),
+				values: Array.from(sparse.values),
+				dim: sparse.dim,
+			});
+		}
+	}
+	/**
+	 * Nearest-neighbor dense search.
+	 * @param {Float32Array} query
+	 * @param {number} [k=10]
+	 * @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
+	 */
+	async search(query, k = 10) {
+		return this.#denseScan(query, k);
+	}
+	/**
+	 * Dense search with a JS predicate post-filter. Fetches `k * 5` candidates
+	 * from the HNSW, then applies the filter to metadata objects.
+	 * @param {Float32Array} query
+	 * @param {(meta: Record<string,any>) => boolean} filter JS predicate.
+	 * @param {number} [k=10]
+	 * @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
+	 */
+	async searchWithFilter(query, filter, k = 10) {
+		const candidates = this.#denseScan(query, k * 5);
+		if (typeof filter !== 'function') return candidates.slice(0, k);
+		return candidates.filter(r => filter(r.metadata)).slice(0, k);
+	}
+	/**
+	 * Hybrid dense + sparse search with RRF rank fusion.
+	 * Dense side uses the HNSW; sparse side uses brute-force dot products.
+	 * @param {Float32Array} denseQuery
+	 * @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
+	 * @param {number} [k=10]
+	 * @param {object} [opts]
+	 * @param {number} [opts.dense_k]
+	 * @param {number} [opts.sparse_k]
+	 * @param {string|{type:string,alpha:number}} [opts.fusion='rrf']
+	 * @returns {Array<{id:number,score:number,dense_rank?:number,sparse_rank?:number,metadata:Record<string,any>}>}
+	 */
+	hybridSearch(denseQuery, sparse, k = 10, { dense_k, sparse_k, fusion } = {}) {
+		const dk = dense_k ?? k * 3;
+		const sk = sparse_k ?? k * 3;
+		const denseResults = this.#denseScan(denseQuery, dk);
+		// Linear alpha fusion: alpha=0 → sparse only, alpha=1 → dense only
+		if (fusion && typeof fusion === 'object' && fusion.type === 'linear') {
+			const alpha = fusion.alpha ?? 0.5;
+			if (alpha === 0) {
+				const sp = this.#sparseScan(sparse, k);
+				return sp.map((r, rank) => ({ ...r, sparse_rank: rank + 1 }));
+			}
+			if (alpha === 1) {
+				return denseResults.slice(0, k).map((r, rank) => ({ ...r, dense_rank: rank + 1 }));
+			}
+		}
+		const sparseResults = this.#sparseScan(sparse, sk);
+		return this.#rrf(denseResults, sparseResults, k);
+	}
+	// ── Private search helpers ───────────────────────────────────────────────
+	/**
+	 * Scan the HNSW for the k nearest neighbors.
+	 * altor-vec returns [[nodeId, distance], ...] with cosine distance.
+	 * We expose score = 1 - distance (higher = more similar).
+	 */
+	#denseScan(query, k) {
+		if (!this.#engine) return [];
+		const q = query instanceof Float32Array ? query : new Float32Array(query);
+		const raw = JSON.parse(this.#engine.search(q, Math.min(k, this.#count)));
+		return raw.map(([nodeId, dist]) => ({
+			id: nodeId + 1,
+			score: 1 - dist,
+			metadata: this.#metadata[nodeId] || {},
+		}));
+	}
+	/**
+	 * Brute-force sparse dot product search over stored BM25 vectors.
+	 * O(docs × avg_nnz) — exact, 100% recall.
+	 */
+	#sparseScan(query, k) {
+		if (!this.#sparseVecs || this.#sparseVecs.length <= 1) return [];
+		const qMap = new Map();
+		for (let t = 0; t < query.indices.length; t++) {
+			qMap.set(query.indices[t], query.values[t]);
+		}
+		const scores = [];
+		// #sparseVecs[0] is the alignment placeholder; docs start at index 1
+		for (let i = 1; i < this.#sparseVecs.length; i++) {
+			const sv = this.#sparseVecs[i];
+			let score = 0;
+			for (let t = 0; t < sv.indices.length; t++) {
+				const qv = qMap.get(sv.indices[t]);
+				if (qv !== undefined) score += qv * sv.values[t];
+			}
+			scores.push({ nodeId: i - 1, score });
+		}
+		scores.sort((a, b) => b.score - a.score);
+		return scores.slice(0, k).map(({ nodeId, score }) => ({
+			id: nodeId + 1,
+			score,
+			metadata: this.#metadata[nodeId] || {},
+		}));
+	}
+	/**
+	 * Reciprocal Rank Fusion of dense and sparse result lists.
+	 */
+	#rrf(dense, sparse, k) {
+		const acc = new Map(); // id → {rrfScore, dense_rank, sparse_rank}
+		dense.forEach(({ id }, rank) => {
+			acc.set(id, { rrfScore: 1 / (RRF_K + rank + 1), dense_rank: rank + 1, sparse_rank: null });
+		});
+		sparse.forEach(({ id }, rank) => {
+			const entry = acc.get(id);
+			const contrib = 1 / (RRF_K + rank + 1);
+			if (entry) {
+				entry.rrfScore += contrib;
+				entry.sparse_rank = rank + 1;
+			} else {
+				acc.set(id, { rrfScore: contrib, dense_rank: null, sparse_rank: rank + 1 });
+			}
+		});
+		return [...acc.entries()]
+			.sort((a, b) => b[1].rrfScore - a[1].rrfScore)
+			.slice(0, k)
+			.map(([id, { rrfScore, dense_rank, sparse_rank }]) => ({
+				id,
+				score: rrfScore,
+				dense_rank,
+				sparse_rank,
+				metadata: this.#metadata[id - 1] || {},
+			}));
+	}
+	// ── Serialization ────────────────────────────────────────────────────────
+	/**
+	 * Serialize to a self-contained Uint8Array.
+	 * Layout: [uint32 hnswLen][HNSW bytes][JSON: {dims, metadata, sparse?}]
+	 * @returns {Uint8Array}
+	 */
+	exportBytes() {
+		if (!this.#engine) throw new Error('VecStore: nothing to export — insert documents first');
+		const hnswBytes = this.#engine.to_bytes();
+		const payload = { dims: this.#dimensions, metadata: this.#metadata };
+		if (this.#sparseVecs) payload.sparse = this.#sparseVecs;
+		const jsonBytes = new TextEncoder().encode(JSON.stringify(payload));
+		const out = new Uint8Array(4 + hnswBytes.length + jsonBytes.length);
+		new DataView(out.buffer).setUint32(0, hnswBytes.length, true);
+		out.set(hnswBytes, 4);
+		out.set(jsonBytes, 4 + hnswBytes.length);
+		return out;
+	}
+	/**
+	 * Write to a file (Node only).
+	 * @param {string} filePath
+	 */
+	async save(filePath) {
+		const { writeFileSync } = await import('fs');
+		writeFileSync(filePath, this.exportBytes());
+	}
+	/** @alias save */
+	async exportToFile(filePath) {
+		await this.save(filePath);
+	}
+	// ── Loaders ──────────────────────────────────────────────────────────────
+	/**
+	 * Load from a file (Node only).
+	 * @param {string} filePath
+	 * @returns {Promise<VecStore>}
+	 */
+	static async load(filePath) {
+		await ensureWasm();
+		const { readFileSync } = await import('fs');
+		const buf = readFileSync(filePath);
+		return VecStore.#fromBytes(new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength));
+	}
+	/**
+	 * Rebuild from bytes produced by {@link VecStore#exportBytes}.
+	 * @param {Uint8Array} bytes
+	 * @returns {Promise<VecStore>}
+	 */
+	static async loadFromBytes(bytes) {
+		await ensureWasm();
+		return VecStore.#fromBytes(bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes));
+	}
+	/**
+	 * Fetch an exported blob over HTTP and rebuild it.
+	 * @param {string} url
+	 * @returns {Promise<VecStore>}
+	 */
+	static async loadFromUrl(url) {
+		const response = await fetch(url);
+		const buf = await response.arrayBuffer();
+		return VecStore.loadFromBytes(new Uint8Array(buf));
+	}
+	/**
+	 * Deserialize bytes produced by exportBytes.
+	 * The HNSW graph is restored directly — no rebuild loop.
+	 */
+	static #fromBytes(bytes) {
+		const hnswLen = new DataView(bytes.buffer, bytes.byteOffset, 4).getUint32(0, true);
+		const hnswBytes = bytes.subarray(4, 4 + hnswLen);
+		const jsonBytes = bytes.subarray(4 + hnswLen);
+		const parsed = JSON.parse(new TextDecoder().decode(jsonBytes));
+		// Support old edgevec format (plain array = metadata only) and new format
+		let dims, metadata, sparseVecs;
+		if (Array.isArray(parsed)) {
+			metadata = parsed;
+			dims = 0;
+		} else {
+			dims = parsed.dims ?? 0;
+			metadata = parsed.metadata;
+			sparseVecs = parsed.sparse ?? null;
+		}
+		const store = new VecStore({ dimensions: dims });
+		store.#engine = new _WasmSearchEngine(hnswBytes);
+		store.#count = store.#engine.len();
+		store.#metadata = metadata;
+		store.#sparseVecs = sparseVecs;
+		return store;
+	}
+	// ── Properties ───────────────────────────────────────────────────────────
+	/** Number of indexed vectors. */
+	get count() { return this.#count; }
+	/** Dimensionality of the dense vectors. */
+	get dimensions() { return this.#dimensions; }
+}