npm - vecito - Versions diffs - 0.1.0 → 0.1.1 - Mend

vecito 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/README.md +41 -4
package/bin/cli.js +36 -10
package/index.js +7 -0
package/lib/vec-store.js +0 -14
package/lib/vecito.js +32 -17
package/package.json +8 -6
package/types/bin/cli.d.ts +2 -0
package/types/index.d.ts +15 -0
package/types/lib/bm25.d.ts +93 -0
package/types/lib/embedder.d.ts +68 -0
package/types/lib/extract.d.ts +25 -0
package/types/lib/file-index.d.ts +59 -0
package/types/lib/highlight.d.ts +45 -0
package/types/lib/vec-store.d.ts +125 -0
package/types/lib/vecito.d.ts +195 -0
package/file.d.ts +0 -27
package/index.d.ts +0 -113

package/README.md CHANGED Viewed

@@ -7,7 +7,7 @@ lexical scoring over an [`altor-vec`](https://github.com/altor-lab/altor-vec) WA
 no API keys.
 **Where to build the index.** Building a snapshot means embedding every document and constructing the HNSW graph — the expensive part. It's usually best to do this once on the server or with the CLI (`vecito index`), then serve the resulting `.vecito` file and load it in the browser with `Vecito.loadFromUrl()`, which restores the pre-built graph in milliseconds. Building, indexing, and adding documents directly in the browser is fully supported too — it just runs that same per-document embedding client-side, which is slow for large corpora.
 ## Install
 ```bash
@@ -52,6 +52,17 @@ await v.addDocuments(rows, {
 (BM25-weighted). All modes support a `filter` predicate to post-filter results by metadata.
 If the query has no in-vocabulary terms, hybrid/sparse automatically fall back to dense.
+```js
+// `filter` is a JS predicate over each hit's metadata — works in any mode.
+// Filtering happens after ranking, so vecito over-fetches and grows the candidate
+// set adaptively (up to the whole index) to still return `top` matches when they exist.
+const hits = await v.search('how do plants make food?', {
+  mode: 'hybrid',
+  top: 3,
+  filter: m => m.title === 'Botany',
+});
+```
 ### Options & models
 ```js
@@ -69,6 +80,26 @@ later (including to a loaded snapshot — see below). Dense search covers new do
 sparse scoring only sees terms already in the frozen vocabulary, so pass your whole corpus up
 front for best lexical recall.
+### Highlighting
+Pass `matchedTerms: true` to get the query terms each hit matched, then render an excerpt with
+the exported `Highlighter`. `snippet()` extracts a relevant window centred on the first match;
+`highlight()` wraps matches in `<mark>` tags. Both are stem-aware (the term `run` matches
+`running`/`ran`) and case-insensitive, and `highlight()` HTML-escapes everything else.
+```js
+import { Vecito, Highlighter } from 'vecito';
+const hits = await v.search('how do plants make food?', { matchedTerms: true });
+for (const h of hits) {
+  const excerpt = Highlighter.snippet(h.metadata.body, h.matchedTerms); // plain-text window (≤220 chars)
+  const html = Highlighter.highlight(excerpt, h.matchedTerms);          // '…<mark>Photosynthesis</mark>…'
+}
+```
+In dense mode (no BM25 terms available) `matchedTerms` falls back to the query's own tokens, so
+highlighting still works. You can also call `Highlighter.tokenize(query)` to derive terms yourself.
 ### Persistence
 ```js
@@ -92,7 +123,7 @@ await loaded.save('data.vecito');
 ```
 The primitives are exported too if you want to wire them yourself:
-`import { Embedder, BM25, VecStore } from 'vecito'`.
+`import { Embedder, BM25, VecStore, Highlighter } from 'vecito'`.
 ## File indexing (`vecito/file`)
@@ -137,7 +168,7 @@ out of dependency pre-bundling (`optimizeDeps.exclude`) so their `import.meta.ur
 Install globally to get the `vecito` command on your `PATH`:
 ```bash
-pnpm add -g vecito
+pnpm add -g vecito --config.onlyBuiltDependencies='["onnxruntime-node","protobufjs","sharp"]'
 ```
 Or run it without installing via `pnpm dlx vecito …`.
@@ -152,6 +183,12 @@ vecito index ./docs -o docs.vecito
 # Search (path is optional; defaults to data.vecito in the current directory)
 vecito search "renewable energy sources" --mode hybrid --top 5
 vecito search "renewable energy sources" docs.vecito --top 5
+# Filter by metadata — a JS expression with the hit's metadata bound to `meta`
+vecito search "renewable energy" --filter 'meta.name.endsWith(".md")'
+# Machine-readable output — score, ranks, and full metadata as JSON (pipeable)
+vecito search "renewable energy" --json | jq '.[].metadata'
 ```
 `index` recursively walks the directory, indexing a broad set of text/data/code extensions
@@ -164,7 +201,7 @@ The trailing path is optional and **defaults to the current directory** — `ind
 ```
 vecito index [dir] [-o data.vecito] [--ext .md,.txt,...] [--hidden] [--limit N]
-vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>]
+vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>] [--json]
 ```
 ## Sample data

package/bin/cli.js CHANGED Viewed

@@ -1,12 +1,18 @@
 #!/usr/bin/env node
-import { existsSync, statSync } from 'fs';
-import { join } from 'path';
+import { existsSync, statSync, readFileSync } from 'fs';
+import { join, dirname } from 'path';
+import { fileURLToPath } from 'url';
 import { Vecito } from '../lib/vecito.js';
 import { walk, indexFiles } from '../lib/file-index.js';
 /** Default index filename, used for both output and directory-relative lookup. */
 const DEFAULT_INDEX = 'data.vecito';
+/** Package version, read from package.json next to this CLI. */
+const VERSION = JSON.parse(
+	readFileSync(join(dirname(fileURLToPath(import.meta.url)), '../package.json'), 'utf8'),
+).version;
 /** Flags that consume the following argv token as their value. */
 const VALUE_FLAGS = new Set(['-o', '--out', '--ext', '--limit', '--mode', '--top', '--filter']);
@@ -62,15 +68,18 @@ function resolveIndexPath(p) {
 }
 /**
- * Print CLI usage to stderr.
+ * Print CLI usage. Goes to stdout for an explicit help request, stderr otherwise.
+ * @param {{help?: boolean}} [opts]
  * @returns {void}
  */
-function usage() {
-	console.error(`vecito — hybrid (dense + BM25) semantic search
+function usage({ help = false } = {}) {
+	const write = help ? console.log : console.error;
+	write(`vecito — hybrid (dense + BM25) semantic search
 Usage:
   vecito index [dir] [-o data.vecito] [--mode dense|hybrid] [--ext .md,.txt,...] [--hidden] [--limit N]
-  vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>]
+  vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>] [--json]
+  vecito --version | --help
 The trailing path is optional and defaults to the current directory.
@@ -84,7 +93,8 @@ Index options:
 Search options:
   --mode <m>         Search mode: hybrid (default), dense, or sparse
   --top <n>          Number of results (default: 10)
-  --filter <expr>    JS expression over metadata, e.g. 'meta.category === "science"'`);
+  --filter <expr>    JS expression over metadata, e.g. 'meta.category === "science"'
+  --json             Output results as JSON (score, ranks, full metadata) on stdout`);
 }
 /**
@@ -130,6 +140,7 @@ async function cmdSearch() {
 	const mode = flag('--mode', 'hybrid');
 	const top = parseInt(flag('--top', '10'), 10);
+	const asJson = hasFlag('--json');
 	const filterExpr = flag('--filter', undefined);
 	let filter;
 	if (filterExpr) {
@@ -148,10 +159,19 @@ async function cmdSearch() {
 	const vecito = await Vecito.load(indexFile);
 	const effectiveMode = vecito.indexMode === 'dense' ? 'dense' : mode;
-	console.log(`Loaded ${vecito.count} doc(s) from ${indexFile} [index: ${vecito.indexMode}, search: ${effectiveMode}]\n`);
-	console.log(`Results for "${query}"${filterExpr ? ` (filter: ${filterExpr})` : ''}:\n`);
+	if (!asJson) {
+		console.log(`Loaded ${vecito.count} doc(s) from ${indexFile} [index: ${vecito.indexMode}, search: ${effectiveMode}]\n`);
+		console.log(`Results for "${query}"${filterExpr ? ` (filter: ${filterExpr})` : ''}:\n`);
+	}
 	const results = await vecito.search(query, { mode: effectiveMode, top, filter });
+	if (asJson) {
+		// Pure JSON on stdout (score, ranks, full metadata) — pipeable into jq etc.
+		console.log(JSON.stringify(results, null, 2));
+		return;
+	}
 	if (results.length === 0) {
 		console.log('(no results)');
 		return;
@@ -171,7 +191,13 @@ async function cmdSearch() {
 }
 const cmd = process.argv[2];
-if (cmd === 'index') {
+if (cmd === '--version' || cmd === '-v') {
+	console.log(VERSION);
+	process.exit(0);
+} else if (cmd === '--help' || cmd === '-h' || cmd === 'help' || !cmd) {
+	usage({ help: true });
+	process.exit(0);
+} else if (cmd === 'index') {
 	await cmdIndex();
 } else if (cmd === 'search') {
 	await cmdSearch();

package/index.js CHANGED Viewed

@@ -3,3 +3,10 @@ export { BM25 } from './lib/bm25.js';
 export { VecStore } from './lib/vec-store.js';
 export { Vecito } from './lib/vecito.js';
 export { Highlighter } from './lib/highlight.js';
+/**
+ * Re-export the public typedefs so consumers can import them from the package
+ * root. They are authored as JSDoc types on the Vecito module.
+ * @typedef {import('./lib/vecito.js').SearchResult} SearchResult
+ * @typedef {import('./lib/vecito.js').AddOptions} AddOptions
+ */

package/lib/vec-store.js CHANGED Viewed

@@ -129,20 +129,6 @@ export class VecStore {
 		return this.#denseScan(query, k);
 	}
-	/**
-	 * Dense search with a JS predicate post-filter. Fetches `k * 5` candidates
-	 * from the HNSW, then applies the filter to metadata objects.
-	 * @param {Float32Array} query
-	 * @param {(meta: Record<string,any>) => boolean} filter JS predicate.
-	 * @param {number} [k=10]
-	 * @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
-	 */
-	async searchWithFilter(query, filter, k = 10) {
-		const candidates = this.#denseScan(query, k * 5);
-		if (typeof filter !== 'function') return candidates.slice(0, k);
-		return candidates.filter(r => filter(r.metadata)).slice(0, k);
-	}
 	/**
 	 * Hybrid dense + sparse search with RRF rank fusion.
 	 * Dense side uses the HNSW; sparse side uses brute-force dot products.

package/lib/vecito.js CHANGED Viewed

@@ -33,6 +33,7 @@ function placeholderSparse(dim) {
  * @property {number} [dense_rank] Rank on the dense side (hybrid mode).
  * @property {number} [sparse_rank] Rank on the sparse side (hybrid mode).
  * @property {Record<string, any>} metadata The document's metadata.
+ * @property {string[]} [matchedTerms] Query terms matched, present only when `search` is called with `matchedTerms: true`.
  */
 /**
@@ -168,7 +169,8 @@ export class Vecito {
 	 *   'hybrid' fuses dense + BM25 via RRF; 'dense' is vectors-only; 'sparse' is
 	 *   BM25-weighted. Falls back to dense if the query has no in-vocab terms.
 	 * @param {number} [opts.top=10] Maximum number of results.
-	 * @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode, over-fetching to preserve the requested count.
+	 * @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode. Over-fetches and grows the candidate set adaptively (up to the full index) so a selective predicate still returns up to `top` matches when they exist.
+	 * @param {boolean} [opts.matchedTerms=false] When true, attach the matched query terms to each result as a `matchedTerms` string array.
 	 * @returns {Promise<SearchResult[]>}
 	 * @throws {Error} If nothing has been indexed or loaded yet.
 	 */
@@ -181,29 +183,42 @@ export class Vecito {
 		const queryVec = await this.#embedder.embed(query);
-		let results;
 		let querySparse = null;
-		const fetchK = filter ? top * 5 : top;
-		if (effectiveMode === 'dense') {
-			results = await this.#store.search(queryVec, fetchK);
-		} else {
+		if (effectiveMode !== 'dense') {
 			querySparse = this.#bm25.querySparse(query);
-			const hasSparse = querySparse.indices.length > 0;
+		}
+		const hasSparse = !!querySparse && querySparse.indices.length > 0;
-			if (!hasSparse) {
-				results = await this.#store.search(queryVec, fetchK);
-			} else if (effectiveMode === 'sparse') {
-				results = this.#store.hybridSearch(queryVec, querySparse, fetchK, {
+		// Fetch the top-k ranked candidates from the store for the resolved mode.
+		const fetchCandidates = async (k) => {
+			if (effectiveMode === 'dense' || !hasSparse) {
+				return await this.#store.search(queryVec, k);
+			}
+			if (effectiveMode === 'sparse') {
+				return this.#store.hybridSearch(queryVec, querySparse, k, {
 					fusion: { type: 'linear', alpha: 0.0 },
 				});
-			} else {
-				results = this.#store.hybridSearch(queryVec, querySparse, fetchK);
 			}
-		}
+			return this.#store.hybridSearch(queryVec, querySparse, k);
+		};
-		if (filter) results = results.filter(r => filter(r.metadata)).slice(0, top);
+		let results;
+		if (!filter) {
+			results = await fetchCandidates(top);
+		} else {
+			// Filtering happens after ranking, so a selective predicate can leave fewer
+			// than `top` hits in the first batch. Grow the fetch and retry until we have
+			// enough matches or we've scanned the entire index.
+			const total = this.#store.count;
+			let fetchK = Math.min(top * 5, total);
+			while (true) {
+				const candidates = await fetchCandidates(fetchK);
+				results = candidates.filter(r => filter(r.metadata));
+				if (results.length >= top || fetchK >= total) break;
+				fetchK = Math.min(fetchK * 4, total);
+			}
+			results = results.slice(0, top);
+		}
 		if (includeTerms) {
 			const terms = querySparse && querySparse.indices.length > 0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "vecito",
-	"version": "0.1.0",
+	"version": "0.1.1",
 	"description": "Tiny hybrid (dense + BM25) semantic search for Node and the browser",
 	"type": "module",
 	"author": "Jeka Kiselyov",
@@ -13,7 +13,7 @@
 		"url": "https://github.com/jeka-kiselyov/vecito/issues"
 	},
 	"main": "index.js",
-	"types": "index.d.ts",
+	"types": "types/index.d.ts",
 	"publishConfig": {
 		"access": "public"
 	},
@@ -22,23 +22,24 @@
 	},
 	"exports": {
 		".": {
-			"types": "./index.d.ts",
+			"types": "./types/index.d.ts",
 			"default": "./index.js"
 		},
 		"./file": {
-			"types": "./file.d.ts",
+			"types": "./types/lib/file-index.d.ts",
 			"default": "./lib/file-index.js"
 		}
 	},
 	"files": [
 		"index.js",
-		"index.d.ts",
-		"file.d.ts",
+		"types/",
 		"lib/",
 		"bin/"
 	],
 	"scripts": {
 		"test": "vitest run",
+		"types": "tsc -p tsconfig.json",
+		"prepublishOnly": "pnpm run types && pnpm test",
 		"dev:browser": "vite"
 	},
 	"keywords": [
@@ -55,6 +56,7 @@
 		"stemmer": "^2.0.1"
 	},
 	"devDependencies": {
+		"typescript": "^6.0.3",
 		"vite": "^6.0.0",
 		"vitest": "^3.2.0"
 	},

package/types/bin/cli.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+#!/usr/bin/env node
+export {};

package/types/index.d.ts ADDED Viewed

@@ -0,0 +1,15 @@
+export { Embedder } from "./lib/embedder.js";
+export { BM25 } from "./lib/bm25.js";
+export { VecStore } from "./lib/vec-store.js";
+export { Vecito } from "./lib/vecito.js";
+export { Highlighter } from "./lib/highlight.js";
+/**
+ * Re-export the public typedefs so consumers can import them from the package
+ * root. They are authored as JSDoc types on the Vecito module.
+ */
+export type SearchResult = import("./lib/vecito.js").SearchResult;
+/**
+ * Re-export the public typedefs so consumers can import them from the package
+ * root. They are authored as JSDoc types on the Vecito module.
+ */
+export type AddOptions = import("./lib/vecito.js").AddOptions;

package/types/lib/bm25.d.ts ADDED Viewed

@@ -0,0 +1,93 @@
+/**
+ * BM25 sparse lexical model.
+ *
+ * Fit over a corpus to learn vocabulary and document frequencies, then produce
+ * sparse vectors (term-id → weight) for documents ({@link BM25#score}) and
+ * queries ({@link BM25#querySparse}). Serializable via {@link BM25#toJSON} /
+ * {@link BM25.fromJSON}.
+ */
+export class BM25 {
+    /**
+     * Reconstruct a fitted model from {@link BM25#toJSON} output.
+     * @param {object} data
+     * @returns {BM25}
+     */
+    static fromJSON(data: object): BM25;
+    /**
+     * @param {object} [opts]
+     * @param {number} [opts.k1=1.2] Term-frequency saturation parameter.
+     * @param {number} [opts.b=0.75] Document-length normalization (0..1).
+     */
+    constructor({ k1, b }?: {
+        k1?: number | undefined;
+        b?: number | undefined;
+    });
+    /**
+     * Build the vocabulary and corpus statistics (document frequencies, average
+     * length) from a set of documents. Must be called before scoring.
+     * @param {string[]} texts The full corpus.
+     * @returns {void}
+     */
+    fit(texts: string[]): void;
+    /**
+     * Compute the BM25 sparse vector for a document. Out-of-vocabulary terms are
+     * ignored.
+     * @param {string} text Document text.
+     * @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
+     *   sparse vector over the vocabulary.
+     */
+    score(text: string): {
+        indices: Uint32Array;
+        values: Float32Array;
+        dim: number;
+    };
+    /**
+     * Convenience wrapper that scores many documents.
+     * @param {string[]} texts
+     * @returns {Array<{indices: Uint32Array, values: Float32Array, dim: number}>}
+     */
+    scoreAll(texts: string[]): Array<{
+        indices: Uint32Array;
+        values: Float32Array;
+        dim: number;
+    }>;
+    /**
+     * Map a query string to the list of in-vocabulary term ids it contains.
+     * @param {string} queryText
+     * @returns {{indices: number[], vocabSize: number}} Matched term ids.
+     */
+    scoreQuery(queryText: string): {
+        indices: number[];
+        vocabSize: number;
+    };
+    /**
+     * Build an IDF-weighted sparse vector for a query (assumes term frequency 1
+     * per query term). Use this to drive sparse/hybrid search.
+     * @param {string} queryText
+     * @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
+     *   sparse vector; empty `indices` means no query term is in the vocabulary.
+     */
+    querySparse(queryText: string): {
+        indices: Uint32Array;
+        values: Float32Array;
+        dim: number;
+    };
+    /**
+     * Serialize the fitted model to a plain JSON-safe object.
+     * @returns {object} Pass to {@link BM25.fromJSON} to restore.
+     */
+    toJSON(): object;
+    /**
+     * Map a list of vocabulary indices back to their original term strings.
+     * The reverse map is built lazily on first call and cached.
+     * @param {Uint32Array|number[]} indices Vocabulary term ids.
+     * @returns {string[]} The corresponding terms (unknown ids silently omitted).
+     */
+    termsForIndices(indices: Uint32Array | number[]): string[];
+    /**
+     * Number of distinct terms in the fitted vocabulary (the sparse dimension).
+     * @returns {number}
+     */
+    get vocabSize(): number;
+    #private;
+}

package/types/lib/embedder.d.ts ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * Dense text embedder backed by transformers.js.
+ *
+ * Produces L2-normalized, mean-pooled sentence embeddings. The model is loaded
+ * lazily on first use and cached for the lifetime of the instance.
+ */
+export class Embedder {
+    /**
+     * @param {object} [opts]
+     * @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Hugging Face model id
+     *   for a feature-extraction pipeline. Any output width is supported — the
+     *   actual dimension is detected at {@link Embedder#init} (see
+     *   {@link Embedder#dimensions}).
+     * @param {string} [opts.dtype='q8'] Weight precision to load: `'q8'`
+     *   (quantized, ~4× smaller download, the default) or `'fp32'` (full
+     *   precision), plus other transformers.js dtypes (`'fp16'`, `'q4'`, …) when
+     *   the model provides them.
+     */
+    constructor({ model, dtype }?: {
+        model?: string | undefined;
+        dtype?: string | undefined;
+    });
+    /**
+     * Load the underlying pipeline if it hasn't been loaded yet, and detect the
+     * model's embedding width with a one-token probe. Safe to call repeatedly;
+     * subsequent calls are no-ops. Called automatically by {@link Embedder#embed}
+     * / {@link Embedder#embedBatch}.
+     * @returns {Promise<void>}
+     */
+    init(): Promise<void>;
+    /**
+     * Embed a single string into a normalized dense vector.
+     * @param {string} text Input text.
+     * @returns {Promise<Float32Array>} A {@link Embedder#dimensions}-length vector.
+     */
+    embed(text: string): Promise<Float32Array>;
+    /**
+     * Embed many strings, processing them in batches for throughput.
+     * @param {string[]} texts Input texts.
+     * @param {object} [opts]
+     * @param {number} [opts.batchSize=32] Number of texts per forward pass.
+     * @returns {Promise<Float32Array[]>} One vector per input, in input order.
+     *   Each is a view into a shared buffer — copy it if you need to retain it
+     *   independently.
+     */
+    embedBatch(texts: string[], { batchSize }?: {
+        batchSize?: number | undefined;
+    }): Promise<Float32Array[]>;
+    /**
+     * Dimensionality of the vectors this model produces. Available only after
+     * {@link Embedder#init} (or a first {@link Embedder#embed}) has run, since it
+     * is detected from the loaded model.
+     * @returns {number} e.g. 384 for MiniLM/BGE-small, 768 for MPNet/GTE-base.
+     * @throws {Error} If called before the model has been initialized.
+     */
+    get dimensions(): number;
+    /**
+     * The Hugging Face model id this embedder uses.
+     * @returns {string}
+     */
+    get model(): string;
+    /**
+     * Weight precision this embedder loads (e.g. `'q8'`, `'fp32'`).
+     * @returns {string}
+     */
+    get dtype(): string;
+    #private;
+}

package/types/lib/extract.d.ts ADDED Viewed

@@ -0,0 +1,25 @@
+/**
+ * Universal (Node + browser) helpers for turning arbitrary data into the text
+ * Vecito indexes and the metadata it returns with hits. No filesystem deps.
+ */
+/**
+ * Recursively collect string leaves from any value and join them. Lets raw JSON
+ * objects/arrays be indexed without the caller pre-extracting their text.
+ * @param {*} value Any value — object, array, string, etc.
+ * @returns {string} All string leaves, '. '-joined (empty string if none).
+ */
+export function flattenStrings(value: any): string;
+/**
+ * Default text extractor: strings pass through; objects/arrays are flattened to
+ * their string leaves; everything else is stringified.
+ * @param {*} item
+ * @returns {string}
+ */
+export function defaultText(item: any): string;
+/**
+ * Default metadata extractor: objects are returned as-is (so search hits carry
+ * the original data back); non-objects carry no metadata.
+ * @param {*} item
+ * @returns {Record<string, any>}
+ */
+export function defaultMetadata(item: any): Record<string, any>;

package/types/lib/file-index.d.ts ADDED Viewed

@@ -0,0 +1,59 @@
+/**
+ * Recursively collect files under `dir` whose extension is allowed. Skips
+ * dotfiles and dot-directories unless `hidden` is true.
+ * @param {string} dir Directory to walk.
+ * @param {object} [opts]
+ * @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
+ * @param {boolean} [opts.hidden=false] Include entries whose name starts with '.'.
+ * @param {number} [opts.limit=Infinity] Stop after this many files.
+ * @returns {string[]} Matching file paths.
+ */
+export function walk(dir: string, { ext, hidden, limit }?: {
+    ext?: string[] | undefined;
+    hidden?: boolean | undefined;
+    limit?: number | undefined;
+}): string[];
+/**
+ * Index an explicit list of files into a fresh {@link Vecito}.
+ * @param {string[]} paths File paths to index (one document each).
+ * @param {object} [opts]
+ * @param {string} [opts.model] Embedding model id passed to Vecito.
+ * @param {string} [opts.dtype] Weight precision passed to Vecito.
+ * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
+ * @param {string} [opts.base] Base dir for relative `path` metadata.
+ * @returns {Promise<Vecito>}
+ */
+export function indexFiles(paths: string[], { model, dtype, mode, base }?: {
+    model?: string | undefined;
+    dtype?: string | undefined;
+    mode?: "hybrid" | "dense" | undefined;
+    base?: string | undefined;
+}): Promise<Vecito>;
+/**
+ * Walk a directory and index every matching file into a fresh {@link Vecito}.
+ * @param {string} dir Directory to index.
+ * @param {object} [opts]
+ * @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
+ * @param {boolean} [opts.hidden=false] Include dotfiles/dot-directories.
+ * @param {number} [opts.limit=Infinity] Index at most this many files.
+ * @param {string} [opts.model] Embedding model id passed to Vecito.
+ * @param {string} [opts.dtype] Weight precision passed to Vecito.
+ * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
+ * @returns {Promise<Vecito>}
+ */
+export function indexDirectory(dir: string, { ext, hidden, limit, model, dtype, mode }?: {
+    ext?: string[] | undefined;
+    hidden?: boolean | undefined;
+    limit?: number | undefined;
+    model?: string | undefined;
+    dtype?: string | undefined;
+    mode?: "hybrid" | "dense" | undefined;
+}): Promise<Vecito>;
+/**
+ * Filesystem layer on top of the core {@link Vecito} library. Turns files and
+ * directories into data items and feeds them to the generic indexer. Node-only
+ * (imports `fs`/`path`); use it via the `vecito/file` subpath export.
+ */
+/** Broad default set of text-ish extensions. Override with the `ext` option. */
+export const DEFAULT_EXTENSIONS: string[];
+import { Vecito } from './vecito.js';

package/types/lib/highlight.d.ts ADDED Viewed

@@ -0,0 +1,45 @@
+/**
+ * Text highlighting utilities for search result display.
+ *
+ * {@link Highlighter.highlight} wraps matched terms in `<mark>` tags using
+ * stem-aware matching: "running" matches the term "run", "adventure" matches
+ * "adventurous", etc. {@link Highlighter.snippet} extracts a relevant excerpt
+ * with the same stem-aware center-finding.
+ */
+export class Highlighter {
+    /**
+     * Escape HTML special characters in a plain-text string.
+     * @param {string} s
+     * @returns {string}
+     */
+    static escape(s: string): string;
+    /**
+     * Tokenize a query string for use as highlight terms. Splits on non-word
+     * characters, lowercases, drops stopwords and tokens shorter than 3 chars.
+     * Used as a fallback when BM25 matched terms are not available (dense mode).
+     * @param {string} text
+     * @returns {string[]} Unique tokens, longest first.
+     */
+    static tokenize(text: string): string[];
+    /**
+     * Wrap occurrences of `terms` in `text` with `<mark>` tags, HTML-escaping
+     * everything else. Matching is stem-aware and case-insensitive: the term
+     * "run" will match "running", "runs", "ran"; "adventure" matches "adventures".
+     * Gaps between matched words are bridged into one `<mark>` span when every
+     * word in the gap is a stopword (e.g. "Tales of Mystery" → single highlight).
+     * @param {string} text Plain text to highlight.
+     * @param {string[]|Set<string>} terms Terms to highlight.
+     * @returns {string} HTML string with `<mark>…</mark>` around matches.
+     */
+    static highlight(text: string, terms: string[] | Set<string>): string;
+    /**
+     * Extract a snippet of at most `maxLen` characters centred on the first
+     * stem match. Short texts are returned in full. Ellipsis (`…`) is added at
+     * truncated edges. Pass the result to {@link Highlighter.highlight} for markup.
+     * @param {string} text Plain text.
+     * @param {string[]|Set<string>} terms Terms to centre the window on.
+     * @param {number} [maxLen=220] Maximum character count of the returned snippet.
+     * @returns {string} Plain-text excerpt.
+     */
+    static snippet(text: string, terms: string[] | Set<string>, maxLen?: number): string;
+}

package/types/lib/vec-store.d.ts ADDED Viewed

@@ -0,0 +1,125 @@
+/**
+ * Dense + sparse vector store backed by altor-vec (WASM HNSW).
+ *
+ * The HNSW graph serializes to bytes via `engine.to_bytes()` and restores in
+ * milliseconds via `new WasmSearchEngine(bytes)` — eliminating the long rebuild
+ * that edgevec required. Sparse (BM25) vectors are stored as plain arrays and
+ * searched with brute-force dot products, which gives 100% recall for ≤1M docs.
+ */
+export class VecStore {
+    /**
+     * Load from a file (Node only).
+     * @param {string} filePath
+     * @returns {Promise<VecStore>}
+     */
+    static load(filePath: string): Promise<VecStore>;
+    /**
+     * Rebuild from bytes produced by {@link VecStore#exportBytes}.
+     * @param {Uint8Array} bytes
+     * @returns {Promise<VecStore>}
+     */
+    static loadFromBytes(bytes: Uint8Array): Promise<VecStore>;
+    /**
+     * Fetch an exported blob over HTTP and rebuild it.
+     * @param {string} url
+     * @returns {Promise<VecStore>}
+     */
+    static loadFromUrl(url: string): Promise<VecStore>;
+    /**
+     * Deserialize bytes produced by exportBytes.
+     * The HNSW graph is restored directly — no rebuild loop.
+     */
+    static #fromBytes(bytes: any): VecStore;
+    /**
+     * @param {object} opts
+     * @param {number} opts.dimensions Dense vector dimensionality.
+     */
+    constructor({ dimensions }: {
+        dimensions: number;
+    });
+    /**
+     * Initialize the WASM module. Must be called before insert / search.
+     * @returns {Promise<void>}
+     */
+    init(): Promise<void>;
+    /**
+     * Insert a dense vector with optional metadata.
+     * @param {Float32Array|number[]} vector
+     * @param {Record<string, any>} [metadata]
+     * @returns {number} 1-based id for backward compatibility.
+     */
+    insert(vector: Float32Array | number[], metadata?: Record<string, any>): number;
+    /**
+     * Enable sparse storage. Inserts a placeholder at index 0 so sparse ids
+     * (0-based) align with dense ids (1-based).
+     */
+    initSparse(): void;
+    /**
+     * Insert a sparse BM25 vector, paired positionally with the prior dense insert.
+     * @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
+     */
+    insertSparse(sparse: {
+        indices: Uint32Array;
+        values: Float32Array;
+        dim: number;
+    }): void;
+    /**
+     * Nearest-neighbor dense search.
+     * @param {Float32Array} query
+     * @param {number} [k=10]
+     * @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
+     */
+    search(query: Float32Array, k?: number): Promise<Array<{
+        id: number;
+        score: number;
+        metadata: Record<string, any>;
+    }>>;
+    /**
+     * Hybrid dense + sparse search with RRF rank fusion.
+     * Dense side uses the HNSW; sparse side uses brute-force dot products.
+     * @param {Float32Array} denseQuery
+     * @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
+     * @param {number} [k=10]
+     * @param {object} [opts]
+     * @param {number} [opts.dense_k]
+     * @param {number} [opts.sparse_k]
+     * @param {string|{type:string,alpha:number}} [opts.fusion='rrf']
+     * @returns {Array<{id:number,score:number,dense_rank?:number,sparse_rank?:number,metadata:Record<string,any>}>}
+     */
+    hybridSearch(denseQuery: Float32Array, sparse: {
+        indices: Uint32Array;
+        values: Float32Array;
+        dim: number;
+    }, k?: number, { dense_k, sparse_k, fusion }?: {
+        dense_k?: number | undefined;
+        sparse_k?: number | undefined;
+        fusion?: string | {
+            type: string;
+            alpha: number;
+        } | undefined;
+    }): Array<{
+        id: number;
+        score: number;
+        dense_rank?: number;
+        sparse_rank?: number;
+        metadata: Record<string, any>;
+    }>;
+    /**
+     * Serialize to a self-contained Uint8Array.
+     * Layout: [uint32 hnswLen][HNSW bytes][JSON: {dims, metadata, sparse?}]
+     * @returns {Uint8Array}
+     */
+    exportBytes(): Uint8Array;
+    /**
+     * Write to a file (Node only).
+     * @param {string} filePath
+     */
+    save(filePath: string): Promise<void>;
+    /** @alias save */
+    exportToFile(filePath: any): Promise<void>;
+    /** Number of indexed vectors. */
+    get count(): number;
+    /** Dimensionality of the dense vectors. */
+    get dimensions(): number;
+    #private;
+}

package/types/lib/vecito.d.ts ADDED Viewed

@@ -0,0 +1,195 @@
+/**
+ * @typedef {object} AddOptions
+ * @property {(item: any) => string} [text] Extract the searchable text from an item.
+ *   Defaults to {@link defaultText} (strings pass through, objects are flattened
+ *   to their string leaves).
+ * @property {(item: any) => Record<string, any>} [metadata] Extract metadata
+ *   returned with hits. Defaults to {@link defaultMetadata} (objects are kept
+ *   as-is, so hits carry the original data back).
+ */
+/**
+ * @typedef {object} SearchResult
+ * @property {number} [id] Internal vector id.
+ * @property {number} [score] Relevance score (interpretation depends on mode).
+ * @property {number} [dense_rank] Rank on the dense side (hybrid mode).
+ * @property {number} [sparse_rank] Rank on the sparse side (hybrid mode).
+ * @property {Record<string, any>} metadata The document's metadata.
+ * @property {string[]} [matchedTerms] Query terms matched, present only when `search` is called with `matchedTerms: true`.
+ */
+/**
+ * Vecito — isomorphic (Node + browser) hybrid semantic search.
+ *
+ * Orchestrates an Embedder (dense), BM25 (sparse) and a VecStore (edgevec) so
+ * callers get a one-liner instead of wiring the three primitives by hand.
+ * Universal methods (addDocuments / search / exportBytes / loadFromBytes /
+ * loadFromUrl) run anywhere; save/load are Node-only.
+ *
+ * NOTE(review): the README names `altor-vec` as the WASM vector backend, while
+ * this comment says "edgevec" — confirm which one is current.
+ */
+export class Vecito {
+    /**
+     * Parse a container blob produced by {@link Vecito#exportBytes} back into a
+     * ready-to-search instance. Private; the public entry points are the
+     * static `load*` methods.
+     * @param {Uint8Array} bytes
+     * @returns {Promise<Vecito>}
+     */
+    static #fromContainer(bytes: Uint8Array): Promise<Vecito>;
+    /**
+     * Rebuild from bytes produced by {@link Vecito#exportBytes} (universal).
+     * @param {Uint8Array|ArrayBuffer} bytes Either a typed array or a raw buffer.
+     * @returns {Promise<Vecito>}
+     */
+    static loadFromBytes(bytes: Uint8Array | ArrayBuffer): Promise<Vecito>;
+    /**
+     * Fetch an exported blob over HTTP and rebuild it (browser + Node fetch).
+     * @param {string} url
+     * @returns {Promise<Vecito>}
+     */
+    static loadFromUrl(url: string): Promise<Vecito>;
+    /**
+     * Read an exported blob from a file. Node only.
+     * @param {string} path
+     * @returns {Promise<Vecito>}
+     * @throws {Error} In non-Node environments.
+     */
+    static load(path: string): Promise<Vecito>;
+    /**
+     * Configure a new instance. All options are optional.
+     * @param {object} [opts]
+     * @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Embedding model id.
+     * @param {string} [opts.dtype='q8'] Weight precision, e.g. `'q8'` (the
+     *   default; ≈ 4× smaller download) or `'fp32'` (full precision).
+     * @param {Embedder} [opts.embedder] A pre-built (optionally pre-loaded)
+     *   {@link Embedder} to use instead of constructing one — handy for reusing a
+     *   loaded model across indexes or for timing model load separately. Takes
+     *   precedence over `model`/`dtype`.
+     * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode. `'hybrid'` stores
+     *   both dense vectors and BM25 sparse data; `'dense'` omits sparse/BM25 for a
+     *   smaller snapshot. The mode is embedded in the snapshot and respected on load.
+     * @param {number} [opts.k1] BM25 term-frequency saturation (hybrid mode only).
+     * @param {number} [opts.b] BM25 length-normalization factor (hybrid mode only).
+     */
+    constructor({ model, dtype, embedder, mode, k1, b }?: {
+        model?: string | undefined;
+        dtype?: string | undefined;
+        embedder?: Embedder | undefined;
+        mode?: "hybrid" | "dense" | undefined;
+        k1?: number | undefined;
+        b?: number | undefined;
+    });
+    /**
+     * Index arbitrary data. Items may be strings, plain JSON objects, or anything
+     * else — the `text`/`metadata` extractors (with smart defaults) decide what to
+     * embed and what to return with hits, so raw objects work with zero config.
+     *
+     * BM25 is fit on the **first** call (its global df/idf statistics need a
+     * corpus), then frozen. Subsequent calls — including adding documents to an
+     * instance restored via {@link Vecito.load}/{@link Vecito.loadFromBytes} —
+     * append documents scored against that existing model, keeping the index
+     * consistent. Dense (semantic) search covers new documents fully; sparse
+     * scoring only sees terms already in the frozen vocabulary. For best lexical
+     * recall, pass your whole corpus in the first call.
+     * @param {any|any[]} items Item(s) to index.
+     * @param {AddOptions} [opts]
+     * @returns {Promise<this>} Resolves to this same instance.
+     */
+    addDocuments(items: any | any[], { text, metadata }?: AddOptions): Promise<this>;
+    /**
+     * Search the index.
+     * @param {string} query Natural-language query.
+     * @param {object} [opts]
+     * @param {'hybrid'|'dense'|'sparse'} [opts.mode='hybrid'] Ranking strategy.
+     *   'hybrid' fuses dense + BM25 via RRF; 'dense' is vectors-only; 'sparse' is
+     *   BM25-weighted. Falls back to dense if the query has no in-vocab terms.
+     * @param {number} [opts.top=10] Maximum number of results.
+     * @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate
+     *   over metadata — post-filters results in any mode. Over-fetches and grows
+     *   the candidate set adaptively (up to the full index) so a selective
+     *   predicate still returns up to `top` matches when they exist.
+     * @param {boolean} [opts.matchedTerms=false] When true, attach the matched
+     *   query terms to each result as a `matchedTerms` string array.
+     * @returns {Promise<SearchResult[]>}
+     * @throws {Error} If nothing has been indexed or loaded yet.
+     */
+    search(query: string, { mode, top, filter, matchedTerms: includeTerms }?: {
+        mode?: "sparse" | "hybrid" | "dense" | undefined;
+        top?: number | undefined;
+        filter?: ((meta: Record<string, any>) => boolean) | undefined;
+        matchedTerms?: boolean | undefined;
+    }): Promise<SearchResult[]>;
+    /**
+     * Number of indexed documents.
+     * @returns {number}
+     */
+    get count(): number;
+    /**
+     * The embedding model id this instance uses.
+     * @returns {string}
+     */
+    get model(): string;
+    /**
+     * Weight precision the embedder loads (e.g. `'q8'`, `'fp32'`).
+     * @returns {string}
+     */
+    get dtype(): string;
+    /**
+     * Width of the dense vectors in the index, or null before anything is indexed.
+     * @returns {number|null}
+     */
+    get dimensions(): number | null;
+    /**
+     * Index mode this instance was built with (`'hybrid'` or `'dense'`).
+     * @returns {'hybrid'|'dense'}
+     */
+    get indexMode(): "hybrid" | "dense";
+    /**
+     * Serialize everything (vectors + metadata + sparse + BM25 + model name +
+     * dtype) into one Uint8Array.
+     *
+     * Layout: [uint32 metaLen][meta JSON][VecStore bytes] — a uint32 length
+     * prefix for the meta JSON, the meta JSON itself, then the VecStore bytes.
+     * @returns {Uint8Array}
+     * @throws {Error} If nothing has been indexed yet.
+     */
+    exportBytes(): Uint8Array;
+    /**
+     * Write the exported blob to a file. Node only.
+     * @param {string} path
+     * @returns {Promise<void>}
+     * @throws {Error} In non-Node environments.
+     */
+    save(path: string): Promise<void>;
+    #private;
+}
+export type AddOptions = {
+    /**
+     * Extract the searchable text from an item.
+     * Defaults to {@link defaultText} (strings pass through, objects are flattened
+     * to their string leaves).
+     */
+    text?: ((item: any) => string) | undefined;
+    /**
+     * Extract the metadata returned with hits.
+     * Defaults to {@link defaultMetadata} (objects are kept as-is, so hits carry
+     * the original data back).
+     */
+    metadata?: ((item: any) => Record<string, any>) | undefined;
+};
+export type SearchResult = {
+    /**
+     * Internal vector id. Optional: may be absent on a result.
+     */
+    id?: number | undefined;
+    /**
+     * Relevance score (interpretation depends on mode). Optional: may be absent.
+     */
+    score?: number | undefined;
+    /**
+     * Rank on the dense side (hybrid mode).
+     */
+    dense_rank?: number | undefined;
+    /**
+     * Rank on the sparse side (hybrid mode).
+     */
+    sparse_rank?: number | undefined;
+    /**
+     * The document's metadata. The only field that is always present.
+     */
+    metadata: Record<string, any>;
+    /**
+     * Query terms matched, present only when `search` is called with `matchedTerms: true`.
+     */
+    matchedTerms?: string[] | undefined;
+};
+import { Embedder } from './embedder.js';

package/file.d.ts DELETED Viewed

@@ -1,27 +0,0 @@
-import { Vecito } from './index';
-export const DEFAULT_EXTENSIONS: string[];
-export interface WalkOptions {
-	ext?: string[];
-	hidden?: boolean;
-	limit?: number;
-}
-export interface IndexOptions extends WalkOptions {
-	model?: string;
-	dtype?: string;
-	mode?: 'hybrid' | 'dense';
-}
-/** Recursively collect matching file paths, skipping dotfiles by default. */
-export function walk(dir: string, opts?: WalkOptions): string[];
-/** Index an explicit list of files into a fresh Vecito. */
-export function indexFiles(
-	paths: string[],
-	opts?: { model?: string; dtype?: string; mode?: 'hybrid' | 'dense'; base?: string }
-): Promise<Vecito>;
-/** Walk a directory and index every matching file into a fresh Vecito. */
-export function indexDirectory(dir: string, opts?: IndexOptions): Promise<Vecito>;

package/index.d.ts DELETED Viewed

@@ -1,113 +0,0 @@
-export interface SparseVector {
-	indices: Uint32Array;
-	values: Float32Array;
-	dim: number;
-}
-export interface SearchResult {
-	id?: number;
-	score?: number;
-	dense_rank?: number;
-	sparse_rank?: number;
-	metadata: Record<string, any>;
-	/** BM25-matched (hybrid) or tokenized (dense) terms. Present only when search was called with `{ matchedTerms: true }`. */
-	matchedTerms?: string[];
-}
-export class Highlighter {
-	/** Escape HTML special characters in a plain-text string. */
-	static escape(s: string): string;
-	/** Tokenize a query string for dense-mode fallback highlighting. */
-	static tokenize(text: string): string[];
-	/** Wrap occurrences of `terms` in `text` with `<mark>` tags. Matching is stem-aware: "run" matches "running", "adventure" matches "adventures". */
-	static highlight(text: string, terms: string[] | Set<string>): string;
-	/** Extract a snippet centred on the first stem match (plain text — pass to highlight for markup). */
-	static snippet(text: string, terms: string[] | Set<string>, maxLen?: number): string;
-}
-export class Embedder {
-	constructor(opts?: { model?: string; dtype?: string });
-	init(): Promise<void>;
-	embed(text: string): Promise<Float32Array>;
-	embedBatch(texts: string[], opts?: { batchSize?: number }): Promise<Float32Array[]>;
-	get dimensions(): number;
-	get dtype(): string;
-	get model(): string;
-}
-export class BM25 {
-	constructor(opts?: { k1?: number; b?: number });
-	fit(texts: string[]): void;
-	score(text: string): SparseVector;
-	scoreAll(texts: string[]): SparseVector[];
-	/** Map a query string to the in-vocabulary term ids it contains. */
-	scoreQuery(queryText: string): { indices: number[]; vocabSize: number };
-	querySparse(queryText: string): SparseVector;
-	/** Map vocabulary term ids back to their original term strings (unknown ids omitted). */
-	termsForIndices(indices: Uint32Array | number[]): string[];
-	toJSON(): Record<string, any>;
-	static fromJSON(data: Record<string, any>): BM25;
-	get vocabSize(): number;
-}
-export class VecStore {
-	constructor(opts: { dimensions: number });
-	init(): Promise<void>;
-	insert(vector: Float32Array | number[], metadata?: Record<string, any>): number;
-	initSparse(): void;
-	insertSparse(sparse: SparseVector): void;
-	search(query: Float32Array, k?: number): Promise<SearchResult[]>;
-	/** Post-filters HNSW candidates with a JS predicate over metadata objects. */
-	searchWithFilter(query: Float32Array, filter: (meta: Record<string, any>) => boolean, k?: number): Promise<SearchResult[]>;
-	hybridSearch(
-		denseQuery: Float32Array,
-		sparse: SparseVector,
-		k?: number,
-		opts?: { dense_k?: number; sparse_k?: number; fusion?: any }
-	): SearchResult[];
-	save(filePath: string): Promise<void>;
-	/** Alias for {@link VecStore.save}. */
-	exportToFile(filePath: string): Promise<void>;
-	exportBytes(): Uint8Array;
-	static load(filePath: string): Promise<VecStore>;
-	static loadFromBytes(bytes: Uint8Array): Promise<VecStore>;
-	static loadFromUrl(url: string): Promise<VecStore>;
-	get count(): number;
-	/** Dense vector width of the index. */
-	get dimensions(): number;
-}
-export interface AddOptions {
-	/** Extract searchable text from an item (default: flatten string values). */
-	text?: (item: any) => string;
-	/** Extract metadata returned with hits (default: the object itself). */
-	metadata?: (item: any) => Record<string, any>;
-}
-export interface VecitoSearchOptions {
-	mode?: 'hybrid' | 'dense' | 'sparse';
-	top?: number;
-	/** JS predicate over metadata — post-filters results in any mode, over-fetching to preserve the requested count. */
-	filter?: (meta: Record<string, any>) => boolean;
-	/** When true, each result includes `matchedTerms` for use with `Highlighter.highlight`. */
-	matchedTerms?: boolean;
-}
-export class Vecito {
-	constructor(opts?: { model?: string; dtype?: string; embedder?: Embedder; mode?: 'hybrid' | 'dense'; k1?: number; b?: number });
-	addDocuments(items: any | any[], opts?: AddOptions): Promise<this>;
-	search(query: string, opts?: VecitoSearchOptions): Promise<SearchResult[]>;
-	exportBytes(): Uint8Array;
-	save(path: string): Promise<void>;
-	static load(path: string): Promise<Vecito>;
-	static loadFromBytes(bytes: Uint8Array | ArrayBuffer): Promise<Vecito>;
-	static loadFromUrl(url: string): Promise<Vecito>;
-	get count(): number;
-	get model(): string;
-	/** Weight precision the embedder loads (e.g. 'q8', 'fp32'). */
-	get dtype(): string;
-	/** Dense vector width of the index, or null before anything is indexed. */
-	get dimensions(): number | null;
-	/** Index mode this instance was built with ('hybrid' or 'dense'). */
-	get indexMode(): 'hybrid' | 'dense';
-}