vecito 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -7,7 +7,7 @@ lexical scoring over an [`altor-vec`](https://github.com/altor-lab/altor-vec) WA
7
7
  no API keys.
8
8
 
9
9
  **Where to build the index.** Building a snapshot means embedding every document and constructing the HNSW graph — the expensive part. It's usually best to do this once on the server or with the CLI (`vecito index`), then serve the resulting `.vecito` file and load it in the browser with `Vecito.loadFromUrl()`, which restores the pre-built graph in milliseconds. Building, indexing, and adding documents directly in the browser is fully supported too — it just runs that same per-document embedding client-side, which is slow for large corpora.
10
-
10
+
11
11
  ## Install
12
12
 
13
13
  ```bash
@@ -52,6 +52,17 @@ await v.addDocuments(rows, {
52
52
  (BM25-weighted). All modes support a `filter` predicate to post-filter results by metadata.
53
53
  If the query has no in-vocabulary terms, hybrid/sparse automatically fall back to dense.
54
54
 
55
+ ```js
56
+ // `filter` is a JS predicate over each hit's metadata — works in any mode.
57
+ // Filtering happens after ranking, so vecito over-fetches and grows the candidate
58
+ // set adaptively (up to the whole index) to still return `top` matches when they exist.
59
+ const hits = await v.search('how do plants make food?', {
60
+ mode: 'hybrid',
61
+ top: 3,
62
+ filter: m => m.title === 'Botany',
63
+ });
64
+ ```
65
+
55
66
  ### Options & models
56
67
 
57
68
  ```js
@@ -69,6 +80,26 @@ later (including to a loaded snapshot — see below). Dense search covers new do
69
80
  sparse scoring only sees terms already in the frozen vocabulary, so pass your whole corpus up
70
81
  front for best lexical recall.
71
82
 
83
+ ### Highlighting
84
+
85
+ Pass `matchedTerms: true` to get the query terms each hit matched, then render an excerpt with
86
+ the exported `Highlighter`. `snippet()` extracts a relevant window centred on the first match;
87
+ `highlight()` wraps matches in `<mark>` tags. Both are stem-aware (the term `run` matches
88
+ `running`/`ran`) and case-insensitive, and `highlight()` HTML-escapes everything else.
89
+
90
+ ```js
91
+ import { Vecito, Highlighter } from 'vecito';
92
+
93
+ const hits = await v.search('how do plants make food?', { matchedTerms: true });
94
+ for (const h of hits) {
95
+ const excerpt = Highlighter.snippet(h.metadata.body, h.matchedTerms); // plain-text window (≤220 chars)
96
+ const html = Highlighter.highlight(excerpt, h.matchedTerms); // '…<mark>Photosynthesis</mark>…'
97
+ }
98
+ ```
99
+
100
+ In dense mode (no BM25 terms available) `matchedTerms` falls back to the query's own tokens, so
101
+ highlighting still works. You can also call `Highlighter.tokenize(query)` to derive terms yourself.
102
+
72
103
  ### Persistence
73
104
 
74
105
  ```js
@@ -92,7 +123,7 @@ await loaded.save('data.vecito');
92
123
  ```
93
124
 
94
125
  The primitives are exported too if you want to wire them yourself:
95
- `import { Embedder, BM25, VecStore } from 'vecito'`.
126
+ `import { Embedder, BM25, VecStore, Highlighter } from 'vecito'`.
96
127
 
97
128
  ## File indexing (`vecito/file`)
98
129
 
@@ -137,7 +168,7 @@ out of dependency pre-bundling (`optimizeDeps.exclude`) so their `import.meta.ur
137
168
  Install globally to get the `vecito` command on your `PATH`:
138
169
 
139
170
  ```bash
140
- pnpm add -g vecito
171
+ pnpm add -g vecito --config.onlyBuiltDependencies='["onnxruntime-node","protobufjs","sharp"]'
141
172
  ```
142
173
 
143
174
  Or run it without installing via `pnpm dlx vecito …`.
@@ -152,6 +183,12 @@ vecito index ./docs -o docs.vecito
152
183
  # Search (path is optional; defaults to data.vecito in the current directory)
153
184
  vecito search "renewable energy sources" --mode hybrid --top 5
154
185
  vecito search "renewable energy sources" docs.vecito --top 5
186
+
187
+ # Filter by metadata — a JS expression with the hit's metadata bound to `meta`
188
+ vecito search "renewable energy" --filter 'meta.name.endsWith(".md")'
189
+
190
+ # Machine-readable output — score, ranks, and full metadata as JSON (pipeable)
191
+ vecito search "renewable energy" --json | jq '.[].metadata'
155
192
  ```
156
193
 
157
194
  `index` recursively walks the directory, indexing a broad set of text/data/code extensions
@@ -164,7 +201,7 @@ The trailing path is optional and **defaults to the current directory** — `ind
164
201
 
165
202
  ```
166
203
  vecito index [dir] [-o data.vecito] [--mode dense|hybrid] [--ext .md,.txt,...] [--hidden] [--limit N]
167
- vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>]
204
+ vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>] [--json]
168
205
  ```
169
206
 
170
207
  ## Sample data
package/bin/cli.js CHANGED
@@ -1,12 +1,18 @@
1
1
  #!/usr/bin/env node
2
- import { existsSync, statSync } from 'fs';
3
- import { join } from 'path';
2
+ import { existsSync, statSync, readFileSync } from 'fs';
3
+ import { join, dirname } from 'path';
4
+ import { fileURLToPath } from 'url';
4
5
  import { Vecito } from '../lib/vecito.js';
5
6
  import { walk, indexFiles } from '../lib/file-index.js';
6
7
 
7
8
  /** Default index filename, used for both output and directory-relative lookup. */
8
9
  const DEFAULT_INDEX = 'data.vecito';
9
10
 
11
+ /** Package version, read from package.json next to this CLI. */
12
+ const VERSION = JSON.parse(
13
+ readFileSync(join(dirname(fileURLToPath(import.meta.url)), '../package.json'), 'utf8'),
14
+ ).version;
15
+
10
16
  /** Flags that consume the following argv token as their value. */
11
17
  const VALUE_FLAGS = new Set(['-o', '--out', '--ext', '--limit', '--mode', '--top', '--filter']);
12
18
 
@@ -62,15 +68,18 @@ function resolveIndexPath(p) {
62
68
  }
63
69
 
64
70
  /**
65
- * Print CLI usage to stderr.
71
+ * Print CLI usage. Goes to stdout for an explicit help request, stderr otherwise.
72
+ * @param {{help?: boolean}} [opts]
66
73
  * @returns {void}
67
74
  */
68
- function usage() {
69
- console.error(`vecito hybrid (dense + BM25) semantic search
75
+ function usage({ help = false } = {}) {
76
+ const write = help ? console.log : console.error;
77
+ write(`vecito — hybrid (dense + BM25) semantic search
70
78
 
71
79
  Usage:
72
80
  vecito index [dir] [-o data.vecito] [--mode dense|hybrid] [--ext .md,.txt,...] [--hidden] [--limit N]
73
- vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>]
81
+ vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>] [--json]
82
+ vecito --version | --help
74
83
 
75
84
  The trailing path is optional and defaults to the current directory.
76
85
 
@@ -84,7 +93,8 @@ Index options:
84
93
  Search options:
85
94
  --mode <m> Search mode: hybrid (default), dense, or sparse
86
95
  --top <n> Number of results (default: 10)
87
- --filter <expr> JS expression over metadata, e.g. 'meta.category === "science"'`);
96
+ --filter <expr> JS expression over metadata, e.g. 'meta.category === "science"'
97
+ --json Output results as JSON (score, ranks, full metadata) on stdout`);
88
98
  }
89
99
 
90
100
  /**
@@ -130,6 +140,7 @@ async function cmdSearch() {
130
140
 
131
141
  const mode = flag('--mode', 'hybrid');
132
142
  const top = parseInt(flag('--top', '10'), 10);
143
+ const asJson = hasFlag('--json');
133
144
  const filterExpr = flag('--filter', undefined);
134
145
  let filter;
135
146
  if (filterExpr) {
@@ -148,10 +159,19 @@ async function cmdSearch() {
148
159
 
149
160
  const vecito = await Vecito.load(indexFile);
150
161
  const effectiveMode = vecito.indexMode === 'dense' ? 'dense' : mode;
151
- console.log(`Loaded ${vecito.count} doc(s) from ${indexFile} [index: ${vecito.indexMode}, search: ${effectiveMode}]\n`);
152
- console.log(`Results for "${query}"${filterExpr ? ` (filter: ${filterExpr})` : ''}:\n`);
162
+ if (!asJson) {
163
+ console.log(`Loaded ${vecito.count} doc(s) from ${indexFile} [index: ${vecito.indexMode}, search: ${effectiveMode}]\n`);
164
+ console.log(`Results for "${query}"${filterExpr ? ` (filter: ${filterExpr})` : ''}:\n`);
165
+ }
153
166
 
154
167
  const results = await vecito.search(query, { mode: effectiveMode, top, filter });
168
+
169
+ if (asJson) {
170
+ // Pure JSON on stdout (score, ranks, full metadata) — pipeable into jq etc.
171
+ console.log(JSON.stringify(results, null, 2));
172
+ return;
173
+ }
174
+
155
175
  if (results.length === 0) {
156
176
  console.log('(no results)');
157
177
  return;
@@ -171,7 +191,13 @@ async function cmdSearch() {
171
191
  }
172
192
 
173
193
  const cmd = process.argv[2];
174
- if (cmd === 'index') {
194
+ if (cmd === '--version' || cmd === '-v') {
195
+ console.log(VERSION);
196
+ process.exit(0);
197
+ } else if (cmd === '--help' || cmd === '-h' || cmd === 'help' || !cmd) {
198
+ usage({ help: true });
199
+ process.exit(0);
200
+ } else if (cmd === 'index') {
175
201
  await cmdIndex();
176
202
  } else if (cmd === 'search') {
177
203
  await cmdSearch();
package/index.js CHANGED
@@ -3,3 +3,10 @@ export { BM25 } from './lib/bm25.js';
3
3
  export { VecStore } from './lib/vec-store.js';
4
4
  export { Vecito } from './lib/vecito.js';
5
5
  export { Highlighter } from './lib/highlight.js';
6
+
7
+ /**
8
+ * Re-export the public typedefs so consumers can import them from the package
9
+ * root. They are authored as JSDoc types on the Vecito module.
10
+ * @typedef {import('./lib/vecito.js').SearchResult} SearchResult
11
+ * @typedef {import('./lib/vecito.js').AddOptions} AddOptions
12
+ */
package/lib/vec-store.js CHANGED
@@ -129,20 +129,6 @@ export class VecStore {
129
129
  return this.#denseScan(query, k);
130
130
  }
131
131
 
132
- /**
133
- * Dense search with a JS predicate post-filter. Fetches `k * 5` candidates
134
- * from the HNSW, then applies the filter to metadata objects.
135
- * @param {Float32Array} query
136
- * @param {(meta: Record<string,any>) => boolean} filter JS predicate.
137
- * @param {number} [k=10]
138
- * @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
139
- */
140
- async searchWithFilter(query, filter, k = 10) {
141
- const candidates = this.#denseScan(query, k * 5);
142
- if (typeof filter !== 'function') return candidates.slice(0, k);
143
- return candidates.filter(r => filter(r.metadata)).slice(0, k);
144
- }
145
-
146
132
  /**
147
133
  * Hybrid dense + sparse search with RRF rank fusion.
148
134
  * Dense side uses the HNSW; sparse side uses brute-force dot products.
package/lib/vecito.js CHANGED
@@ -33,6 +33,7 @@ function placeholderSparse(dim) {
33
33
  * @property {number} [dense_rank] Rank on the dense side (hybrid mode).
34
34
  * @property {number} [sparse_rank] Rank on the sparse side (hybrid mode).
35
35
  * @property {Record<string, any>} metadata The document's metadata.
36
+ * @property {string[]} [matchedTerms] Query terms matched, present only when `search` is called with `matchedTerms: true`.
36
37
  */
37
38
 
38
39
  /**
@@ -168,7 +169,8 @@ export class Vecito {
168
169
  * 'hybrid' fuses dense + BM25 via RRF; 'dense' is vectors-only; 'sparse' is
169
170
  * BM25-weighted. Falls back to dense if the query has no in-vocab terms.
170
171
  * @param {number} [opts.top=10] Maximum number of results.
171
- * @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode, over-fetching to preserve the requested count.
172
+ * @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode. Over-fetches and grows the candidate set adaptively (up to the full index) so a selective predicate still returns up to `top` matches when they exist.
173
+ * @param {boolean} [opts.matchedTerms=false] When true, attach the matched query terms to each result as a `matchedTerms` string array.
172
174
  * @returns {Promise<SearchResult[]>}
173
175
  * @throws {Error} If nothing has been indexed or loaded yet.
174
176
  */
@@ -181,29 +183,42 @@ export class Vecito {
181
183
 
182
184
  const queryVec = await this.#embedder.embed(query);
183
185
 
184
- let results;
185
186
  let querySparse = null;
186
-
187
- const fetchK = filter ? top * 5 : top;
188
-
189
- if (effectiveMode === 'dense') {
190
- results = await this.#store.search(queryVec, fetchK);
191
- } else {
187
+ if (effectiveMode !== 'dense') {
192
188
  querySparse = this.#bm25.querySparse(query);
193
- const hasSparse = querySparse.indices.length > 0;
189
+ }
190
+ const hasSparse = !!querySparse && querySparse.indices.length > 0;
194
191
 
195
- if (!hasSparse) {
196
- results = await this.#store.search(queryVec, fetchK);
197
- } else if (effectiveMode === 'sparse') {
198
- results = this.#store.hybridSearch(queryVec, querySparse, fetchK, {
192
+ // Fetch the top-k ranked candidates from the store for the resolved mode.
193
+ const fetchCandidates = async (k) => {
194
+ if (effectiveMode === 'dense' || !hasSparse) {
195
+ return await this.#store.search(queryVec, k);
196
+ }
197
+ if (effectiveMode === 'sparse') {
198
+ return this.#store.hybridSearch(queryVec, querySparse, k, {
199
199
  fusion: { type: 'linear', alpha: 0.0 },
200
200
  });
201
- } else {
202
- results = this.#store.hybridSearch(queryVec, querySparse, fetchK);
203
201
  }
204
- }
202
+ return this.#store.hybridSearch(queryVec, querySparse, k);
203
+ };
205
204
 
206
- if (filter) results = results.filter(r => filter(r.metadata)).slice(0, top);
205
+ let results;
206
+ if (!filter) {
207
+ results = await fetchCandidates(top);
208
+ } else {
209
+ // Filtering happens after ranking, so a selective predicate can leave fewer
210
+ // than `top` hits in the first batch. Grow the fetch and retry until we have
211
+ // enough matches or we've scanned the entire index.
212
+ const total = this.#store.count;
213
+ let fetchK = Math.min(top * 5, total);
214
+ while (true) {
215
+ const candidates = await fetchCandidates(fetchK);
216
+ results = candidates.filter(r => filter(r.metadata));
217
+ if (results.length >= top || fetchK >= total) break;
218
+ fetchK = Math.min(fetchK * 4, total);
219
+ }
220
+ results = results.slice(0, top);
221
+ }
207
222
 
208
223
  if (includeTerms) {
209
224
  const terms = querySparse && querySparse.indices.length > 0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vecito",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Tiny hybrid (dense + BM25) semantic search for Node and the browser",
5
5
  "type": "module",
6
6
  "author": "Jeka Kiselyov",
@@ -13,7 +13,7 @@
13
13
  "url": "https://github.com/jeka-kiselyov/vecito/issues"
14
14
  },
15
15
  "main": "index.js",
16
- "types": "index.d.ts",
16
+ "types": "types/index.d.ts",
17
17
  "publishConfig": {
18
18
  "access": "public"
19
19
  },
@@ -22,23 +22,24 @@
22
22
  },
23
23
  "exports": {
24
24
  ".": {
25
- "types": "./index.d.ts",
25
+ "types": "./types/index.d.ts",
26
26
  "default": "./index.js"
27
27
  },
28
28
  "./file": {
29
- "types": "./file.d.ts",
29
+ "types": "./types/lib/file-index.d.ts",
30
30
  "default": "./lib/file-index.js"
31
31
  }
32
32
  },
33
33
  "files": [
34
34
  "index.js",
35
- "index.d.ts",
36
- "file.d.ts",
35
+ "types/",
37
36
  "lib/",
38
37
  "bin/"
39
38
  ],
40
39
  "scripts": {
41
40
  "test": "vitest run",
41
+ "types": "tsc -p tsconfig.json",
42
+ "prepublishOnly": "pnpm run types && pnpm test",
42
43
  "dev:browser": "vite"
43
44
  },
44
45
  "keywords": [
@@ -55,6 +56,7 @@
55
56
  "stemmer": "^2.0.1"
56
57
  },
57
58
  "devDependencies": {
59
+ "typescript": "^6.0.3",
58
60
  "vite": "^6.0.0",
59
61
  "vitest": "^3.2.0"
60
62
  },
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
@@ -0,0 +1,15 @@
1
+ export { Embedder } from "./lib/embedder.js";
2
+ export { BM25 } from "./lib/bm25.js";
3
+ export { VecStore } from "./lib/vec-store.js";
4
+ export { Vecito } from "./lib/vecito.js";
5
+ export { Highlighter } from "./lib/highlight.js";
6
+ /**
7
+ * Re-export the public typedefs so consumers can import them from the package
8
+ * root. They are authored as JSDoc types on the Vecito module.
9
+ */
10
+ export type SearchResult = import("./lib/vecito.js").SearchResult;
11
+ /**
12
+ * Re-export the public typedefs so consumers can import them from the package
13
+ * root. They are authored as JSDoc types on the Vecito module.
14
+ */
15
+ export type AddOptions = import("./lib/vecito.js").AddOptions;
@@ -0,0 +1,93 @@
1
+ /**
2
+ * BM25 sparse lexical model.
3
+ *
4
+ * Fit over a corpus to learn vocabulary and document frequencies, then produce
5
+ * sparse vectors (term-id → weight) for documents ({@link BM25#score}) and
6
+ * queries ({@link BM25#querySparse}). Serializable via {@link BM25#toJSON} /
7
+ * {@link BM25.fromJSON}.
8
+ */
9
+ export class BM25 {
10
+ /**
11
+ * Reconstruct a fitted model from {@link BM25#toJSON} output.
12
+ * @param {object} data
13
+ * @returns {BM25}
14
+ */
15
+ static fromJSON(data: object): BM25;
16
+ /**
17
+ * @param {object} [opts]
18
+ * @param {number} [opts.k1=1.2] Term-frequency saturation parameter.
19
+ * @param {number} [opts.b=0.75] Document-length normalization (0..1).
20
+ */
21
+ constructor({ k1, b }?: {
22
+ k1?: number | undefined;
23
+ b?: number | undefined;
24
+ });
25
+ /**
26
+ * Build the vocabulary and corpus statistics (document frequencies, average
27
+ * length) from a set of documents. Must be called before scoring.
28
+ * @param {string[]} texts The full corpus.
29
+ * @returns {void}
30
+ */
31
+ fit(texts: string[]): void;
32
+ /**
33
+ * Compute the BM25 sparse vector for a document. Out-of-vocabulary terms are
34
+ * ignored.
35
+ * @param {string} text Document text.
36
+ * @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
37
+ * sparse vector over the vocabulary.
38
+ */
39
+ score(text: string): {
40
+ indices: Uint32Array;
41
+ values: Float32Array;
42
+ dim: number;
43
+ };
44
+ /**
45
+ * Convenience wrapper that scores many documents.
46
+ * @param {string[]} texts
47
+ * @returns {Array<{indices: Uint32Array, values: Float32Array, dim: number}>}
48
+ */
49
+ scoreAll(texts: string[]): Array<{
50
+ indices: Uint32Array;
51
+ values: Float32Array;
52
+ dim: number;
53
+ }>;
54
+ /**
55
+ * Map a query string to the list of in-vocabulary term ids it contains.
56
+ * @param {string} queryText
57
+ * @returns {{indices: number[], vocabSize: number}} Matched term ids.
58
+ */
59
+ scoreQuery(queryText: string): {
60
+ indices: number[];
61
+ vocabSize: number;
62
+ };
63
+ /**
64
+ * Build an IDF-weighted sparse vector for a query (assumes term frequency 1
65
+ * per query term). Use this to drive sparse/hybrid search.
66
+ * @param {string} queryText
67
+ * @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
68
+ * sparse vector; empty `indices` means no query term is in the vocabulary.
69
+ */
70
+ querySparse(queryText: string): {
71
+ indices: Uint32Array;
72
+ values: Float32Array;
73
+ dim: number;
74
+ };
75
+ /**
76
+ * Serialize the fitted model to a plain JSON-safe object.
77
+ * @returns {object} Pass to {@link BM25.fromJSON} to restore.
78
+ */
79
+ toJSON(): object;
80
+ /**
81
+ * Map a list of vocabulary indices back to their original term strings.
82
+ * The reverse map is built lazily on first call and cached.
83
+ * @param {Uint32Array|number[]} indices Vocabulary term ids.
84
+ * @returns {string[]} The corresponding terms (unknown ids silently omitted).
85
+ */
86
+ termsForIndices(indices: Uint32Array | number[]): string[];
87
+ /**
88
+ * Number of distinct terms in the fitted vocabulary (the sparse dimension).
89
+ * @returns {number}
90
+ */
91
+ get vocabSize(): number;
92
+ #private;
93
+ }
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Dense text embedder backed by transformers.js.
3
+ *
4
+ * Produces L2-normalized, mean-pooled sentence embeddings. The model is loaded
5
+ * lazily on first use and cached for the lifetime of the instance.
6
+ */
7
+ export class Embedder {
8
+ /**
9
+ * @param {object} [opts]
10
+ * @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Hugging Face model id
11
+ * for a feature-extraction pipeline. Any output width is supported — the
12
+ * actual dimension is detected at {@link Embedder#init} (see
13
+ * {@link Embedder#dimensions}).
14
+ * @param {string} [opts.dtype='q8'] Weight precision to load: `'q8'`
15
+ * (quantized, ~4× smaller download, the default) or `'fp32'` (full
16
+ * precision), plus other transformers.js dtypes (`'fp16'`, `'q4'`, …) when
17
+ * the model provides them.
18
+ */
19
+ constructor({ model, dtype }?: {
20
+ model?: string | undefined;
21
+ dtype?: string | undefined;
22
+ });
23
+ /**
24
+ * Load the underlying pipeline if it hasn't been loaded yet, and detect the
25
+ * model's embedding width with a one-token probe. Safe to call repeatedly;
26
+ * subsequent calls are no-ops. Called automatically by {@link Embedder#embed}
27
+ * / {@link Embedder#embedBatch}.
28
+ * @returns {Promise<void>}
29
+ */
30
+ init(): Promise<void>;
31
+ /**
32
+ * Embed a single string into a normalized dense vector.
33
+ * @param {string} text Input text.
34
+ * @returns {Promise<Float32Array>} A {@link Embedder#dimensions}-length vector.
35
+ */
36
+ embed(text: string): Promise<Float32Array>;
37
+ /**
38
+ * Embed many strings, processing them in batches for throughput.
39
+ * @param {string[]} texts Input texts.
40
+ * @param {object} [opts]
41
+ * @param {number} [opts.batchSize=32] Number of texts per forward pass.
42
+ * @returns {Promise<Float32Array[]>} One vector per input, in input order.
43
+ * Each is a view into a shared buffer — copy it if you need to retain it
44
+ * independently.
45
+ */
46
+ embedBatch(texts: string[], { batchSize }?: {
47
+ batchSize?: number | undefined;
48
+ }): Promise<Float32Array[]>;
49
+ /**
50
+ * Dimensionality of the vectors this model produces. Available only after
51
+ * {@link Embedder#init} (or a first {@link Embedder#embed}) has run, since it
52
+ * is detected from the loaded model.
53
+ * @returns {number} e.g. 384 for MiniLM/BGE-small, 768 for MPNet/GTE-base.
54
+ * @throws {Error} If called before the model has been initialized.
55
+ */
56
+ get dimensions(): number;
57
+ /**
58
+ * The Hugging Face model id this embedder uses.
59
+ * @returns {string}
60
+ */
61
+ get model(): string;
62
+ /**
63
+ * Weight precision this embedder loads (e.g. `'q8'`, `'fp32'`).
64
+ * @returns {string}
65
+ */
66
+ get dtype(): string;
67
+ #private;
68
+ }
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Universal (Node + browser) helpers for turning arbitrary data into the text
3
+ * Vecito indexes and the metadata it returns with hits. No filesystem deps.
4
+ */
5
+ /**
6
+ * Recursively collect string leaves from any value and join them. Lets raw JSON
7
+ * objects/arrays be indexed without the caller pre-extracting their text.
8
+ * @param {*} value Any value — object, array, string, etc.
9
+ * @returns {string} All string leaves, '. '-joined (empty string if none).
10
+ */
11
+ export function flattenStrings(value: any): string;
12
+ /**
13
+ * Default text extractor: strings pass through; objects/arrays are flattened to
14
+ * their string leaves; everything else is stringified.
15
+ * @param {*} item
16
+ * @returns {string}
17
+ */
18
+ export function defaultText(item: any): string;
19
+ /**
20
+ * Default metadata extractor: objects are returned as-is (so search hits carry
21
+ * the original data back); non-objects carry no metadata.
22
+ * @param {*} item
23
+ * @returns {Record<string, any>}
24
+ */
25
+ export function defaultMetadata(item: any): Record<string, any>;
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Recursively collect files under `dir` whose extension is allowed. Skips
3
+ * dotfiles and dot-directories unless `hidden` is true.
4
+ * @param {string} dir Directory to walk.
5
+ * @param {object} [opts]
6
+ * @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
7
+ * @param {boolean} [opts.hidden=false] Include entries whose name starts with '.'.
8
+ * @param {number} [opts.limit=Infinity] Stop after this many files.
9
+ * @returns {string[]} Matching file paths.
10
+ */
11
+ export function walk(dir: string, { ext, hidden, limit }?: {
12
+ ext?: string[] | undefined;
13
+ hidden?: boolean | undefined;
14
+ limit?: number | undefined;
15
+ }): string[];
16
+ /**
17
+ * Index an explicit list of files into a fresh {@link Vecito}.
18
+ * @param {string[]} paths File paths to index (one document each).
19
+ * @param {object} [opts]
20
+ * @param {string} [opts.model] Embedding model id passed to Vecito.
21
+ * @param {string} [opts.dtype] Weight precision passed to Vecito.
22
+ * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
23
+ * @param {string} [opts.base] Base dir for relative `path` metadata.
24
+ * @returns {Promise<Vecito>}
25
+ */
26
+ export function indexFiles(paths: string[], { model, dtype, mode, base }?: {
27
+ model?: string | undefined;
28
+ dtype?: string | undefined;
29
+ mode?: "hybrid" | "dense" | undefined;
30
+ base?: string | undefined;
31
+ }): Promise<Vecito>;
32
+ /**
33
+ * Walk a directory and index every matching file into a fresh {@link Vecito}.
34
+ * @param {string} dir Directory to index.
35
+ * @param {object} [opts]
36
+ * @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
37
+ * @param {boolean} [opts.hidden=false] Include dotfiles/dot-directories.
38
+ * @param {number} [opts.limit=Infinity] Index at most this many files.
39
+ * @param {string} [opts.model] Embedding model id passed to Vecito.
40
+ * @param {string} [opts.dtype] Weight precision passed to Vecito.
41
+ * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
42
+ * @returns {Promise<Vecito>}
43
+ */
44
+ export function indexDirectory(dir: string, { ext, hidden, limit, model, dtype, mode }?: {
45
+ ext?: string[] | undefined;
46
+ hidden?: boolean | undefined;
47
+ limit?: number | undefined;
48
+ model?: string | undefined;
49
+ dtype?: string | undefined;
50
+ mode?: "hybrid" | "dense" | undefined;
51
+ }): Promise<Vecito>;
52
+ /**
53
+ * Filesystem layer on top of the core {@link Vecito} library. Turns files and
54
+ * directories into data items and feeds them to the generic indexer. Node-only
55
+ * (imports `fs`/`path`); use it via the `vecito/file` subpath export.
56
+ */
57
+ /** Broad default set of text-ish extensions. Override with the `ext` option. */
58
+ export const DEFAULT_EXTENSIONS: string[];
59
+ import { Vecito } from './vecito.js';
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Text highlighting utilities for search result display.
3
+ *
4
+ * {@link Highlighter.highlight} wraps matched terms in `<mark>` tags using
5
+ * stem-aware matching: "running" matches the term "run", "adventure" matches
6
+ * "adventurous", etc. {@link Highlighter.snippet} extracts a relevant excerpt
7
+ * with the same stem-aware center-finding.
8
+ */
9
+ export class Highlighter {
10
+ /**
11
+ * Escape HTML special characters in a plain-text string.
12
+ * @param {string} s
13
+ * @returns {string}
14
+ */
15
+ static escape(s: string): string;
16
+ /**
17
+ * Tokenize a query string for use as highlight terms. Splits on non-word
18
+ * characters, lowercases, drops stopwords and tokens shorter than 3 chars.
19
+ * Used as a fallback when BM25 matched terms are not available (dense mode).
20
+ * @param {string} text
21
+ * @returns {string[]} Unique tokens, longest first.
22
+ */
23
+ static tokenize(text: string): string[];
24
+ /**
25
+ * Wrap occurrences of `terms` in `text` with `<mark>` tags, HTML-escaping
26
+ * everything else. Matching is stem-aware and case-insensitive: the term
27
+ * "run" will match "running", "runs", "ran"; "adventure" matches "adventures".
28
+ * Gaps between matched words are bridged into one `<mark>` span when every
29
+ * word in the gap is a stopword (e.g. "Tales of Mystery" → single highlight).
30
+ * @param {string} text Plain text to highlight.
31
+ * @param {string[]|Set<string>} terms Terms to highlight.
32
+ * @returns {string} HTML string with `<mark>…</mark>` around matches.
33
+ */
34
+ static highlight(text: string, terms: string[] | Set<string>): string;
35
+ /**
36
+ * Extract a snippet of at most `maxLen` characters centred on the first
37
+ * stem match. Short texts are returned in full. Ellipsis (`…`) is added at
38
+ * truncated edges. Pass the result to {@link Highlighter.highlight} for markup.
39
+ * @param {string} text Plain text.
40
+ * @param {string[]|Set<string>} terms Terms to centre the window on.
41
+ * @param {number} [maxLen=220] Maximum character count of the returned snippet.
42
+ * @returns {string} Plain-text excerpt.
43
+ */
44
+ static snippet(text: string, terms: string[] | Set<string>, maxLen?: number): string;
45
+ }
@@ -0,0 +1,125 @@
1
+ /**
2
+ * Dense + sparse vector store backed by altor-vec (WASM HNSW).
3
+ *
4
+ * The HNSW graph serializes to bytes via `engine.to_bytes()` and restores in
5
+ * milliseconds via `new WasmSearchEngine(bytes)` — eliminating the long rebuild
6
+ * that edgevec required. Sparse (BM25) vectors are stored as plain arrays and
7
+ * searched with brute-force dot products, which gives 100% recall for ≤1M docs.
8
+ */
9
+ export class VecStore {
10
+ /**
11
+ * Load from a file (Node only).
12
+ * @param {string} filePath
13
+ * @returns {Promise<VecStore>}
14
+ */
15
+ static load(filePath: string): Promise<VecStore>;
16
+ /**
17
+ * Rebuild from bytes produced by {@link VecStore#exportBytes}.
18
+ * @param {Uint8Array} bytes
19
+ * @returns {Promise<VecStore>}
20
+ */
21
+ static loadFromBytes(bytes: Uint8Array): Promise<VecStore>;
22
+ /**
23
+ * Fetch an exported blob over HTTP and rebuild it.
24
+ * @param {string} url
25
+ * @returns {Promise<VecStore>}
26
+ */
27
+ static loadFromUrl(url: string): Promise<VecStore>;
28
+ /**
29
+ * Deserialize bytes produced by exportBytes.
30
+ * The HNSW graph is restored directly — no rebuild loop.
31
+ */
32
+ static #fromBytes(bytes: any): VecStore;
33
+ /**
34
+ * @param {object} opts
35
+ * @param {number} opts.dimensions Dense vector dimensionality.
36
+ */
37
+ constructor({ dimensions }: {
38
+ dimensions: number;
39
+ });
40
+ /**
41
+ * Initialize the WASM module. Must be called before insert / search.
42
+ * @returns {Promise<void>}
43
+ */
44
+ init(): Promise<void>;
45
+ /**
46
+ * Insert a dense vector with optional metadata.
47
+ * @param {Float32Array|number[]} vector
48
+ * @param {Record<string, any>} [metadata]
49
+ * @returns {number} 1-based id for backward compatibility.
50
+ */
51
+ insert(vector: Float32Array | number[], metadata?: Record<string, any>): number;
52
+ /**
53
+ * Enable sparse storage. Inserts a placeholder at index 0 so sparse ids
54
+ * (0-based) align with dense ids (1-based).
55
+ */
56
+ initSparse(): void;
57
+ /**
58
+ * Insert a sparse BM25 vector, paired positionally with the prior dense insert.
59
+ * @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
60
+ */
61
+ insertSparse(sparse: {
62
+ indices: Uint32Array;
63
+ values: Float32Array;
64
+ dim: number;
65
+ }): void;
66
+ /**
67
+ * Nearest-neighbor dense search.
68
+ * @param {Float32Array} query
69
+ * @param {number} [k=10]
70
+ * @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
71
+ */
72
+ search(query: Float32Array, k?: number): Promise<Array<{
73
+ id: number;
74
+ score: number;
75
+ metadata: Record<string, any>;
76
+ }>>;
77
+ /**
78
+ * Hybrid dense + sparse search with RRF rank fusion.
79
+ * Dense side uses the HNSW; sparse side uses brute-force dot products.
80
+ * @param {Float32Array} denseQuery
81
+ * @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
82
+ * @param {number} [k=10]
83
+ * @param {object} [opts]
84
+ * @param {number} [opts.dense_k]
85
+ * @param {number} [opts.sparse_k]
86
+ * @param {string|{type:string,alpha:number}} [opts.fusion='rrf']
87
+ * @returns {Array<{id:number,score:number,dense_rank?:number,sparse_rank?:number,metadata:Record<string,any>}>}
88
+ */
89
+ hybridSearch(denseQuery: Float32Array, sparse: {
90
+ indices: Uint32Array;
91
+ values: Float32Array;
92
+ dim: number;
93
+ }, k?: number, { dense_k, sparse_k, fusion }?: {
94
+ dense_k?: number | undefined;
95
+ sparse_k?: number | undefined;
96
+ fusion?: string | {
97
+ type: string;
98
+ alpha: number;
99
+ } | undefined;
100
+ }): Array<{
101
+ id: number;
102
+ score: number;
103
+ dense_rank?: number;
104
+ sparse_rank?: number;
105
+ metadata: Record<string, any>;
106
+ }>;
107
+ /**
108
+ * Serialize to a self-contained Uint8Array.
109
+ * Layout: [uint32 hnswLen][HNSW bytes][JSON: {dims, metadata, sparse?}]
110
+ * @returns {Uint8Array}
111
+ */
112
+ exportBytes(): Uint8Array;
113
+ /**
114
+ * Write to a file (Node only).
115
+ * @param {string} filePath
116
+ */
117
+ save(filePath: string): Promise<void>;
118
+ /** @alias save */
119
+ exportToFile(filePath: any): Promise<void>;
120
+ /** Number of indexed vectors. */
121
+ get count(): number;
122
+ /** Dimensionality of the dense vectors. */
123
+ get dimensions(): number;
124
+ #private;
125
+ }
@@ -0,0 +1,195 @@
1
+ /**
2
+ * @typedef {object} AddOptions
3
+ * @property {(item: any) => string} [text] Extract the searchable text from an item.
4
+ * Defaults to {@link defaultText} (strings pass through, objects are flattened
5
+ * to their string leaves).
6
+ * @property {(item: any) => Record<string, any>} [metadata] Extract metadata
7
+ * returned with hits. Defaults to {@link defaultMetadata} (objects are kept
8
+ * as-is, so hits carry the original data back).
9
+ */
10
+ /**
11
+ * @typedef {object} SearchResult
12
+ * @property {number} [id] Internal vector id.
13
+ * @property {number} [score] Relevance score (interpretation depends on mode).
14
+ * @property {number} [dense_rank] Rank on the dense side (hybrid mode).
15
+ * @property {number} [sparse_rank] Rank on the sparse side (hybrid mode).
16
+ * @property {Record<string, any>} metadata The document's metadata.
17
+ * @property {string[]} [matchedTerms] Query terms matched; present only when `search` is called with `matchedTerms: true`.
18
+ */
19
+ /**
20
+ * Vecito — isomorphic (Node + browser) hybrid semantic search.
21
+ *
22
+ * Orchestrates an Embedder (dense), BM25 (sparse) and a VecStore (edgevec) so
23
+ * callers get a one-liner instead of wiring the three primitives by hand. NOTE(review): the README names altor-vec, not edgevec, as the vector backend — confirm which is current.
24
+ * Universal methods (addDocuments / search / exportBytes / loadFromBytes /
25
+ * loadFromUrl) run anywhere; save/load are Node-only.
26
+ */
27
+ export class Vecito {
28
+ /**
29
+ * Parse a container blob produced by {@link Vecito#exportBytes} back into a
30
+ * ready-to-search instance.
31
+ * @param {Uint8Array} bytes
32
+ * @returns {Promise<Vecito>}
33
+ */
34
+ static #fromContainer(bytes: Uint8Array): Promise<Vecito>;
35
+ /**
36
+ * Rebuild from bytes produced by {@link Vecito#exportBytes} (universal).
37
+ * @param {Uint8Array|ArrayBuffer} bytes
38
+ * @returns {Promise<Vecito>}
39
+ */
40
+ static loadFromBytes(bytes: Uint8Array | ArrayBuffer): Promise<Vecito>;
41
+ /**
42
+ * Fetch an exported blob over HTTP and rebuild it (browser + Node fetch).
43
+ * @param {string} url
44
+ * @returns {Promise<Vecito>}
45
+ */
46
+ static loadFromUrl(url: string): Promise<Vecito>;
47
+ /**
48
+ * Read an exported blob from a file. Node only.
49
+ * @param {string} path
50
+ * @returns {Promise<Vecito>}
51
+ * @throws {Error} In non-Node environments.
52
+ */
53
+ static load(path: string): Promise<Vecito>;
54
+ /**
55
+ * @param {object} [opts]
56
+ * @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Embedding model id.
57
+ * @param {string} [opts.dtype='q8'] Weight precision: `'q8'` (the default) is a
58
+ * ≈ 4× smaller download; `'fp32'` keeps full precision.
59
+ * @param {Embedder} [opts.embedder] A pre-built (optionally pre-loaded)
60
+ * {@link Embedder} to use instead of constructing one — handy for reusing a
61
+ * loaded model across indexes or for timing model load separately. Takes
62
+ * precedence over `model`/`dtype`.
63
+ * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode. `'hybrid'` stores
64
+ * both dense vectors and BM25 sparse data; `'dense'` omits sparse/BM25 for a
65
+ * smaller snapshot. The mode is embedded in the snapshot and respected on load.
66
+ * @param {number} [opts.k1] BM25 term-frequency saturation (hybrid mode only).
67
+ * @param {number} [opts.b] BM25 length-normalization factor (hybrid mode only).
68
+ */
69
+ constructor({ model, dtype, embedder, mode, k1, b }?: {
70
+ model?: string | undefined;
71
+ dtype?: string | undefined;
72
+ embedder?: Embedder | undefined;
73
+ mode?: "hybrid" | "dense" | undefined;
74
+ k1?: number | undefined;
75
+ b?: number | undefined;
76
+ });
77
+ /**
78
+ * Index arbitrary data. Items may be strings, plain JSON objects, or anything
79
+ * else — the `text`/`metadata` extractors (with smart defaults) decide what to
80
+ * embed and what to return with hits, so raw objects work with zero config.
81
+ *
82
+ * BM25 (hybrid index mode only) is fit on the **first** call (its global df/idf statistics need a
83
+ * corpus), then frozen. Subsequent calls — including adding documents to an
84
+ * instance restored via {@link Vecito.load}/{@link Vecito.loadFromBytes} —
85
+ * append documents scored against that existing model, keeping the index
86
+ * consistent. Dense (semantic) search covers new documents fully; sparse
87
+ * scoring only sees terms already in the frozen vocabulary. For best lexical
88
+ * recall, pass your whole corpus in the first call.
89
+ * @param {any|any[]} items Item(s) to index.
90
+ * @param {AddOptions} [opts]
91
+ * @returns {Promise<this>} Resolves to the instance itself (declared as `Promise<this>`).
92
+ */
93
+ addDocuments(items: any | any[], { text, metadata }?: AddOptions): Promise<this>;
94
+ /**
95
+ * Search the index.
96
+ * @param {string} query Natural-language query.
97
+ * @param {object} [opts]
98
+ * @param {'hybrid'|'dense'|'sparse'} [opts.mode='hybrid'] Ranking strategy.
99
+ * 'hybrid' fuses dense + BM25 via RRF; 'dense' is vectors-only; 'sparse' is
100
+ * BM25-weighted. Falls back to dense if the query has no in-vocab terms.
101
+ * @param {number} [opts.top=10] Maximum number of results.
102
+ * @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode. Over-fetches and grows the candidate set adaptively (up to the full index) so a selective predicate still returns up to `top` matches when they exist.
103
+ * @param {boolean} [opts.matchedTerms=false] When true, attach the matched query terms to each result as a `matchedTerms` string array.
104
+ * @returns {Promise<SearchResult[]>}
105
+ * @throws {Error} If nothing has been indexed or loaded yet.
106
+ */
107
+ search(query: string, { mode, top, filter, matchedTerms: includeTerms }?: {
108
+ mode?: "sparse" | "hybrid" | "dense" | undefined;
109
+ top?: number | undefined;
110
+ filter?: ((meta: Record<string, any>) => boolean) | undefined;
111
+ matchedTerms?: boolean | undefined;
112
+ }): Promise<SearchResult[]>;
113
+ /**
114
+ * Number of indexed documents.
115
+ * @returns {number}
116
+ */
117
+ get count(): number;
118
+ /**
119
+ * The embedding model id this instance uses.
120
+ * @returns {string}
121
+ */
122
+ get model(): string;
123
+ /**
124
+ * Weight precision the embedder loads (e.g. `'q8'`, `'fp32'`).
125
+ * @returns {string}
126
+ */
127
+ get dtype(): string;
128
+ /**
129
+ * Width of the dense vectors in the index, or null before anything is indexed.
130
+ * @returns {number|null}
131
+ */
132
+ get dimensions(): number | null;
133
+ /**
134
+ * Index mode this instance was built with (`'hybrid'` or `'dense'`).
135
+ * @returns {'hybrid'|'dense'}
136
+ */
137
+ get indexMode(): "hybrid" | "dense";
138
+ /**
139
+ * Serialize everything (vectors + metadata + sparse + BM25 + model name +
140
+ * dtype) into one Uint8Array. Layout: [uint32 metaLen][meta JSON][VecStore
141
+ * bytes].
142
+ * @returns {Uint8Array}
143
+ * @throws {Error} If nothing has been indexed yet.
144
+ */
145
+ exportBytes(): Uint8Array;
146
+ /**
147
+ * Write the exported blob to a file. Node only.
148
+ * @param {string} path
149
+ * @returns {Promise<void>}
150
+ * @throws {Error} In non-Node environments.
151
+ */
152
+ save(path: string): Promise<void>;
153
+ #private;
154
+ }
155
+ export type AddOptions = {
156
+ /**
157
+ * Extract the searchable text from an item.
158
+ * Defaults to {@link defaultText} (strings pass through, objects are flattened
159
+ * to their string leaves).
160
+ */
161
+ text?: ((item: any) => string) | undefined;
162
+ /**
163
+ * Extract metadata returned with hits.
164
+ * Defaults to {@link defaultMetadata} (objects are kept as-is,
165
+ * so hits carry the original data back).
166
+ */
167
+ metadata?: ((item: any) => Record<string, any>) | undefined;
168
+ };
169
+ export type SearchResult = {
170
+ /**
171
+ * Internal vector id.
172
+ */
173
+ id?: number | undefined;
174
+ /**
175
+ * Relevance score (interpretation depends on mode).
176
+ */
177
+ score?: number | undefined;
178
+ /**
179
+ * Rank on the dense side (hybrid mode).
180
+ */
181
+ dense_rank?: number | undefined;
182
+ /**
183
+ * Rank on the sparse side (hybrid mode).
184
+ */
185
+ sparse_rank?: number | undefined;
186
+ /**
187
+ * The document's metadata.
188
+ */
189
+ metadata: Record<string, any>;
190
+ /**
191
+ * Query terms matched; present only when `search` is called with `matchedTerms: true`.
192
+ */
193
+ matchedTerms?: string[] | undefined;
194
+ };
195
+ import { Embedder } from './embedder.js';
package/file.d.ts DELETED
@@ -1,27 +0,0 @@
1
- import { Vecito } from './index';
2
-
3
- export const DEFAULT_EXTENSIONS: string[];
4
-
5
- export interface WalkOptions {
6
- ext?: string[];
7
- hidden?: boolean;
8
- limit?: number;
9
- }
10
-
11
- export interface IndexOptions extends WalkOptions {
12
- model?: string;
13
- dtype?: string;
14
- mode?: 'hybrid' | 'dense';
15
- }
16
-
17
- /** Recursively collect matching file paths, skipping dotfiles by default. */
18
- export function walk(dir: string, opts?: WalkOptions): string[];
19
-
20
- /** Index an explicit list of files into a fresh Vecito. */
21
- export function indexFiles(
22
- paths: string[],
23
- opts?: { model?: string; dtype?: string; mode?: 'hybrid' | 'dense'; base?: string }
24
- ): Promise<Vecito>;
25
-
26
- /** Walk a directory and index every matching file into a fresh Vecito. */
27
- export function indexDirectory(dir: string, opts?: IndexOptions): Promise<Vecito>;
package/index.d.ts DELETED
@@ -1,113 +0,0 @@
1
- export interface SparseVector {
2
- indices: Uint32Array;
3
- values: Float32Array;
4
- dim: number;
5
- }
6
-
7
- export interface SearchResult {
8
- id?: number;
9
- score?: number;
10
- dense_rank?: number;
11
- sparse_rank?: number;
12
- metadata: Record<string, any>;
13
- /** BM25-matched (hybrid) or tokenized (dense) terms. Present only when search was called with `{ matchedTerms: true }`. */
14
- matchedTerms?: string[];
15
- }
16
-
17
- export class Highlighter {
18
- /** Escape HTML special characters in a plain-text string. */
19
- static escape(s: string): string;
20
- /** Tokenize a query string for dense-mode fallback highlighting. */
21
- static tokenize(text: string): string[];
22
- /** Wrap occurrences of `terms` in `text` with `<mark>` tags. Matching is stem-aware: "run" matches "running", "adventure" matches "adventures". */
23
- static highlight(text: string, terms: string[] | Set<string>): string;
24
- /** Extract a snippet centred on the first stem match (plain text — pass to highlight for markup). */
25
- static snippet(text: string, terms: string[] | Set<string>, maxLen?: number): string;
26
- }
27
-
28
- export class Embedder {
29
- constructor(opts?: { model?: string; dtype?: string });
30
- init(): Promise<void>;
31
- embed(text: string): Promise<Float32Array>;
32
- embedBatch(texts: string[], opts?: { batchSize?: number }): Promise<Float32Array[]>;
33
- get dimensions(): number;
34
- get dtype(): string;
35
- get model(): string;
36
- }
37
-
38
- export class BM25 {
39
- constructor(opts?: { k1?: number; b?: number });
40
- fit(texts: string[]): void;
41
- score(text: string): SparseVector;
42
- scoreAll(texts: string[]): SparseVector[];
43
- /** Map a query string to the in-vocabulary term ids it contains. */
44
- scoreQuery(queryText: string): { indices: number[]; vocabSize: number };
45
- querySparse(queryText: string): SparseVector;
46
- /** Map vocabulary term ids back to their original term strings (unknown ids omitted). */
47
- termsForIndices(indices: Uint32Array | number[]): string[];
48
- toJSON(): Record<string, any>;
49
- static fromJSON(data: Record<string, any>): BM25;
50
- get vocabSize(): number;
51
- }
52
-
53
- export class VecStore {
54
- constructor(opts: { dimensions: number });
55
- init(): Promise<void>;
56
- insert(vector: Float32Array | number[], metadata?: Record<string, any>): number;
57
- initSparse(): void;
58
- insertSparse(sparse: SparseVector): void;
59
- search(query: Float32Array, k?: number): Promise<SearchResult[]>;
60
- /** Post-filters HNSW candidates with a JS predicate over metadata objects. */
61
- searchWithFilter(query: Float32Array, filter: (meta: Record<string, any>) => boolean, k?: number): Promise<SearchResult[]>;
62
- hybridSearch(
63
- denseQuery: Float32Array,
64
- sparse: SparseVector,
65
- k?: number,
66
- opts?: { dense_k?: number; sparse_k?: number; fusion?: any }
67
- ): SearchResult[];
68
- save(filePath: string): Promise<void>;
69
- /** Alias for {@link VecStore.save}. */
70
- exportToFile(filePath: string): Promise<void>;
71
- exportBytes(): Uint8Array;
72
- static load(filePath: string): Promise<VecStore>;
73
- static loadFromBytes(bytes: Uint8Array): Promise<VecStore>;
74
- static loadFromUrl(url: string): Promise<VecStore>;
75
- get count(): number;
76
- /** Dense vector width of the index. */
77
- get dimensions(): number;
78
- }
79
-
80
- export interface AddOptions {
81
- /** Extract searchable text from an item (default: flatten string values). */
82
- text?: (item: any) => string;
83
- /** Extract metadata returned with hits (default: the object itself). */
84
- metadata?: (item: any) => Record<string, any>;
85
- }
86
-
87
- export interface VecitoSearchOptions {
88
- mode?: 'hybrid' | 'dense' | 'sparse';
89
- top?: number;
90
- /** JS predicate over metadata — post-filters results in any mode, over-fetching to preserve the requested count. */
91
- filter?: (meta: Record<string, any>) => boolean;
92
- /** When true, each result includes `matchedTerms` for use with `Highlighter.highlight`. */
93
- matchedTerms?: boolean;
94
- }
95
-
96
- export class Vecito {
97
- constructor(opts?: { model?: string; dtype?: string; embedder?: Embedder; mode?: 'hybrid' | 'dense'; k1?: number; b?: number });
98
- addDocuments(items: any | any[], opts?: AddOptions): Promise<this>;
99
- search(query: string, opts?: VecitoSearchOptions): Promise<SearchResult[]>;
100
- exportBytes(): Uint8Array;
101
- save(path: string): Promise<void>;
102
- static load(path: string): Promise<Vecito>;
103
- static loadFromBytes(bytes: Uint8Array | ArrayBuffer): Promise<Vecito>;
104
- static loadFromUrl(url: string): Promise<Vecito>;
105
- get count(): number;
106
- get model(): string;
107
- /** Weight precision the embedder loads (e.g. 'q8', 'fp32'). */
108
- get dtype(): string;
109
- /** Dense vector width of the index, or null before anything is indexed. */
110
- get dimensions(): number | null;
111
- /** Index mode this instance was built with ('hybrid' or 'dense'). */
112
- get indexMode(): 'hybrid' | 'dense';
113
- }