vecito 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -4
- package/bin/cli.js +36 -10
- package/index.js +7 -0
- package/lib/vec-store.js +0 -14
- package/lib/vecito.js +32 -17
- package/package.json +8 -6
- package/types/bin/cli.d.ts +2 -0
- package/types/index.d.ts +15 -0
- package/types/lib/bm25.d.ts +93 -0
- package/types/lib/embedder.d.ts +68 -0
- package/types/lib/extract.d.ts +25 -0
- package/types/lib/file-index.d.ts +59 -0
- package/types/lib/highlight.d.ts +45 -0
- package/types/lib/vec-store.d.ts +125 -0
- package/types/lib/vecito.d.ts +195 -0
- package/file.d.ts +0 -27
- package/index.d.ts +0 -113
package/README.md
CHANGED
|
@@ -7,7 +7,7 @@ lexical scoring over an [`altor-vec`](https://github.com/altor-lab/altor-vec) WA
|
|
|
7
7
|
no API keys.
|
|
8
8
|
|
|
9
9
|
**Where to build the index.** Building a snapshot means embedding every document and constructing the HNSW graph — the expensive part. It's usually best to do this once on the server or with the CLI (`vecito index`), then serve the resulting `.vecito` file and load it in the browser with `Vecito.loadFromUrl()`, which restores the pre-built graph in milliseconds. Building, indexing, and adding documents directly in the browser is fully supported too — it just runs that same per-document embedding client-side, which is slow for large corpora.
|
|
10
|
-
|
|
10
|
+
|
|
11
11
|
## Install
|
|
12
12
|
|
|
13
13
|
```bash
|
|
@@ -52,6 +52,17 @@ await v.addDocuments(rows, {
|
|
|
52
52
|
(BM25-weighted). All modes support a `filter` predicate to post-filter results by metadata.
|
|
53
53
|
If the query has no in-vocabulary terms, hybrid/sparse automatically fall back to dense.
|
|
54
54
|
|
|
55
|
+
```js
|
|
56
|
+
// `filter` is a JS predicate over each hit's metadata — works in any mode.
|
|
57
|
+
// Filtering happens after ranking, so vecito over-fetches and grows the candidate
|
|
58
|
+
// set adaptively (up to the whole index) to still return `top` matches when they exist.
|
|
59
|
+
const hits = await v.search('how do plants make food?', {
|
|
60
|
+
mode: 'hybrid',
|
|
61
|
+
top: 3,
|
|
62
|
+
filter: m => m.title === 'Botany',
|
|
63
|
+
});
|
|
64
|
+
```
|
|
65
|
+
|
|
55
66
|
### Options & models
|
|
56
67
|
|
|
57
68
|
```js
|
|
@@ -69,6 +80,26 @@ later (including to a loaded snapshot — see below). Dense search covers new do
|
|
|
69
80
|
sparse scoring only sees terms already in the frozen vocabulary, so pass your whole corpus up
|
|
70
81
|
front for best lexical recall.
|
|
71
82
|
|
|
83
|
+
### Highlighting
|
|
84
|
+
|
|
85
|
+
Pass `matchedTerms: true` to get the query terms each hit matched, then render an excerpt with
|
|
86
|
+
the exported `Highlighter`. `snippet()` extracts a relevant window centred on the first match;
|
|
87
|
+
`highlight()` wraps matches in `<mark>` tags. Both are stem-aware (the term `run` matches
|
|
88
|
+
`running`/`ran`) and case-insensitive, and `highlight()` HTML-escapes everything else.
|
|
89
|
+
|
|
90
|
+
```js
|
|
91
|
+
import { Vecito, Highlighter } from 'vecito';
|
|
92
|
+
|
|
93
|
+
const hits = await v.search('how do plants make food?', { matchedTerms: true });
|
|
94
|
+
for (const h of hits) {
|
|
95
|
+
const excerpt = Highlighter.snippet(h.metadata.body, h.matchedTerms); // plain-text window (≤220 chars)
|
|
96
|
+
const html = Highlighter.highlight(excerpt, h.matchedTerms); // '…<mark>Photosynthesis</mark>…'
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
In dense mode (no BM25 terms available) `matchedTerms` falls back to the query's own tokens, so
|
|
101
|
+
highlighting still works. You can also call `Highlighter.tokenize(query)` to derive terms yourself.
|
|
102
|
+
|
|
72
103
|
### Persistence
|
|
73
104
|
|
|
74
105
|
```js
|
|
@@ -92,7 +123,7 @@ await loaded.save('data.vecito');
|
|
|
92
123
|
```
|
|
93
124
|
|
|
94
125
|
The primitives are exported too if you want to wire them yourself:
|
|
95
|
-
`import { Embedder, BM25, VecStore } from 'vecito'`.
|
|
126
|
+
`import { Embedder, BM25, VecStore, Highlighter } from 'vecito'`.
|
|
96
127
|
|
|
97
128
|
## File indexing (`vecito/file`)
|
|
98
129
|
|
|
@@ -137,7 +168,7 @@ out of dependency pre-bundling (`optimizeDeps.exclude`) so their `import.meta.ur
|
|
|
137
168
|
Install globally to get the `vecito` command on your `PATH`:
|
|
138
169
|
|
|
139
170
|
```bash
|
|
140
|
-
pnpm add -g vecito
|
|
171
|
+
pnpm add -g vecito --config.onlyBuiltDependencies='["onnxruntime-node","protobufjs","sharp"]'
|
|
141
172
|
```
|
|
142
173
|
|
|
143
174
|
Or run it without installing via `pnpm dlx vecito …`.
|
|
@@ -152,6 +183,12 @@ vecito index ./docs -o docs.vecito
|
|
|
152
183
|
# Search (path is optional; defaults to data.vecito in the current directory)
|
|
153
184
|
vecito search "renewable energy sources" --mode hybrid --top 5
|
|
154
185
|
vecito search "renewable energy sources" docs.vecito --top 5
|
|
186
|
+
|
|
187
|
+
# Filter by metadata — a JS expression with the hit's metadata bound to `meta`
|
|
188
|
+
vecito search "renewable energy" --filter 'meta.name.endsWith(".md")'
|
|
189
|
+
|
|
190
|
+
# Machine-readable output — score, ranks, and full metadata as JSON (pipeable)
|
|
191
|
+
vecito search "renewable energy" --json | jq '.[].metadata'
|
|
155
192
|
```
|
|
156
193
|
|
|
157
194
|
`index` recursively walks the directory, indexing a broad set of text/data/code extensions
|
|
@@ -164,7 +201,7 @@ The trailing path is optional and **defaults to the current directory** — `ind
|
|
|
164
201
|
|
|
165
202
|
```
|
|
166
203
|
vecito index [dir] [-o data.vecito] [--ext .md,.txt,...] [--hidden] [--limit N]
|
|
167
|
-
vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>]
|
|
204
|
+
vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>] [--json]
|
|
168
205
|
```
|
|
169
206
|
|
|
170
207
|
## Sample data
|
package/bin/cli.js
CHANGED
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { existsSync, statSync } from 'fs';
|
|
3
|
-
import { join } from 'path';
|
|
2
|
+
import { existsSync, statSync, readFileSync } from 'fs';
|
|
3
|
+
import { join, dirname } from 'path';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
4
5
|
import { Vecito } from '../lib/vecito.js';
|
|
5
6
|
import { walk, indexFiles } from '../lib/file-index.js';
|
|
6
7
|
|
|
7
8
|
/** Default index filename, used for both output and directory-relative lookup. */
|
|
8
9
|
const DEFAULT_INDEX = 'data.vecito';
|
|
9
10
|
|
|
11
|
+
/** Package version, read from package.json next to this CLI. */
|
|
12
|
+
const VERSION = JSON.parse(
|
|
13
|
+
readFileSync(join(dirname(fileURLToPath(import.meta.url)), '../package.json'), 'utf8'),
|
|
14
|
+
).version;
|
|
15
|
+
|
|
10
16
|
/** Flags that consume the following argv token as their value. */
|
|
11
17
|
const VALUE_FLAGS = new Set(['-o', '--out', '--ext', '--limit', '--mode', '--top', '--filter']);
|
|
12
18
|
|
|
@@ -62,15 +68,18 @@ function resolveIndexPath(p) {
|
|
|
62
68
|
}
|
|
63
69
|
|
|
64
70
|
/**
|
|
65
|
-
* Print CLI usage to stderr.
|
|
71
|
+
* Print CLI usage. Goes to stdout for an explicit help request, stderr otherwise.
|
|
72
|
+
* @param {{help?: boolean}} [opts]
|
|
66
73
|
* @returns {void}
|
|
67
74
|
*/
|
|
68
|
-
function usage() {
|
|
69
|
-
|
|
75
|
+
function usage({ help = false } = {}) {
|
|
76
|
+
const write = help ? console.log : console.error;
|
|
77
|
+
write(`vecito — hybrid (dense + BM25) semantic search
|
|
70
78
|
|
|
71
79
|
Usage:
|
|
72
80
|
vecito index [dir] [-o data.vecito] [--mode dense|hybrid] [--ext .md,.txt,...] [--hidden] [--limit N]
|
|
73
|
-
vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>]
|
|
81
|
+
vecito search <query> [path] [--mode dense|sparse|hybrid] [--top N] [--filter <expr>] [--json]
|
|
82
|
+
vecito --version | --help
|
|
74
83
|
|
|
75
84
|
The trailing path is optional and defaults to the current directory.
|
|
76
85
|
|
|
@@ -84,7 +93,8 @@ Index options:
|
|
|
84
93
|
Search options:
|
|
85
94
|
--mode <m> Search mode: hybrid (default), dense, or sparse
|
|
86
95
|
--top <n> Number of results (default: 10)
|
|
87
|
-
--filter <expr> JS expression over metadata, e.g. 'meta.category === "science"'
|
|
96
|
+
--filter <expr> JS expression over metadata, e.g. 'meta.category === "science"'
|
|
97
|
+
--json Output results as JSON (score, ranks, full metadata) on stdout`);
|
|
88
98
|
}
|
|
89
99
|
|
|
90
100
|
/**
|
|
@@ -130,6 +140,7 @@ async function cmdSearch() {
|
|
|
130
140
|
|
|
131
141
|
const mode = flag('--mode', 'hybrid');
|
|
132
142
|
const top = parseInt(flag('--top', '10'), 10);
|
|
143
|
+
const asJson = hasFlag('--json');
|
|
133
144
|
const filterExpr = flag('--filter', undefined);
|
|
134
145
|
let filter;
|
|
135
146
|
if (filterExpr) {
|
|
@@ -148,10 +159,19 @@ async function cmdSearch() {
|
|
|
148
159
|
|
|
149
160
|
const vecito = await Vecito.load(indexFile);
|
|
150
161
|
const effectiveMode = vecito.indexMode === 'dense' ? 'dense' : mode;
|
|
151
|
-
|
|
152
|
-
|
|
162
|
+
if (!asJson) {
|
|
163
|
+
console.log(`Loaded ${vecito.count} doc(s) from ${indexFile} [index: ${vecito.indexMode}, search: ${effectiveMode}]\n`);
|
|
164
|
+
console.log(`Results for "${query}"${filterExpr ? ` (filter: ${filterExpr})` : ''}:\n`);
|
|
165
|
+
}
|
|
153
166
|
|
|
154
167
|
const results = await vecito.search(query, { mode: effectiveMode, top, filter });
|
|
168
|
+
|
|
169
|
+
if (asJson) {
|
|
170
|
+
// Pure JSON on stdout (score, ranks, full metadata) — pipeable into jq etc.
|
|
171
|
+
console.log(JSON.stringify(results, null, 2));
|
|
172
|
+
return;
|
|
173
|
+
}
|
|
174
|
+
|
|
155
175
|
if (results.length === 0) {
|
|
156
176
|
console.log('(no results)');
|
|
157
177
|
return;
|
|
@@ -171,7 +191,13 @@ async function cmdSearch() {
|
|
|
171
191
|
}
|
|
172
192
|
|
|
173
193
|
const cmd = process.argv[2];
|
|
174
|
-
if (cmd === 'index') {
|
|
194
|
+
if (cmd === '--version' || cmd === '-v') {
|
|
195
|
+
console.log(VERSION);
|
|
196
|
+
process.exit(0);
|
|
197
|
+
} else if (cmd === '--help' || cmd === '-h' || cmd === 'help' || !cmd) {
|
|
198
|
+
usage({ help: true });
|
|
199
|
+
process.exit(0);
|
|
200
|
+
} else if (cmd === 'index') {
|
|
175
201
|
await cmdIndex();
|
|
176
202
|
} else if (cmd === 'search') {
|
|
177
203
|
await cmdSearch();
|
package/index.js
CHANGED
|
@@ -3,3 +3,10 @@ export { BM25 } from './lib/bm25.js';
|
|
|
3
3
|
export { VecStore } from './lib/vec-store.js';
|
|
4
4
|
export { Vecito } from './lib/vecito.js';
|
|
5
5
|
export { Highlighter } from './lib/highlight.js';
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Re-export the public typedefs so consumers can import them from the package
|
|
9
|
+
* root. They are authored as JSDoc types on the Vecito module.
|
|
10
|
+
* @typedef {import('./lib/vecito.js').SearchResult} SearchResult
|
|
11
|
+
* @typedef {import('./lib/vecito.js').AddOptions} AddOptions
|
|
12
|
+
*/
|
package/lib/vec-store.js
CHANGED
|
@@ -129,20 +129,6 @@ export class VecStore {
|
|
|
129
129
|
return this.#denseScan(query, k);
|
|
130
130
|
}
|
|
131
131
|
|
|
132
|
-
/**
|
|
133
|
-
* Dense search with a JS predicate post-filter. Fetches `k * 5` candidates
|
|
134
|
-
* from the HNSW, then applies the filter to metadata objects.
|
|
135
|
-
* @param {Float32Array} query
|
|
136
|
-
* @param {(meta: Record<string,any>) => boolean} filter JS predicate.
|
|
137
|
-
* @param {number} [k=10]
|
|
138
|
-
* @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
|
|
139
|
-
*/
|
|
140
|
-
async searchWithFilter(query, filter, k = 10) {
|
|
141
|
-
const candidates = this.#denseScan(query, k * 5);
|
|
142
|
-
if (typeof filter !== 'function') return candidates.slice(0, k);
|
|
143
|
-
return candidates.filter(r => filter(r.metadata)).slice(0, k);
|
|
144
|
-
}
|
|
145
|
-
|
|
146
132
|
/**
|
|
147
133
|
* Hybrid dense + sparse search with RRF rank fusion.
|
|
148
134
|
* Dense side uses the HNSW; sparse side uses brute-force dot products.
|
package/lib/vecito.js
CHANGED
|
@@ -33,6 +33,7 @@ function placeholderSparse(dim) {
|
|
|
33
33
|
* @property {number} [dense_rank] Rank on the dense side (hybrid mode).
|
|
34
34
|
* @property {number} [sparse_rank] Rank on the sparse side (hybrid mode).
|
|
35
35
|
* @property {Record<string, any>} metadata The document's metadata.
|
|
36
|
+
* @property {string[]} [matchedTerms] Query terms matched, present only when `search` is called with `matchedTerms: true`.
|
|
36
37
|
*/
|
|
37
38
|
|
|
38
39
|
/**
|
|
@@ -168,7 +169,8 @@ export class Vecito {
|
|
|
168
169
|
* 'hybrid' fuses dense + BM25 via RRF; 'dense' is vectors-only; 'sparse' is
|
|
169
170
|
* BM25-weighted. Falls back to dense if the query has no in-vocab terms.
|
|
170
171
|
* @param {number} [opts.top=10] Maximum number of results.
|
|
171
|
-
* @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode
|
|
172
|
+
* @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode. Over-fetches and grows the candidate set adaptively (up to the full index) so a selective predicate still returns up to `top` matches when they exist.
|
|
173
|
+
* @param {boolean} [opts.matchedTerms=false] When true, attach the matched query terms to each result as a `matchedTerms` string array.
|
|
172
174
|
* @returns {Promise<SearchResult[]>}
|
|
173
175
|
* @throws {Error} If nothing has been indexed or loaded yet.
|
|
174
176
|
*/
|
|
@@ -181,29 +183,42 @@ export class Vecito {
|
|
|
181
183
|
|
|
182
184
|
const queryVec = await this.#embedder.embed(query);
|
|
183
185
|
|
|
184
|
-
let results;
|
|
185
186
|
let querySparse = null;
|
|
186
|
-
|
|
187
|
-
const fetchK = filter ? top * 5 : top;
|
|
188
|
-
|
|
189
|
-
if (effectiveMode === 'dense') {
|
|
190
|
-
results = await this.#store.search(queryVec, fetchK);
|
|
191
|
-
} else {
|
|
187
|
+
if (effectiveMode !== 'dense') {
|
|
192
188
|
querySparse = this.#bm25.querySparse(query);
|
|
193
|
-
|
|
189
|
+
}
|
|
190
|
+
const hasSparse = !!querySparse && querySparse.indices.length > 0;
|
|
194
191
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
192
|
+
// Fetch the top-k ranked candidates from the store for the resolved mode.
|
|
193
|
+
const fetchCandidates = async (k) => {
|
|
194
|
+
if (effectiveMode === 'dense' || !hasSparse) {
|
|
195
|
+
return await this.#store.search(queryVec, k);
|
|
196
|
+
}
|
|
197
|
+
if (effectiveMode === 'sparse') {
|
|
198
|
+
return this.#store.hybridSearch(queryVec, querySparse, k, {
|
|
199
199
|
fusion: { type: 'linear', alpha: 0.0 },
|
|
200
200
|
});
|
|
201
|
-
} else {
|
|
202
|
-
results = this.#store.hybridSearch(queryVec, querySparse, fetchK);
|
|
203
201
|
}
|
|
204
|
-
|
|
202
|
+
return this.#store.hybridSearch(queryVec, querySparse, k);
|
|
203
|
+
};
|
|
205
204
|
|
|
206
|
-
|
|
205
|
+
let results;
|
|
206
|
+
if (!filter) {
|
|
207
|
+
results = await fetchCandidates(top);
|
|
208
|
+
} else {
|
|
209
|
+
// Filtering happens after ranking, so a selective predicate can leave fewer
|
|
210
|
+
// than `top` hits in the first batch. Grow the fetch and retry until we have
|
|
211
|
+
// enough matches or we've scanned the entire index.
|
|
212
|
+
const total = this.#store.count;
|
|
213
|
+
let fetchK = Math.min(top * 5, total);
|
|
214
|
+
while (true) {
|
|
215
|
+
const candidates = await fetchCandidates(fetchK);
|
|
216
|
+
results = candidates.filter(r => filter(r.metadata));
|
|
217
|
+
if (results.length >= top || fetchK >= total) break;
|
|
218
|
+
fetchK = Math.min(fetchK * 4, total);
|
|
219
|
+
}
|
|
220
|
+
results = results.slice(0, top);
|
|
221
|
+
}
|
|
207
222
|
|
|
208
223
|
if (includeTerms) {
|
|
209
224
|
const terms = querySparse && querySparse.indices.length > 0
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "vecito",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"description": "Tiny hybrid (dense + BM25) semantic search for Node and the browser",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"author": "Jeka Kiselyov",
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
"url": "https://github.com/jeka-kiselyov/vecito/issues"
|
|
14
14
|
},
|
|
15
15
|
"main": "index.js",
|
|
16
|
-
"types": "index.d.ts",
|
|
16
|
+
"types": "types/index.d.ts",
|
|
17
17
|
"publishConfig": {
|
|
18
18
|
"access": "public"
|
|
19
19
|
},
|
|
@@ -22,23 +22,24 @@
|
|
|
22
22
|
},
|
|
23
23
|
"exports": {
|
|
24
24
|
".": {
|
|
25
|
-
"types": "./index.d.ts",
|
|
25
|
+
"types": "./types/index.d.ts",
|
|
26
26
|
"default": "./index.js"
|
|
27
27
|
},
|
|
28
28
|
"./file": {
|
|
29
|
-
"types": "./file.d.ts",
|
|
29
|
+
"types": "./types/lib/file-index.d.ts",
|
|
30
30
|
"default": "./lib/file-index.js"
|
|
31
31
|
}
|
|
32
32
|
},
|
|
33
33
|
"files": [
|
|
34
34
|
"index.js",
|
|
35
|
-
"index.d.ts",
|
|
36
|
-
"file.d.ts",
|
|
35
|
+
"types/",
|
|
37
36
|
"lib/",
|
|
38
37
|
"bin/"
|
|
39
38
|
],
|
|
40
39
|
"scripts": {
|
|
41
40
|
"test": "vitest run",
|
|
41
|
+
"types": "tsc -p tsconfig.json",
|
|
42
|
+
"prepublishOnly": "pnpm run types && pnpm test",
|
|
42
43
|
"dev:browser": "vite"
|
|
43
44
|
},
|
|
44
45
|
"keywords": [
|
|
@@ -55,6 +56,7 @@
|
|
|
55
56
|
"stemmer": "^2.0.1"
|
|
56
57
|
},
|
|
57
58
|
"devDependencies": {
|
|
59
|
+
"typescript": "^6.0.3",
|
|
58
60
|
"vite": "^6.0.0",
|
|
59
61
|
"vitest": "^3.2.0"
|
|
60
62
|
},
|
package/types/index.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
export { Embedder } from "./lib/embedder.js";
|
|
2
|
+
export { BM25 } from "./lib/bm25.js";
|
|
3
|
+
export { VecStore } from "./lib/vec-store.js";
|
|
4
|
+
export { Vecito } from "./lib/vecito.js";
|
|
5
|
+
export { Highlighter } from "./lib/highlight.js";
|
|
6
|
+
/**
|
|
7
|
+
* Re-export the public typedefs so consumers can import them from the package
|
|
8
|
+
* root. They are authored as JSDoc types on the Vecito module.
|
|
9
|
+
*/
|
|
10
|
+
export type SearchResult = import("./lib/vecito.js").SearchResult;
|
|
11
|
+
/**
|
|
12
|
+
* Re-export the public typedefs so consumers can import them from the package
|
|
13
|
+
* root. They are authored as JSDoc types on the Vecito module.
|
|
14
|
+
*/
|
|
15
|
+
export type AddOptions = import("./lib/vecito.js").AddOptions;
|
package/types/lib/bm25.d.ts
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BM25 sparse lexical model.
|
|
3
|
+
*
|
|
4
|
+
* Fit over a corpus to learn vocabulary and document frequencies, then produce
|
|
5
|
+
* sparse vectors (term-id → weight) for documents ({@link BM25#score}) and
|
|
6
|
+
* queries ({@link BM25#querySparse}). Serializable via {@link BM25#toJSON} /
|
|
7
|
+
* {@link BM25.fromJSON}.
|
|
8
|
+
*/
|
|
9
|
+
export class BM25 {
|
|
10
|
+
/**
|
|
11
|
+
* Reconstruct a fitted model from {@link BM25#toJSON} output.
|
|
12
|
+
* @param {object} data
|
|
13
|
+
* @returns {BM25}
|
|
14
|
+
*/
|
|
15
|
+
static fromJSON(data: object): BM25;
|
|
16
|
+
/**
|
|
17
|
+
* @param {object} [opts]
|
|
18
|
+
* @param {number} [opts.k1=1.2] Term-frequency saturation parameter.
|
|
19
|
+
* @param {number} [opts.b=0.75] Document-length normalization (0..1).
|
|
20
|
+
*/
|
|
21
|
+
constructor({ k1, b }?: {
|
|
22
|
+
k1?: number | undefined;
|
|
23
|
+
b?: number | undefined;
|
|
24
|
+
});
|
|
25
|
+
/**
|
|
26
|
+
* Build the vocabulary and corpus statistics (document frequencies, average
|
|
27
|
+
* length) from a set of documents. Must be called before scoring.
|
|
28
|
+
* @param {string[]} texts The full corpus.
|
|
29
|
+
* @returns {void}
|
|
30
|
+
*/
|
|
31
|
+
fit(texts: string[]): void;
|
|
32
|
+
/**
|
|
33
|
+
* Compute the BM25 sparse vector for a document. Out-of-vocabulary terms are
|
|
34
|
+
* ignored.
|
|
35
|
+
* @param {string} text Document text.
|
|
36
|
+
* @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
|
|
37
|
+
* sparse vector over the vocabulary.
|
|
38
|
+
*/
|
|
39
|
+
score(text: string): {
|
|
40
|
+
indices: Uint32Array;
|
|
41
|
+
values: Float32Array;
|
|
42
|
+
dim: number;
|
|
43
|
+
};
|
|
44
|
+
/**
|
|
45
|
+
* Convenience wrapper that scores many documents.
|
|
46
|
+
* @param {string[]} texts
|
|
47
|
+
* @returns {Array<{indices: Uint32Array, values: Float32Array, dim: number}>}
|
|
48
|
+
*/
|
|
49
|
+
scoreAll(texts: string[]): Array<{
|
|
50
|
+
indices: Uint32Array;
|
|
51
|
+
values: Float32Array;
|
|
52
|
+
dim: number;
|
|
53
|
+
}>;
|
|
54
|
+
/**
|
|
55
|
+
* Map a query string to the list of in-vocabulary term ids it contains.
|
|
56
|
+
* @param {string} queryText
|
|
57
|
+
* @returns {{indices: number[], vocabSize: number}} Matched term ids.
|
|
58
|
+
*/
|
|
59
|
+
scoreQuery(queryText: string): {
|
|
60
|
+
indices: number[];
|
|
61
|
+
vocabSize: number;
|
|
62
|
+
};
|
|
63
|
+
/**
|
|
64
|
+
* Build an IDF-weighted sparse vector for a query (assumes term frequency 1
|
|
65
|
+
* per query term). Use this to drive sparse/hybrid search.
|
|
66
|
+
* @param {string} queryText
|
|
67
|
+
* @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
|
|
68
|
+
* sparse vector; empty `indices` means no query term is in the vocabulary.
|
|
69
|
+
*/
|
|
70
|
+
querySparse(queryText: string): {
|
|
71
|
+
indices: Uint32Array;
|
|
72
|
+
values: Float32Array;
|
|
73
|
+
dim: number;
|
|
74
|
+
};
|
|
75
|
+
/**
|
|
76
|
+
* Serialize the fitted model to a plain JSON-safe object.
|
|
77
|
+
* @returns {object} Pass to {@link BM25.fromJSON} to restore.
|
|
78
|
+
*/
|
|
79
|
+
toJSON(): object;
|
|
80
|
+
/**
|
|
81
|
+
* Map a list of vocabulary indices back to their original term strings.
|
|
82
|
+
* The reverse map is built lazily on first call and cached.
|
|
83
|
+
* @param {Uint32Array|number[]} indices Vocabulary term ids.
|
|
84
|
+
* @returns {string[]} The corresponding terms (unknown ids silently omitted).
|
|
85
|
+
*/
|
|
86
|
+
termsForIndices(indices: Uint32Array | number[]): string[];
|
|
87
|
+
/**
|
|
88
|
+
* Number of distinct terms in the fitted vocabulary (the sparse dimension).
|
|
89
|
+
* @returns {number}
|
|
90
|
+
*/
|
|
91
|
+
get vocabSize(): number;
|
|
92
|
+
#private;
|
|
93
|
+
}
|
package/types/lib/embedder.d.ts
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dense text embedder backed by transformers.js.
|
|
3
|
+
*
|
|
4
|
+
* Produces L2-normalized, mean-pooled sentence embeddings. The model is loaded
|
|
5
|
+
* lazily on first use and cached for the lifetime of the instance.
|
|
6
|
+
*/
|
|
7
|
+
export class Embedder {
|
|
8
|
+
/**
|
|
9
|
+
* @param {object} [opts]
|
|
10
|
+
* @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Hugging Face model id
|
|
11
|
+
* for a feature-extraction pipeline. Any output width is supported — the
|
|
12
|
+
* actual dimension is detected at {@link Embedder#init} (see
|
|
13
|
+
* {@link Embedder#dimensions}).
|
|
14
|
+
* @param {string} [opts.dtype='q8'] Weight precision to load: `'q8'`
|
|
15
|
+
* (quantized, ~4× smaller download, the default) or `'fp32'` (full
|
|
16
|
+
* precision), plus other transformers.js dtypes (`'fp16'`, `'q4'`, …) when
|
|
17
|
+
* the model provides them.
|
|
18
|
+
*/
|
|
19
|
+
constructor({ model, dtype }?: {
|
|
20
|
+
model?: string | undefined;
|
|
21
|
+
dtype?: string | undefined;
|
|
22
|
+
});
|
|
23
|
+
/**
|
|
24
|
+
* Load the underlying pipeline if it hasn't been loaded yet, and detect the
|
|
25
|
+
* model's embedding width with a one-token probe. Safe to call repeatedly;
|
|
26
|
+
* subsequent calls are no-ops. Called automatically by {@link Embedder#embed}
|
|
27
|
+
* / {@link Embedder#embedBatch}.
|
|
28
|
+
* @returns {Promise<void>}
|
|
29
|
+
*/
|
|
30
|
+
init(): Promise<void>;
|
|
31
|
+
/**
|
|
32
|
+
* Embed a single string into a normalized dense vector.
|
|
33
|
+
* @param {string} text Input text.
|
|
34
|
+
* @returns {Promise<Float32Array>} A {@link Embedder#dimensions}-length vector.
|
|
35
|
+
*/
|
|
36
|
+
embed(text: string): Promise<Float32Array>;
|
|
37
|
+
/**
|
|
38
|
+
* Embed many strings, processing them in batches for throughput.
|
|
39
|
+
* @param {string[]} texts Input texts.
|
|
40
|
+
* @param {object} [opts]
|
|
41
|
+
* @param {number} [opts.batchSize=32] Number of texts per forward pass.
|
|
42
|
+
* @returns {Promise<Float32Array[]>} One vector per input, in input order.
|
|
43
|
+
* Each is a view into a shared buffer — copy it if you need to retain it
|
|
44
|
+
* independently.
|
|
45
|
+
*/
|
|
46
|
+
embedBatch(texts: string[], { batchSize }?: {
|
|
47
|
+
batchSize?: number | undefined;
|
|
48
|
+
}): Promise<Float32Array[]>;
|
|
49
|
+
/**
|
|
50
|
+
* Dimensionality of the vectors this model produces. Available only after
|
|
51
|
+
* {@link Embedder#init} (or a first {@link Embedder#embed}) has run, since it
|
|
52
|
+
* is detected from the loaded model.
|
|
53
|
+
* @returns {number} e.g. 384 for MiniLM/BGE-small, 768 for MPNet/GTE-base.
|
|
54
|
+
* @throws {Error} If called before the model has been initialized.
|
|
55
|
+
*/
|
|
56
|
+
get dimensions(): number;
|
|
57
|
+
/**
|
|
58
|
+
* The Hugging Face model id this embedder uses.
|
|
59
|
+
* @returns {string}
|
|
60
|
+
*/
|
|
61
|
+
get model(): string;
|
|
62
|
+
/**
|
|
63
|
+
* Weight precision this embedder loads (e.g. `'q8'`, `'fp32'`).
|
|
64
|
+
* @returns {string}
|
|
65
|
+
*/
|
|
66
|
+
get dtype(): string;
|
|
67
|
+
#private;
|
|
68
|
+
}
|
package/types/lib/extract.d.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Universal (Node + browser) helpers for turning arbitrary data into the text
|
|
3
|
+
* Vecito indexes and the metadata it returns with hits. No filesystem deps.
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Recursively collect string leaves from any value and join them. Lets raw JSON
|
|
7
|
+
* objects/arrays be indexed without the caller pre-extracting their text.
|
|
8
|
+
* @param {*} value Any value — object, array, string, etc.
|
|
9
|
+
* @returns {string} All string leaves, '. '-joined (empty string if none).
|
|
10
|
+
*/
|
|
11
|
+
export function flattenStrings(value: any): string;
|
|
12
|
+
/**
|
|
13
|
+
* Default text extractor: strings pass through; objects/arrays are flattened to
|
|
14
|
+
* their string leaves; everything else is stringified.
|
|
15
|
+
* @param {*} item
|
|
16
|
+
* @returns {string}
|
|
17
|
+
*/
|
|
18
|
+
export function defaultText(item: any): string;
|
|
19
|
+
/**
|
|
20
|
+
* Default metadata extractor: objects are returned as-is (so search hits carry
|
|
21
|
+
* the original data back); non-objects carry no metadata.
|
|
22
|
+
* @param {*} item
|
|
23
|
+
* @returns {Record<string, any>}
|
|
24
|
+
*/
|
|
25
|
+
export function defaultMetadata(item: any): Record<string, any>;
|
package/types/lib/file-index.d.ts
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Recursively collect files under `dir` whose extension is allowed. Skips
|
|
3
|
+
* dotfiles and dot-directories unless `hidden` is true.
|
|
4
|
+
* @param {string} dir Directory to walk.
|
|
5
|
+
* @param {object} [opts]
|
|
6
|
+
* @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
|
|
7
|
+
* @param {boolean} [opts.hidden=false] Include entries whose name starts with '.'.
|
|
8
|
+
* @param {number} [opts.limit=Infinity] Stop after this many files.
|
|
9
|
+
* @returns {string[]} Matching file paths.
|
|
10
|
+
*/
|
|
11
|
+
export function walk(dir: string, { ext, hidden, limit }?: {
|
|
12
|
+
ext?: string[] | undefined;
|
|
13
|
+
hidden?: boolean | undefined;
|
|
14
|
+
limit?: number | undefined;
|
|
15
|
+
}): string[];
|
|
16
|
+
/**
|
|
17
|
+
* Index an explicit list of files into a fresh {@link Vecito}.
|
|
18
|
+
* @param {string[]} paths File paths to index (one document each).
|
|
19
|
+
* @param {object} [opts]
|
|
20
|
+
* @param {string} [opts.model] Embedding model id passed to Vecito.
|
|
21
|
+
* @param {string} [opts.dtype] Weight precision passed to Vecito.
|
|
22
|
+
* @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
|
|
23
|
+
* @param {string} [opts.base] Base dir for relative `path` metadata.
|
|
24
|
+
* @returns {Promise<Vecito>}
|
|
25
|
+
*/
|
|
26
|
+
export function indexFiles(paths: string[], { model, dtype, mode, base }?: {
|
|
27
|
+
model?: string | undefined;
|
|
28
|
+
dtype?: string | undefined;
|
|
29
|
+
mode?: "hybrid" | "dense" | undefined;
|
|
30
|
+
base?: string | undefined;
|
|
31
|
+
}): Promise<Vecito>;
|
|
32
|
+
/**
|
|
33
|
+
* Walk a directory and index every matching file into a fresh {@link Vecito}.
|
|
34
|
+
* @param {string} dir Directory to index.
|
|
35
|
+
* @param {object} [opts]
|
|
36
|
+
* @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
|
|
37
|
+
* @param {boolean} [opts.hidden=false] Include dotfiles/dot-directories.
|
|
38
|
+
* @param {number} [opts.limit=Infinity] Index at most this many files.
|
|
39
|
+
* @param {string} [opts.model] Embedding model id passed to Vecito.
|
|
40
|
+
* @param {string} [opts.dtype] Weight precision passed to Vecito.
|
|
41
|
+
* @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
|
|
42
|
+
* @returns {Promise<Vecito>}
|
|
43
|
+
*/
|
|
44
|
+
export function indexDirectory(dir: string, { ext, hidden, limit, model, dtype, mode }?: {
|
|
45
|
+
ext?: string[] | undefined;
|
|
46
|
+
hidden?: boolean | undefined;
|
|
47
|
+
limit?: number | undefined;
|
|
48
|
+
model?: string | undefined;
|
|
49
|
+
dtype?: string | undefined;
|
|
50
|
+
mode?: "hybrid" | "dense" | undefined;
|
|
51
|
+
}): Promise<Vecito>;
|
|
52
|
+
/**
|
|
53
|
+
* Filesystem layer on top of the core {@link Vecito} library. Turns files and
|
|
54
|
+
* directories into data items and feeds them to the generic indexer. Node-only
|
|
55
|
+
* (imports `fs`/`path`); use it via the `vecito/file` subpath export.
|
|
56
|
+
*/
|
|
57
|
+
/** Broad default set of text-ish extensions. Override with the `ext` option. */
|
|
58
|
+
export const DEFAULT_EXTENSIONS: string[];
|
|
59
|
+
import { Vecito } from './vecito.js';
|
package/types/lib/highlight.d.ts
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text highlighting utilities for search result display.
|
|
3
|
+
*
|
|
4
|
+
* {@link Highlighter.highlight} wraps matched terms in `<mark>` tags using
|
|
5
|
+
* stem-aware matching: "running" matches the term "run", "adventure" matches
|
|
6
|
+
* "adventurous", etc. {@link Highlighter.snippet} extracts a relevant excerpt
|
|
7
|
+
* with the same stem-aware center-finding.
|
|
8
|
+
*/
|
|
9
|
+
export class Highlighter {
|
|
10
|
+
/**
|
|
11
|
+
* Escape HTML special characters in a plain-text string.
|
|
12
|
+
* @param {string} s
|
|
13
|
+
* @returns {string}
|
|
14
|
+
*/
|
|
15
|
+
static escape(s: string): string;
|
|
16
|
+
/**
|
|
17
|
+
* Tokenize a query string for use as highlight terms. Splits on non-word
|
|
18
|
+
* characters, lowercases, drops stopwords and tokens shorter than 3 chars.
|
|
19
|
+
* Used as a fallback when BM25 matched terms are not available (dense mode).
|
|
20
|
+
* @param {string} text
|
|
21
|
+
* @returns {string[]} Unique tokens, longest first.
|
|
22
|
+
*/
|
|
23
|
+
static tokenize(text: string): string[];
|
|
24
|
+
/**
|
|
25
|
+
* Wrap occurrences of `terms` in `text` with `<mark>` tags, HTML-escaping
|
|
26
|
+
* everything else. Matching is stem-aware and case-insensitive: the term
|
|
27
|
+
* "run" will match "running", "runs", "ran"; "adventure" matches "adventures".
|
|
28
|
+
* Gaps between matched words are bridged into one `<mark>` span when every
|
|
29
|
+
* word in the gap is a stopword (e.g. "Tales of Mystery" → single highlight).
|
|
30
|
+
* @param {string} text Plain text to highlight.
|
|
31
|
+
* @param {string[]|Set<string>} terms Terms to highlight.
|
|
32
|
+
* @returns {string} HTML string with `<mark>…</mark>` around matches.
|
|
33
|
+
*/
|
|
34
|
+
static highlight(text: string, terms: string[] | Set<string>): string;
|
|
35
|
+
/**
|
|
36
|
+
* Extract a snippet of at most `maxLen` characters centred on the first
|
|
37
|
+
* stem match. Short texts are returned in full. Ellipsis (`…`) is added at
|
|
38
|
+
* truncated edges. Pass the result to {@link Highlighter.highlight} for markup.
|
|
39
|
+
* @param {string} text Plain text.
|
|
40
|
+
* @param {string[]|Set<string>} terms Terms to centre the window on.
|
|
41
|
+
* @param {number} [maxLen=220] Maximum character count of the returned snippet.
|
|
42
|
+
* @returns {string} Plain-text excerpt.
|
|
43
|
+
*/
|
|
44
|
+
static snippet(text: string, terms: string[] | Set<string>, maxLen?: number): string;
|
|
45
|
+
}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dense + sparse vector store backed by altor-vec (WASM HNSW).
|
|
3
|
+
*
|
|
4
|
+
* The HNSW graph serializes to bytes via `engine.to_bytes()` and restores in
|
|
5
|
+
* milliseconds via `new WasmSearchEngine(bytes)` — eliminating the long rebuild
|
|
6
|
+
* that edgevec required. Sparse (BM25) vectors are stored as plain arrays and
|
|
7
|
+
* searched with brute-force dot products, which gives 100% recall for ≤1M docs.
|
|
8
|
+
*/
|
|
9
|
+
export class VecStore {
|
|
10
|
+
/**
|
|
11
|
+
* Load from a file (Node only).
|
|
12
|
+
* @param {string} filePath
|
|
13
|
+
* @returns {Promise<VecStore>}
|
|
14
|
+
*/
|
|
15
|
+
static load(filePath: string): Promise<VecStore>;
|
|
16
|
+
/**
|
|
17
|
+
* Rebuild from bytes produced by {@link VecStore#exportBytes}.
|
|
18
|
+
* @param {Uint8Array} bytes
|
|
19
|
+
* @returns {Promise<VecStore>}
|
|
20
|
+
*/
|
|
21
|
+
static loadFromBytes(bytes: Uint8Array): Promise<VecStore>;
|
|
22
|
+
/**
|
|
23
|
+
* Fetch an exported blob over HTTP and rebuild it.
|
|
24
|
+
* @param {string} url
|
|
25
|
+
* @returns {Promise<VecStore>}
|
|
26
|
+
*/
|
|
27
|
+
static loadFromUrl(url: string): Promise<VecStore>;
|
|
28
|
+
/**
|
|
29
|
+
* Deserialize bytes produced by exportBytes.
|
|
30
|
+
* The HNSW graph is restored directly — no rebuild loop.
|
|
31
|
+
*/
|
|
32
|
+
static #fromBytes(bytes: any): VecStore;
|
|
33
|
+
/**
|
|
34
|
+
* @param {object} opts
|
|
35
|
+
* @param {number} opts.dimensions Dense vector dimensionality.
|
|
36
|
+
*/
|
|
37
|
+
constructor({ dimensions }: {
|
|
38
|
+
dimensions: number;
|
|
39
|
+
});
|
|
40
|
+
/**
|
|
41
|
+
* Initialize the WASM module. Must be called before insert / search.
|
|
42
|
+
* @returns {Promise<void>}
|
|
43
|
+
*/
|
|
44
|
+
init(): Promise<void>;
|
|
45
|
+
/**
|
|
46
|
+
* Insert a dense vector with optional metadata.
|
|
47
|
+
* @param {Float32Array|number[]} vector
|
|
48
|
+
* @param {Record<string, any>} [metadata]
|
|
49
|
+
* @returns {number} 1-based id for backward compatibility.
|
|
50
|
+
*/
|
|
51
|
+
insert(vector: Float32Array | number[], metadata?: Record<string, any>): number;
|
|
52
|
+
/**
|
|
53
|
+
* Enable sparse storage. Inserts a placeholder at index 0 so sparse ids
|
|
54
|
+
* (0-based) align with dense ids (1-based).
|
|
55
|
+
*/
|
|
56
|
+
initSparse(): void;
|
|
57
|
+
/**
|
|
58
|
+
* Insert a sparse BM25 vector, paired positionally with the prior dense insert.
|
|
59
|
+
* @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
|
|
60
|
+
*/
|
|
61
|
+
insertSparse(sparse: {
|
|
62
|
+
indices: Uint32Array;
|
|
63
|
+
values: Float32Array;
|
|
64
|
+
dim: number;
|
|
65
|
+
}): void;
|
|
66
|
+
/**
|
|
67
|
+
* Nearest-neighbor dense search.
|
|
68
|
+
* @param {Float32Array} query
|
|
69
|
+
* @param {number} [k=10]
|
|
70
|
+
* @returns {Promise<Array<{id: number, score: number, metadata: Record<string,any>}>>}
|
|
71
|
+
*/
|
|
72
|
+
search(query: Float32Array, k?: number): Promise<Array<{
|
|
73
|
+
id: number;
|
|
74
|
+
score: number;
|
|
75
|
+
metadata: Record<string, any>;
|
|
76
|
+
}>>;
|
|
77
|
+
/**
|
|
78
|
+
* Hybrid dense + sparse search with RRF rank fusion.
|
|
79
|
+
* Dense side uses the HNSW; sparse side uses brute-force dot products.
|
|
80
|
+
* @param {Float32Array} denseQuery
|
|
81
|
+
* @param {{indices: Uint32Array, values: Float32Array, dim: number}} sparse
|
|
82
|
+
* @param {number} [k=10]
|
|
83
|
+
* @param {object} [opts]
|
|
84
|
+
* @param {number} [opts.dense_k]
|
|
85
|
+
* @param {number} [opts.sparse_k]
|
|
86
|
+
* @param {string|{type:string,alpha:number}} [opts.fusion='rrf']
|
|
87
|
+
* @returns {Array<{id:number,score:number,dense_rank?:number,sparse_rank?:number,metadata:Record<string,any>}>}
|
|
88
|
+
*/
|
|
89
|
+
hybridSearch(denseQuery: Float32Array, sparse: {
|
|
90
|
+
indices: Uint32Array;
|
|
91
|
+
values: Float32Array;
|
|
92
|
+
dim: number;
|
|
93
|
+
}, k?: number, { dense_k, sparse_k, fusion }?: {
|
|
94
|
+
dense_k?: number | undefined;
|
|
95
|
+
sparse_k?: number | undefined;
|
|
96
|
+
fusion?: string | {
|
|
97
|
+
type: string;
|
|
98
|
+
alpha: number;
|
|
99
|
+
} | undefined;
|
|
100
|
+
}): Array<{
|
|
101
|
+
id: number;
|
|
102
|
+
score: number;
|
|
103
|
+
dense_rank?: number;
|
|
104
|
+
sparse_rank?: number;
|
|
105
|
+
metadata: Record<string, any>;
|
|
106
|
+
}>;
|
|
107
|
+
/**
|
|
108
|
+
* Serialize to a self-contained Uint8Array.
|
|
109
|
+
* Layout: [uint32 hnswLen][HNSW bytes][JSON: {dims, metadata, sparse?}]
|
|
110
|
+
* @returns {Uint8Array}
|
|
111
|
+
*/
|
|
112
|
+
exportBytes(): Uint8Array;
|
|
113
|
+
/**
|
|
114
|
+
* Write to a file (Node only).
|
|
115
|
+
* @param {string} filePath
|
|
116
|
+
*/
|
|
117
|
+
save(filePath: string): Promise<void>;
|
|
118
|
+
/** @alias save */
|
|
119
|
+
exportToFile(filePath: any): Promise<void>;
|
|
120
|
+
/** Number of indexed vectors. */
|
|
121
|
+
get count(): number;
|
|
122
|
+
/** Dimensionality of the dense vectors. */
|
|
123
|
+
get dimensions(): number;
|
|
124
|
+
#private;
|
|
125
|
+
}
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @typedef {object} AddOptions
|
|
3
|
+
* @property {(item: any) => string} [text] Extract the searchable text from an item.
|
|
4
|
+
* Defaults to {@link defaultText} (strings pass through, objects are flattened
|
|
5
|
+
* to their string leaves).
|
|
6
|
+
* @property {(item: any) => Record<string, any>} [metadata] Extract metadata
|
|
7
|
+
* returned with hits. Defaults to {@link defaultMetadata} (objects are kept
|
|
8
|
+
* as-is, so hits carry the original data back).
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* @typedef {object} SearchResult
|
|
12
|
+
* @property {number} [id] Internal vector id.
|
|
13
|
+
* @property {number} [score] Relevance score (interpretation depends on mode).
|
|
14
|
+
* @property {number} [dense_rank] Rank on the dense side (hybrid mode).
|
|
15
|
+
* @property {number} [sparse_rank] Rank on the sparse side (hybrid mode).
|
|
16
|
+
* @property {Record<string, any>} metadata The document's metadata.
|
|
17
|
+
* @property {string[]} [matchedTerms] Query terms matched, present only when `search` is called with `matchedTerms: true`.
|
|
18
|
+
*/
|
|
19
|
+
/**
|
|
20
|
+
* Vecito — isomorphic (Node + browser) hybrid semantic search.
|
|
21
|
+
*
|
|
22
|
+
* Orchestrates an Embedder (dense), BM25 (sparse) and a VecStore (altor-vec) so
|
|
23
|
+
* callers get a one-liner instead of wiring the three primitives by hand.
|
|
24
|
+
* Universal methods (addDocuments / search / exportBytes / loadFromBytes /
|
|
25
|
+
* loadFromUrl) run anywhere; save/load are Node-only.
|
|
26
|
+
*/
|
|
27
|
+
export class Vecito {
|
|
28
|
+
/**
|
|
29
|
+
* Parse a container blob produced by {@link Vecito#exportBytes} back into a
|
|
30
|
+
* ready-to-search instance.
|
|
31
|
+
* @param {Uint8Array} bytes
|
|
32
|
+
* @returns {Promise<Vecito>}
|
|
33
|
+
*/
|
|
34
|
+
static #fromContainer(bytes: Uint8Array): Promise<Vecito>;
|
|
35
|
+
/**
|
|
36
|
+
* Rebuild from bytes produced by {@link Vecito#exportBytes} (universal).
|
|
37
|
+
* @param {Uint8Array|ArrayBuffer} bytes
|
|
38
|
+
* @returns {Promise<Vecito>}
|
|
39
|
+
*/
|
|
40
|
+
static loadFromBytes(bytes: Uint8Array | ArrayBuffer): Promise<Vecito>;
|
|
41
|
+
/**
|
|
42
|
+
* Fetch an exported blob over HTTP and rebuild it (browser + Node fetch).
|
|
43
|
+
* @param {string} url
|
|
44
|
+
* @returns {Promise<Vecito>}
|
|
45
|
+
*/
|
|
46
|
+
static loadFromUrl(url: string): Promise<Vecito>;
|
|
47
|
+
/**
|
|
48
|
+
* Read an exported blob from a file. Node only.
|
|
49
|
+
* @param {string} path
|
|
50
|
+
* @returns {Promise<Vecito>}
|
|
51
|
+
* @throws {Error} In non-Node environments.
|
|
52
|
+
*/
|
|
53
|
+
static load(path: string): Promise<Vecito>;
|
|
54
|
+
/**
|
|
55
|
+
* @param {object} [opts]
|
|
56
|
+
* @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Embedding model id.
|
|
57
|
+
* @param {string} [opts.dtype='q8'] Weight precision (`'q8'` ≈ 4× smaller
|
|
58
|
+
* download and is the default; `'fp32'` for full precision).
|
|
59
|
+
* @param {Embedder} [opts.embedder] A pre-built (optionally pre-loaded)
|
|
60
|
+
* {@link Embedder} to use instead of constructing one — handy for reusing a
|
|
61
|
+
* loaded model across indexes or for timing model load separately. Takes
|
|
62
|
+
* precedence over `model`/`dtype`.
|
|
63
|
+
* @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode. `'hybrid'` stores
|
|
64
|
+
* both dense vectors and BM25 sparse data; `'dense'` omits sparse/BM25 for a
|
|
65
|
+
* smaller snapshot. The mode is embedded in the snapshot and respected on load.
|
|
66
|
+
* @param {number} [opts.k1] BM25 term-frequency saturation (hybrid mode only).
|
|
67
|
+
* @param {number} [opts.b] BM25 length-normalization factor (hybrid mode only).
|
|
68
|
+
*/
|
|
69
|
+
constructor({ model, dtype, embedder, mode, k1, b }?: {
|
|
70
|
+
model?: string | undefined;
|
|
71
|
+
dtype?: string | undefined;
|
|
72
|
+
embedder?: Embedder | undefined;
|
|
73
|
+
mode?: "hybrid" | "dense" | undefined;
|
|
74
|
+
k1?: number | undefined;
|
|
75
|
+
b?: number | undefined;
|
|
76
|
+
});
|
|
77
|
+
/**
|
|
78
|
+
* Index arbitrary data. Items may be strings, plain JSON objects, or anything
|
|
79
|
+
* else — the `text`/`metadata` extractors (with smart defaults) decide what to
|
|
80
|
+
* embed and what to return with hits, so raw objects work with zero config.
|
|
81
|
+
*
|
|
82
|
+
* BM25 is fit on the **first** call (its global df/idf statistics need a
|
|
83
|
+
* corpus), then frozen. Subsequent calls — including adding documents to an
|
|
84
|
+
* instance restored via {@link Vecito.load}/{@link Vecito.loadFromBytes} —
|
|
85
|
+
* append documents scored against that existing model, keeping the index
|
|
86
|
+
* consistent. Dense (semantic) search covers new documents fully; sparse
|
|
87
|
+
* scoring only sees terms already in the frozen vocabulary. For best lexical
|
|
88
|
+
* recall, pass your whole corpus in the first call.
|
|
89
|
+
* @param {any|any[]} items Item(s) to index.
|
|
90
|
+
* @param {AddOptions} [opts]
|
|
91
|
+
* @returns {Promise<this>}
|
|
92
|
+
*/
|
|
93
|
+
addDocuments(items: any | any[], { text, metadata }?: AddOptions): Promise<this>;
|
|
94
|
+
/**
|
|
95
|
+
* Search the index.
|
|
96
|
+
* @param {string} query Natural-language query.
|
|
97
|
+
* @param {object} [opts]
|
|
98
|
+
* @param {'hybrid'|'dense'|'sparse'} [opts.mode='hybrid'] Ranking strategy.
|
|
99
|
+
* 'hybrid' fuses dense + BM25 via RRF; 'dense' is vectors-only; 'sparse' is
|
|
100
|
+
* BM25-weighted. Falls back to dense if the query has no in-vocab terms.
|
|
101
|
+
* @param {number} [opts.top=10] Maximum number of results.
|
|
102
|
+
* @param {(meta: Record<string,any>) => boolean} [opts.filter] JS predicate over metadata — post-filters results in any mode. Over-fetches and grows the candidate set adaptively (up to the full index) so a selective predicate still returns up to `top` matches when they exist.
|
|
103
|
+
* @param {boolean} [opts.matchedTerms=false] When true, attach the matched query terms to each result as a `matchedTerms` string array.
|
|
104
|
+
* @returns {Promise<SearchResult[]>}
|
|
105
|
+
* @throws {Error} If nothing has been indexed or loaded yet.
|
|
106
|
+
*/
|
|
107
|
+
search(query: string, { mode, top, filter, matchedTerms: includeTerms }?: {
|
|
108
|
+
mode?: "sparse" | "hybrid" | "dense" | undefined;
|
|
109
|
+
top?: number | undefined;
|
|
110
|
+
filter?: ((meta: Record<string, any>) => boolean) | undefined;
|
|
111
|
+
matchedTerms?: boolean | undefined;
|
|
112
|
+
}): Promise<SearchResult[]>;
|
|
113
|
+
/**
|
|
114
|
+
* Number of indexed documents.
|
|
115
|
+
* @returns {number}
|
|
116
|
+
*/
|
|
117
|
+
get count(): number;
|
|
118
|
+
/**
|
|
119
|
+
* The embedding model id this instance uses.
|
|
120
|
+
* @returns {string}
|
|
121
|
+
*/
|
|
122
|
+
get model(): string;
|
|
123
|
+
/**
|
|
124
|
+
* Weight precision the embedder loads (e.g. `'q8'`, `'fp32'`).
|
|
125
|
+
* @returns {string}
|
|
126
|
+
*/
|
|
127
|
+
get dtype(): string;
|
|
128
|
+
/**
|
|
129
|
+
* Width of the dense vectors in the index, or null before anything is indexed.
|
|
130
|
+
* @returns {number|null}
|
|
131
|
+
*/
|
|
132
|
+
get dimensions(): number | null;
|
|
133
|
+
/**
|
|
134
|
+
* Index mode this instance was built with (`'hybrid'` or `'dense'`).
|
|
135
|
+
* @returns {'hybrid'|'dense'}
|
|
136
|
+
*/
|
|
137
|
+
get indexMode(): "hybrid" | "dense";
|
|
138
|
+
/**
|
|
139
|
+
* Serialize everything (vectors + metadata + sparse + BM25 + model name +
|
|
140
|
+
* dtype) into one Uint8Array. Layout: [uint32 metaLen][meta JSON][VecStore
|
|
141
|
+
* bytes].
|
|
142
|
+
* @returns {Uint8Array}
|
|
143
|
+
* @throws {Error} If nothing has been indexed yet.
|
|
144
|
+
*/
|
|
145
|
+
exportBytes(): Uint8Array;
|
|
146
|
+
/**
|
|
147
|
+
* Write the exported blob to a file. Node only.
|
|
148
|
+
* @param {string} path
|
|
149
|
+
* @returns {Promise<void>}
|
|
150
|
+
* @throws {Error} In non-Node environments.
|
|
151
|
+
*/
|
|
152
|
+
save(path: string): Promise<void>;
|
|
153
|
+
#private;
|
|
154
|
+
}
|
|
155
|
+
export type AddOptions = {
|
|
156
|
+
/**
|
|
157
|
+
* Extract the searchable text from an item.
|
|
158
|
+
* Defaults to {@link defaultText} (strings pass through, objects are flattened
|
|
159
|
+
* to their string leaves).
|
|
160
|
+
*/
|
|
161
|
+
text?: ((item: any) => string) | undefined;
|
|
162
|
+
/**
|
|
163
|
+
* Extract metadata
|
|
164
|
+
* returned with hits. Defaults to {@link defaultMetadata} (objects are kept
|
|
165
|
+
* as-is, so hits carry the original data back).
|
|
166
|
+
*/
|
|
167
|
+
metadata?: ((item: any) => Record<string, any>) | undefined;
|
|
168
|
+
};
|
|
169
|
+
export type SearchResult = {
|
|
170
|
+
/**
|
|
171
|
+
* Internal vector id.
|
|
172
|
+
*/
|
|
173
|
+
id?: number | undefined;
|
|
174
|
+
/**
|
|
175
|
+
* Relevance score (interpretation depends on mode).
|
|
176
|
+
*/
|
|
177
|
+
score?: number | undefined;
|
|
178
|
+
/**
|
|
179
|
+
* Rank on the dense side (hybrid mode).
|
|
180
|
+
*/
|
|
181
|
+
dense_rank?: number | undefined;
|
|
182
|
+
/**
|
|
183
|
+
* Rank on the sparse side (hybrid mode).
|
|
184
|
+
*/
|
|
185
|
+
sparse_rank?: number | undefined;
|
|
186
|
+
/**
|
|
187
|
+
* The document's metadata.
|
|
188
|
+
*/
|
|
189
|
+
metadata: Record<string, any>;
|
|
190
|
+
/**
|
|
191
|
+
* Query terms matched, present only when `search` is called with `matchedTerms: true`.
|
|
192
|
+
*/
|
|
193
|
+
matchedTerms?: string[] | undefined;
|
|
194
|
+
};
|
|
195
|
+
import { Embedder } from './embedder.js';
|
package/file.d.ts
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import { Vecito } from './index';
|
|
2
|
-
|
|
3
|
-
export const DEFAULT_EXTENSIONS: string[];
|
|
4
|
-
|
|
5
|
-
export interface WalkOptions {
|
|
6
|
-
ext?: string[];
|
|
7
|
-
hidden?: boolean;
|
|
8
|
-
limit?: number;
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
export interface IndexOptions extends WalkOptions {
|
|
12
|
-
model?: string;
|
|
13
|
-
dtype?: string;
|
|
14
|
-
mode?: 'hybrid' | 'dense';
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
/** Recursively collect matching file paths, skipping dotfiles by default. */
|
|
18
|
-
export function walk(dir: string, opts?: WalkOptions): string[];
|
|
19
|
-
|
|
20
|
-
/** Index an explicit list of files into a fresh Vecito. */
|
|
21
|
-
export function indexFiles(
|
|
22
|
-
paths: string[],
|
|
23
|
-
opts?: { model?: string; dtype?: string; mode?: 'hybrid' | 'dense'; base?: string }
|
|
24
|
-
): Promise<Vecito>;
|
|
25
|
-
|
|
26
|
-
/** Walk a directory and index every matching file into a fresh Vecito. */
|
|
27
|
-
export function indexDirectory(dir: string, opts?: IndexOptions): Promise<Vecito>;
|
package/index.d.ts
DELETED
|
@@ -1,113 +0,0 @@
|
|
|
1
|
-
export interface SparseVector {
|
|
2
|
-
indices: Uint32Array;
|
|
3
|
-
values: Float32Array;
|
|
4
|
-
dim: number;
|
|
5
|
-
}
|
|
6
|
-
|
|
7
|
-
export interface SearchResult {
|
|
8
|
-
id?: number;
|
|
9
|
-
score?: number;
|
|
10
|
-
dense_rank?: number;
|
|
11
|
-
sparse_rank?: number;
|
|
12
|
-
metadata: Record<string, any>;
|
|
13
|
-
/** BM25-matched (hybrid) or tokenized (dense) terms. Present only when search was called with `{ matchedTerms: true }`. */
|
|
14
|
-
matchedTerms?: string[];
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
export class Highlighter {
|
|
18
|
-
/** Escape HTML special characters in a plain-text string. */
|
|
19
|
-
static escape(s: string): string;
|
|
20
|
-
/** Tokenize a query string for dense-mode fallback highlighting. */
|
|
21
|
-
static tokenize(text: string): string[];
|
|
22
|
-
/** Wrap occurrences of `terms` in `text` with `<mark>` tags. Matching is stem-aware: "run" matches "running", "adventure" matches "adventures". */
|
|
23
|
-
static highlight(text: string, terms: string[] | Set<string>): string;
|
|
24
|
-
/** Extract a snippet centred on the first stem match (plain text — pass to highlight for markup). */
|
|
25
|
-
static snippet(text: string, terms: string[] | Set<string>, maxLen?: number): string;
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
export class Embedder {
|
|
29
|
-
constructor(opts?: { model?: string; dtype?: string });
|
|
30
|
-
init(): Promise<void>;
|
|
31
|
-
embed(text: string): Promise<Float32Array>;
|
|
32
|
-
embedBatch(texts: string[], opts?: { batchSize?: number }): Promise<Float32Array[]>;
|
|
33
|
-
get dimensions(): number;
|
|
34
|
-
get dtype(): string;
|
|
35
|
-
get model(): string;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
export class BM25 {
|
|
39
|
-
constructor(opts?: { k1?: number; b?: number });
|
|
40
|
-
fit(texts: string[]): void;
|
|
41
|
-
score(text: string): SparseVector;
|
|
42
|
-
scoreAll(texts: string[]): SparseVector[];
|
|
43
|
-
/** Map a query string to the in-vocabulary term ids it contains. */
|
|
44
|
-
scoreQuery(queryText: string): { indices: number[]; vocabSize: number };
|
|
45
|
-
querySparse(queryText: string): SparseVector;
|
|
46
|
-
/** Map vocabulary term ids back to their original term strings (unknown ids omitted). */
|
|
47
|
-
termsForIndices(indices: Uint32Array | number[]): string[];
|
|
48
|
-
toJSON(): Record<string, any>;
|
|
49
|
-
static fromJSON(data: Record<string, any>): BM25;
|
|
50
|
-
get vocabSize(): number;
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
export class VecStore {
|
|
54
|
-
constructor(opts: { dimensions: number });
|
|
55
|
-
init(): Promise<void>;
|
|
56
|
-
insert(vector: Float32Array | number[], metadata?: Record<string, any>): number;
|
|
57
|
-
initSparse(): void;
|
|
58
|
-
insertSparse(sparse: SparseVector): void;
|
|
59
|
-
search(query: Float32Array, k?: number): Promise<SearchResult[]>;
|
|
60
|
-
/** Post-filters HNSW candidates with a JS predicate over metadata objects. */
|
|
61
|
-
searchWithFilter(query: Float32Array, filter: (meta: Record<string, any>) => boolean, k?: number): Promise<SearchResult[]>;
|
|
62
|
-
hybridSearch(
|
|
63
|
-
denseQuery: Float32Array,
|
|
64
|
-
sparse: SparseVector,
|
|
65
|
-
k?: number,
|
|
66
|
-
opts?: { dense_k?: number; sparse_k?: number; fusion?: any }
|
|
67
|
-
): SearchResult[];
|
|
68
|
-
save(filePath: string): Promise<void>;
|
|
69
|
-
/** Alias for {@link VecStore.save}. */
|
|
70
|
-
exportToFile(filePath: string): Promise<void>;
|
|
71
|
-
exportBytes(): Uint8Array;
|
|
72
|
-
static load(filePath: string): Promise<VecStore>;
|
|
73
|
-
static loadFromBytes(bytes: Uint8Array): Promise<VecStore>;
|
|
74
|
-
static loadFromUrl(url: string): Promise<VecStore>;
|
|
75
|
-
get count(): number;
|
|
76
|
-
/** Dense vector width of the index. */
|
|
77
|
-
get dimensions(): number;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
export interface AddOptions {
|
|
81
|
-
/** Extract searchable text from an item (default: flatten string values). */
|
|
82
|
-
text?: (item: any) => string;
|
|
83
|
-
/** Extract metadata returned with hits (default: the object itself). */
|
|
84
|
-
metadata?: (item: any) => Record<string, any>;
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
export interface VecitoSearchOptions {
|
|
88
|
-
mode?: 'hybrid' | 'dense' | 'sparse';
|
|
89
|
-
top?: number;
|
|
90
|
-
/** JS predicate over metadata — post-filters results in any mode, over-fetching to preserve the requested count. */
|
|
91
|
-
filter?: (meta: Record<string, any>) => boolean;
|
|
92
|
-
/** When true, each result includes `matchedTerms` for use with `Highlighter.highlight`. */
|
|
93
|
-
matchedTerms?: boolean;
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
export class Vecito {
|
|
97
|
-
constructor(opts?: { model?: string; dtype?: string; embedder?: Embedder; mode?: 'hybrid' | 'dense'; k1?: number; b?: number });
|
|
98
|
-
addDocuments(items: any | any[], opts?: AddOptions): Promise<this>;
|
|
99
|
-
search(query: string, opts?: VecitoSearchOptions): Promise<SearchResult[]>;
|
|
100
|
-
exportBytes(): Uint8Array;
|
|
101
|
-
save(path: string): Promise<void>;
|
|
102
|
-
static load(path: string): Promise<Vecito>;
|
|
103
|
-
static loadFromBytes(bytes: Uint8Array | ArrayBuffer): Promise<Vecito>;
|
|
104
|
-
static loadFromUrl(url: string): Promise<Vecito>;
|
|
105
|
-
get count(): number;
|
|
106
|
-
get model(): string;
|
|
107
|
-
/** Weight precision the embedder loads (e.g. 'q8', 'fp32'). */
|
|
108
|
-
get dtype(): string;
|
|
109
|
-
/** Dense vector width of the index, or null before anything is indexed. */
|
|
110
|
-
get dimensions(): number | null;
|
|
111
|
-
/** Index mode this instance was built with ('hybrid' or 'dense'). */
|
|
112
|
-
get indexMode(): 'hybrid' | 'dense';
|
|
113
|
-
}
|