vecito 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/bm25.js ADDED
@@ -0,0 +1,265 @@
1
/** Fallback term-frequency saturation parameter (`k1`) used when none is supplied. */
const DEFAULT_K1 = 1.2;

/** Fallback document-length normalization strength (`b`) used when none is supplied. */
const DEFAULT_B = 0.75;
3
+
4
/**
 * Tokenize text into lowercase terms for BM25.
 *
 * A token is a run of ASCII letters/digits that may contain `-`, `_` or `.`
 * in its interior (so `node.js`, `v1.2-beta` and `snake_case` survive intact).
 * Punctuation at the edges of a token is stripped, so a word at the end of a
 * sentence ("world.") indexes as the same term as the bare word ("world") and
 * pure-punctuation runs ("...", "--") never become vocabulary terms.
 * Single-character tokens are dropped.
 *
 * @param {string} text Text to tokenize; `null`/`undefined` are treated as ''.
 * @returns {string[]} Lowercased tokens, in order of appearance.
 */
function tokenize(text) {
  const lowered = String(text ?? '').toLowerCase();
  // Each match starts and ends on an alphanumeric character; `-`, `_` and `.`
  // are only accepted between alphanumeric runs.
  const tokens = lowered.match(/[a-z0-9]+(?:[-_.]+[a-z0-9]+)*/g) ?? [];
  return tokens.filter(t => t.length > 1);
}
17
+
18
/**
 * BM25 sparse lexical model.
 *
 * Fit over a corpus to learn vocabulary and document frequencies, then produce
 * sparse vectors (term-id → weight) for documents ({@link BM25#score}) and
 * queries ({@link BM25#querySparse}). Serializable via {@link BM25#toJSON} /
 * {@link BM25.fromJSON}.
 */
export class BM25 {
  #vocab; // term → id
  #vocabSize;
  #df; // term id → document frequency
  #docCount;
  #avgDocLen;
  #docLens; // per-document token count
  #k1;
  #b;
  #reverseVocab; // id → term, built lazily; reset whenever the vocabulary changes

  /**
   * @param {object} [opts]
   * @param {number} [opts.k1=1.2] Term-frequency saturation parameter.
   * @param {number} [opts.b=0.75] Document-length normalization (0..1).
   */
  constructor({ k1, b } = {}) {
    this.#vocab = new Map();
    this.#vocabSize = 0;
    this.#df = new Uint32Array(0);
    this.#docCount = 0;
    this.#avgDocLen = 0;
    this.#docLens = [];
    this.#k1 = k1 ?? DEFAULT_K1;
    this.#b = b ?? DEFAULT_B;
    this.#reverseVocab = null;
  }

  /**
   * Build the vocabulary and corpus statistics (document frequencies, average
   * length) from a set of documents. Must be called before scoring.
   *
   * Each call replaces any previously fitted state: `texts` is treated as the
   * full corpus, so vocabulary, document frequencies and lengths always
   * describe the same set of documents.
   * @param {string[]} texts The full corpus.
   * @returns {void}
   */
  fit(texts) {
    const docs = texts.map(t => tokenize(t));

    // Build everything in locals first so the instance is only updated once
    // the whole corpus has been processed.
    const vocab = new Map();
    for (const tokens of docs) {
      for (const t of tokens) {
        if (!vocab.has(t)) vocab.set(t, vocab.size);
      }
    }

    const df = new Uint32Array(vocab.size);
    const docLens = [];
    let totalLen = 0;
    for (const tokens of docs) {
      docLens.push(tokens.length);
      totalLen += tokens.length;
      // A term counts once per document no matter how often it repeats.
      for (const t of new Set(tokens)) df[vocab.get(t)]++;
    }

    this.#vocab = vocab;
    this.#vocabSize = vocab.size;
    this.#df = df;
    this.#docCount = docs.length;
    this.#docLens = docLens;
    // An empty corpus has no meaningful average; use 0 rather than NaN (0 / 0).
    this.#avgDocLen = docs.length > 0 ? totalLen / docs.length : 0;
    // The id → term cache describes the previous vocabulary; drop it.
    this.#reverseVocab = null;
  }

  /**
   * Compute the BM25 sparse vector for a document. Out-of-vocabulary terms are
   * ignored when weighting, but still count towards the document length.
   * @param {string} text Document text.
   * @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
   *   sparse vector over the vocabulary.
   */
  score(text) {
    const tokens = tokenize(text);
    // term id → frequency in this document
    const tf = new Map();
    for (const t of tokens) {
      const id = this.#vocab.get(t);
      if (id !== undefined) {
        tf.set(id, (tf.get(id) ?? 0) + 1);
      }
    }
    return this.#scoreFromTf(tf, tokens.length);
  }

  /**
   * Convenience wrapper that scores many documents.
   * @param {string[]} texts
   * @returns {Array<{indices: Uint32Array, values: Float32Array, dim: number}>}
   */
  scoreAll(texts) {
    return texts.map(t => this.score(t));
  }

  /**
   * Map a query string to the list of in-vocabulary term ids it contains.
   * Ids appear in query order and repeat if the term repeats.
   * @param {string} queryText
   * @returns {{indices: number[], vocabSize: number}} Matched term ids.
   */
  scoreQuery(queryText) {
    const queryIds = [];
    for (const t of tokenize(queryText)) {
      const id = this.#vocab.get(t);
      if (id !== undefined) queryIds.push(id);
    }
    return { indices: queryIds, vocabSize: this.#vocabSize };
  }

  /**
   * Build an IDF-weighted sparse vector for a query (assumes term frequency 1
   * per query term, so repeated terms are counted once). Use this to drive
   * sparse/hybrid search.
   * @param {string} queryText
   * @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
   *   sparse vector; empty `indices` means no query term is in the vocabulary.
   */
  querySparse(queryText) {
    const termScores = {};
    const seen = new Set();

    for (const t of tokenize(queryText)) {
      const id = this.#vocab.get(t);
      if (id === undefined || seen.has(id)) continue;
      seen.add(id);
      // Query term gets weight = IDF (tf=1 in query)
      const idf = this.#idf(id);
      if (idf > 0) termScores[id] = idf;
    }

    return toSparse(termScores, this.#vocabSize);
  }

  /**
   * Inverse document frequency of a term:
   * `ln((N - df + 0.5) / (df + 0.5) + 1)`. The `+ 1` inside the logarithm
   * keeps the value positive whenever `df <= N`.
   * @param {number} id Vocabulary term id.
   * @returns {number}
   */
  #idf(id) {
    const df = this.#df[id];
    return Math.log((this.#docCount - df + 0.5) / (df + 0.5) + 1);
  }

  /**
   * Apply the BM25 weighting formula to a term-frequency map.
   * @param {Map<number, number>} tf term id → frequency in the document.
   * @param {number} docLen Document length in tokens.
   * @returns {{indices: Uint32Array, values: Float32Array, dim: number}}
   */
  #scoreFromTf(tf, docLen) {
    const termScores = {};
    const k1 = this.#k1;
    const b = this.#b;
    // Length normalization depends only on the document, not on the term.
    const lengthNorm = k1 * (1 - b + b * docLen / this.#avgDocLen);

    for (const [id, freq] of tf) {
      const tfNorm = (freq * (k1 + 1)) / (freq + lengthNorm);
      const s = this.#idf(id) * tfNorm;
      // Also filters NaN, which compares false against everything.
      if (s > 0) termScores[id] = s;
    }

    return toSparse(termScores, this.#vocabSize);
  }

  /**
   * Serialize the fitted model to a plain JSON-safe object. The returned
   * arrays are copies, so mutating them does not affect the model.
   * @returns {object} Pass to {@link BM25.fromJSON} to restore.
   */
  toJSON() {
    return {
      vocab: Object.fromEntries(this.#vocab),
      df: Array.from(this.#df),
      docCount: this.#docCount,
      avgDocLen: this.#avgDocLen,
      docLens: [...this.#docLens],
      k1: this.#k1,
      b: this.#b,
    };
  }

  /**
   * Reconstruct a fitted model from {@link BM25#toJSON} output.
   * @param {object} data
   * @returns {BM25}
   */
  static fromJSON(data) {
    const bm25 = new BM25({ k1: data.k1, b: data.b });
    bm25.#vocab = new Map(Object.entries(data.vocab).map(([k, v]) => [k, Number(v)]));
    bm25.#vocabSize = bm25.#vocab.size;
    bm25.#df = new Uint32Array(data.df);
    bm25.#docCount = data.docCount;
    // JSON encodes NaN as null; fall back to 0 so the field stays numeric.
    bm25.#avgDocLen = data.avgDocLen ?? 0;
    // Copy so the model does not alias the caller's array.
    bm25.#docLens = Array.from(data.docLens ?? []);
    return bm25;
  }

  /**
   * Map a list of vocabulary indices back to their original term strings.
   * The reverse map is built lazily on first call and cached until the next
   * {@link BM25#fit}.
   * @param {Uint32Array|number[]} indices Vocabulary term ids.
   * @returns {string[]} The corresponding terms (unknown ids silently omitted).
   */
  termsForIndices(indices) {
    if (!this.#reverseVocab) {
      this.#reverseVocab = new Map();
      for (const [term, id] of this.#vocab) this.#reverseVocab.set(id, term);
    }
    const out = [];
    for (const id of indices) {
      const term = this.#reverseVocab.get(id);
      if (term !== undefined) out.push(term);
    }
    return out;
  }

  /**
   * Number of distinct terms in the fitted vocabulary (the sparse dimension).
   * @returns {number}
   */
  get vocabSize() {
    return this.#vocabSize;
  }
}
246
+
247
/**
 * Turn a term-id → weight record into a sparse vector whose indices are in
 * ascending order. Entries whose weight is not strictly positive are left out.
 * @param {Record<string, number>} termScores Weights keyed by term id.
 * @param {number} dim Vocabulary size, passed through unchanged.
 * @returns {{indices: Uint32Array, values: Float32Array, dim: number}}
 */
function toSparse(termScores, dim) {
  const pairs = [];
  for (const key of Object.keys(termScores)) {
    const weight = termScores[key];
    if (weight > 0) pairs.push([Number(key), weight]);
  }
  pairs.sort((left, right) => left[0] - right[0]);

  const indices = new Uint32Array(pairs.length);
  const values = new Float32Array(pairs.length);
  pairs.forEach(([termId, weight], slot) => {
    indices[slot] = termId;
    values[slot] = weight;
  });

  return { indices, values, dim };
}
@@ -0,0 +1,114 @@
1
+ import { pipeline } from '@huggingface/transformers';
2
+
3
/**
 * Dense text embedder backed by transformers.js.
 *
 * Runs a `feature-extraction` pipeline with mean pooling and normalization
 * requested. The model is loaded lazily on first use and cached for the
 * lifetime of the instance.
 */
export class Embedder {
  #model;
  #dtype;
  #pipe;
  #dimensions;
  #loading; // promise for the one in-flight/completed load, shared by all callers

  /**
   * @param {object} [opts]
   * @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Hugging Face model id
   *   for a feature-extraction pipeline. The output width is not assumed; it
   *   is detected at {@link Embedder#init} (see {@link Embedder#dimensions}).
   * @param {string} [opts.dtype='q8'] Weight precision passed to the pipeline
   *   (e.g. `'q8'`, `'fp32'`, or another dtype the model provides).
   */
  constructor({ model, dtype } = {}) {
    this.#model = model || 'Xenova/all-MiniLM-L6-v2';
    this.#dtype = dtype || 'q8';
    this.#pipe = null;
    this.#dimensions = null;
    this.#loading = null;
  }

  /**
   * Load the underlying pipeline if it hasn't been loaded yet, and detect the
   * model's embedding width with a short probe input. Safe to call repeatedly
   * and concurrently: every caller awaits the same load. If loading fails the
   * error is propagated and the next call retries from scratch. Called
   * automatically by {@link Embedder#embed} / {@link Embedder#embedBatch}.
   * @returns {Promise<void>}
   */
  async init() {
    if (!this.#loading) {
      // Store the promise synchronously so concurrent callers that arrive
      // before the model is ready do not start a second load.
      this.#loading = this.#load().catch(err => {
        this.#loading = null; // allow a retry after a failed load
        throw err;
      });
    }
    await this.#loading;
  }

  /**
   * Create the pipeline and probe its output width. State is only published
   * once both steps have succeeded, so a failed probe cannot leave a pipeline
   * installed without a known dimension.
   * @returns {Promise<void>}
   */
  async #load() {
    const pipe = await pipeline('feature-extraction', this.#model, {
      dtype: this.#dtype,
    });
    // Detect the real embedding width (384, 768, …) rather than assuming.
    const probe = await pipe('x', { pooling: 'mean', normalize: true });
    this.#dimensions = probe.data.length;
    this.#pipe = pipe;
  }

  /**
   * Embed a single string into a dense vector.
   * @param {string} text Input text.
   * @returns {Promise<Float32Array>} A freshly allocated copy of the pipeline
   *   output.
   */
  async embed(text) {
    await this.init();
    const output = await this.#pipe(text, { pooling: 'mean', normalize: true });
    return new Float32Array(output.data);
  }

  /**
   * Embed many strings, processing them in batches for throughput.
   * @param {string[]} texts Input texts.
   * @param {object} [opts]
   * @param {number} [opts.batchSize=32] Number of texts per forward pass; must
   *   be at least 1.
   * @returns {Promise<Float32Array[]>} One vector per input, in input order.
   *   Each is a view into a shared buffer — copy it if you need to retain it
   *   independently.
   * @throws {RangeError} If `batchSize` is not a number >= 1 (a smaller step
   *   would never advance through `texts`).
   */
  async embedBatch(texts, { batchSize = 32 } = {}) {
    // Written as a negated comparison so NaN is rejected as well.
    if (!(batchSize >= 1)) {
      throw new RangeError(`batchSize must be a number >= 1, got ${batchSize}`);
    }
    await this.init();
    const dims = this.dimensions;
    const results = [];
    for (let i = 0; i < texts.length; i += batchSize) {
      const batch = texts.slice(i, i + batchSize);
      const output = await this.#pipe(batch, { pooling: 'mean', normalize: true });
      for (let j = 0; j < batch.length; j++) {
        // Row j of the batch output: `dims` 4-byte floats starting at j * dims.
        results.push(new Float32Array(output.data.buffer, output.data.byteOffset + j * dims * 4, dims));
      }
    }
    return results;
  }

  /**
   * Dimensionality of the vectors this model produces. Available only after
   * {@link Embedder#init} (or a first {@link Embedder#embed}) has run, since it
   * is detected from the loaded model.
   * @returns {number}
   * @throws {Error} If called before the model has been initialized.
   */
  get dimensions() {
    if (this.#dimensions == null) {
      throw new Error('Embedder.dimensions is only known after init() or embed()');
    }
    return this.#dimensions;
  }

  /**
   * The Hugging Face model id this embedder uses.
   * @returns {string}
   */
  get model() {
    return this.#model;
  }

  /**
   * Weight precision this embedder loads (e.g. `'q8'`, `'fp32'`).
   * @returns {string}
   */
  get dtype() {
    return this.#dtype;
  }
}
package/lib/extract.js ADDED
@@ -0,0 +1,43 @@
1
+ /**
2
+ * Universal (Node + browser) helpers for turning arbitrary data into the text
3
+ * Vecito indexes and the metadata it returns with hits. No filesystem deps.
4
+ */
5
+
6
/**
 * Recursively collect string leaves from any value and join them. Lets raw JSON
 * objects/arrays be indexed without the caller pre-extracting their text.
 *
 * Arrays are walked element by element and other objects through their own
 * enumerable property values; non-string primitives contribute nothing.
 * Circular references are skipped: an object that is already being visited
 * higher up the current path is not entered again. An object that is merely
 * referenced from several places is still visited each time.
 * @param {*} value Any value — object, array, string, etc.
 * @returns {string} All string leaves, '. '-joined (empty string if none).
 */
export function flattenStrings(value) {
  const parts = [];
  // Objects on the current descent path. Entries are removed on the way back
  // up so only true cycles are cut, not repeated references.
  const ancestors = new Set();
  const visit = v => {
    if (typeof v === 'string') {
      parts.push(v);
      return;
    }
    if (!v || typeof v !== 'object' || ancestors.has(v)) return;
    ancestors.add(v);
    const children = Array.isArray(v) ? v : Object.values(v);
    children.forEach(child => visit(child));
    ancestors.delete(v);
  };
  visit(value);
  return parts.join('. ');
}
22
+
23
/**
 * Default text extractor. Strings are returned untouched, non-null objects
 * (arrays included) are reduced to their string leaves via `flattenStrings`,
 * `null`/`undefined` become the empty string, and any other value is passed
 * through `String()`.
 * @param {*} item
 * @returns {string}
 */
export function defaultText(item) {
  switch (typeof item) {
    case 'string':
      return item;
    case 'object':
      // `typeof null` is 'object', so null has to be separated out here.
      return item === null ? '' : flattenStrings(item);
    case 'undefined':
      return '';
    default:
      return String(item);
  }
}
34
+
35
/**
 * Default metadata extractor. A non-null, non-array object is handed back as
 * the very same reference (so search hits carry the original data); every
 * other input yields a fresh empty object.
 * @param {*} item
 * @returns {Record<string, any>}
 */
export function defaultMetadata(item) {
  const isRecord = typeof item === 'object' && item !== null && !Array.isArray(item);
  if (isRecord) {
    return item;
  }
  return {};
}
@@ -0,0 +1,118 @@
1
import { readdirSync, readFileSync, realpathSync, statSync } from 'fs';
import { join, basename, relative } from 'path';
import { Vecito } from './vecito.js';
import { defaultText } from './extract.js';
5
+
6
+ /**
7
+ * Filesystem layer on top of the core {@link Vecito} library. Turns files and
8
+ * directories into data items and feeds them to the generic indexer. Node-only
9
+ * (imports `fs`/`path`); use it via the `vecito/file` subpath export.
10
+ */
11
+
12
/**
 * Extensions indexed when the caller does not pass an `ext` option. Covers
 * common prose, data/config and source-code file types.
 */
export const DEFAULT_EXTENSIONS = [
  // prose and plain text
  '.md', '.markdown', '.mdx',
  '.txt', '.text', '.rst', '.org', '.tex', '.log',
  // structured data and configuration
  '.json', '.jsonl', '.ndjson',
  '.csv', '.tsv',
  '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
  '.xml', '.html', '.htm',
  // source code
  '.js', '.mjs', '.cjs', '.ts', '.tsx', '.jsx',
  '.py', '.rb', '.php', '.go', '.rs', '.java',
  '.c', '.h', '.cpp', '.hpp', '.cs',
  '.sh', '.sql', '.css', '.scss',
  '.move', '.sol',
];
24
+
25
/**
 * Bring every extension into canonical form: prefixed with a dot if it lacks
 * one, then lowercased.
 * @param {string[]} exts Extensions with or without a leading dot.
 * @returns {string[]} A new array, same length and order as the input.
 */
function normalizeExts(exts) {
  return exts.map(raw => {
    const dotted = raw.startsWith('.') ? raw : '.' + raw;
    return dotted.toLowerCase();
  });
}
33
+
34
/**
 * Recursively collect files under `dir` whose extension is allowed. Skips
 * dotfiles and dot-directories unless `hidden` is true.
 *
 * Symbolic links are followed, with two safeguards: entries whose target no
 * longer exists (dangling links, files removed mid-walk) are skipped instead
 * of aborting the walk, and a directory that resolves to one already being
 * walked higher up the current path is not entered again, so link cycles
 * terminate.
 * @param {string} dir Directory to walk.
 * @param {object} [opts]
 * @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
 * @param {boolean} [opts.hidden=false] Include entries whose name starts with '.'.
 * @param {number} [opts.limit=Infinity] Stop after this many files.
 * @returns {string[]} Matching file paths.
 * @throws {Error} Filesystem errors other than a missing entry (e.g. `dir`
 *   itself not existing, or permission failures) are propagated.
 */
export function walk(dir, { ext = DEFAULT_EXTENSIONS, hidden = false, limit = Infinity } = {}) {
  const exts = normalizeExts(ext);
  const out = [];
  // Resolved paths of the directories on the current descent path. Entries
  // are removed on the way back up, so a directory reachable through two
  // different links is still walked twice; only genuine cycles are cut.
  const active = new Set();
  const recurse = current => {
    const resolved = realpathSync(current);
    if (active.has(resolved)) return;
    active.add(resolved);
    try {
      for (const entry of readdirSync(current)) {
        if (out.length >= limit) return;
        // Skip dotfiles / dot-directories by default (.git, .env, .DS_Store, ...).
        if (!hidden && entry.startsWith('.')) continue;
        const full = join(current, entry);
        // `undefined` when the entry (or its link target) does not exist.
        const st = statSync(full, { throwIfNoEntry: false });
        if (!st) continue;
        if (st.isDirectory()) {
          recurse(full);
        } else if (st.isFile()) {
          const lower = entry.toLowerCase();
          if (exts.some(e => lower.endsWith(e))) out.push(full);
        }
      }
    } finally {
      active.delete(resolved);
    }
  };
  recurse(dir);
  return out.slice(0, limit);
}
65
+
66
/**
 * Read a file into a data item.
 *
 * - `.json` files are parsed as a single JSON document.
 * - `.jsonl` / `.ndjson` files are parsed as JSON Lines: one JSON value per
 *   non-blank line, yielding an array of records. If some line is not valid
 *   JSON on its own, the file is tried as one JSON document instead.
 * - On any parse failure, and for every other extension, `data` is the raw
 *   text.
 * @param {string} file Absolute or relative file path.
 * @param {string} [base] Base dir for the item's relative `path` metadata;
 *   when omitted, `path` is `file` as given.
 * @returns {{data: any, path: string, name: string}}
 */
function readItem(file, base) {
  const raw = readFileSync(file, 'utf-8');
  const lower = file.toLowerCase();
  const isJsonLines = lower.endsWith('.jsonl') || lower.endsWith('.ndjson');
  let data = raw;
  if (isJsonLines || lower.endsWith('.json')) {
    try {
      data = isJsonLines ? parseJsonLines(raw) : JSON.parse(raw);
    } catch {
      // Deliberate best effort: malformed JSON is still indexable as text.
      data = raw;
    }
  }
  return { data, path: base ? relative(base, file) : file, name: basename(file) };
}

/**
 * Parse JSON Lines text into an array with one entry per non-blank line,
 * falling back to parsing the whole text as a single JSON document.
 * @param {string} raw File contents.
 * @returns {any} Array of records, or the single parsed document.
 * @throws {SyntaxError} If neither interpretation is valid JSON.
 */
function parseJsonLines(raw) {
  const lines = raw.split(/\r?\n/).filter(line => line.trim() !== '');
  try {
    return lines.map(line => JSON.parse(line));
  } catch {
    return JSON.parse(raw);
  }
}
82
+
83
/**
 * Build a new {@link Vecito} index from an explicit list of files, one
 * document per file. Every file is read up front; the indexed text is the
 * trimmed `defaultText` of the file's data, and each document's metadata is
 * its `path` and `name`.
 * @param {string[]} paths File paths to index (one document each).
 * @param {object} [opts]
 * @param {string} [opts.model] Embedding model id passed to Vecito.
 * @param {string} [opts.dtype] Weight precision passed to Vecito.
 * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
 * @param {string} [opts.base] Base dir for relative `path` metadata.
 * @returns {Promise<Vecito>}
 */
export async function indexFiles(paths, { model, dtype, mode, base } = {}) {
  const items = paths.map(filePath => readItem(filePath, base));

  const extractText = item => defaultText(item.data).trim();
  const extractMetadata = ({ path, name }) => ({ path, name });

  const index = new Vecito({ model, dtype, mode });
  await index.addDocuments(items, { text: extractText, metadata: extractMetadata });
  return index;
}
102
+
103
/**
 * Collect the matching files under a directory with `walk` and pass them to
 * `indexFiles`, using the directory itself as the base for relative paths.
 * @param {string} dir Directory to index.
 * @param {object} [opts]
 * @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
 * @param {boolean} [opts.hidden=false] Include dotfiles/dot-directories.
 * @param {number} [opts.limit=Infinity] Index at most this many files.
 * @param {string} [opts.model] Embedding model id passed to Vecito.
 * @param {string} [opts.dtype] Weight precision passed to Vecito.
 * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
 * @returns {Promise<Vecito>}
 */
export async function indexDirectory(dir, { ext, hidden, limit, model, dtype, mode } = {}) {
  const walkOptions = { ext, hidden, limit };
  const indexOptions = { model, dtype, mode, base: dir };
  return indexFiles(walk(dir, walkOptions), indexOptions);
}