vecito 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +176 -0
- package/bin/cli.js +181 -0
- package/file.d.ts +27 -0
- package/index.d.ts +113 -0
- package/index.js +5 -0
- package/lib/bm25.js +265 -0
- package/lib/embedder.js +114 -0
- package/lib/extract.js +43 -0
- package/lib/file-index.js +118 -0
- package/lib/highlight.js +166 -0
- package/lib/vec-store.js +365 -0
- package/lib/vecito.js +352 -0
- package/package.json +62 -0
package/lib/bm25.js
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
/** Default BM25 term-frequency saturation parameter (`k1`). */
const DEFAULT_K1 = 1.2;

/** Default BM25 document-length normalization parameter (`b`). */
const DEFAULT_B = 0.75;
|
|
3
|
+
|
|
4
|
+
/**
 * Break text into lowercase BM25 terms.
 *
 * Every run of characters outside `a-z`, `0-9`, `-`, `_` and `.` acts as a
 * separator; tokens shorter than two characters are discarded.
 * @param {string} text
 * @returns {string[]} Lowercased tokens, in order of appearance.
 */
function tokenize(text) {
  const lowered = text.toLowerCase();
  const separated = lowered.replace(/[^a-z0-9\-_\.]+/g, ' ');

  const kept = [];
  for (const piece of separated.split(/\s+/)) {
    // Also discards the empty strings produced by leading/trailing separators.
    if (piece.length > 1) {
      kept.push(piece);
    }
  }
  return kept;
}
|
|
17
|
+
|
|
18
|
+
/**
 * BM25 sparse lexical model.
 *
 * Fit over a corpus to learn vocabulary and document frequencies, then produce
 * sparse vectors (term-id → weight) for documents ({@link BM25#score}) and
 * queries ({@link BM25#querySparse}). Serializable via {@link BM25#toJSON} /
 * {@link BM25.fromJSON}.
 */
export class BM25 {
  #vocab; // term → id
  #vocabSize;
  #df; // term id → document frequency
  #docCount;
  #avgDocLen;
  #docLens; // per-document token count
  #k1;
  #b;
  #reverseVocab; // id → term, built lazily

  /**
   * @param {object} [opts]
   * @param {number} [opts.k1=1.2] Term-frequency saturation parameter.
   * @param {number} [opts.b=0.75] Document-length normalization (0..1).
   */
  constructor({ k1, b } = {}) {
    this.#vocab = new Map();
    this.#vocabSize = 0;
    this.#df = [];
    this.#docCount = 0;
    this.#avgDocLen = 0;
    this.#docLens = [];
    this.#k1 = k1 ?? DEFAULT_K1;
    this.#b = b ?? DEFAULT_B;
    this.#reverseVocab = null;
  }

  /**
   * Build the vocabulary and corpus statistics (document frequencies, average
   * length) from a set of documents. Must be called before scoring.
   *
   * Calling `fit` again replaces the corpus statistics with those of the new
   * corpus. The vocabulary itself is append-only: terms learned by an earlier
   * call keep their ids, and new terms get the next free ids.
   * @param {string[]} texts The full corpus.
   * @returns {void}
   */
  fit(texts) {
    const docs = texts.map(t => tokenize(t));

    // Build vocabulary (existing terms keep their ids).
    for (const tokens of docs) {
      for (const t of tokens) {
        if (!this.#vocab.has(t)) {
          this.#vocab.set(t, this.#vocabSize++);
        }
      }
    }

    // The id → term cache may predate terms added above, so drop it.
    this.#reverseVocab = null;

    // Statistics describe only the corpus passed to this call, so start clean
    // instead of appending to lengths recorded by a previous fit.
    this.#df = new Uint32Array(this.#vocabSize);
    this.#docCount = docs.length;
    this.#docLens = [];
    let totalLen = 0;

    for (const tokens of docs) {
      const seen = new Set();
      this.#docLens.push(tokens.length);
      totalLen += tokens.length;
      for (const t of tokens) {
        const id = this.#vocab.get(t);
        if (!seen.has(id)) {
          this.#df[id]++;
          seen.add(id);
        }
      }
    }

    // An empty corpus would otherwise produce 0 / 0 = NaN (serialized as null).
    this.#avgDocLen = this.#docCount > 0 ? totalLen / this.#docCount : 0;
  }

  /**
   * Compute the BM25 sparse vector for a document. Out-of-vocabulary terms are
   * ignored (they still count towards the document length).
   * @param {string} text Document text.
   * @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
   *   sparse vector over the vocabulary.
   */
  score(text) {
    const tokens = tokenize(text);
    // term frequency in this document
    const tf = new Map();
    for (const t of tokens) {
      const id = this.#vocab.get(t);
      if (id !== undefined) {
        tf.set(id, (tf.get(id) || 0) + 1);
      }
    }
    return this.#scoreFromTf(tf, tokens.length);
  }

  /**
   * Convenience wrapper that scores many documents.
   * @param {string[]} texts
   * @returns {Array<{indices: Uint32Array, values: Float32Array, dim: number}>}
   */
  scoreAll(texts) {
    return texts.map(t => this.score(t));
  }

  /**
   * Map a query string to the list of in-vocabulary term ids it contains.
   * Ids appear in query order; repeated query terms are repeated.
   * @param {string} queryText
   * @returns {{indices: number[], vocabSize: number}} Matched term ids.
   */
  scoreQuery(queryText) {
    const queryTokens = tokenize(queryText);
    const queryIds = [];
    for (const t of queryTokens) {
      const id = this.#vocab.get(t);
      if (id !== undefined) queryIds.push(id);
    }
    return { indices: queryIds, vocabSize: this.#vocabSize };
  }

  /**
   * Build an IDF-weighted sparse vector for a query (assumes term frequency 1
   * per query term). Use this to drive sparse/hybrid search.
   * @param {string} queryText
   * @returns {{indices: Uint32Array, values: Float32Array, dim: number}} Sorted
   *   sparse vector; empty `indices` means no query term is in the vocabulary.
   */
  querySparse(queryText) {
    const tokens = tokenize(queryText);
    const termScores = {};
    const seen = new Set();

    for (const t of tokens) {
      const id = this.#vocab.get(t);
      if (id === undefined || seen.has(id)) continue;
      seen.add(id);
      // IDF for this term
      const df = this.#df[id];
      const idf = Math.log((this.#docCount - df + 0.5) / (df + 0.5) + 1);
      // Query term gets weight = IDF (tf=1 in query)
      if (idf > 0) termScores[id] = idf;
    }

    return toSparse(termScores, this.#vocabSize);
  }

  /**
   * Apply the BM25 weighting formula to a term-frequency map.
   * @param {Map<number, number>} tf term id → frequency in the document.
   * @param {number} docLen Document length in tokens.
   * @returns {{indices: Uint32Array, values: Float32Array, dim: number}}
   */
  #scoreFromTf(tf, docLen) {
    const termScores = {};
    const N = this.#docCount;
    const avgDl = this.#avgDocLen;
    const k1 = this.#k1;
    const b = this.#b;

    for (const [id, freq] of tf) {
      const df = this.#df[id];
      const idf = Math.log((N - df + 0.5) / (df + 0.5) + 1);
      const tfNorm = (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * docLen / avgDl));
      const s = idf * tfNorm;
      // The comparison also discards NaN weights.
      if (s > 0) termScores[id] = s;
    }

    return toSparse(termScores, this.#vocabSize);
  }

  /**
   * Serialize the fitted model to a plain JSON-safe object. The returned
   * arrays are copies, so mutating them does not affect the model.
   * @returns {object} Pass to {@link BM25.fromJSON} to restore.
   */
  toJSON() {
    return {
      vocab: Object.fromEntries(this.#vocab),
      df: Array.from(this.#df),
      docCount: this.#docCount,
      avgDocLen: this.#avgDocLen,
      docLens: [...this.#docLens],
      k1: this.#k1,
      b: this.#b,
    };
  }

  /**
   * Reconstruct a fitted model from {@link BM25#toJSON} output.
   * @param {object} data
   * @returns {BM25}
   */
  static fromJSON(data) {
    const bm25 = new BM25({ k1: data.k1, b: data.b });
    bm25.#vocab = new Map(Object.entries(data.vocab).map(([k, v]) => [k, Number(v)]));
    bm25.#vocabSize = bm25.#vocab.size;
    bm25.#df = new Uint32Array(data.df);
    bm25.#docCount = data.docCount;
    bm25.#avgDocLen = data.avgDocLen;
    // Copy so later mutation of `data` cannot reach into the model.
    bm25.#docLens = Array.from(data.docLens ?? []);
    return bm25;
  }

  /**
   * Map a list of vocabulary indices back to their original term strings.
   * The reverse map is built lazily on first call and cached until the next
   * {@link BM25#fit}.
   * @param {Uint32Array|number[]} indices Vocabulary term ids.
   * @returns {string[]} The corresponding terms (unknown ids silently omitted).
   */
  termsForIndices(indices) {
    if (!this.#reverseVocab) {
      this.#reverseVocab = new Map();
      for (const [term, id] of this.#vocab) this.#reverseVocab.set(id, term);
    }
    const out = [];
    for (const id of indices) {
      const term = this.#reverseVocab.get(id);
      if (term !== undefined) out.push(term);
    }
    return out;
  }

  /**
   * Number of distinct terms in the fitted vocabulary (the sparse dimension).
   * @returns {number}
   */
  get vocabSize() {
    return this.#vocabSize;
  }
}
|
|
246
|
+
|
|
247
|
+
/**
 * Turn a term-id → score record into a sparse vector ordered by term id.
 * Entries whose score is not strictly positive are left out.
 * @param {Record<string, number>} termScores
 * @param {number} dim Vocabulary size.
 * @returns {{indices: Uint32Array, values: Float32Array, dim: number}}
 */
function toSparse(termScores, dim) {
  const pairs = [];
  for (const [key, weight] of Object.entries(termScores)) {
    if (weight > 0) {
      pairs.push([Number(key), weight]);
    }
  }
  pairs.sort((left, right) => left[0] - right[0]);

  const indices = new Uint32Array(pairs.length);
  const values = new Float32Array(pairs.length);
  pairs.forEach(([termId, weight], slot) => {
    indices[slot] = termId;
    values[slot] = weight;
  });

  return { indices, values, dim };
}
|
package/lib/embedder.js
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import { pipeline } from '@huggingface/transformers';
|
|
2
|
+
|
|
3
|
+
/**
 * Dense text embedder backed by transformers.js.
 *
 * Requests mean pooling and normalization from the feature-extraction
 * pipeline. The model is loaded lazily on first use and cached for the
 * lifetime of the instance.
 */
export class Embedder {
  #model;
  #dtype;
  #pipe;
  #dimensions;
  #initPromise; // in-flight or completed load, shared by concurrent callers

  /**
   * @param {object} [opts]
   * @param {string} [opts.model='Xenova/all-MiniLM-L6-v2'] Hugging Face model id
   *   for a feature-extraction pipeline. Any output width is supported — the
   *   actual dimension is detected at {@link Embedder#init} (see
   *   {@link Embedder#dimensions}).
   * @param {string} [opts.dtype='q8'] Weight precision to load: `'q8'`
   *   (quantized, ~4× smaller download, the default) or `'fp32'` (full
   *   precision), plus other transformers.js dtypes (`'fp16'`, `'q4'`, …) when
   *   the model provides them.
   */
  constructor({ model, dtype } = {}) {
    this.#model = model || 'Xenova/all-MiniLM-L6-v2';
    this.#dtype = dtype || 'q8';
    this.#pipe = null;
    this.#dimensions = null;
    this.#initPromise = null;
  }

  /**
   * Load the underlying pipeline if it hasn't been loaded yet, and detect the
   * model's embedding width with a one-token probe. Safe to call repeatedly
   * and concurrently: every caller awaits the same load, so the pipeline is
   * created once and is never observed half-initialized. A failed load is not
   * cached, so a later call retries. Called automatically by
   * {@link Embedder#embed} / {@link Embedder#embedBatch}.
   * @returns {Promise<void>}
   */
  init() {
    if (!this.#initPromise) {
      this.#initPromise = this.#load().catch(err => {
        // Allow a retry instead of permanently caching the rejection.
        this.#initPromise = null;
        throw err;
      });
    }
    return this.#initPromise;
  }

  /**
   * Create the pipeline and probe its output width. State is published only
   * after both steps succeed.
   * @returns {Promise<void>}
   */
  async #load() {
    const pipe = await pipeline('feature-extraction', this.#model, {
      dtype: this.#dtype,
    });
    // Detect the real embedding width (384, 768, …) rather than assuming.
    const probe = await pipe('x', { pooling: 'mean', normalize: true });
    this.#dimensions = probe.data.length;
    this.#pipe = pipe;
  }

  /**
   * Embed a single string into a normalized dense vector.
   * @param {string} text Input text.
   * @returns {Promise<Float32Array>} A {@link Embedder#dimensions}-length vector
   *   (a fresh copy of the pipeline output).
   */
  async embed(text) {
    await this.init();
    const output = await this.#pipe(text, { pooling: 'mean', normalize: true });
    return new Float32Array(output.data);
  }

  /**
   * Embed many strings, processing them in batches for throughput.
   * @param {string[]} texts Input texts.
   * @param {object} [opts]
   * @param {number} [opts.batchSize=32] Number of texts per forward pass; must
   *   be greater than zero.
   * @returns {Promise<Float32Array[]>} One vector per input, in input order.
   *   Each is a view into a shared buffer — copy it if you need to retain it
   *   independently.
   * @throws {RangeError} If `batchSize` is not a positive number.
   */
  async embedBatch(texts, { batchSize = 32 } = {}) {
    // A zero/negative step would never advance the loop; NaN would silently
    // return no vectors.
    if (!(batchSize > 0)) {
      throw new RangeError(`batchSize must be a positive number, got ${batchSize}`);
    }
    await this.init();
    const dims = this.dimensions;
    const results = [];
    for (let i = 0; i < texts.length; i += batchSize) {
      const batch = texts.slice(i, i + batchSize);
      const output = await this.#pipe(batch, { pooling: 'mean', normalize: true });
      for (let j = 0; j < batch.length; j++) {
        // Row j of the batch output: `dims` float32 values (4 bytes each).
        results.push(new Float32Array(output.data.buffer, output.data.byteOffset + j * dims * 4, dims));
      }
    }
    return results;
  }

  /**
   * Dimensionality of the vectors this model produces. Available only after
   * {@link Embedder#init} (or a first {@link Embedder#embed}) has run, since it
   * is detected from the loaded model.
   * @returns {number} e.g. 384 for MiniLM/BGE-small, 768 for MPNet/GTE-base.
   * @throws {Error} If called before the model has been initialized.
   */
  get dimensions() {
    if (this.#dimensions == null) {
      throw new Error('Embedder.dimensions is only known after init() or embed()');
    }
    return this.#dimensions;
  }

  /**
   * The Hugging Face model id this embedder uses.
   * @returns {string}
   */
  get model() {
    return this.#model;
  }

  /**
   * Weight precision this embedder loads (e.g. `'q8'`, `'fp32'`).
   * @returns {string}
   */
  get dtype() {
    return this.#dtype;
  }
}
|
package/lib/extract.js
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Universal (Node + browser) helpers for turning arbitrary data into the text
|
|
3
|
+
* Vecito indexes and the metadata it returns with hits. No filesystem deps.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Recursively collect string leaves from any value and join them. Lets raw JSON
 * objects/arrays be indexed without the caller pre-extracting their text.
 *
 * Non-string leaves (numbers, booleans, null, functions, …) are ignored.
 * Circular references are skipped: a container that is already being visited
 * higher up the current path is not entered again, so cyclic input terminates
 * instead of overflowing the stack. A container referenced from several
 * places without forming a cycle is still visited each time.
 * @param {*} value Any value — object, array, string, etc.
 * @returns {string} All string leaves, '. '-joined (empty string if none).
 */
export function flattenStrings(value) {
  const parts = [];
  // Containers on the path from the root to the node being visited.
  const ancestors = new Set();

  const visit = v => {
    if (typeof v === 'string') {
      parts.push(v);
      return;
    }
    if (v === null || typeof v !== 'object') return;
    if (ancestors.has(v)) return; // cycle: already inside this container

    ancestors.add(v);
    const children = Array.isArray(v) ? v : Object.values(v);
    for (const child of children) visit(child);
    // Leave the path so sibling references to `v` are still expanded.
    ancestors.delete(v);
  };

  visit(value);
  return parts.join('. ');
}
|
|
22
|
+
|
|
23
|
+
/**
 * Default text extractor.
 *
 * - `null` / `undefined` become the empty string;
 * - strings are returned unchanged;
 * - objects and arrays are reduced to their string leaves via
 *   {@link flattenStrings};
 * - anything else is converted with `String()`.
 * @param {*} item
 * @returns {string}
 */
export function defaultText(item) {
  if (item == null) {
    return '';
  }
  switch (typeof item) {
    case 'string':
      return item;
    case 'object':
      return flattenStrings(item);
    default:
      return String(item);
  }
}
|
|
34
|
+
|
|
35
|
+
/**
 * Default metadata extractor. A non-null, non-array object is handed back
 * untouched (the same reference), so search hits carry the original data;
 * every other kind of value yields a fresh empty object.
 * @param {*} item
 * @returns {Record<string, any>}
 */
export function defaultMetadata(item) {
  const isRecord = typeof item === 'object' && item !== null && !Array.isArray(item);
  if (isRecord) {
    return item;
  }
  return {};
}
|
package/lib/file-index.js
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { readdirSync, readFileSync, statSync } from 'fs';
|
|
2
|
+
import { join, basename, relative } from 'path';
|
|
3
|
+
import { Vecito } from './vecito.js';
|
|
4
|
+
import { defaultText } from './extract.js';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Filesystem layer on top of the core {@link Vecito} library. Turns files and
|
|
8
|
+
* directories into data items and feeds them to the generic indexer. Node-only
|
|
9
|
+
* (imports `fs`/`path`); use it via the `vecito/file` subpath export.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Extensions treated as text-like when no `ext` option is supplied.
 * Entries are lowercase and dot-prefixed.
 */
export const DEFAULT_EXTENSIONS = [
  // docs / text
  '.md', '.markdown', '.mdx',
  '.txt', '.text', '.rst', '.org', '.tex', '.log',
  // data / config
  '.json', '.jsonl', '.ndjson',
  '.csv', '.tsv',
  '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
  '.xml', '.html', '.htm',
  // code
  '.js', '.mjs', '.cjs', '.ts', '.tsx', '.jsx',
  '.py', '.rb', '.php', '.go', '.rs', '.java',
  '.c', '.h', '.cpp', '.hpp', '.cs',
  '.sh', '.sql', '.css', '.scss',
  '.move', '.sol',
];
|
|
24
|
+
|
|
25
|
+
/**
 * Bring every extension into canonical form: a leading dot is added when
 * missing and the result is lowercased.
 * @param {string[]} exts
 * @returns {string[]}
 */
function normalizeExts(exts) {
  return exts.map(raw => {
    const dotted = raw.startsWith('.') ? raw : '.' + raw;
    return dotted.toLowerCase();
  });
}
|
|
33
|
+
|
|
34
|
+
/**
 * Recursively collect files under `dir` whose extension is allowed. Skips
 * dotfiles and dot-directories unless `hidden` is true.
 *
 * Entries are examined with `statSync`, so symlinks are followed. An entry
 * that no longer resolves (dangling symlink, or a file removed while the walk
 * is running) is skipped rather than aborting the whole walk; any other
 * filesystem error is rethrown.
 * @param {string} dir Directory to walk.
 * @param {object} [opts]
 * @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
 * @param {boolean} [opts.hidden=false] Include entries whose name starts with '.'.
 * @param {number} [opts.limit=Infinity] Stop after this many files.
 * @returns {string[]} Matching file paths.
 */
export function walk(dir, { ext = DEFAULT_EXTENSIONS, hidden = false, limit = Infinity } = {}) {
  const exts = normalizeExts(ext);
  const out = [];
  const recurse = current => {
    for (const entry of readdirSync(current)) {
      if (out.length >= limit) return;
      // Skip dotfiles / dot-directories by default (.git, .env, .DS_Store, ...).
      if (!hidden && entry.startsWith('.')) continue;
      const full = join(current, entry);

      let st;
      try {
        st = statSync(full);
      } catch (err) {
        // The name was listed but cannot be resolved: nothing to index here.
        if (err && err.code === 'ENOENT') continue;
        throw err;
      }

      if (st.isDirectory()) {
        recurse(full);
      } else if (st.isFile()) {
        const lower = entry.toLowerCase();
        if (exts.some(e => lower.endsWith(e))) out.push(full);
      }
    }
  };
  recurse(dir);
  return out.slice(0, limit);
}
|
|
65
|
+
|
|
66
|
+
/**
 * Read a file into a data item. `.json`/`.jsonl`/`.ndjson` are parsed to objects
 * (falling back to raw text on parse failure); everything else stays a string.
 *
 * Parsing first tries the whole file as a single JSON document. For
 * `.jsonl`/`.ndjson` files that fail this way (the usual case when they hold
 * more than one record), each non-blank line is parsed on its own and the
 * records are returned as an array. A leading UTF-8 byte-order mark is ignored
 * when parsing.
 * @param {string} file Absolute or relative file path.
 * @param {string} [base] Base dir for the item's relative `path` metadata.
 * @returns {{data: any, path: string, name: string}}
 */
function readItem(file, base) {
  const raw = readFileSync(file, 'utf-8');
  const lower = file.toLowerCase();
  const isJsonLines = lower.endsWith('.jsonl') || lower.endsWith('.ndjson');

  let data = raw;
  if (isJsonLines || lower.endsWith('.json')) {
    // JSON.parse rejects a byte-order mark, which readFileSync does not strip.
    const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
    try {
      data = JSON.parse(text);
    } catch {
      // Deliberate best-effort: unparseable content is indexed as plain text.
      data = (isJsonLines ? parseJsonLines(text) : null) ?? raw;
    }
  }
  return { data, path: base ? relative(base, file) : file, name: basename(file) };
}

/**
 * Parse newline-delimited JSON, one document per non-blank line.
 * @param {string} text File contents.
 * @returns {any[]|null} The parsed records, or `null` when any line is not
 *   valid JSON or the text holds no records at all.
 */
function parseJsonLines(text) {
  const records = [];
  for (const line of text.split(/\r?\n/)) {
    if (line.trim() === '') continue;
    try {
      records.push(JSON.parse(line));
    } catch {
      return null;
    }
  }
  return records.length > 0 ? records : null;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Build a new {@link Vecito} index from an explicit list of files, one
 * document per file. Each file is read up front; its indexed text is the
 * trimmed {@link defaultText} of the file's data and its metadata is
 * `{ path, name }`.
 * @param {string[]} paths File paths to index (one document each).
 * @param {object} [opts]
 * @param {string} [opts.model] Embedding model id passed to Vecito.
 * @param {string} [opts.dtype] Weight precision passed to Vecito.
 * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
 * @param {string} [opts.base] Base dir for relative `path` metadata.
 * @returns {Promise<Vecito>}
 */
export async function indexFiles(paths, { model, dtype, mode, base } = {}) {
  const records = [];
  for (const filePath of paths) {
    records.push(readItem(filePath, base));
  }

  const toText = record => defaultText(record.data).trim();
  const toMetadata = record => {
    const { path, name } = record;
    return { path, name };
  };

  const index = new Vecito({ model, dtype, mode });
  await index.addDocuments(records, { text: toText, metadata: toMetadata });
  return index;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Index a directory tree: collect the matching files with {@link walk}, then
 * hand them to {@link indexFiles} with `dir` as the base for relative paths.
 * @param {string} dir Directory to index.
 * @param {object} [opts]
 * @param {string[]} [opts.ext=DEFAULT_EXTENSIONS] Extensions to include.
 * @param {boolean} [opts.hidden=false] Include dotfiles/dot-directories.
 * @param {number} [opts.limit=Infinity] Index at most this many files.
 * @param {string} [opts.model] Embedding model id passed to Vecito.
 * @param {string} [opts.dtype] Weight precision passed to Vecito.
 * @param {'hybrid'|'dense'} [opts.mode='hybrid'] Index mode passed to Vecito.
 * @returns {Promise<Vecito>}
 */
export async function indexDirectory(dir, { ext, hidden, limit, model, dtype, mode } = {}) {
  const walkOptions = { ext, hidden, limit };
  const indexOptions = { model, dtype, mode, base: dir };

  const matched = walk(dir, walkOptions);
  return indexFiles(matched, indexOptions);
}
|