semantic-code-mcp 2.0.0
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +259 -0
- package/config.json +85 -0
- package/features/check-last-version.js +504 -0
- package/features/clear-cache.js +75 -0
- package/features/get-status.js +210 -0
- package/features/hybrid-search.js +189 -0
- package/features/index-codebase.js +999 -0
- package/features/set-workspace.js +183 -0
- package/index.js +297 -0
- package/lib/ast-chunker.js +273 -0
- package/lib/cache-factory.js +13 -0
- package/lib/cache.js +157 -0
- package/lib/config.js +1296 -0
- package/lib/embedding-worker.js +155 -0
- package/lib/gemini-embedder.js +351 -0
- package/lib/ignore-patterns.js +896 -0
- package/lib/milvus-cache.js +478 -0
- package/lib/mrl-embedder.js +235 -0
- package/lib/project-detector.js +75 -0
- package/lib/resource-throttle.js +85 -0
- package/lib/sqlite-cache.js +468 -0
- package/lib/tokenizer.js +149 -0
- package/lib/utils.js +214 -0
- package/package.json +70 -0
- package/reindex.js +109 -0
package/lib/milvus-cache.js (new file) @@ -0,0 +1,478 @@

```js
import fs from "fs/promises";
import path from "path";
import { DataType, MetricType, MilvusClient } from "@zilliz/milvus2-sdk-node";

const DEFAULT_COLLECTION = "smart_coding_embeddings";
const DEFAULT_MAX_CONTENT_LENGTH = 65535;

function escapeFilterString(value) {
  return String(value).replace(/\\/g, "\\\\").replace(/"/g, '\\"');
}

function parseIntSafe(value, fallback = 0) {
  const parsed = Number.parseInt(String(value), 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

export class MilvusCache {
  constructor(config) {
    this.config = config;
    this.client = null;
    this.vectorStore = [];
    this.fileHashes = new Map();
    this.isSaving = false;
    this.hashesDirty = false;
    this.lastWriteError = null;
    this.writeQueue = Promise.resolve();
    this.hashPath = path.join(config.cacheDirectory, "file-hashes.json");
    this.collectionName = config.milvusCollection || DEFAULT_COLLECTION;
    this.dimension = this.resolveDimension(config);
  }

  resolveDimension(config) {
    const provider = (config.embeddingProvider || "local").toLowerCase();
    const isApiProvider = ["gemini", "openai", "openai-compatible", "vertex"].includes(provider);
    const candidate =
      isApiProvider ? config.geminiDimensions : config.embeddingDimension;
    const dim = Number(candidate);
    if (Number.isInteger(dim) && dim > 0) return dim;
    return isApiProvider ? 768 : 128;
  }

  getRequestBase() {
    const base = {};
    if (this.config.milvusDatabase) {
      base.db_name = this.config.milvusDatabase;
    }
    return base;
  }

  enqueueWrite(task, label = "write") {
    this.writeQueue = this.writeQueue
      .then(async () => {
        await task();
      })
      .catch((error) => {
        this.lastWriteError = error;
        console.error(`[Cache] Milvus ${label} failed: ${error.message}`);
      });
    return this.writeQueue;
  }

  async waitForWrites() {
    await this.writeQueue;
    if (this.lastWriteError) {
      const error = this.lastWriteError;
      this.lastWriteError = null;
      throw error;
    }
  }

  validateConfig() {
    const address = this.config?.milvusAddress?.trim();
    if (!address) {
      throw new Error(
        "[Cache] Milvus provider selected but SMART_CODING_MILVUS_ADDRESS is not set."
      );
    }
    return address;
  }

  async load() {
    if (!this.config.enableCache) return;

    this.validateConfig();
    this.dimension = this.resolveDimension(this.config);
    this.collectionName = this.config.milvusCollection || DEFAULT_COLLECTION;
    this.hashPath = path.join(this.config.cacheDirectory, "file-hashes.json");

    await fs.mkdir(this.config.cacheDirectory, { recursive: true });
    await this.loadFileHashes();

    this.client = new MilvusClient({
      address: this.config.milvusAddress,
      token: this.config.milvusToken || undefined,
      database: this.config.milvusDatabase || undefined
    });

    await this.ensureCollection();
    // ANN mode: keep local vector store empty and delegate retrieval to Milvus search API.
    this.vectorStore = [];
    const stats = await this.getStats();

    console.error(
      `[Cache] Loaded Milvus cache (ANN mode): ${stats.totalChunks} embeddings from ${stats.totalFiles} files`
    );
  }

  async ensureCollection() {
    const hasRes = await this.client.hasCollection({
      collection_name: this.collectionName,
      ...this.getRequestBase()
    });
    const exists = Boolean(hasRes?.value);

    if (!exists) {
      await this.client.createCollection({
        collection_name: this.collectionName,
        description: "smart-coding-mcp embeddings cache",
        fields: [
          {
            name: "id",
            data_type: DataType.Int64,
            is_primary_key: true,
            autoID: true
          },
          {
            name: "file",
            data_type: DataType.VarChar,
            max_length: 4096
          },
          {
            name: "start_line",
            data_type: DataType.Int64
          },
          {
            name: "end_line",
            data_type: DataType.Int64
          },
          {
            name: "content",
            data_type: DataType.VarChar,
            max_length: DEFAULT_MAX_CONTENT_LENGTH
          },
          {
            name: "vector",
            data_type: DataType.FloatVector,
            dim: this.dimension
          }
        ],
        enable_dynamic_field: false,
        ...this.getRequestBase()
      });

      try {
        await this.client.createIndex({
          collection_name: this.collectionName,
          field_name: "vector",
          index_type: "AUTOINDEX",
          metric_type: MetricType.COSINE,
          ...this.getRequestBase()
        });
      } catch (error) {
        console.error(`[Cache] Milvus index create warning: ${error.message}`);
      }
    }

    await this.client.loadCollection({
      collection_name: this.collectionName,
      ...this.getRequestBase()
    });
  }

  async searchByVector(queryVector, topK = 10, filter = null) {
    if (!this.client) {
      throw new Error("[Cache] Milvus client not initialized");
    }

    const vector = Array.isArray(queryVector)
      ? queryVector
      : Array.from(queryVector || []);
    if (vector.length !== this.dimension) {
      throw new Error(
        `[Cache] Query vector dimension mismatch: expected ${this.dimension}, got ${vector.length}`
      );
    }

    const normalizedTopK = Number.isInteger(topK) && topK > 0 ? topK : 10;
    const searchParams = {
      collection_name: this.collectionName,
      anns_field: "vector",
      data: vector,
      output_fields: ["file", "start_line", "end_line", "content"],
      limit: normalizedTopK,
      metric_type: "COSINE",
      ...this.getRequestBase()
    };

    if (typeof filter === "string" && filter.trim()) {
      searchParams.filter = filter.trim();
    }

    const res = await this.client.search(searchParams);
    const rawResults = Array.isArray(res?.results) ? res.results : [];
    const rows = Array.isArray(rawResults[0]) ? rawResults[0] : rawResults;

    return rows
      .filter((row) => row && row.file)
      .map((row) => ({
        file: String(row.file),
        startLine: parseIntSafe(row.start_line, 0),
        endLine: parseIntSafe(row.end_line, 0),
        content: String(row.content || ""),
        score: Number(row.score || 0)
      }));
  }

  async getStats() {
    if (!this.client) {
      return { totalChunks: this.vectorStore.length, totalFiles: this.fileHashes.size };
    }

    let totalChunks = 0;

    // Prefer explicit count query when available (more reliable during ANN mode).
    try {
      const countRes = await this.client.count({
        collection_name: this.collectionName,
        expr: "id >= 0",
        ...this.getRequestBase()
      });
      totalChunks = Number(countRes?.data || 0);
    } catch {
      // Fallback to collection statistics below.
    }

    if (!Number.isFinite(totalChunks) || totalChunks < 0) {
      totalChunks = 0;
    }

    if (totalChunks === 0) {
      const stats = await this.client.getCollectionStatistics({
        collection_name: this.collectionName,
        ...this.getRequestBase()
      });
      const statsList = Array.isArray(stats?.stats) ? stats.stats : [];
      const rowCountFromList = statsList.find((item) => item?.key === "row_count")?.value;
      totalChunks = parseIntSafe(stats?.data?.row_count ?? rowCountFromList, 0);
    }

    return {
      totalChunks,
      totalFiles: this.fileHashes.size
    };
  }

  normalizeChunk(chunk) {
    if (!chunk || !chunk.file) return null;

    const rawVector = Array.isArray(chunk.vector)
      ? chunk.vector
      : Array.from(chunk.vector || []);
    if (rawVector.length !== this.dimension) {
      return null;
    }

    return {
      file: String(chunk.file),
      start_line: parseIntSafe(chunk.startLine, 0),
      end_line: parseIntSafe(chunk.endLine, 0),
      content: String(chunk.content || "").slice(0, DEFAULT_MAX_CONTENT_LENGTH),
      vector: rawVector.map((value) => Number(value))
    };
  }

  getVectorStore() {
    return this.vectorStore;
  }

  setVectorStore(store) {
    const normalizedStore = [];
    for (const chunk of Array.isArray(store) ? store : []) {
      const normalized = this.normalizeChunk(chunk);
      if (!normalized) continue;
      normalizedStore.push({
        file: normalized.file,
        startLine: normalized.start_line,
        endLine: normalized.end_line,
        content: normalized.content,
        vector: normalized.vector
      });
    }

    this.vectorStore = normalizedStore;

    this.enqueueWrite(async () => {
      if (!this.client) return;
      await this.client.delete({
        collection_name: this.collectionName,
        filter: "id >= 0",
        ...this.getRequestBase()
      });

      if (normalizedStore.length === 0) return;
      await this.client.insert({
        collection_name: this.collectionName,
        data: normalizedStore.map((row) => ({
          file: row.file,
          start_line: row.startLine,
          end_line: row.endLine,
          content: row.content,
          vector: row.vector
        })),
        ...this.getRequestBase()
      });
    }, "setVectorStore");
  }

  addToStore(chunk) {
    this.addBatchToStore([chunk]);
  }

  addBatchToStore(chunks) {
    if (!Array.isArray(chunks) || chunks.length === 0) return;

    const normalizedBatch = [];
    const localBatch = [];

    for (const chunk of chunks) {
      const normalized = this.normalizeChunk(chunk);
      if (!normalized) continue;
      normalizedBatch.push(normalized);
      localBatch.push({
        file: normalized.file,
        startLine: normalized.start_line,
        endLine: normalized.end_line,
        content: normalized.content,
        vector: normalized.vector
      });
    }

    if (normalizedBatch.length === 0) return;
    if (!this.client) {
      this.vectorStore.push(...localBatch);
    }

    this.enqueueWrite(async () => {
      if (!this.client) return;
      await this.client.insert({
        collection_name: this.collectionName,
        data: normalizedBatch,
        ...this.getRequestBase()
      });
    }, "insert");
  }

  removeFileFromStore(file) {
    this.vectorStore = this.vectorStore.filter((chunk) => chunk.file !== file);

    const escaped = escapeFilterString(file);
    this.enqueueWrite(async () => {
      if (!this.client) return;
      await this.client.delete({
        collection_name: this.collectionName,
        filter: `file == "${escaped}"`,
        ...this.getRequestBase()
      });
    }, "deleteByFile");
  }

  getFileHash(file) {
    const entry = this.fileHashes.get(file);
    if (typeof entry === "string") return entry;
    return entry?.hash || null;
  }

  getFileMtime(file) {
    const entry = this.fileHashes.get(file);
    return entry?.mtime ?? null;
  }

  setFileHash(file, hash, mtime = null) {
    this.fileHashes.set(file, { hash, mtime });
    this.hashesDirty = true;
  }

  deleteFileHash(file) {
    this.fileHashes.delete(file);
    this.hashesDirty = true;
  }

  getAllFileHashes() {
    return this.fileHashes;
  }

  clearAllFileHashes() {
    this.fileHashes = new Map();
    this.hashesDirty = true;
  }

  async loadFileHashes() {
    try {
      const raw = await fs.readFile(this.hashPath, "utf-8");
      const parsed = JSON.parse(raw);
      this.fileHashes = new Map(Object.entries(parsed || {}));
      this.hashesDirty = false;
    } catch {
      this.fileHashes = new Map();
      this.hashesDirty = false;
    }
  }

  async saveFileHashes() {
    if (!this.hashesDirty) return;
    await fs.mkdir(this.config.cacheDirectory, { recursive: true });
    await fs.writeFile(
      this.hashPath,
      JSON.stringify(Object.fromEntries(this.fileHashes), null, 2),
      "utf-8"
    );
    this.hashesDirty = false;
  }

  async save() {
    if (!this.config.enableCache) return;

    this.isSaving = true;
    try {
      await this.waitForWrites();
      if (this.client) {
        await this.client.flush({
          collection_names: [this.collectionName],
          ...this.getRequestBase()
        });
      }
      await this.saveFileHashes();
    } finally {
      this.isSaving = false;
    }
  }

  async saveIncremental() {
    if (!this.config.enableCache) return;
    await this.waitForWrites();
    await this.saveFileHashes();
  }

  async resetForFullReindex() {
    this.setVectorStore([]);
    this.clearAllFileHashes();
    await this.save();
  }

  async clear() {
    if (!this.config.enableCache) return;

    await this.waitForWrites();

    if (this.client) {
      try {
        await this.client.dropCollection({
          collection_name: this.collectionName,
          ...this.getRequestBase()
        });
      } catch (error) {
        console.error(`[Cache] Milvus drop collection warning: ${error.message}`);
      }

      await this.ensureCollection();
    }

    this.vectorStore = [];
    this.fileHashes = new Map();
    this.hashesDirty = true;
    await this.saveFileHashes();

    console.error(`[Cache] Milvus cache cleared successfully: ${this.collectionName}`);
  }
}
```
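For orientation, here is a minimal usage sketch for `MilvusCache`, inferred only from the methods above. The config keys mirror the fields the class reads; the concrete values are illustrative placeholders, and the real wiring presumably lives in `package/lib/cache-factory.js` and `package/lib/config.js`. It assumes a Milvus server is reachable at the given address.

```js
// Hypothetical driver for MilvusCache; config values are placeholders.
import { MilvusCache } from "./lib/milvus-cache.js";

const cache = new MilvusCache({
  enableCache: true,
  cacheDirectory: ".cache-example",        // where file-hashes.json is persisted
  milvusAddress: "localhost:19530",        // normally SMART_CODING_MILVUS_ADDRESS
  milvusCollection: "smart_coding_embeddings",
  embeddingProvider: "local",
  embeddingDimension: 128                  // must match the embedder's output size
});

await cache.load();                        // connects, ensures collection + AUTOINDEX

// normalizeChunk() silently drops chunks whose vector length differs from the
// configured dimension, so the vector must be exactly 128 floats here.
const vector = new Array(128).fill(0.01);
cache.addBatchToStore([
  {
    file: "src/app.js",
    startLine: 1,
    endLine: 20,
    content: "function add(a, b) { return a + b; }",
    vector
  }
]);
await cache.save();                        // drains the write queue, flushes, saves hashes

// ANN search; the third argument is an optional Milvus filter expression.
const hits = await cache.searchByVector(vector, 5, 'file == "src/app.js"');
// -> [{ file, startLine, endLine, content, score }, ...]
```

Note that all writes are serialized through `enqueueWrite`, so errors from queued inserts and deletes only surface later, at `save()`/`saveIncremental()`/`clear()` via `waitForWrites()`.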
package/lib/mrl-embedder.js (new file) @@ -0,0 +1,235 @@

```js
/**
 * MRL (Matryoshka Representation Learning) Embedder
 *
 * Provides flexible embedding dimensions (64, 128, 256, 512, 768) using
 * nomic-embed-text-v1.5 with layer normalization and dimension slicing.
 */

import { pipeline, layer_norm } from '@huggingface/transformers';
import { existsSync, rmSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';

// Valid MRL dimensions for nomic-embed-text-v1.5
const VALID_DIMENSIONS = [64, 128, 256, 512, 768];

/**
 * Clear the HuggingFace transformers cache for a specific model
 * Used for auto-recovery from corrupted model files
 */
function clearModelCache(modelName) {
  try {
    // Find the transformers package location
    // import.meta.resolve may return .../dist/index.js, so check parent too
    const resolvedPath = dirname(fileURLToPath(import.meta.resolve('@huggingface/transformers')));
    const possibleRoots = [resolvedPath, dirname(resolvedPath)];

    for (const root of possibleRoots) {
      // Try different cache path patterns
      const cachePaths = [
        join(root, '.cache', modelName.replace('/', '-')), // nomic-ai-nomic-embed-text-v1.5
        join(root, '.cache', ...modelName.split('/')) // nomic-ai/nomic-embed-text-v1.5
      ];

      for (const cacheDir of cachePaths) {
        if (existsSync(cacheDir)) {
          console.error(`[MRL] Clearing corrupted cache: ${cacheDir}`);
          rmSync(cacheDir, { recursive: true, force: true });
          return true;
        }
      }
    }
  } catch (e) {
    console.error(`[MRL] Failed to clear cache: ${e.message}`);
  }
  return false;
}

/**
 * Create an MRL-enabled embedder with configurable output dimensions
 *
 * @param {string} modelName - Model identifier (e.g., 'nomic-ai/nomic-embed-text-v1.5')
 * @param {object} options - Configuration options
 * @param {number} options.dimension - Target embedding dimension (64, 128, 256, 512, 768)
 * @param {string} options.device - Device to use ('cpu', 'webgpu', 'auto')
 * @returns {Function} Embedder function compatible with existing codebase
 */
export async function createMRLEmbedder(modelName, options = {}) {
  const dimension = options.dimension || 256;
  const device = options.device || 'cpu';

  // Validate dimension
  if (!VALID_DIMENSIONS.includes(dimension)) {
    console.error(`[MRL] Invalid dimension ${dimension}, using 256. Valid: ${VALID_DIMENSIONS.join(', ')}`);
  }

  const targetDim = VALID_DIMENSIONS.includes(dimension) ? dimension : 256;

  console.error(`[MRL] Loading ${modelName} (output: ${targetDim}d, device: ${device})`);

  // Detect best device if auto
  const finalDevice = device === 'auto' ? detectBestDevice() : device;

  // Create the feature extraction pipeline with auto-recovery for corrupted models
  const pipelineOptions = {};
  if (finalDevice === 'webgpu') {
    pipelineOptions.device = 'webgpu';
  }

  let extractor;

  // Helper to detect corruption errors
  function isCorruptionError(err) {
    if (!err.message) return false;
    return err.message.includes('Protobuf parsing failed') ||
           err.message.includes('Invalid model') ||
           err.message.includes('ONNX') && err.message.includes('corrupt');
  }

  // Helper to load/reload the extractor
  async function loadExtractor(clearCache = false) {
    if (clearCache) {
      console.error(`[MRL] Corrupted model detected, attempting auto-recovery...`);
      clearModelCache(modelName);
    }
    return await pipeline('feature-extraction', modelName, pipelineOptions);
  }

  try {
    extractor = await loadExtractor();
  } catch (err) {
    if (isCorruptionError(err)) {
      extractor = await loadExtractor(true);
    } else {
      throw err;
    }
  }

  console.error(`[MRL] Model loaded on ${finalDevice}`);

  // Fallback embedder for when MRL model fails at runtime
  let fallbackEmbedder = null;

  /**
   * Embed text with MRL dimension slicing
   * Compatible with existing embedder(text, options) signature
   * Includes runtime auto-recovery for corrupted models with fallback
   */
  async function embed(text, embedOptions = {}) {
    // If we've fallen back to legacy, use it
    if (fallbackEmbedder) {
      return await fallbackEmbedder(text, embedOptions);
    }

    async function doEmbed() {
      // Generate full 768d embedding
      let embeddings = await extractor(text, { pooling: 'mean' });

      // Apply MRL: layer_norm -> slice -> normalize
      embeddings = layer_norm(embeddings, [embeddings.dims[1]])
        .slice(null, [0, targetDim])
        .normalize(2, -1);

      // Return in format compatible with existing code (has .data property)
      return {
        data: embeddings.data,
        dims: [embeddings.dims[0], targetDim]
      };
    }

    try {
      return await doEmbed();
    } catch (err) {
      // Runtime corruption detection - try reload first
      if (isCorruptionError(err)) {
        console.error(`[MRL] Runtime corruption detected, attempting reload...`);
        try {
          extractor = await loadExtractor(true);
          return await doEmbed();
        } catch (reloadErr) {
          // Reload failed - fall back to legacy model
          console.error(`[MRL] Reload failed, falling back to legacy model...`);
          const { createLegacyEmbedder } = await import('./mrl-embedder.js');
          fallbackEmbedder = await createLegacyEmbedder();
          embed.dimension = fallbackEmbedder.dimension;
          embed.modelName = fallbackEmbedder.modelName;
          return await fallbackEmbedder(text, embedOptions);
        }
      }
      throw err;
    }
  }

  // Attach metadata
  embed.modelName = modelName;
  embed.dimension = targetDim;
  embed.device = finalDevice;

  return embed;
}

/**
 * Detect best available device for inference
 */
function detectBestDevice() {
  // WebGPU check (browser environment)
  if (typeof navigator !== 'undefined' && navigator.gpu) {
    return 'webgpu';
  }

  // Node.js with experimental WebGPU (Node 20+)
  // This would require --experimental-webgpu flag
  // For now, default to CPU in Node.js
  return 'cpu';
}

/**
 * Create a legacy-compatible embedder (384d, MiniLM)
 * Used as fallback if MRL model fails to load
 */
export async function createLegacyEmbedder(modelName = 'Xenova/all-MiniLM-L6-v2') {
  console.error(`[Embedder] Loading legacy model: ${modelName}`);
  const extractor = await pipeline('feature-extraction', modelName);

  async function embed(text, options = {}) {
    const output = await extractor(text, { pooling: 'mean', normalize: true });
    return output;
  }

  embed.modelName = modelName;
  embed.dimension = 384;
  embed.device = 'cpu';

  return embed;
}

/**
 * Smart embedder factory - picks MRL or legacy based on config
 */
export async function createEmbedder(config) {
  const provider = (config.embeddingProvider || 'local').toLowerCase();
  const model = config.embeddingModel || 'nomic-ai/nomic-embed-text-v1.5';
  const dimension = config.embeddingDimension || 256;
  const device = config.device || 'cpu';

  // API providers (Gemini/OpenAI/OpenAI-compatible/Vertex)
  if (['gemini', 'openai', 'openai-compatible', 'vertex'].includes(provider) || model.includes('gemini')) {
    const { createGeminiEmbedder } = await import('./gemini-embedder.js');
    return await createGeminiEmbedder(config);
  }

  // Use MRL for nomic models
  if (model.includes('nomic')) {
    try {
      return await createMRLEmbedder(model, { dimension, device });
    } catch (err) {
      console.error(`[Embedder] MRL model failed: ${err.message}, falling back to legacy`);
      return await createLegacyEmbedder();
    }
  }

  // Use legacy for MiniLM and other models
  return await createLegacyEmbedder(model);
}

export { VALID_DIMENSIONS };
```
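And a similar hedged sketch for the factory entry point, based only on the exports above (`createEmbedder`, `createLegacyEmbedder`, `VALID_DIMENSIONS`); the input text is an arbitrary example.

```js
// Hypothetical usage of the embedder factory, inferred from the exports above.
import { createEmbedder, VALID_DIMENSIONS } from "./lib/mrl-embedder.js";

console.log(`Valid MRL dimensions: ${VALID_DIMENSIONS.join(", ")}`);

const embedder = await createEmbedder({
  embeddingProvider: "local",                       // non-API provider -> local path
  embeddingModel: "nomic-ai/nomic-embed-text-v1.5", // "nomic" in the name -> MRL path
  embeddingDimension: 256,                          // sliced from the full 768d output
  device: "cpu"
});

const result = await embedder("function add(a, b) { return a + b; }");
console.log(embedder.dimension);        // 256
console.log(result.dims);               // [1, 256] for a single input string
const vector = Array.from(result.data); // 256 floats, L2-normalized by the MRL path
```

The `dimension` attached to the returned function is what has to agree with the `embeddingDimension` the cache layer is configured with: `MilvusCache.normalizeChunk` silently drops chunks whose vector length differs, and `searchByVector` throws on a mismatched query vector.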