smart-coding-mcp 2.3.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/embedding-worker.js +41 -4
- package/lib/mrl-embedder.js +52 -4
- package/package.json +1 -1
- package/test/model-cache-recovery.test.js +242 -0
package/lib/embedding-worker.js
CHANGED
|
@@ -1,9 +1,31 @@
|
|
|
1
1
|
import { parentPort, workerData } from "worker_threads";
|
|
2
2
|
import { pipeline, layer_norm } from "@huggingface/transformers";
|
|
3
|
+
import { existsSync, rmSync } from 'fs';
|
|
4
|
+
import { join, dirname } from 'path';
|
|
5
|
+
import { fileURLToPath } from 'url';
|
|
3
6
|
|
|
4
7
|
let embedder = null;
|
|
5
8
|
const VALID_DIMENSIONS = [64, 128, 256, 512, 768];
|
|
6
9
|
|
|
10
|
+
/**
|
|
11
|
+
* Clear the HuggingFace transformers cache for a specific model
|
|
12
|
+
* Used for auto-recovery from corrupted model files
|
|
13
|
+
*/
|
|
14
|
+
function clearModelCache(modelName) {
|
|
15
|
+
try {
|
|
16
|
+
const transformersPath = dirname(fileURLToPath(import.meta.resolve('@huggingface/transformers')));
|
|
17
|
+
const cacheDir = join(transformersPath, '.cache', ...modelName.split('/'));
|
|
18
|
+
if (existsSync(cacheDir)) {
|
|
19
|
+
console.error(`[Worker] Clearing corrupted cache: ${cacheDir}`);
|
|
20
|
+
rmSync(cacheDir, { recursive: true, force: true });
|
|
21
|
+
return true;
|
|
22
|
+
}
|
|
23
|
+
} catch (e) {
|
|
24
|
+
console.error(`[Worker] Failed to clear cache: ${e.message}`);
|
|
25
|
+
}
|
|
26
|
+
return false;
|
|
27
|
+
}
|
|
28
|
+
|
|
7
29
|
// Initialize the embedding model once when worker starts
|
|
8
30
|
async function initializeEmbedder() {
|
|
9
31
|
if (!embedder) {
|
|
@@ -11,12 +33,27 @@ async function initializeEmbedder() {
|
|
|
11
33
|
const dimension = workerData.embeddingDimension || 256;
|
|
12
34
|
const targetDim = VALID_DIMENSIONS.includes(dimension) ? dimension : 256;
|
|
13
35
|
const isNomic = modelName.includes('nomic');
|
|
14
|
-
|
|
15
|
-
|
|
36
|
+
|
|
37
|
+
// Load model with auto-recovery for corrupted files
|
|
38
|
+
let extractor;
|
|
39
|
+
try {
|
|
40
|
+
extractor = await pipeline("feature-extraction", modelName);
|
|
41
|
+
} catch (err) {
|
|
42
|
+
if (err.message && err.message.includes('Protobuf parsing failed')) {
|
|
43
|
+
console.error(`[Worker] Corrupted model detected, attempting auto-recovery...`);
|
|
44
|
+
if (clearModelCache(modelName)) {
|
|
45
|
+
extractor = await pipeline("feature-extraction", modelName);
|
|
46
|
+
} else {
|
|
47
|
+
throw err;
|
|
48
|
+
}
|
|
49
|
+
} else {
|
|
50
|
+
throw err;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
16
53
|
|
|
17
54
|
if (isNomic) {
|
|
18
55
|
// MRL embedder with dimension slicing
|
|
19
|
-
embedder = async function(text) {
|
|
56
|
+
embedder = async function(text) {
|
|
20
57
|
let embeddings = await extractor(text, { pooling: 'mean' });
|
|
21
58
|
embeddings = layer_norm(embeddings, [embeddings.dims[1]])
|
|
22
59
|
.slice(null, [0, targetDim])
|
|
@@ -26,7 +63,7 @@ async function initializeEmbedder() {
|
|
|
26
63
|
embedder.dimension = targetDim;
|
|
27
64
|
} else {
|
|
28
65
|
// Legacy embedder (MiniLM etc.)
|
|
29
|
-
embedder = async function(text) {
|
|
66
|
+
embedder = async function(text) {
|
|
30
67
|
return await extractor(text, { pooling: 'mean', normalize: true });
|
|
31
68
|
};
|
|
32
69
|
embedder.dimension = 384;
|
package/lib/mrl-embedder.js
CHANGED
|
@@ -1,15 +1,47 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* MRL (Matryoshka Representation Learning) Embedder
|
|
3
|
-
*
|
|
3
|
+
*
|
|
4
4
|
* Provides flexible embedding dimensions (64, 128, 256, 512, 768) using
|
|
5
5
|
* nomic-embed-text-v1.5 with layer normalization and dimension slicing.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
8
|
import { pipeline, layer_norm } from '@huggingface/transformers';
|
|
9
|
+
import { existsSync, rmSync } from 'fs';
|
|
10
|
+
import { join, dirname } from 'path';
|
|
11
|
+
import { fileURLToPath } from 'url';
|
|
9
12
|
|
|
10
13
|
// Valid MRL dimensions for nomic-embed-text-v1.5
|
|
11
14
|
const VALID_DIMENSIONS = [64, 128, 256, 512, 768];
|
|
12
15
|
|
|
16
|
+
/**
|
|
17
|
+
* Clear the HuggingFace transformers cache for a specific model
|
|
18
|
+
* Used for auto-recovery from corrupted model files
|
|
19
|
+
*/
|
|
20
|
+
function clearModelCache(modelName) {
|
|
21
|
+
try {
|
|
22
|
+
// Find the transformers package location
|
|
23
|
+
const transformersPath = dirname(fileURLToPath(import.meta.resolve('@huggingface/transformers')));
|
|
24
|
+
const cacheDir = join(transformersPath, '.cache', modelName.replace('/', '-'));
|
|
25
|
+
|
|
26
|
+
if (existsSync(cacheDir)) {
|
|
27
|
+
console.error(`[MRL] Clearing corrupted cache: ${cacheDir}`);
|
|
28
|
+
rmSync(cacheDir, { recursive: true, force: true });
|
|
29
|
+
return true;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
// Also try the model name with original slash (nomic-ai/nomic-embed-text-v1.5)
|
|
33
|
+
const cacheDir2 = join(transformersPath, '.cache', ...modelName.split('/'));
|
|
34
|
+
if (existsSync(cacheDir2)) {
|
|
35
|
+
console.error(`[MRL] Clearing corrupted cache: ${cacheDir2}`);
|
|
36
|
+
rmSync(cacheDir2, { recursive: true, force: true });
|
|
37
|
+
return true;
|
|
38
|
+
}
|
|
39
|
+
} catch (e) {
|
|
40
|
+
console.error(`[MRL] Failed to clear cache: ${e.message}`);
|
|
41
|
+
}
|
|
42
|
+
return false;
|
|
43
|
+
}
|
|
44
|
+
|
|
13
45
|
/**
|
|
14
46
|
* Create an MRL-enabled embedder with configurable output dimensions
|
|
15
47
|
*
|
|
@@ -35,13 +67,29 @@ export async function createMRLEmbedder(modelName, options = {}) {
|
|
|
35
67
|
// Detect best device if auto
|
|
36
68
|
const finalDevice = device === 'auto' ? detectBestDevice() : device;
|
|
37
69
|
|
|
38
|
-
// Create the feature extraction pipeline
|
|
70
|
+
// Create the feature extraction pipeline with auto-recovery for corrupted models
|
|
39
71
|
const pipelineOptions = {};
|
|
40
72
|
if (finalDevice === 'webgpu') {
|
|
41
73
|
pipelineOptions.device = 'webgpu';
|
|
42
74
|
}
|
|
43
|
-
|
|
44
|
-
|
|
75
|
+
|
|
76
|
+
let extractor;
|
|
77
|
+
try {
|
|
78
|
+
extractor = await pipeline('feature-extraction', modelName, pipelineOptions);
|
|
79
|
+
} catch (err) {
|
|
80
|
+
// Detect corrupted ONNX model (Protobuf parsing failed)
|
|
81
|
+
if (err.message && err.message.includes('Protobuf parsing failed')) {
|
|
82
|
+
console.error(`[MRL] Corrupted model detected, attempting auto-recovery...`);
|
|
83
|
+
if (clearModelCache(modelName)) {
|
|
84
|
+
// Retry after clearing cache
|
|
85
|
+
extractor = await pipeline('feature-extraction', modelName, pipelineOptions);
|
|
86
|
+
} else {
|
|
87
|
+
throw err;
|
|
88
|
+
}
|
|
89
|
+
} else {
|
|
90
|
+
throw err;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
45
93
|
|
|
46
94
|
console.error(`[MRL] Model loaded on ${finalDevice}`);
|
|
47
95
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "smart-coding-mcp",
|
|
3
|
-
"version": "2.3.0",
|
|
3
|
+
"version": "2.3.1",
|
|
4
4
|
"description": "An extensible MCP server that enhances coding productivity with AI-powered features including semantic code search, intelligent indexing, and more, using local LLMs",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
package/test/model-cache-recovery.test.js
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for Model Cache Auto-Recovery
|
|
3
|
+
*
|
|
4
|
+
* Tests the auto-recovery mechanism for corrupted ONNX model files:
|
|
5
|
+
* - Cache directory detection and clearing
|
|
6
|
+
* - Protobuf parsing error detection
|
|
7
|
+
* - Retry after cache clear
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
|
11
|
+
import { existsSync, mkdirSync, writeFileSync, rmSync } from 'fs';
|
|
12
|
+
import { join, dirname } from 'path';
|
|
13
|
+
import { fileURLToPath } from 'url';
|
|
14
|
+
import { tmpdir } from 'os';
|
|
15
|
+
|
|
16
|
+
// Create a test cache directory structure
|
|
17
|
+
const testCacheBase = join(tmpdir(), 'smart-coding-mcp-test-cache');
|
|
18
|
+
|
|
19
|
+
describe('Model Cache Recovery', () => {
|
|
20
|
+
beforeEach(() => {
|
|
21
|
+
// Clean up before each test
|
|
22
|
+
if (existsSync(testCacheBase)) {
|
|
23
|
+
rmSync(testCacheBase, { recursive: true, force: true });
|
|
24
|
+
}
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
afterEach(() => {
|
|
28
|
+
// Clean up after each test
|
|
29
|
+
if (existsSync(testCacheBase)) {
|
|
30
|
+
rmSync(testCacheBase, { recursive: true, force: true });
|
|
31
|
+
}
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
describe('Cache Directory Detection', () => {
|
|
35
|
+
it('should detect cache directory with forward slash model names', () => {
|
|
36
|
+
// Create a mock cache structure
|
|
37
|
+
const cacheDir = join(testCacheBase, 'nomic-ai', 'nomic-embed-text-v1.5', 'onnx');
|
|
38
|
+
mkdirSync(cacheDir, { recursive: true });
|
|
39
|
+
writeFileSync(join(cacheDir, 'model.onnx'), 'corrupted data');
|
|
40
|
+
|
|
41
|
+
expect(existsSync(cacheDir)).toBe(true);
|
|
42
|
+
|
|
43
|
+
// Simulate cache clear logic
|
|
44
|
+
const modelName = 'nomic-ai/nomic-embed-text-v1.5';
|
|
45
|
+
const cachePath = join(testCacheBase, ...modelName.split('/'));
|
|
46
|
+
|
|
47
|
+
if (existsSync(cachePath)) {
|
|
48
|
+
rmSync(cachePath, { recursive: true, force: true });
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
expect(existsSync(cachePath)).toBe(false);
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
it('should handle non-existent cache gracefully', () => {
|
|
55
|
+
const nonExistentPath = join(testCacheBase, 'does-not-exist');
|
|
56
|
+
|
|
57
|
+
// Should not throw
|
|
58
|
+
let cleared = false;
|
|
59
|
+
if (existsSync(nonExistentPath)) {
|
|
60
|
+
rmSync(nonExistentPath, { recursive: true, force: true });
|
|
61
|
+
cleared = true;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
expect(cleared).toBe(false);
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
it('should clear nested model cache directories', () => {
|
|
68
|
+
// Create nested structure mimicking real cache
|
|
69
|
+
const modelDir = join(testCacheBase, 'nomic-ai', 'nomic-embed-text-v1.5');
|
|
70
|
+
const onnxDir = join(modelDir, 'onnx');
|
|
71
|
+
const tokenizerDir = join(modelDir, 'tokenizer');
|
|
72
|
+
|
|
73
|
+
mkdirSync(onnxDir, { recursive: true });
|
|
74
|
+
mkdirSync(tokenizerDir, { recursive: true });
|
|
75
|
+
|
|
76
|
+
writeFileSync(join(onnxDir, 'model.onnx'), 'corrupted');
|
|
77
|
+
writeFileSync(join(tokenizerDir, 'tokenizer.json'), '{}');
|
|
78
|
+
|
|
79
|
+
expect(existsSync(onnxDir)).toBe(true);
|
|
80
|
+
expect(existsSync(tokenizerDir)).toBe(true);
|
|
81
|
+
|
|
82
|
+
// Clear the model directory (not just onnx)
|
|
83
|
+
rmSync(modelDir, { recursive: true, force: true });
|
|
84
|
+
|
|
85
|
+
expect(existsSync(modelDir)).toBe(false);
|
|
86
|
+
expect(existsSync(onnxDir)).toBe(false);
|
|
87
|
+
expect(existsSync(tokenizerDir)).toBe(false);
|
|
88
|
+
});
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
describe('Error Detection', () => {
|
|
92
|
+
it('should identify Protobuf parsing errors as recoverable', () => {
|
|
93
|
+
const recoverableErrors = [
|
|
94
|
+
'Protobuf parsing failed',
|
|
95
|
+
'Load model from /path/to/model.onnx failed:Protobuf parsing failed.',
|
|
96
|
+
'Error: Protobuf parsing failed'
|
|
97
|
+
];
|
|
98
|
+
|
|
99
|
+
for (const errorMsg of recoverableErrors) {
|
|
100
|
+
const isRecoverable = errorMsg.includes('Protobuf parsing failed');
|
|
101
|
+
expect(isRecoverable).toBe(true);
|
|
102
|
+
}
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
it('should not identify other errors as recoverable', () => {
|
|
106
|
+
const nonRecoverableErrors = [
|
|
107
|
+
'Network error',
|
|
108
|
+
'File not found',
|
|
109
|
+
'Out of memory',
|
|
110
|
+
'Invalid model format',
|
|
111
|
+
'ONNX runtime error'
|
|
112
|
+
];
|
|
113
|
+
|
|
114
|
+
for (const errorMsg of nonRecoverableErrors) {
|
|
115
|
+
const isRecoverable = errorMsg.includes('Protobuf parsing failed');
|
|
116
|
+
expect(isRecoverable).toBe(false);
|
|
117
|
+
}
|
|
118
|
+
});
|
|
119
|
+
});
|
|
120
|
+
|
|
121
|
+
describe('Recovery Flow', () => {
|
|
122
|
+
it('should simulate recovery sequence', async () => {
|
|
123
|
+
// Create corrupted cache
|
|
124
|
+
const cacheDir = join(testCacheBase, 'nomic-ai', 'nomic-embed-text-v1.5');
|
|
125
|
+
mkdirSync(join(cacheDir, 'onnx'), { recursive: true });
|
|
126
|
+
writeFileSync(join(cacheDir, 'onnx', 'model.onnx'), 'corrupted protobuf data');
|
|
127
|
+
|
|
128
|
+
let loadAttempts = 0;
|
|
129
|
+
let cacheCleared = false;
|
|
130
|
+
|
|
131
|
+
// Simulate the recovery flow
|
|
132
|
+
const mockLoadModel = async () => {
|
|
133
|
+
loadAttempts++;
|
|
134
|
+
if (loadAttempts === 1 && !cacheCleared) {
|
|
135
|
+
throw new Error('Load model from /path/model.onnx failed:Protobuf parsing failed.');
|
|
136
|
+
}
|
|
137
|
+
return { success: true };
|
|
138
|
+
};
|
|
139
|
+
|
|
140
|
+
const mockClearCache = () => {
|
|
141
|
+
if (existsSync(cacheDir)) {
|
|
142
|
+
rmSync(cacheDir, { recursive: true, force: true });
|
|
143
|
+
cacheCleared = true;
|
|
144
|
+
return true;
|
|
145
|
+
}
|
|
146
|
+
return false;
|
|
147
|
+
};
|
|
148
|
+
|
|
149
|
+
// First attempt should fail
|
|
150
|
+
let result;
|
|
151
|
+
try {
|
|
152
|
+
result = await mockLoadModel();
|
|
153
|
+
} catch (err) {
|
|
154
|
+
if (err.message.includes('Protobuf parsing failed')) {
|
|
155
|
+
// Clear cache and retry
|
|
156
|
+
mockClearCache();
|
|
157
|
+
result = await mockLoadModel();
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
expect(loadAttempts).toBe(2);
|
|
162
|
+
expect(cacheCleared).toBe(true);
|
|
163
|
+
expect(result.success).toBe(true);
|
|
164
|
+
expect(existsSync(cacheDir)).toBe(false);
|
|
165
|
+
});
|
|
166
|
+
|
|
167
|
+
it('should propagate non-recoverable errors', async () => {
|
|
168
|
+
const mockLoadModel = async () => {
|
|
169
|
+
throw new Error('Network connection failed');
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
await expect(async () => {
|
|
173
|
+
try {
|
|
174
|
+
await mockLoadModel();
|
|
175
|
+
} catch (err) {
|
|
176
|
+
if (err.message.includes('Protobuf parsing failed')) {
|
|
177
|
+
// Would clear cache and retry, but this error is different
|
|
178
|
+
}
|
|
179
|
+
throw err;
|
|
180
|
+
}
|
|
181
|
+
}).rejects.toThrow('Network connection failed');
|
|
182
|
+
});
|
|
183
|
+
|
|
184
|
+
it('should handle cache clear failure gracefully', async () => {
|
|
185
|
+
let loadAttempts = 0;
|
|
186
|
+
|
|
187
|
+
const mockLoadModel = async () => {
|
|
188
|
+
loadAttempts++;
|
|
189
|
+
throw new Error('Protobuf parsing failed');
|
|
190
|
+
};
|
|
191
|
+
|
|
192
|
+
const mockClearCache = () => {
|
|
193
|
+
// Simulate cache clear failure (e.g., permission denied)
|
|
194
|
+
return false;
|
|
195
|
+
};
|
|
196
|
+
|
|
197
|
+
await expect(async () => {
|
|
198
|
+
try {
|
|
199
|
+
await mockLoadModel();
|
|
200
|
+
} catch (err) {
|
|
201
|
+
if (err.message.includes('Protobuf parsing failed')) {
|
|
202
|
+
if (!mockClearCache()) {
|
|
203
|
+
// Cache clear failed, re-throw original error
|
|
204
|
+
throw err;
|
|
205
|
+
}
|
|
206
|
+
} else {
|
|
207
|
+
throw err;
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
}).rejects.toThrow('Protobuf parsing failed');
|
|
211
|
+
|
|
212
|
+
expect(loadAttempts).toBe(1);
|
|
213
|
+
});
|
|
214
|
+
});
|
|
215
|
+
|
|
216
|
+
describe('Path Resolution', () => {
|
|
217
|
+
it('should handle model names with organization prefix', () => {
|
|
218
|
+
const modelName = 'nomic-ai/nomic-embed-text-v1.5';
|
|
219
|
+
const parts = modelName.split('/');
|
|
220
|
+
|
|
221
|
+
expect(parts).toEqual(['nomic-ai', 'nomic-embed-text-v1.5']);
|
|
222
|
+
expect(parts.length).toBe(2);
|
|
223
|
+
});
|
|
224
|
+
|
|
225
|
+
it('should handle model names without organization prefix', () => {
|
|
226
|
+
const modelName = 'all-MiniLM-L6-v2';
|
|
227
|
+
const parts = modelName.split('/');
|
|
228
|
+
|
|
229
|
+
expect(parts).toEqual(['all-MiniLM-L6-v2']);
|
|
230
|
+
expect(parts.length).toBe(1);
|
|
231
|
+
});
|
|
232
|
+
|
|
233
|
+
it('should build correct cache paths', () => {
|
|
234
|
+
const baseDir = '/some/cache/path';
|
|
235
|
+
const modelName = 'nomic-ai/nomic-embed-text-v1.5';
|
|
236
|
+
|
|
237
|
+
const cachePath = join(baseDir, ...modelName.split('/'));
|
|
238
|
+
|
|
239
|
+
expect(cachePath).toBe('/some/cache/path/nomic-ai/nomic-embed-text-v1.5');
|
|
240
|
+
});
|
|
241
|
+
});
|
|
242
|
+
});
|