smart-coding-mcp 1.4.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +99 -31
- package/config.json +4 -1
- package/features/get-status.js +132 -0
- package/features/index-codebase.js +8 -0
- package/features/set-workspace.js +155 -0
- package/index.js +22 -3
- package/lib/ast-chunker.js +273 -0
- package/lib/config.js +40 -1
- package/lib/embedding-worker.js +29 -2
- package/lib/mrl-embedder.js +133 -0
- package/lib/tokenizer.js +4 -0
- package/package.json +5 -3
- package/test/ast-chunker.test.js +105 -0
- package/test/device-detection.test.js +110 -0
- package/test/embedding-model.test.js +14 -11
- package/test/helpers.js +3 -3
- package/test/mrl-embedder.test.js +108 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AST-based Code Chunker
|
|
3
|
+
*
|
|
4
|
+
* Uses Tree-sitter to parse code and chunk at semantic boundaries
|
|
5
|
+
* (functions, classes, methods) instead of arbitrary line splits.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import Parser from 'web-tree-sitter';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
import fs from 'fs/promises';
|
|
11
|
+
import { fileURLToPath } from 'url';
|
|
12
|
+
import { smartChunk } from './utils.js'; // Fallback
|
|
13
|
+
|
|
14
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
15
|
+
|
|
16
|
+
// Mapping of file extensions to Tree-sitter language names
const LANGUAGE_MAP = {
  // JavaScript family — all variants share one grammar
  js: 'javascript',
  mjs: 'javascript',
  cjs: 'javascript',
  jsx: 'javascript',
  // TypeScript (tsx uses the same grammar name here)
  ts: 'typescript',
  tsx: 'typescript',
  // Other supported languages
  py: 'python',
  go: 'go',
  rs: 'rust',
  rb: 'ruby',
  java: 'java',
  // C / C++ — bare .h headers are treated as C, .hpp as C++
  c: 'c',
  cpp: 'cpp',
  h: 'c',
  hpp: 'cpp'
};
|
34
|
+
|
|
35
|
+
// Node types that represent semantic boundaries.
// Keyed by Tree-sitter language name; values are AST node types at which
// a new chunk should start (functions, classes, methods, exports).
const SEMANTIC_NODES = {
  javascript: ['function_declaration', 'arrow_function', 'class_declaration', 'method_definition', 'export_statement'],
  typescript: ['function_declaration', 'arrow_function', 'class_declaration', 'method_definition', 'export_statement'],
  python: ['function_definition', 'class_definition', 'decorated_definition'],
  go: ['function_declaration', 'method_declaration', 'type_declaration'],
  rust: ['function_item', 'impl_item', 'struct_item', 'enum_item'],
  ruby: ['method', 'class', 'module'],
  java: ['method_declaration', 'class_declaration', 'interface_declaration'],
  c: ['function_definition', 'struct_specifier'],
  cpp: ['function_definition', 'class_specifier', 'struct_specifier']
};
|
47
|
+
|
|
48
|
+
export class ASTChunker {
  /**
   * @param {object} config - Loaded config; this class reads `chunkSize`
   *   (target chunk size in "tokens") and `verbose` (extra logging).
   */
  constructor(config) {
    this.config = config;
    this.parser = null;          // Tree-sitter Parser instance, created lazily in init()
    this.languages = new Map();  // langName -> loaded grammar (cache)
    this.initialized = false;
  }

  /**
   * Initialize the Tree-sitter WASM runtime and create the parser.
   * Idempotent. Rethrows on failure so callers can fall back to smart chunking.
   */
  async init() {
    if (this.initialized) return;

    try {
      await Parser.init();
      this.parser = new Parser();
      this.initialized = true;
      console.error('[AST] Tree-sitter parser initialized');
    } catch (error) {
      console.error('[AST] Failed to initialize Tree-sitter:', error.message);
      throw error;
    }
  }

  /**
   * Load (and cache) the grammar WASM for a language.
   *
   * @param {string} langName - Tree-sitter language name (see LANGUAGE_MAP values).
   * @returns {Promise<object|null>} The loaded grammar, or null if no WASM was
   *   found or loading failed (caller falls back to smart chunking).
   */
  async loadLanguage(langName) {
    if (this.languages.has(langName)) {
      return this.languages.get(langName);
    }

    try {
      // Candidate locations for the grammar WASM, checked in order:
      // per-language package, the tree-sitter-wasms bundle, then a local dir.
      const possiblePaths = [
        path.join(__dirname, '..', 'node_modules', `tree-sitter-${langName}`, `tree-sitter-${langName}.wasm`),
        path.join(__dirname, '..', 'node_modules', 'tree-sitter-wasms', 'out', `tree-sitter-${langName}.wasm`),
        path.join(__dirname, '..', 'grammars', `tree-sitter-${langName}.wasm`)
      ];

      for (const wasmPath of possiblePaths) {
        try {
          await fs.access(wasmPath);
          const language = await Parser.Language.load(wasmPath);
          this.languages.set(langName, language);
          if (this.config.verbose) {
            console.error(`[AST] Loaded ${langName} grammar from ${wasmPath}`);
          }
          return language;
        } catch {
          continue; // this candidate missing/unloadable — try the next path
        }
      }

      console.error(`[AST] No grammar found for ${langName}`);
      return null;
    } catch (error) {
      console.error(`[AST] Failed to load ${langName}:`, error.message);
      return null;
    }
  }

  /**
   * Map a filename to a Tree-sitter language name via its extension.
   * @returns {string|null} Language name, or null when unsupported.
   */
  getLanguageForFile(file) {
    const ext = path.extname(file).slice(1).toLowerCase();
    return LANGUAGE_MAP[ext] || null;
  }

  /**
   * Chunk code at semantic boundaries (functions, classes, methods).
   * Falls back to smartChunk whenever AST parsing is unavailable, the
   * language is unsupported, or parsing yields no semantic chunks.
   *
   * @param {string} content - File contents.
   * @param {string} file - File path (used for language detection and logs).
   * @returns {Promise<Array<{text:string,startLine:number,endLine:number,nodeType?:string}>>}
   */
  async chunk(content, file) {
    if (!this.initialized) {
      await this.init();
    }

    const langName = this.getLanguageForFile(file);

    // Fall back to smart chunking if language not supported
    if (!langName) {
      if (this.config.verbose) {
        console.error(`[AST] No AST support for ${path.extname(file)}, using smart chunking`);
      }
      return smartChunk(content, file, this.config);
    }

    const language = await this.loadLanguage(langName);

    // Fall back if grammar not available
    if (!language) {
      return smartChunk(content, file, this.config);
    }

    try {
      this.parser.setLanguage(language);
      const tree = this.parser.parse(content);
      const chunks = [];
      const lines = content.split('\n');
      const semanticNodes = SEMANTIC_NODES[langName] || [];

      // FIX: default chunkSize the same way splitLargeNode does. Previously
      // an unset chunkSize made targetTokens NaN, so the size check below was
      // always false and oversized nodes were never split.
      const chunkSize = this.config.chunkSize || 25;
      const targetTokens = chunkSize * 4; // rough chars-per-token estimate

      // Walk the AST and extract semantic chunks
      this.walkTree(tree.rootNode, (node) => {
        if (!semanticNodes.includes(node.type)) return;

        const startLine = node.startPosition.row;
        const endLine = node.endPosition.row;

        // Skip very small nodes (< 3 lines)
        if (endLine - startLine < 2) return;

        const chunkLines = lines.slice(startLine, endLine + 1);
        const text = chunkLines.join('\n');

        if (text.length > targetTokens * 4) {
          // Node too large for one chunk — split it into fixed-size pieces.
          this.splitLargeNode(node, lines, chunks);
        } else {
          chunks.push({
            text,
            startLine: startLine + 1, // 1-indexed
            endLine: endLine + 1,
            nodeType: node.type
          });
        }
      });

      // If no semantic chunks found, fall back to smart chunking
      if (chunks.length === 0) {
        return smartChunk(content, file, this.config);
      }

      // Sort by start line, then merge overlaps from nested AST nodes.
      chunks.sort((a, b) => a.startLine - b.startLine);
      return this.mergeAndCleanChunks(chunks, lines);
    } catch (error) {
      console.error(`[AST] Parse error for ${file}:`, error.message);
      return smartChunk(content, file, this.config);
    }
  }

  /**
   * Depth-first walk of the AST, invoking `callback` for every node.
   */
  walkTree(node, callback) {
    callback(node);
    for (let i = 0; i < node.childCount; i++) {
      this.walkTree(node.child(i), callback);
    }
  }

  /**
   * Split an oversized AST node into fixed-size line windows.
   * Each piece is tagged with the node type plus a "_part" suffix.
   */
  splitLargeNode(node, lines, chunks) {
    const chunkSize = this.config.chunkSize || 25;
    const startLine = node.startPosition.row;
    const endLine = node.endPosition.row;

    for (let i = startLine; i <= endLine; i += chunkSize) {
      const chunkEnd = Math.min(i + chunkSize - 1, endLine);
      const chunkLines = lines.slice(i, chunkEnd + 1);

      chunks.push({
        text: chunkLines.join('\n'),
        startLine: i + 1, // 1-indexed
        endLine: chunkEnd + 1,
        nodeType: node.type + '_part'
      });
    }
  }

  /**
   * Remove/merge overlapping chunks (nested AST nodes — e.g. a method inside
   * a class — produce overlapping spans). Input must be sorted by startLine.
   * (Removed an unused `minSize` local; no small-chunk merging is performed.)
   */
  mergeAndCleanChunks(chunks, lines) {
    const cleaned = [];

    for (const chunk of chunks) {
      const prev = cleaned[cleaned.length - 1];

      if (prev && chunk.startLine <= prev.endLine) {
        // Overlaps the previous chunk: extend it instead of adding a duplicate.
        if (chunk.endLine > prev.endLine) {
          prev.endLine = chunk.endLine;
          // startLine/endLine are 1-indexed, so this slice is inclusive.
          prev.text = lines.slice(prev.startLine - 1, prev.endLine).join('\n');
        }
        continue;
      }

      cleaned.push(chunk);
    }

    return cleaned;
  }
}
|
258
|
+
|
|
259
|
+
/**
 * Factory: pick a chunker implementation from config.
 * Returns an ASTChunker when `config.chunkingMode === 'ast'`; otherwise a
 * lightweight object exposing the same async `chunk(content, file)` contract
 * backed by smartChunk.
 */
export function getChunker(config) {
  if (config.chunkingMode !== 'ast') {
    // Wrapper with the same interface as ASTChunker.chunk.
    return {
      chunk: async (content, file) => smartChunk(content, file, config)
    };
  }

  return new ASTChunker(config);
}
package/lib/config.js
CHANGED
|
@@ -62,7 +62,10 @@ const DEFAULT_CONFIG = {
|
|
|
62
62
|
watchFiles: false,
|
|
63
63
|
verbose: false,
|
|
64
64
|
workerThreads: "auto", // "auto" = CPU cores - 1, or set a number
|
|
65
|
-
embeddingModel: "
|
|
65
|
+
embeddingModel: "nomic-ai/nomic-embed-text-v1.5",
|
|
66
|
+
embeddingDimension: 256, // MRL dimension: 64, 128, 256, 512, 768
|
|
67
|
+
device: "auto", // "cpu", "webgpu", or "auto"
|
|
68
|
+
chunkingMode: "smart", // "smart", "ast", or "line"
|
|
66
69
|
semanticWeight: 0.7,
|
|
67
70
|
exactMatchBoost: 1.5,
|
|
68
71
|
smartIndexing: true
|
|
@@ -237,6 +240,42 @@ export async function loadConfig(workspaceDir = null) {
|
|
|
237
240
|
}
|
|
238
241
|
}
|
|
239
242
|
|
|
243
|
+
// MRL embedding dimension
|
|
244
|
+
if (process.env.SMART_CODING_EMBEDDING_DIMENSION !== undefined) {
|
|
245
|
+
const value = parseInt(process.env.SMART_CODING_EMBEDDING_DIMENSION, 10);
|
|
246
|
+
const validDims = [64, 128, 256, 512, 768];
|
|
247
|
+
if (validDims.includes(value)) {
|
|
248
|
+
config.embeddingDimension = value;
|
|
249
|
+
console.error(`[Config] Using embedding dimension: ${value}`);
|
|
250
|
+
} else {
|
|
251
|
+
console.error(`[Config] Invalid SMART_CODING_EMBEDDING_DIMENSION: ${value}, using default (must be 64, 128, 256, 512, or 768)`);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
// Device selection
|
|
256
|
+
if (process.env.SMART_CODING_DEVICE !== undefined) {
|
|
257
|
+
const value = process.env.SMART_CODING_DEVICE.trim().toLowerCase();
|
|
258
|
+
const validDevices = ['cpu', 'webgpu', 'auto'];
|
|
259
|
+
if (validDevices.includes(value)) {
|
|
260
|
+
config.device = value;
|
|
261
|
+
console.error(`[Config] Using device: ${value}`);
|
|
262
|
+
} else {
|
|
263
|
+
console.error(`[Config] Invalid SMART_CODING_DEVICE: ${value}, using default (must be 'cpu', 'webgpu', or 'auto')`);
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// Chunking mode
|
|
268
|
+
if (process.env.SMART_CODING_CHUNKING_MODE !== undefined) {
|
|
269
|
+
const value = process.env.SMART_CODING_CHUNKING_MODE.trim().toLowerCase();
|
|
270
|
+
const validModes = ['smart', 'ast', 'line'];
|
|
271
|
+
if (validModes.includes(value)) {
|
|
272
|
+
config.chunkingMode = value;
|
|
273
|
+
console.error(`[Config] Using chunking mode: ${value}`);
|
|
274
|
+
} else {
|
|
275
|
+
console.error(`[Config] Invalid SMART_CODING_CHUNKING_MODE: ${value}, using default (must be 'smart', 'ast', or 'line')`);
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
|
|
240
279
|
return config;
|
|
241
280
|
}
|
|
242
281
|
|
package/lib/embedding-worker.js
CHANGED
|
@@ -1,12 +1,38 @@
|
|
|
1
1
|
import { parentPort, workerData } from "worker_threads";
|
|
2
|
-
import { pipeline } from "@
|
|
2
|
+
import { pipeline, layer_norm } from "@huggingface/transformers";
|
|
3
3
|
|
|
4
4
|
let embedder = null;
|
|
5
|
+
const VALID_DIMENSIONS = [64, 128, 256, 512, 768];
|
|
5
6
|
|
|
6
7
|
// Initialize the embedding model once when the worker starts; subsequent
// calls return the cached embedder.
async function initializeEmbedder() {
  if (embedder) return embedder;

  const modelName = workerData.embeddingModel || 'nomic-ai/nomic-embed-text-v1.5';
  const requested = workerData.embeddingDimension || 256;
  // Fall back to 256 if the configured dimension is not a valid MRL size.
  const targetDim = VALID_DIMENSIONS.includes(requested) ? requested : 256;

  const extractor = await pipeline("feature-extraction", modelName);

  if (modelName.includes('nomic')) {
    // MRL path: mean-pool, layer-norm, slice to targetDim, then re-normalize.
    embedder = async (text, options = {}) => {
      const raw = await extractor(text, { pooling: 'mean' });
      const sliced = layer_norm(raw, [raw.dims[1]])
        .slice(null, [0, targetDim])
        .normalize(2, -1);
      return { data: sliced.data };
    };
    embedder.dimension = targetDim;
  } else {
    // Legacy path (MiniLM etc.): plain mean-pooled, normalized output.
    embedder = async (text, options = {}) =>
      extractor(text, { pooling: 'mean', normalize: true });
    embedder.dimension = 384;
  }

  embedder.modelName = modelName;
  return embedder;
}
|
|
@@ -65,3 +91,4 @@ initializeEmbedder().then(() => {
|
|
|
65
91
|
}).catch((error) => {
|
|
66
92
|
parentPort.postMessage({ type: "error", error: error.message });
|
|
67
93
|
});
|
|
94
|
+
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MRL (Matryoshka Representation Learning) Embedder
|
|
3
|
+
*
|
|
4
|
+
* Provides flexible embedding dimensions (64, 128, 256, 512, 768) using
|
|
5
|
+
* nomic-embed-text-v1.5 with layer normalization and dimension slicing.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { pipeline, layer_norm } from '@huggingface/transformers';
|
|
9
|
+
|
|
10
|
+
// Valid MRL dimensions for nomic-embed-text-v1.5.
// Requests for any other size fall back to 256 (see createMRLEmbedder).
const VALID_DIMENSIONS = [64, 128, 256, 512, 768];
|
|
12
|
+
|
|
13
|
+
/**
 * Create an MRL-enabled embedder with configurable output dimensions.
 *
 * @param {string} modelName - Model identifier (e.g., 'nomic-ai/nomic-embed-text-v1.5')
 * @param {object} options - Configuration options
 * @param {number} options.dimension - Target embedding dimension (64, 128, 256, 512, 768); default 256
 * @param {string} options.device - Device to use ('cpu', 'webgpu', 'auto'); default 'cpu'
 * @returns {Promise<Function>} Embedder function compatible with the existing codebase
 *   (returns `{ data, dims }`; carries `.modelName`, `.dimension`, `.device` metadata)
 */
export async function createMRLEmbedder(modelName, options = {}) {
  const dimension = options.dimension || 256;
  const device = options.device || 'cpu';

  // Validate once and fall back to 256 for unsupported dimensions.
  // (Previously VALID_DIMENSIONS.includes() was evaluated twice.)
  const targetDim = VALID_DIMENSIONS.includes(dimension) ? dimension : 256;
  if (targetDim !== dimension) {
    console.error(`[MRL] Invalid dimension ${dimension}, using 256. Valid: ${VALID_DIMENSIONS.join(', ')}`);
  }

  console.error(`[MRL] Loading ${modelName} (output: ${targetDim}d, device: ${device})`);

  // Resolve 'auto' to a concrete device before building the pipeline.
  const finalDevice = device === 'auto' ? detectBestDevice() : device;

  const pipelineOptions = {};
  if (finalDevice === 'webgpu') {
    pipelineOptions.device = 'webgpu';
  }

  const extractor = await pipeline('feature-extraction', modelName, pipelineOptions);

  console.error(`[MRL] Model loaded on ${finalDevice}`);

  /**
   * Embed text with MRL dimension slicing: mean-pool, layer_norm over the
   * full hidden size, slice to targetDim, then L2-normalize.
   * Returns `{ data, dims }` so callers can keep reading `.data`.
   */
  async function embed(text) {
    let embeddings = await extractor(text, { pooling: 'mean' });

    embeddings = layer_norm(embeddings, [embeddings.dims[1]])
      .slice(null, [0, targetDim])
      .normalize(2, -1);

    return {
      data: embeddings.data,
      dims: [embeddings.dims[0], targetDim]
    };
  }

  // Metadata consumed by status reporting / callers.
  embed.modelName = modelName;
  embed.dimension = targetDim;
  embed.device = finalDevice;

  return embed;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Pick the best available inference device.
 * Reports 'webgpu' only when a WebGPU-capable `navigator` exists (browser
 * environment); Node.js always gets 'cpu' for now, since WebGPU there sits
 * behind an experimental flag.
 */
function detectBestDevice() {
  const hasWebGPU = typeof navigator !== 'undefined' && Boolean(navigator.gpu);
  return hasWebGPU ? 'webgpu' : 'cpu';
}
|
|
90
|
+
|
|
91
|
+
/**
 * Build a legacy-compatible embedder (384d, MiniLM family).
 * Used as the fallback when the MRL model fails to load.
 *
 * @param {string} modelName - Model identifier; defaults to Xenova/all-MiniLM-L6-v2.
 * @returns {Promise<Function>} Embedder with `.modelName`, `.dimension`, `.device` metadata.
 */
export async function createLegacyEmbedder(modelName = 'Xenova/all-MiniLM-L6-v2') {
  console.error(`[Embedder] Loading legacy model: ${modelName}`);
  const extractor = await pipeline('feature-extraction', modelName);

  // Plain mean-pooled, normalized output — no MRL slicing.
  const embed = async (text, options = {}) =>
    extractor(text, { pooling: 'mean', normalize: true });

  embed.modelName = modelName;
  embed.dimension = 384;
  embed.device = 'cpu';

  return embed;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Smart embedder factory.
 * Nomic models take the MRL path (with a legacy fallback if loading fails);
 * every other model (MiniLM etc.) goes straight to the legacy loader.
 */
export async function createEmbedder(config) {
  const model = config.embeddingModel || 'nomic-ai/nomic-embed-text-v1.5';

  if (!model.includes('nomic')) {
    // Non-MRL model — legacy loader handles it directly.
    return await createLegacyEmbedder(model);
  }

  try {
    return await createMRLEmbedder(model, {
      dimension: config.embeddingDimension || 256,
      device: config.device || 'cpu'
    });
  } catch (err) {
    console.error(`[Embedder] MRL model failed: ${err.message}, falling back to legacy`);
    return await createLegacyEmbedder();
  }
}
|
|
132
|
+
|
|
133
|
+
export { VALID_DIMENSIONS };
|
package/lib/tokenizer.js
CHANGED
|
@@ -10,6 +10,10 @@
|
|
|
10
10
|
* Each model has its own maximum sequence length
|
|
11
11
|
*/
|
|
12
12
|
export const MODEL_TOKEN_LIMITS = {
|
|
13
|
+
// MRL / Nomic models (longer context)
|
|
14
|
+
"nomic-ai/nomic-embed-text-v1.5": 8192,
|
|
15
|
+
"nomic-ai/nomic-embed-text-v1": 2048,
|
|
16
|
+
|
|
13
17
|
// Sentence Transformers / MiniLM family
|
|
14
18
|
"Xenova/all-MiniLM-L6-v2": 256,
|
|
15
19
|
"Xenova/all-MiniLM-L12-v2": 256,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "smart-coding-mcp",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"description": "An extensible MCP server that enhances coding productivity with AI-powered features including semantic code search, intelligent indexing, and more, using local LLMs",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -45,11 +45,13 @@
|
|
|
45
45
|
"homepage": "https://github.com/omar-haris/smart-coding-mcp#readme",
|
|
46
46
|
"license": "MIT",
|
|
47
47
|
"dependencies": {
|
|
48
|
+
"@huggingface/transformers": "^3.8.1",
|
|
48
49
|
"@modelcontextprotocol/sdk": "^1.0.4",
|
|
49
|
-
"@xenova/transformers": "^2.17.2",
|
|
50
50
|
"chokidar": "^3.5.3",
|
|
51
|
+
"fastembed": "^2.1.0",
|
|
51
52
|
"fdir": "^6.5.0",
|
|
52
|
-
"glob": "^10.3.10"
|
|
53
|
+
"glob": "^10.3.10",
|
|
54
|
+
"web-tree-sitter": "^0.24.6"
|
|
53
55
|
},
|
|
54
56
|
"engines": {
|
|
55
57
|
"node": ">=18.0.0"
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for AST Chunker
|
|
3
|
+
*
|
|
4
|
+
* Tests the AST-based code chunking functionality:
|
|
5
|
+
* - Tree-sitter initialization
|
|
6
|
+
* - Language detection
|
|
7
|
+
* - Semantic chunking vs smart chunking fallback
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { describe, it, expect, beforeAll } from 'vitest';
|
|
11
|
+
import { ASTChunker, getChunker } from '../lib/ast-chunker.js';
|
|
12
|
+
import { loadConfig } from '../lib/config.js';
|
|
13
|
+
|
|
14
|
+
describe('AST Chunker', () => {
  let config;

  beforeAll(async () => {
    config = await loadConfig();
  });

  describe('Chunker Factory', () => {
    it('should return AST chunker when mode is ast', () => {
      expect(getChunker({ ...config, chunkingMode: 'ast' })).toBeInstanceOf(ASTChunker);
    });

    it('should return smart chunker wrapper when mode is smart', () => {
      const wrapper = getChunker({ ...config, chunkingMode: 'smart' });
      expect(typeof wrapper.chunk).toBe('function');
      expect(wrapper).not.toBeInstanceOf(ASTChunker);
    });
  });

  describe('Language Detection', () => {
    it('should detect JavaScript files', () => {
      const detector = new ASTChunker(config);
      // All JS variants map to the shared 'javascript' grammar.
      for (const name of ['test.js', 'test.mjs', 'test.jsx']) {
        expect(detector.getLanguageForFile(name)).toBe('javascript');
      }
    });

    it('should detect TypeScript files', () => {
      const detector = new ASTChunker(config);
      expect(detector.getLanguageForFile('test.ts')).toBe('typescript');
      expect(detector.getLanguageForFile('test.tsx')).toBe('typescript');
    });

    it('should detect Python files', () => {
      expect(new ASTChunker(config).getLanguageForFile('test.py')).toBe('python');
    });

    it('should return null for unsupported files', () => {
      const detector = new ASTChunker(config);
      expect(detector.getLanguageForFile('test.sql')).toBeNull();
      expect(detector.getLanguageForFile('test.md')).toBeNull();
    });
  });

  describe('Fallback Behavior', () => {
    it('should fall back to smart chunking for unsupported languages', async () => {
      const detector = new ASTChunker(config);
      const chunks = await detector.chunk('SELECT * FROM users WHERE id = 1;', 'query.sql');
      expect(Array.isArray(chunks)).toBe(true);
    });

    it('should handle empty content', async () => {
      const chunks = await new ASTChunker(config).chunk('', 'empty.js');
      expect(Array.isArray(chunks)).toBe(true);
    });
  });

  describe('JavaScript Chunking', () => {
    it('should chunk JavaScript functions', async () => {
      // Fixture with two functions and a class — candidates for semantic chunks.
      const jsCode = [
        '',
        'function add(a, b) {',
        '  return a + b;',
        '}',
        '',
        'function multiply(a, b) {',
        '  return a * b;',
        '}',
        '',
        'class Calculator {',
        '  constructor() {',
        '    this.result = 0;',
        '  }',
        '',
        '  add(n) {',
        '    this.result += n;',
        '    return this;',
        '  }',
        '}',
        ''
      ].join('\n');

      const chunks = await new ASTChunker(config).chunk(jsCode, 'calc.js');
      expect(Array.isArray(chunks)).toBe(true);
      // Exact chunk count depends on Tree-sitter grammar availability.
    });
  });
});
|