semantic-code-mcp 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +259 -0
- package/config.json +85 -0
- package/features/check-last-version.js +504 -0
- package/features/clear-cache.js +75 -0
- package/features/get-status.js +210 -0
- package/features/hybrid-search.js +189 -0
- package/features/index-codebase.js +999 -0
- package/features/set-workspace.js +183 -0
- package/index.js +297 -0
- package/lib/ast-chunker.js +273 -0
- package/lib/cache-factory.js +13 -0
- package/lib/cache.js +157 -0
- package/lib/config.js +1296 -0
- package/lib/embedding-worker.js +155 -0
- package/lib/gemini-embedder.js +351 -0
- package/lib/ignore-patterns.js +896 -0
- package/lib/milvus-cache.js +478 -0
- package/lib/mrl-embedder.js +235 -0
- package/lib/project-detector.js +75 -0
- package/lib/resource-throttle.js +85 -0
- package/lib/sqlite-cache.js +468 -0
- package/lib/tokenizer.js +149 -0
- package/lib/utils.js +214 -0
- package/package.json +70 -0
- package/reindex.js +109 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AST-based Code Chunker
|
|
3
|
+
*
|
|
4
|
+
* Uses Tree-sitter to parse code and chunk at semantic boundaries
|
|
5
|
+
* (functions, classes, methods) instead of arbitrary line splits.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import Parser from 'web-tree-sitter';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
import fs from 'fs/promises';
|
|
11
|
+
import { fileURLToPath } from 'url';
|
|
12
|
+
import { smartChunk } from './utils.js'; // Fallback
|
|
13
|
+
|
|
14
|
+
// ESM modules have no __dirname global; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
15
|
+
|
|
16
|
+
// File extension (lowercase, no dot) -> Tree-sitter grammar name.
const LANGUAGE_MAP = {
  // C family
  c: 'c',
  h: 'c',
  cpp: 'cpp',
  hpp: 'cpp',
  // JavaScript variants
  cjs: 'javascript',
  js: 'javascript',
  jsx: 'javascript',
  mjs: 'javascript',
  // TypeScript variants
  ts: 'typescript',
  tsx: 'typescript',
  // Other supported languages
  go: 'go',
  java: 'java',
  py: 'python',
  rb: 'ruby',
  rs: 'rust'
};
|
|
34
|
+
|
|
35
|
+
// Tree-sitter node types treated as chunk boundaries, keyed by grammar name.
const SEMANTIC_NODES = {
  c: ['function_definition', 'struct_specifier'],
  cpp: ['function_definition', 'class_specifier', 'struct_specifier'],
  go: ['function_declaration', 'method_declaration', 'type_declaration'],
  java: ['method_declaration', 'class_declaration', 'interface_declaration'],
  javascript: [
    'function_declaration',
    'arrow_function',
    'class_declaration',
    'method_definition',
    'export_statement'
  ],
  typescript: [
    'function_declaration',
    'arrow_function',
    'class_declaration',
    'method_definition',
    'export_statement'
  ],
  python: ['function_definition', 'class_definition', 'decorated_definition'],
  ruby: ['method', 'class', 'module'],
  rust: ['function_item', 'impl_item', 'struct_item', 'enum_item']
};
|
|
47
|
+
|
|
48
|
+
export class ASTChunker {
  /**
   * Chunker that splits source files at semantic boundaries (functions,
   * classes, methods) found via Tree-sitter, falling back to line-based
   * smartChunk() whenever AST parsing is unavailable.
   *
   * @param {object} config - Indexer configuration. Reads `chunkSize`,
   *   `verbose`, and whatever smartChunk() consumes when falling back.
   */
  constructor(config) {
    this.config = config;
    this.parser = null;          // created lazily in init()
    this.languages = new Map();  // langName -> loaded Tree-sitter Language
    this.initialized = false;
  }

  /**
   * Initialize the Tree-sitter WASM runtime and create the shared parser.
   * Idempotent: subsequent calls are no-ops.
   * @throws {Error} If the Tree-sitter runtime fails to start.
   */
  async init() {
    if (this.initialized) return;

    try {
      await Parser.init();
      this.parser = new Parser();
      this.initialized = true;
      console.error('[AST] Tree-sitter parser initialized');
    } catch (error) {
      console.error('[AST] Failed to initialize Tree-sitter:', error.message);
      throw error;
    }
  }

  /**
   * Load (and memoize) the grammar for a language, probing known WASM
   * locations in priority order.
   * @param {string} langName - Tree-sitter language name (e.g. 'python').
   * @returns {Promise<object|null>} The loaded Language, or null when no
   *   grammar could be found/loaded (callers then fall back to smartChunk).
   */
  async loadLanguage(langName) {
    if (this.languages.has(langName)) {
      return this.languages.get(langName);
    }

    try {
      // Candidate WASM locations, most specific first.
      const possiblePaths = [
        path.join(__dirname, '..', 'node_modules', `tree-sitter-${langName}`, `tree-sitter-${langName}.wasm`),
        path.join(__dirname, '..', 'node_modules', 'tree-sitter-wasms', 'out', `tree-sitter-${langName}.wasm`),
        path.join(__dirname, '..', 'grammars', `tree-sitter-${langName}.wasm`)
      ];

      for (const wasmPath of possiblePaths) {
        try {
          await fs.access(wasmPath);
          const language = await Parser.Language.load(wasmPath);
          this.languages.set(langName, language);
          if (this.config.verbose) {
            console.error(`[AST] Loaded ${langName} grammar from ${wasmPath}`);
          }
          return language;
        } catch {
          // Best-effort: a missing file and a failed load both just move on
          // to the next candidate path.
          continue;
        }
      }

      console.error(`[AST] No grammar found for ${langName}`);
      return null;
    } catch (error) {
      console.error(`[AST] Failed to load ${langName}:`, error.message);
      return null;
    }
  }

  /**
   * Map a file path to its Tree-sitter language name via its extension.
   * @param {string} file - File path.
   * @returns {string|null} Language name, or null when unsupported.
   */
  getLanguageForFile(file) {
    const ext = path.extname(file).slice(1).toLowerCase();
    return LANGUAGE_MAP[ext] || null;
  }

  /**
   * Chunk source text at semantic boundaries. Falls back to smartChunk()
   * when the language is unsupported, the grammar is missing, the parse
   * throws, or no semantic nodes are found.
   * @param {string} content - Full file contents.
   * @param {string} file - Path used for language detection and logging.
   * @returns {Promise<Array<{text: string, startLine: number, endLine: number, nodeType?: string}>>}
   *   Chunks with 1-indexed, inclusive line ranges.
   */
  async chunk(content, file) {
    if (!this.initialized) {
      await this.init();
    }

    const langName = this.getLanguageForFile(file);

    if (!langName) {
      if (this.config.verbose) {
        console.error(`[AST] No AST support for ${path.extname(file)}, using smart chunking`);
      }
      return smartChunk(content, file, this.config);
    }

    const language = await this.loadLanguage(langName);

    if (!language) {
      return smartChunk(content, file, this.config);
    }

    let tree = null;
    try {
      this.parser.setLanguage(language);
      tree = this.parser.parse(content);
      const chunks = [];
      const lines = content.split('\n');
      const semanticNodes = SEMANTIC_NODES[langName] || [];

      // FIX: default chunkSize like splitLargeNode does. Previously an
      // unset chunkSize made the size budget NaN, so the comparison below
      // was always false and large nodes were never split.
      const chunkSize = this.config.chunkSize || 25;

      // Walk the AST and extract semantic chunks.
      this.walkTree(tree.rootNode, (node) => {
        if (semanticNodes.includes(node.type)) {
          const startLine = node.startPosition.row;
          const endLine = node.endPosition.row;

          // Skip very small nodes (< 3 lines).
          if (endLine - startLine < 2) return;

          const text = lines.slice(startLine, endLine + 1).join('\n');

          // Rough size gate: ~4 tokens per chunkSize unit, ~4 chars/token.
          const targetTokens = chunkSize * 4;
          if (text.length > targetTokens * 4) {
            // Too big for a single chunk: split by fixed line windows.
            this.splitLargeNode(node, lines, chunks);
          } else {
            chunks.push({
              text,
              startLine: startLine + 1, // convert 0-indexed rows to 1-indexed
              endLine: endLine + 1,
              nodeType: node.type
            });
          }
        }
      });

      // No semantic boundaries found (e.g. flat script): fall back.
      if (chunks.length === 0) {
        return smartChunk(content, file, this.config);
      }

      chunks.sort((a, b) => a.startLine - b.startLine);

      // Merge overlaps (nested/exported definitions produce duplicates).
      return this.mergeAndCleanChunks(chunks, lines);

    } catch (error) {
      console.error(`[AST] Parse error for ${file}:`, error.message);
      return smartChunk(content, file, this.config);
    } finally {
      // FIX: web-tree-sitter trees live on the WASM heap and are not
      // reclaimed by the JS GC; free them explicitly to avoid leaking
      // memory on every parsed file.
      tree?.delete?.();
    }
  }

  /**
   * Depth-first pre-order walk of the AST, invoking `callback` on every node.
   * NOTE(review): recursive — assumes tree depth stays well under the call
   * stack limit, which holds for typical source files.
   */
  walkTree(node, callback) {
    callback(node);
    for (let i = 0; i < node.childCount; i++) {
      this.walkTree(node.child(i), callback);
    }
  }

  /**
   * Split an oversized AST node into fixed-size line windows, pushing the
   * pieces onto `chunks` with 1-indexed inclusive ranges and a nodeType
   * suffixed with '_part'.
   * NOTE(review): here `chunkSize` is interpreted as a line count, while
   * chunk() scales it into a character budget — confirm the intended units.
   */
  splitLargeNode(node, lines, chunks) {
    const chunkSize = this.config.chunkSize || 25;
    const startLine = node.startPosition.row;
    const endLine = node.endPosition.row;

    for (let i = startLine; i <= endLine; i += chunkSize) {
      const chunkEnd = Math.min(i + chunkSize - 1, endLine);
      const chunkLines = lines.slice(i, chunkEnd + 1);

      chunks.push({
        text: chunkLines.join('\n'),
        startLine: i + 1,
        endLine: chunkEnd + 1,
        nodeType: node.type + '_part'
      });
    }
  }

  /**
   * Collapse overlapping chunks into a clean, non-overlapping list.
   * An overlapping chunk either extends its predecessor (when it reaches
   * further) or is absorbed by it entirely.
   * @param {Array} chunks - Chunks already sorted by startLine (1-indexed).
   * @param {string[]} lines - File split into lines, for re-extracting text.
   * @returns {Array} Non-overlapping chunks.
   */
  mergeAndCleanChunks(chunks, lines) {
    const cleaned = [];

    for (const chunk of chunks) {
      if (cleaned.length > 0) {
        const prev = cleaned[cleaned.length - 1];
        if (chunk.startLine <= prev.endLine) {
          // Overlap: grow the previous chunk if this one extends further.
          if (chunk.endLine > prev.endLine) {
            prev.endLine = chunk.endLine;
            prev.text = lines.slice(prev.startLine - 1, prev.endLine).join('\n');
          }
          continue;
        }
      }

      cleaned.push(chunk);
    }

    return cleaned;
  }
}
|
|
258
|
+
|
|
259
|
+
/**
 * Pick a chunker implementation from the configuration.
 *
 * Returns an ASTChunker when `config.chunkingMode` is 'ast'; otherwise a
 * lightweight adapter exposing the same `chunk(content, file)` interface
 * backed by smartChunk().
 */
export function getChunker(config) {
  const useAst = config.chunkingMode === 'ast';

  if (useAst) {
    return new ASTChunker(config);
  }

  // Line-based fallback wrapped in the common chunker interface.
  const fallback = {
    chunk: async (content, file) => smartChunk(content, file, config)
  };
  return fallback;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { SQLiteCache } from "./sqlite-cache.js";
|
|
2
|
+
import { MilvusCache } from "./milvus-cache.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Build the vector cache backend selected by `config.vectorStoreProvider`.
 * Selection lives here so the index and search paths stay consistent.
 * Defaults to SQLite when the provider is unset or unrecognized.
 */
export function createCache(config) {
  // Normalize so "Milvus"/"MILVUS" select the same backend.
  const provider = (config?.vectorStoreProvider || "sqlite").toLowerCase();

  switch (provider) {
    case "milvus":
      return new MilvusCache(config);
    default:
      return new SQLiteCache(config);
  }
}
|
package/lib/cache.js
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import fs from "fs/promises";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import { cosineSimilarity } from "./utils.js";
|
|
4
|
+
|
|
5
|
+
/**
 * In-memory embeddings store with optional JSON-file persistence.
 *
 * Tracks chunk vectors (`vectorStore`) and per-file content hashes
 * (`fileHashes`) so unchanged files can be skipped on re-index.
 */
export class EmbeddingsCache {
  constructor(config) {
    this.config = config;
    this.vectorStore = [];
    this.fileHashes = new Map();
    this.isSaving = false;
  }

  /**
   * Load embeddings and file hashes from the cache directory, keeping only
   * entries whose extension is still listed in `config.fileExtensions`.
   * Missing or unreadable cache files are treated as an empty cache.
   */
  async load() {
    if (!this.config.enableCache) return;

    try {
      await fs.mkdir(this.config.cacheDirectory, { recursive: true });
      const cacheFile = path.join(this.config.cacheDirectory, "embeddings.json");
      const hashFile = path.join(this.config.cacheDirectory, "file-hashes.json");

      const [cacheData, hashData] = await Promise.all([
        fs.readFile(cacheFile, "utf-8").catch(() => null),
        fs.readFile(hashFile, "utf-8").catch(() => null)
      ]);

      // A partial cache (either file missing) is ignored entirely.
      if (!cacheData || !hashData) return;

      const rawVectorStore = JSON.parse(cacheData);
      const rawHashes = Object.entries(JSON.parse(hashData));

      // Only entries matching the currently-configured extensions survive.
      const allowedExtensions = new Set(
        this.config.fileExtensions.map((ext) => `.${ext}`)
      );

      this.vectorStore = rawVectorStore.filter(
        (chunk) => allowedExtensions.has(path.extname(chunk.file))
      );

      for (const [file, hash] of rawHashes) {
        if (allowedExtensions.has(path.extname(file))) {
          this.fileHashes.set(file, hash);
        }
      }

      const dropped = rawVectorStore.length - this.vectorStore.length;
      if (dropped > 0) {
        console.error(`[Cache] Filtered ${dropped} outdated cache entries`);
      }
      console.error(`[Cache] Loaded ${this.vectorStore.length} cached embeddings`);
    } catch (error) {
      console.error("[Cache] Failed to load cache:", error.message);
    }
  }

  /** Persist the vector store and file hashes as pretty-printed JSON. */
  async save() {
    if (!this.config.enableCache) return;

    this.isSaving = true;

    try {
      await fs.mkdir(this.config.cacheDirectory, { recursive: true });

      const writes = [
        ["embeddings.json", this.vectorStore],
        ["file-hashes.json", Object.fromEntries(this.fileHashes)]
      ].map(([name, data]) =>
        fs.writeFile(
          path.join(this.config.cacheDirectory, name),
          JSON.stringify(data, null, 2)
        )
      );
      await Promise.all(writes);
    } catch (error) {
      console.error("[Cache] Failed to save cache:", error.message);
    } finally {
      this.isSaving = false;
    }
  }

  /** @returns {Array} The live chunk array (not a copy). */
  getVectorStore() {
    return this.vectorStore;
  }

  /**
   * Rank all cached chunks by cosine similarity to `queryVector`.
   * @param {number[]} queryVector - Query embedding.
   * @param {number} [topK=10] - Result count; falls back to 10 unless a
   *   positive integer.
   * @returns {Array} Top-K chunks, each copied with an added `score`.
   */
  searchByVector(queryVector, topK = 10) {
    const limit = Number.isInteger(topK) && topK > 0 ? topK : 10;

    const scored = this.vectorStore.map((chunk) => ({
      ...chunk,
      score: cosineSimilarity(queryVector, chunk.vector)
    }));
    scored.sort((a, b) => b.score - a.score);
    return scored.slice(0, limit);
  }

  /** Summary counts for status reporting. */
  getStats() {
    const files = new Set();
    for (const chunk of this.vectorStore) {
      files.add(chunk.file);
    }
    return {
      totalChunks: this.vectorStore.length,
      totalFiles: files.size
    };
  }

  /** Replace the entire vector store. */
  setVectorStore(store) {
    this.vectorStore = store;
  }

  /**
   * Content hash for a file. Supports both the legacy bare-string format
   * and the current `{ hash, mtime }` format.
   */
  getFileHash(file) {
    const entry = this.fileHashes.get(file);
    return typeof entry === 'string' ? entry : entry?.hash;
  }

  /** Recorded mtime for a file (undefined for legacy string entries). */
  getFileMtime(file) {
    return this.fileHashes.get(file)?.mtime;
  }

  /** Record a file's hash and (optionally) its mtime. */
  setFileHash(file, hash, mtime = null) {
    this.fileHashes.set(file, { hash, mtime });
  }

  /** Forget a file's recorded hash. */
  deleteFileHash(file) {
    this.fileHashes.delete(file);
  }

  /** @returns {Map} The live hash map (not a copy). */
  getAllFileHashes() {
    return this.fileHashes;
  }

  /** Drop all recorded file hashes. */
  clearAllFileHashes() {
    this.fileHashes = new Map();
  }

  /** Remove every chunk belonging to `file` from the store. */
  removeFileFromStore(file) {
    this.vectorStore = this.vectorStore.filter((chunk) => chunk.file !== file);
  }

  /** Append a single chunk to the store. */
  addToStore(chunk) {
    this.vectorStore.push(chunk);
  }

  /**
   * Delete the on-disk cache directory and reset in-memory state.
   * @throws Re-throws filesystem errors after logging them.
   */
  async clear() {
    if (!this.config.enableCache) return;

    try {
      await fs.rm(this.config.cacheDirectory, { recursive: true, force: true });
      this.vectorStore = [];
      this.fileHashes = new Map();
      console.error(`[Cache] Cache cleared successfully: ${this.config.cacheDirectory}`);
    } catch (error) {
      console.error("[Cache] Failed to clear cache:", error.message);
      throw error;
    }
  }
}
|