semantic-code-mcp 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,273 @@
1
+ /**
2
+ * AST-based Code Chunker
3
+ *
4
+ * Uses Tree-sitter to parse code and chunk at semantic boundaries
5
+ * (functions, classes, methods) instead of arbitrary line splits.
6
+ */
7
+
8
+ import Parser from 'web-tree-sitter';
9
+ import path from 'path';
10
+ import fs from 'fs/promises';
11
+ import { fileURLToPath } from 'url';
12
+ import { smartChunk } from './utils.js'; // Fallback
13
+
14
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
15
+
16
// Mapping of file extensions (without the leading dot) to Tree-sitter grammar names.
const LANGUAGE_MAP = {
  js: 'javascript',
  mjs: 'javascript',
  cjs: 'javascript',
  jsx: 'javascript', // JSX is parsed by the plain JavaScript grammar
  ts: 'typescript',
  tsx: 'typescript',
  py: 'python',
  go: 'go',
  rs: 'rust',
  rb: 'ruby',
  java: 'java',
  c: 'c',
  cpp: 'cpp',
  h: 'c',    // C headers use the C grammar
  hpp: 'cpp' // C++ headers use the C++ grammar
};

// Node types that represent semantic chunk boundaries (functions, classes,
// methods, ...) for each supported grammar. Names follow each tree-sitter
// grammar's own node-type vocabulary.
const SEMANTIC_NODES = {
  javascript: ['function_declaration', 'arrow_function', 'class_declaration', 'method_definition', 'export_statement'],
  typescript: ['function_declaration', 'arrow_function', 'class_declaration', 'method_definition', 'export_statement'],
  python: ['function_definition', 'class_definition', 'decorated_definition'],
  go: ['function_declaration', 'method_declaration', 'type_declaration'],
  rust: ['function_item', 'impl_item', 'struct_item', 'enum_item'],
  ruby: ['method', 'class', 'module'],
  java: ['method_declaration', 'class_declaration', 'interface_declaration'],
  c: ['function_definition', 'struct_specifier'],
  cpp: ['function_definition', 'class_specifier', 'struct_specifier']
};
47
+
48
export class ASTChunker {
  /**
   * AST-based chunker: splits source files at semantic boundaries
   * (functions, classes, methods) using Tree-sitter grammars.
   * @param {object} config - Chunker configuration. Relevant fields:
   *   chunkSize (lines per chunk, default 25) and verbose (extra logging).
   */
  constructor(config) {
    this.config = config;
    this.parser = null;
    this.languages = new Map(); // langName -> loaded Tree-sitter language
    this.initialized = false;
  }

  /**
   * Initialize the Tree-sitter WASM runtime and create a parser instance.
   * Safe to call multiple times; subsequent calls are no-ops.
   * @throws if the Tree-sitter runtime fails to initialize.
   */
  async init() {
    if (this.initialized) return;

    try {
      await Parser.init();
      this.parser = new Parser();
      this.initialized = true;
      // Log to stderr: stdout may be reserved for protocol traffic.
      console.error('[AST] Tree-sitter parser initialized');
    } catch (error) {
      console.error('[AST] Failed to initialize Tree-sitter:', error.message);
      throw error;
    }
  }

  /**
   * Load (and memoize) a Tree-sitter grammar by language name, probing
   * several well-known locations for the compiled WASM file.
   * @param {string} langName - e.g. 'javascript', 'python'.
   * @returns {Promise<object|null>} the loaded language, or null when no grammar is found.
   */
  async loadLanguage(langName) {
    if (this.languages.has(langName)) {
      return this.languages.get(langName);
    }

    try {
      // Candidate WASM locations, in priority order.
      const possiblePaths = [
        path.join(__dirname, '..', 'node_modules', `tree-sitter-${langName}`, `tree-sitter-${langName}.wasm`),
        path.join(__dirname, '..', 'node_modules', 'tree-sitter-wasms', 'out', `tree-sitter-${langName}.wasm`),
        path.join(__dirname, '..', 'grammars', `tree-sitter-${langName}.wasm`)
      ];

      for (const wasmPath of possiblePaths) {
        try {
          await fs.access(wasmPath);
          const language = await Parser.Language.load(wasmPath);
          this.languages.set(langName, language);
          if (this.config.verbose) {
            console.error(`[AST] Loaded ${langName} grammar from ${wasmPath}`);
          }
          return language;
        } catch {
          continue; // Try the next candidate path.
        }
      }

      console.error(`[AST] No grammar found for ${langName}`);
      return null;
    } catch (error) {
      console.error(`[AST] Failed to load ${langName}:`, error.message);
      return null;
    }
  }

  /**
   * Map a file path to a Tree-sitter language name via its extension.
   * @param {string} file - File path.
   * @returns {string|null} language name, or null when unsupported.
   */
  getLanguageForFile(file) {
    const ext = path.extname(file).slice(1).toLowerCase();
    return LANGUAGE_MAP[ext] || null;
  }

  /**
   * Chunk source code at semantic boundaries using AST analysis.
   * Falls back to smartChunk() when the language is unsupported, the
   * grammar is unavailable, parsing fails, or no semantic nodes are found.
   * @param {string} content - File contents.
   * @param {string} file - File path (used for language detection and logging).
   * @returns {Promise<Array<{text: string, startLine: number, endLine: number, nodeType?: string}>>}
   *   chunks with 1-indexed, non-overlapping line ranges.
   */
  async chunk(content, file) {
    if (!this.initialized) {
      await this.init();
    }

    const langName = this.getLanguageForFile(file);

    // Fall back to smart chunking if the language is not supported.
    if (!langName) {
      if (this.config.verbose) {
        console.error(`[AST] No AST support for ${path.extname(file)}, using smart chunking`);
      }
      return smartChunk(content, file, this.config);
    }

    const language = await this.loadLanguage(langName);

    // Fall back if the grammar is not available.
    if (!language) {
      return smartChunk(content, file, this.config);
    }

    try {
      this.parser.setLanguage(language);
      const tree = this.parser.parse(content);
      const chunks = [];
      const lines = content.split('\n');
      const semanticNodes = SEMANTIC_NODES[langName] || [];
      // Default to 25 lines, matching splitLargeNode(); without this, a
      // missing chunkSize yields NaN and silently disables the size check.
      const chunkSize = this.config.chunkSize || 25;

      // Walk the AST and extract semantic chunks.
      this.walkTree(tree.rootNode, (node) => {
        if (!semanticNodes.includes(node.type)) return;

        const startLine = node.startPosition.row; // Tree-sitter rows are 0-indexed
        const endLine = node.endPosition.row;

        // Skip very small nodes (< 3 lines).
        if (endLine - startLine < 2) return;

        // Extract the text for this node.
        const chunkLines = lines.slice(startLine, endLine + 1);
        const text = chunkLines.join('\n');

        // Rough size cap: chunkSize lines ~ chunkSize*4 tokens ~ chunkSize*16 chars.
        const targetTokens = chunkSize * 4;
        if (text.length > targetTokens * 4) {
          // Split oversized nodes into fixed-size windows.
          this.splitLargeNode(node, lines, chunks);
        } else {
          chunks.push({
            text,
            startLine: startLine + 1, // convert to 1-indexed
            endLine: endLine + 1,
            nodeType: node.type
          });
        }
      });

      // Nothing semantic found (e.g. a flat script) — fall back.
      if (chunks.length === 0) {
        return smartChunk(content, file, this.config);
      }

      chunks.sort((a, b) => a.startLine - b.startLine);

      // Nested semantic nodes (e.g. a method inside its class) produce
      // overlapping chunks; collapse them into single spans.
      return this.mergeAndCleanChunks(chunks, lines);

    } catch (error) {
      console.error(`[AST] Parse error for ${file}:`, error.message);
      return smartChunk(content, file, this.config);
    }
  }

  /**
   * Depth-first pre-order traversal of the AST, invoking callback on every node.
   * @param {object} node - Tree-sitter syntax node.
   * @param {(node: object) => void} callback
   */
  walkTree(node, callback) {
    callback(node);
    for (let i = 0; i < node.childCount; i++) {
      this.walkTree(node.child(i), callback);
    }
  }

  /**
   * Split an oversized AST node into fixed-size line windows and push the
   * resulting chunks (1-indexed lines, nodeType suffixed with '_part')
   * onto the provided chunks array.
   * @param {object} node - Tree-sitter node spanning the oversized region.
   * @param {string[]} lines - Full source, split on '\n'.
   * @param {Array} chunks - Output accumulator (mutated).
   */
  splitLargeNode(node, lines, chunks) {
    const chunkSize = this.config.chunkSize || 25;
    const startLine = node.startPosition.row;
    const endLine = node.endPosition.row;

    for (let i = startLine; i <= endLine; i += chunkSize) {
      const chunkEnd = Math.min(i + chunkSize - 1, endLine);
      const chunkLines = lines.slice(i, chunkEnd + 1);

      chunks.push({
        text: chunkLines.join('\n'),
        startLine: i + 1,
        endLine: chunkEnd + 1,
        nodeType: node.type + '_part'
      });
    }
  }

  /**
   * Collapse overlapping chunks into non-overlapping spans.
   * When a chunk overlaps the previous one, the previous chunk is extended
   * to cover both and its text is re-derived from the source lines.
   * @param {Array} chunks - Chunks sorted by startLine (1-indexed).
   * @param {string[]} lines - Full source, split on '\n'.
   * @returns {Array} non-overlapping chunks in line order.
   */
  mergeAndCleanChunks(chunks, lines) {
    const cleaned = [];

    for (const chunk of chunks) {
      if (cleaned.length > 0) {
        const prev = cleaned[cleaned.length - 1];
        if (chunk.startLine <= prev.endLine) {
          // Overlap: extend the previous chunk if this one reaches further.
          if (chunk.endLine > prev.endLine) {
            prev.endLine = chunk.endLine;
            const extendedLines = lines.slice(prev.startLine - 1, prev.endLine);
            prev.text = extendedLines.join('\n');
          }
          continue;
        }
      }

      cleaned.push(chunk);
    }

    return cleaned;
  }
}
258
+
259
/**
 * Factory: select a chunker implementation from configuration.
 * Returns an ASTChunker when chunkingMode is 'ast'; otherwise returns an
 * object exposing the same async chunk(content, file) interface backed
 * by smartChunk().
 * @param {object} config - Shared chunker configuration.
 */
export function getChunker(config) {
  const wantsAst = config.chunkingMode === 'ast';
  if (wantsAst) {
    return new ASTChunker(config);
  }

  // Minimal wrapper so callers can treat both paths uniformly.
  const fallbackChunker = {
    async chunk(content, file) {
      return smartChunk(content, file, config);
    }
  };
  return fallbackChunker;
}
@@ -0,0 +1,13 @@
1
+ import { SQLiteCache } from "./sqlite-cache.js";
2
+ import { MilvusCache } from "./milvus-cache.js";
3
+
4
/**
 * Central factory for the vector cache backend, so the indexing and
 * search paths always agree on the active provider.
 * Unknown or missing providers default to the SQLite backend.
 * @param {object} config - May carry vectorStoreProvider ('sqlite' | 'milvus').
 */
export function createCache(config) {
  const provider = (config?.vectorStoreProvider || "sqlite").toLowerCase();

  switch (provider) {
    case "milvus":
      return new MilvusCache(config);
    default:
      return new SQLiteCache(config);
  }
}
package/lib/cache.js ADDED
@@ -0,0 +1,157 @@
1
+ import fs from "fs/promises";
2
+ import path from "path";
3
+ import { cosineSimilarity } from "./utils.js";
4
+
5
export class EmbeddingsCache {
  /**
   * JSON-file-backed store of chunk embeddings plus per-file content hashes,
   * used to skip re-embedding unchanged files between runs.
   * @param {object} config - Requires enableCache, cacheDirectory, fileExtensions.
   */
  constructor(config) {
    this.config = config;
    this.vectorStore = [];       // Array of chunk records (each carries file, vector, ...)
    this.fileHashes = new Map(); // file path -> hash string (legacy) or { hash, mtime }
    this.isSaving = false;       // True while a save() is in flight
  }

  /**
   * Load cached embeddings and file hashes from disk, dropping any entries
   * whose file extension is no longer in config.fileExtensions.
   * Missing, unreadable, or malformed cache files leave the cache empty;
   * this method never throws.
   */
  async load() {
    if (!this.config.enableCache) return;

    try {
      await fs.mkdir(this.config.cacheDirectory, { recursive: true });
      const cacheFile = path.join(this.config.cacheDirectory, "embeddings.json");
      const hashFile = path.join(this.config.cacheDirectory, "file-hashes.json");

      const [cacheData, hashData] = await Promise.all([
        fs.readFile(cacheFile, "utf-8").catch(() => null),
        fs.readFile(hashFile, "utf-8").catch(() => null)
      ]);

      if (cacheData && hashData) {
        const rawVectorStore = JSON.parse(cacheData);
        const rawHashes = new Map(Object.entries(JSON.parse(hashData)));

        // Guard against a corrupt (valid-JSON but non-array) embeddings file
        // instead of letting .filter() throw into the generic catch below.
        if (!Array.isArray(rawVectorStore)) {
          console.error("[Cache] Ignoring malformed embeddings cache");
          return;
        }

        // Set gives O(1) membership checks while filtering below.
        const allowedExtensions = new Set(this.config.fileExtensions.map(ext => `.${ext}`));

        // Filter cache to only include files matching current extensions.
        this.vectorStore = rawVectorStore.filter(chunk =>
          allowedExtensions.has(path.extname(chunk.file))
        );

        // Only keep hashes for files matching current extensions.
        for (const [file, hash] of rawHashes) {
          if (allowedExtensions.has(path.extname(file))) {
            this.fileHashes.set(file, hash);
          }
        }

        const filtered = rawVectorStore.length - this.vectorStore.length;
        if (filtered > 0) {
          console.error(`[Cache] Filtered ${filtered} outdated cache entries`);
        }
        console.error(`[Cache] Loaded ${this.vectorStore.length} cached embeddings`);
      }
    } catch (error) {
      console.error("[Cache] Failed to load cache:", error.message);
    }
  }

  /**
   * Persist the vector store and file hashes to disk as pretty-printed JSON.
   * Failures are logged but never thrown, so indexing can continue.
   */
  async save() {
    if (!this.config.enableCache) return;

    this.isSaving = true;

    try {
      await fs.mkdir(this.config.cacheDirectory, { recursive: true });
      const cacheFile = path.join(this.config.cacheDirectory, "embeddings.json");
      const hashFile = path.join(this.config.cacheDirectory, "file-hashes.json");

      await Promise.all([
        fs.writeFile(cacheFile, JSON.stringify(this.vectorStore, null, 2)),
        fs.writeFile(hashFile, JSON.stringify(Object.fromEntries(this.fileHashes), null, 2))
      ]);
    } catch (error) {
      console.error("[Cache] Failed to save cache:", error.message);
    } finally {
      this.isSaving = false;
    }
  }

  /** @returns {Array} the live (not copied) chunk array. */
  getVectorStore() {
    return this.vectorStore;
  }

  /**
   * Rank all cached chunks by cosine similarity against the query vector.
   * @param {number[]} queryVector - Embedding to compare against.
   * @param {number} [topK=10] - Result count; non-positive or non-integer values fall back to 10.
   * @returns {Array} top-K chunks, each copied with an added `score` field.
   */
  searchByVector(queryVector, topK = 10) {
    const normalizedTopK = Number.isInteger(topK) && topK > 0 ? topK : 10;

    return this.vectorStore
      .map((chunk) => ({
        ...chunk,
        score: cosineSimilarity(queryVector, chunk.vector)
      }))
      .sort((a, b) => b.score - a.score)
      .slice(0, normalizedTopK);
  }

  /** @returns {{totalChunks: number, totalFiles: number}} cache summary. */
  getStats() {
    return {
      totalChunks: this.vectorStore.length,
      totalFiles: new Set(this.vectorStore.map((v) => v.file)).size
    };
  }

  /** Replace the entire vector store (e.g. after a full re-index). */
  setVectorStore(store) {
    this.vectorStore = store;
  }

  /**
   * @returns {string|undefined} content hash for a file. Supports both the
   * legacy format (plain string) and the current { hash, mtime } format.
   */
  getFileHash(file) {
    const entry = this.fileHashes.get(file);
    if (typeof entry === 'string') {
      return entry;
    }
    return entry?.hash;
  }

  /** @returns {number|undefined} recorded mtime (undefined for legacy string entries). */
  getFileMtime(file) {
    const entry = this.fileHashes.get(file);
    return entry?.mtime;
  }

  /** Record a file's content hash and (optionally) its mtime. */
  setFileHash(file, hash, mtime = null) {
    this.fileHashes.set(file, { hash, mtime });
  }

  /** Forget the stored hash entry for a file. */
  deleteFileHash(file) {
    this.fileHashes.delete(file);
  }

  /** @returns {Map} the live (not copied) file-hash map. */
  getAllFileHashes() {
    return this.fileHashes;
  }

  /** Drop all file hashes, forcing a re-hash of every file on the next index. */
  clearAllFileHashes() {
    this.fileHashes = new Map();
  }

  /** Remove every cached chunk belonging to the given file. */
  removeFileFromStore(file) {
    this.vectorStore = this.vectorStore.filter(chunk => chunk.file !== file);
  }

  /** Append one embedded chunk to the store. */
  addToStore(chunk) {
    this.vectorStore.push(chunk);
  }

  /**
   * Delete the on-disk cache directory and reset in-memory state.
   * @throws re-throws filesystem errors after logging them.
   */
  async clear() {
    if (!this.config.enableCache) return;

    try {
      await fs.rm(this.config.cacheDirectory, { recursive: true, force: true });
      this.vectorStore = [];
      this.fileHashes = new Map();
      console.error(`[Cache] Cache cleared successfully: ${this.config.cacheDirectory}`);
    } catch (error) {
      console.error("[Cache] Failed to clear cache:", error.message);
      throw error;
    }
  }
}