semantic-code-mcp 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
/**
 * Token estimation and limits for embedding models
 *
 * This module provides token counting utilities and model-specific limits
 * to ensure text chunks don't exceed the model's maximum sequence length.
 */

/**
 * Token limits for supported embedding models.
 * Each model has its own maximum sequence length (in tokens).
 * Frozen so that shared module state cannot be mutated accidentally
 * by consumers (lookups and Object.entries iteration are unaffected).
 */
export const MODEL_TOKEN_LIMITS = Object.freeze({
  // MRL / Nomic models (longer context)
  "nomic-ai/nomic-embed-text-v1.5": 8192,
  "nomic-ai/nomic-embed-text-v1": 2048,

  // Gemini embedding models
  "gemini-embedding-001": 2048,

  // Sentence Transformers / MiniLM family
  "Xenova/all-MiniLM-L6-v2": 256,
  "Xenova/all-MiniLM-L12-v2": 256,
  "Xenova/paraphrase-MiniLM-L6-v2": 128,
  "Xenova/paraphrase-MiniLM-L3-v2": 128,

  // MPNet models
  "Xenova/all-mpnet-base-v2": 384,
  "Xenova/paraphrase-mpnet-base-v2": 384,

  // Multilingual models
  "Xenova/paraphrase-multilingual-MiniLM-L12-v2": 128,
  "Xenova/paraphrase-multilingual-mpnet-base-v2": 256,

  // Code-specific models
  "Xenova/codebert-base": 512,
  "Xenova/graphcodebert-base": 512,

  // E5 models
  "Xenova/e5-small-v2": 512,
  "Xenova/e5-base-v2": 512,
  "Xenova/e5-large-v2": 512,

  // BGE models
  "Xenova/bge-small-en-v1.5": 512,
  "Xenova/bge-base-en-v1.5": 512,
  "Xenova/bge-large-en-v1.5": 512,

  // Default fallback for unknown models (conservative)
  "default": 256
});
51
+
52
/**
 * Resolve the maximum token limit for a given model.
 * Tries an exact key lookup first, then a case-insensitive scan,
 * and finally falls back to the "default" entry.
 * @param {string} modelName - The model name (e.g., "Xenova/all-MiniLM-L6-v2")
 * @returns {number} Maximum tokens supported by the model
 */
export function getModelTokenLimit(modelName) {
  if (!modelName) return MODEL_TOKEN_LIMITS["default"];

  // Exact key hit avoids the linear scan below.
  const exact = MODEL_TOKEN_LIMITS[modelName];
  if (exact !== undefined) {
    return exact;
  }

  // Case-insensitive fallback: compare against every known key.
  const wanted = modelName.toLowerCase();
  const match = Object.entries(MODEL_TOKEN_LIMITS).find(
    ([key]) => key.toLowerCase() === wanted
  );

  return match ? match[1] : MODEL_TOKEN_LIMITS["default"];
}
76
+
77
/**
 * Derive chunking parameters for a model from its token limit.
 * @param {string} modelName - The model name
 * @returns {{ maxTokens: number, targetTokens: number, overlapTokens: number }}
 */
export function getChunkingParams(modelName) {
  const maxTokens = getModelTokenLimit(modelName);

  // Target 85% of the hard limit to leave a safety buffer for
  // estimation error; overlap ~18% of the target for context continuity.
  const targetTokens = Math.floor(maxTokens * 0.85);

  return {
    maxTokens,
    targetTokens,
    overlapTokens: Math.floor(targetTokens * 0.18)
  };
}
98
+
99
/**
 * Estimate token count for text (conservative estimate for code).
 * Heuristic: per-word subword estimate + a share of punctuation
 * characters + 2 sentinel tokens ([CLS]/[SEP]).
 *
 * This is conservative — actual tokenizers may produce fewer tokens.
 * For exact counts use the real tokenizer; this is much faster.
 *
 * @param {string} text - The text to estimate tokens for
 * @returns {number} Estimated token count
 */
export function estimateTokens(text) {
  if (!text || text.length === 0) return 0;

  // Words separated by any run of whitespace.
  const words = text.split(/\s+/).filter((w) => w.length > 0);

  // Punctuation/operator characters that tokenizers often emit separately.
  const specialMatches = text.match(/[{}()\[\];:,.<>!=+\-*\/%&|^~@#$"'`\\]/g);
  const specialCount = specialMatches ? specialMatches.length : 0;

  // Per-word cost approximating subword tokenization:
  // short words -> 1 token, medium -> 2, long words split ~every 4 chars.
  const wordTokens = words.reduce((sum, word) => {
    if (word.length <= 4) return sum + 1;
    if (word.length <= 10) return sum + 2;
    return sum + Math.ceil(word.length / 4);
  }, 0);

  // Start at 2 for the [CLS]/[SEP] sentinels; roughly half of the special
  // characters merge into adjacent tokens, so only count 50% of them.
  return 2 + wordTokens + Math.floor(specialCount * 0.5);
}
138
+
139
/**
 * Check whether text exceeds the token limit for a model.
 * @param {string} text - The text to check
 * @param {string} modelName - The model name
 * @returns {boolean} True if the estimated token count exceeds the limit
 */
export function exceedsTokenLimit(text, modelName) {
  return estimateTokens(text) > getModelTokenLimit(modelName);
}
package/lib/utils.js ADDED
@@ -0,0 +1,214 @@
1
+ import crypto from "crypto";
2
+ import path from "path";
3
+ import { estimateTokens, getChunkingParams, getModelTokenLimit } from "./tokenizer.js";
4
+
5
+ // Re-export tokenizer utilities
6
+ export { estimateTokens, getChunkingParams, getModelTokenLimit, MODEL_TOKEN_LIMITS } from "./tokenizer.js";
7
+
8
/**
 * Calculate cosine similarity between two vectors.
 * @param {number[]} a - First vector
 * @param {number[]} b - Second vector (assumed same length as a — TODO confirm callers)
 * @returns {number} Similarity in [-1, 1]; 0 when either vector has zero magnitude
 */
export function cosineSimilarity(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  // Guard against division by zero: a zero vector previously produced NaN,
  // which fails every downstream comparison. Treat it as "no similarity".
  if (denom === 0) return 0;
  return dot / denom;
}
20
+
21
/**
 * Generate an MD5 hex digest of file content to detect changes.
 * (MD5 is used here as a fast content fingerprint, not for security.)
 * @param {string|Buffer} content - File content to fingerprint
 * @returns {string} 32-character hex digest
 */
export function hashContent(content) {
  const hasher = crypto.createHash("md5");
  hasher.update(content);
  return hasher.digest("hex");
}
27
+
28
/**
 * Intelligent chunking with token limit awareness
 * Tries to split by function/class boundaries while respecting token limits
 *
 * Splits `content` line by line, preferring to break at lines that match a
 * per-extension "definition start" regex, and carries a token-bounded tail of
 * the previous chunk forward as overlap for context continuity.
 *
 * @param {string} content - File content to chunk
 * @param {string} file - File path (for language detection)
 * @param {object} config - Configuration object with embeddingModel
 * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
 */
export function smartChunk(content, file, config) {
  const lines = content.split("\n");
  const chunks = [];
  const ext = path.extname(file);

  // Get model-specific chunking parameters (target size and overlap budget).
  const { targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);

  // Language-specific patterns for function/class detection.
  // Each regex matches the START of a line that likely begins a new
  // logical unit (function, class, section, rule, heading, ...).
  const patterns = {
    // JavaScript/TypeScript
    js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
    jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
    ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
    tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
    mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
    cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,

    // Python
    py: /^(class|def|async\s+def)\s+\w+/,
    pyw: /^(class|def|async\s+def)\s+\w+/,
    pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython

    // Java/Kotlin/Scala
    java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
    kt: /^(class|interface|object|fun|val|var)\s+\w+/,
    kts: /^(class|interface|object|fun|val|var)\s+\w+/,
    scala: /^(class|object|trait|def|val|var)\s+\w+/,

    // C/C++
    c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
    cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
    cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
    cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
    h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
    hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
    hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,

    // C#
    cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
    csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,

    // Go
    go: /^(func|type|const|var)\s+\w+/,

    // Rust
    rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,

    // PHP
    php: /^(class|interface|trait|function|const)\s+\w+/,
    phtml: /^(<\?php|class|interface|trait|function)\s*/,

    // Ruby
    rb: /^(class|module|def)\s+\w+/,
    rake: /^(class|module|def|task|namespace)\s+\w+/,

    // Swift
    swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,

    // R (both lowercase and uppercase extension spellings)
    r: /^(\w+)\s*(<-|=)\s*function/,
    R: /^(\w+)\s*(<-|=)\s*function/,

    // Lua
    lua: /^(function|local\s+function)\s+\w+/,

    // Shell scripts
    sh: /^(\w+\s*\(\)|function\s+\w+)/,
    bash: /^(\w+\s*\(\)|function\s+\w+)/,
    zsh: /^(\w+\s*\(\)|function\s+\w+)/,
    fish: /^function\s+\w+/,

    // CSS/Styles
    css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
    scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
    sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
    less: /^(@\w+:|\.|\#|@media)\s*/,
    styl: /^(\$\w+\s*=|\w+\(|\.|\#)\s*/,

    // Markup/HTML
    html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
    htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
    xml: /^(<\w+|\s*<!\[CDATA\[)/,
    svg: /^(<svg|<g|<path|<defs|<symbol)\b/,

    // Config files
    json: /^(\s*"[\w-]+"\s*:\s*[\[{])/,
    yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
    yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
    toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
    ini: /^(\[\w+\]|\w+\s*=)/,
    env: /^[A-Z_][A-Z0-9_]*=/,

    // Documentation
    md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
    mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
    txt: /^.{50,}/, // Split on long paragraphs
    rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,

    // Database
    sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,

    // Perl
    pl: /^(sub|package|use|require)\s+\w+/,
    pm: /^(sub|package|use|require)\s+\w+/,

    // Vim
    vim: /^(function|command|autocmd|let\s+g:)\s*/,
  };

  // Unknown extensions fall back to the JS pattern.
  const langPattern = patterns[ext.slice(1)] || patterns.js;
  let currentChunk = [];       // lines accumulated for the chunk in progress
  let chunkStartLine = 0;      // 0-based index of the chunk's first line
  let currentTokenCount = 0;   // estimated tokens in currentChunk

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const lineTokens = estimateTokens(line);

    // Check if adding this line would exceed token limit
    const wouldExceedLimit = (currentTokenCount + lineTokens) > targetTokens;

    // Check if this is a good split point (function/class boundary)
    const isGoodSplitPoint =
      langPattern.test(line.trim()) &&
      currentChunk.length > 3; // At least a few lines before splitting

    // Split if we exceed limit OR at a good split point when near limit
    // (60% full), so chunks tend to end on definition boundaries.
    const shouldSplit = wouldExceedLimit || (isGoodSplitPoint && currentTokenCount > targetTokens * 0.6);

    if (shouldSplit && currentChunk.length > 0) {
      const chunkText = currentChunk.join("\n");
      // Drop trivially small chunks (< ~20 chars of non-whitespace span).
      if (chunkText.trim().length > 20) {
        chunks.push({
          text: chunkText,
          startLine: chunkStartLine + 1, // convert to 1-based line numbers
          endLine: i,
          tokenCount: currentTokenCount
        });
      }

      // Calculate overlap: keep last N lines that fit within overlapTokens,
      // walking backwards and stopping at the first line that doesn't fit.
      let overlapLines = [];
      let overlapTokensCount = 0;
      for (let j = currentChunk.length - 1; j >= 0 && overlapTokensCount < overlapTokens; j--) {
        const lineT = estimateTokens(currentChunk[j]);
        if (overlapTokensCount + lineT <= overlapTokens) {
          overlapLines.unshift(currentChunk[j]);
          overlapTokensCount += lineT;
        } else {
          break;
        }
      }

      // Seed the next chunk with the overlap tail; its start line is the
      // current line minus however many lines were carried over.
      currentChunk = overlapLines;
      currentTokenCount = overlapTokensCount;
      chunkStartLine = i - overlapLines.length;
    }

    // NOTE(review): a single line whose estimate exceeds targetTokens is
    // still emitted whole — oversized chunks are possible; confirm the
    // embedder truncates or the caller handles this.
    currentChunk.push(line);
    currentTokenCount += lineTokens;
  }

  // Add remaining chunk (same minimum-size filter as above)
  if (currentChunk.length > 0) {
    const chunkText = currentChunk.join("\n");
    if (chunkText.trim().length > 20) {
      chunks.push({
        text: chunkText,
        startLine: chunkStartLine + 1,
        endLine: lines.length,
        tokenCount: currentTokenCount
      });
    }
  }

  return chunks;
}
package/package.json ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "name": "semantic-code-mcp",
3
+ "version": "2.0.0",
4
+ "description": "AI-powered semantic code search for coding agents. MCP server with multi-provider embeddings and hybrid search.",
5
+ "type": "module",
6
+ "main": "index.js",
7
+ "bin": {
8
+ "semantic-code-mcp": "index.js"
9
+ },
10
+ "scripts": {
11
+ "start": "node index.js",
12
+ "dev": "node --watch index.js",
13
+ "test": "vitest run",
14
+ "test:watch": "vitest",
15
+ "clear-cache": "node scripts/clear-cache.js"
16
+ },
17
+ "keywords": [
18
+ "mcp",
19
+ "semantic-search",
20
+ "code-search",
21
+ "embeddings",
22
+ "ai",
23
+ "model-context-protocol",
24
+ "hybrid-search",
25
+ "code-intelligence",
26
+ "milvus",
27
+ "gemini",
28
+ "openai",
29
+ "tree-sitter",
30
+ "ast",
31
+ "vector-search"
32
+ ],
33
+ "author": {
34
+ "name": "bitkyc08",
35
+ "url": "https://github.com/bitkyc08-arch"
36
+ },
37
+ "contributors": [
38
+ {
39
+ "name": "Omar Haris",
40
+ "url": "https://www.linkedin.com/in/omarharis/"
41
+ }
42
+ ],
43
+ "repository": {
44
+ "type": "git",
45
+ "url": "https://github.com/bitkyc08-arch/smart-coding-mcp.git"
46
+ },
47
+ "bugs": {
48
+ "url": "https://github.com/bitkyc08-arch/smart-coding-mcp/issues"
49
+ },
50
+ "homepage": "https://github.com/bitkyc08-arch/smart-coding-mcp#readme",
51
+ "license": "MIT",
52
+ "dependencies": {
53
+ "@huggingface/transformers": "^3.8.1",
54
+ "@modelcontextprotocol/sdk": "^1.0.4",
55
+ "@zilliz/milvus2-sdk-node": "^2.6.10",
56
+ "better-sqlite3": "^12.5.0",
57
+ "chokidar": "^3.5.3",
58
+ "fastembed": "^2.1.0",
59
+ "fdir": "^6.5.0",
60
+ "glob": "^10.3.10",
61
+ "google-auth-library": "^9.0.0",
62
+ "web-tree-sitter": "^0.24.6"
63
+ },
64
+ "engines": {
65
+ "node": ">=18.0.0"
66
+ },
67
+ "devDependencies": {
68
+ "vitest": "^4.0.16"
69
+ }
70
+ }
package/reindex.js ADDED
@@ -0,0 +1,109 @@
1
#!/usr/bin/env node
/**
 * smart-coding-mcp reindexing script (for direct shell execution).
 * Indexes the workspace directly, without the MCP server, and prints logs.
 *
 * Usage:
 *   node reindex.js /path/to/workspace [--force]
 *
 * Environment variables must be set the same way as in the MCP config.
 */
import { loadConfig } from "./lib/config.js";
import { createCache } from "./lib/cache-factory.js";
import { createEmbedder } from "./lib/mrl-embedder.js";
import { CodebaseIndexer } from "./features/index-codebase.js";
import { parseArgs } from "util";

// CLI flags: --force/-f forces a full reindex, --help/-h prints usage.
// Positionals carry the optional workspace path.
const { values, positionals } = parseArgs({
  allowPositionals: true,
  options: {
    force: { type: "boolean", short: "f", default: false },
    help: { type: "boolean", short: "h", default: false },
  },
});
24
+
25
// Print usage (Korean, matching the rest of this CLI's output) and exit.
if (values.help) {
  console.log(`
smart-coding-mcp 리인덱싱 (shell 직접 실행)

Usage:
  node reindex.js [workspace_path] [--force]

Options:
  -f, --force    전체 재인덱싱 (캐시 무시)
  -h, --help     도움말

Environment:
  MCP config의 env를 그대로 사용합니다.
  SMART_CODING_EMBEDDING_PROVIDER, SMART_CODING_GEMINI_BATCH_SIZE 등
`);
  process.exit(0);
}

// Workspace defaults to the current directory when no positional is given.
const workspaceDir = positionals[0] || process.cwd();
const force = values.force;
45
+
46
/**
 * Print a message prefixed with the current time (HH:MM:SS, ko-KR 24h clock).
 * @param {string} msg - Message to print
 */
function log(msg) {
  const now = new Date();
  const stamp = now.toLocaleTimeString("ko-KR", { hour12: false });
  console.log(`[${stamp}] ${msg}`);
}
50
+
51
/**
 * Run the full reindex pipeline: load config, embedder, and cache; index
 * the workspace; report stats; persist the cache; then exit with code 0.
 */
async function main() {
  log(`🚀 Reindex 시작: ${workspaceDir}`);
  log(`   force=${force}`);

  // 1. Load configuration for the target workspace
  const config = await loadConfig(workspaceDir);
  log(`   searchDirectory: ${config.searchDirectory}`);
  log(`   cacheDirectory: ${config.cacheDirectory}`);
  log(`   extensions: ${config.fileExtensions?.length || "?"} types`);
  log(`   excludePatterns: ${config.excludePatterns?.length || "?"} patterns`);
  console.log();

  // 2. Load the embedding model
  log("🧠 임베더 로딩...");
  const embedder = await createEmbedder(config);
  log(`   model: ${embedder.modelName} (${embedder.dimension}d, device: ${embedder.device})`);

  // 3. Load the cache (backend chosen by config)
  log("💾 캐시 로딩...");
  const cache = createCache(config);
  await cache.load();

  // getStats is optional on some cache backends — hence the ?. guard.
  const statsBefore = cache.getStats?.() || {};
  log(`   캐시 항목: ${statsBefore.totalEntries ?? "?"}`);

  // 4. Create the indexer and run it (force bypasses the cache)
  log("📁 인덱싱 시작...");
  console.log();

  const t0 = Date.now();
  const indexer = new CodebaseIndexer(embedder, cache, config);
  const result = await indexer.indexAll(force);

  const elapsed = ((Date.now() - t0) / 1000).toFixed(1);

  // Summary stats; fields missing from the result print as "?".
  console.log();
  log("🎉 완료!");
  log(`   파일: ${result.filesProcessed ?? "?"}개`);
  log(`   청크: ${result.chunksProcessed ?? "?"}개`);
  log(`   새로운 파일: ${result.newFiles ?? "?"}개`);
  log(`   업데이트: ${result.updatedFiles ?? "?"}개`);
  log(`   스킵: ${result.skippedFiles ?? "?"}개`);
  log(`   삭제: ${result.deletedFiles ?? "?"}개`);
  log(`   소요: ${elapsed}s`);

  // 5. Persist the cache (only when the backend supports save)
  if (cache.save) {
    await cache.save();
    log("💾 캐시 저장 완료");
  }

  process.exit(0);
}
104
+
105
// Top-level entry point: report any failure with its stack and exit non-zero.
main().catch((err) => {
  console.error(`\n❌ 에러: ${err.message}`);
  console.error(err.stack);
  process.exit(1);
});