smart-coding-mcp 1.2.4 → 1.3.1
This diff compares the published contents of two package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +28 -168
- package/config.json +4 -3
- package/example.png +0 -0
- package/features/clear-cache.js +30 -7
- package/features/index-codebase.js +507 -37
- package/how-its-works.png +0 -0
- package/index.js +2 -2
- package/lib/cache.js +5 -0
- package/lib/config.js +29 -4
- package/lib/embedding-worker.js +67 -0
- package/lib/tokenizer.js +142 -0
- package/lib/utils.js +113 -25
- package/package.json +9 -3
- package/test/clear-cache.test.js +288 -0
- package/test/embedding-model.test.js +230 -0
- package/test/helpers.js +128 -0
- package/test/hybrid-search.test.js +243 -0
- package/test/index-codebase.test.js +246 -0
- package/test/integration.test.js +223 -0
- package/test/tokenizer.test.js +225 -0
- package/vitest.config.js +29 -0
package/lib/config.js
CHANGED
@@ -1,5 +1,6 @@
  import fs from "fs/promises";
  import path from "path";
+ import { fileURLToPath } from "url";
  import { ProjectDetector } from "./project-detector.js";

  const DEFAULT_CONFIG = {
@@ -48,10 +49,11 @@ const DEFAULT_CONFIG = {
      "**/coverage/**",
      "**/.next/**",
      "**/target/**",
-     "**/vendor/**"
+     "**/vendor/**",
+     "**/.smart-coding-cache/**"
    ],
-   chunkSize:
-   chunkOverlap:
+   chunkSize: 25, // Lines per chunk (larger = fewer embeddings = faster indexing)
+   chunkOverlap: 5, // Overlap between chunks for context continuity
    batchSize: 100,
    maxFileSize: 1048576, // 1MB - skip files larger than this
    maxResults: 5,
@@ -59,6 +61,7 @@ const DEFAULT_CONFIG = {
    cacheDirectory: "./.smart-coding-cache",
    watchFiles: false,
    verbose: false,
+   workerThreads: "auto", // "auto" = CPU cores - 1, or set a number
    embeddingModel: "Xenova/all-MiniLM-L6-v2",
    semanticWeight: 0.7,
    exactMatchBoost: 1.5,
@@ -80,7 +83,7 @@ export async function loadConfig(workspaceDir = null) {
      console.error(`[Config] Workspace mode: ${baseDir}`);
    } else {
      // Server mode: load config from server directory
-     const scriptDir = path.dirname(
+     const scriptDir = path.dirname(fileURLToPath(import.meta.url));
      baseDir = path.resolve(scriptDir, '..');
      configPath = path.join(baseDir, "config.json");
    }
@@ -212,6 +215,28 @@ export async function loadConfig(workspaceDir = null) {
      }
    }

+   if (process.env.SMART_CODING_EMBEDDING_MODEL !== undefined) {
+     const value = process.env.SMART_CODING_EMBEDDING_MODEL.trim();
+     if (value.length > 0) {
+       config.embeddingModel = value;
+       console.error(`[Config] Using custom embedding model: ${value}`);
+     }
+   }
+
+   if (process.env.SMART_CODING_WORKER_THREADS !== undefined) {
+     const value = process.env.SMART_CODING_WORKER_THREADS.trim().toLowerCase();
+     if (value === 'auto') {
+       config.workerThreads = 'auto';
+     } else {
+       const numValue = parseInt(value, 10);
+       if (!isNaN(numValue) && numValue >= 1 && numValue <= 32) {
+         config.workerThreads = numValue;
+       } else {
+         console.error(`[Config] Invalid SMART_CODING_WORKER_THREADS: ${value}, using default (must be 'auto' or 1-32)`);
+       }
+     }
+   }
+
    return config;
  }
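
The two new environment overrides can be exercised like this — a minimal sketch against the loadConfig export shown above, with illustrative (not default) values:

// Sketch only: override the new options via environment variables before loading.
// SMART_CODING_WORKER_THREADS must be 'auto' or an integer 1-32.
import { loadConfig } from "./lib/config.js";

process.env.SMART_CODING_EMBEDDING_MODEL = "Xenova/bge-small-en-v1.5";
process.env.SMART_CODING_WORKER_THREADS = "4";

const config = await loadConfig();
console.log(config.embeddingModel); // "Xenova/bge-small-en-v1.5"
console.log(config.workerThreads);  // 4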
package/lib/embedding-worker.js
ADDED
@@ -0,0 +1,67 @@
+ import { parentPort, workerData } from "worker_threads";
+ import { pipeline } from "@xenova/transformers";
+
+ let embedder = null;
+
+ // Initialize the embedding model once when worker starts
+ async function initializeEmbedder() {
+   if (!embedder) {
+     embedder = await pipeline("feature-extraction", workerData.embeddingModel);
+   }
+   return embedder;
+ }
+
+ /**
+  * Process chunks with optimized single-text embedding
+  * Note: Batch processing with transformers.js WASM backend doesn't improve speed
+  * because it loops internally. Single calls are actually faster.
+  */
+ async function processChunks(chunks) {
+   const embedder = await initializeEmbedder();
+   const results = [];
+
+   for (const chunk of chunks) {
+     try {
+       const output = await embedder(chunk.text, { pooling: "mean", normalize: true });
+       results.push({
+         file: chunk.file,
+         startLine: chunk.startLine,
+         endLine: chunk.endLine,
+         content: chunk.text,
+         vector: Array.from(output.data),
+         success: true
+       });
+     } catch (error) {
+       results.push({
+         file: chunk.file,
+         startLine: chunk.startLine,
+         endLine: chunk.endLine,
+         error: error.message,
+         success: false
+       });
+     }
+   }
+
+   return results;
+ }
+
+ // Listen for messages from main thread
+ parentPort.on("message", async (message) => {
+   if (message.type === "process") {
+     try {
+       const results = await processChunks(message.chunks);
+       parentPort.postMessage({ type: "results", results, batchId: message.batchId });
+     } catch (error) {
+       parentPort.postMessage({ type: "error", error: error.message, batchId: message.batchId });
+     }
+   } else if (message.type === "shutdown") {
+     process.exit(0);
+   }
+ });
+
+ // Signal that worker is ready
+ initializeEmbedder().then(() => {
+   parentPort.postMessage({ type: "ready" });
+ }).catch((error) => {
+   parentPort.postMessage({ type: "error", error: error.message });
+ });
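
The main-thread side that drives this worker lives in features/index-codebase.js and is not shown in this diff. As a hypothetical minimal driver, assuming only the message shapes visible above ("ready", "process"/"results"/"error", "shutdown") — the chunk payload and batchId are made-up illustrations:

// Hypothetical driver sketch (not the package's actual index-codebase.js code).
import { Worker } from "worker_threads";

const worker = new Worker(new URL("./embedding-worker.js", import.meta.url), {
  workerData: { embeddingModel: "Xenova/all-MiniLM-L6-v2" },
});

worker.on("message", (msg) => {
  if (msg.type === "ready") {
    // Model loaded; send one batch of chunks to embed.
    worker.postMessage({
      type: "process",
      batchId: 1, // illustrative; echoed back with the results
      chunks: [{ file: "a.js", startLine: 1, endLine: 1, text: "const x = 1;" }],
    });
  } else if (msg.type === "results") {
    console.log(msg.results[0].vector.length); // 384 for all-MiniLM-L6-v2
    worker.postMessage({ type: "shutdown" });
  }
});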
package/lib/tokenizer.js
ADDED
@@ -0,0 +1,142 @@
+ /**
+  * Token estimation and limits for embedding models
+  *
+  * This module provides token counting utilities and model-specific limits
+  * to ensure text chunks don't exceed the model's maximum sequence length.
+  */
+
+ /**
+  * Token limits for supported embedding models
+  * Each model has its own maximum sequence length
+  */
+ export const MODEL_TOKEN_LIMITS = {
+   // Sentence Transformers / MiniLM family
+   "Xenova/all-MiniLM-L6-v2": 256,
+   "Xenova/all-MiniLM-L12-v2": 256,
+   "Xenova/paraphrase-MiniLM-L6-v2": 128,
+   "Xenova/paraphrase-MiniLM-L3-v2": 128,
+
+   // MPNet models
+   "Xenova/all-mpnet-base-v2": 384,
+   "Xenova/paraphrase-mpnet-base-v2": 384,
+
+   // Multilingual models
+   "Xenova/paraphrase-multilingual-MiniLM-L12-v2": 128,
+   "Xenova/paraphrase-multilingual-mpnet-base-v2": 256,
+
+   // Code-specific models
+   "Xenova/codebert-base": 512,
+   "Xenova/graphcodebert-base": 512,
+
+   // E5 models
+   "Xenova/e5-small-v2": 512,
+   "Xenova/e5-base-v2": 512,
+   "Xenova/e5-large-v2": 512,
+
+   // BGE models
+   "Xenova/bge-small-en-v1.5": 512,
+   "Xenova/bge-base-en-v1.5": 512,
+   "Xenova/bge-large-en-v1.5": 512,
+
+   // Default fallback
+   "default": 256
+ };
+
+ /**
+  * Get the maximum token limit for a given model
+  * Case-insensitive lookup for robustness
+  * @param {string} modelName - The model name (e.g., "Xenova/all-MiniLM-L6-v2")
+  * @returns {number} Maximum tokens supported by the model
+  */
+ export function getModelTokenLimit(modelName) {
+   if (!modelName) return MODEL_TOKEN_LIMITS["default"];
+
+   // Direct match first (fastest)
+   if (MODEL_TOKEN_LIMITS[modelName] !== undefined) {
+     return MODEL_TOKEN_LIMITS[modelName];
+   }
+
+   // Case-insensitive search
+   const normalizedName = modelName.toLowerCase();
+   for (const [key, value] of Object.entries(MODEL_TOKEN_LIMITS)) {
+     if (key.toLowerCase() === normalizedName) {
+       return value;
+     }
+   }
+
+   return MODEL_TOKEN_LIMITS["default"];
+ }
+
+ /**
+  * Get chunking parameters for a model
+  * Returns target and overlap tokens based on the model's limit
+  * @param {string} modelName - The model name
+  * @returns {{ maxTokens: number, targetTokens: number, overlapTokens: number }}
+  */
+ export function getChunkingParams(modelName) {
+   const maxTokens = getModelTokenLimit(modelName);
+
+   // Target: 85% of max to leave safety buffer
+   const targetTokens = Math.floor(maxTokens * 0.85);
+
+   // Overlap: 15-20% of target for context continuity
+   const overlapTokens = Math.floor(targetTokens * 0.18);
+
+   return {
+     maxTokens,
+     targetTokens,
+     overlapTokens
+   };
+ }
+
+ /**
+  * Estimate token count for text (conservative estimate for code)
+  * Uses a simple heuristic: counts words, special characters, and estimates subwords
+  *
+  * This is conservative - actual tokenizers may produce fewer tokens.
+  * For most accurate results, use the actual tokenizer, but this is much faster.
+  *
+  * @param {string} text - The text to estimate tokens for
+  * @returns {number} Estimated token count
+  */
+ export function estimateTokens(text) {
+   if (!text || text.length === 0) return 0;
+
+   // Count words (split by whitespace)
+   const words = text.split(/\s+/).filter(w => w.length > 0);
+
+   // Count special characters/punctuation that often become separate tokens
+   const specialChars = (text.match(/[{}()\[\];:,.<>!=+\-*\/%&|^~@#$"'`\\]/g) || []).length;
+
+   // Estimate: words + special chars + 2 (for [CLS] and [SEP] special tokens)
+   // For long words, add extra tokens due to subword tokenization
+   let tokenCount = 2; // [CLS] and [SEP]
+
+   for (const word of words) {
+     if (word.length <= 4) {
+       tokenCount += 1;
+     } else if (word.length <= 10) {
+       tokenCount += 2;
+     } else {
+       // Long words get split into ~4-char subwords
+       tokenCount += Math.ceil(word.length / 4);
+     }
+   }
+
+   // Many special chars merge with adjacent tokens, so count ~50%
+   tokenCount += Math.floor(specialChars * 0.5);
+
+   return tokenCount;
+ }
+
+ /**
+  * Check if text exceeds the token limit for a model
+  * @param {string} text - The text to check
+  * @param {string} modelName - The model name
+  * @returns {boolean} True if the text exceeds the limit
+  */
+ export function exceedsTokenLimit(text, modelName) {
+   const limit = getModelTokenLimit(modelName);
+   const tokens = estimateTokens(text);
+   return tokens > limit;
+ }
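
The numbers these helpers produce follow directly from the code above; a small usage sketch (the snippet is arbitrary, and estimateTokens is a heuristic, not a real tokenizer):

import { getChunkingParams, estimateTokens, exceedsTokenLimit } from "./lib/tokenizer.js";

console.log(getChunkingParams("Xenova/all-MiniLM-L6-v2"));
// { maxTokens: 256, targetTokens: 217, overlapTokens: 39 }

const snippet = "function add(a, b) { return a + b; }";
console.log(estimateTokens(snippet));                               // 17 under this heuristic
console.log(exceedsTokenLimit(snippet, "Xenova/all-MiniLM-L6-v2")); // false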
package/lib/utils.js
CHANGED
@@ -1,5 +1,9 @@
  import crypto from "crypto";
  import path from "path";
+ import { estimateTokens, getChunkingParams, getModelTokenLimit } from "./tokenizer.js";
+
+ // Re-export tokenizer utilities
+ export { estimateTokens, getChunkingParams, getModelTokenLimit, MODEL_TOKEN_LIMITS } from "./tokenizer.js";

  /**
   * Calculate cosine similarity between two vectors
@@ -22,13 +26,22 @@ export function hashContent(content) {
  }

  /**
-  * Intelligent chunking
+  * Intelligent chunking with token limit awareness
+  * Tries to split by function/class boundaries while respecting token limits
+  *
+  * @param {string} content - File content to chunk
+  * @param {string} file - File path (for language detection)
+  * @param {object} config - Configuration object with embeddingModel
+  * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
   */
  export function smartChunk(content, file, config) {
    const lines = content.split("\n");
    const chunks = [];
    const ext = path.extname(file);

+   // Get model-specific chunking parameters
+   const { targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
+
    // Language-specific patterns for function/class detection
    const patterns = {
      // JavaScript/TypeScript
@@ -42,6 +55,7 @@ export function smartChunk(content, file, config) {
      // Python
      py: /^(class|def|async\s+def)\s+\w+/,
      pyw: /^(class|def|async\s+def)\s+\w+/,
+     pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython

      // Java/Kotlin/Scala
      java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
@@ -56,70 +70,144 @@ export function smartChunk(content, file, config) {
      cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
      h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
      hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
+     hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,

      // C#
      cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
+     csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,

      // Go
      go: /^(func|type|const|var)\s+\w+/,

      // Rust
-     rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static)\s+\w+/,
+     rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,

      // PHP
      php: /^(class|interface|trait|function|const)\s+\w+/,
+     phtml: /^(<\?php|class|interface|trait|function)\s*/,

      // Ruby
      rb: /^(class|module|def)\s+\w+/,
-     rake: /^(class|module|def|task)\s+\w+/,
+     rake: /^(class|module|def|task|namespace)\s+\w+/,

      // Swift
-     swift: /^(class|struct|enum|protocol|func|var|let)\s+\w+/,
+     swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,

      // R
-     r: /^(\w+)\s
-     R: /^(\w+)\s
+     r: /^(\w+)\s*(<-|=)\s*function/,
+     R: /^(\w+)\s*(<-|=)\s*function/,

      // Lua
      lua: /^(function|local\s+function)\s+\w+/,
+
+     // Shell scripts
+     sh: /^(\w+\s*\(\)|function\s+\w+)/,
+     bash: /^(\w+\s*\(\)|function\s+\w+)/,
+     zsh: /^(\w+\s*\(\)|function\s+\w+)/,
+     fish: /^function\s+\w+/,
+
+     // CSS/Styles
+     css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
+     scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
+     sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
+     less: /^(@\w+:|\.|\#|@media)\s*/,
+     styl: /^(\$\w+\s*=|\w+\(|\.|\#)\s*/,

+     // Markup/HTML
+     html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
+     htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
+     xml: /^(<\w+|\s*<!\[CDATA\[)/,
+     svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
+
+     // Config files
+     json: /^(\s*"[\w-]+"\s*:\s*[\[{])/,
+     yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
+     yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
+     toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
+     ini: /^(\[\w+\]|\w+\s*=)/,
+     env: /^[A-Z_][A-Z0-9_]*=/,
+
+     // Documentation
+     md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
+     mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
+     txt: /^.{50,}/, // Split on long paragraphs
+     rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
+
+     // Database
+     sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
+
+     // Perl
+     pl: /^(sub|package|use|require)\s+\w+/,
+     pm: /^(sub|package|use|require)\s+\w+/,
+
+     // Vim
+     vim: /^(function|command|autocmd|let\s+g:)\s*/,
    };

-
    const langPattern = patterns[ext.slice(1)] || patterns.js;
    let currentChunk = [];
    let chunkStartLine = 0;
+   let currentTokenCount = 0;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
-
-
-     // Check if
-     const
+     const lineTokens = estimateTokens(line);
+
+     // Check if adding this line would exceed token limit
+     const wouldExceedLimit = (currentTokenCount + lineTokens) > targetTokens;
+
+     // Check if this is a good split point (function/class boundary)
+     const isGoodSplitPoint =
        langPattern.test(line.trim()) &&
-       currentChunk.length >
+       currentChunk.length > 3; // At least a few lines before splitting
+
+     // Split if we exceed limit OR at a good split point when near limit
+     const shouldSplit = wouldExceedLimit || (isGoodSplitPoint && currentTokenCount > targetTokens * 0.6);

-     if (shouldSplit
-
+     if (shouldSplit && currentChunk.length > 0) {
+       const chunkText = currentChunk.join("\n");
+       if (chunkText.trim().length > 20) {
          chunks.push({
-           text:
+           text: chunkText,
            startLine: chunkStartLine + 1,
-           endLine: i
+           endLine: i,
+           tokenCount: currentTokenCount
          });
        }

-       //
-
-
+       // Calculate overlap: keep last N lines that fit within overlapTokens
+       let overlapLines = [];
+       let overlapTokensCount = 0;
+       for (let j = currentChunk.length - 1; j >= 0 && overlapTokensCount < overlapTokens; j--) {
+         const lineT = estimateTokens(currentChunk[j]);
+         if (overlapTokensCount + lineT <= overlapTokens) {
+           overlapLines.unshift(currentChunk[j]);
+           overlapTokensCount += lineT;
+         } else {
+           break;
+         }
+       }
+
+       currentChunk = overlapLines;
+       currentTokenCount = overlapTokensCount;
+       chunkStartLine = i - overlapLines.length;
      }
+
+     currentChunk.push(line);
+     currentTokenCount += lineTokens;
    }

    // Add remaining chunk
-   if (currentChunk.length > 0
-
-
-
-
-
+   if (currentChunk.length > 0) {
+     const chunkText = currentChunk.join("\n");
+     if (chunkText.trim().length > 20) {
+       chunks.push({
+         text: chunkText,
+         startLine: chunkStartLine + 1,
+         endLine: lines.length,
+         tokenCount: currentTokenCount
+       });
+     }
    }

    return chunks;
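
A sketch of calling the reworked smartChunk; this assumes embeddingModel is the only config field the function reads, which holds for the hunk shown but is not verified against the unchanged parts of the file:

import { smartChunk } from "./lib/utils.js";

const source = [
  "function first() {",
  "  return 1;",
  "}",
  "",
  "function second() {",
  "  return 2;",
  "}",
].join("\n");

// Assumption: only embeddingModel is consulted (true for the code shown above).
const chunks = smartChunk(source, "example.js", { embeddingModel: "Xenova/all-MiniLM-L6-v2" });
for (const c of chunks) {
  console.log(c.startLine, c.endLine, c.tokenCount);
}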
package/package.json
CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "smart-coding-mcp",
-   "version": "1.2.4",
+   "version": "1.3.1",
    "description": "An extensible MCP server that enhances coding productivity with AI-powered features including semantic code search, intelligent indexing, and more, using local LLMs",
    "type": "module",
    "main": "index.js",
@@ -10,6 +10,8 @@
    "scripts": {
      "start": "node index.js",
      "dev": "node --watch index.js",
+     "test": "vitest run",
+     "test:watch": "vitest",
      "clear-cache": "node scripts/clear-cache.js"
    },
    "keywords": [
@@ -45,10 +47,14 @@
    "dependencies": {
      "@modelcontextprotocol/sdk": "^1.0.4",
      "@xenova/transformers": "^2.17.2",
-     "
-     "
+     "chokidar": "^3.5.3",
+     "fdir": "^6.5.0",
+     "glob": "^10.3.10"
    },
    "engines": {
      "node": ">=18.0.0"
+   },
+   "devDependencies": {
+     "vitest": "^4.0.16"
    }
  }
|