smart-coding-mcp 1.2.4 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -167
- package/config.json +4 -3
- package/example.png +0 -0
- package/features/index-codebase.js +445 -28
- package/how-its-works.png +0 -0
- package/index.js +1 -1
- package/lib/config.js +27 -3
- package/lib/embedding-worker.js +67 -0
- package/lib/tokenizer.js +142 -0
- package/lib/utils.js +113 -25
- package/package.json +4 -3
package/README.md
CHANGED
@@ -8,6 +8,8 @@ AI coding assistants work better when they can find relevant code quickly. Tradi…
 
 This MCP server solves that by indexing your codebase with AI embeddings. Your AI assistant can search by meaning instead of exact keywords, finding relevant code even when the terminology differs.
 
+[image]
+
 ## Why Use This
 
 **Better Code Understanding**
@@ -36,6 +38,12 @@ Install globally via npm:
 npm install -g smart-coding-mcp
 ```
 
+To update to the latest version:
+
+```bash
+npm update -g smart-coding-mcp
+```
+
 ## Configuration
 
 Add to your MCP configuration file. The location depends on your IDE and OS:
@@ -80,33 +88,23 @@ Add the server configuration to the `mcpServers` object in your config file:
 }
 ```
 
-### Option 3: Auto-Detect Current Directory
-
-```json
-{
-  "mcpServers": {
-    "smart-coding-mcp": {
-      "command": "smart-coding-mcp"
-    }
-  }
-}
-```
-
 ## Environment Variables
 
 Override configuration settings via environment variables in your MCP config:
 
-| Variable | Type | Default
-| -------------------------------- | ------- |
-| `SMART_CODING_VERBOSE` | boolean | `false`
-| `SMART_CODING_BATCH_SIZE` | number | `100`
-| `SMART_CODING_MAX_FILE_SIZE` | number | `1048576`
-| `SMART_CODING_CHUNK_SIZE` | number | `…
-| `SMART_CODING_MAX_RESULTS` | number | `5`
-| `SMART_CODING_SMART_INDEXING` | boolean | `true`
-| `SMART_CODING_WATCH_FILES` | boolean | `false`
-| `SMART_CODING_SEMANTIC_WEIGHT` | number | `0.7`
-| `SMART_CODING_EXACT_MATCH_BOOST` | number | `1.5`
+| Variable | Type | Default | Description |
+| -------------------------------- | ------- | ------------------------- | ------------------------------------- |
+| `SMART_CODING_VERBOSE` | boolean | `false` | Enable detailed logging |
+| `SMART_CODING_BATCH_SIZE` | number | `100` | Files to process in parallel |
+| `SMART_CODING_MAX_FILE_SIZE` | number | `1048576` | Max file size in bytes (1MB) |
+| `SMART_CODING_CHUNK_SIZE` | number | `25` | Lines of code per chunk |
+| `SMART_CODING_MAX_RESULTS` | number | `5` | Max search results |
+| `SMART_CODING_SMART_INDEXING` | boolean | `true` | Enable smart project detection |
+| `SMART_CODING_WATCH_FILES` | boolean | `false` | Enable file watching for auto-reindex |
+| `SMART_CODING_SEMANTIC_WEIGHT` | number | `0.7` | Weight for semantic similarity (0-1) |
+| `SMART_CODING_EXACT_MATCH_BOOST` | number | `1.5` | Boost for exact text matches |
+| `SMART_CODING_EMBEDDING_MODEL` | string | `Xenova/all-MiniLM-L6-v2` | AI embedding model to use |
+| `SMART_CODING_WORKER_THREADS` | string | `auto` | Worker threads (`auto` or 1-32) |
 
 **Example with environment variables:**
 
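As a concrete illustration of the table above, environment variables sit under `env` in the server entry of the MCP config. This is a minimal sketch — the exact `mcpServers` shape varies by client, and the values here are examples only:

```json
{
  "mcpServers": {
    "smart-coding-mcp": {
      "command": "smart-coding-mcp",
      "env": {
        "SMART_CODING_VERBOSE": "true",
        "SMART_CODING_WORKER_THREADS": "4"
      }
    }
  }
}
```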
@@ -160,60 +158,7 @@ The server indexes your code in four steps:
 
 When you search, your query is converted to the same vector format and compared against all code chunks using cosine similarity. The most relevant matches are returned.
 
-
-The server detects your project type by looking for marker files and automatically applies appropriate ignore patterns:
-
-**JavaScript/Node** (package.json found)
-
-- Ignores: node_modules, dist, build, .next, coverage
-
-**Python** (requirements.txt or pyproject.toml)
-
-- Ignores: __pycache__, venv, .pytest_cache, .tox
-
-**Android** (build.gradle)
-
-- Ignores: .gradle, build artifacts, generated code
-
-**iOS** (Podfile)
-
-- Ignores: Pods, DerivedData, xcuserdata
-
-**And more**: Go, PHP, Rust, Ruby, .NET
-
-This typically reduces indexed file count by 100x. A project with 50,000 files (including node_modules) indexes just 500 actual source files.
-
-## Configuration
-
-The server works out of the box with sensible defaults. Create a `config.json` file in your workspace to customize:
-
-```json
-{
-  "searchDirectory": ".",
-  "fileExtensions": ["js", "ts", "py", "java", "go"],
-  "excludePatterns": ["**/my-custom-ignore/**"],
-  "smartIndexing": true,
-  "verbose": false,
-  "enableCache": true,
-  "cacheDirectory": "./.smart-coding-cache",
-  "watchFiles": true,
-  "chunkSize": 15,
-  "batchSize": 100,
-  "maxFileSize": 1048576,
-  "maxResults": 5
-}
-```
-
-**Key options:**
-
-- `smartIndexing`: Enable automatic project type detection and smart ignore patterns (default: true)
-- `verbose`: Show detailed indexing logs (default: false)
-- `watchFiles`: Automatically reindex when files change (default: true)
-- `enableCache`: Cache embeddings to disk (default: true)
-- `chunkSize`: Lines of code per chunk - smaller = more precise, larger = more context (default: 15)
-- `batchSize`: Number of files to process in parallel (default: 100)
-- `maxFileSize`: Skip files larger than this size in bytes (default: 1MB)
+[image]
 
 ## Examples
 
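The cosine-similarity comparison mentioned in the hunk above is easy to sketch. `lib/utils.js` ships its own implementation, so this is an illustration of the idea rather than the package's exact code:

```js
// Minimal sketch: cosine similarity between two embedding vectors.
// Vectors from the embedder are produced with { normalize: true }, in which
// case the dot product alone already equals the cosine similarity.
function cosineSimilarity(a, b) {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
```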
@@ -243,85 +188,6 @@ Query: "error handling and exceptions"
 
 Finds all try/catch blocks and error handling patterns.
 
-## Performance
-
-Tested on a typical JavaScript project:
-
-| Metric | Without Smart Indexing | With Smart Indexing |
-| -------------- | ---------------------- | ------------------- |
-| Files scanned | 50,000+ | 500 |
-| Indexing time | 10+ min | 2-3 min |
-| Memory usage | 2GB+ | ~200MB |
-| Search latency | N/A | <100ms |
-
-## Supported File Types
-
-Languages: JavaScript, TypeScript, Python, Java, Kotlin, Scala, C, C++, C#, Go, Rust, Ruby, PHP, Swift, Shell
-
-Web: HTML, CSS, SCSS, Sass, XML, SVG
-
-Config/Data: JSON, YAML, TOML, SQL
-
-Total: 36 file extensions
-
-## Architecture
-
-```
-smart-coding-mcp/
-├── index.js                 # MCP server entry point
-├── lib/
-│   ├── config.js            # Configuration + smart detection
-│   ├── cache.js             # Embeddings persistence
-│   ├── utils.js             # Smart chunking
-│   ├── ignore-patterns.js   # Language-specific patterns
-│   └── project-detector.js  # Project type detection
-└── features/
-    ├── hybrid-search.js     # Semantic + exact match search
-    ├── index-codebase.js    # File indexing + watching
-    └── clear-cache.js       # Cache management
-```
-
-The modular design makes it easy to add new features. See ARCHITECTURE.md for implementation details.
-
-## Troubleshooting
-
-**"Server can't find config.json"**
-
-Make sure `cwd` is set in your MCP configuration to the full path of smart-coding-mcp.
-
-**"Indexing takes too long"**
-
-- Verify `smartIndexing` is enabled
-- Add more patterns to `excludePatterns`
-- Reduce `fileExtensions` to only what you need
-
-**"Search results aren't relevant"**
-
-- Try more specific queries
-- Increase `maxResults` to see more options
-- Run `index_codebase` to force a full reindex
-
-**"Cache corruption errors"**
-
-Use the `clear_cache` tool or run:
-
-```bash
-npm run clear-cache
-```
-
-## CLI Commands
-
-```bash
-# Start the server
-npm start
-
-# Development mode with auto-restart
-npm run dev
-
-# Clear embeddings cache
-npm run clear-cache
-```
-
 ## Privacy
 
 - AI model runs entirely on your machine
@@ -353,17 +219,6 @@ This project builds on research from Cursor showing that semantic search improve…
 
 See: https://cursor.com/blog/semsearch
 
-## Contributing
-
-Contributions are welcome. See CONTRIBUTING.md for guidelines.
-
-Potential areas for improvement:
-
-- Additional language support
-- Code complexity analysis
-- Refactoring pattern detection
-- Documentation generation
-
 ## License
 
 MIT License
package/config.json
CHANGED

@@ -50,8 +50,8 @@
     "**/.smart-coding-cache/**"
   ],
   "smartIndexing": true,
-  "chunkSize": …
-  "chunkOverlap": …
+  "chunkSize": 25,
+  "chunkOverlap": 5,
   "batchSize": 100,
   "maxFileSize": 1048576,
   "maxResults": 5,

@@ -61,5 +61,6 @@
   "verbose": false,
   "embeddingModel": "Xenova/all-MiniLM-L6-v2",
   "semanticWeight": 0.7,
-  "exactMatchBoost": 1.5
+  "exactMatchBoost": 1.5,
+  "workerThreads": "auto"
 }
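A note on the new `workerThreads` setting: `"auto"` is resolved by the indexer at startup (see `features/index-codebase.js` below), roughly as:

```js
import os from "os";

// `config` here stands for the loaded configuration object.
// "auto" -> one worker per CPU core, reserving one core for the main thread.
const numWorkers = config.workerThreads === "auto"
  ? Math.max(1, os.cpus().length - 1)
  : (config.workerThreads || 1);
```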
package/example.png
ADDED
Binary file

package/features/index-codebase.js
CHANGED
@@ -1,15 +1,243 @@
-import { …
+import { fdir } from "fdir";
 import fs from "fs/promises";
 import chokidar from "chokidar";
 import path from "path";
+import os from "os";
+import { Worker } from "worker_threads";
+import { fileURLToPath } from "url";
 import { smartChunk, hashContent } from "../lib/utils.js";
 
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
 export class CodebaseIndexer {
-  constructor(embedder, cache, config) {
+  constructor(embedder, cache, config, server = null) {
     this.embedder = embedder;
     this.cache = cache;
     this.config = config;
+    this.server = server;
     this.watcher = null;
+    this.workers = [];
+    this.workerReady = [];
+  }
+
+  /**
+   * Initialize worker thread pool for parallel embedding
+   */
+  async initializeWorkers() {
+    const numWorkers = this.config.workerThreads === "auto"
+      ? Math.max(1, os.cpus().length - 1)
+      : (this.config.workerThreads || 1);
+
+    // Only use workers if we have more than 1 CPU
+    if (numWorkers <= 1) {
+      console.error("[Indexer] Single-threaded mode (1 CPU detected)");
+      return;
+    }
+
+    if (this.config.verbose) {
+      console.error(`[Indexer] Worker config: workerThreads=${this.config.workerThreads}, resolved to ${numWorkers}`);
+    }
+
+    console.error(`[Indexer] Initializing ${numWorkers} worker threads...`);
+
+    const workerPath = path.join(__dirname, "../lib/embedding-worker.js");
+
+    for (let i = 0; i < numWorkers; i++) {
+      try {
+        const worker = new Worker(workerPath, {
+          workerData: {
+            embeddingModel: this.config.embeddingModel,
+            verbose: this.config.verbose
+          }
+        });
+
+        const readyPromise = new Promise((resolve, reject) => {
+          const timeout = setTimeout(() => reject(new Error("Worker init timeout")), 120000);
+
+          worker.once("message", (msg) => {
+            clearTimeout(timeout);
+            if (msg.type === "ready") {
+              resolve(worker);
+            } else if (msg.type === "error") {
+              reject(new Error(msg.error));
+            }
+          });
+
+          worker.once("error", (err) => {
+            clearTimeout(timeout);
+            reject(err);
+          });
+        });
+
+        this.workers.push(worker);
+        this.workerReady.push(readyPromise);
+      } catch (err) {
+        console.error(`[Indexer] Failed to create worker ${i}: ${err.message}`);
+      }
+    }
+
+    // Wait for all workers to be ready
+    try {
+      await Promise.all(this.workerReady);
+      console.error(`[Indexer] ${this.workers.length} workers ready`);
+      if (this.config.verbose) {
+        console.error(`[Indexer] Each worker loaded model: ${this.config.embeddingModel}`);
+      }
+    } catch (err) {
+      console.error(`[Indexer] Worker initialization failed: ${err.message}, falling back to single-threaded`);
+      this.terminateWorkers();
+    }
+  }
+
+  /**
+   * Terminate all worker threads
+   */
+  terminateWorkers() {
+    for (const worker of this.workers) {
+      worker.postMessage({ type: "shutdown" });
+    }
+    this.workers = [];
+    this.workerReady = [];
+  }
+
+  /**
+   * Send MCP progress notification to connected clients
+   */
+  sendProgress(progress, total, message) {
+    if (this.server) {
+      try {
+        this.server.sendNotification("notifications/progress", {
+          progressToken: "indexing",
+          progress,
+          total,
+          message
+        });
+      } catch (err) {
+        // Silently ignore if client doesn't support progress notifications
+      }
+    }
+  }
+
+  /**
+   * Process chunks using worker thread pool with timeout and error recovery
+   */
+  async processChunksWithWorkers(allChunks) {
+    if (this.workers.length === 0) {
+      // Fallback to single-threaded processing
+      return this.processChunksSingleThreaded(allChunks);
+    }
+
+    const results = [];
+    const chunkSize = Math.ceil(allChunks.length / this.workers.length);
+    const workerPromises = [];
+    const WORKER_TIMEOUT = 300000; // 5 minutes per batch
+
+    if (this.config.verbose) {
+      console.error(`[Indexer] Distributing ${allChunks.length} chunks across ${this.workers.length} workers (~${chunkSize} chunks each)`);
+    }
+
+    for (let i = 0; i < this.workers.length; i++) {
+      const workerChunks = allChunks.slice(i * chunkSize, (i + 1) * chunkSize);
+      if (workerChunks.length === 0) continue;
+
+      if (this.config.verbose) {
+        console.error(`[Indexer] Worker ${i}: processing ${workerChunks.length} chunks`);
+      }
+
+      const promise = new Promise((resolve, reject) => {
+        const worker = this.workers[i];
+        const batchId = `batch-${i}-${Date.now()}`;
+
+        // Timeout handler
+        const timeout = setTimeout(() => {
+          worker.off("message", handler);
+          console.error(`[Indexer] Worker ${i} timed out, falling back to single-threaded for this batch`);
+          // Return empty and let fallback handle it
+          resolve([]);
+        }, WORKER_TIMEOUT);
+
+        const handler = (msg) => {
+          if (msg.batchId === batchId) {
+            clearTimeout(timeout);
+            worker.off("message", handler);
+            if (msg.type === "results") {
+              resolve(msg.results);
+            } else if (msg.type === "error") {
+              console.error(`[Indexer] Worker ${i} error: ${msg.error}`);
+              resolve([]); // Return empty, don't reject - let fallback handle
+            }
+          }
+        };
+
+        // Handle worker crash
+        const errorHandler = (err) => {
+          clearTimeout(timeout);
+          worker.off("message", handler);
+          console.error(`[Indexer] Worker ${i} crashed: ${err.message}`);
+          resolve([]); // Return empty, don't reject
+        };
+        worker.once("error", errorHandler);
+
+        worker.on("message", handler);
+        worker.postMessage({ type: "process", chunks: workerChunks, batchId });
+      });
+
+      workerPromises.push({ promise, chunks: workerChunks });
+    }
+
+    // Wait for all workers with error recovery
+    const workerResults = await Promise.all(workerPromises.map(p => p.promise));
+
+    // Collect results and identify failed chunks that need retry
+    const failedChunks = [];
+    for (let i = 0; i < workerResults.length; i++) {
+      if (workerResults[i].length > 0) {
+        results.push(...workerResults[i]);
+      } else if (workerPromises[i].chunks.length > 0) {
+        // Worker failed or timed out, need to retry these chunks
+        failedChunks.push(...workerPromises[i].chunks);
+      }
+    }
+
+    // Retry failed chunks with single-threaded fallback
+    if (failedChunks.length > 0) {
+      console.error(`[Indexer] Retrying ${failedChunks.length} chunks with single-threaded fallback...`);
+      const retryResults = await this.processChunksSingleThreaded(failedChunks);
+      results.push(...retryResults);
+    }
+
+    return results;
+  }
+
+  /**
+   * Single-threaded chunk processing (fallback)
+   */
+  async processChunksSingleThreaded(chunks) {
+    const results = [];
+
+    for (const chunk of chunks) {
+      try {
+        const output = await this.embedder(chunk.text, { pooling: "mean", normalize: true });
+        results.push({
+          file: chunk.file,
+          startLine: chunk.startLine,
+          endLine: chunk.endLine,
+          content: chunk.text,
+          vector: Array.from(output.data),
+          success: true
+        });
+      } catch (error) {
+        results.push({
+          file: chunk.file,
+          startLine: chunk.startLine,
+          endLine: chunk.endLine,
+          error: error.message,
+          success: false
+        });
+      }
+    }
+
+    return results;
   }
 
   async indexFile(file) {
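The batching protocol above pairs each `postMessage({ type: "process", ... })` with a `batchId`-tagged response. A minimal standalone round-trip, assuming the worker path and model shown in this diff (the chunk payload is invented for illustration):

```js
import { Worker } from "worker_threads";

// Spawn one embedding worker; workerData mirrors what the indexer passes.
const worker = new Worker("./lib/embedding-worker.js", {
  workerData: { embeddingModel: "Xenova/all-MiniLM-L6-v2", verbose: false }
});

worker.once("message", (msg) => {
  if (msg.type !== "ready") return;
  const batchId = "demo-batch-1";
  worker.on("message", (res) => {
    if (res.batchId === batchId && res.type === "results") {
      console.log(`embedded ${res.results.length} chunks`);
      worker.postMessage({ type: "shutdown" });
    }
  });
  // Hypothetical chunk shaped like the indexer's payloads
  worker.postMessage({
    type: "process",
    batchId,
    chunks: [{ file: "demo.js", text: "function add(a, b) { return a + b; }", startLine: 1, endLine: 1 }]
  });
});
```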
@@ -83,46 +311,235 @@ export class CodebaseIndexer {
     }
   }
 
-
+  /**
+   * Discover files using fdir (3-5x faster than glob)
+   * Uses config.excludePatterns which includes smart patterns from ignore-patterns.js
+   */
+  async discoverFiles() {
+    const startTime = Date.now();
 
-    const …
-      ignore: this.config.excludePatterns,
-      absolute: true
-    });
-
-    console.error(`[Indexer] Found ${files.length} files to process`);
+    // Build extension filter from config
+    const extensions = new Set(this.config.fileExtensions.map(ext => `.${ext}`));
 
-
+    // Extract directory names from glob patterns in config.excludePatterns
+    // Patterns like "**/node_modules/**" -> "node_modules"
+    const excludeDirs = new Set();
+    for (const pattern of this.config.excludePatterns) {
+      // Extract directory names from glob patterns
+      const match = pattern.match(/\*\*\/([^/*]+)\/?\*?\*?$/);
+      if (match) {
+        excludeDirs.add(match[1]);
+      }
+      // Also handle patterns like "**/dirname/**"
+      const match2 = pattern.match(/\*\*\/([^/*]+)\/\*\*$/);
+      if (match2) {
+        excludeDirs.add(match2[1]);
+      }
+    }
+
+    // Always exclude cache directory
+    excludeDirs.add(".smart-coding-cache");
 
-
+    if (this.config.verbose) {
+      console.error(`[Indexer] Using ${excludeDirs.size} exclude directories from config`);
+    }
+
+    const api = new fdir()
+      .withFullPaths()
+      .exclude((dirName) => excludeDirs.has(dirName))
+      .filter((filePath) => extensions.has(path.extname(filePath)))
+      .crawl(this.config.searchDirectory);
+
+    const files = await api.withPromise();
+
+    console.error(`[Indexer] File discovery: ${files.length} files in ${Date.now() - startTime}ms`);
+    return files;
+  }
+
+  /**
+   * Pre-filter files by hash (skip unchanged files before processing)
+   */
+  async preFilterFiles(files) {
+    const startTime = Date.now();
+    const filesToProcess = [];
+    const skippedCount = { unchanged: 0, tooLarge: 0, error: 0 };
+
+    // Process in parallel batches for speed
+    const BATCH_SIZE = 500;
 
     for (let i = 0; i < files.length; i += BATCH_SIZE) {
       const batch = files.slice(i, i + BATCH_SIZE);
 
-      // Process batch in parallel
       const results = await Promise.all(
-        batch.map(file => …
+        batch.map(async (file) => {
+          try {
+            const stats = await fs.stat(file);
+
+            if (stats.isDirectory()) {
+              return null;
+            }
+
+            if (stats.size > this.config.maxFileSize) {
+              skippedCount.tooLarge++;
+              return null;
+            }
+
+            const content = await fs.readFile(file, "utf-8");
+            const hash = hashContent(content);
+
+            if (this.cache.getFileHash(file) === hash) {
+              skippedCount.unchanged++;
+              return null;
+            }
+
+            return { file, content, hash };
+          } catch (error) {
+            skippedCount.error++;
+            return null;
+          }
+        })
       );
-
-      totalChunks += chunksAdded;
-      processedFiles++;
-      if (chunksAdded === 0) skippedFiles++;
+
+      for (const result of results) {
+        if (result) filesToProcess.push(result);
       }
+    }
+
+    console.error(`[Indexer] Pre-filter: ${filesToProcess.length} changed, ${skippedCount.unchanged} unchanged, ${skippedCount.tooLarge} too large, ${skippedCount.error} errors (${Date.now() - startTime}ms)`);
+    return filesToProcess;
+  }
+
+  async indexAll() {
+    const totalStartTime = Date.now();
+    console.error(`[Indexer] Starting optimized indexing in ${this.config.searchDirectory}...`);
+
+    // Step 1: Fast file discovery with fdir
+    const files = await this.discoverFiles();
+
+    if (files.length === 0) {
+      console.error("[Indexer] No files found to index");
+      this.sendProgress(100, 100, "No files found to index");
+      return;
+    }
+
+    // Send progress: discovery complete
+    this.sendProgress(5, 100, `Discovered ${files.length} files`);
+
+    // Step 2: Pre-filter unchanged files (early hash check)
+    const filesToProcess = await this.preFilterFiles(files);
+
+    if (filesToProcess.length === 0) {
+      console.error("[Indexer] All files unchanged, nothing to index");
+      this.sendProgress(100, 100, "All files up to date");
+      await this.cache.save();
+      return;
+    }
+
+    // Send progress: filtering complete
+    this.sendProgress(10, 100, `Processing ${filesToProcess.length} changed files`);
+
+    // Step 3: Determine batch size based on project size
+    const adaptiveBatchSize = files.length > 10000 ? 500 :
+                              files.length > 1000 ? 200 :
+                              this.config.batchSize || 100;
+
+    console.error(`[Indexer] Processing ${filesToProcess.length} files (batch size: ${adaptiveBatchSize})`);
+
+    // Step 4: Initialize worker threads (always use when multi-core available)
+    const useWorkers = os.cpus().length > 1;
+
+    if (useWorkers) {
+      await this.initializeWorkers();
+      console.error(`[Indexer] Multi-threaded mode: ${this.workers.length} workers active`);
+    } else {
+      console.error(`[Indexer] Single-threaded mode (single-core system)`);
+    }
+
+    let totalChunks = 0;
+    let processedFiles = 0;
+
+    // Step 5: Process files in adaptive batches
+    for (let i = 0; i < filesToProcess.length; i += adaptiveBatchSize) {
+      const batch = filesToProcess.slice(i, i + adaptiveBatchSize);
 
-      // …
-
+      // Generate all chunks for this batch
+      const allChunks = [];
+
+      for (const { file, content, hash } of batch) {
+        // Remove old chunks for this file
+        this.cache.removeFileFromStore(file);
+
+        const chunks = smartChunk(content, file, this.config);
+
+        for (const chunk of chunks) {
+          allChunks.push({
+            file,
+            text: chunk.text,
+            startLine: chunk.startLine,
+            endLine: chunk.endLine,
+            hash
+          });
+        }
+      }
+
+      // Process chunks (with workers if available, otherwise single-threaded)
+      let results;
+      if (useWorkers && this.workers.length > 0) {
+        results = await this.processChunksWithWorkers(allChunks);
+      } else {
+        results = await this.processChunksSingleThreaded(allChunks);
+      }
+
+      // Store successful results
+      const fileHashes = new Map();
+      for (const result of results) {
+        if (result.success) {
+          this.cache.addToStore({
+            file: result.file,
+            startLine: result.startLine,
+            endLine: result.endLine,
+            content: result.content,
+            vector: result.vector
+          });
+          totalChunks++;
+        }
+        // Track hash for each file
+        const chunkInfo = allChunks.find(c => c.file === result.file);
+        if (chunkInfo) {
+          fileHashes.set(result.file, chunkInfo.hash);
+        }
+      }
+
+      // Update file hashes
+      for (const [file, hash] of fileHashes) {
+        this.cache.setFileHash(file, hash);
+      }
+
+      processedFiles += batch.length;
+
+      // Progress indicator every batch
+      if (processedFiles % (adaptiveBatchSize * 2) === 0 || processedFiles === filesToProcess.length) {
+        const elapsed = ((Date.now() - totalStartTime) / 1000).toFixed(1);
+        const rate = (processedFiles / parseFloat(elapsed)).toFixed(0);
+        console.error(`[Indexer] Progress: ${processedFiles}/${filesToProcess.length} files (${rate} files/sec)`);
+
+        // Send MCP progress notification (10-95% range for batch processing)
+        const progressPercent = Math.floor(10 + (processedFiles / filesToProcess.length) * 85);
+        this.sendProgress(progressPercent, 100, `Indexed ${processedFiles}/${filesToProcess.length} files (${rate}/sec)`);
       }
     }
 
-
+    // Cleanup workers
+    if (useWorkers) {
+      this.terminateWorkers();
+    }
+
+    const totalTime = ((Date.now() - totalStartTime) / 1000).toFixed(1);
+    console.error(`[Indexer] Complete: ${totalChunks} chunks from ${filesToProcess.length} files in ${totalTime}s`);
+
+    // Send completion progress
+    this.sendProgress(100, 100, `Complete: ${totalChunks} chunks from ${filesToProcess.length} files in ${totalTime}s`);
+
     await this.cache.save();
   }
 
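The pattern-extraction step above reduces glob patterns to bare directory names so fdir can prune whole subtrees instead of matching every path. A quick worked example of the second regex (pattern values are illustrative):

```js
// "**/node_modules/**", "**/dist/**", "**/.next/**" all match the extraction
// regex, so fdir skips those directories entirely during the crawl.
const patterns = ["**/node_modules/**", "**/dist/**", "**/.next/**"];
const excludeDirs = new Set();
for (const pattern of patterns) {
  const match = pattern.match(/\*\*\/([^/*]+)\/\*\*$/);
  if (match) excludeDirs.add(match[1]);
}
console.log([...excludeDirs]); // ["node_modules", "dist", ".next"]
```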
package/how-its-works.png
ADDED
Binary file
package/index.js
CHANGED

@@ -95,7 +95,7 @@ async function initialize() {
   await cache.load();
 
   // Initialize features
-  indexer = new CodebaseIndexer(embedder, cache, config);
+  indexer = new CodebaseIndexer(embedder, cache, config, server);
   hybridSearch = new HybridSearch(embedder, cache, config);
   const cacheClearer = new ClearCacheFeature.CacheClearer(embedder, cache, config);
 
package/lib/config.js
CHANGED

@@ -1,5 +1,6 @@
 import fs from "fs/promises";
 import path from "path";
+import { fileURLToPath } from "url";
 import { ProjectDetector } from "./project-detector.js";
 
 const DEFAULT_CONFIG = {

@@ -50,8 +51,8 @@ const DEFAULT_CONFIG = {
     "**/target/**",
     "**/vendor/**"
   ],
-  chunkSize: …
-  chunkOverlap: …
+  chunkSize: 25, // Lines per chunk (larger = fewer embeddings = faster indexing)
+  chunkOverlap: 5, // Overlap between chunks for context continuity
   batchSize: 100,
   maxFileSize: 1048576, // 1MB - skip files larger than this
   maxResults: 5,

@@ -59,6 +60,7 @@ const DEFAULT_CONFIG = {
   cacheDirectory: "./.smart-coding-cache",
   watchFiles: false,
   verbose: false,
+  workerThreads: "auto", // "auto" = CPU cores - 1, or set a number
   embeddingModel: "Xenova/all-MiniLM-L6-v2",
   semanticWeight: 0.7,
   exactMatchBoost: 1.5,

@@ -80,7 +82,7 @@ export async function loadConfig(workspaceDir = null) {
     console.error(`[Config] Workspace mode: ${baseDir}`);
   } else {
     // Server mode: load config from server directory
-    const scriptDir = path.dirname(…
+    const scriptDir = path.dirname(fileURLToPath(import.meta.url));
     baseDir = path.resolve(scriptDir, '..');
     configPath = path.join(baseDir, "config.json");
   }

@@ -212,6 +214,28 @@ export async function loadConfig(workspaceDir = null) {
     }
   }
 
+  if (process.env.SMART_CODING_EMBEDDING_MODEL !== undefined) {
+    const value = process.env.SMART_CODING_EMBEDDING_MODEL.trim();
+    if (value.length > 0) {
+      config.embeddingModel = value;
+      console.error(`[Config] Using custom embedding model: ${value}`);
+    }
+  }
+
+  if (process.env.SMART_CODING_WORKER_THREADS !== undefined) {
+    const value = process.env.SMART_CODING_WORKER_THREADS.trim().toLowerCase();
+    if (value === 'auto') {
+      config.workerThreads = 'auto';
+    } else {
+      const numValue = parseInt(value, 10);
+      if (!isNaN(numValue) && numValue >= 1 && numValue <= 32) {
+        config.workerThreads = numValue;
+      } else {
+        console.error(`[Config] Invalid SMART_CODING_WORKER_THREADS: ${value}, using default (must be 'auto' or 1-32)`);
+      }
+    }
+  }
+
   return config;
 }
 
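A standalone sketch of the clamping rule introduced above (not the module's own function, just the same logic isolated for illustration):

```js
// Mirrors the SMART_CODING_WORKER_THREADS parsing: "auto" passes through,
// integers are accepted only in the 1-32 range, everything else falls back.
function parseWorkerThreads(raw) {
  const value = raw.trim().toLowerCase();
  if (value === "auto") return "auto";
  const n = parseInt(value, 10);
  return (!isNaN(n) && n >= 1 && n <= 32) ? n : "auto";
}

console.log(parseWorkerThreads("8"));    // 8
console.log(parseWorkerThreads("64"));   // "auto" (outside the 1-32 range)
console.log(parseWorkerThreads("AUTO")); // "auto"
```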
package/lib/embedding-worker.js
ADDED

@@ -0,0 +1,67 @@
+import { parentPort, workerData } from "worker_threads";
+import { pipeline } from "@xenova/transformers";
+
+let embedder = null;
+
+// Initialize the embedding model once when worker starts
+async function initializeEmbedder() {
+  if (!embedder) {
+    embedder = await pipeline("feature-extraction", workerData.embeddingModel);
+  }
+  return embedder;
+}
+
+/**
+ * Process chunks with optimized single-text embedding
+ * Note: Batch processing with transformers.js WASM backend doesn't improve speed
+ * because it loops internally. Single calls are actually faster.
+ */
+async function processChunks(chunks) {
+  const embedder = await initializeEmbedder();
+  const results = [];
+
+  for (const chunk of chunks) {
+    try {
+      const output = await embedder(chunk.text, { pooling: "mean", normalize: true });
+      results.push({
+        file: chunk.file,
+        startLine: chunk.startLine,
+        endLine: chunk.endLine,
+        content: chunk.text,
+        vector: Array.from(output.data),
+        success: true
+      });
+    } catch (error) {
+      results.push({
+        file: chunk.file,
+        startLine: chunk.startLine,
+        endLine: chunk.endLine,
+        error: error.message,
+        success: false
+      });
+    }
+  }
+
+  return results;
+}
+
+// Listen for messages from main thread
+parentPort.on("message", async (message) => {
+  if (message.type === "process") {
+    try {
+      const results = await processChunks(message.chunks);
+      parentPort.postMessage({ type: "results", results, batchId: message.batchId });
+    } catch (error) {
+      parentPort.postMessage({ type: "error", error: error.message, batchId: message.batchId });
+    }
+  } else if (message.type === "shutdown") {
+    process.exit(0);
+  }
+});
+
+// Signal that worker is ready
+initializeEmbedder().then(() => {
+  parentPort.postMessage({ type: "ready" });
+}).catch((error) => {
+  parentPort.postMessage({ type: "error", error: error.message });
+});
package/lib/tokenizer.js
ADDED

@@ -0,0 +1,142 @@
+/**
+ * Token estimation and limits for embedding models
+ *
+ * This module provides token counting utilities and model-specific limits
+ * to ensure text chunks don't exceed the model's maximum sequence length.
+ */
+
+/**
+ * Token limits for supported embedding models
+ * Each model has its own maximum sequence length
+ */
+export const MODEL_TOKEN_LIMITS = {
+  // Sentence Transformers / MiniLM family
+  "Xenova/all-MiniLM-L6-v2": 256,
+  "Xenova/all-MiniLM-L12-v2": 256,
+  "Xenova/paraphrase-MiniLM-L6-v2": 128,
+  "Xenova/paraphrase-MiniLM-L3-v2": 128,
+
+  // MPNet models
+  "Xenova/all-mpnet-base-v2": 384,
+  "Xenova/paraphrase-mpnet-base-v2": 384,
+
+  // Multilingual models
+  "Xenova/paraphrase-multilingual-MiniLM-L12-v2": 128,
+  "Xenova/paraphrase-multilingual-mpnet-base-v2": 256,
+
+  // Code-specific models
+  "Xenova/codebert-base": 512,
+  "Xenova/graphcodebert-base": 512,
+
+  // E5 models
+  "Xenova/e5-small-v2": 512,
+  "Xenova/e5-base-v2": 512,
+  "Xenova/e5-large-v2": 512,
+
+  // BGE models
+  "Xenova/bge-small-en-v1.5": 512,
+  "Xenova/bge-base-en-v1.5": 512,
+  "Xenova/bge-large-en-v1.5": 512,
+
+  // Default fallback
+  "default": 256
+};
+
+/**
+ * Get the maximum token limit for a given model
+ * Case-insensitive lookup for robustness
+ * @param {string} modelName - The model name (e.g., "Xenova/all-MiniLM-L6-v2")
+ * @returns {number} Maximum tokens supported by the model
+ */
+export function getModelTokenLimit(modelName) {
+  if (!modelName) return MODEL_TOKEN_LIMITS["default"];
+
+  // Direct match first (fastest)
+  if (MODEL_TOKEN_LIMITS[modelName] !== undefined) {
+    return MODEL_TOKEN_LIMITS[modelName];
+  }
+
+  // Case-insensitive search
+  const normalizedName = modelName.toLowerCase();
+  for (const [key, value] of Object.entries(MODEL_TOKEN_LIMITS)) {
+    if (key.toLowerCase() === normalizedName) {
+      return value;
+    }
+  }
+
+  return MODEL_TOKEN_LIMITS["default"];
+}
+
+/**
+ * Get chunking parameters for a model
+ * Returns target and overlap tokens based on the model's limit
+ * @param {string} modelName - The model name
+ * @returns {{ maxTokens: number, targetTokens: number, overlapTokens: number }}
+ */
+export function getChunkingParams(modelName) {
+  const maxTokens = getModelTokenLimit(modelName);
+
+  // Target: 85% of max to leave safety buffer
+  const targetTokens = Math.floor(maxTokens * 0.85);
+
+  // Overlap: 15-20% of target for context continuity
+  const overlapTokens = Math.floor(targetTokens * 0.18);
+
+  return {
+    maxTokens,
+    targetTokens,
+    overlapTokens
+  };
+}
+
+/**
+ * Estimate token count for text (conservative estimate for code)
+ * Uses a simple heuristic: counts words, special characters, and estimates subwords
+ *
+ * This is conservative - actual tokenizers may produce fewer tokens.
+ * For most accurate results, use the actual tokenizer, but this is much faster.
+ *
+ * @param {string} text - The text to estimate tokens for
+ * @returns {number} Estimated token count
+ */
+export function estimateTokens(text) {
+  if (!text || text.length === 0) return 0;
+
+  // Count words (split by whitespace)
+  const words = text.split(/\s+/).filter(w => w.length > 0);
+
+  // Count special characters/punctuation that often become separate tokens
+  const specialChars = (text.match(/[{}()\[\];:,.<>!=+\-*\/%&|^~@#$"'`\\]/g) || []).length;
+
+  // Estimate: words + special chars + 2 (for [CLS] and [SEP] special tokens)
+  // For long words, add extra tokens due to subword tokenization
+  let tokenCount = 2; // [CLS] and [SEP]
+
+  for (const word of words) {
+    if (word.length <= 4) {
+      tokenCount += 1;
+    } else if (word.length <= 10) {
+      tokenCount += 2;
+    } else {
+      // Long words get split into ~4-char subwords
+      tokenCount += Math.ceil(word.length / 4);
+    }
+  }
+
+  // Many special chars merge with adjacent tokens, so count ~50%
+  tokenCount += Math.floor(specialChars * 0.5);
+
+  return tokenCount;
+}
+
+/**
+ * Check if text exceeds the token limit for a model
+ * @param {string} text - The text to check
+ * @param {string} modelName - The model name
+ * @returns {boolean} True if the text exceeds the limit
+ */
+export function exceedsTokenLimit(text, modelName) {
+  const limit = getModelTokenLimit(modelName);
+  const tokens = estimateTokens(text);
+  return tokens > limit;
+}
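For the default model these helpers resolve to concrete numbers; a small sanity check that follows directly from the code above:

```js
import { getChunkingParams, estimateTokens } from "./lib/tokenizer.js";

// Xenova/all-MiniLM-L6-v2 has a 256-token limit:
// targetTokens = floor(256 * 0.85) = 217, overlapTokens = floor(217 * 0.18) = 39
console.log(getChunkingParams("Xenova/all-MiniLM-L6-v2"));
// { maxTokens: 256, targetTokens: 217, overlapTokens: 39 }

// Heuristic count: 2 ([CLS]+[SEP]) + 2 ("const") + 1 ("x") + 1 ("=")
// + 1 ("1;") + floor(2 specials * 0.5) = 8
console.log(estimateTokens("const x = 1;")); // 8
```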
package/lib/utils.js
CHANGED

@@ -1,5 +1,9 @@
 import crypto from "crypto";
 import path from "path";
+import { estimateTokens, getChunkingParams, getModelTokenLimit } from "./tokenizer.js";
+
+// Re-export tokenizer utilities
+export { estimateTokens, getChunkingParams, getModelTokenLimit, MODEL_TOKEN_LIMITS } from "./tokenizer.js";
 
 /**
  * Calculate cosine similarity between two vectors

@@ -22,13 +26,22 @@ export function hashContent(content) {
 }
 
 /**
- * Intelligent chunking …
+ * Intelligent chunking with token limit awareness
+ * Tries to split by function/class boundaries while respecting token limits
+ *
+ * @param {string} content - File content to chunk
+ * @param {string} file - File path (for language detection)
+ * @param {object} config - Configuration object with embeddingModel
+ * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
  */
 export function smartChunk(content, file, config) {
   const lines = content.split("\n");
   const chunks = [];
   const ext = path.extname(file);
 
+  // Get model-specific chunking parameters
+  const { targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
+
   // Language-specific patterns for function/class detection
   const patterns = {
     // JavaScript/TypeScript

@@ -42,6 +55,7 @@ export function smartChunk(content, file, config) {
     // Python
     py: /^(class|def|async\s+def)\s+\w+/,
     pyw: /^(class|def|async\s+def)\s+\w+/,
+    pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
 
     // Java/Kotlin/Scala
     java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,

@@ -56,70 +70,144 @@ export function smartChunk(content, file, config) {
     cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
     h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
     hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
+    hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
 
     // C#
     cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
+    csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
 
     // Go
     go: /^(func|type|const|var)\s+\w+/,
 
     // Rust
-    rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static)\s+\w+/,
+    rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,
 
     // PHP
     php: /^(class|interface|trait|function|const)\s+\w+/,
+    phtml: /^(<\?php|class|interface|trait|function)\s*/,
 
     // Ruby
     rb: /^(class|module|def)\s+\w+/,
-    rake: /^(class|module|def|task)\s+\w+/,
+    rake: /^(class|module|def|task|namespace)\s+\w+/,
 
     // Swift
-    swift: /^(class|struct|enum|protocol|func|var|let)\s+\w+/,
+    swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,
 
     // R
-    r: /^(\w+)\s…
-    R: /^(\w+)\s…
+    r: /^(\w+)\s*(<-|=)\s*function/,
+    R: /^(\w+)\s*(<-|=)\s*function/,
 
     // Lua
     lua: /^(function|local\s+function)\s+\w+/,
+
+    // Shell scripts
+    sh: /^(\w+\s*\(\)|function\s+\w+)/,
+    bash: /^(\w+\s*\(\)|function\s+\w+)/,
+    zsh: /^(\w+\s*\(\)|function\s+\w+)/,
+    fish: /^function\s+\w+/,
+
+    // CSS/Styles
+    css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
+    scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
+    sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
+    less: /^(@\w+:|\.|\#|@media)\s*/,
+    styl: /^(\$\w+\s*=|\w+\(|\.|\#)\s*/,
+
+    // Markup/HTML
+    html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
+    htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
+    xml: /^(<\w+|\s*<!\[CDATA\[)/,
+    svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
+
+    // Config files
+    json: /^(\s*"[\w-]+"\s*:\s*[\[{])/,
+    yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
+    yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
+    toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
+    ini: /^(\[\w+\]|\w+\s*=)/,
+    env: /^[A-Z_][A-Z0-9_]*=/,
+
+    // Documentation
+    md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
+    mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
+    txt: /^.{50,}/, // Split on long paragraphs
+    rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
+
+    // Database
+    sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
+
+    // Perl
+    pl: /^(sub|package|use|require)\s+\w+/,
+    pm: /^(sub|package|use|require)\s+\w+/,
+
+    // Vim
+    vim: /^(function|command|autocmd|let\s+g:)\s*/,
   };
 
-
   const langPattern = patterns[ext.slice(1)] || patterns.js;
   let currentChunk = [];
   let chunkStartLine = 0;
+  let currentTokenCount = 0;
 
   for (let i = 0; i < lines.length; i++) {
     const line = lines[i];
-
-    // Check if …
-    const …
+    const lineTokens = estimateTokens(line);
+
+    // Check if adding this line would exceed token limit
+    const wouldExceedLimit = (currentTokenCount + lineTokens) > targetTokens;
+
+    // Check if this is a good split point (function/class boundary)
+    const isGoodSplitPoint =
       langPattern.test(line.trim()) &&
-      currentChunk.length > …
+      currentChunk.length > 3; // At least a few lines before splitting
+
+    // Split if we exceed limit OR at a good split point when near limit
+    const shouldSplit = wouldExceedLimit || (isGoodSplitPoint && currentTokenCount > targetTokens * 0.6);
 
-    if (shouldSplit …
-
+    if (shouldSplit && currentChunk.length > 0) {
+      const chunkText = currentChunk.join("\n");
+      if (chunkText.trim().length > 20) {
         chunks.push({
-          text: …
+          text: chunkText,
           startLine: chunkStartLine + 1,
-          endLine: i
+          endLine: i,
+          tokenCount: currentTokenCount
         });
       }
 
-      // …
-
+      // Calculate overlap: keep last N lines that fit within overlapTokens
+      let overlapLines = [];
+      let overlapTokensCount = 0;
+      for (let j = currentChunk.length - 1; j >= 0 && overlapTokensCount < overlapTokens; j--) {
+        const lineT = estimateTokens(currentChunk[j]);
+        if (overlapTokensCount + lineT <= overlapTokens) {
+          overlapLines.unshift(currentChunk[j]);
+          overlapTokensCount += lineT;
+        } else {
+          break;
+        }
+      }
+
+      currentChunk = overlapLines;
+      currentTokenCount = overlapTokensCount;
+      chunkStartLine = i - overlapLines.length;
     }
+
+    currentChunk.push(line);
+    currentTokenCount += lineTokens;
   }
 
   // Add remaining chunk
-  if (currentChunk.length > 0 …
-
+  if (currentChunk.length > 0) {
+    const chunkText = currentChunk.join("\n");
+    if (chunkText.trim().length > 20) {
+      chunks.push({
+        text: chunkText,
+        startLine: chunkStartLine + 1,
+        endLine: lines.length,
+        tokenCount: currentTokenCount
+      });
+    }
   }
 
   return chunks;
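Putting the chunker and tokenizer together, here is a minimal sketch of calling the updated `smartChunk` (the source snippet and file name are invented for illustration; only `embeddingModel` is read from the config here):

```js
import { smartChunk } from "./lib/utils.js";

const source = [
  "export function add(a, b) {",
  "  return a + b;",
  "}",
  "",
  "export function sub(a, b) {",
  "  return a - b;",
  "}"
].join("\n");

// Token-aware chunking derives its limits from the configured model
const chunks = smartChunk(source, "math.js", { embeddingModel: "Xenova/all-MiniLM-L6-v2" });
for (const c of chunks) {
  console.log(`lines ${c.startLine}-${c.endLine}, ~${c.tokenCount} tokens`);
}
```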
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "smart-coding-mcp",
-  "version": "1.2.4",
+  "version": "1.3.0",
   "description": "An extensible MCP server that enhances coding productivity with AI-powered features including semantic code search, intelligent indexing, and more, using local LLMs",
   "type": "module",
   "main": "index.js",

@@ -45,8 +45,9 @@
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.0.4",
     "@xenova/transformers": "^2.17.2",
-    "…
-    "…
+    "chokidar": "^3.5.3",
+    "fdir": "^6.5.0",
+    "glob": "^10.3.10"
   },
   "engines": {
     "node": ">=18.0.0"