@softerist/heuristic-mcp 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +9 -4
- package/CONTRIBUTING.md +6 -6
- package/README.md +37 -18
- package/config.json +12 -2
- package/features/ann-config.js +120 -0
- package/features/find-similar-code.js +40 -2
- package/features/hybrid-search.js +69 -5
- package/features/index-codebase.js +28 -4
- package/index.js +9 -1
- package/lib/cache.js +396 -10
- package/lib/call-graph.js +281 -0
- package/lib/config.js +123 -16
- package/lib/project-detector.js +49 -36
- package/package.json +5 -8
- package/test/ann-fallback.test.js +68 -0
- package/test/call-graph.test.js +142 -0
- package/test/clear-cache.test.js +3 -6
- package/test/helpers.js +64 -7
- package/test/hybrid-search.test.js +2 -2
- package/test/index-codebase.test.js +3 -10
- package/test/integration.test.js +3 -3
package/ARCHITECTURE.md
CHANGED

@@ -1,11 +1,11 @@
-#
+# Architecture Overview
 
-This document outlines the modular architecture of
+This document outlines the modular architecture of Heuristic MCP.
 
 ## Directory Structure
 
 ```
-
+heuristic-mcp/
 ├── index.js # Main entry point, MCP server setup
 ├── package.json # Package configuration
 ├── config.json # User configuration

@@ -49,6 +49,7 @@ smart-coding-mcp/
 - Manages persistence of embedding vectors
 - File hash tracking for change detection
 - Load/save operations for disk cache
+- Optional ANN (HNSW) index build/load/save for fast search
 
 ### lib/utils.js
 

@@ -173,6 +174,8 @@ smartChunk() - split into chunks
 embedder - generate vectors
   ↓
 EmbeddingsCache - store in memory + disk
+  ↓
+ANN index (optional) - build/load from cache
 ```
 
 ### Search Flow
 
@@ -182,7 +185,9 @@ User query
   ↓
 embedder - query to vector
   ↓
-
+ANN candidate search (optional)
+  ↓
+cosineSimilarity() - score candidates
   ↓
 exact match boost - adjust scores
   ↓
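The new Search Flow in ARCHITECTURE.md inserts an optional ANN candidate stage ahead of exact scoring. Below is a minimal standalone sketch of that two-stage flow, assuming an hnswlib-node-style `searchKnn` API and already-normalized vectors; the names `annIndex` and `searchFlow` are illustrative only, and the package's real implementation lives in `lib/cache.js` and the feature modules shown further down.

```js
// Minimal sketch of the updated search flow (illustrative only).
// `annIndex.searchKnn` mirrors an hnswlib-node-style API; it is not the package's API.

function cosineSimilarity(a, b) {
  // Vectors are assumed L2-normalized, so the dot product equals cosine similarity.
  let sum = 0;
  for (let i = 0; i < a.length; i++) sum += a[i] * b[i];
  return sum;
}

function searchFlow(queryVector, vectorStore, annIndex, maxResults) {
  // Optional ANN stage: narrow the candidate set before exact scoring.
  let candidates = vectorStore;
  if (annIndex) {
    const { neighbors } = annIndex.searchKnn(queryVector, maxResults * 20);
    candidates = neighbors.map((label) => vectorStore[label]);
  }

  // Exact stage: cosine similarity over the (possibly reduced) candidate set.
  return candidates
    .map((chunk) => ({ ...chunk, score: cosineSimilarity(queryVector, chunk.vector) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, maxResults);
}
```

The design point the diagram captures is that the ANN stage only narrows the candidate pool; final ranking still uses exact similarity over those candidates.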
package/CONTRIBUTING.md
CHANGED

@@ -1,4 +1,4 @@
-# Contributing to
+# Contributing to Heuristic MCP
 
 Thank you for your interest in contributing! This document provides guidelines for contributing to the project.
 

@@ -14,8 +14,8 @@ Thank you for your interest in contributing! This document provides guidelines f
 
 ```bash
 # Fork and clone the repository
-git clone https://github.com/
-cd
+git clone https://github.com/softerist/heuristic-mcp.git
+cd heuristic-mcp
 
 # Install dependencies
 npm install

@@ -233,7 +233,7 @@ git push origin feature/your-feature-name
 ```bash
 # Test with a sample project
 cd /path/to/test/project
-node /path/to/
+node /path/to/heuristic-mcp/index.js
 
 # In another terminal, send MCP requests
 ```

@@ -297,7 +297,7 @@ Looking for ideas? Consider implementing:
 
 - **Issues**: Use GitHub Issues for bugs and feature requests
 - **Discussions**: Use GitHub Discussions for questions
-- **Email**: Contact
+- **Email**: Contact Softerist via website
 
 ## License
 

@@ -305,4 +305,4 @@ By contributing, you agree that your contributions will be licensed under the MI
 
 ---
 
-Thank you for contributing to
+Thank you for contributing to Heuristic MCP!
package/README.md
CHANGED

@@ -1,7 +1,7 @@
-#
+# Heuristic MCP
 
-[](https://www.npmjs.com/package/@softerist/heuristic-mcp)
+[](https://www.npmjs.com/package/@softerist/heuristic-mcp)
 [](https://opensource.org/licenses/MIT)
 [](https://nodejs.org/)
 
@@ -28,6 +28,7 @@ This MCP server solves that by indexing your codebase with AI embeddings. Your A
 - Pre-indexed embeddings are faster than scanning files at runtime
 - Smart project detection skips dependencies automatically (node_modules, vendor, etc.)
 - Incremental updates - only re-processes changed files
+- Optional ANN search (HNSW) for faster queries on large codebases
 
 **Privacy**
 
@@ -40,13 +41,13 @@ This MCP server solves that by indexing your codebase with AI embeddings. Your A
 Install globally via npm:
 
 ```bash
-npm install -g
+npm install -g @softerist/heuristic-mcp
 ```
 
 To update to the latest version:
 
 ```bash
-npm update -g
+npm update -g @softerist/heuristic-mcp
 ```
 
 ## Configuration
 
@@ -68,8 +69,8 @@ Add the server configuration to the `mcpServers` object in your config file:
 ```json
 {
   "mcpServers": {
-    "
-      "command": "
+    "heuristic-mcp": {
+      "command": "heuristic-mcp",
       "args": ["--workspace", "/absolute/path/to/your/project"]
     }
   }

@@ -81,12 +82,12 @@ Add the server configuration to the `mcpServers` object in your config file:
 ```json
 {
   "mcpServers": {
-    "
-      "command": "
+    "heuristic-mcp-project-a": {
+      "command": "heuristic-mcp",
       "args": ["--workspace", "/path/to/project-a"]
     },
-    "
-      "command": "
+    "heuristic-mcp-project-b": {
+      "command": "heuristic-mcp",
       "args": ["--workspace", "/path/to/project-b"]
     }
   }

@@ -108,21 +109,25 @@ Override configuration settings via environment variables in your MCP config:
 | `SMART_CODING_WATCH_FILES` | boolean | `false` | Enable file watching for auto-reindex |
 | `SMART_CODING_SEMANTIC_WEIGHT` | number | `0.7` | Weight for semantic similarity (0-1) |
 | `SMART_CODING_EXACT_MATCH_BOOST` | number | `1.5` | Boost for exact text matches |
+| `SMART_CODING_RECENCY_BOOST` | number | `0.1` | Boost for recently modified files |
+| `SMART_CODING_RECENCY_DECAY_DAYS` | number | `30` | Days until recency boost fades to 0 |
 | `SMART_CODING_EMBEDDING_MODEL` | string | `Xenova/all-MiniLM-L6-v2` | AI embedding model to use |
 | `SMART_CODING_WORKER_THREADS` | string | `auto` | Worker threads (`auto` or 1-32) |
+| `SMART_CODING_ANN_ENABLED` | boolean | `true` | Enable ANN search (HNSW) |
+
+**ANN note**: HNSW support uses optional `hnswlib-node`. If it isn't installed, the server falls back to exact (linear) search automatically.
 
 **Example with environment variables:**
 
 ```json
 {
   "mcpServers": {
-    "
-      "command": "
+    "heuristic-mcp": {
+      "command": "heuristic-mcp",
       "args": ["--workspace", "/path/to/project"],
       "env": {
         "SMART_CODING_VERBOSE": "true",
-        "
-        "SMART_CODING_MAX_FILE_SIZE": "2097152"
+        "SMART_CODING_RECENCY_BOOST": "0.2"
       }
     }
   }

@@ -140,6 +145,13 @@ Query: "Where do we validate user input?"
 Returns: Relevant validation code with file paths and line numbers
 ```
 
+**find_similar_code** - Find duplicates or patterns
+
+```
+Input: A snippet of code or a file path
+Returns: Other code in the project that looks or functions similarly
+```
+
 **index_codebase** - Manually trigger reindexing
 
 ```

@@ -156,12 +168,15 @@ Useful when cache becomes corrupted or outdated
 
 The server indexes your code in four steps:
 
-1. **Discovery**: Scans your project for source files
+1. **Discovery**: Scans your project for source files (smartly ignoring build/vendor folders)
 2. **Chunking**: Breaks code into meaningful pieces (respecting function boundaries)
 3. **Embedding**: Converts each chunk to a vector using a local AI model
 4. **Storage**: Saves embeddings to `.smart-coding-cache/` for fast startup
 
-When you search, your query is converted to the same vector format
+When you search, your query is converted to the same vector format. We use a **hybrid ranking algorithm** that combines:
+- **Semantic Similarity** (cosine similarity of vectors)
+- **Exact Keyword Matching** (BM25-inspired boost)
+- **Recency Boosting** (favoring files you're actively working on)
 
 
 
@@ -224,11 +239,15 @@ This project builds on research from Cursor showing that semantic search improve
 
 See: https://cursor.com/blog/semsearch
 
+## Acknowledgements
+
+This project is a fork of [smart-coding-mcp](https://github.com/omar-haris/smart-coding-mcp) by [Omar Haris](https://www.linkedin.com/in/omarharis/). We thank him for the original implementation.
+
 ## License
 
 MIT License
 
-Copyright (c) 2025
+Copyright (c) 2025 Softerist
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
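The README's hybrid ranking description maps onto the tunables in the table above (`SMART_CODING_SEMANTIC_WEIGHT`, `SMART_CODING_EXACT_MATCH_BOOST`, `SMART_CODING_RECENCY_BOOST`, `SMART_CODING_RECENCY_DECAY_DAYS`). The sketch below shows one way such a combined score could be computed; it is a simplified illustration, not the package's exact formula (which is in `features/hybrid-search.js`), and the field names `recencyDecayDays` and `chunk.mtimeMs` are assumptions.

```js
// Illustrative hybrid score combining semantic, exact-match, and recency terms.
const dot = (a, b) => a.reduce((sum, v, i) => sum + v * b[i], 0);

function hybridScore(chunk, query, queryVector, config, now = Date.now()) {
  // Semantic term: dot product of normalized vectors, weighted by semanticWeight.
  let score = dot(queryVector, chunk.vector) * config.semanticWeight;

  // Exact keyword term: boost chunks that literally contain the query text.
  if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
    score *= config.exactMatchBoost;
  }

  // Recency term: a boost that fades to zero over recencyDecayDays (assumed names).
  const ageDays = (now - chunk.mtimeMs) / 86_400_000;
  score += config.recencyBoost * Math.max(0, 1 - ageDays / config.recencyDecayDays);

  return score;
}
```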
package/config.json
CHANGED

@@ -62,5 +62,15 @@
   "embeddingModel": "Xenova/all-MiniLM-L6-v2",
   "semanticWeight": 0.7,
   "exactMatchBoost": 1.5,
-  "workerThreads": "auto"
-}
+  "workerThreads": "auto",
+  "annEnabled": true,
+  "annMinChunks": 5000,
+  "annMinCandidates": 50,
+  "annMaxCandidates": 200,
+  "annCandidateMultiplier": 20,
+  "annEfConstruction": 200,
+  "annEfSearch": 64,
+  "annM": 16,
+  "annIndexCache": true,
+  "annMetric": "cosine"
+}
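The new `ann*` keys bound how many candidates the ANN index returns before exact re-scoring. The clamp below mirrors the `getAnnCandidateCount` helper added to `features/find-similar-code.js` and `features/hybrid-search.js` later in this diff, with the defaults above plugged in; the worked numbers are only examples.

```js
// Candidate-count clamp, matching getAnnCandidateCount in the feature modules below.
function annCandidateCount(maxResults, totalChunks, cfg) {
  const desired = Math.max(cfg.annMinCandidates, Math.ceil(maxResults * cfg.annCandidateMultiplier));
  const capped = Math.min(cfg.annMaxCandidates, desired);
  return Math.min(totalChunks, Math.max(maxResults, capped));
}

const cfg = { annMinCandidates: 50, annMaxCandidates: 200, annCandidateMultiplier: 20 };
annCandidateCount(5, 10_000, cfg);  // 100 -> 5 * 20, already within [50, 200]
annCandidateCount(1, 10_000, cfg);  // 50  -> raised to annMinCandidates
annCandidateCount(20, 10_000, cfg); // 200 -> capped at annMaxCandidates
```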
package/features/ann-config.js
ADDED

@@ -0,0 +1,120 @@
+/**
+ * ANN Config Tool - Runtime tuning of ANN search parameters
+ *
+ * Allows adjusting efSearch on the fly for speed/accuracy tradeoff,
+ * and querying current ANN index statistics.
+ */
+
+export class AnnConfigTool {
+  constructor(cache, config) {
+    this.cache = cache;
+    this.config = config;
+  }
+
+  /**
+   * Adjust efSearch and optionally trigger index rebuild
+   */
+  async execute(args) {
+    const action = args.action || "stats";
+
+    if (action === "stats") {
+      return this.cache.getAnnStats();
+    }
+
+    if (action === "set_ef_search") {
+      const efSearch = args.efSearch;
+      if (efSearch === undefined) {
+        return { success: false, error: "efSearch parameter is required for set_ef_search action" };
+      }
+      return this.cache.setEfSearch(efSearch);
+    }
+
+    if (action === "rebuild") {
+      // Force invalidate and rebuild the ANN index
+      this.cache.invalidateAnnIndex();
+      const index = await this.cache.ensureAnnIndex();
+      return {
+        success: index !== null,
+        message: index ? "ANN index rebuilt successfully" : "ANN index rebuild failed or not available"
+      };
+    }
+
+    return { success: false, error: `Unknown action: ${action}. Valid actions: stats, set_ef_search, rebuild` };
+  }
+
+  formatResults(result) {
+    if (result.success === false) {
+      return `Error: ${result.error}`;
+    }
+
+    if (result.enabled !== undefined) {
+      // Stats response
+      let output = "## ANN Index Statistics\n\n";
+      output += `- **Enabled**: ${result.enabled}\n`;
+      output += `- **Index Loaded**: ${result.indexLoaded}\n`;
+      output += `- **Dirty (needs rebuild)**: ${result.dirty}\n`;
+      output += `- **Vector Count**: ${result.vectorCount}\n`;
+      output += `- **Min Chunks for ANN**: ${result.minChunksForAnn}\n`;
+
+      if (result.config) {
+        output += "\n### Current Config\n\n";
+        output += `- **Metric**: ${result.config.metric}\n`;
+        output += `- **Dimensions**: ${result.config.dim}\n`;
+        output += `- **Indexed Vectors**: ${result.config.count}\n`;
+        output += `- **M (connectivity)**: ${result.config.m}\n`;
+        output += `- **efConstruction**: ${result.config.efConstruction}\n`;
+        output += `- **efSearch**: ${result.config.efSearch}\n`;
+      } else {
+        output += "\n*No active ANN index.*\n";
+      }
+
+      return output;
+    }
+
+    // Other responses (set_ef_search, rebuild)
+    return JSON.stringify(result, null, 2);
+  }
+}
+
+// MCP Tool definition
+export function getToolDefinition() {
+  return {
+    name: "d_ann_config",
+    description: "Configure and monitor the ANN (Approximate Nearest Neighbor) search index. Actions: 'stats' (view current config), 'set_ef_search' (tune search accuracy/speed), 'rebuild' (force index rebuild).",
+    inputSchema: {
+      type: "object",
+      properties: {
+        action: {
+          type: "string",
+          enum: ["stats", "set_ef_search", "rebuild"],
+          description: "Action to perform. 'stats' shows current config, 'set_ef_search' changes the search parameter, 'rebuild' forces index rebuild.",
+          default: "stats"
+        },
+        efSearch: {
+          type: "number",
+          description: "New efSearch value (only for set_ef_search action). Higher = more accurate but slower. Typical range: 16-512.",
+          minimum: 1,
+          maximum: 1000
+        }
+      }
+    },
+    annotations: {
+      title: "ANN Index Configuration",
+      readOnlyHint: false,
+      destructiveHint: false,
+      idempotentHint: true,
+      openWorldHint: false
+    }
+  };
+}
+
+// Tool handler
+export async function handleToolCall(request, annConfigTool) {
+  const args = request.params.arguments || {};
+  const result = await annConfigTool.execute(args);
+  const formattedText = annConfigTool.formatResults(result);
+
+  return {
+    content: [{ type: "text", text: formattedText }]
+  };
+}
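For orientation, here is a small sketch of driving the new tool directly (for example from a test), assuming a stub cache that exposes the methods `AnnConfigTool` calls; the stub's stats values are made up, and only the object shape follows `formatResults` above.

```js
// Sketch: exercising AnnConfigTool with a stub cache (run from the package root).
import { AnnConfigTool } from "./features/ann-config.js";

const stubCache = {
  getAnnStats: () => ({
    enabled: true, indexLoaded: true, dirty: false, vectorCount: 12000, minChunksForAnn: 5000,
    config: { metric: "cosine", dim: 384, count: 12000, m: 16, efConstruction: 200, efSearch: 64 }
  }),
  setEfSearch: (efSearch) => ({ success: true, efSearch }),
  invalidateAnnIndex: () => {},
  ensureAnnIndex: async () => ({})
};

const tool = new AnnConfigTool(stubCache, {});
console.log(tool.formatResults(await tool.execute({ action: "stats" })));
console.log(tool.formatResults(await tool.execute({ action: "set_ef_search", efSearch: 128 })));
```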
package/features/find-similar-code.js
CHANGED

@@ -12,6 +12,15 @@ export class FindSimilarCode
     this.config = config;
   }
 
+  getAnnCandidateCount(maxResults, totalChunks) {
+    const minCandidates = this.config.annMinCandidates ?? 0;
+    const maxCandidates = this.config.annMaxCandidates ?? totalChunks;
+    const multiplier = this.config.annCandidateMultiplier ?? 1;
+    const desired = Math.max(minCandidates, Math.ceil(maxResults * multiplier));
+    const capped = Math.min(maxCandidates, desired);
+    return Math.min(totalChunks, Math.max(maxResults, capped));
+  }
+
   async execute({ code, maxResults = 5, minSimilarity = 0.3 }) {
     const vectorStore = this.cache.getVectorStore();
 
@@ -25,18 +34,47 @@ export class FindSimilarCode {
     // Generate embedding for the input code
     const codeEmbed = await this.embedder(code, { pooling: "mean", normalize: true });
     const codeVector = Array.from(codeEmbed.data);
+    const codeVectorTyped = codeEmbed.data;
+
+    let candidates = vectorStore;
+    let usedAnn = false;
+    if (this.config.annEnabled) {
+      const candidateCount = this.getAnnCandidateCount(maxResults, vectorStore.length);
+      const annLabels = await this.cache.queryAnn(codeVectorTyped, candidateCount);
+      if (annLabels && annLabels.length >= maxResults) {
+        usedAnn = true;
+        const seen = new Set();
+        candidates = annLabels
+          .map((index) => {
+            if (seen.has(index)) return null;
+            seen.add(index);
+            return vectorStore[index];
+          })
+          .filter(Boolean);
+      }
+    }
 
     // Score all chunks by similarity
-
+    let scoredChunks = candidates.map(chunk => {
       const similarity = dotSimilarity(codeVector, chunk.vector);
       return { ...chunk, similarity };
     });
 
     // Filter by minimum similarity and sort
-
+    let filteredResults = scoredChunks
       .filter(chunk => chunk.similarity >= minSimilarity)
       .sort((a, b) => b.similarity - a.similarity);
 
+    if (usedAnn && filteredResults.length < maxResults) {
+      scoredChunks = vectorStore.map(chunk => {
+        const similarity = dotSimilarity(codeVector, chunk.vector);
+        return { ...chunk, similarity };
+      });
+      filteredResults = scoredChunks
+        .filter(chunk => chunk.similarity >= minSimilarity)
+        .sort((a, b) => b.similarity - a.similarity);
+    }
+
     // Deduplicate: if input code is from indexed file, skip exact matches
     const normalizedInput = code.trim().replace(/\s+/g, ' ');
     const results = filteredResults
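The pattern in this hunk, query the ANN index first and re-run an exact scan when the filtered ANN results come up short of `maxResults`, can be summarized as a small helper. This is a sketch of the idea rather than code from the package; `annQuery` and `scoreChunk` stand in for `cache.queryAnn` and the dot-product scoring above.

```js
// Generic "ANN first, exact fallback" helper mirroring the logic in this hunk (illustrative).
async function searchWithFallback({ vectorStore, annQuery, scoreChunk, maxResults, minScore }) {
  const rank = (chunks) =>
    chunks
      .map((c) => ({ ...c, score: scoreChunk(c) }))
      .filter((c) => c.score >= minScore)
      .sort((a, b) => b.score - a.score);

  // First pass: score only the ANN candidate subset, if the index returned enough labels.
  const labels = await annQuery();
  if (labels && labels.length >= maxResults) {
    const subset = [...new Set(labels)].map((i) => vectorStore[i]);
    const hits = rank(subset);
    if (hits.length >= maxResults) return hits.slice(0, maxResults);
  }

  // Second pass: exact scan over every chunk when ANN recall was insufficient.
  return rank(vectorStore).slice(0, maxResults);
}
```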
package/features/hybrid-search.js
CHANGED

@@ -1,6 +1,7 @@
 import path from "path";
 import fs from "fs/promises";
 import { dotSimilarity } from "../lib/utils.js";
+import { extractSymbolsFromContent } from "../lib/call-graph.js";
 
 export class HybridSearch {
   constructor(embedder, cache, config) {
@@ -10,6 +11,15 @@ export class HybridSearch {
     this.fileModTimes = new Map(); // Cache for file modification times
   }
 
+  getAnnCandidateCount(maxResults, totalChunks) {
+    const minCandidates = this.config.annMinCandidates ?? 0;
+    const maxCandidates = this.config.annMaxCandidates ?? totalChunks;
+    const multiplier = this.config.annCandidateMultiplier ?? 1;
+    const desired = Math.max(minCandidates, Math.ceil(maxResults * multiplier));
+    const capped = Math.min(maxCandidates, desired);
+    return Math.min(totalChunks, Math.max(maxResults, capped));
+  }
+
   async populateFileModTimes(files) {
     const uniqueFiles = new Set(files);
     const missing = [];
@@ -56,13 +66,37 @@ export class HybridSearch {
     // Generate query embedding
     const queryEmbed = await this.embedder(query, { pooling: "mean", normalize: true });
     const queryVector = Array.from(queryEmbed.data);
+    const queryVectorTyped = queryEmbed.data;
+
+    let candidates = vectorStore;
+    let usedAnn = false;
+    if (this.config.annEnabled) {
+      const candidateCount = this.getAnnCandidateCount(maxResults, vectorStore.length);
+      const annLabels = await this.cache.queryAnn(queryVectorTyped, candidateCount);
+      if (annLabels && annLabels.length >= maxResults) {
+        usedAnn = true;
+        const seen = new Set();
+        candidates = annLabels
+          .map((index) => {
+            if (seen.has(index)) return null;
+            seen.add(index);
+            return vectorStore[index];
+          })
+          .filter(Boolean);
+      }
+    }
+
+    if (usedAnn && candidates.length < maxResults) {
+      candidates = vectorStore;
+      usedAnn = false;
+    }
 
     if (this.config.recencyBoost > 0) {
-      await this.populateFileModTimes(
+      await this.populateFileModTimes(candidates.map(chunk => chunk.file));
     }
 
     // Score all chunks (synchronous map now, much faster)
-    const scoredChunks =
+    const scoredChunks = candidates.map(chunk => {
       // Semantic similarity (vectors are normalized)
       let score = dotSimilarity(queryVector, chunk.vector) * this.config.semanticWeight;
 
@@ -97,10 +131,40 @@ export class HybridSearch {
       return { ...chunk, score };
     });
 
+    // Sort by initial score
+    scoredChunks.sort((a, b) => b.score - a.score);
+
+    // Apply call graph proximity boost if enabled
+    if (this.config.callGraphEnabled && this.config.callGraphBoost > 0) {
+      // Extract symbols from top initial results
+      const topN = Math.min(5, scoredChunks.length);
+      const symbolsFromTop = new Set();
+      for (let i = 0; i < topN; i++) {
+        const symbols = extractSymbolsFromContent(scoredChunks[i].content);
+        for (const sym of symbols) {
+          symbolsFromTop.add(sym);
+        }
+      }
+
+      if (symbolsFromTop.size > 0) {
+        // Get related files from call graph
+        const relatedFiles = await this.cache.getRelatedFiles(Array.from(symbolsFromTop));
+
+        // Apply boost to chunks from related files
+        for (const chunk of scoredChunks) {
+          const proximity = relatedFiles.get(chunk.file);
+          if (proximity) {
+            chunk.score += proximity * this.config.callGraphBoost;
+          }
+        }
+
+        // Re-sort after applying call graph boost
+        scoredChunks.sort((a, b) => b.score - a.score);
+      }
+    }
+
     // Get top results
-    const results = scoredChunks
-      .sort((a, b) => b.score - a.score)
-      .slice(0, maxResults);
+    const results = scoredChunks.slice(0, maxResults);
 
     return { results, message: null };
   }
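`lib/call-graph.js` adds 281 lines in this release, but its body is not shown in this diff view, so the exact behavior of `extractSymbolsFromContent` is not visible here. As a rough, hypothetical illustration of the kind of symbol extraction such a call-graph proximity boost needs (not the package's implementation):

```js
// Hypothetical sketch of a symbol extractor; the real extractSymbolsFromContent
// lives in lib/call-graph.js, which this diff view does not show.
export function extractSymbolsSketch(content) {
  const symbols = new Set();
  // Declarations: `function foo(` and `class Foo`.
  const declPattern = /\b(?:function|class)\s+([A-Za-z_$][\w$]*)/g;
  // Call sites: `foo(...)`, excluding common keywords.
  const callPattern = /\b([A-Za-z_$][\w$]*)\s*\(/g;
  const keywords = new Set(["if", "for", "while", "switch", "catch", "return", "function"]);

  for (const [, name] of content.matchAll(declPattern)) symbols.add(name);
  for (const [, name] of content.matchAll(callPattern)) {
    if (!keywords.has(name)) symbols.add(name);
  }
  return symbols;
}
```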
package/features/index-codebase.js
CHANGED

@@ -6,6 +6,7 @@ import os from "os";
 import { Worker } from "worker_threads";
 import { fileURLToPath } from "url";
 import { smartChunk, hashContent } from "../lib/utils.js";
+import { extractCallData } from "../lib/call-graph.js";
 
 function escapeRegExp(value) {
   return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
@@ -585,18 +586,30 @@ export class CodebaseIndexer {
     // Step 5: Process files in adaptive batches
     for (let i = 0; i < filesToProcess.length; i += adaptiveBatchSize) {
       const batch = filesToProcess.slice(i, i + adaptiveBatchSize);
-
+
       // Generate all chunks for this batch
       const allChunks = [];
       const fileStats = new Map();
-
+
       for (const { file, content, hash } of batch) {
         // Remove old chunks for this file
         this.cache.removeFileFromStore(file);
-
+
+        // Extract call graph data if enabled
+        if (this.config.callGraphEnabled) {
+          try {
+            const callData = extractCallData(content, file);
+            this.cache.setFileCallData(file, callData);
+          } catch (err) {
+            if (this.config.verbose) {
+              console.error(`[Indexer] Call graph extraction failed for ${path.basename(file)}: ${err.message}`);
+            }
+          }
+        }
+
         const chunks = smartChunk(content, file, this.config);
         fileStats.set(file, { hash, totalChunks: 0, successChunks: 0 });
-
+
         for (const chunk of chunks) {
           allChunks.push({
             file,
@@ -673,6 +686,17 @@ export class CodebaseIndexer {
 
     await this.cache.save();
 
+    // Rebuild call graph in background
+    if (this.config.callGraphEnabled) {
+      this.cache.rebuildCallGraph();
+    }
+
+    void this.cache.ensureAnnIndex().catch((error) => {
+      if (this.config.verbose) {
+        console.error(`[ANN] Background ANN build failed: ${error.message}`);
+      }
+    });
+
     const vectorStore = this.cache.getVectorStore();
     return {
       skipped: false,
package/index.js
CHANGED

@@ -19,6 +19,7 @@ import * as IndexCodebaseFeature from "./features/index-codebase.js";
 import * as HybridSearchFeature from "./features/hybrid-search.js";
 import * as ClearCacheFeature from "./features/clear-cache.js";
 import * as FindSimilarCodeFeature from "./features/find-similar-code.js";
+import * as AnnConfigFeature from "./features/ann-config.js";
 
 // Parse workspace from command line arguments
 const args = process.argv.slice(2);
@@ -76,6 +77,11 @@ const features = [
     module: FindSimilarCodeFeature,
     instance: null,
     handler: FindSimilarCodeFeature.handleToolCall
+  },
+  {
+    module: AnnConfigFeature,
+    instance: null,
+    handler: AnnConfigFeature.handleToolCall
   }
 ];
 
@@ -105,12 +111,14 @@ async function initialize() {
   hybridSearch = new HybridSearch(embedder, cache, config);
   const cacheClearer = new ClearCacheFeature.CacheClearer(embedder, cache, config, indexer);
   const findSimilarCode = new FindSimilarCodeFeature.FindSimilarCode(embedder, cache, config);
+  const annConfig = new AnnConfigFeature.AnnConfigTool(cache, config);
 
   // Store feature instances (matches features array order)
   features[0].instance = hybridSearch;
   features[1].instance = indexer;
   features[2].instance = cacheClearer;
   features[3].instance = findSimilarCode;
+  features[4].instance = annConfig;
 
   // Attach hybridSearch to server for cross-feature access (e.g. cache invalidation)
   server.hybridSearch = hybridSearch;
@@ -177,7 +185,7 @@ async function main() {
   const transport = new StdioServerTransport();
   await server.connect(transport);
 
-  console.error("[Server]
+  console.error("[Server] Heuristic MCP server ready!");
 }
 
 // Graceful shutdown