mcp-vector-search 0.0.3__py3-none-any.whl → 0.4.12__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions.
Potentially problematic release.
This version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +3 -2
- mcp_vector_search/cli/commands/auto_index.py +397 -0
- mcp_vector_search/cli/commands/config.py +88 -40
- mcp_vector_search/cli/commands/index.py +198 -52
- mcp_vector_search/cli/commands/init.py +471 -58
- mcp_vector_search/cli/commands/install.py +284 -0
- mcp_vector_search/cli/commands/mcp.py +495 -0
- mcp_vector_search/cli/commands/search.py +241 -87
- mcp_vector_search/cli/commands/status.py +184 -58
- mcp_vector_search/cli/commands/watch.py +34 -35
- mcp_vector_search/cli/didyoumean.py +184 -0
- mcp_vector_search/cli/export.py +320 -0
- mcp_vector_search/cli/history.py +292 -0
- mcp_vector_search/cli/interactive.py +342 -0
- mcp_vector_search/cli/main.py +175 -27
- mcp_vector_search/cli/output.py +63 -45
- mcp_vector_search/config/defaults.py +50 -36
- mcp_vector_search/config/settings.py +49 -35
- mcp_vector_search/core/auto_indexer.py +298 -0
- mcp_vector_search/core/connection_pool.py +322 -0
- mcp_vector_search/core/database.py +335 -25
- mcp_vector_search/core/embeddings.py +73 -29
- mcp_vector_search/core/exceptions.py +19 -2
- mcp_vector_search/core/factory.py +310 -0
- mcp_vector_search/core/git_hooks.py +345 -0
- mcp_vector_search/core/indexer.py +237 -73
- mcp_vector_search/core/models.py +21 -19
- mcp_vector_search/core/project.py +73 -58
- mcp_vector_search/core/scheduler.py +330 -0
- mcp_vector_search/core/search.py +574 -86
- mcp_vector_search/core/watcher.py +48 -46
- mcp_vector_search/mcp/__init__.py +4 -0
- mcp_vector_search/mcp/__main__.py +25 -0
- mcp_vector_search/mcp/server.py +701 -0
- mcp_vector_search/parsers/base.py +30 -31
- mcp_vector_search/parsers/javascript.py +74 -48
- mcp_vector_search/parsers/python.py +57 -49
- mcp_vector_search/parsers/registry.py +47 -32
- mcp_vector_search/parsers/text.py +179 -0
- mcp_vector_search/utils/__init__.py +40 -0
- mcp_vector_search/utils/gitignore.py +229 -0
- mcp_vector_search/utils/timing.py +334 -0
- mcp_vector_search/utils/version.py +47 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/METADATA +173 -7
- mcp_vector_search-0.4.12.dist-info/RECORD +54 -0
- mcp_vector_search-0.0.3.dist-info/RECORD +0 -35
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/search.py
CHANGED
@@ -2,10 +2,11 @@
 
 import re
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 from loguru import logger
 
+from .auto_indexer import AutoIndexer, SearchTriggeredIndexer
 from .database import VectorDatabase
 from .exceptions import SearchError
 from .models import SearchResult
@@ -18,48 +19,70 @@ class SemanticSearchEngine:
         self,
         database: VectorDatabase,
         project_root: Path,
-        similarity_threshold: float = 0.
+        similarity_threshold: float = 0.3,
+        auto_indexer: AutoIndexer | None = None,
+        enable_auto_reindex: bool = True,
     ) -> None:
         """Initialize semantic search engine.
-
+
         Args:
             database: Vector database instance
             project_root: Project root directory
             similarity_threshold: Default similarity threshold
+            auto_indexer: Optional auto-indexer for semi-automatic reindexing
+            enable_auto_reindex: Whether to enable automatic reindexing
         """
         self.database = database
         self.project_root = project_root
         self.similarity_threshold = similarity_threshold
+        self.auto_indexer = auto_indexer
+        self.enable_auto_reindex = enable_auto_reindex
+
+        # Initialize search-triggered indexer if auto-indexer is provided
+        self.search_triggered_indexer = None
+        if auto_indexer and enable_auto_reindex:
+            self.search_triggered_indexer = SearchTriggeredIndexer(auto_indexer)
 
     async def search(
         self,
         query: str,
         limit: int = 10,
-        filters:
-        similarity_threshold:
+        filters: dict[str, Any] | None = None,
+        similarity_threshold: float | None = None,
         include_context: bool = True,
-    ) ->
+    ) -> list[SearchResult]:
        """Perform semantic search for code.
-
+
         Args:
             query: Search query
             limit: Maximum number of results
             filters: Optional filters (language, file_path, etc.)
             similarity_threshold: Minimum similarity score
             include_context: Whether to include context lines
-
+
         Returns:
             List of search results
         """
         if not query.strip():
             return []
 
-
+        # Auto-reindex check before search
+        if self.search_triggered_indexer:
+            try:
+                await self.search_triggered_indexer.pre_search_hook()
+            except Exception as e:
+                logger.warning(f"Auto-reindex check failed: {e}")
+
+        threshold = (
+            similarity_threshold
+            if similarity_threshold is not None
+            else self._get_adaptive_threshold(query)
+        )
 
         try:
             # Preprocess query
             processed_query = self._preprocess_query(query)
-
+
             # Perform vector search
             results = await self.database.search(
                 query=processed_query,
@@ -77,7 +100,9 @@ class SemanticSearchEngine:
             # Apply additional ranking if needed
             ranked_results = self._rerank_results(enhanced_results, query)
 
-            logger.debug(
+            logger.debug(
+                f"Search for '{query}' with threshold {threshold:.3f} returned {len(ranked_results)} results"
+            )
             return ranked_results
 
         except Exception as e:
@@ -87,29 +112,31 @@ class SemanticSearchEngine:
     async def search_similar(
         self,
         file_path: Path,
-        function_name:
+        function_name: str | None = None,
         limit: int = 10,
-        similarity_threshold:
-    ) ->
+        similarity_threshold: float | None = None,
+    ) -> list[SearchResult]:
         """Find code similar to a specific function or file.
-
+
         Args:
             file_path: Path to the reference file
             function_name: Specific function name (optional)
             limit: Maximum number of results
             similarity_threshold: Minimum similarity score
-
+
         Returns:
             List of similar code results
         """
         try:
             # Read the reference file
-            with open(file_path,
+            with open(file_path, encoding="utf-8") as f:
                 content = f.read()
 
             # If function name is specified, try to extract just that function
             if function_name:
-                function_content = self._extract_function_content(
+                function_content = self._extract_function_content(
+                    content, function_name
+                )
                 if function_content:
                     content = function_content
 
@@ -128,27 +155,27 @@ class SemanticSearchEngine:
     async def search_by_context(
         self,
         context_description: str,
-        focus_areas:
+        focus_areas: list[str] | None = None,
         limit: int = 10,
-    ) ->
+    ) -> list[SearchResult]:
         """Search for code based on contextual description.
-
+
         Args:
             context_description: Description of what you're looking for
             focus_areas: Areas to focus on (e.g., ["security", "authentication"])
             limit: Maximum number of results
-
+
         Returns:
             List of contextually relevant results
         """
         # Build enhanced query with focus areas
         query_parts = [context_description]
-
+
         if focus_areas:
             query_parts.extend(focus_areas)
-
+
         enhanced_query = " ".join(query_parts)
-
+
         return await self.search(
             query=enhanced_query,
             limit=limit,
@@ -157,46 +184,161 @@ class SemanticSearchEngine:
 
     def _preprocess_query(self, query: str) -> str:
         """Preprocess search query for better results.
-
+
         Args:
             query: Raw search query
-
+
         Returns:
             Processed query
         """
         # Remove extra whitespace
         query = re.sub(r"\s+", " ", query.strip())
-
-        # Expand common abbreviations
+
+        # Expand common programming abbreviations and synonyms
         expansions = {
-            "auth": "authentication",
-            "db": "database",
-            "api": "application programming interface",
-            "ui": "user interface",
-            "util": "utility",
-            "config": "configuration",
+            "auth": "authentication authorize login",
+            "db": "database data storage",
+            "api": "application programming interface endpoint",
+            "ui": "user interface frontend view",
+            "util": "utility helper function",
+            "config": "configuration settings options",
+            "async": "asynchronous await promise",
+            "sync": "synchronous blocking",
+            "func": "function method",
+            "var": "variable",
+            "param": "parameter argument",
+            "init": "initialize setup create",
+            "parse": "parsing parser analyze",
+            "validate": "validation check verify",
+            "handle": "handler process manage",
+            "error": "exception failure bug",
+            "test": "testing unittest spec",
+            "mock": "mocking stub fake",
+            "log": "logging logger debug",
         }
-
+
+        # Add programming language keywords and concepts
+        programming_concepts = {
+            "class": "class object type",
+            "method": "method function procedure",
+            "property": "property attribute field",
+            "import": "import require include",
+            "export": "export module public",
+            "return": "return yield output",
+            "loop": "loop iterate for while",
+            "condition": "condition if else branch",
+            "array": "array list collection",
+            "string": "string text character",
+            "number": "number integer float",
+            "boolean": "boolean true false",
+        }
+
+        # Merge all expansions
+        all_expansions = {**expansions, **programming_concepts}
+
         words = query.lower().split()
         expanded_words = []
-

         for word in words:
-
-
-
-
-
-
+            # Add original word
+            expanded_words.append(word)
+
+            # Add expansions if available
+            if word in all_expansions:
+                expanded_words.extend(all_expansions[word].split())
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_words = []
+        for word in expanded_words:
+            if word not in seen:
+                seen.add(word)
+                unique_words.append(word)
+
+        return " ".join(unique_words)
+
+    def _get_adaptive_threshold(self, query: str) -> float:
+        """Get adaptive similarity threshold based on query characteristics.
+
+        Args:
+            query: Search query
+
+        Returns:
+            Adaptive similarity threshold
+        """
+        base_threshold = self.similarity_threshold
+        query_lower = query.lower()
+        words = query.split()
+
+        # Adjust threshold based on query characteristics
+
+        # 1. Single word queries - lower threshold for broader results
+        if len(words) == 1:
+            return max(0.01, base_threshold - 0.29)
+
+        # 2. Very specific technical terms - lower threshold
+        technical_terms = [
+            "javascript",
+            "typescript",
+            "python",
+            "java",
+            "cpp",
+            "rust",
+            "go",
+            "function",
+            "class",
+            "method",
+            "variable",
+            "import",
+            "export",
+            "async",
+            "await",
+            "promise",
+            "callback",
+            "api",
+            "database",
+            "parser",
+            "compiler",
+            "interpreter",
+            "syntax",
+            "semantic",
+            "mcp",
+            "gateway",
+            "server",
+            "client",
+            "protocol",
+        ]
+
+        if any(term in query_lower for term in technical_terms):
+            return max(0.01, base_threshold - 0.29)
+
+        # 3. Short queries (2-3 words) - slightly lower threshold
+        if len(words) <= 3:
+            return max(0.1, base_threshold - 0.1)
+
+        # 4. Long queries (>6 words) - higher threshold for precision
+        if len(words) > 6:
+            return min(0.8, base_threshold + 0.1)
+
+        # 5. Queries with exact identifiers (CamelCase, snake_case)
+        if re.search(r"\b[A-Z][a-zA-Z]*\b", query) or "_" in query:
+            return max(0.05, base_threshold - 0.25)
+
+        # 6. Common programming patterns
+        if any(pattern in query for pattern in ["()", ".", "->", "=>", "::"]):
+            return max(0.25, base_threshold - 0.1)
+
+        return base_threshold
 
     async def _enhance_result(
         self, result: SearchResult, include_context: bool
     ) -> SearchResult:
         """Enhance search result with additional information.
-
+
         Args:
             result: Original search result
             include_context: Whether to include context lines
-
+
         Returns:
             Enhanced search result
         """
@@ -205,7 +347,7 @@ class SemanticSearchEngine:
 
         try:
             # Read the source file to get context
-            with open(result.file_path,
+            with open(result.file_path, encoding="utf-8") as f:
                 lines = f.readlines()
 
             # Get context lines before and after
@@ -216,9 +358,7 @@ class SemanticSearchEngine:
             context_before = [
                 line.rstrip() for line in lines[start_idx : result.start_line - 1]
             ]
-            context_after = [
-                line.rstrip() for line in lines[result.end_line : end_idx]
-            ]
+            context_after = [line.rstrip() for line in lines[result.end_line : end_idx]]
 
             # Update result with context
             result.context_before = context_before
@@ -230,66 +370,414 @@ class SemanticSearchEngine:
         return result
 
     def _rerank_results(
-        self, results:
-    ) ->
-        """Apply
-
+        self, results: list[SearchResult], query: str
+    ) -> list[SearchResult]:
+        """Apply advanced ranking to search results using multiple factors.
+
         Args:
             results: Original search results
             query: Original search query
-
+
         Returns:
             Reranked search results
         """
-
+        if not results:
+            return results
+
         query_lower = query.lower()
-
+        query_words = set(query_lower.split())
+
+        for result in results:
+            # Start with base similarity score
+            score = result.similarity_score
+
+            # Factor 1: Exact matches in identifiers (high boost)
+            if result.function_name:
+                func_name_lower = result.function_name.lower()
+                if query_lower in func_name_lower:
+                    score += 0.15  # Strong boost for function name match
+                # Partial word matches
+                for word in query_words:
+                    if word in func_name_lower:
+                        score += 0.05
+
+            if result.class_name:
+                class_name_lower = result.class_name.lower()
+                if query_lower in class_name_lower:
+                    score += 0.15  # Strong boost for class name match
+                # Partial word matches
+                for word in query_words:
+                    if word in class_name_lower:
+                        score += 0.05
+
+            # Factor 2: File name relevance
+            file_name_lower = result.file_path.name.lower()
+            if query_lower in file_name_lower:
+                score += 0.08
+            for word in query_words:
+                if word in file_name_lower:
+                    score += 0.03
+
+            # Factor 3: Content density (how many query words appear)
+            content_lower = result.content.lower()
+            word_matches = sum(1 for word in query_words if word in content_lower)
+            if word_matches > 0:
+                density_boost = (word_matches / len(query_words)) * 0.1
+                score += density_boost
+
+            # Factor 4: Code structure preferences
+            # Boost functions over general code blocks
+            if result.chunk_type == "function":
+                score += 0.05
+            elif result.chunk_type == "class":
+                score += 0.03
+
+            # Factor 5: File type preferences (prefer source files over tests/docs)
+            file_ext = result.file_path.suffix.lower()
+            if file_ext in [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs"]:
+                score += 0.02
+            elif "test" in result.file_path.name.lower():
+                score -= 0.02  # Slightly penalize test files unless specifically searching for tests
+
+            # Factor 6: Recency bias (prefer shorter file paths - often more core files)
+            path_depth = len(result.file_path.parts)
+            if path_depth <= 3:
+                score += 0.02
+            elif path_depth > 5:
+                score -= 0.01
+
+            # Ensure score doesn't exceed 1.0
+            result.similarity_score = min(1.0, score)
+
+        # Sort by enhanced similarity score
+        results.sort(key=lambda r: r.similarity_score, reverse=True)
+
+        # Update ranks
+        for i, result in enumerate(results):
+            result.rank = i + 1
+
+        return results
+
+    def analyze_query(self, query: str) -> dict[str, Any]:
+        """Analyze search query and provide suggestions for improvement.
+
+        Args:
+            query: Search query to analyze
+
+        Returns:
+            Dictionary with analysis results and suggestions
+        """
+        analysis = {
+            "original_query": query,
+            "processed_query": self._preprocess_query(query),
+            "query_type": "general",
+            "suggestions": [],
+            "confidence": "medium",
+        }
+
+        query_lower = query.lower()
+
+        # Detect query type
+        if any(word in query_lower for word in ["function", "method", "def", "func"]):
+            analysis["query_type"] = "function_search"
+            analysis["suggestions"].append(
+                "Try searching for specific function names or patterns"
+            )
+        elif any(word in query_lower for word in ["class", "object", "type"]):
+            analysis["query_type"] = "class_search"
+            analysis["suggestions"].append(
+                "Include class inheritance or interface information"
+            )
+        elif any(word in query_lower for word in ["error", "exception", "bug", "fix"]):
+            analysis["query_type"] = "error_handling"
+            analysis["suggestions"].append("Include error types or exception names")
+        elif any(word in query_lower for word in ["test", "spec", "mock"]):
+            analysis["query_type"] = "testing"
+            analysis["suggestions"].append("Specify test framework or testing patterns")
+        elif any(word in query_lower for word in ["config", "setting", "option"]):
+            analysis["query_type"] = "configuration"
+            analysis["suggestions"].append(
+                "Include configuration file types or setting names"
+            )
+
+        # Analyze query complexity
+        words = query.split()
+        if len(words) == 1:
+            analysis["confidence"] = "low"
+            analysis["suggestions"].append(
+                "Try adding more descriptive words for better results"
+            )
+        elif len(words) > 10:
+            analysis["confidence"] = "low"
+            analysis["suggestions"].append(
+                "Consider simplifying your query for better matching"
+            )
+        else:
+            analysis["confidence"] = "high"
+
+        # Check for common programming patterns
+        if re.search(r"\b\w+\(\)", query):
+            analysis["suggestions"].append(
+                "Function call detected - searching for function definitions"
+            )
+        if re.search(r"\b[A-Z][a-zA-Z]*\b", query):
+            analysis["suggestions"].append(
+                "CamelCase detected - searching for class or type names"
+            )
+        if re.search(r"\b\w+\.\w+", query):
+            analysis["suggestions"].append(
+                "Dot notation detected - searching for method calls or properties"
+            )
+
+        return analysis
+
+    def suggest_related_queries(
+        self, query: str, results: list[SearchResult]
+    ) -> list[str]:
+        """Suggest related queries based on search results.
+
+        Args:
+            query: Original search query
+            results: Search results
+
+        Returns:
+            List of suggested related queries
+        """
+        suggestions = []
+
+        if not results:
+            # No results - suggest broader queries
+            words = query.lower().split()
+            if len(words) > 1:
+                # Try individual words
+                suggestions.extend(words[:3])  # Top 3 words
+
+            # Suggest common related terms
+            related_terms = {
+                "auth": ["login", "user", "session", "token"],
+                "database": ["query", "model", "schema", "connection"],
+                "api": ["endpoint", "request", "response", "handler"],
+                "test": ["mock", "assert", "spec", "unit"],
+                "error": ["exception", "handle", "catch", "debug"],
+            }
+
+            for word in words:
+                if word in related_terms:
+                    suggestions.extend(related_terms[word][:2])
+        else:
+            # Extract common patterns from results
+            function_names = [r.function_name for r in results if r.function_name]
+            class_names = [r.class_name for r in results if r.class_name]
+
+            # Suggest function names
+            if function_names:
+                unique_functions = list(set(function_names))[:3]
+                suggestions.extend(unique_functions)
+
+            # Suggest class names
+            if class_names:
+                unique_classes = list(set(class_names))[:3]
+                suggestions.extend(unique_classes)
+
+            # Suggest file-based queries
+            file_patterns = set()
+            for result in results[:5]:  # Top 5 results
+                file_name = result.file_path.stem
+                if "_" in file_name:
+                    file_patterns.update(file_name.split("_"))
+                elif file_name not in suggestions:
+                    file_patterns.add(file_name)
+
+            suggestions.extend(list(file_patterns)[:3])
+
+        # Remove duplicates and original query words
+        query_words = set(query.lower().split())
+        unique_suggestions = []
+        for suggestion in suggestions:
+            if (
+                suggestion
+                and suggestion.lower() not in query_words
+                and suggestion not in unique_suggestions
+            ):
+                unique_suggestions.append(suggestion)
+
+        return unique_suggestions[:5]  # Return top 5 suggestions
+
+    async def search_with_context(
+        self,
+        query: str,
+        context_files: list[Path] | None = None,
+        limit: int = 10,
+        similarity_threshold: float | None = None,
+    ) -> dict[str, Any]:
+        """Enhanced search with contextual analysis and suggestions.
+
+        Args:
+            query: Search query
+            context_files: Optional list of files to provide context
+            limit: Maximum number of results
+            similarity_threshold: Minimum similarity score
+
+        Returns:
+            Dictionary with results, analysis, and suggestions
+        """
+        # Analyze the query
+        query_analysis = self.analyze_query(query)
+
+        # Perform the search
+        results = await self.search(
+            query=query,
+            limit=limit,
+            similarity_threshold=similarity_threshold,
+            include_context=True,
+        )
+
+        # Get related query suggestions
+        suggestions = self.suggest_related_queries(query, results)
+
+        # Enhance results with additional context if context files provided
+        if context_files:
+            results = await self._enhance_with_file_context(results, context_files)
+
+        # Calculate result quality metrics
+        quality_metrics = self._calculate_result_quality(results, query)
+
+        return {
+            "query": query,
+            "analysis": query_analysis,
+            "results": results,
+            "suggestions": suggestions,
+            "metrics": quality_metrics,
+            "total_results": len(results),
+        }
+
+    async def _enhance_with_file_context(
+        self, results: list[SearchResult], context_files: list[Path]
+    ) -> list[SearchResult]:
+        """Enhance results by considering context from specific files.
+
+        Args:
+            results: Original search results
+            context_files: Files to use for context
+
+        Returns:
+            Enhanced search results
+        """
+        # Read context from files
+        context_content = []
+        for file_path in context_files:
+            try:
+                with open(file_path, encoding="utf-8") as f:
+                    content = f.read()
+                    context_content.append(content)
+            except Exception as e:
+                logger.warning(f"Failed to read context file {file_path}: {e}")
+
+        if not context_content:
+            return results
+
+        # Boost results that are related to context files
+        context_text = " ".join(context_content).lower()
+
         for result in results:
-            #
-
-
-
-
-            if result.
-
-
-
-
-
-
-
-
-
-        # Re-sort by
+            # Check if result is from one of the context files
+            if result.file_path in context_files:
+                result.similarity_score = min(1.0, result.similarity_score + 0.1)
+
+            # Check if result content relates to context
+            result.content.lower()
+            if result.function_name:
+                func_name_lower = result.function_name.lower()
+                if func_name_lower in context_text:
+                    result.similarity_score = min(1.0, result.similarity_score + 0.05)
+
+            if result.class_name:
+                class_name_lower = result.class_name.lower()
+                if class_name_lower in context_text:
+                    result.similarity_score = min(1.0, result.similarity_score + 0.05)
+
+        # Re-sort by updated scores
         results.sort(key=lambda r: r.similarity_score, reverse=True)
-
+
         # Update ranks
         for i, result in enumerate(results):
             result.rank = i + 1
-
+
         return results
 
-    def
+    def _calculate_result_quality(
+        self, results: list[SearchResult], query: str
+    ) -> dict[str, Any]:
+        """Calculate quality metrics for search results.
+
+        Args:
+            results: Search results
+            query: Original query
+
+        Returns:
+            Dictionary with quality metrics
+        """
+        if not results:
+            return {
+                "average_score": 0.0,
+                "score_distribution": {},
+                "diversity": 0.0,
+                "coverage": 0.0,
+            }
+
+        # Calculate average similarity score
+        scores = [r.similarity_score for r in results]
+        avg_score = sum(scores) / len(scores)
+
+        # Score distribution
+        high_quality = sum(1 for s in scores if s >= 0.8)
+        medium_quality = sum(1 for s in scores if 0.6 <= s < 0.8)
+        low_quality = sum(1 for s in scores if s < 0.6)
+
+        # Diversity (unique files)
+        unique_files = len({r.file_path for r in results})
+        diversity = unique_files / len(results) if results else 0.0
+
+        # Coverage (how many query words are covered)
+        query_words = set(query.lower().split())
+        covered_words = set()
+        for result in results:
+            content_words = set(result.content.lower().split())
+            covered_words.update(query_words.intersection(content_words))
+
+        coverage = len(covered_words) / len(query_words) if query_words else 0.0
+
+        return {
+            "average_score": round(avg_score, 3),
+            "score_distribution": {
+                "high_quality": high_quality,
+                "medium_quality": medium_quality,
+                "low_quality": low_quality,
+            },
+            "diversity": round(diversity, 3),
+            "coverage": round(coverage, 3),
+        }
+
+    def _extract_function_content(self, content: str, function_name: str) -> str | None:
         """Extract content of a specific function from code.
-
+
         Args:
             content: Full file content
             function_name: Name of function to extract
-
+
         Returns:
             Function content if found, None otherwise
         """
         # Simple regex-based extraction (could be improved with AST)
         pattern = rf"^\s*def\s+{re.escape(function_name)}\s*\("
         lines = content.splitlines()
-
+
         for i, line in enumerate(lines):
             if re.match(pattern, line):
                 # Found function start, now find the end
                 start_line = i
                 indent_level = len(line) - len(line.lstrip())
-
+
                 # Find end of function
                 end_line = len(lines)
                 for j in range(i + 1, len(lines)):
@@ -298,27 +786,27 @@ class SemanticSearchEngine:
                    if current_indent <= indent_level:
                        end_line = j
                        break
-
+
                return "\n".join(lines[start_line:end_line])
-
+
        return None
 
-    async def get_search_stats(self) ->
+    async def get_search_stats(self) -> dict[str, Any]:
        """Get search engine statistics.
-
+
        Returns:
            Dictionary with search statistics
        """
        try:
            db_stats = await self.database.get_stats()
-
+
            return {
                "total_chunks": db_stats.total_chunks,
                "languages": db_stats.languages,
                "similarity_threshold": self.similarity_threshold,
                "project_root": str(self.project_root),
            }
-
+
        except Exception as e:
            logger.error(f"Failed to get search stats: {e}")
            return {"error": str(e)}