mcp-vector-search 0.15.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release.
This version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +10 -0
- mcp_vector_search/cli/__init__.py +1 -0
- mcp_vector_search/cli/commands/__init__.py +1 -0
- mcp_vector_search/cli/commands/auto_index.py +397 -0
- mcp_vector_search/cli/commands/chat.py +534 -0
- mcp_vector_search/cli/commands/config.py +393 -0
- mcp_vector_search/cli/commands/demo.py +358 -0
- mcp_vector_search/cli/commands/index.py +762 -0
- mcp_vector_search/cli/commands/init.py +658 -0
- mcp_vector_search/cli/commands/install.py +869 -0
- mcp_vector_search/cli/commands/install_old.py +700 -0
- mcp_vector_search/cli/commands/mcp.py +1254 -0
- mcp_vector_search/cli/commands/reset.py +393 -0
- mcp_vector_search/cli/commands/search.py +796 -0
- mcp_vector_search/cli/commands/setup.py +1133 -0
- mcp_vector_search/cli/commands/status.py +584 -0
- mcp_vector_search/cli/commands/uninstall.py +404 -0
- mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
- mcp_vector_search/cli/commands/visualize/cli.py +265 -0
- mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
- mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
- mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
- mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
- mcp_vector_search/cli/commands/visualize/server.py +201 -0
- mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
- mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
- mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
- mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
- mcp_vector_search/cli/commands/visualize.py.original +2536 -0
- mcp_vector_search/cli/commands/watch.py +287 -0
- mcp_vector_search/cli/didyoumean.py +520 -0
- mcp_vector_search/cli/export.py +320 -0
- mcp_vector_search/cli/history.py +295 -0
- mcp_vector_search/cli/interactive.py +342 -0
- mcp_vector_search/cli/main.py +484 -0
- mcp_vector_search/cli/output.py +414 -0
- mcp_vector_search/cli/suggestions.py +375 -0
- mcp_vector_search/config/__init__.py +1 -0
- mcp_vector_search/config/constants.py +24 -0
- mcp_vector_search/config/defaults.py +200 -0
- mcp_vector_search/config/settings.py +146 -0
- mcp_vector_search/core/__init__.py +1 -0
- mcp_vector_search/core/auto_indexer.py +298 -0
- mcp_vector_search/core/config_utils.py +394 -0
- mcp_vector_search/core/connection_pool.py +360 -0
- mcp_vector_search/core/database.py +1237 -0
- mcp_vector_search/core/directory_index.py +318 -0
- mcp_vector_search/core/embeddings.py +294 -0
- mcp_vector_search/core/exceptions.py +89 -0
- mcp_vector_search/core/factory.py +318 -0
- mcp_vector_search/core/git_hooks.py +345 -0
- mcp_vector_search/core/indexer.py +1002 -0
- mcp_vector_search/core/llm_client.py +453 -0
- mcp_vector_search/core/models.py +294 -0
- mcp_vector_search/core/project.py +350 -0
- mcp_vector_search/core/scheduler.py +330 -0
- mcp_vector_search/core/search.py +952 -0
- mcp_vector_search/core/watcher.py +322 -0
- mcp_vector_search/mcp/__init__.py +5 -0
- mcp_vector_search/mcp/__main__.py +25 -0
- mcp_vector_search/mcp/server.py +752 -0
- mcp_vector_search/parsers/__init__.py +8 -0
- mcp_vector_search/parsers/base.py +296 -0
- mcp_vector_search/parsers/dart.py +605 -0
- mcp_vector_search/parsers/html.py +413 -0
- mcp_vector_search/parsers/javascript.py +643 -0
- mcp_vector_search/parsers/php.py +694 -0
- mcp_vector_search/parsers/python.py +502 -0
- mcp_vector_search/parsers/registry.py +223 -0
- mcp_vector_search/parsers/ruby.py +678 -0
- mcp_vector_search/parsers/text.py +186 -0
- mcp_vector_search/parsers/utils.py +265 -0
- mcp_vector_search/py.typed +1 -0
- mcp_vector_search/utils/__init__.py +42 -0
- mcp_vector_search/utils/gitignore.py +250 -0
- mcp_vector_search/utils/gitignore_updater.py +212 -0
- mcp_vector_search/utils/monorepo.py +339 -0
- mcp_vector_search/utils/timing.py +338 -0
- mcp_vector_search/utils/version.py +47 -0
- mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
- mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
- mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
- mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
- mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,952 @@
"""Semantic search engine for MCP Vector Search."""

import re
import time
from collections import OrderedDict
from pathlib import Path
from typing import Any

import aiofiles
from loguru import logger

from ..config.constants import DEFAULT_CACHE_SIZE
from .auto_indexer import AutoIndexer, SearchTriggeredIndexer
from .database import VectorDatabase
from .exceptions import SearchError
from .models import SearchResult

class SemanticSearchEngine:
    """Semantic search engine for code search."""

    # Query expansion constants (class-level for performance)
    _QUERY_EXPANSIONS = {
        # Common abbreviations
        "auth": "authentication authorize login",
        "db": "database data storage",
        "api": "application programming interface endpoint",
        "ui": "user interface frontend view",
        "util": "utility helper function",
        "config": "configuration settings options",
        "async": "asynchronous await promise",
        "sync": "synchronous blocking",
        "func": "function method",
        "var": "variable",
        "param": "parameter argument",
        "init": "initialize setup create",
        "parse": "parsing parser analyze",
        "validate": "validation check verify",
        "handle": "handler process manage",
        "error": "exception failure bug",
        "test": "testing unittest spec",
        "mock": "mocking stub fake",
        "log": "logging logger debug",
        # Programming concepts
        "class": "class object type",
        "method": "method function procedure",
        "property": "property attribute field",
        "import": "import require include",
        "export": "export module public",
        "return": "return yield output",
        "loop": "loop iterate for while",
        "condition": "condition if else branch",
        "array": "array list collection",
        "string": "string text character",
        "number": "number integer float",
        "boolean": "boolean true false",
    }

    # Reranking boost constants (class-level for performance)
    _BOOST_EXACT_IDENTIFIER = 0.15
    _BOOST_PARTIAL_IDENTIFIER = 0.05
    _BOOST_FILE_NAME_EXACT = 0.08
    _BOOST_FILE_NAME_PARTIAL = 0.03
    _BOOST_FUNCTION_CHUNK = 0.05
    _BOOST_CLASS_CHUNK = 0.03
    _BOOST_SOURCE_FILE = 0.02
    _BOOST_SHALLOW_PATH = 0.02
    _PENALTY_TEST_FILE = -0.02
    _PENALTY_DEEP_PATH = -0.01

    def __init__(
        self,
        database: VectorDatabase,
        project_root: Path,
        similarity_threshold: float = 0.3,
        auto_indexer: AutoIndexer | None = None,
        enable_auto_reindex: bool = True,
    ) -> None:
        """Initialize semantic search engine.

        Args:
            database: Vector database instance
            project_root: Project root directory
            similarity_threshold: Default similarity threshold
            auto_indexer: Optional auto-indexer for semi-automatic reindexing
            enable_auto_reindex: Whether to enable automatic reindexing
        """
        self.database = database
        self.project_root = project_root
        self.similarity_threshold = similarity_threshold
        self.auto_indexer = auto_indexer
        self.enable_auto_reindex = enable_auto_reindex

        # Initialize search-triggered indexer if auto-indexer is provided
        self.search_triggered_indexer = None
        if auto_indexer and enable_auto_reindex:
            self.search_triggered_indexer = SearchTriggeredIndexer(auto_indexer)

        # File content cache for performance (proper LRU with OrderedDict)
        self._file_cache: OrderedDict[Path, list[str]] = OrderedDict()
        self._cache_maxsize = DEFAULT_CACHE_SIZE
        self._cache_hits = 0
        self._cache_misses = 0

        # Health check throttling (only check every 60 seconds)
        self._last_health_check: float = 0.0
        self._health_check_interval: float = 60.0

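    # A minimal construction sketch (editor's addition, hypothetical wiring -
    # not part of the released file): the engine only needs a VectorDatabase
    # implementation and a project root; the auto-indexer is optional.
    #
    #     engine = SemanticSearchEngine(
    #         database=db,  # any concrete VectorDatabase from this package
    #         project_root=Path("/path/to/project"),
    #         similarity_threshold=0.3,
    #     )
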
    async def search(
        self,
        query: str,
        limit: int = 10,
        filters: dict[str, Any] | None = None,
        similarity_threshold: float | None = None,
        include_context: bool = True,
    ) -> list[SearchResult]:
        """Perform semantic search for code.

        Args:
            query: Search query
            limit: Maximum number of results
            filters: Optional filters (language, file_path, etc.)
            similarity_threshold: Minimum similarity score
            include_context: Whether to include context lines

        Returns:
            List of search results
        """
        if not query.strip():
            return []

        # Throttled health check before search (only every 60 seconds)
        current_time = time.time()
        if current_time - self._last_health_check >= self._health_check_interval:
            try:
                if hasattr(self.database, "health_check"):
                    is_healthy = await self.database.health_check()
                    if not is_healthy:
                        logger.warning(
                            "Database health check failed - attempting recovery"
                        )
                        # Health check already attempts recovery, so we can proceed
                self._last_health_check = current_time
            except Exception as e:
                logger.warning(f"Health check failed: {e}")
                self._last_health_check = current_time

        # Auto-reindex check before search
        if self.search_triggered_indexer:
            try:
                await self.search_triggered_indexer.pre_search_hook()
            except Exception as e:
                logger.warning(f"Auto-reindex check failed: {e}")

        threshold = (
            similarity_threshold
            if similarity_threshold is not None
            else self._get_adaptive_threshold(query)
        )

        try:
            # Preprocess query
            processed_query = self._preprocess_query(query)

            # Perform vector search
            results = await self.database.search(
                query=processed_query,
                limit=limit,
                filters=filters,
                similarity_threshold=threshold,
            )

            # Post-process results
            enhanced_results = []
            for result in results:
                enhanced_result = await self._enhance_result(result, include_context)
                enhanced_results.append(enhanced_result)

            # Apply additional ranking if needed
            ranked_results = self._rerank_results(enhanced_results, query)

            logger.debug(
                f"Search for '{query}' with threshold {threshold:.3f} returned {len(ranked_results)} results"
            )
            return ranked_results

        except Exception as e:
            error_msg = str(e).lower()
            # Check for corruption indicators
            if any(
                indicator in error_msg
                for indicator in [
                    "pickle",
                    "unpickling",
                    "eof",
                    "ran out of input",
                    "hnsw",
                    "index",
                    "deserialize",
                    "corrupt",
                ]
            ):
                logger.error(f"Index corruption detected during search: {e}")
                logger.info(
                    "The index appears to be corrupted. Please run 'mcp-vector-search reset' to clear the index and then 'mcp-vector-search index' to rebuild it."
                )
                raise SearchError(
                    "Index corruption detected. Please run 'mcp-vector-search reset' followed by 'mcp-vector-search index' to rebuild."
                ) from e
            else:
                logger.error(f"Search failed for query '{query}': {e}")
                raise SearchError(f"Search failed: {e}") from e

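    # Usage sketch (editor's addition, hypothetical driver - not part of the
    # released file): the engine is async, so a minimal call site looks like
    #
    #     import asyncio
    #
    #     async def main() -> None:
    #         results = await engine.search(
    #             "database connection pool",
    #             limit=5,
    #             filters={"language": "python"},
    #         )
    #         for r in results:
    #             print(r.rank, r.file_path, f"{r.similarity_score:.2f}")
    #
    #     asyncio.run(main())
    #
    # The filters dict is passed straight through to VectorDatabase.search(),
    # so the supported keys depend on the database backend.
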
    async def search_similar(
        self,
        file_path: Path,
        function_name: str | None = None,
        limit: int = 10,
        similarity_threshold: float | None = None,
    ) -> list[SearchResult]:
        """Find code similar to a specific function or file.

        Args:
            file_path: Path to the reference file
            function_name: Specific function name (optional)
            limit: Maximum number of results
            similarity_threshold: Minimum similarity score

        Returns:
            List of similar code results
        """
        try:
            # Read the reference file using async I/O
            async with aiofiles.open(file_path, encoding="utf-8") as f:
                content = await f.read()

            # If function name is specified, try to extract just that function
            if function_name:
                function_content = self._extract_function_content(
                    content, function_name
                )
                if function_content:
                    content = function_content

            # Use the content as the search query
            return await self.search(
                query=content,
                limit=limit,
                similarity_threshold=similarity_threshold,
                include_context=True,
            )

        except Exception as e:
            logger.error(f"Similar search failed for {file_path}: {e}")
            raise SearchError(f"Similar search failed: {e}") from e

    async def search_by_context(
        self,
        context_description: str,
        focus_areas: list[str] | None = None,
        limit: int = 10,
    ) -> list[SearchResult]:
        """Search for code based on contextual description.

        Args:
            context_description: Description of what you're looking for
            focus_areas: Areas to focus on (e.g., ["security", "authentication"])
            limit: Maximum number of results

        Returns:
            List of contextually relevant results
        """
        # Build enhanced query with focus areas
        query_parts = [context_description]

        if focus_areas:
            query_parts.extend(focus_areas)

        enhanced_query = " ".join(query_parts)

        return await self.search(
            query=enhanced_query,
            limit=limit,
            include_context=True,
        )

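    # Illustration (editor's addition, hypothetical call): focus areas are
    # simply appended to the description before the normal search runs, so
    #
    #     await engine.search_by_context(
    #         "code that validates user input",
    #         focus_areas=["security", "sanitization"],
    #     )
    #
    # internally searches "code that validates user input security sanitization".
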
    def _preprocess_query(self, query: str) -> str:
        """Preprocess search query for better results.

        Args:
            query: Raw search query

        Returns:
            Processed query
        """
        # Remove extra whitespace
        query = re.sub(r"\s+", " ", query.strip())

        # Use class-level query expansions (no dict creation overhead)
        words = query.lower().split()
        expanded_words = []

        for word in words:
            # Add original word
            expanded_words.append(word)

            # Add expansions if available
            if word in self._QUERY_EXPANSIONS:
                expanded_words.extend(self._QUERY_EXPANSIONS[word].split())

        # Remove duplicates while preserving order
        seen = set()
        unique_words = []
        for word in expanded_words:
            if word not in seen:
                seen.add(word)
                unique_words.append(word)

        return " ".join(unique_words)

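    # Worked example (editor's addition; output verified against the logic
    # above, not taken from the package's own docs):
    #
    #     >>> engine._preprocess_query("auth  error")
    #     'auth authentication authorize login error exception failure bug'
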
    def _get_adaptive_threshold(self, query: str) -> float:
        """Get adaptive similarity threshold based on query characteristics.

        Args:
            query: Search query

        Returns:
            Adaptive similarity threshold
        """
        base_threshold = self.similarity_threshold
        query_lower = query.lower()
        words = query.split()

        # Adjust threshold based on query characteristics

        # 1. Single word queries - lower threshold for broader results
        if len(words) == 1:
            return max(0.01, base_threshold - 0.29)

        # 2. Very specific technical terms - lower threshold
        technical_terms = [
            "javascript",
            "typescript",
            "python",
            "java",
            "cpp",
            "rust",
            "go",
            "function",
            "class",
            "method",
            "variable",
            "import",
            "export",
            "async",
            "await",
            "promise",
            "callback",
            "api",
            "database",
            "parser",
            "compiler",
            "interpreter",
            "syntax",
            "semantic",
            "mcp",
            "gateway",
            "server",
            "client",
            "protocol",
        ]

        if any(term in query_lower for term in technical_terms):
            return max(0.01, base_threshold - 0.29)

        # 3. Short queries (2-3 words) - slightly lower threshold
        if len(words) <= 3:
            return max(0.1, base_threshold - 0.1)

        # 4. Long queries (>6 words) - higher threshold for precision
        if len(words) > 6:
            return min(0.8, base_threshold + 0.1)

        # 5. Queries with exact identifiers (CamelCase, snake_case)
        if re.search(r"\b[A-Z][a-zA-Z]*\b", query) or "_" in query:
            return max(0.05, base_threshold - 0.25)

        # 6. Common programming patterns
        if any(pattern in query for pattern in ["()", ".", "->", "=>", "::"]):
            return max(0.25, base_threshold - 0.1)

        return base_threshold

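    # Worked example (editor's addition; values follow from the rules above
    # with the default base threshold of 0.3, rounded because the arithmetic
    # is floating point):
    #
    #     >>> round(engine._get_adaptive_threshold("parser"), 2)  # rule 1
    #     0.01
    #     >>> round(engine._get_adaptive_threshold("read the cache"), 2)  # rule 3
    #     0.2
    #     >>> round(engine._get_adaptive_threshold(
    #     ...     "how does this engine rank results from many files"), 2)  # rule 4
    #     0.4
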
    async def _read_file_lines_cached(self, file_path: Path) -> list[str]:
        """Read file lines with proper LRU caching for performance.

        Args:
            file_path: Path to the file

        Returns:
            List of file lines

        Raises:
            FileNotFoundError: If file doesn't exist
        """
        # Check cache - move to end if found (most recently used)
        if file_path in self._file_cache:
            self._cache_hits += 1
            # Move to end (most recently used)
            self._file_cache.move_to_end(file_path)
            return self._file_cache[file_path]

        self._cache_misses += 1

        # Read file asynchronously
        try:
            async with aiofiles.open(file_path, encoding="utf-8") as f:
                content = await f.read()
            lines = content.splitlines(keepends=True)

            # Proper LRU: if cache is full, remove least recently used (first item)
            if len(self._file_cache) >= self._cache_maxsize:
                # Remove least recently used entry (first item in OrderedDict)
                self._file_cache.popitem(last=False)

            # Add to cache (will be at end, most recently used)
            self._file_cache[file_path] = lines
            return lines

        except FileNotFoundError:
            # Cache the miss to avoid repeated failed attempts
            if len(self._file_cache) >= self._cache_maxsize:
                self._file_cache.popitem(last=False)
            self._file_cache[file_path] = []
            raise

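    # Editor's note (behavioral summary, not in the original file): the
    # OrderedDict gives O(1) LRU semantics - move_to_end() on every hit,
    # popitem(last=False) to evict the oldest entry once the cache holds
    # DEFAULT_CACHE_SIZE files. A FileNotFoundError is cached as an empty
    # list, so later callers get [] back instead of a second filesystem hit.
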
    async def _enhance_result(
        self, result: SearchResult, include_context: bool
    ) -> SearchResult:
        """Enhance search result with additional information.

        Args:
            result: Original search result
            include_context: Whether to include context lines

        Returns:
            Enhanced search result
        """
        if not include_context:
            return result

        try:
            # Read the source file using cached method
            lines = await self._read_file_lines_cached(result.file_path)

            if not lines:  # File not found or empty
                return result

            # Get context lines before and after
            context_size = 3
            start_idx = max(0, result.start_line - 1 - context_size)
            end_idx = min(len(lines), result.end_line + context_size)

            context_before = [
                line.rstrip() for line in lines[start_idx : result.start_line - 1]
            ]
            context_after = [line.rstrip() for line in lines[result.end_line : end_idx]]

            # Update result with context
            result.context_before = context_before
            result.context_after = context_after

        except Exception as e:
            logger.warning(f"Failed to get context for {result.file_path}: {e}")

        return result

    def _rerank_results(
        self, results: list[SearchResult], query: str
    ) -> list[SearchResult]:
        """Apply advanced ranking to search results using multiple factors.

        Args:
            results: Original search results
            query: Original search query

        Returns:
            Reranked search results
        """
        if not results:
            return results

        # Pre-compute lowercased strings once (avoid repeated .lower() calls)
        query_lower = query.lower()
        query_words = set(query_lower.split())

        # Pre-compute file extensions for source files
        source_exts = frozenset(
            [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs"]
        )

        for result in results:
            # Start with base similarity score
            score = result.similarity_score

            # Factor 1: Exact matches in identifiers (high boost)
            if result.function_name:
                func_name_lower = result.function_name.lower()
                if query_lower in func_name_lower:
                    score += self._BOOST_EXACT_IDENTIFIER
                # Partial word matches
                score += sum(
                    self._BOOST_PARTIAL_IDENTIFIER
                    for word in query_words
                    if word in func_name_lower
                )

            if result.class_name:
                class_name_lower = result.class_name.lower()
                if query_lower in class_name_lower:
                    score += self._BOOST_EXACT_IDENTIFIER
                # Partial word matches
                score += sum(
                    self._BOOST_PARTIAL_IDENTIFIER
                    for word in query_words
                    if word in class_name_lower
                )

            # Factor 2: File name relevance
            file_name_lower = result.file_path.name.lower()
            if query_lower in file_name_lower:
                score += self._BOOST_FILE_NAME_EXACT
            score += sum(
                self._BOOST_FILE_NAME_PARTIAL
                for word in query_words
                if word in file_name_lower
            )

            # Factor 3: Content density (how many query words appear)
            content_lower = result.content.lower()
            word_matches = sum(1 for word in query_words if word in content_lower)
            if word_matches > 0:
                score += (word_matches / len(query_words)) * 0.1

            # Factor 4: Code structure preferences (combined conditions)
            if result.chunk_type == "function":
                score += self._BOOST_FUNCTION_CHUNK
            elif result.chunk_type == "class":
                score += self._BOOST_CLASS_CHUNK

            # Factor 5: File type preferences (prefer source files over tests)
            file_ext = result.file_path.suffix.lower()
            if file_ext in source_exts:
                score += self._BOOST_SOURCE_FILE
            if "test" in file_name_lower:  # Already computed
                score += self._PENALTY_TEST_FILE

            # Factor 6: Path depth preference
            path_depth = len(result.file_path.parts)
            if path_depth <= 3:
                score += self._BOOST_SHALLOW_PATH
            elif path_depth > 5:
                score += self._PENALTY_DEEP_PATH

            # Ensure score doesn't exceed 1.0
            result.similarity_score = min(1.0, score)

        # Sort by enhanced similarity score
        results.sort(key=lambda r: r.similarity_score, reverse=True)

        # Update ranks
        for i, result in enumerate(results):
            result.rank = i + 1

        return results

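    # Worked example (editor's addition; hypothetical result, arithmetic
    # checked against the factors above): for the query "parse config", a
    # function chunk named parse_config in config_parser.py (path depth 4)
    # with a base score of 0.60 picks up
    #
    #     +0.05 +0.05   "parse" and "config" each match the identifier
    #     +0.03 +0.03   both words also match the file name
    #     +0.10         both query words appear in the content (2/2 * 0.1)
    #     +0.05         chunk_type == "function"
    #     +0.02         .py is a source extension
    #
    # for a reranked score of 0.93 (depth 4 earns neither boost nor penalty).
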
    def analyze_query(self, query: str) -> dict[str, Any]:
        """Analyze search query and provide suggestions for improvement.

        Args:
            query: Search query to analyze

        Returns:
            Dictionary with analysis results and suggestions
        """
        analysis = {
            "original_query": query,
            "processed_query": self._preprocess_query(query),
            "query_type": "general",
            "suggestions": [],
            "confidence": "medium",
        }

        query_lower = query.lower()

        # Detect query type
        if any(word in query_lower for word in ["function", "method", "def", "func"]):
            analysis["query_type"] = "function_search"
            analysis["suggestions"].append(
                "Try searching for specific function names or patterns"
            )
        elif any(word in query_lower for word in ["class", "object", "type"]):
            analysis["query_type"] = "class_search"
            analysis["suggestions"].append(
                "Include class inheritance or interface information"
            )
        elif any(word in query_lower for word in ["error", "exception", "bug", "fix"]):
            analysis["query_type"] = "error_handling"
            analysis["suggestions"].append("Include error types or exception names")
        elif any(word in query_lower for word in ["test", "spec", "mock"]):
            analysis["query_type"] = "testing"
            analysis["suggestions"].append("Specify test framework or testing patterns")
        elif any(word in query_lower for word in ["config", "setting", "option"]):
            analysis["query_type"] = "configuration"
            analysis["suggestions"].append(
                "Include configuration file types or setting names"
            )

        # Analyze query complexity
        words = query.split()
        if len(words) == 1:
            analysis["confidence"] = "low"
            analysis["suggestions"].append(
                "Try adding more descriptive words for better results"
            )
        elif len(words) > 10:
            analysis["confidence"] = "low"
            analysis["suggestions"].append(
                "Consider simplifying your query for better matching"
            )
        else:
            analysis["confidence"] = "high"

        # Check for common programming patterns
        if re.search(r"\b\w+\(\)", query):
            analysis["suggestions"].append(
                "Function call detected - searching for function definitions"
            )
        if re.search(r"\b[A-Z][a-zA-Z]*\b", query):
            analysis["suggestions"].append(
                "CamelCase detected - searching for class or type names"
            )
        if re.search(r"\b\w+\.\w+", query):
            analysis["suggestions"].append(
                "Dot notation detected - searching for method calls or properties"
            )

        return analysis

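    # Illustration (editor's addition, hypothetical input): for
    # analyze_query("handle TimeoutError") the branches above yield
    # query_type "error_handling" ("error" is a substring of the lowered
    # query), confidence "high" (two words), plus the CamelCase suggestion.
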
    def suggest_related_queries(
        self, query: str, results: list[SearchResult]
    ) -> list[str]:
        """Suggest related queries based on search results.

        Args:
            query: Original search query
            results: Search results

        Returns:
            List of suggested related queries
        """
        suggestions = []

        if not results:
            # No results - suggest broader queries
            words = query.lower().split()
            if len(words) > 1:
                # Try individual words
                suggestions.extend(words[:3])  # Top 3 words

            # Suggest common related terms
            related_terms = {
                "auth": ["login", "user", "session", "token"],
                "database": ["query", "model", "schema", "connection"],
                "api": ["endpoint", "request", "response", "handler"],
                "test": ["mock", "assert", "spec", "unit"],
                "error": ["exception", "handle", "catch", "debug"],
            }

            for word in words:
                if word in related_terms:
                    suggestions.extend(related_terms[word][:2])
        else:
            # Extract common patterns from results
            function_names = [r.function_name for r in results if r.function_name]
            class_names = [r.class_name for r in results if r.class_name]

            # Suggest function names
            if function_names:
                unique_functions = list(set(function_names))[:3]
                suggestions.extend(unique_functions)

            # Suggest class names
            if class_names:
                unique_classes = list(set(class_names))[:3]
                suggestions.extend(unique_classes)

            # Suggest file-based queries
            file_patterns = set()
            for result in results[:5]:  # Top 5 results
                file_name = result.file_path.stem
                if "_" in file_name:
                    file_patterns.update(file_name.split("_"))
                elif file_name not in suggestions:
                    file_patterns.add(file_name)

            suggestions.extend(list(file_patterns)[:3])

        # Remove duplicates and original query words
        query_words = set(query.lower().split())
        unique_suggestions = []
        for suggestion in suggestions:
            if (
                suggestion
                and suggestion.lower() not in query_words
                and suggestion not in unique_suggestions
            ):
                unique_suggestions.append(suggestion)

        return unique_suggestions[:5]  # Return top 5 suggestions

    async def search_with_context(
        self,
        query: str,
        context_files: list[Path] | None = None,
        limit: int = 10,
        similarity_threshold: float | None = None,
    ) -> dict[str, Any]:
        """Enhanced search with contextual analysis and suggestions.

        Args:
            query: Search query
            context_files: Optional list of files to provide context
            limit: Maximum number of results
            similarity_threshold: Minimum similarity score

        Returns:
            Dictionary with results, analysis, and suggestions
        """
        # Analyze the query
        query_analysis = self.analyze_query(query)

        # Perform the search
        results = await self.search(
            query=query,
            limit=limit,
            similarity_threshold=similarity_threshold,
            include_context=True,
        )

        # Get related query suggestions
        suggestions = self.suggest_related_queries(query, results)

        # Enhance results with additional context if context files provided
        if context_files:
            results = await self._enhance_with_file_context(results, context_files)

        # Calculate result quality metrics
        quality_metrics = self._calculate_result_quality(results, query)

        return {
            "query": query,
            "analysis": query_analysis,
            "results": results,
            "suggestions": suggestions,
            "metrics": quality_metrics,
            "total_results": len(results),
        }

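    # Usage sketch (editor's addition, hypothetical call): this wrapper
    # bundles the pieces above into one report.
    #
    #     report = await engine.search_with_context("retry logic", limit=5)
    #     report["analysis"]["query_type"]  # e.g. "general"
    #     report["metrics"]["coverage"]     # fraction of query words covered
    #     report["suggestions"]             # up to five related queries
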
    async def _enhance_with_file_context(
        self, results: list[SearchResult], context_files: list[Path]
    ) -> list[SearchResult]:
        """Enhance results by considering context from specific files.

        Args:
            results: Original search results
            context_files: Files to use for context

        Returns:
            Enhanced search results
        """
        # Read context from files using async I/O
        context_content = []
        for file_path in context_files:
            try:
                async with aiofiles.open(file_path, encoding="utf-8") as f:
                    content = await f.read()
                context_content.append(content)
            except Exception as e:
                logger.warning(f"Failed to read context file {file_path}: {e}")

        if not context_content:
            return results

        # Boost results that are related to context files
        context_text = " ".join(context_content).lower()

        for result in results:
            # Check if result is from one of the context files
            if result.file_path in context_files:
                result.similarity_score = min(1.0, result.similarity_score + 0.1)

            # Check if the result's identifiers appear in the context
            if result.function_name:
                func_name_lower = result.function_name.lower()
                if func_name_lower in context_text:
                    result.similarity_score = min(1.0, result.similarity_score + 0.05)

            if result.class_name:
                class_name_lower = result.class_name.lower()
                if class_name_lower in context_text:
                    result.similarity_score = min(1.0, result.similarity_score + 0.05)

        # Re-sort by updated scores
        results.sort(key=lambda r: r.similarity_score, reverse=True)

        # Update ranks
        for i, result in enumerate(results):
            result.rank = i + 1

        return results

    def _calculate_result_quality(
        self, results: list[SearchResult], query: str
    ) -> dict[str, Any]:
        """Calculate quality metrics for search results.

        Args:
            results: Search results
            query: Original query

        Returns:
            Dictionary with quality metrics
        """
        if not results:
            return {
                "average_score": 0.0,
                "score_distribution": {},
                "diversity": 0.0,
                "coverage": 0.0,
            }

        # Calculate average similarity score
        scores = [r.similarity_score for r in results]
        avg_score = sum(scores) / len(scores)

        # Score distribution
        high_quality = sum(1 for s in scores if s >= 0.8)
        medium_quality = sum(1 for s in scores if 0.6 <= s < 0.8)
        low_quality = sum(1 for s in scores if s < 0.6)

        # Diversity (unique files)
        unique_files = len({r.file_path for r in results})
        diversity = unique_files / len(results) if results else 0.0

        # Coverage (how many query words are covered)
        query_words = set(query.lower().split())
        covered_words = set()
        for result in results:
            content_words = set(result.content.lower().split())
            covered_words.update(query_words.intersection(content_words))

        coverage = len(covered_words) / len(query_words) if query_words else 0.0

        return {
            "average_score": round(avg_score, 3),
            "score_distribution": {
                "high_quality": high_quality,
                "medium_quality": medium_quality,
                "low_quality": low_quality,
            },
            "diversity": round(diversity, 3),
            "coverage": round(coverage, 3),
        }

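    # Worked example (editor's addition, hypothetical numbers): five results
    # scoring [0.9, 0.8, 0.7, 0.5, 0.5] across four distinct files yield
    # average_score 0.68, score_distribution {high: 2, medium: 1, low: 2},
    # and diversity 4/5 = 0.8.
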
    def _extract_function_content(self, content: str, function_name: str) -> str | None:
        """Extract content of a specific function from code.

        Args:
            content: Full file content
            function_name: Name of function to extract

        Returns:
            Function content if found, None otherwise
        """
        # Simple regex-based extraction (could be improved with AST)
        pattern = rf"^\s*def\s+{re.escape(function_name)}\s*\("
        lines = content.splitlines()

        for i, line in enumerate(lines):
            if re.match(pattern, line):
                # Found function start, now find the end
                start_line = i
                indent_level = len(line) - len(line.lstrip())

                # Find end of function
                end_line = len(lines)
                for j in range(i + 1, len(lines)):
                    if lines[j].strip():  # Skip empty lines
                        current_indent = len(lines[j]) - len(lines[j].lstrip())
                        if current_indent <= indent_level:
                            end_line = j
                            break

                return "\n".join(lines[start_line:end_line])

        return None

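    # Editor's note (limitation implied by the pattern above, not stated in
    # the original): only synchronous `def` lines match, so an `async def`
    # target returns None and search_similar() falls back to the whole file;
    # the AST approach mentioned in the docstring would handle both.
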
    async def get_search_stats(self) -> dict[str, Any]:
        """Get search engine statistics.

        Returns:
            Dictionary with search statistics
        """
        try:
            db_stats = await self.database.get_stats()

            return {
                "total_chunks": db_stats.total_chunks,
                "languages": db_stats.languages,
                "similarity_threshold": self.similarity_threshold,
                "project_root": str(self.project_root),
            }

        except Exception as e:
            logger.error(f"Failed to get search stats: {e}")
            return {"error": str(e)}

    def clear_cache(self) -> None:
        """Clear the file read cache."""
        self._file_cache.clear()
        self._cache_hits = 0
        self._cache_misses = 0
        logger.debug("File read cache cleared")

    def get_cache_info(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dictionary with cache statistics including hits, misses, size, and hit rate
        """
        total_requests = self._cache_hits + self._cache_misses
        hit_rate = self._cache_hits / total_requests if total_requests > 0 else 0.0

        return {
            "hits": self._cache_hits,
            "misses": self._cache_misses,
            "size": len(self._file_cache),
            "maxsize": self._cache_maxsize,
            "hit_rate": f"{hit_rate:.2%}",
        }
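
A minimal end-to-end sketch (editor's addition; everything except SemanticSearchEngine's own API is an assumption about this package's wiring - the concrete VectorDatabase is constructed elsewhere, see mcp_vector_search/core/factory.py in the file list above):

    import asyncio
    from pathlib import Path

    from mcp_vector_search.core.search import SemanticSearchEngine

    async def main() -> None:
        db = ...  # a concrete VectorDatabase instance (package-specific setup)
        engine = SemanticSearchEngine(database=db, project_root=Path("."))
        results = await engine.search("file read cache", limit=3)
        for r in results:
            print(r.rank, r.file_path, round(r.similarity_score, 2))
        print(engine.get_cache_info())

    asyncio.run(main())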