mcp-vector-search 0.15.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mcp-vector-search has been flagged as potentially problematic.

Files changed (86)
  1. mcp_vector_search/__init__.py +10 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/auto_index.py +397 -0
  5. mcp_vector_search/cli/commands/chat.py +534 -0
  6. mcp_vector_search/cli/commands/config.py +393 -0
  7. mcp_vector_search/cli/commands/demo.py +358 -0
  8. mcp_vector_search/cli/commands/index.py +762 -0
  9. mcp_vector_search/cli/commands/init.py +658 -0
  10. mcp_vector_search/cli/commands/install.py +869 -0
  11. mcp_vector_search/cli/commands/install_old.py +700 -0
  12. mcp_vector_search/cli/commands/mcp.py +1254 -0
  13. mcp_vector_search/cli/commands/reset.py +393 -0
  14. mcp_vector_search/cli/commands/search.py +796 -0
  15. mcp_vector_search/cli/commands/setup.py +1133 -0
  16. mcp_vector_search/cli/commands/status.py +584 -0
  17. mcp_vector_search/cli/commands/uninstall.py +404 -0
  18. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  19. mcp_vector_search/cli/commands/visualize/cli.py +265 -0
  20. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  21. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  22. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
  23. mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
  24. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  25. mcp_vector_search/cli/commands/visualize/server.py +201 -0
  26. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  27. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  28. mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
  29. mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
  30. mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
  31. mcp_vector_search/cli/commands/visualize.py.original +2536 -0
  32. mcp_vector_search/cli/commands/watch.py +287 -0
  33. mcp_vector_search/cli/didyoumean.py +520 -0
  34. mcp_vector_search/cli/export.py +320 -0
  35. mcp_vector_search/cli/history.py +295 -0
  36. mcp_vector_search/cli/interactive.py +342 -0
  37. mcp_vector_search/cli/main.py +484 -0
  38. mcp_vector_search/cli/output.py +414 -0
  39. mcp_vector_search/cli/suggestions.py +375 -0
  40. mcp_vector_search/config/__init__.py +1 -0
  41. mcp_vector_search/config/constants.py +24 -0
  42. mcp_vector_search/config/defaults.py +200 -0
  43. mcp_vector_search/config/settings.py +146 -0
  44. mcp_vector_search/core/__init__.py +1 -0
  45. mcp_vector_search/core/auto_indexer.py +298 -0
  46. mcp_vector_search/core/config_utils.py +394 -0
  47. mcp_vector_search/core/connection_pool.py +360 -0
  48. mcp_vector_search/core/database.py +1237 -0
  49. mcp_vector_search/core/directory_index.py +318 -0
  50. mcp_vector_search/core/embeddings.py +294 -0
  51. mcp_vector_search/core/exceptions.py +89 -0
  52. mcp_vector_search/core/factory.py +318 -0
  53. mcp_vector_search/core/git_hooks.py +345 -0
  54. mcp_vector_search/core/indexer.py +1002 -0
  55. mcp_vector_search/core/llm_client.py +453 -0
  56. mcp_vector_search/core/models.py +294 -0
  57. mcp_vector_search/core/project.py +350 -0
  58. mcp_vector_search/core/scheduler.py +330 -0
  59. mcp_vector_search/core/search.py +952 -0
  60. mcp_vector_search/core/watcher.py +322 -0
  61. mcp_vector_search/mcp/__init__.py +5 -0
  62. mcp_vector_search/mcp/__main__.py +25 -0
  63. mcp_vector_search/mcp/server.py +752 -0
  64. mcp_vector_search/parsers/__init__.py +8 -0
  65. mcp_vector_search/parsers/base.py +296 -0
  66. mcp_vector_search/parsers/dart.py +605 -0
  67. mcp_vector_search/parsers/html.py +413 -0
  68. mcp_vector_search/parsers/javascript.py +643 -0
  69. mcp_vector_search/parsers/php.py +694 -0
  70. mcp_vector_search/parsers/python.py +502 -0
  71. mcp_vector_search/parsers/registry.py +223 -0
  72. mcp_vector_search/parsers/ruby.py +678 -0
  73. mcp_vector_search/parsers/text.py +186 -0
  74. mcp_vector_search/parsers/utils.py +265 -0
  75. mcp_vector_search/py.typed +1 -0
  76. mcp_vector_search/utils/__init__.py +42 -0
  77. mcp_vector_search/utils/gitignore.py +250 -0
  78. mcp_vector_search/utils/gitignore_updater.py +212 -0
  79. mcp_vector_search/utils/monorepo.py +339 -0
  80. mcp_vector_search/utils/timing.py +338 -0
  81. mcp_vector_search/utils/version.py +47 -0
  82. mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
  83. mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
  84. mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
  85. mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
  86. mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/core/search.py
@@ -0,0 +1,952 @@
"""Semantic search engine for MCP Vector Search."""

import re
import time
from collections import OrderedDict
from pathlib import Path
from typing import Any

import aiofiles
from loguru import logger

from ..config.constants import DEFAULT_CACHE_SIZE
from .auto_indexer import AutoIndexer, SearchTriggeredIndexer
from .database import VectorDatabase
from .exceptions import SearchError
from .models import SearchResult


class SemanticSearchEngine:
    """Semantic search engine for code search."""

    # Query expansion constants (class-level for performance)
    _QUERY_EXPANSIONS = {
        # Common abbreviations
        "auth": "authentication authorize login",
        "db": "database data storage",
        "api": "application programming interface endpoint",
        "ui": "user interface frontend view",
        "util": "utility helper function",
        "config": "configuration settings options",
        "async": "asynchronous await promise",
        "sync": "synchronous blocking",
        "func": "function method",
        "var": "variable",
        "param": "parameter argument",
        "init": "initialize setup create",
        "parse": "parsing parser analyze",
        "validate": "validation check verify",
        "handle": "handler process manage",
        "error": "exception failure bug",
        "test": "testing unittest spec",
        "mock": "mocking stub fake",
        "log": "logging logger debug",
        # Programming concepts
        "class": "class object type",
        "method": "method function procedure",
        "property": "property attribute field",
        "import": "import require include",
        "export": "export module public",
        "return": "return yield output",
        "loop": "loop iterate for while",
        "condition": "condition if else branch",
        "array": "array list collection",
        "string": "string text character",
        "number": "number integer float",
        "boolean": "boolean true false",
    }

    # Reranking boost constants (class-level for performance)
    _BOOST_EXACT_IDENTIFIER = 0.15
    _BOOST_PARTIAL_IDENTIFIER = 0.05
    _BOOST_FILE_NAME_EXACT = 0.08
    _BOOST_FILE_NAME_PARTIAL = 0.03
    _BOOST_FUNCTION_CHUNK = 0.05
    _BOOST_CLASS_CHUNK = 0.03
    _BOOST_SOURCE_FILE = 0.02
    _BOOST_SHALLOW_PATH = 0.02
    _PENALTY_TEST_FILE = -0.02
    _PENALTY_DEEP_PATH = -0.01

    def __init__(
        self,
        database: VectorDatabase,
        project_root: Path,
        similarity_threshold: float = 0.3,
        auto_indexer: AutoIndexer | None = None,
        enable_auto_reindex: bool = True,
    ) -> None:
        """Initialize semantic search engine.

        Args:
            database: Vector database instance
            project_root: Project root directory
            similarity_threshold: Default similarity threshold
            auto_indexer: Optional auto-indexer for semi-automatic reindexing
            enable_auto_reindex: Whether to enable automatic reindexing
        """
        self.database = database
        self.project_root = project_root
        self.similarity_threshold = similarity_threshold
        self.auto_indexer = auto_indexer
        self.enable_auto_reindex = enable_auto_reindex

        # Initialize search-triggered indexer if auto-indexer is provided
        self.search_triggered_indexer = None
        if auto_indexer and enable_auto_reindex:
            self.search_triggered_indexer = SearchTriggeredIndexer(auto_indexer)

        # File content cache for performance (proper LRU with OrderedDict)
        self._file_cache: OrderedDict[Path, list[str]] = OrderedDict()
        self._cache_maxsize = DEFAULT_CACHE_SIZE
        self._cache_hits = 0
        self._cache_misses = 0

        # Health check throttling (only check every 60 seconds)
        self._last_health_check: float = 0.0
        self._health_check_interval: float = 60.0

    async def search(
        self,
        query: str,
        limit: int = 10,
        filters: dict[str, Any] | None = None,
        similarity_threshold: float | None = None,
        include_context: bool = True,
    ) -> list[SearchResult]:
        """Perform semantic search for code.

        Args:
            query: Search query
            limit: Maximum number of results
            filters: Optional filters (language, file_path, etc.)
            similarity_threshold: Minimum similarity score
            include_context: Whether to include context lines

        Returns:
            List of search results
        """
        if not query.strip():
            return []

        # Throttled health check before search (only every 60 seconds)
        current_time = time.time()
        if current_time - self._last_health_check >= self._health_check_interval:
            try:
                if hasattr(self.database, "health_check"):
                    is_healthy = await self.database.health_check()
                    if not is_healthy:
                        logger.warning(
                            "Database health check failed - attempting recovery"
                        )
                        # Health check already attempts recovery, so we can proceed
                self._last_health_check = current_time
            except Exception as e:
                logger.warning(f"Health check failed: {e}")
                self._last_health_check = current_time

        # Auto-reindex check before search
        if self.search_triggered_indexer:
            try:
                await self.search_triggered_indexer.pre_search_hook()
            except Exception as e:
                logger.warning(f"Auto-reindex check failed: {e}")

        threshold = (
            similarity_threshold
            if similarity_threshold is not None
            else self._get_adaptive_threshold(query)
        )

        try:
            # Preprocess query
            processed_query = self._preprocess_query(query)

            # Perform vector search
            results = await self.database.search(
                query=processed_query,
                limit=limit,
                filters=filters,
                similarity_threshold=threshold,
            )

            # Post-process results
            enhanced_results = []
            for result in results:
                enhanced_result = await self._enhance_result(result, include_context)
                enhanced_results.append(enhanced_result)

            # Apply additional ranking if needed
            ranked_results = self._rerank_results(enhanced_results, query)

            logger.debug(
                f"Search for '{query}' with threshold {threshold:.3f} returned {len(ranked_results)} results"
            )
            return ranked_results

        except Exception as e:
            error_msg = str(e).lower()
            # Check for corruption indicators
            if any(
                indicator in error_msg
                for indicator in [
                    "pickle",
                    "unpickling",
                    "eof",
                    "ran out of input",
                    "hnsw",
                    "index",
                    "deserialize",
                    "corrupt",
                ]
            ):
                logger.error(f"Index corruption detected during search: {e}")
                logger.info(
                    "The index appears to be corrupted. Please run 'mcp-vector-search reset' to clear the index and then 'mcp-vector-search index' to rebuild it."
                )
                raise SearchError(
                    "Index corruption detected. Please run 'mcp-vector-search reset' followed by 'mcp-vector-search index' to rebuild."
                ) from e
            else:
                logger.error(f"Search failed for query '{query}': {e}")
                raise SearchError(f"Search failed: {e}") from e

    async def search_similar(
        self,
        file_path: Path,
        function_name: str | None = None,
        limit: int = 10,
        similarity_threshold: float | None = None,
    ) -> list[SearchResult]:
        """Find code similar to a specific function or file.

        Args:
            file_path: Path to the reference file
            function_name: Specific function name (optional)
            limit: Maximum number of results
            similarity_threshold: Minimum similarity score

        Returns:
            List of similar code results
        """
        try:
            # Read the reference file using async I/O
            async with aiofiles.open(file_path, encoding="utf-8") as f:
                content = await f.read()

            # If function name is specified, try to extract just that function
            if function_name:
                function_content = self._extract_function_content(
                    content, function_name
                )
                if function_content:
                    content = function_content

            # Use the content as the search query
            return await self.search(
                query=content,
                limit=limit,
                similarity_threshold=similarity_threshold,
                include_context=True,
            )

        except Exception as e:
            logger.error(f"Similar search failed for {file_path}: {e}")
            raise SearchError(f"Similar search failed: {e}") from e

    async def search_by_context(
        self,
        context_description: str,
        focus_areas: list[str] | None = None,
        limit: int = 10,
    ) -> list[SearchResult]:
        """Search for code based on contextual description.

        Args:
            context_description: Description of what you're looking for
            focus_areas: Areas to focus on (e.g., ["security", "authentication"])
            limit: Maximum number of results

        Returns:
            List of contextually relevant results
        """
        # Build enhanced query with focus areas
        query_parts = [context_description]

        if focus_areas:
            query_parts.extend(focus_areas)

        enhanced_query = " ".join(query_parts)

        return await self.search(
            query=enhanced_query,
            limit=limit,
            include_context=True,
        )

    def _preprocess_query(self, query: str) -> str:
        """Preprocess search query for better results.

        Args:
            query: Raw search query

        Returns:
            Processed query
        """
        # Remove extra whitespace
        query = re.sub(r"\s+", " ", query.strip())

        # Use class-level query expansions (no dict creation overhead)
        words = query.lower().split()
        expanded_words = []

        for word in words:
            # Add original word
            expanded_words.append(word)

            # Add expansions if available
            if word in self._QUERY_EXPANSIONS:
                expanded_words.extend(self._QUERY_EXPANSIONS[word].split())

        # Remove duplicates while preserving order
        seen = set()
        unique_words = []
        for word in expanded_words:
            if word not in seen:
                seen.add(word)
                unique_words.append(word)

        return " ".join(unique_words)

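    # Example (derived from the expansion table above): _preprocess_query("auth db")
    # returns "auth authentication authorize login db database data storage".
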
    def _get_adaptive_threshold(self, query: str) -> float:
        """Get adaptive similarity threshold based on query characteristics.

        Args:
            query: Search query

        Returns:
            Adaptive similarity threshold
        """
        base_threshold = self.similarity_threshold
        query_lower = query.lower()
        words = query.split()

        # Adjust threshold based on query characteristics

        # 1. Single word queries - lower threshold for broader results
        if len(words) == 1:
            return max(0.01, base_threshold - 0.29)

        # 2. Very specific technical terms - lower threshold
        technical_terms = [
            "javascript",
            "typescript",
            "python",
            "java",
            "cpp",
            "rust",
            "go",
            "function",
            "class",
            "method",
            "variable",
            "import",
            "export",
            "async",
            "await",
            "promise",
            "callback",
            "api",
            "database",
            "parser",
            "compiler",
            "interpreter",
            "syntax",
            "semantic",
            "mcp",
            "gateway",
            "server",
            "client",
            "protocol",
        ]

        if any(term in query_lower for term in technical_terms):
            return max(0.01, base_threshold - 0.29)

        # 3. Short queries (2-3 words) - slightly lower threshold
        if len(words) <= 3:
            return max(0.1, base_threshold - 0.1)

        # 4. Long queries (>6 words) - higher threshold for precision
        if len(words) > 6:
            return min(0.8, base_threshold + 0.1)

        # 5. Queries with exact identifiers (CamelCase, snake_case)
        if re.search(r"\b[A-Z][a-zA-Z]*\b", query) or "_" in query:
            return max(0.05, base_threshold - 0.25)

        # 6. Common programming patterns
        if any(pattern in query for pattern in ["()", ".", "->", "=>", "::"]):
            return max(0.25, base_threshold - 0.1)

        return base_threshold

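    # Worked examples with the default base threshold of 0.3: a single-word
    # query gets max(0.01, 0.3 - 0.29) = 0.01; a non-technical two-word query
    # gets max(0.1, 0.3 - 0.1) = 0.2; a query longer than six words gets
    # min(0.8, 0.3 + 0.1) = 0.4.
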
    async def _read_file_lines_cached(self, file_path: Path) -> list[str]:
        """Read file lines with proper LRU caching for performance.

        Args:
            file_path: Path to the file

        Returns:
            List of file lines

        Raises:
            FileNotFoundError: If file doesn't exist
        """
        # Check cache - move to end if found (most recently used)
        if file_path in self._file_cache:
            self._cache_hits += 1
            # Move to end (most recently used)
            self._file_cache.move_to_end(file_path)
            return self._file_cache[file_path]

        self._cache_misses += 1

        # Read file asynchronously
        try:
            async with aiofiles.open(file_path, encoding="utf-8") as f:
                content = await f.read()
                lines = content.splitlines(keepends=True)

            # Proper LRU: if cache is full, remove least recently used (first item)
            if len(self._file_cache) >= self._cache_maxsize:
                # Remove least recently used entry (first item in OrderedDict)
                self._file_cache.popitem(last=False)

            # Add to cache (will be at end, most recently used)
            self._file_cache[file_path] = lines
            return lines

        except FileNotFoundError:
            # Cache the miss to avoid repeated failed attempts
            if len(self._file_cache) >= self._cache_maxsize:
                self._file_cache.popitem(last=False)
            self._file_cache[file_path] = []
            raise

    async def _enhance_result(
        self, result: SearchResult, include_context: bool
    ) -> SearchResult:
        """Enhance search result with additional information.

        Args:
            result: Original search result
            include_context: Whether to include context lines

        Returns:
            Enhanced search result
        """
        if not include_context:
            return result

        try:
            # Read the source file using cached method
            lines = await self._read_file_lines_cached(result.file_path)

            if not lines:  # File not found or empty
                return result

            # Get context lines before and after
            context_size = 3
            start_idx = max(0, result.start_line - 1 - context_size)
            end_idx = min(len(lines), result.end_line + context_size)

            context_before = [
                line.rstrip() for line in lines[start_idx : result.start_line - 1]
            ]
            context_after = [line.rstrip() for line in lines[result.end_line : end_idx]]

            # Update result with context
            result.context_before = context_before
            result.context_after = context_after

        except Exception as e:
            logger.warning(f"Failed to get context for {result.file_path}: {e}")

        return result

    def _rerank_results(
        self, results: list[SearchResult], query: str
    ) -> list[SearchResult]:
        """Apply advanced ranking to search results using multiple factors.

        Args:
            results: Original search results
            query: Original search query

        Returns:
            Reranked search results
        """
        if not results:
            return results

        # Pre-compute lowercased strings once (avoid repeated .lower() calls)
        query_lower = query.lower()
        query_words = set(query_lower.split())

        # Pre-compute file extensions for source files
        source_exts = frozenset(
            [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs"]
        )

        for result in results:
            # Start with base similarity score
            score = result.similarity_score

            # Factor 1: Exact matches in identifiers (high boost)
            if result.function_name:
                func_name_lower = result.function_name.lower()
                if query_lower in func_name_lower:
                    score += self._BOOST_EXACT_IDENTIFIER
                # Partial word matches
                score += sum(
                    self._BOOST_PARTIAL_IDENTIFIER
                    for word in query_words
                    if word in func_name_lower
                )

            if result.class_name:
                class_name_lower = result.class_name.lower()
                if query_lower in class_name_lower:
                    score += self._BOOST_EXACT_IDENTIFIER
                # Partial word matches
                score += sum(
                    self._BOOST_PARTIAL_IDENTIFIER
                    for word in query_words
                    if word in class_name_lower
                )

            # Factor 2: File name relevance
            file_name_lower = result.file_path.name.lower()
            if query_lower in file_name_lower:
                score += self._BOOST_FILE_NAME_EXACT
            score += sum(
                self._BOOST_FILE_NAME_PARTIAL
                for word in query_words
                if word in file_name_lower
            )

            # Factor 3: Content density (how many query words appear)
            content_lower = result.content.lower()
            word_matches = sum(1 for word in query_words if word in content_lower)
            if word_matches > 0:
                score += (word_matches / len(query_words)) * 0.1

            # Factor 4: Code structure preferences (combined conditions)
            if result.chunk_type == "function":
                score += self._BOOST_FUNCTION_CHUNK
            elif result.chunk_type == "class":
                score += self._BOOST_CLASS_CHUNK

            # Factor 5: File type preferences (prefer source files over tests)
            file_ext = result.file_path.suffix.lower()
            if file_ext in source_exts:
                score += self._BOOST_SOURCE_FILE
            if "test" in file_name_lower:  # Already computed
                score += self._PENALTY_TEST_FILE

            # Factor 6: Path depth preference
            path_depth = len(result.file_path.parts)
            if path_depth <= 3:
                score += self._BOOST_SHALLOW_PATH
            elif path_depth > 5:
                score += self._PENALTY_DEEP_PATH

            # Ensure score doesn't exceed 1.0
            result.similarity_score = min(1.0, score)

        # Sort by enhanced similarity score
        results.sort(key=lambda r: r.similarity_score, reverse=True)

        # Update ranks
        for i, result in enumerate(results):
            result.rank = i + 1

        return results

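    # Illustrative example: for the query "parser", a result whose
    # function_name is "make_parser" gains _BOOST_EXACT_IDENTIFIER (0.15) for
    # the substring match plus one _BOOST_PARTIAL_IDENTIFIER (0.05) word match
    # before the file-name, density, structure, and path factors apply.
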
    def analyze_query(self, query: str) -> dict[str, Any]:
        """Analyze search query and provide suggestions for improvement.

        Args:
            query: Search query to analyze

        Returns:
            Dictionary with analysis results and suggestions
        """
        analysis = {
            "original_query": query,
            "processed_query": self._preprocess_query(query),
            "query_type": "general",
            "suggestions": [],
            "confidence": "medium",
        }

        query_lower = query.lower()

        # Detect query type
        if any(word in query_lower for word in ["function", "method", "def", "func"]):
            analysis["query_type"] = "function_search"
            analysis["suggestions"].append(
                "Try searching for specific function names or patterns"
            )
        elif any(word in query_lower for word in ["class", "object", "type"]):
            analysis["query_type"] = "class_search"
            analysis["suggestions"].append(
                "Include class inheritance or interface information"
            )
        elif any(word in query_lower for word in ["error", "exception", "bug", "fix"]):
            analysis["query_type"] = "error_handling"
            analysis["suggestions"].append("Include error types or exception names")
        elif any(word in query_lower for word in ["test", "spec", "mock"]):
            analysis["query_type"] = "testing"
            analysis["suggestions"].append("Specify test framework or testing patterns")
        elif any(word in query_lower for word in ["config", "setting", "option"]):
            analysis["query_type"] = "configuration"
            analysis["suggestions"].append(
                "Include configuration file types or setting names"
            )

        # Analyze query complexity
        words = query.split()
        if len(words) == 1:
            analysis["confidence"] = "low"
            analysis["suggestions"].append(
                "Try adding more descriptive words for better results"
            )
        elif len(words) > 10:
            analysis["confidence"] = "low"
            analysis["suggestions"].append(
                "Consider simplifying your query for better matching"
            )
        else:
            analysis["confidence"] = "high"

        # Check for common programming patterns
        if re.search(r"\b\w+\(\)", query):
            analysis["suggestions"].append(
                "Function call detected - searching for function definitions"
            )
        if re.search(r"\b[A-Z][a-zA-Z]*\b", query):
            analysis["suggestions"].append(
                "CamelCase detected - searching for class or type names"
            )
        if re.search(r"\b\w+\.\w+", query):
            analysis["suggestions"].append(
                "Dot notation detected - searching for method calls or properties"
            )

        return analysis

    def suggest_related_queries(
        self, query: str, results: list[SearchResult]
    ) -> list[str]:
        """Suggest related queries based on search results.

        Args:
            query: Original search query
            results: Search results

        Returns:
            List of suggested related queries
        """
        suggestions = []

        if not results:
            # No results - suggest broader queries
            words = query.lower().split()
            if len(words) > 1:
                # Try individual words
                suggestions.extend(words[:3])  # Top 3 words

            # Suggest common related terms
            related_terms = {
                "auth": ["login", "user", "session", "token"],
                "database": ["query", "model", "schema", "connection"],
                "api": ["endpoint", "request", "response", "handler"],
                "test": ["mock", "assert", "spec", "unit"],
                "error": ["exception", "handle", "catch", "debug"],
            }

            for word in words:
                if word in related_terms:
                    suggestions.extend(related_terms[word][:2])
        else:
            # Extract common patterns from results
            function_names = [r.function_name for r in results if r.function_name]
            class_names = [r.class_name for r in results if r.class_name]

            # Suggest function names
            if function_names:
                unique_functions = list(set(function_names))[:3]
                suggestions.extend(unique_functions)

            # Suggest class names
            if class_names:
                unique_classes = list(set(class_names))[:3]
                suggestions.extend(unique_classes)

            # Suggest file-based queries
            file_patterns = set()
            for result in results[:5]:  # Top 5 results
                file_name = result.file_path.stem
                if "_" in file_name:
                    file_patterns.update(file_name.split("_"))
                elif file_name not in suggestions:
                    file_patterns.add(file_name)

            suggestions.extend(list(file_patterns)[:3])

        # Remove duplicates and original query words
        query_words = set(query.lower().split())
        unique_suggestions = []
        for suggestion in suggestions:
            if (
                suggestion
                and suggestion.lower() not in query_words
                and suggestion not in unique_suggestions
            ):
                unique_suggestions.append(suggestion)

        return unique_suggestions[:5]  # Return top 5 suggestions

    async def search_with_context(
        self,
        query: str,
        context_files: list[Path] | None = None,
        limit: int = 10,
        similarity_threshold: float | None = None,
    ) -> dict[str, Any]:
        """Enhanced search with contextual analysis and suggestions.

        Args:
            query: Search query
            context_files: Optional list of files to provide context
            limit: Maximum number of results
            similarity_threshold: Minimum similarity score

        Returns:
            Dictionary with results, analysis, and suggestions
        """
        # Analyze the query
        query_analysis = self.analyze_query(query)

        # Perform the search
        results = await self.search(
            query=query,
            limit=limit,
            similarity_threshold=similarity_threshold,
            include_context=True,
        )

        # Get related query suggestions
        suggestions = self.suggest_related_queries(query, results)

        # Enhance results with additional context if context files provided
        if context_files:
            results = await self._enhance_with_file_context(results, context_files)

        # Calculate result quality metrics
        quality_metrics = self._calculate_result_quality(results, query)

        return {
            "query": query,
            "analysis": query_analysis,
            "results": results,
            "suggestions": suggestions,
            "metrics": quality_metrics,
            "total_results": len(results),
        }

    async def _enhance_with_file_context(
        self, results: list[SearchResult], context_files: list[Path]
    ) -> list[SearchResult]:
        """Enhance results by considering context from specific files.

        Args:
            results: Original search results
            context_files: Files to use for context

        Returns:
            Enhanced search results
        """
        # Read context from files using async I/O
        context_content = []
        for file_path in context_files:
            try:
                async with aiofiles.open(file_path, encoding="utf-8") as f:
                    content = await f.read()
                    context_content.append(content)
            except Exception as e:
                logger.warning(f"Failed to read context file {file_path}: {e}")

        if not context_content:
            return results

        # Boost results that are related to context files
        context_text = " ".join(context_content).lower()

        for result in results:
            # Check if result is from one of the context files
            if result.file_path in context_files:
                result.similarity_score = min(1.0, result.similarity_score + 0.1)

            # Check if result content relates to context
            if result.function_name:
                func_name_lower = result.function_name.lower()
                if func_name_lower in context_text:
                    result.similarity_score = min(1.0, result.similarity_score + 0.05)

            if result.class_name:
                class_name_lower = result.class_name.lower()
                if class_name_lower in context_text:
                    result.similarity_score = min(1.0, result.similarity_score + 0.05)

        # Re-sort by updated scores
        results.sort(key=lambda r: r.similarity_score, reverse=True)

        # Update ranks
        for i, result in enumerate(results):
            result.rank = i + 1

        return results

    def _calculate_result_quality(
        self, results: list[SearchResult], query: str
    ) -> dict[str, Any]:
        """Calculate quality metrics for search results.

        Args:
            results: Search results
            query: Original query

        Returns:
            Dictionary with quality metrics
        """
        if not results:
            return {
                "average_score": 0.0,
                "score_distribution": {},
                "diversity": 0.0,
                "coverage": 0.0,
            }

        # Calculate average similarity score
        scores = [r.similarity_score for r in results]
        avg_score = sum(scores) / len(scores)

        # Score distribution
        high_quality = sum(1 for s in scores if s >= 0.8)
        medium_quality = sum(1 for s in scores if 0.6 <= s < 0.8)
        low_quality = sum(1 for s in scores if s < 0.6)

        # Diversity (unique files)
        unique_files = len({r.file_path for r in results})
        diversity = unique_files / len(results) if results else 0.0

        # Coverage (how many query words are covered)
        query_words = set(query.lower().split())
        covered_words = set()
        for result in results:
            content_words = set(result.content.lower().split())
            covered_words.update(query_words.intersection(content_words))

        coverage = len(covered_words) / len(query_words) if query_words else 0.0

        return {
            "average_score": round(avg_score, 3),
            "score_distribution": {
                "high_quality": high_quality,
                "medium_quality": medium_quality,
                "low_quality": low_quality,
            },
            "diversity": round(diversity, 3),
            "coverage": round(coverage, 3),
        }

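    # Worked example: five results spread across four unique files give
    # diversity = 4 / 5 = 0.8; if the result contents cover two words of a
    # three-word query, coverage = 2 / 3 ≈ 0.667.
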
    def _extract_function_content(self, content: str, function_name: str) -> str | None:
        """Extract content of a specific function from code.

        Args:
            content: Full file content
            function_name: Name of function to extract

        Returns:
            Function content if found, None otherwise
        """
        # Simple regex-based extraction (could be improved with AST)
        pattern = rf"^\s*def\s+{re.escape(function_name)}\s*\("
        lines = content.splitlines()

        for i, line in enumerate(lines):
            if re.match(pattern, line):
                # Found function start, now find the end
                start_line = i
                indent_level = len(line) - len(line.lstrip())

                # Find end of function
                end_line = len(lines)
                for j in range(i + 1, len(lines)):
                    if lines[j].strip():  # Skip empty lines
                        current_indent = len(lines[j]) - len(lines[j].lstrip())
                        if current_indent <= indent_level:
                            end_line = j
                            break

                return "\n".join(lines[start_line:end_line])

        return None

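    # Note: the function is taken to end at the first non-empty line indented
    # at or below the "def" line, so this heuristic assumes Python-style
    # indentation and does not match "async def" definitions.
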
    async def get_search_stats(self) -> dict[str, Any]:
        """Get search engine statistics.

        Returns:
            Dictionary with search statistics
        """
        try:
            db_stats = await self.database.get_stats()

            return {
                "total_chunks": db_stats.total_chunks,
                "languages": db_stats.languages,
                "similarity_threshold": self.similarity_threshold,
                "project_root": str(self.project_root),
            }

        except Exception as e:
            logger.error(f"Failed to get search stats: {e}")
            return {"error": str(e)}

    def clear_cache(self) -> None:
        """Clear the file read cache."""
        self._file_cache.clear()
        self._cache_hits = 0
        self._cache_misses = 0
        logger.debug("File read cache cleared")

    def get_cache_info(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dictionary with cache statistics including hits, misses, size, and hit rate
        """
        total_requests = self._cache_hits + self._cache_misses
        hit_rate = self._cache_hits / total_requests if total_requests > 0 else 0.0

        return {
            "hits": self._cache_hits,
            "misses": self._cache_misses,
            "size": len(self._file_cache),
            "maxsize": self._cache_maxsize,
            "hit_rate": f"{hit_rate:.2%}",
        }