mcp-vector-search 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mcp-vector-search has been flagged as a potentially problematic release.

@@ -1,11 +1,15 @@
 """Semantic search engine for MCP Vector Search."""
 
 import re
+import time
+from collections import OrderedDict
 from pathlib import Path
 from typing import Any
 
+import aiofiles
 from loguru import logger
 
+from ..config.constants import DEFAULT_CACHE_SIZE
 from .auto_indexer import AutoIndexer, SearchTriggeredIndexer
 from .database import VectorDatabase
 from .exceptions import SearchError
@@ -15,6 +19,55 @@ from .models import SearchResult
 class SemanticSearchEngine:
     """Semantic search engine for code search."""
 
+    # Query expansion constants (class-level for performance)
+    _QUERY_EXPANSIONS = {
+        # Common abbreviations
+        "auth": "authentication authorize login",
+        "db": "database data storage",
+        "api": "application programming interface endpoint",
+        "ui": "user interface frontend view",
+        "util": "utility helper function",
+        "config": "configuration settings options",
+        "async": "asynchronous await promise",
+        "sync": "synchronous blocking",
+        "func": "function method",
+        "var": "variable",
+        "param": "parameter argument",
+        "init": "initialize setup create",
+        "parse": "parsing parser analyze",
+        "validate": "validation check verify",
+        "handle": "handler process manage",
+        "error": "exception failure bug",
+        "test": "testing unittest spec",
+        "mock": "mocking stub fake",
+        "log": "logging logger debug",
+        # Programming concepts
+        "class": "class object type",
+        "method": "method function procedure",
+        "property": "property attribute field",
+        "import": "import require include",
+        "export": "export module public",
+        "return": "return yield output",
+        "loop": "loop iterate for while",
+        "condition": "condition if else branch",
+        "array": "array list collection",
+        "string": "string text character",
+        "number": "number integer float",
+        "boolean": "boolean true false",
+    }
+
+    # Reranking boost constants (class-level for performance)
+    _BOOST_EXACT_IDENTIFIER = 0.15
+    _BOOST_PARTIAL_IDENTIFIER = 0.05
+    _BOOST_FILE_NAME_EXACT = 0.08
+    _BOOST_FILE_NAME_PARTIAL = 0.03
+    _BOOST_FUNCTION_CHUNK = 0.05
+    _BOOST_CLASS_CHUNK = 0.03
+    _BOOST_SOURCE_FILE = 0.02
+    _BOOST_SHALLOW_PATH = 0.02
+    _PENALTY_TEST_FILE = -0.02
+    _PENALTY_DEEP_PATH = -0.01
+
     def __init__(
         self,
         database: VectorDatabase,
@@ -43,6 +96,16 @@ class SemanticSearchEngine:
         if auto_indexer and enable_auto_reindex:
             self.search_triggered_indexer = SearchTriggeredIndexer(auto_indexer)
 
+        # File content cache for performance (proper LRU with OrderedDict)
+        self._file_cache: OrderedDict[Path, list[str]] = OrderedDict()
+        self._cache_maxsize = DEFAULT_CACHE_SIZE
+        self._cache_hits = 0
+        self._cache_misses = 0
+
+        # Health check throttling (only check every 60 seconds)
+        self._last_health_check: float = 0.0
+        self._health_check_interval: float = 60.0
+
     async def search(
         self,
         query: str,
@@ -66,15 +129,21 @@ class SemanticSearchEngine:
         if not query.strip():
             return []
 
-        # Health check before search
-        try:
-            if hasattr(self.database, "health_check"):
-                is_healthy = await self.database.health_check()
-                if not is_healthy:
-                    logger.warning("Database health check failed - attempting recovery")
-                    # Health check already attempts recovery, so we can proceed
-        except Exception as e:
-            logger.warning(f"Health check failed: {e}")
+        # Throttled health check before search (only every 60 seconds)
+        current_time = time.time()
+        if current_time - self._last_health_check >= self._health_check_interval:
+            try:
+                if hasattr(self.database, "health_check"):
+                    is_healthy = await self.database.health_check()
+                    if not is_healthy:
+                        logger.warning(
+                            "Database health check failed - attempting recovery"
+                        )
+                        # Health check already attempts recovery, so we can proceed
+                self._last_health_check = current_time
+            except Exception as e:
+                logger.warning(f"Health check failed: {e}")
+                self._last_health_check = current_time
 
         # Auto-reindex check before search
         if self.search_triggered_indexer:
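
The change above replaces a health check that ran on every query with one gated by a timestamp. The pattern in isolation, as a minimal sketch with a hypothetical check coroutine standing in for database.health_check():

import time

class ThrottledCheck:
    """Run an expensive check at most once per interval (illustrative sketch)."""

    def __init__(self, interval: float = 60.0) -> None:
        self._last = 0.0
        self._interval = interval

    async def maybe_run(self, check) -> None:
        now = time.time()
        if now - self._last < self._interval:
            return  # checked recently; skip the expensive call
        try:
            await check()
        finally:
            # Record the attempt whether it succeeded or raised,
            # mirroring how _last_health_check is updated in the diff above.
            self._last = now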
@@ -161,9 +230,9 @@ class SemanticSearchEngine:
             List of similar code results
         """
         try:
-            # Read the reference file
-            with open(file_path, encoding="utf-8") as f:
-                content = f.read()
+            # Read the reference file using async I/O
+            async with aiofiles.open(file_path, encoding="utf-8") as f:
+                content = await f.read()
 
             # If function name is specified, try to extract just that function
             if function_name:
@@ -227,48 +296,7 @@ class SemanticSearchEngine:
         # Remove extra whitespace
         query = re.sub(r"\s+", " ", query.strip())
 
-        # Expand common programming abbreviations and synonyms
-        expansions = {
-            "auth": "authentication authorize login",
-            "db": "database data storage",
-            "api": "application programming interface endpoint",
-            "ui": "user interface frontend view",
-            "util": "utility helper function",
-            "config": "configuration settings options",
-            "async": "asynchronous await promise",
-            "sync": "synchronous blocking",
-            "func": "function method",
-            "var": "variable",
-            "param": "parameter argument",
-            "init": "initialize setup create",
-            "parse": "parsing parser analyze",
-            "validate": "validation check verify",
-            "handle": "handler process manage",
-            "error": "exception failure bug",
-            "test": "testing unittest spec",
-            "mock": "mocking stub fake",
-            "log": "logging logger debug",
-        }
-
-        # Add programming language keywords and concepts
-        programming_concepts = {
-            "class": "class object type",
-            "method": "method function procedure",
-            "property": "property attribute field",
-            "import": "import require include",
-            "export": "export module public",
-            "return": "return yield output",
-            "loop": "loop iterate for while",
-            "condition": "condition if else branch",
-            "array": "array list collection",
-            "string": "string text character",
-            "number": "number integer float",
-            "boolean": "boolean true false",
-        }
-
-        # Merge all expansions
-        all_expansions = {**expansions, **programming_concepts}
-
+        # Use class-level query expansions (no dict creation overhead)
         words = query.lower().split()
         expanded_words = []
 
@@ -277,8 +305,8 @@ class SemanticSearchEngine:
             expanded_words.append(word)
 
             # Add expansions if available
-            if word in all_expansions:
-                expanded_words.extend(all_expansions[word].split())
+            if word in self._QUERY_EXPANSIONS:
+                expanded_words.extend(self._QUERY_EXPANSIONS[word].split())
 
         # Remove duplicates while preserving order
         seen = set()
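
Combined with the class-level table added earlier, the expansion loop behaves roughly like the standalone sketch below (a hypothetical helper with a truncated table; the real method normalizes whitespace first and deduplicates afterwards, as the surrounding context shows):

_QUERY_EXPANSIONS = {
    "auth": "authentication authorize login",
    "db": "database data storage",
}

def expand(query: str) -> str:
    expanded: list[str] = []
    for word in query.lower().split():
        expanded.append(word)
        if word in _QUERY_EXPANSIONS:
            expanded.extend(_QUERY_EXPANSIONS[word].split())
    # Deduplicate while preserving order, as the real method does
    seen: set[str] = set()
    return " ".join(w for w in expanded if not (w in seen or seen.add(w)))

print(expand("auth db error"))
# auth authentication authorize login db database data storage error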
@@ -363,6 +391,49 @@ class SemanticSearchEngine:
 
         return base_threshold
 
+    async def _read_file_lines_cached(self, file_path: Path) -> list[str]:
+        """Read file lines with proper LRU caching for performance.
+
+        Args:
+            file_path: Path to the file
+
+        Returns:
+            List of file lines
+
+        Raises:
+            FileNotFoundError: If file doesn't exist
+        """
+        # Check cache - move to end if found (most recently used)
+        if file_path in self._file_cache:
+            self._cache_hits += 1
+            # Move to end (most recently used)
+            self._file_cache.move_to_end(file_path)
+            return self._file_cache[file_path]
+
+        self._cache_misses += 1
+
+        # Read file asynchronously
+        try:
+            async with aiofiles.open(file_path, encoding="utf-8") as f:
+                content = await f.read()
+                lines = content.splitlines(keepends=True)
+
+            # Proper LRU: if cache is full, remove least recently used (first item)
+            if len(self._file_cache) >= self._cache_maxsize:
+                # Remove least recently used entry (first item in OrderedDict)
+                self._file_cache.popitem(last=False)
+
+            # Add to cache (will be at end, most recently used)
+            self._file_cache[file_path] = lines
+            return lines
+
+        except FileNotFoundError:
+            # Cache the miss to avoid repeated failed attempts
+            if len(self._file_cache) >= self._cache_maxsize:
+                self._file_cache.popitem(last=False)
+            self._file_cache[file_path] = []
+            raise
+
     async def _enhance_result(
         self, result: SearchResult, include_context: bool
     ) -> SearchResult:
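
The new _read_file_lines_cached method builds a bounded LRU cache out of two OrderedDict operations. A standalone illustration of the eviction behavior it implements, with toy keys and a cache capacity of 2:

from collections import OrderedDict

cache: OrderedDict[str, str] = OrderedDict()
maxsize = 2

for key in ["a.py", "b.py", "a.py", "c.py"]:
    if key in cache:
        cache.move_to_end(key)      # cache hit: mark as most recently used
        continue
    if len(cache) >= maxsize:
        cache.popitem(last=False)   # evict the least recently used entry (front)
    cache[key] = f"contents of {key}"

print(list(cache))  # ['a.py', 'c.py'] -- 'b.py' was evicted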
@@ -379,9 +450,11 @@ class SemanticSearchEngine:
             return result
 
         try:
-            # Read the source file to get context
-            with open(result.file_path, encoding="utf-8") as f:
-                lines = f.readlines()
+            # Read the source file using cached method
+            lines = await self._read_file_lines_cached(result.file_path)
+
+            if not lines:  # File not found or empty
+                return result
 
             # Get context lines before and after
             context_size = 3
@@ -417,9 +490,15 @@ class SemanticSearchEngine:
         if not results:
             return results
 
+        # Pre-compute lowercased strings once (avoid repeated .lower() calls)
         query_lower = query.lower()
         query_words = set(query_lower.split())
 
+        # Pre-compute file extensions for source files
+        source_exts = frozenset(
+            [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs"]
+        )
+
         for result in results:
             # Start with base similarity score
             score = result.similarity_score
@@ -428,56 +507,60 @@ class SemanticSearchEngine:
             if result.function_name:
                 func_name_lower = result.function_name.lower()
                 if query_lower in func_name_lower:
-                    score += 0.15  # Strong boost for function name match
+                    score += self._BOOST_EXACT_IDENTIFIER
                 # Partial word matches
-                for word in query_words:
-                    if word in func_name_lower:
-                        score += 0.05
+                score += sum(
+                    self._BOOST_PARTIAL_IDENTIFIER
+                    for word in query_words
+                    if word in func_name_lower
+                )
 
             if result.class_name:
                 class_name_lower = result.class_name.lower()
                 if query_lower in class_name_lower:
-                    score += 0.15  # Strong boost for class name match
+                    score += self._BOOST_EXACT_IDENTIFIER
                 # Partial word matches
-                for word in query_words:
-                    if word in class_name_lower:
-                        score += 0.05
+                score += sum(
+                    self._BOOST_PARTIAL_IDENTIFIER
+                    for word in query_words
+                    if word in class_name_lower
+                )
 
             # Factor 2: File name relevance
             file_name_lower = result.file_path.name.lower()
             if query_lower in file_name_lower:
-                score += 0.08
-            for word in query_words:
-                if word in file_name_lower:
-                    score += 0.03
+                score += self._BOOST_FILE_NAME_EXACT
+            score += sum(
+                self._BOOST_FILE_NAME_PARTIAL
+                for word in query_words
+                if word in file_name_lower
+            )
 
             # Factor 3: Content density (how many query words appear)
             content_lower = result.content.lower()
             word_matches = sum(1 for word in query_words if word in content_lower)
             if word_matches > 0:
-                density_boost = (word_matches / len(query_words)) * 0.1
-                score += density_boost
+                score += (word_matches / len(query_words)) * 0.1
 
-            # Factor 4: Code structure preferences
-            # Boost functions over general code blocks
+            # Factor 4: Code structure preferences (combined conditions)
             if result.chunk_type == "function":
-                score += 0.05
+                score += self._BOOST_FUNCTION_CHUNK
             elif result.chunk_type == "class":
-                score += 0.03
+                score += self._BOOST_CLASS_CHUNK
 
-            # Factor 5: File type preferences (prefer source files over tests/docs)
+            # Factor 5: File type preferences (prefer source files over tests)
             file_ext = result.file_path.suffix.lower()
-            if file_ext in [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs"]:
-                score += 0.02
-            elif "test" in result.file_path.name.lower():
-                score -= 0.02  # Slightly penalize test files unless specifically searching for tests
+            if file_ext in source_exts:
+                score += self._BOOST_SOURCE_FILE
+            if "test" in file_name_lower:  # Already computed
+                score += self._PENALTY_TEST_FILE
 
-            # Factor 6: Recency bias (prefer shorter file paths - often more core files)
+            # Factor 6: Path depth preference
             path_depth = len(result.file_path.parts)
             if path_depth <= 3:
-                score += 0.02
+                score += self._BOOST_SHALLOW_PATH
             elif path_depth > 5:
-                score -= 0.01
+                score += self._PENALTY_DEEP_PATH
 
             # Ensure score doesn't exceed 1.0
             result.similarity_score = min(1.0, score)
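
With named constants in place, the reranking arithmetic is easy to trace by hand. A worked illustration with hypothetical values: a single-word query "tokenize" hitting a function chunk named tokenize_text in src/core/tokenize.py, base similarity 0.60.

score = 0.60
score += 0.15  # _BOOST_EXACT_IDENTIFIER: "tokenize" in "tokenize_text"
score += 0.05  # _BOOST_PARTIAL_IDENTIFIER: the single query word is in the name
score += 0.08  # _BOOST_FILE_NAME_EXACT: "tokenize" in "tokenize.py"
score += 0.03  # _BOOST_FILE_NAME_PARTIAL: the single query word is in the file name
score += 0.10  # content density: 1/1 query words appear in the chunk body
score += 0.05  # _BOOST_FUNCTION_CHUNK
score += 0.02  # _BOOST_SOURCE_FILE: ".py" extension
score += 0.02  # _BOOST_SHALLOW_PATH: three path components
print(min(1.0, score))  # 1.0 -- the raw total of 1.10 is capped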
@@ -696,12 +779,12 @@ class SemanticSearchEngine:
         Returns:
             Enhanced search results
         """
-        # Read context from files
+        # Read context from files using async I/O
         context_content = []
         for file_path in context_files:
             try:
-                with open(file_path, encoding="utf-8") as f:
-                    content = f.read()
+                async with aiofiles.open(file_path, encoding="utf-8") as f:
+                    content = await f.read()
                 context_content.append(content)
             except Exception as e:
                 logger.warning(f"Failed to read context file {file_path}: {e}")
@@ -843,3 +926,27 @@ class SemanticSearchEngine:
         except Exception as e:
             logger.error(f"Failed to get search stats: {e}")
             return {"error": str(e)}
+
+    def clear_cache(self) -> None:
+        """Clear the file read cache."""
+        self._file_cache.clear()
+        self._cache_hits = 0
+        self._cache_misses = 0
+        logger.debug("File read cache cleared")
+
+    def get_cache_info(self) -> dict[str, Any]:
+        """Get cache statistics.
+
+        Returns:
+            Dictionary with cache statistics including hits, misses, size, and hit rate
+        """
+        total_requests = self._cache_hits + self._cache_misses
+        hit_rate = self._cache_hits / total_requests if total_requests > 0 else 0.0
+
+        return {
+            "hits": self._cache_hits,
+            "misses": self._cache_misses,
+            "size": len(self._file_cache),
+            "maxsize": self._cache_maxsize,
+            "hit_rate": f"{hit_rate:.2%}",
+        }
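
A brief usage sketch for the new cache-management helpers (assumes an already constructed engine instance named engine):

info = engine.get_cache_info()
print(f"{info['size']}/{info['maxsize']} files cached, hit rate {info['hit_rate']}")

engine.clear_cache()  # drop cached file contents and reset the counters
assert engine.get_cache_info()["hits"] == 0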
@@ -3,7 +3,9 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 
+from ..config.constants import DEFAULT_CHUNK_SIZE
 from ..core.models import CodeChunk
+from . import utils
 
 
 class BaseParser(ABC):
@@ -109,7 +111,7 @@ class BaseParser(ABC):
         Returns:
             List of lines
         """
-        return content.splitlines(keepends=True)
+        return utils.split_into_lines(content)
 
     def _get_line_range(self, lines: list[str], start_line: int, end_line: int) -> str:
         """Extract a range of lines from content.
@@ -122,11 +124,7 @@ class BaseParser(ABC):
         Returns:
             Content for the specified line range
         """
-        # Convert to 0-based indexing
-        start_idx = max(0, start_line - 1)
-        end_idx = min(len(lines), end_line)
-
-        return "".join(lines[start_idx:end_idx])
+        return utils.get_line_range(lines, start_line, end_line)
 
 
 class FallbackParser(BaseParser):
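
The BaseParser helpers now delegate to a shared utils module that is not included in this diff. Judging from the inline logic they replace, the helpers presumably look roughly like this (a sketch, not the actual module):

def split_into_lines(content: str) -> list[str]:
    """Split content into lines, keeping line endings (mirrors the removed inline code)."""
    return content.splitlines(keepends=True)


def get_line_range(lines: list[str], start_line: int, end_line: int) -> str:
    """Join a 1-based inclusive line range (mirrors the removed inline code)."""
    start_idx = max(0, start_line - 1)
    end_idx = min(len(lines), end_line)
    return "".join(lines[start_idx:end_idx])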
@@ -155,7 +153,7 @@ class FallbackParser(BaseParser):
         chunks = []
 
         # Simple chunking: split into chunks of ~50 lines
-        chunk_size = 50
+        chunk_size = DEFAULT_CHUNK_SIZE
         for i in range(0, len(lines), chunk_size):
             start_line = i + 1
             end_line = min(i + chunk_size, len(lines))
@@ -510,14 +510,14 @@ class DartParser(BaseParser):
 
         # Calculate line number within class
         lines_before = class_content[: match.start()].count("\n")
-        start_line = class_start_line + lines_before
+        class_start_line + lines_before
 
         # Find end of build method
         class_lines = class_content.splitlines(keepends=True)
         build_start_idx = lines_before
 
         # Simple heuristic: find matching braces
-        end_line = self._find_method_end(class_lines, build_start_idx)
+        self._find_method_end(class_lines, build_start_idx)
 
         return None  # Simplified - build method is already in class chunk
 
@@ -2,6 +2,7 @@
 
 from pathlib import Path
 
+from ..config.constants import TEXT_CHUNK_SIZE
 from ..core.models import CodeChunk
 from .base import BaseParser
 
@@ -66,7 +67,7 @@ class TextParser(BaseParser):
         else:
             # Fall back to line-based chunking for non-paragraph text
             # Use smaller chunks for text files (30 lines instead of 50)
-            chunk_size = 30
+            chunk_size = TEXT_CHUNK_SIZE
             for i in range(0, len(lines), chunk_size):
                 start_line = i + 1
                 end_line = min(i + chunk_size, len(lines))
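
Across these parser changes, hard-coded chunk sizes are replaced by named constants imported from ..config.constants. That module is not part of this diff; based on the literals being replaced, it presumably contains something like the following (the value of DEFAULT_CACHE_SIZE is not visible here, so that figure is a placeholder):

# Hypothetical sketch of config/constants.py -- not shown in this diff
DEFAULT_CHUNK_SIZE = 50    # replaces the fallback parser's hard-coded 50-line chunks
TEXT_CHUNK_SIZE = 30       # replaces the text parser's hard-coded 30-line chunks
DEFAULT_CACHE_SIZE = 100   # placeholder; the real value is not shown in this diff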