mcp-vector-search 0.0.3__py3-none-any.whl → 0.4.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of mcp-vector-search might be problematic.

Files changed (49)
  1. mcp_vector_search/__init__.py +3 -2
  2. mcp_vector_search/cli/commands/auto_index.py +397 -0
  3. mcp_vector_search/cli/commands/config.py +88 -40
  4. mcp_vector_search/cli/commands/index.py +198 -52
  5. mcp_vector_search/cli/commands/init.py +471 -58
  6. mcp_vector_search/cli/commands/install.py +284 -0
  7. mcp_vector_search/cli/commands/mcp.py +495 -0
  8. mcp_vector_search/cli/commands/search.py +241 -87
  9. mcp_vector_search/cli/commands/status.py +184 -58
  10. mcp_vector_search/cli/commands/watch.py +34 -35
  11. mcp_vector_search/cli/didyoumean.py +184 -0
  12. mcp_vector_search/cli/export.py +320 -0
  13. mcp_vector_search/cli/history.py +292 -0
  14. mcp_vector_search/cli/interactive.py +342 -0
  15. mcp_vector_search/cli/main.py +175 -27
  16. mcp_vector_search/cli/output.py +63 -45
  17. mcp_vector_search/config/defaults.py +50 -36
  18. mcp_vector_search/config/settings.py +49 -35
  19. mcp_vector_search/core/auto_indexer.py +298 -0
  20. mcp_vector_search/core/connection_pool.py +322 -0
  21. mcp_vector_search/core/database.py +335 -25
  22. mcp_vector_search/core/embeddings.py +73 -29
  23. mcp_vector_search/core/exceptions.py +19 -2
  24. mcp_vector_search/core/factory.py +310 -0
  25. mcp_vector_search/core/git_hooks.py +345 -0
  26. mcp_vector_search/core/indexer.py +237 -73
  27. mcp_vector_search/core/models.py +21 -19
  28. mcp_vector_search/core/project.py +73 -58
  29. mcp_vector_search/core/scheduler.py +330 -0
  30. mcp_vector_search/core/search.py +574 -86
  31. mcp_vector_search/core/watcher.py +48 -46
  32. mcp_vector_search/mcp/__init__.py +4 -0
  33. mcp_vector_search/mcp/__main__.py +25 -0
  34. mcp_vector_search/mcp/server.py +701 -0
  35. mcp_vector_search/parsers/base.py +30 -31
  36. mcp_vector_search/parsers/javascript.py +74 -48
  37. mcp_vector_search/parsers/python.py +57 -49
  38. mcp_vector_search/parsers/registry.py +47 -32
  39. mcp_vector_search/parsers/text.py +179 -0
  40. mcp_vector_search/utils/__init__.py +40 -0
  41. mcp_vector_search/utils/gitignore.py +229 -0
  42. mcp_vector_search/utils/timing.py +334 -0
  43. mcp_vector_search/utils/version.py +47 -0
  44. {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/METADATA +173 -7
  45. mcp_vector_search-0.4.12.dist-info/RECORD +54 -0
  46. mcp_vector_search-0.0.3.dist-info/RECORD +0 -35
  47. {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/WHEEL +0 -0
  48. {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/entry_points.txt +0 -0
  49. {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/licenses/LICENSE +0 -0
@@ -1,12 +1,12 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""
 
-import asyncio
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
+from typing import Any, Protocol, runtime_checkable
 
 from loguru import logger
 
+from .connection_pool import ChromaConnectionPool
 from .exceptions import (
     DatabaseError,
     DatabaseInitializationError,
@@ -21,7 +21,7 @@ from .models import CodeChunk, IndexStats, SearchResult
 class EmbeddingFunction(Protocol):
     """Protocol for embedding functions."""
 
-    def __call__(self, texts: List[str]) -> List[List[float]]:
+    def __call__(self, texts: list[str]) -> list[list[float]]:
         """Generate embeddings for input texts."""
         ...
 
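The typing changes in this and later hunks follow PEP 585/604: builtin generics (`list`, `dict`) and `X | None` unions replace `typing.List`/`Dict`/`Optional`, which raises the floor to Python 3.9 (3.10 for the union syntax in evaluated annotations). A condensed before/after drawn from the signatures in this diff; the class name `Embedder` is illustrative, not from the package:

    from typing import Any

    # Before, as removed by this diff:
    #   from typing import Dict, List, Optional
    #   def __call__(self, texts: List[str]) -> List[List[float]]: ...
    #   filters: Optional[Dict[str, Any]] = None

    # After, as added (PEP 585 builtin generics, PEP 604 unions):
    class Embedder:
        def __call__(self, texts: list[str]) -> list[list[float]]:
            # Placeholder body: one zero-vector per input text.
            return [[0.0] for _ in texts]

    filters: dict[str, Any] | None = None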
@@ -40,9 +40,9 @@ class VectorDatabase(ABC):
         ...
 
     @abstractmethod
-    async def add_chunks(self, chunks: List[CodeChunk]) -> None:
+    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
         """Add code chunks to the database.
-        
+
         Args:
             chunks: List of code chunks to add
         """
@@ -53,17 +53,17 @@ class VectorDatabase(ABC):
         self,
         query: str,
         limit: int = 10,
-        filters: Optional[Dict[str, Any]] = None,
+        filters: dict[str, Any] | None = None,
         similarity_threshold: float = 0.7,
-    ) -> List[SearchResult]:
+    ) -> list[SearchResult]:
         """Search for similar code chunks.
-        
+
         Args:
             query: Search query
             limit: Maximum number of results
             filters: Optional filters to apply
             similarity_threshold: Minimum similarity score
-        
+
         Returns:
             List of search results
         """
@@ -72,10 +72,10 @@ class VectorDatabase(ABC):
     @abstractmethod
     async def delete_by_file(self, file_path: Path) -> int:
         """Delete all chunks for a specific file.
-        
+
         Args:
             file_path: Path to the file
-        
+
         Returns:
             Number of deleted chunks
         """
@@ -84,7 +84,7 @@ class VectorDatabase(ABC):
     @abstractmethod
     async def get_stats(self) -> IndexStats:
         """Get database statistics.
-        
+
         Returns:
             Index statistics
         """
@@ -115,7 +115,7 @@ class ChromaVectorDatabase(VectorDatabase):
         collection_name: str = "code_search",
     ) -> None:
         """Initialize ChromaDB vector database.
-        
+
         Args:
             persist_directory: Directory to persist database
             embedding_function: Function to generate embeddings
@@ -141,7 +141,7 @@ class ChromaVectorDatabase(VectorDatabase):
                 settings=chromadb.Settings(
                     anonymized_telemetry=False,
                     allow_reset=True,
-                )
+                ),
             )
 
             # Create or get collection
@@ -153,11 +153,13 @@ class ChromaVectorDatabase(VectorDatabase):
                 },
             )
 
-            logger.info(f"ChromaDB initialized at {self.persist_directory}")
+            logger.debug(f"ChromaDB initialized at {self.persist_directory}")
 
         except Exception as e:
             logger.error(f"Failed to initialize ChromaDB: {e}")
-            raise DatabaseInitializationError(f"ChromaDB initialization failed: {e}") from e
+            raise DatabaseInitializationError(
+                f"ChromaDB initialization failed: {e}"
+            ) from e
 
     async def remove_file_chunks(self, file_path: str) -> int:
         """Remove all chunks for a specific file.
@@ -173,9 +175,7 @@ class ChromaVectorDatabase(VectorDatabase):
 
         try:
             # Get all chunks for this file
-            results = self._collection.get(
-                where={"file_path": file_path}
-            )
+            results = self._collection.get(where={"file_path": file_path})
 
             if not results["ids"]:
                 return 0
@@ -199,7 +199,7 @@ class ChromaVectorDatabase(VectorDatabase):
         self._collection = None
         logger.debug("ChromaDB connections closed")
 
-    async def add_chunks(self, chunks: List[CodeChunk]) -> None:
+    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
         """Add code chunks to the database."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
@@ -251,9 +251,9 @@ class ChromaVectorDatabase(VectorDatabase):
         self,
         query: str,
         limit: int = 10,
-        filters: Optional[Dict[str, Any]] = None,
+        filters: dict[str, Any] | None = None,
         similarity_threshold: float = 0.7,
-    ) -> List[SearchResult]:
+    ) -> list[SearchResult]:
         """Search for similar code chunks."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
@@ -279,10 +279,13 @@ class ChromaVectorDatabase(VectorDatabase):
                     results["documents"][0],
                     results["metadatas"][0],
                     results["distances"][0],
+                    strict=False,
                 )
             ):
                 # Convert distance to similarity (ChromaDB uses cosine distance)
-                similarity = 1.0 - distance
+                # For cosine distance, use a more permissive conversion that handles distances > 1.0
+                # Convert to a 0-1 similarity score where lower distances = higher similarity
+                similarity = max(0.0, 1.0 / (1.0 + distance))
 
                 if similarity >= similarity_threshold:
                     result = SearchResult(
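The similarity rewrite is the one behavioral change in this hunk: ChromaDB's cosine distance lives in [0, 2], so the old `1.0 - distance` could go negative and silently discard valid matches. A quick check of both mappings (illustrative arithmetic, not code from the package):

    for distance in (0.0, 0.5, 1.0, 1.5, 2.0):
        old = 1.0 - distance                    # reaches -1.0 at d=2.0
        new = max(0.0, 1.0 / (1.0 + distance))  # 1.0, 0.67, 0.5, 0.4, 0.33
        print(f"d={distance:.1f}  old={old:+.2f}  new={new:.2f}")

Note that the rescaling interacts with thresholds: under the default `similarity_threshold` of 0.7, the old mapping admitted distances up to 0.3, while the new one admits distances up to roughly 0.43.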
@@ -363,7 +366,12 @@ class ChromaVectorDatabase(VectorDatabase):
             index_size_mb = count * 0.001  # Rough estimate
 
             return IndexStats(
-                total_files=len(set(m.get("file_path", "") for m in sample_results.get("metadatas", []))),
+                total_files=len(
+                    {
+                        m.get("file_path", "")
+                        for m in sample_results.get("metadatas", [])
+                    }
+                ),
                 total_chunks=count,
                 languages=languages,
                 file_types=file_types,
@@ -416,7 +424,7 @@ class ChromaVectorDatabase(VectorDatabase):
 
         return "\n".join(parts)
 
-    def _build_where_clause(self, filters: Dict[str, Any]) -> Dict[str, Any]:
+    def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any]:
         """Build ChromaDB where clause from filters."""
         where = {}
 
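The pooled class added at the bottom of this diff rebuilds this helper around ChromaDB's query operators rather than a flat dict. For orientation, the clauses that version produces look like this (filter values are hypothetical):

    # Hypothetical inputs -> outputs of the operator-based _build_where_clause.
    # A single condition is returned bare:
    {"language": "python"}            # -> {"language": {"$eq": "python"}}
    # A list of file paths maps to $in:
    {"file_path": ["a.py", "b.py"]}   # -> {"file_path": {"$in": ["a.py", "b.py"]}}
    # Two or more conditions are wrapped in $and:
    {"language": "python", "chunk_type": "function"}
    # -> {"$and": [{"language": {"$eq": "python"}},
    #              {"chunk_type": {"$eq": "function"}}]}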
@@ -429,3 +437,305 @@ class ChromaVectorDatabase(VectorDatabase):
             where[key] = value
 
         return where
+
+
+class PooledChromaVectorDatabase(VectorDatabase):
+    """ChromaDB implementation with connection pooling for improved performance."""
+
+    def __init__(
+        self,
+        persist_directory: Path,
+        embedding_function: EmbeddingFunction,
+        collection_name: str = "code_search",
+        max_connections: int = 10,
+        min_connections: int = 2,
+        max_idle_time: float = 300.0,
+        max_connection_age: float = 3600.0,
+    ) -> None:
+        """Initialize pooled ChromaDB vector database.
+
+        Args:
+            persist_directory: Directory to persist database
+            embedding_function: Function to generate embeddings
+            collection_name: Name of the collection
+            max_connections: Maximum number of connections in pool
+            min_connections: Minimum number of connections to maintain
+            max_idle_time: Maximum time a connection can be idle (seconds)
+            max_connection_age: Maximum age of a connection (seconds)
+        """
+        self.persist_directory = persist_directory
+        self.embedding_function = embedding_function
+        self.collection_name = collection_name
+
+        self._pool = ChromaConnectionPool(
+            persist_directory=persist_directory,
+            embedding_function=embedding_function,
+            collection_name=collection_name,
+            max_connections=max_connections,
+            min_connections=min_connections,
+            max_idle_time=max_idle_time,
+            max_connection_age=max_connection_age,
+        )
+
+    async def initialize(self) -> None:
+        """Initialize the connection pool."""
+        await self._pool.initialize()
+        logger.debug(f"Pooled ChromaDB initialized at {self.persist_directory}")
+
+    async def close(self) -> None:
+        """Close the connection pool."""
+        await self._pool.close()
+        logger.debug("Pooled ChromaDB connections closed")
+
+    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
+        """Add code chunks to the database using pooled connection."""
+        if not chunks:
+            return
+
+        # Ensure pool is initialized
+        if not self._pool._initialized:
+            await self._pool.initialize()
+
+        try:
+            async with self._pool.get_connection() as conn:
+                # Prepare data for ChromaDB
+                documents = []
+                metadatas = []
+                ids = []
+
+                for chunk in chunks:
+                    documents.append(chunk.content)
+                    metadatas.append(
+                        {
+                            "file_path": str(chunk.file_path),
+                            "start_line": chunk.start_line,
+                            "end_line": chunk.end_line,
+                            "language": chunk.language,
+                            "chunk_type": chunk.chunk_type,
+                            "function_name": chunk.function_name or "",
+                            "class_name": chunk.class_name or "",
+                        }
+                    )
+                    ids.append(chunk.id)
+
+                # Add to collection
+                conn.collection.add(documents=documents, metadatas=metadatas, ids=ids)
+
+                logger.debug(f"Added {len(chunks)} chunks to database")
+
+        except Exception as e:
+            logger.error(f"Failed to add chunks: {e}")
+            raise DocumentAdditionError(f"Failed to add chunks: {e}") from e
+
+    async def search(
+        self,
+        query: str,
+        limit: int = 10,
+        filters: dict[str, Any] | None = None,
+        similarity_threshold: float = 0.7,
+    ) -> list[SearchResult]:
+        """Search for similar code chunks using pooled connection."""
+        # Ensure pool is initialized
+        if not self._pool._initialized:
+            await self._pool.initialize()
+
+        try:
+            async with self._pool.get_connection() as conn:
+                # Build where clause
+                where_clause = self._build_where_clause(filters) if filters else None
+
+                # Perform search
+                results = conn.collection.query(
+                    query_texts=[query],
+                    n_results=limit,
+                    where=where_clause,
+                    include=["documents", "metadatas", "distances"],
+                )
+
+                # Process results
+                search_results = []
+
+                if results["documents"] and results["documents"][0]:
+                    for i, (doc, metadata, distance) in enumerate(
+                        zip(
+                            results["documents"][0],
+                            results["metadatas"][0],
+                            results["distances"][0],
+                            strict=False,
+                        )
+                    ):
+                        # Convert distance to similarity (ChromaDB uses cosine distance)
+                        # For cosine distance, use a more permissive conversion that handles distances > 1.0
+                        # Convert to a 0-1 similarity score where lower distances = higher similarity
+                        similarity = max(0.0, 1.0 / (1.0 + distance))
+
+                        if similarity >= similarity_threshold:
+                            result = SearchResult(
+                                content=doc,
+                                file_path=Path(metadata["file_path"]),
+                                start_line=metadata["start_line"],
+                                end_line=metadata["end_line"],
+                                language=metadata["language"],
+                                similarity_score=similarity,
+                                rank=i + 1,
+                                chunk_type=metadata.get("chunk_type", "code"),
+                                function_name=metadata.get("function_name") or None,
+                                class_name=metadata.get("class_name") or None,
+                            )
+                            search_results.append(result)
+
+                logger.debug(f"Found {len(search_results)} results for query: {query}")
+                return search_results
+
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            raise SearchError(f"Search failed: {e}") from e
+
+    async def delete_by_file(self, file_path: Path) -> int:
+        """Delete all chunks for a specific file using pooled connection."""
+        try:
+            async with self._pool.get_connection() as conn:
+                # Get all chunks for this file
+                results = conn.collection.get(
+                    where={"file_path": str(file_path)}, include=["metadatas"]
+                )
+
+                if not results["ids"]:
+                    return 0
+
+                # Delete the chunks
+                conn.collection.delete(ids=results["ids"])
+
+                deleted_count = len(results["ids"])
+                logger.debug(f"Deleted {deleted_count} chunks for file: {file_path}")
+                return deleted_count
+
+        except Exception as e:
+            logger.error(f"Failed to delete chunks for file {file_path}: {e}")
+            raise DatabaseError(f"Failed to delete chunks: {e}") from e
+
+    async def get_stats(self) -> IndexStats:
+        """Get database statistics using pooled connection."""
+        try:
+            async with self._pool.get_connection() as conn:
+                # Get total count
+                count = conn.collection.count()
+
+                # Get all metadata to analyze
+                results = conn.collection.get(include=["metadatas"])
+
+                # Analyze languages and files
+                languages = set()
+                files = set()
+
+                for metadata in results["metadatas"]:
+                    if "language" in metadata:
+                        languages.add(metadata["language"])
+                    if "file_path" in metadata:
+                        files.add(metadata["file_path"])
+
+                # Count languages and file types
+                language_counts = {}
+                file_type_counts = {}
+
+                for metadata in results["metadatas"]:
+                    # Count languages
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # Count file types
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                # Estimate index size (rough approximation)
+                index_size_mb = count * 0.001  # Rough estimate
+
+                return IndexStats(
+                    total_chunks=count,
+                    total_files=len(files),
+                    languages=language_counts,
+                    file_types=file_type_counts,
+                    index_size_mb=index_size_mb,
+                    last_updated="unknown",  # ChromaDB doesn't track this
+                    embedding_model="unknown",  # TODO: Track this in metadata
+                )
+
+        except Exception as e:
+            logger.error(f"Failed to get database stats: {e}")
+            raise DatabaseError(f"Failed to get stats: {e}") from e
+
+    async def remove_file_chunks(self, file_path: str) -> int:
+        """Remove all chunks for a specific file using pooled connection."""
+        try:
+            async with self._pool.get_connection() as conn:
+                # Get all chunks for this file
+                results = conn.collection.get(where={"file_path": file_path})
+
+                if not results["ids"]:
+                    return 0
+
+                # Delete the chunks
+                conn.collection.delete(ids=results["ids"])
+
+                return len(results["ids"])
+
+        except Exception as e:
+            logger.error(f"Failed to remove chunks for file {file_path}: {e}")
+            return 0
+
+    async def reset(self) -> None:
+        """Reset the database using pooled connection."""
+        try:
+            async with self._pool.get_connection() as conn:
+                conn.client.reset()
+            # Reinitialize the pool after reset
+            await self._pool.close()
+            await self._pool.initialize()
+            logger.info("Database reset successfully")
+        except Exception as e:
+            logger.error(f"Failed to reset database: {e}")
+            raise DatabaseError(f"Failed to reset database: {e}") from e
+
+    def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any] | None:
+        """Build ChromaDB where clause from filters."""
+        if not filters:
+            return None
+
+        conditions = []
+
+        for key, value in filters.items():
+            if key == "language" and value:
+                conditions.append({"language": {"$eq": value}})
+            elif key == "file_path" and value:
+                if isinstance(value, list):
+                    conditions.append({"file_path": {"$in": [str(p) for p in value]}})
+                else:
+                    conditions.append({"file_path": {"$eq": str(value)}})
+            elif key == "chunk_type" and value:
+                conditions.append({"chunk_type": {"$eq": value}})
+
+        if not conditions:
+            return None
+        elif len(conditions) > 1:
+            return {"$and": conditions}
+        else:
+            return conditions[0]
+
+    def get_pool_stats(self) -> dict[str, Any]:
+        """Get connection pool statistics."""
+        return self._pool.get_stats()
+
+    async def health_check(self) -> bool:
+        """Perform a health check on the database and connection pool."""
+        return await self._pool.health_check()
+
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self.initialize()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self.close()
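Taken together, `PooledChromaVectorDatabase` is a drop-in `VectorDatabase` that manages its own connection pool and doubles as an async context manager. A minimal usage sketch; the `embed` function, vector width, and directory path are placeholders, not values from the package:

    import asyncio
    from pathlib import Path

    def embed(texts: list[str]) -> list[list[float]]:
        """Placeholder embedding function satisfying the EmbeddingFunction protocol."""
        return [[0.0] * 384 for _ in texts]

    async def main() -> None:
        db = PooledChromaVectorDatabase(
            persist_directory=Path(".mcp-vector-search/index"),
            embedding_function=embed,
            max_connections=10,
            min_connections=2,
        )
        async with db:  # __aenter__ initializes the pool, __aexit__ closes it
            results = await db.search(
                "parse function definitions",
                limit=5,
                filters={"language": "python"},
                similarity_threshold=0.5,
            )
            for r in results:
                print(r.file_path, r.start_line, f"{r.similarity_score:.2f}")

    asyncio.run(main())

The pool bounds (`max_connections`, `min_connections`, idle and age limits) are the knobs the constructor exposes; `get_pool_stats()` and `health_check()` surface the pool's state for monitoring.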