mcp-vector-search 0.0.3 (mcp_vector_search-0.0.3-py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release: this version of mcp-vector-search might be problematic.

Files changed (35):
  1. mcp_vector_search/__init__.py +9 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/config.py +303 -0
  5. mcp_vector_search/cli/commands/index.py +304 -0
  6. mcp_vector_search/cli/commands/init.py +212 -0
  7. mcp_vector_search/cli/commands/search.py +395 -0
  8. mcp_vector_search/cli/commands/status.py +340 -0
  9. mcp_vector_search/cli/commands/watch.py +288 -0
  10. mcp_vector_search/cli/main.py +117 -0
  11. mcp_vector_search/cli/output.py +242 -0
  12. mcp_vector_search/config/__init__.py +1 -0
  13. mcp_vector_search/config/defaults.py +175 -0
  14. mcp_vector_search/config/settings.py +108 -0
  15. mcp_vector_search/core/__init__.py +1 -0
  16. mcp_vector_search/core/database.py +431 -0
  17. mcp_vector_search/core/embeddings.py +250 -0
  18. mcp_vector_search/core/exceptions.py +66 -0
  19. mcp_vector_search/core/indexer.py +310 -0
  20. mcp_vector_search/core/models.py +174 -0
  21. mcp_vector_search/core/project.py +304 -0
  22. mcp_vector_search/core/search.py +324 -0
  23. mcp_vector_search/core/watcher.py +320 -0
  24. mcp_vector_search/mcp/__init__.py +1 -0
  25. mcp_vector_search/parsers/__init__.py +1 -0
  26. mcp_vector_search/parsers/base.py +180 -0
  27. mcp_vector_search/parsers/javascript.py +238 -0
  28. mcp_vector_search/parsers/python.py +407 -0
  29. mcp_vector_search/parsers/registry.py +187 -0
  30. mcp_vector_search/py.typed +1 -0
  31. mcp_vector_search-0.0.3.dist-info/METADATA +333 -0
  32. mcp_vector_search-0.0.3.dist-info/RECORD +35 -0
  33. mcp_vector_search-0.0.3.dist-info/WHEEL +4 -0
  34. mcp_vector_search-0.0.3.dist-info/entry_points.txt +2 -0
  35. mcp_vector_search-0.0.3.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/core/exceptions.py
@@ -0,0 +1,66 @@
+ """Custom exception hierarchy for MCP Vector Search."""
+
+ from typing import Any, Dict, Optional
+
+
+ class MCPVectorSearchError(Exception):
+     """Base exception for MCP Vector Search."""
+
+     def __init__(self, message: str, context: Optional[Dict[str, Any]] = None) -> None:
+         super().__init__(message)
+         self.context = context or {}
+
+
+ class DatabaseError(MCPVectorSearchError):
+     """Database-related errors."""
+     pass
+
+
+ class DatabaseInitializationError(DatabaseError):
+     """Database initialization failed."""
+     pass
+
+
+ class DatabaseNotInitializedError(DatabaseError):
+     """Operation attempted on uninitialized database."""
+     pass
+
+
+ class DocumentAdditionError(DatabaseError):
+     """Failed to add documents to database."""
+     pass
+
+
+ class SearchError(DatabaseError):
+     """Search operation failed."""
+     pass
+
+
+ class ParsingError(MCPVectorSearchError):
+     """Code parsing errors."""
+     pass
+
+
+ class EmbeddingError(MCPVectorSearchError):
+     """Embedding generation errors."""
+     pass
+
+
+ class ConfigurationError(MCPVectorSearchError):
+     """Configuration validation errors."""
+     pass
+
+
+ class ProjectError(MCPVectorSearchError):
+     """Project management errors."""
+     pass
+
+
+ class ProjectNotFoundError(ProjectError):
+     """Project directory or configuration not found."""
+     pass
+
+
+ class ProjectInitializationError(ProjectError):
+     """Failed to initialize project."""
+     pass
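
The hierarchy above nests domain errors under a single MCPVectorSearchError root, and every error carries an optional context dict for structured detail. A minimal usage sketch, assuming the wheel is installed under the module layout listed above; the file name and chunk count are made-up values for illustration:

from mcp_vector_search.core.exceptions import (
    DatabaseError,
    DocumentAdditionError,
)

try:
    # Hypothetical failure while writing chunks to the vector store.
    raise DocumentAdditionError(
        "failed to add 3 chunks",
        context={"file": "src/app.py", "chunk_count": 3},
    )
except DatabaseError as e:
    # DocumentAdditionError is a DatabaseError, which is an
    # MCPVectorSearchError, so callers can catch at any granularity.
    print(f"database failure: {e} (context={e.context})")
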
mcp_vector_search/core/indexer.py
@@ -0,0 +1,310 @@
+ """Semantic indexer for MCP Vector Search."""
+
+ import asyncio
+ from pathlib import Path
+ from typing import List, Optional, Set
+
+ from loguru import logger
+
+ from ..config.defaults import DEFAULT_IGNORE_PATTERNS
+ from ..parsers.registry import get_parser_registry
+ from .database import VectorDatabase
+ from .exceptions import ParsingError
+ from .models import CodeChunk
+
+
+ class SemanticIndexer:
+     """Semantic indexer for parsing and indexing code files."""
+
+     def __init__(
+         self,
+         database: VectorDatabase,
+         project_root: Path,
+         file_extensions: List[str],
+     ) -> None:
+         """Initialize semantic indexer.
+
+         Args:
+             database: Vector database instance
+             project_root: Project root directory
+             file_extensions: File extensions to index
+         """
+         self.database = database
+         self.project_root = project_root
+         self.file_extensions = set(ext.lower() for ext in file_extensions)
+         self.parser_registry = get_parser_registry()
+         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+
+     async def index_project(
+         self,
+         force_reindex: bool = False,
+         show_progress: bool = True,
+     ) -> int:
+         """Index all files in the project.
+
+         Args:
+             force_reindex: Whether to reindex existing files
+             show_progress: Whether to show progress information
+
+         Returns:
+             Number of files indexed
+         """
+         logger.info(f"Starting indexing of project: {self.project_root}")
+
+         # Find all indexable files
+         files_to_index = self._find_indexable_files()
+
+         if not files_to_index:
+             logger.warning("No indexable files found")
+             return 0
+
+         logger.info(f"Found {len(files_to_index)} files to index")
+
+         # Index files
+         indexed_count = 0
+         failed_count = 0
+
+         for i, file_path in enumerate(files_to_index):
+             if show_progress and (i + 1) % 10 == 0:
+                 logger.info(f"Indexing progress: {i + 1}/{len(files_to_index)}")
+
+             try:
+                 success = await self.index_file(file_path, force_reindex)
+                 if success:
+                     indexed_count += 1
+                 else:
+                     failed_count += 1
+             except Exception as e:
+                 logger.error(f"Failed to index {file_path}: {e}")
+                 failed_count += 1
+
+         logger.info(
+             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
+         )
+
+         return indexed_count
+
+     async def index_file(
+         self,
+         file_path: Path,
+         force_reindex: bool = False,
+     ) -> bool:
+         """Index a single file.
+
+         Args:
+             file_path: Path to the file to index
+             force_reindex: Whether to reindex if already indexed
+
+         Returns:
+             True if file was successfully indexed
+         """
+         try:
+             # Check if file should be indexed
+             if not self._should_index_file(file_path):
+                 return False
+
+             # Remove existing chunks for this file if reindexing
+             if force_reindex:
+                 await self.database.delete_by_file(file_path)
+
+             # Parse file into chunks
+             chunks = await self._parse_file(file_path)
+
+             if not chunks:
+                 logger.debug(f"No chunks extracted from {file_path}")
+                 return True  # Not an error, just empty file
+
+             # Add chunks to database
+             await self.database.add_chunks(chunks)
+
+             logger.debug(f"Indexed {len(chunks)} chunks from {file_path}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to index file {file_path}: {e}")
+             raise ParsingError(f"Failed to index file {file_path}: {e}") from e
+
+     async def reindex_file(self, file_path: Path) -> bool:
+         """Reindex a single file (removes existing chunks first).
+
+         Args:
+             file_path: Path to the file to reindex
+
+         Returns:
+             True if file was successfully reindexed
+         """
+         return await self.index_file(file_path, force_reindex=True)
+
+     async def remove_file(self, file_path: Path) -> int:
+         """Remove all chunks for a file from the index.
+
+         Args:
+             file_path: Path to the file to remove
+
+         Returns:
+             Number of chunks removed
+         """
+         try:
+             count = await self.database.delete_by_file(file_path)
+             logger.debug(f"Removed {count} chunks for {file_path}")
+             return count
+         except Exception as e:
+             logger.error(f"Failed to remove file {file_path}: {e}")
+             return 0
+
+     def _find_indexable_files(self) -> List[Path]:
+         """Find all files that should be indexed.
+
+         Returns:
+             List of file paths to index
+         """
+         indexable_files = []
+
+         for file_path in self.project_root.rglob("*"):
+             if self._should_index_file(file_path):
+                 indexable_files.append(file_path)
+
+         return sorted(indexable_files)
+
+     def _should_index_file(self, file_path: Path) -> bool:
+         """Check if a file should be indexed.
+
+         Args:
+             file_path: Path to check
+
+         Returns:
+             True if file should be indexed
+         """
+         # Must be a file
+         if not file_path.is_file():
+             return False
+
+         # Check file extension
+         if file_path.suffix.lower() not in self.file_extensions:
+             return False
+
+         # Check if path should be ignored
+         if self._should_ignore_path(file_path):
+             return False
+
+         # Check file size (skip very large files)
+         try:
+             file_size = file_path.stat().st_size
+             if file_size > 10 * 1024 * 1024:  # 10MB limit
+                 logger.warning(f"Skipping large file: {file_path} ({file_size} bytes)")
+                 return False
+         except OSError:
+             return False
+
+         return True
+
+     def _should_ignore_path(self, file_path: Path) -> bool:
+         """Check if a path should be ignored.
+
+         Args:
+             file_path: Path to check
+
+         Returns:
+             True if path should be ignored
+         """
+         try:
+             # Get relative path from project root
+             relative_path = file_path.relative_to(self.project_root)
+
+             # Check each part of the path
+             for part in relative_path.parts:
+                 if part in self._ignore_patterns:
+                     return True
+
+             # Check if any parent directory should be ignored
+             for parent in relative_path.parents:
+                 for part in parent.parts:
+                     if part in self._ignore_patterns:
+                         return True
+
+             return False
+
+         except ValueError:
+             # Path is not relative to project root
+             return True
+
+     async def _parse_file(self, file_path: Path) -> List[CodeChunk]:
+         """Parse a file into code chunks.
+
+         Args:
+             file_path: Path to the file to parse
+
+         Returns:
+             List of code chunks
+         """
+         try:
+             # Get appropriate parser
+             parser = self.parser_registry.get_parser_for_file(file_path)
+
+             # Parse file
+             chunks = await parser.parse_file(file_path)
+
+             # Filter out empty chunks
+             valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
+
+             return valid_chunks
+
+         except Exception as e:
+             logger.error(f"Failed to parse file {file_path}: {e}")
+             raise ParsingError(f"Failed to parse file {file_path}: {e}") from e
+
+     def add_ignore_pattern(self, pattern: str) -> None:
+         """Add a pattern to ignore during indexing.
+
+         Args:
+             pattern: Pattern to ignore (directory or file name)
+         """
+         self._ignore_patterns.add(pattern)
+
+     def remove_ignore_pattern(self, pattern: str) -> None:
+         """Remove an ignore pattern.
+
+         Args:
+             pattern: Pattern to remove
+         """
+         self._ignore_patterns.discard(pattern)
+
+     def get_ignore_patterns(self) -> Set[str]:
+         """Get current ignore patterns.
+
+         Returns:
+             Set of ignore patterns
+         """
+         return self._ignore_patterns.copy()
+
+     async def get_indexing_stats(self) -> dict:
+         """Get statistics about the indexing process.
+
+         Returns:
+             Dictionary with indexing statistics
+         """
+         try:
+             # Get database stats
+             db_stats = await self.database.get_stats()
+
+             # Count indexable files
+             indexable_files = self._find_indexable_files()
+
+             return {
+                 "total_indexable_files": len(indexable_files),
+                 "indexed_files": db_stats.total_files,
+                 "total_chunks": db_stats.total_chunks,
+                 "languages": db_stats.languages,
+                 "file_extensions": list(self.file_extensions),
+                 "ignore_patterns": list(self._ignore_patterns),
+                 "parser_info": self.parser_registry.get_parser_info(),
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to get indexing stats: {e}")
+             return {
+                 "error": str(e),
+                 "total_indexable_files": 0,
+                 "indexed_files": 0,
+                 "total_chunks": 0,
+             }
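
Driving the indexer requires a VectorDatabase from core/database.py, which this page lists but does not reproduce. A hedged sketch using a hypothetical in-memory stand-in that implements only the database methods the indexer awaits here (add_chunks, plus delete_by_file for reindexing):

import asyncio
from pathlib import Path
from typing import List

from mcp_vector_search.core.indexer import SemanticIndexer
from mcp_vector_search.core.models import CodeChunk


class InMemoryDatabase:
    """Hypothetical stand-in for VectorDatabase; not part of the package."""

    def __init__(self) -> None:
        self.chunks: List[CodeChunk] = []

    async def add_chunks(self, chunks: List[CodeChunk]) -> None:
        self.chunks.extend(chunks)

    async def delete_by_file(self, file_path: Path) -> int:
        before = len(self.chunks)
        self.chunks = [c for c in self.chunks if c.file_path != file_path]
        return before - len(self.chunks)


async def main() -> None:
    indexer = SemanticIndexer(
        database=InMemoryDatabase(),  # duck-typed; the real VectorDatabase goes here
        project_root=Path("."),
        file_extensions=[".py"],
    )
    indexer.add_ignore_pattern("tests")  # skip any path segment named "tests"
    indexed = await indexer.index_project()
    print(f"indexed {indexed} files")


asyncio.run(main())

Note that index_project parses files through the package's real parser registry, so the sketch assumes the wheel and its parsers are installed.
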
mcp_vector_search/core/models.py
@@ -0,0 +1,174 @@
+ """Data models for MCP Vector Search."""
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from pydantic import BaseModel, Field
+
+
+ @dataclass
+ class CodeChunk:
+     """Represents a chunk of code with metadata."""
+
+     content: str
+     file_path: Path
+     start_line: int
+     end_line: int
+     language: str
+     chunk_type: str = "code"  # code, function, class, comment, docstring
+     function_name: Optional[str] = None
+     class_name: Optional[str] = None
+     docstring: Optional[str] = None
+     imports: Optional[List[str]] = None
+     complexity_score: float = 0.0
+
+     def __post_init__(self) -> None:
+         """Initialize default values."""
+         if self.imports is None:
+             self.imports = []
+
+     @property
+     def id(self) -> str:
+         """Generate unique ID for this chunk."""
+         return f"{self.file_path}:{self.start_line}:{self.end_line}"
+
+     @property
+     def line_count(self) -> int:
+         """Get the number of lines in this chunk."""
+         return self.end_line - self.start_line + 1
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for storage."""
+         return {
+             "content": self.content,
+             "file_path": str(self.file_path),
+             "start_line": self.start_line,
+             "end_line": self.end_line,
+             "language": self.language,
+             "chunk_type": self.chunk_type,
+             "function_name": self.function_name,
+             "class_name": self.class_name,
+             "docstring": self.docstring,
+             "imports": self.imports,
+             "complexity_score": self.complexity_score,
+         }
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "CodeChunk":
+         """Create from dictionary."""
+         return cls(
+             content=data["content"],
+             file_path=Path(data["file_path"]),
+             start_line=data["start_line"],
+             end_line=data["end_line"],
+             language=data["language"],
+             chunk_type=data.get("chunk_type", "code"),
+             function_name=data.get("function_name"),
+             class_name=data.get("class_name"),
+             docstring=data.get("docstring"),
+             imports=data.get("imports", []),
+             complexity_score=data.get("complexity_score", 0.0),
+         )
+
+
+ class SearchResult(BaseModel):
+     """Represents a search result with metadata."""
+
+     content: str = Field(..., description="The matched code content")
+     file_path: Path = Field(..., description="Path to the source file")
+     start_line: int = Field(..., description="Starting line number")
+     end_line: int = Field(..., description="Ending line number")
+     language: str = Field(..., description="Programming language")
+     similarity_score: float = Field(..., description="Similarity score (0.0 to 1.0)")
+     rank: int = Field(..., description="Result rank in search results")
+     chunk_type: str = Field(default="code", description="Type of code chunk")
+     function_name: Optional[str] = Field(default=None, description="Function name if applicable")
+     class_name: Optional[str] = Field(default=None, description="Class name if applicable")
+     context_before: List[str] = Field(default_factory=list, description="Lines before the match")
+     context_after: List[str] = Field(default_factory=list, description="Lines after the match")
+     highlights: List[str] = Field(default_factory=list, description="Highlighted terms")
+
+     class Config:
+         arbitrary_types_allowed = True
+
+     @property
+     def line_count(self) -> int:
+         """Get the number of lines in this result."""
+         return self.end_line - self.start_line + 1
+
+     @property
+     def location(self) -> str:
+         """Get a human-readable location string."""
+         return f"{self.file_path}:{self.start_line}-{self.end_line}"
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "content": self.content,
+             "file_path": str(self.file_path),
+             "start_line": self.start_line,
+             "end_line": self.end_line,
+             "language": self.language,
+             "similarity_score": self.similarity_score,
+             "rank": self.rank,
+             "chunk_type": self.chunk_type,
+             "function_name": self.function_name,
+             "class_name": self.class_name,
+             "context_before": self.context_before,
+             "context_after": self.context_after,
+             "highlights": self.highlights,
+             "location": self.location,
+             "line_count": self.line_count,
+         }
+
+
+ class IndexStats(BaseModel):
+     """Statistics about the search index."""
+
+     total_files: int = Field(..., description="Total number of indexed files")
+     total_chunks: int = Field(..., description="Total number of code chunks")
+     languages: Dict[str, int] = Field(..., description="Language distribution")
+     file_types: Dict[str, int] = Field(..., description="File type distribution")
+     index_size_mb: float = Field(..., description="Index size in megabytes")
+     last_updated: str = Field(..., description="Last update timestamp")
+     embedding_model: str = Field(..., description="Embedding model used")
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "total_files": self.total_files,
+             "total_chunks": self.total_chunks,
+             "languages": self.languages,
+             "file_types": self.file_types,
+             "index_size_mb": self.index_size_mb,
+             "last_updated": self.last_updated,
+             "embedding_model": self.embedding_model,
+         }
+
+
+ class ProjectInfo(BaseModel):
+     """Information about a project."""
+
+     name: str = Field(..., description="Project name")
+     root_path: Path = Field(..., description="Project root directory")
+     config_path: Path = Field(..., description="Configuration file path")
+     index_path: Path = Field(..., description="Index directory path")
+     is_initialized: bool = Field(..., description="Whether project is initialized")
+     languages: List[str] = Field(default_factory=list, description="Detected languages")
+     file_count: int = Field(default=0, description="Number of indexable files")
+
+     class Config:
+         arbitrary_types_allowed = True
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "name": self.name,
+             "root_path": str(self.root_path),
+             "config_path": str(self.config_path),
+             "index_path": str(self.index_path),
+             "is_initialized": self.is_initialized,
+             "languages": self.languages,
+             "file_count": self.file_count,
+         }
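
CodeChunk is the storage-facing model: its id is derived from the file path and line span, and to_dict/from_dict round-trip it for persistence. A small self-contained sketch (all values illustrative):

from pathlib import Path

from mcp_vector_search.core.models import CodeChunk

chunk = CodeChunk(
    content="def greet():\n    return 'hi'\n",
    file_path=Path("src/app.py"),
    start_line=10,
    end_line=11,
    language="python",
    chunk_type="function",
    function_name="greet",
)

print(chunk.id)               # "src/app.py:10:11" on POSIX (path:start:end)
assert chunk.line_count == 2  # end_line - start_line + 1
assert chunk.imports == []    # __post_init__ replaces the None default

# Round trip through the storage format.
restored = CodeChunk.from_dict(chunk.to_dict())
assert restored.id == chunk.id
assert restored.to_dict() == chunk.to_dict()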