mcp-vector-search 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcp-vector-search might be problematic. Click here for more details.

Files changed (35) hide show
  1. mcp_vector_search/__init__.py +9 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/config.py +303 -0
  5. mcp_vector_search/cli/commands/index.py +304 -0
  6. mcp_vector_search/cli/commands/init.py +212 -0
  7. mcp_vector_search/cli/commands/search.py +395 -0
  8. mcp_vector_search/cli/commands/status.py +340 -0
  9. mcp_vector_search/cli/commands/watch.py +288 -0
  10. mcp_vector_search/cli/main.py +117 -0
  11. mcp_vector_search/cli/output.py +242 -0
  12. mcp_vector_search/config/__init__.py +1 -0
  13. mcp_vector_search/config/defaults.py +175 -0
  14. mcp_vector_search/config/settings.py +108 -0
  15. mcp_vector_search/core/__init__.py +1 -0
  16. mcp_vector_search/core/database.py +431 -0
  17. mcp_vector_search/core/embeddings.py +250 -0
  18. mcp_vector_search/core/exceptions.py +66 -0
  19. mcp_vector_search/core/indexer.py +310 -0
  20. mcp_vector_search/core/models.py +174 -0
  21. mcp_vector_search/core/project.py +304 -0
  22. mcp_vector_search/core/search.py +324 -0
  23. mcp_vector_search/core/watcher.py +320 -0
  24. mcp_vector_search/mcp/__init__.py +1 -0
  25. mcp_vector_search/parsers/__init__.py +1 -0
  26. mcp_vector_search/parsers/base.py +180 -0
  27. mcp_vector_search/parsers/javascript.py +238 -0
  28. mcp_vector_search/parsers/python.py +407 -0
  29. mcp_vector_search/parsers/registry.py +187 -0
  30. mcp_vector_search/py.typed +1 -0
  31. mcp_vector_search-0.0.3.dist-info/METADATA +333 -0
  32. mcp_vector_search-0.0.3.dist-info/RECORD +35 -0
  33. mcp_vector_search-0.0.3.dist-info/WHEEL +4 -0
  34. mcp_vector_search-0.0.3.dist-info/entry_points.txt +2 -0
  35. mcp_vector_search-0.0.3.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,304 @@
1
+ """Project detection and management for MCP Vector Search."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import List, Optional, Set
6
+
7
+ from loguru import logger
8
+
9
+ from ..config.defaults import (
10
+ DEFAULT_FILE_EXTENSIONS,
11
+ DEFAULT_IGNORE_PATTERNS,
12
+ get_default_config_path,
13
+ get_default_index_path,
14
+ get_language_from_extension,
15
+ )
16
+ from ..config.settings import ProjectConfig
17
+ from .exceptions import (
18
+ ConfigurationError,
19
+ ProjectInitializationError,
20
+ ProjectNotFoundError,
21
+ )
22
+ from .models import ProjectInfo
23
+
24
+
25
class ProjectManager:
    """Manages project detection, initialization, and configuration.

    The manager is bound to a single project root. Configuration is stored as
    JSON at the path returned by ``get_default_config_path`` and is cached on
    the instance after the first successful load or initialization.
    """

    def __init__(self, project_root: Optional[Path] = None) -> None:
        """Initialize project manager.

        Args:
            project_root: Project root directory. If None, will auto-detect
                by walking up from the current working directory.
        """
        self.project_root = project_root or self._detect_project_root()
        # Lazily populated by ``load_config`` / ``initialize``.
        self._config: Optional[ProjectConfig] = None

    def _detect_project_root(self) -> Path:
        """Auto-detect project root directory.

        Returns:
            The nearest directory (starting at the current working directory
            and walking upwards) that contains one of the known project
            indicator files/directories, or the current working directory
            when none is found.
        """
        current = Path.cwd()

        # Common markers of a project root, checked in this order per directory.
        indicators = (
            ".git",
            ".mcp-vector-search",
            "pyproject.toml",
            "package.json",
            "Cargo.toml",
            "go.mod",
            "pom.xml",
            "build.gradle",
            ".project",
        )

        # Walk up the directory tree, nearest directory first.
        for path in (current, *current.parents):
            for indicator in indicators:
                if (path / indicator).exists():
                    logger.debug(f"Detected project root: {path} (found {indicator})")
                    return path

        # Default to current directory
        logger.debug(f"Using current directory as project root: {current}")
        return current

    def is_initialized(self) -> bool:
        """Check if project is initialized for MCP Vector Search.

        Returns:
            True only when both the configuration file and the index path
            exist for this project root.
        """
        config_path = get_default_config_path(self.project_root)
        index_path = get_default_index_path(self.project_root)

        return config_path.exists() and index_path.exists()

    def initialize(
        self,
        file_extensions: Optional[List[str]] = None,
        embedding_model: str = "microsoft/codebert-base",
        similarity_threshold: float = 0.75,
        force: bool = False,
    ) -> ProjectConfig:
        """Initialize project for MCP Vector Search.

        Args:
            file_extensions: File extensions to index. Falls back to
                ``DEFAULT_FILE_EXTENSIONS`` when None or empty.
            embedding_model: Embedding model to use
            similarity_threshold: Similarity threshold for search
            force: Force re-initialization if already exists

        Returns:
            Project configuration

        Raises:
            ProjectInitializationError: If the project is already initialized
                and ``force`` is False, or if any initialization step fails.
        """
        if self.is_initialized() and not force:
            raise ProjectInitializationError(
                f"Project already initialized at {self.project_root}. Use --force to re-initialize."
            )

        try:
            # Create index directory
            index_path = get_default_index_path(self.project_root)
            index_path.mkdir(parents=True, exist_ok=True)

            # Resolve the effective extension list once so that the file count
            # and the stored configuration always agree.
            extensions = file_extensions or DEFAULT_FILE_EXTENSIONS

            # Detect languages and files
            detected_languages = self.detect_languages()
            file_count = self.count_indexable_files(extensions)

            # Create configuration
            config = ProjectConfig(
                project_root=self.project_root,
                index_path=index_path,
                file_extensions=extensions,
                embedding_model=embedding_model,
                similarity_threshold=similarity_threshold,
                languages=detected_languages,
            )

            # Save configuration
            self.save_config(config)

            # The path is passed as a positional argument instead of being
            # interpolated with an f-string: when keyword arguments are given,
            # loguru runs ``str.format`` on the message, so a project path
            # containing "{" or "}" would otherwise raise.
            logger.info(
                "Initialized project at {}",
                self.project_root,
                languages=detected_languages,
                file_count=file_count,
                extensions=config.file_extensions,
            )

            self._config = config
            return config

        except Exception as e:
            raise ProjectInitializationError(f"Failed to initialize project: {e}") from e

    def load_config(self) -> ProjectConfig:
        """Load project configuration.

        Returns:
            Project configuration

        Raises:
            ProjectNotFoundError: If project is not initialized
            ConfigurationError: If configuration is invalid
        """
        if not self.is_initialized():
            raise ProjectNotFoundError(
                f"Project not initialized at {self.project_root}. Run 'mcp-vector-search init' first."
            )

        config_path = get_default_config_path(self.project_root)

        try:
            # Explicit encoding so the result does not depend on the locale.
            with open(config_path, "r", encoding="utf-8") as f:
                config_data = json.load(f)

            # Convert paths back to Path objects
            config_data["project_root"] = Path(config_data["project_root"])
            config_data["index_path"] = Path(config_data["index_path"])

            config = ProjectConfig(**config_data)
            self._config = config
            return config

        except Exception as e:
            raise ConfigurationError(f"Failed to load configuration: {e}") from e

    def save_config(self, config: ProjectConfig) -> None:
        """Save project configuration.

        Args:
            config: Project configuration to save

        Raises:
            ConfigurationError: If saving fails, including failure to create
                the directory that holds the configuration file.
        """
        config_path = get_default_config_path(self.project_root)

        try:
            # Inside the ``try`` so that a directory-creation failure is
            # reported as ConfigurationError, as documented.
            config_path.parent.mkdir(parents=True, exist_ok=True)

            # Convert to JSON-serializable format
            config_data = config.dict()
            config_data["project_root"] = str(config.project_root)
            config_data["index_path"] = str(config.index_path)

            with open(config_path, "w", encoding="utf-8") as f:
                json.dump(config_data, f, indent=2)

            logger.debug(f"Saved configuration to {config_path}")

        except Exception as e:
            raise ConfigurationError(f"Failed to save configuration: {e}") from e

    @property
    def config(self) -> ProjectConfig:
        """Get project configuration, loading if necessary."""
        if self._config is None:
            self._config = self.load_config()
        return self._config

    def detect_languages(self) -> List[str]:
        """Detect programming languages in the project.

        Returns:
            Sorted list of detected language names. Files whose extension
            maps to ``"text"`` do not contribute a language.
        """
        languages: Set[str] = set()

        for file_path in self._iter_source_files():
            language = get_language_from_extension(file_path.suffix)
            if language != "text":
                languages.add(language)

        return sorted(languages)

    def count_indexable_files(self, extensions: List[str]) -> int:
        """Count files that can be indexed.

        Args:
            extensions: File extensions to count (compared against
                ``Path.suffix``, i.e. including the leading dot).

        Returns:
            Number of indexable files
        """
        # A set gives O(1) membership tests for every file visited.
        wanted = set(extensions)
        return sum(1 for file_path in self._iter_source_files() if file_path.suffix in wanted)

    def get_project_info(self) -> ProjectInfo:
        """Get comprehensive project information.

        Language and file-count details are best-effort: if the configuration
        cannot be loaded they are reported as empty/zero instead of raising.

        Returns:
            Project information
        """
        config_path = get_default_config_path(self.project_root)
        index_path = get_default_index_path(self.project_root)

        is_initialized = self.is_initialized()
        languages: List[str] = []
        file_count = 0

        if is_initialized:
            try:
                config = self.config
                languages = config.languages
                file_count = self.count_indexable_files(config.file_extensions)
            except Exception as e:
                # Deliberately non-fatal, but leave a trace for debugging
                # instead of swallowing the error silently.
                logger.debug(f"Could not gather detailed project info: {e}")

        return ProjectInfo(
            name=self.project_root.name,
            root_path=self.project_root,
            config_path=config_path,
            index_path=index_path,
            is_initialized=is_initialized,
            languages=languages,
            file_count=file_count,
        )

    def _iter_source_files(self) -> List[Path]:
        """Collect source files in the project.

        Returns:
            List of paths to regular files under the project root that are
            not excluded by ``_should_ignore_path``.
        """
        # The ignore check is pure string work, so run it before ``is_file``
        # (which hits the filesystem) to skip the stat call for ignored paths.
        return [
            path
            for path in self.project_root.rglob("*")
            if not self._should_ignore_path(path) and path.is_file()
        ]

    def _should_ignore_path(self, path: Path) -> bool:
        """Check if a path should be ignored.

        Only the components *below* the project root are compared against
        ``DEFAULT_IGNORE_PATTERNS``. Components of the project root's own
        location are not considered, so a project that happens to live inside
        a directory named like an ignore pattern is still scanned.

        Args:
            path: Path to check

        Returns:
            True if path should be ignored, which includes any path that is
            not located under the project root.
        """
        try:
            relative_parts = path.relative_to(self.project_root).parts
        except ValueError:
            # Path is not relative to project root
            return True

        return any(part in DEFAULT_IGNORE_PATTERNS for part in relative_parts)
@@ -0,0 +1,324 @@
1
+ """Semantic search engine for MCP Vector Search."""
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from loguru import logger
8
+
9
+ from .database import VectorDatabase
10
+ from .exceptions import SearchError
11
+ from .models import SearchResult
12
+
13
+
14
class SemanticSearchEngine:
    """Semantic search engine for code search.

    Wraps a vector database with query preprocessing, optional source-context
    enrichment of each hit, and a light name-based re-ranking step.
    """

    def __init__(
        self,
        database: VectorDatabase,
        project_root: Path,
        similarity_threshold: float = 0.7,
    ) -> None:
        """Initialize semantic search engine.

        Args:
            database: Vector database instance
            project_root: Project root directory
            similarity_threshold: Default similarity threshold
        """
        self.database = database
        self.project_root = project_root
        self.similarity_threshold = similarity_threshold

    async def search(
        self,
        query: str,
        limit: int = 10,
        filters: Optional[Dict[str, Any]] = None,
        similarity_threshold: Optional[float] = None,
        include_context: bool = True,
    ) -> List[SearchResult]:
        """Perform semantic search for code.

        Args:
            query: Search query
            limit: Maximum number of results
            filters: Optional filters (language, file_path, etc.)
            similarity_threshold: Minimum similarity score. ``None`` selects
                the engine default; an explicit ``0.0`` is honoured.
            include_context: Whether to include context lines

        Returns:
            List of search results (empty for a blank query)

        Raises:
            SearchError: If the database query or post-processing fails
        """
        if not query.strip():
            return []

        # ``is None`` rather than ``or``: a caller-supplied threshold of 0.0 is
        # falsy and must not be replaced by the default.
        threshold = (
            self.similarity_threshold
            if similarity_threshold is None
            else similarity_threshold
        )

        try:
            # Preprocess query
            processed_query = self._preprocess_query(query)

            # Perform vector search
            results = await self.database.search(
                query=processed_query,
                limit=limit,
                filters=filters,
                similarity_threshold=threshold,
            )

            # Post-process results
            enhanced_results = [
                await self._enhance_result(result, include_context) for result in results
            ]

            # Apply additional ranking if needed
            ranked_results = self._rerank_results(enhanced_results, query)

            logger.debug(f"Search for '{query}' returned {len(ranked_results)} results")
            return ranked_results

        except Exception as e:
            logger.error(f"Search failed for query '{query}': {e}")
            raise SearchError(f"Search failed: {e}") from e

    async def search_similar(
        self,
        file_path: Path,
        function_name: Optional[str] = None,
        limit: int = 10,
        similarity_threshold: Optional[float] = None,
    ) -> List[SearchResult]:
        """Find code similar to a specific function or file.

        Args:
            file_path: Path to the reference file
            function_name: Specific function name (optional). When the
                function cannot be located, the whole file is used.
            limit: Maximum number of results
            similarity_threshold: Minimum similarity score

        Returns:
            List of similar code results

        Raises:
            SearchError: If the file cannot be read or the search fails
        """
        try:
            # Read the reference file
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            # If function name is specified, try to extract just that function
            if function_name:
                function_content = self._extract_function_content(content, function_name)
                if function_content:
                    content = function_content

            # Use the content as the search query
            return await self.search(
                query=content,
                limit=limit,
                similarity_threshold=similarity_threshold,
                include_context=True,
            )

        except Exception as e:
            logger.error(f"Similar search failed for {file_path}: {e}")
            raise SearchError(f"Similar search failed: {e}") from e

    async def search_by_context(
        self,
        context_description: str,
        focus_areas: Optional[List[str]] = None,
        limit: int = 10,
    ) -> List[SearchResult]:
        """Search for code based on contextual description.

        Args:
            context_description: Description of what you're looking for
            focus_areas: Areas to focus on (e.g., ["security", "authentication"]);
                they are appended to the description to form the query.
            limit: Maximum number of results

        Returns:
            List of contextually relevant results
        """
        # Build enhanced query with focus areas
        query_parts = [context_description]

        if focus_areas:
            query_parts.extend(focus_areas)

        enhanced_query = " ".join(query_parts)

        return await self.search(
            query=enhanced_query,
            limit=limit,
            include_context=True,
        )

    def _preprocess_query(self, query: str) -> str:
        """Preprocess search query for better results.

        Collapses whitespace, lower-cases the query, and appends the long form
        after each recognised abbreviation (the abbreviation itself is kept).

        Args:
            query: Raw search query

        Returns:
            Processed query
        """
        # Remove extra whitespace
        query = re.sub(r"\s+", " ", query.strip())

        # Expand common abbreviations
        expansions = {
            "auth": "authentication",
            "db": "database",
            "api": "application programming interface",
            "ui": "user interface",
            "util": "utility",
            "config": "configuration",
        }

        expanded_words: List[str] = []
        for word in query.lower().split():
            expanded_words.append(word)
            if word in expansions:
                expanded_words.append(expansions[word])

        return " ".join(expanded_words)

    async def _enhance_result(
        self, result: SearchResult, include_context: bool
    ) -> SearchResult:
        """Enhance search result with additional information.

        Reads the result's source file and attaches up to three lines of
        context before and after the matched range. Failures are logged as
        warnings and the result is returned without context.

        Args:
            result: Original search result
            include_context: Whether to include context lines

        Returns:
            Enhanced search result (the same object, mutated in place)
        """
        if not include_context:
            return result

        try:
            # Read the source file to get context
            with open(result.file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()

            context_size = 3

            # ``start_line``/``end_line`` are treated as a 1-based inclusive
            # range. Clamp to zero so that a start_line of 0 cannot produce a
            # negative slice bound (which would select almost the whole file).
            chunk_start = max(0, result.start_line - 1)
            chunk_end = max(0, result.end_line)

            before_start = max(0, chunk_start - context_size)
            after_end = min(len(lines), chunk_end + context_size)

            # Update result with context
            result.context_before = [line.rstrip() for line in lines[before_start:chunk_start]]
            result.context_after = [line.rstrip() for line in lines[chunk_end:after_end]]

        except Exception as e:
            logger.warning(f"Failed to get context for {result.file_path}: {e}")

        return result

    def _rerank_results(
        self, results: List[SearchResult], query: str
    ) -> List[SearchResult]:
        """Apply additional ranking to search results.

        Each result's score is boosted when the lower-cased query occurs in
        its function name (+0.1), class name (+0.1) or file name (+0.05),
        capped at 1.0. The list is then sorted by score (descending) and
        ``rank`` is rewritten starting from 1.

        Args:
            results: Original search results (mutated and sorted in place)
            query: Original search query

        Returns:
            Reranked search results (the same list object)
        """
        query_lower = query.lower()

        for result in results:
            boost = 0.0

            # Boost score for matches in function/class names
            if result.function_name and query_lower in result.function_name.lower():
                boost += 0.1

            if result.class_name and query_lower in result.class_name.lower():
                boost += 0.1

            # Boost score for matches in file name
            if query_lower in result.file_path.name.lower():
                boost += 0.05

            # Apply boost
            result.similarity_score = min(1.0, result.similarity_score + boost)

        # Re-sort by similarity score
        results.sort(key=lambda r: r.similarity_score, reverse=True)

        # Update ranks
        for position, result in enumerate(results, start=1):
            result.rank = position

        return results

    def _extract_function_content(self, content: str, function_name: str) -> Optional[str]:
        """Extract content of a specific function from code.

        This is an indentation-based heuristic for Python-style source, not a
        parser. It recognises both ``def`` and ``async def``, tolerates a
        signature that spans several lines, and drops trailing blank lines.

        Args:
            content: Full file content
            function_name: Name of function to extract

        Returns:
            Function content if found, None otherwise
        """
        # Compiled once, outside the line loop.
        header = re.compile(rf"^\s*(?:async\s+)?def\s+{re.escape(function_name)}\s*\(")
        lines = content.splitlines()
        total = len(lines)

        for start, line in enumerate(lines):
            if not header.match(line):
                continue

            indent_level = len(line) - len(line.lstrip())

            # Skip over the signature. With a multi-line signature the closing
            # "):" sits at the same indent as "def" and would otherwise be
            # mistaken for the end of the function. Bracket counting is
            # approximate (brackets inside string literals are not excluded).
            depth = 0
            body_start = total
            for k in range(start, total):
                text = lines[k]
                depth += sum(text.count(ch) for ch in "([{")
                depth -= sum(text.count(ch) for ch in ")]}")
                if depth <= 0:
                    body_start = k + 1
                    break

            # The function ends at the first non-blank line that is indented
            # no deeper than the "def" line.
            end = total
            for j in range(body_start, total):
                candidate = lines[j]
                if candidate.strip() and len(candidate) - len(candidate.lstrip()) <= indent_level:
                    end = j
                    break

            # Blank lines separating this function from the next definition
            # are not part of the function.
            while end > start + 1 and not lines[end - 1].strip():
                end -= 1

            return "\n".join(lines[start:end])

        return None

    async def get_search_stats(self) -> Dict[str, Any]:
        """Get search engine statistics.

        Returns:
            Dictionary with search statistics, or ``{"error": <message>}``
            when the database statistics cannot be retrieved.
        """
        try:
            db_stats = await self.database.get_stats()

            return {
                "total_chunks": db_stats.total_chunks,
                "languages": db_stats.languages,
                "similarity_threshold": self.similarity_threshold,
                "project_root": str(self.project_root),
            }

        except Exception as e:
            logger.error(f"Failed to get search stats: {e}")
            return {"error": str(e)}