mcp-vector-search 0.0.3__py3-none-any.whl → 0.4.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +3 -2
- mcp_vector_search/cli/commands/auto_index.py +397 -0
- mcp_vector_search/cli/commands/config.py +88 -40
- mcp_vector_search/cli/commands/index.py +198 -52
- mcp_vector_search/cli/commands/init.py +471 -58
- mcp_vector_search/cli/commands/install.py +284 -0
- mcp_vector_search/cli/commands/mcp.py +495 -0
- mcp_vector_search/cli/commands/search.py +241 -87
- mcp_vector_search/cli/commands/status.py +184 -58
- mcp_vector_search/cli/commands/watch.py +34 -35
- mcp_vector_search/cli/didyoumean.py +184 -0
- mcp_vector_search/cli/export.py +320 -0
- mcp_vector_search/cli/history.py +292 -0
- mcp_vector_search/cli/interactive.py +342 -0
- mcp_vector_search/cli/main.py +175 -27
- mcp_vector_search/cli/output.py +63 -45
- mcp_vector_search/config/defaults.py +50 -36
- mcp_vector_search/config/settings.py +49 -35
- mcp_vector_search/core/auto_indexer.py +298 -0
- mcp_vector_search/core/connection_pool.py +322 -0
- mcp_vector_search/core/database.py +335 -25
- mcp_vector_search/core/embeddings.py +73 -29
- mcp_vector_search/core/exceptions.py +19 -2
- mcp_vector_search/core/factory.py +310 -0
- mcp_vector_search/core/git_hooks.py +345 -0
- mcp_vector_search/core/indexer.py +237 -73
- mcp_vector_search/core/models.py +21 -19
- mcp_vector_search/core/project.py +73 -58
- mcp_vector_search/core/scheduler.py +330 -0
- mcp_vector_search/core/search.py +574 -86
- mcp_vector_search/core/watcher.py +48 -46
- mcp_vector_search/mcp/__init__.py +4 -0
- mcp_vector_search/mcp/__main__.py +25 -0
- mcp_vector_search/mcp/server.py +701 -0
- mcp_vector_search/parsers/base.py +30 -31
- mcp_vector_search/parsers/javascript.py +74 -48
- mcp_vector_search/parsers/python.py +57 -49
- mcp_vector_search/parsers/registry.py +47 -32
- mcp_vector_search/parsers/text.py +179 -0
- mcp_vector_search/utils/__init__.py +40 -0
- mcp_vector_search/utils/gitignore.py +229 -0
- mcp_vector_search/utils/timing.py +334 -0
- mcp_vector_search/utils/version.py +47 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/METADATA +173 -7
- mcp_vector_search-0.4.12.dist-info/RECORD +54 -0
- mcp_vector_search-0.0.3.dist-info/RECORD +0 -35
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.12.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/indexer.py
CHANGED
@@ -1,13 +1,15 @@
 """Semantic indexer for MCP Vector Search."""

 import asyncio
+import json
+import os
 from pathlib import Path
-from typing import List, Optional, Set

 from loguru import logger

 from ..config.defaults import DEFAULT_IGNORE_PATTERNS
 from ..parsers.registry import get_parser_registry
+from ..utils.gitignore import create_gitignore_parser, GitignoreParser
 from .database import VectorDatabase
 from .exceptions import ParsingError
 from .models import CodeChunk
@@ -20,20 +22,39 @@ class SemanticIndexer:
         self,
         database: VectorDatabase,
         project_root: Path,
-        file_extensions:
+        file_extensions: list[str],
+        max_workers: int | None = None,
+        batch_size: int = 10,
     ) -> None:
         """Initialize semantic indexer.
-
+
         Args:
             database: Vector database instance
             project_root: Project root directory
             file_extensions: File extensions to index
+            max_workers: Maximum number of worker threads for parallel processing
+            batch_size: Number of files to process in each batch
         """
         self.database = database
         self.project_root = project_root
-        self.file_extensions =
+        self.file_extensions = {ext.lower() for ext in file_extensions}
         self.parser_registry = get_parser_registry()
         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+        self.max_workers = max_workers or min(
+            4, (asyncio.get_event_loop().get_debug() and 1) or 4
+        )
+        self.batch_size = batch_size
+        self._index_metadata_file = (
+            project_root / ".mcp-vector-search" / "index_metadata.json"
+        )
+
+        # Initialize gitignore parser
+        try:
+            self.gitignore_parser = create_gitignore_parser(project_root)
+            logger.debug(f"Loaded {len(self.gitignore_parser.patterns)} gitignore patterns")
+        except Exception as e:
+            logger.warning(f"Failed to load gitignore patterns: {e}")
+            self.gitignore_parser = None

     async def index_project(
         self,
@@ -41,60 +62,191 @@ class SemanticIndexer:
         show_progress: bool = True,
     ) -> int:
         """Index all files in the project.
-
+
         Args:
             force_reindex: Whether to reindex existing files
             show_progress: Whether to show progress information
-
+
         Returns:
             Number of files indexed
         """
         logger.info(f"Starting indexing of project: {self.project_root}")
-
+
         # Find all indexable files
-
-
-        if not
+        all_files = self._find_indexable_files()
+
+        if not all_files:
             logger.warning("No indexable files found")
             return 0
-
-
-
-
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        if not files_to_index:
+            logger.info("All files are up to date")
+            return 0
+
+        # Index files in parallel batches
         indexed_count = 0
         failed_count = 0
-
-
-
-
-
-
-
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            if show_progress:
+                logger.info(
+                    f"Processing batch {i // self.batch_size + 1}/{(len(files_to_index) + self.batch_size - 1) // self.batch_size} ({len(batch)} files)"
+                )
+
+            # Process batch in parallel
+            batch_results = await self._process_file_batch(batch, force_reindex)
+
+            # Count results
+            for success in batch_results:
                 if success:
                     indexed_count += 1
                 else:
                     failed_count += 1
-
-
-
-
+
+        # Update metadata for successfully indexed files
+        if indexed_count > 0:
+            for file_path in files_to_index:
+                try:
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+                except OSError:
+                    pass  # File might have been deleted during indexing
+
+            self._save_index_metadata(metadata)
+
         logger.info(
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
-
+
         return indexed_count

+    async def _process_file_batch(
+        self, file_paths: list[Path], force_reindex: bool = False
+    ) -> list[bool]:
+        """Process a batch of files in parallel.
+
+        Args:
+            file_paths: List of file paths to process
+            force_reindex: Whether to force reindexing
+
+        Returns:
+            List of success flags for each file
+        """
+        # Create tasks for parallel processing
+        tasks = []
+        for file_path in file_paths:
+            task = asyncio.create_task(self._index_file_safe(file_path, force_reindex))
+            tasks.append(task)
+
+        # Wait for all tasks to complete
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Convert results to success flags
+        success_flags = []
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.error(f"Failed to index {file_paths[i]}: {result}")
+                success_flags.append(False)
+            else:
+                success_flags.append(result)
+
+        return success_flags
+
+    def _load_index_metadata(self) -> dict[str, float]:
+        """Load file modification times from metadata file.
+
+        Returns:
+            Dictionary mapping file paths to modification times
+        """
+        if not self._index_metadata_file.exists():
+            return {}
+
+        try:
+            with open(self._index_metadata_file) as f:
+                return json.load(f)
+        except Exception as e:
+            logger.warning(f"Failed to load index metadata: {e}")
+            return {}
+
+    def _save_index_metadata(self, metadata: dict[str, float]) -> None:
+        """Save file modification times to metadata file.
+
+        Args:
+            metadata: Dictionary mapping file paths to modification times
+        """
+        try:
+            # Ensure directory exists
+            self._index_metadata_file.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(self._index_metadata_file, "w") as f:
+                json.dump(metadata, f, indent=2)
+        except Exception as e:
+            logger.warning(f"Failed to save index metadata: {e}")
+
+    def _needs_reindexing(self, file_path: Path, metadata: dict[str, float]) -> bool:
+        """Check if a file needs reindexing based on modification time.
+
+        Args:
+            file_path: Path to the file
+            metadata: Current metadata dictionary
+
+        Returns:
+            True if file needs reindexing
+        """
+        try:
+            current_mtime = os.path.getmtime(file_path)
+            stored_mtime = metadata.get(str(file_path), 0)
+            return current_mtime > stored_mtime
+        except OSError:
+            # File doesn't exist or can't be accessed
+            return False
+
+    async def _index_file_safe(
+        self, file_path: Path, force_reindex: bool = False
+    ) -> bool:
+        """Safely index a single file with error handling.
+
+        Args:
+            file_path: Path to the file to index
+            force_reindex: Whether to force reindexing
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            return await self.index_file(file_path, force_reindex)
+        except Exception as e:
+            logger.error(f"Error indexing {file_path}: {e}")
+            return False
+
     async def index_file(
         self,
         file_path: Path,
         force_reindex: bool = False,
     ) -> bool:
         """Index a single file.
-
+
         Args:
             file_path: Path to the file to index
             force_reindex: Whether to reindex if already indexed
-
+
         Returns:
             True if file was successfully indexed
         """
@@ -102,34 +254,39 @@ class SemanticIndexer:
             # Check if file should be indexed
             if not self._should_index_file(file_path):
                 return False
-
-            #
-
-
-
+
+            # Always remove existing chunks when reindexing a file
+            # This prevents duplicate chunks and ensures consistency
+            await self.database.delete_by_file(file_path)
+
             # Parse file into chunks
             chunks = await self._parse_file(file_path)
-
+
             if not chunks:
                 logger.debug(f"No chunks extracted from {file_path}")
                 return True  # Not an error, just empty file
-
+
             # Add chunks to database
             await self.database.add_chunks(chunks)
-
+
+            # Update metadata after successful indexing
+            metadata = self._load_index_metadata()
+            metadata[str(file_path)] = os.path.getmtime(file_path)
+            self._save_index_metadata(metadata)
+
             logger.debug(f"Indexed {len(chunks)} chunks from {file_path}")
             return True
-
+
         except Exception as e:
             logger.error(f"Failed to index file {file_path}: {e}")
             raise ParsingError(f"Failed to index file {file_path}: {e}") from e

     async def reindex_file(self, file_path: Path) -> bool:
         """Reindex a single file (removes existing chunks first).
-
+
         Args:
             file_path: Path to the file to reindex
-
+
         Returns:
             True if file was successfully reindexed
         """
@@ -137,10 +294,10 @@ class SemanticIndexer:

     async def remove_file(self, file_path: Path) -> int:
         """Remove all chunks for a file from the index.
-
+
         Args:
             file_path: Path to the file to remove
-
+
         Returns:
             Number of chunks removed
         """
@@ -152,41 +309,41 @@ class SemanticIndexer:
             logger.error(f"Failed to remove file {file_path}: {e}")
             return 0

-    def _find_indexable_files(self) ->
+    def _find_indexable_files(self) -> list[Path]:
         """Find all files that should be indexed.
-
+
         Returns:
             List of file paths to index
         """
         indexable_files = []
-
+
         for file_path in self.project_root.rglob("*"):
             if self._should_index_file(file_path):
                 indexable_files.append(file_path)
-
+
         return sorted(indexable_files)

     def _should_index_file(self, file_path: Path) -> bool:
         """Check if a file should be indexed.
-
+
         Args:
             file_path: Path to check
-
+
         Returns:
             True if file should be indexed
         """
         # Must be a file
         if not file_path.is_file():
             return False
-
+
         # Check file extension
         if file_path.suffix.lower() not in self.file_extensions:
             return False
-
+
         # Check if path should be ignored
         if self._should_ignore_path(file_path):
             return False
-
+
         # Check file size (skip very large files)
         try:
             file_size = file_path.stat().st_size
@@ -195,67 +352,74 @@ class SemanticIndexer:
                 return False
         except OSError:
             return False
-
+
         return True

     def _should_ignore_path(self, file_path: Path) -> bool:
         """Check if a path should be ignored.
-
+
         Args:
             file_path: Path to check
-
+
         Returns:
             True if path should be ignored
         """
         try:
+            # First check gitignore rules if available
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path):
+                logger.debug(f"Path ignored by .gitignore: {file_path}")
+                return True
+
             # Get relative path from project root
             relative_path = file_path.relative_to(self.project_root)
-
-            # Check each part of the path
+
+            # Check each part of the path against default ignore patterns
             for part in relative_path.parts:
                 if part in self._ignore_patterns:
+                    logger.debug(f"Path ignored by default pattern '{part}': {file_path}")
                     return True
-
+
             # Check if any parent directory should be ignored
             for parent in relative_path.parents:
                 for part in parent.parts:
                     if part in self._ignore_patterns:
+                        logger.debug(f"Path ignored by parent pattern '{part}': {file_path}")
                         return True
-
+
             return False
-
+
         except ValueError:
             # Path is not relative to project root
             return True

-    async def _parse_file(self, file_path: Path) ->
+    async def _parse_file(self, file_path: Path) -> list[CodeChunk]:
         """Parse a file into code chunks.
-
+
         Args:
             file_path: Path to the file to parse
-
+
         Returns:
             List of code chunks
         """
         try:
             # Get appropriate parser
             parser = self.parser_registry.get_parser_for_file(file_path)
-
+
             # Parse file
             chunks = await parser.parse_file(file_path)
-
+
             # Filter out empty chunks
             valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
-
+
             return valid_chunks
-
+
         except Exception as e:
             logger.error(f"Failed to parse file {file_path}: {e}")
             raise ParsingError(f"Failed to parse file {file_path}: {e}") from e

     def add_ignore_pattern(self, pattern: str) -> None:
         """Add a pattern to ignore during indexing.
-
+
         Args:
             pattern: Pattern to ignore (directory or file name)
         """
@@ -263,15 +427,15 @@ class SemanticIndexer:

     def remove_ignore_pattern(self, pattern: str) -> None:
         """Remove an ignore pattern.
-
+
         Args:
             pattern: Pattern to remove
         """
         self._ignore_patterns.discard(pattern)

-    def get_ignore_patterns(self) ->
+    def get_ignore_patterns(self) -> set[str]:
         """Get current ignore patterns.
-
+
         Returns:
             Set of ignore patterns
         """
@@ -279,17 +443,17 @@ class SemanticIndexer:

     async def get_indexing_stats(self) -> dict:
         """Get statistics about the indexing process.
-
+
         Returns:
             Dictionary with indexing statistics
         """
         try:
             # Get database stats
             db_stats = await self.database.get_stats()
-
+
             # Count indexable files
             indexable_files = self._find_indexable_files()
-
+
             return {
                 "total_indexable_files": len(indexable_files),
                 "indexed_files": db_stats.total_files,
@@ -299,7 +463,7 @@ class SemanticIndexer:
                 "ignore_patterns": list(self._ignore_patterns),
                 "parser_info": self.parser_registry.get_parser_info(),
             }
-
+
         except Exception as e:
             logger.error(f"Failed to get indexing stats: {e}")
             return {
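
Taken together, the hunks above rework SemanticIndexer.index_project() from a single sequential pass into an incremental, batched pipeline: files whose modification time matches the record in .mcp-vector-search/index_metadata.json are skipped, and the remaining files are indexed in parallel batches via asyncio.gather(). A minimal driver sketch follows. It is illustrative only: the import path is inferred from the changed-file list, the database placeholder stands in for whatever concrete VectorDatabase implementation the project builds elsewhere (the new factory and connection-pool modules are not shown in this diff), and the extension list, worker count, and batch size are example values rather than package defaults.

import asyncio
from pathlib import Path

# Module path assumed from the changed-file list above; not shown verbatim in this diff.
from mcp_vector_search.core.indexer import SemanticIndexer


async def main() -> None:
    # Replace with a real VectorDatabase implementation; its construction
    # (embedding model, connection pool, etc.) is outside this sketch.
    database = ...

    indexer = SemanticIndexer(
        database=database,
        project_root=Path("."),
        file_extensions=[".py", ".js"],  # illustrative; extensions are lower-cased internally
        max_workers=4,                   # optional, per the new constructor signature
        batch_size=10,                   # files per parallel batch
    )

    # First run indexes everything; later runs only touch files whose mtime
    # is newer than the value stored in index_metadata.json.
    indexed = await indexer.index_project(force_reindex=False, show_progress=True)
    print(f"Indexed {indexed} files")


asyncio.run(main())
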
mcp_vector_search/core/models.py
CHANGED
@@ -2,7 +2,7 @@

 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any

 from pydantic import BaseModel, Field

@@ -17,12 +17,12 @@ class CodeChunk:
     end_line: int
     language: str
     chunk_type: str = "code"  # code, function, class, comment, docstring
-    function_name:
-    class_name:
-    docstring:
-    imports:
+    function_name: str | None = None
+    class_name: str | None = None
+    docstring: str | None = None
+    imports: list[str] = None
     complexity_score: float = 0.0
-
+
     def __post_init__(self) -> None:
         """Initialize default values."""
         if self.imports is None:
@@ -38,7 +38,7 @@ class CodeChunk:
         """Get the number of lines in this chunk."""
         return self.end_line - self.start_line + 1

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for storage."""
         return {
             "content": self.content,
@@ -55,7 +55,7 @@ class CodeChunk:
         }

     @classmethod
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict[str, Any]) -> "CodeChunk":
         """Create from dictionary."""
         return cls(
             content=data["content"],
@@ -83,11 +83,13 @@ class SearchResult(BaseModel):
     similarity_score: float = Field(..., description="Similarity score (0.0 to 1.0)")
     rank: int = Field(..., description="Result rank in search results")
     chunk_type: str = Field(default="code", description="Type of code chunk")
-    function_name:
-
-
-
-
+    function_name: str | None = Field(
+        default=None, description="Function name if applicable"
+    )
+    class_name: str | None = Field(default=None, description="Class name if applicable")
+    context_before: list[str] = Field(default=[], description="Lines before the match")
+    context_after: list[str] = Field(default=[], description="Lines after the match")
+    highlights: list[str] = Field(default=[], description="Highlighted terms")

     class Config:
         arbitrary_types_allowed = True
@@ -102,7 +104,7 @@ class SearchResult(BaseModel):
         """Get a human-readable location string."""
         return f"{self.file_path}:{self.start_line}-{self.end_line}"

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for serialization."""
         return {
             "content": self.content,
@@ -128,13 +130,13 @@ class IndexStats(BaseModel):

     total_files: int = Field(..., description="Total number of indexed files")
     total_chunks: int = Field(..., description="Total number of code chunks")
-    languages:
-    file_types:
+    languages: dict[str, int] = Field(..., description="Language distribution")
+    file_types: dict[str, int] = Field(..., description="File type distribution")
     index_size_mb: float = Field(..., description="Index size in megabytes")
     last_updated: str = Field(..., description="Last update timestamp")
     embedding_model: str = Field(..., description="Embedding model used")

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for serialization."""
         return {
             "total_files": self.total_files,
@@ -155,13 +157,13 @@ class ProjectInfo(BaseModel):
     config_path: Path = Field(..., description="Configuration file path")
     index_path: Path = Field(..., description="Index directory path")
     is_initialized: bool = Field(..., description="Whether project is initialized")
-    languages:
+    languages: list[str] = Field(default=[], description="Detected languages")
     file_count: int = Field(default=0, description="Number of indexable files")

     class Config:
         arbitrary_types_allowed = True

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for serialization."""
         return {
             "name": self.name,
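
One detail in the CodeChunk hunk above is easy to misread: imports is retyped as list[str] but still defaults to None and is normalized in __post_init__. Dataclasses reject mutable defaults, so either this None-then-normalize idiom or a default_factory is needed; the standalone sketch below (not code from this package) shows both forms.

from dataclasses import dataclass, field


@dataclass
class WithNoneDefault:
    # Same idiom as CodeChunk.imports: express the mutable default as None,
    # then replace it with a fresh list after construction.
    imports: list[str] | None = None

    def __post_init__(self) -> None:
        if self.imports is None:
            self.imports = []


@dataclass
class WithDefaultFactory:
    # Equivalent alternative: default_factory builds a new list per instance,
    # so the annotation can stay non-optional.
    imports: list[str] = field(default_factory=list)


a, b = WithNoneDefault(), WithNoneDefault()
a.imports.append("os")
assert b.imports == []  # instances do not share a list

assert WithDefaultFactory().imports == []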