mcp_vector_search-0.15.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mcp_vector_search/__init__.py +10 -0
- mcp_vector_search/cli/__init__.py +1 -0
- mcp_vector_search/cli/commands/__init__.py +1 -0
- mcp_vector_search/cli/commands/auto_index.py +397 -0
- mcp_vector_search/cli/commands/chat.py +534 -0
- mcp_vector_search/cli/commands/config.py +393 -0
- mcp_vector_search/cli/commands/demo.py +358 -0
- mcp_vector_search/cli/commands/index.py +762 -0
- mcp_vector_search/cli/commands/init.py +658 -0
- mcp_vector_search/cli/commands/install.py +869 -0
- mcp_vector_search/cli/commands/install_old.py +700 -0
- mcp_vector_search/cli/commands/mcp.py +1254 -0
- mcp_vector_search/cli/commands/reset.py +393 -0
- mcp_vector_search/cli/commands/search.py +796 -0
- mcp_vector_search/cli/commands/setup.py +1133 -0
- mcp_vector_search/cli/commands/status.py +584 -0
- mcp_vector_search/cli/commands/uninstall.py +404 -0
- mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
- mcp_vector_search/cli/commands/visualize/cli.py +265 -0
- mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
- mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
- mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
- mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
- mcp_vector_search/cli/commands/visualize/server.py +201 -0
- mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
- mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
- mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
- mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
- mcp_vector_search/cli/commands/visualize.py.original +2536 -0
- mcp_vector_search/cli/commands/watch.py +287 -0
- mcp_vector_search/cli/didyoumean.py +520 -0
- mcp_vector_search/cli/export.py +320 -0
- mcp_vector_search/cli/history.py +295 -0
- mcp_vector_search/cli/interactive.py +342 -0
- mcp_vector_search/cli/main.py +484 -0
- mcp_vector_search/cli/output.py +414 -0
- mcp_vector_search/cli/suggestions.py +375 -0
- mcp_vector_search/config/__init__.py +1 -0
- mcp_vector_search/config/constants.py +24 -0
- mcp_vector_search/config/defaults.py +200 -0
- mcp_vector_search/config/settings.py +146 -0
- mcp_vector_search/core/__init__.py +1 -0
- mcp_vector_search/core/auto_indexer.py +298 -0
- mcp_vector_search/core/config_utils.py +394 -0
- mcp_vector_search/core/connection_pool.py +360 -0
- mcp_vector_search/core/database.py +1237 -0
- mcp_vector_search/core/directory_index.py +318 -0
- mcp_vector_search/core/embeddings.py +294 -0
- mcp_vector_search/core/exceptions.py +89 -0
- mcp_vector_search/core/factory.py +318 -0
- mcp_vector_search/core/git_hooks.py +345 -0
- mcp_vector_search/core/indexer.py +1002 -0
- mcp_vector_search/core/llm_client.py +453 -0
- mcp_vector_search/core/models.py +294 -0
- mcp_vector_search/core/project.py +350 -0
- mcp_vector_search/core/scheduler.py +330 -0
- mcp_vector_search/core/search.py +952 -0
- mcp_vector_search/core/watcher.py +322 -0
- mcp_vector_search/mcp/__init__.py +5 -0
- mcp_vector_search/mcp/__main__.py +25 -0
- mcp_vector_search/mcp/server.py +752 -0
- mcp_vector_search/parsers/__init__.py +8 -0
- mcp_vector_search/parsers/base.py +296 -0
- mcp_vector_search/parsers/dart.py +605 -0
- mcp_vector_search/parsers/html.py +413 -0
- mcp_vector_search/parsers/javascript.py +643 -0
- mcp_vector_search/parsers/php.py +694 -0
- mcp_vector_search/parsers/python.py +502 -0
- mcp_vector_search/parsers/registry.py +223 -0
- mcp_vector_search/parsers/ruby.py +678 -0
- mcp_vector_search/parsers/text.py +186 -0
- mcp_vector_search/parsers/utils.py +265 -0
- mcp_vector_search/py.typed +1 -0
- mcp_vector_search/utils/__init__.py +42 -0
- mcp_vector_search/utils/gitignore.py +250 -0
- mcp_vector_search/utils/gitignore_updater.py +212 -0
- mcp_vector_search/utils/monorepo.py +339 -0
- mcp_vector_search/utils/timing.py +338 -0
- mcp_vector_search/utils/version.py +47 -0
- mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
- mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
- mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
- mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
- mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/core/indexer.py
@@ -0,0 +1,1002 @@

"""Semantic indexer for MCP Vector Search."""

import asyncio
import json
import os
from datetime import UTC, datetime
from pathlib import Path

from loguru import logger
from packaging import version

from .. import __version__
from ..config.defaults import ALLOWED_DOTFILES, DEFAULT_IGNORE_PATTERNS
from ..config.settings import ProjectConfig
from ..parsers.registry import get_parser_registry
from ..utils.gitignore import create_gitignore_parser
from ..utils.monorepo import MonorepoDetector
from .database import VectorDatabase
from .directory_index import DirectoryIndex
from .exceptions import ParsingError
from .models import CodeChunk, IndexStats


class SemanticIndexer:
    """Semantic indexer for parsing and indexing code files."""

    def __init__(
        self,
        database: VectorDatabase,
        project_root: Path,
        file_extensions: list[str] | None = None,
        config: ProjectConfig | None = None,
        max_workers: int | None = None,
        batch_size: int = 10,
        debug: bool = False,
    ) -> None:
        """Initialize semantic indexer.

        Args:
            database: Vector database instance
            project_root: Project root directory
            file_extensions: File extensions to index (deprecated, use config)
            config: Project configuration (preferred over file_extensions)
            max_workers: Maximum number of worker threads for parallel processing
            batch_size: Number of files to process in each batch
            debug: Enable debug output for hierarchy building
        """
        self.database = database
        self.project_root = project_root

        # Store config for filtering behavior
        self.config = config

        # Handle backward compatibility: use config.file_extensions or fallback to parameter
        if config is not None:
            self.file_extensions = {ext.lower() for ext in config.file_extensions}
        elif file_extensions is not None:
            self.file_extensions = {ext.lower() for ext in file_extensions}
        else:
            raise ValueError("Either config or file_extensions must be provided")

        self.parser_registry = get_parser_registry()
        self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
        self.debug = debug

        # Safely get event loop for max_workers
        try:
            loop = asyncio.get_event_loop()
            self.max_workers = max_workers or min(4, (loop.get_debug() and 1) or 4)
        except RuntimeError:
            # No event loop in current thread
            self.max_workers = max_workers or 4

        self.batch_size = batch_size
        self._index_metadata_file = (
            project_root / ".mcp-vector-search" / "index_metadata.json"
        )

        # Add cache for indexable files to avoid repeated filesystem scans
        self._indexable_files_cache: list[Path] | None = None
        self._cache_timestamp: float = 0
        self._cache_ttl: float = 60.0  # 60 second TTL

        # Initialize gitignore parser (only if respect_gitignore is True)
        if config is None or config.respect_gitignore:
            try:
                self.gitignore_parser = create_gitignore_parser(project_root)
                logger.debug(
                    f"Loaded {len(self.gitignore_parser.patterns)} gitignore patterns"
                )
            except Exception as e:
                logger.warning(f"Failed to load gitignore patterns: {e}")
                self.gitignore_parser = None
        else:
            self.gitignore_parser = None
            logger.debug("Gitignore filtering disabled by configuration")

        # Initialize monorepo detector
        self.monorepo_detector = MonorepoDetector(project_root)
        if self.monorepo_detector.is_monorepo():
            subprojects = self.monorepo_detector.detect_subprojects()
            logger.info(f"Detected monorepo with {len(subprojects)} subprojects")
            for sp in subprojects:
                logger.debug(f"  - {sp.name} ({sp.relative_path})")

        # Initialize directory index
        self.directory_index = DirectoryIndex(
            project_root / ".mcp-vector-search" / "directory_index.json"
        )
        # Load existing directory index
        self.directory_index.load()
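
Illustrative sketch (not part of the package source): a minimal way to construct the indexer from the signature above. `my_database` is a placeholder for any concrete VectorDatabase implementation; the deprecated file_extensions path is shown because its behaviour is fully determined by the constructor, while passing a ProjectConfig is the preferred route.

    from pathlib import Path

    from mcp_vector_search.core.indexer import SemanticIndexer

    def build_indexer(my_database) -> SemanticIndexer:
        # Either config or file_extensions must be supplied; with neither,
        # the constructor raises ValueError.
        return SemanticIndexer(
            database=my_database,              # any VectorDatabase implementation
            project_root=Path.cwd(),
            file_extensions=[".py", ".md"],    # deprecated; a ProjectConfig is preferred
            batch_size=10,
        )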

    async def index_project(
        self,
        force_reindex: bool = False,
        show_progress: bool = True,
    ) -> int:
        """Index all files in the project.

        Args:
            force_reindex: Whether to reindex existing files
            show_progress: Whether to show progress information

        Returns:
            Number of files indexed
        """
        logger.info(f"Starting indexing of project: {self.project_root}")

        # Find all indexable files
        all_files = self._find_indexable_files()

        if not all_files:
            logger.warning("No indexable files found")
            return 0

        # Load existing metadata for incremental indexing
        metadata = self._load_index_metadata()

        # Filter files that need indexing
        if force_reindex:
            files_to_index = all_files
            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
        else:
            files_to_index = [
                f for f in all_files if self._needs_reindexing(f, metadata)
            ]
            logger.info(
                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
            )

        if not files_to_index:
            logger.info("All files are up to date")
            return 0

        # Index files in parallel batches
        indexed_count = 0
        failed_count = 0

        # Process files in batches for better memory management
        for i in range(0, len(files_to_index), self.batch_size):
            batch = files_to_index[i : i + self.batch_size]

            if show_progress:
                logger.info(
                    f"Processing batch {i // self.batch_size + 1}/{(len(files_to_index) + self.batch_size - 1) // self.batch_size} ({len(batch)} files)"
                )

            # Process batch in parallel
            batch_results = await self._process_file_batch(batch, force_reindex)

            # Count results
            for success in batch_results:
                if success:
                    indexed_count += 1
                else:
                    failed_count += 1

        # Update metadata for successfully indexed files
        if indexed_count > 0:
            for file_path in files_to_index:
                try:
                    metadata[str(file_path)] = os.path.getmtime(file_path)
                except OSError:
                    pass  # File might have been deleted during indexing

            self._save_index_metadata(metadata)

            # Rebuild directory index from successfully indexed files
            try:
                logger.debug("Rebuilding directory index...")
                # We don't have chunk counts here, but we have file modification times
                # Build a simple stats dict with file mod times for recency tracking
                chunk_stats = {}
                for file_path in files_to_index:
                    try:
                        mtime = os.path.getmtime(file_path)
                        # For now, just track modification time
                        # Chunk counts will be aggregated from the database later if needed
                        chunk_stats[str(file_path)] = {
                            "modified": mtime,
                            "chunks": 1,  # Placeholder - real count from chunks
                        }
                    except OSError:
                        pass

                self.directory_index.rebuild_from_files(
                    files_to_index, self.project_root, chunk_stats=chunk_stats
                )
                self.directory_index.save()
                dir_stats = self.directory_index.get_stats()
                logger.info(
                    f"Directory index updated: {dir_stats['total_directories']} directories, "
                    f"{dir_stats['total_files']} files"
                )
            except Exception as e:
                logger.error(f"Failed to update directory index: {e}")
                import traceback

                logger.debug(traceback.format_exc())

        logger.info(
            f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
        )

        return indexed_count

    async def _process_file_batch(
        self, file_paths: list[Path], force_reindex: bool = False
    ) -> list[bool]:
        """Process a batch of files in parallel.

        Args:
            file_paths: List of file paths to process
            force_reindex: Whether to force reindexing

        Returns:
            List of success flags for each file
        """
        # Create tasks for parallel processing
        tasks = []
        for file_path in file_paths:
            task = asyncio.create_task(self._index_file_safe(file_path, force_reindex))
            tasks.append(task)

        # Wait for all tasks to complete
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Convert results to success flags
        success_flags = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                logger.error(f"Failed to index {file_paths[i]}: {result}")
                success_flags.append(False)
            else:
                success_flags.append(result)

        return success_flags
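
Illustrative sketch (not part of the package source): the batch loop plus gather-with-exceptions pattern used above, reduced to a self-contained toy so one failing file never aborts a batch. The `process` coroutine is a stand-in, not package code.

    import asyncio

    async def process(item: str) -> bool:
        # Stand-in for SemanticIndexer._index_file_safe
        if item == "bad.py":
            raise RuntimeError("parse failure")
        return True

    async def run(items: list[str], batch_size: int = 10) -> tuple[int, int]:
        indexed = failed = 0
        for i in range(0, len(items), batch_size):
            batch = items[i : i + batch_size]
            results = await asyncio.gather(
                *(process(p) for p in batch), return_exceptions=True
            )
            for result in results:
                # With return_exceptions=True, errors come back as values,
                # so they can be counted instead of propagating.
                if isinstance(result, Exception) or result is False:
                    failed += 1
                else:
                    indexed += 1
        return indexed, failed

    print(asyncio.run(run(["a.py", "bad.py", "b.py"], batch_size=2)))  # (2, 1)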

    def _load_index_metadata(self) -> dict[str, float]:
        """Load file modification times from metadata file.

        Returns:
            Dictionary mapping file paths to modification times
        """
        if not self._index_metadata_file.exists():
            return {}

        try:
            with open(self._index_metadata_file) as f:
                data = json.load(f)
                # Handle legacy format (just file_mtimes dict) and new format
                if "file_mtimes" in data:
                    return data["file_mtimes"]
                else:
                    # Legacy format - just return as-is
                    return data
        except Exception as e:
            logger.warning(f"Failed to load index metadata: {e}")
            return {}

    def _save_index_metadata(self, metadata: dict[str, float]) -> None:
        """Save file modification times to metadata file.

        Args:
            metadata: Dictionary mapping file paths to modification times
        """
        try:
            # Ensure directory exists
            self._index_metadata_file.parent.mkdir(parents=True, exist_ok=True)

            # New metadata format with version tracking
            data = {
                "index_version": __version__,
                "indexed_at": datetime.now(UTC).isoformat(),
                "file_mtimes": metadata,
            }

            with open(self._index_metadata_file, "w") as f:
                json.dump(data, f, indent=2)
        except Exception as e:
            logger.warning(f"Failed to save index metadata: {e}")

    def _needs_reindexing(self, file_path: Path, metadata: dict[str, float]) -> bool:
        """Check if a file needs reindexing based on modification time.

        Args:
            file_path: Path to the file
            metadata: Current metadata dictionary

        Returns:
            True if file needs reindexing
        """
        try:
            current_mtime = os.path.getmtime(file_path)
            stored_mtime = metadata.get(str(file_path), 0)
            return current_mtime > stored_mtime
        except OSError:
            # File doesn't exist or can't be accessed
            return False
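
Illustrative sketch (not part of the package source): the metadata file is plain JSON, and incremental indexing hinges on comparing a file's current mtime to the stored value. This standalone snippet reproduces the format and the check with temporary paths; the timestamp string is a made-up example value.

    import json
    import os
    import tempfile
    import time
    from pathlib import Path

    with tempfile.TemporaryDirectory() as tmp:
        src = Path(tmp) / "example.py"
        src.write_text("print('hi')\n")

        # Same shape as the index_metadata.json written by _save_index_metadata.
        meta = {
            "index_version": "0.15.7",
            "indexed_at": "2025-01-01T00:00:00+00:00",
            "file_mtimes": {str(src): os.path.getmtime(src)},
        }
        meta_file = Path(tmp) / "index_metadata.json"
        meta_file.write_text(json.dumps(meta, indent=2))

        stored = json.loads(meta_file.read_text())["file_mtimes"]
        print(os.path.getmtime(src) > stored.get(str(src), 0))  # False: unchanged

        time.sleep(1)  # allow for coarse mtime resolution on some filesystems
        src.write_text("print('changed')\n")
        print(os.path.getmtime(src) > stored.get(str(src), 0))  # True: needs reindexing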

    async def _index_file_safe(
        self, file_path: Path, force_reindex: bool = False
    ) -> bool:
        """Safely index a single file with error handling.

        Args:
            file_path: Path to the file to index
            force_reindex: Whether to force reindexing

        Returns:
            True if successful, False otherwise
        """
        try:
            return await self.index_file(file_path, force_reindex)
        except Exception as e:
            logger.error(f"Error indexing {file_path}: {e}")
            return False

    async def index_file(
        self,
        file_path: Path,
        force_reindex: bool = False,
    ) -> bool:
        """Index a single file.

        Args:
            file_path: Path to the file to index
            force_reindex: Whether to reindex if already indexed

        Returns:
            True if file was successfully indexed
        """
        try:
            # Check if file should be indexed
            if not self._should_index_file(file_path):
                return False

            # Always remove existing chunks when reindexing a file
            # This prevents duplicate chunks and ensures consistency
            await self.database.delete_by_file(file_path)

            # Parse file into chunks
            chunks = await self._parse_file(file_path)

            if not chunks:
                logger.debug(f"No chunks extracted from {file_path}")
                return True  # Not an error, just empty file

            # Build hierarchical relationships between chunks
            chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)

            # Debug: Check if hierarchy was built
            methods_with_parents = sum(
                1
                for c in chunks_with_hierarchy
                if c.chunk_type in ("method", "function") and c.parent_chunk_id
            )
            logger.debug(
                f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
            )

            # Add chunks to database
            await self.database.add_chunks(chunks_with_hierarchy)

            # Update metadata after successful indexing
            metadata = self._load_index_metadata()
            metadata[str(file_path)] = os.path.getmtime(file_path)
            self._save_index_metadata(metadata)

            logger.debug(f"Indexed {len(chunks)} chunks from {file_path}")
            return True

        except Exception as e:
            logger.error(f"Failed to index file {file_path}: {e}")
            raise ParsingError(f"Failed to index file {file_path}: {e}") from e

    async def reindex_file(self, file_path: Path) -> bool:
        """Reindex a single file (removes existing chunks first).

        Args:
            file_path: Path to the file to reindex

        Returns:
            True if file was successfully reindexed
        """
        return await self.index_file(file_path, force_reindex=True)

    async def remove_file(self, file_path: Path) -> int:
        """Remove all chunks for a file from the index.

        Args:
            file_path: Path to the file to remove

        Returns:
            Number of chunks removed
        """
        try:
            count = await self.database.delete_by_file(file_path)
            logger.debug(f"Removed {count} chunks for {file_path}")
            return count
        except Exception as e:
            logger.error(f"Failed to remove file {file_path}: {e}")
            return 0
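
Illustrative sketch (not part of the package source): index_file deletes a file's existing chunks before inserting new ones, which makes reindexing idempotent. A toy in-memory stand-in for the VectorDatabase interface (hypothetical, not the package's implementation) makes that visible.

    import asyncio

    class ToyDatabase:
        def __init__(self) -> None:
            self.chunks: list[tuple[str, str]] = []  # (file, content)

        async def delete_by_file(self, file_path: str) -> int:
            before = len(self.chunks)
            self.chunks = [c for c in self.chunks if c[0] != file_path]
            return before - len(self.chunks)

        async def add_chunks(self, chunks: list[tuple[str, str]]) -> None:
            self.chunks.extend(chunks)

    async def reindex(db: ToyDatabase, path: str, parsed: list[str]) -> None:
        await db.delete_by_file(path)                    # drop stale chunks first
        await db.add_chunks([(path, c) for c in parsed])

    async def main() -> None:
        db = ToyDatabase()
        await reindex(db, "a.py", ["def f(): ...", "def g(): ..."])
        await reindex(db, "a.py", ["def f(): ..."])      # second pass does not duplicate
        print(len(db.chunks))  # 1

    asyncio.run(main())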

    def _find_indexable_files(self) -> list[Path]:
        """Find all files that should be indexed with caching.

        Returns:
            List of file paths to index
        """
        import time

        # Check cache
        current_time = time.time()
        if (
            self._indexable_files_cache is not None
            and current_time - self._cache_timestamp < self._cache_ttl
        ):
            logger.debug(
                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
            )
            return self._indexable_files_cache

        # Rebuild cache using efficient directory filtering
        logger.debug("Rebuilding indexable files cache...")
        indexable_files = self._scan_files_sync()

        self._indexable_files_cache = sorted(indexable_files)
        self._cache_timestamp = current_time
        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")

        return self._indexable_files_cache

    def _scan_files_sync(self) -> list[Path]:
        """Synchronous file scanning (runs in thread pool).

        Uses os.walk with directory filtering to avoid traversing ignored directories.

        Returns:
            List of indexable file paths
        """
        indexable_files = []

        # Use os.walk for efficient directory traversal with early filtering
        for root, dirs, files in os.walk(self.project_root):
            root_path = Path(root)

            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
            # This is much more efficient than checking every file in ignored directories
            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
            dirs[:] = [
                d
                for d in dirs
                if not self._should_ignore_path(root_path / d, is_directory=True)
            ]

            # Check each file in the current directory
            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
            for filename in files:
                file_path = root_path / filename
                if self._should_index_file(file_path, skip_file_check=True):
                    indexable_files.append(file_path)

        return indexable_files

    async def _find_indexable_files_async(self) -> list[Path]:
        """Find all files asynchronously without blocking event loop.

        Returns:
            List of file paths to index
        """
        import time
        from concurrent.futures import ThreadPoolExecutor

        # Check cache first
        current_time = time.time()
        if (
            self._indexable_files_cache is not None
            and current_time - self._cache_timestamp < self._cache_ttl
        ):
            logger.debug(
                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
            )
            return self._indexable_files_cache

        # Run filesystem scan in thread pool to avoid blocking
        logger.debug("Scanning files in background thread...")
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=1) as executor:
            indexable_files = await loop.run_in_executor(
                executor, self._scan_files_sync
            )

        # Update cache
        self._indexable_files_cache = sorted(indexable_files)
        self._cache_timestamp = current_time
        logger.debug(f"Found {len(indexable_files)} indexable files")

        return self._indexable_files_cache
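
Illustrative sketch (not part of the package source): the scan above relies on mutating os.walk's dirs list in place so ignored directories are never descended into. The same pruning technique in isolation, with a generic ignore set rather than the package's pattern list:

    import os
    from pathlib import Path

    IGNORED = {".git", "node_modules", "__pycache__"}

    def scan(root: str, extensions: set[str]) -> list[Path]:
        found: list[Path] = []
        for current, dirs, files in os.walk(root):
            # Assigning to dirs[:] (not rebinding dirs) prunes the traversal itself,
            # so ignored trees are never walked at all.
            dirs[:] = [d for d in dirs if d not in IGNORED]
            for name in files:
                if Path(name).suffix.lower() in extensions:
                    found.append(Path(current) / name)
        return sorted(found)

    print(scan(".", {".py"})[:5])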

    def _should_index_file(
        self, file_path: Path, skip_file_check: bool = False
    ) -> bool:
        """Check if a file should be indexed.

        Args:
            file_path: Path to check
            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)

        Returns:
            True if file should be indexed
        """
        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
        # This eliminates most files without any filesystem calls
        if file_path.suffix.lower() not in self.file_extensions:
            return False

        # PERFORMANCE: Only check is_file() if not coming from os.walk
        # os.walk already guarantees files, so we skip this expensive check
        if not skip_file_check and not file_path.is_file():
            return False

        # Check if path should be ignored
        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
        if self._should_ignore_path(file_path, is_directory=False):
            return False

        # Check file size (skip very large files)
        try:
            file_size = file_path.stat().st_size
            if file_size > 10 * 1024 * 1024:  # 10MB limit
                logger.warning(f"Skipping large file: {file_path} ({file_size} bytes)")
                return False
        except OSError:
            return False

        return True

    def _should_ignore_path(
        self, file_path: Path, is_directory: bool | None = None
    ) -> bool:
        """Check if a path should be ignored.

        Args:
            file_path: Path to check
            is_directory: Optional hint if path is a directory (avoids filesystem check)

        Returns:
            True if path should be ignored
        """
        try:
            # Get relative path from project root for checking
            relative_path = file_path.relative_to(self.project_root)

            # 1. Check dotfile filtering (if enabled in config)
            if self.config and self.config.skip_dotfiles:
                for part in relative_path.parts:
                    # Skip dotfiles unless they're in the whitelist
                    if part.startswith(".") and part not in ALLOWED_DOTFILES:
                        logger.debug(
                            f"Path ignored by dotfile filter '{part}': {file_path}"
                        )
                        return True

            # 2. Check gitignore rules if available and enabled
            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
            if self.config and self.config.respect_gitignore:
                if self.gitignore_parser and self.gitignore_parser.is_ignored(
                    file_path, is_directory=is_directory
                ):
                    logger.debug(f"Path ignored by .gitignore: {file_path}")
                    return True

            # 3. Check each part of the path against default ignore patterns
            for part in relative_path.parts:
                if part in self._ignore_patterns:
                    logger.debug(
                        f"Path ignored by default pattern '{part}': {file_path}"
                    )
                    return True

            # 4. Check if any parent directory should be ignored
            for parent in relative_path.parents:
                for part in parent.parts:
                    if part in self._ignore_patterns:
                        logger.debug(
                            f"Path ignored by parent pattern '{part}': {file_path}"
                        )
                        return True

            return False

        except ValueError:
            # Path is not relative to project root
            return True
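
Illustrative sketch (not part of the package source): _should_ignore_path applies its checks in a fixed, cheapest-first order (dotfile filter, gitignore, default patterns, parent directories) and returns on the first match. The same short-circuit layering expressed generically; the predicates here are placeholders, not the package's real parsers:

    from pathlib import Path
    from typing import Callable

    Check = Callable[[Path], bool]

    def make_ignore_check(checks: list[Check]) -> Check:
        def should_ignore(path: Path) -> bool:
            # any() short-circuits: later (more expensive) checks never run
            # once an earlier one has already decided the path is ignored.
            return any(check(path) for check in checks)
        return should_ignore

    def is_dotfile(path: Path) -> bool:
        return any(part.startswith(".") for part in path.parts)

    def in_default_ignores(path: Path) -> bool:
        return any(part in {"node_modules", "__pycache__"} for part in path.parts)

    should_ignore = make_ignore_check([is_dotfile, in_default_ignores])
    print(should_ignore(Path(".venv/lib/site.py")))  # True
    print(should_ignore(Path("src/app/main.py")))    # False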

    async def _parse_file(self, file_path: Path) -> list[CodeChunk]:
        """Parse a file into code chunks.

        Args:
            file_path: Path to the file to parse

        Returns:
            List of code chunks with subproject information
        """
        try:
            # Get appropriate parser
            parser = self.parser_registry.get_parser_for_file(file_path)

            # Parse file
            chunks = await parser.parse_file(file_path)

            # Filter out empty chunks
            valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]

            # Assign subproject information for monorepos
            subproject = self.monorepo_detector.get_subproject_for_file(file_path)
            if subproject:
                for chunk in valid_chunks:
                    chunk.subproject_name = subproject.name
                    chunk.subproject_path = subproject.relative_path

            return valid_chunks

        except Exception as e:
            logger.error(f"Failed to parse file {file_path}: {e}")
            raise ParsingError(f"Failed to parse file {file_path}: {e}") from e

    def add_ignore_pattern(self, pattern: str) -> None:
        """Add a pattern to ignore during indexing.

        Args:
            pattern: Pattern to ignore (directory or file name)
        """
        self._ignore_patterns.add(pattern)

    def remove_ignore_pattern(self, pattern: str) -> None:
        """Remove an ignore pattern.

        Args:
            pattern: Pattern to remove
        """
        self._ignore_patterns.discard(pattern)

    def get_ignore_patterns(self) -> set[str]:
        """Get current ignore patterns.

        Returns:
            Set of ignore patterns
        """
        return self._ignore_patterns.copy()

    def get_index_version(self) -> str | None:
        """Get the version of the tool that created the current index.

        Returns:
            Version string or None if not available
        """
        if not self._index_metadata_file.exists():
            return None

        try:
            with open(self._index_metadata_file) as f:
                data = json.load(f)
                return data.get("index_version")
        except Exception as e:
            logger.warning(f"Failed to read index version: {e}")
            return None

    def needs_reindex_for_version(self) -> bool:
        """Check if reindex is needed due to version upgrade.

        Returns:
            True if reindex is needed for version compatibility
        """
        index_version = self.get_index_version()

        if not index_version:
            # No version recorded - this is either a new index or legacy format
            # Reindex to establish version tracking
            return True

        try:
            current = version.parse(__version__)
            indexed = version.parse(index_version)

            # Reindex on major or minor version change
            # Patch versions (0.5.1 -> 0.5.2) don't require reindex
            needs_reindex = (
                current.major != indexed.major or current.minor != indexed.minor
            )

            if needs_reindex:
                logger.info(
                    f"Version upgrade detected: {index_version} -> {__version__} "
                    f"(reindex recommended)"
                )

            return needs_reindex

        except Exception as e:
            logger.warning(f"Failed to compare versions: {e}")
            # If we can't parse versions, be safe and reindex
            return True
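
Illustrative sketch (not part of the package source): the reindex-on-upgrade rule above compares only the major and minor components using packaging.version. The same check as a standalone function:

    from packaging import version

    def needs_reindex(indexed: str, current: str) -> bool:
        a, b = version.parse(indexed), version.parse(current)
        # Patch bumps (e.g. 0.15.6 -> 0.15.7) keep the index; minor or major
        # bumps invalidate it.
        return a.major != b.major or a.minor != b.minor

    print(needs_reindex("0.15.6", "0.15.7"))  # False
    print(needs_reindex("0.14.2", "0.15.7"))  # True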

    async def get_indexing_stats(self, db_stats: IndexStats | None = None) -> dict:
        """Get statistics about the indexing process.

        Args:
            db_stats: Optional pre-fetched database stats to avoid duplicate queries

        Returns:
            Dictionary with indexing statistics

        Note:
            Uses database statistics only for performance on large projects.
            Filesystem scanning would timeout on 100K+ file projects.
            Pass db_stats parameter to avoid calling database.get_stats() twice.
        """
        try:
            # Get database stats if not provided (fast, no filesystem scan)
            if db_stats is None:
                db_stats = await self.database.get_stats()

            # Use database stats for all file counts
            # This avoids expensive filesystem scans on large projects
            return {
                "total_indexable_files": db_stats.total_files,
                "indexed_files": db_stats.total_files,
                "total_files": db_stats.total_files,  # For backward compatibility
                "total_chunks": db_stats.total_chunks,
                "languages": db_stats.languages,
                "file_types": db_stats.file_types,  # Include file type distribution
                "file_extensions": list(self.file_extensions),
                "ignore_patterns": list(self._ignore_patterns),
                "parser_info": self.parser_registry.get_parser_info(),
            }

        except Exception as e:
            logger.error(f"Failed to get indexing stats: {e}")
            return {
                "error": str(e),
                "total_indexable_files": 0,
                "indexed_files": 0,
                "total_files": 0,
                "total_chunks": 0,
            }

    async def get_files_to_index(
        self, force_reindex: bool = False
    ) -> tuple[list[Path], list[Path]]:
        """Get all indexable files and those that need indexing.

        Args:
            force_reindex: Whether to force reindex of all files

        Returns:
            Tuple of (all_indexable_files, files_to_index)
        """
        # Find all indexable files
        all_files = await self._find_indexable_files_async()

        if not all_files:
            return [], []

        # Load existing metadata for incremental indexing
        metadata = self._load_index_metadata()

        # Filter files that need indexing
        if force_reindex:
            files_to_index = all_files
            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
        else:
            files_to_index = [
                f for f in all_files if self._needs_reindexing(f, metadata)
            ]
            logger.info(
                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
            )

        return all_files, files_to_index

    async def index_files_with_progress(
        self,
        files_to_index: list[Path],
        force_reindex: bool = False,
    ):
        """Index files and yield progress updates for each file.

        Args:
            files_to_index: List of file paths to index
            force_reindex: Whether to force reindexing

        Yields:
            Tuple of (file_path, chunks_added, success) for each processed file
        """
        # Write version header to error log at start of indexing run
        self._write_indexing_run_header()

        metadata = self._load_index_metadata()

        # Process files in batches for better memory management
        for i in range(0, len(files_to_index), self.batch_size):
            batch = files_to_index[i : i + self.batch_size]

            # Process each file in the batch
            for file_path in batch:
                chunks_added = 0
                success = False

                try:
                    # Always remove existing chunks when reindexing
                    await self.database.delete_by_file(file_path)

                    # Parse file into chunks
                    chunks = await self._parse_file(file_path)

                    if chunks:
                        # Build hierarchical relationships
                        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)

                        # Add chunks to database
                        await self.database.add_chunks(chunks_with_hierarchy)
                        chunks_added = len(chunks)
                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")

                    success = True

                    # Update metadata after successful indexing
                    metadata[str(file_path)] = os.path.getmtime(file_path)

                except Exception as e:
                    error_msg = f"Failed to index file {file_path}: {type(e).__name__}: {str(e)}"
                    logger.error(error_msg)
                    success = False

                    # Save error to error log file
                    try:
                        error_log_path = (
                            self.project_root
                            / ".mcp-vector-search"
                            / "indexing_errors.log"
                        )
                        with open(error_log_path, "a", encoding="utf-8") as f:
                            from datetime import datetime

                            timestamp = datetime.now().isoformat()
                            f.write(f"[{timestamp}] {error_msg}\n")
                    except Exception as log_err:
                        logger.debug(f"Failed to write error log: {log_err}")

                # Yield progress update
                yield (file_path, chunks_added, success)

        # Save metadata at the end
        self._save_index_metadata(metadata)
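
Illustrative sketch (not part of the package source): index_files_with_progress is an async generator, so a caller drives it with `async for` and reacts to each (file_path, chunks_added, success) tuple. The progress reporting below is a hypothetical consumer, not the CLI shipped in this package.

    import asyncio
    from pathlib import Path

    async def run_with_progress(indexer, files: list[Path]) -> None:
        done = 0
        async for file_path, chunks_added, success in indexer.index_files_with_progress(files):
            done += 1
            status = "ok" if success else "FAILED"
            print(f"[{done}/{len(files)}] {file_path} -> {chunks_added} chunks ({status})")

    # asyncio.run(run_with_progress(indexer, files))  # given an initialized SemanticIndexer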

    def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
        """Build parent-child relationships between chunks.

        Logic:
        - Module chunks (chunk_type="module") have depth 0
        - Class chunks have depth 1, parent is module
        - Method chunks have depth 2, parent is class
        - Function chunks outside classes have depth 1, parent is module
        - Nested classes increment depth

        Args:
            chunks: List of code chunks to process

        Returns:
            List of chunks with hierarchy relationships established
        """
        if not chunks:
            return chunks

        # Group chunks by type and name
        module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
        class_chunks = [
            c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
        ]
        function_chunks = [
            c for c in chunks if c.chunk_type in ("function", "method", "constructor")
        ]

        # DEBUG: Print what we have (if debug enabled)
        if self.debug:
            import sys

            print(
                f"\n[DEBUG] Building hierarchy: {len(module_chunks)} modules, {len(class_chunks)} classes, {len(function_chunks)} functions",
                file=sys.stderr,
            )
            if class_chunks:
                print(
                    f"[DEBUG] Class names: {[c.class_name for c in class_chunks[:5]]}",
                    file=sys.stderr,
                )
            if function_chunks:
                print(
                    f"[DEBUG] First 5 functions with class_name: {[(f.function_name, f.class_name) for f in function_chunks[:5]]}",
                    file=sys.stderr,
                )

        # Build relationships
        for func in function_chunks:
            if func.class_name:
                # Find parent class
                parent_class = next(
                    (c for c in class_chunks if c.class_name == func.class_name), None
                )
                if parent_class:
                    func.parent_chunk_id = parent_class.chunk_id
                    func.chunk_depth = parent_class.chunk_depth + 1
                    if func.chunk_id not in parent_class.child_chunk_ids:
                        parent_class.child_chunk_ids.append(func.chunk_id)
                    if self.debug:
                        import sys

                        print(
                            f"[DEBUG] ✓ Linked '{func.function_name}' to class '{parent_class.class_name}'",
                            file=sys.stderr,
                        )
                    logger.debug(
                        f"Linked method '{func.function_name}' (ID: {func.chunk_id[:8]}) to class '{parent_class.class_name}' (ID: {parent_class.chunk_id[:8]})"
                    )
            else:
                # Top-level function
                if not func.chunk_depth:
                    func.chunk_depth = 1
                # Link to module if exists
                if module_chunks and not func.parent_chunk_id:
                    func.parent_chunk_id = module_chunks[0].chunk_id
                    if func.chunk_id not in module_chunks[0].child_chunk_ids:
                        module_chunks[0].child_chunk_ids.append(func.chunk_id)

        for cls in class_chunks:
            # Classes without parent are top-level (depth 1)
            if not cls.chunk_depth:
                cls.chunk_depth = 1
            # Link to module if exists
            if module_chunks and not cls.parent_chunk_id:
                cls.parent_chunk_id = module_chunks[0].chunk_id
                if cls.chunk_id not in module_chunks[0].child_chunk_ids:
                    module_chunks[0].child_chunk_ids.append(cls.chunk_id)

        # Module chunks stay at depth 0
        for mod in module_chunks:
            if not mod.chunk_depth:
                mod.chunk_depth = 0

        # DEBUG: Print summary
        if self.debug:
            import sys

            funcs_with_parents = sum(1 for f in function_chunks if f.parent_chunk_id)
            classes_with_parents = sum(1 for c in class_chunks if c.parent_chunk_id)
            print(
                f"[DEBUG] Hierarchy built: {funcs_with_parents}/{len(function_chunks)} functions linked, {classes_with_parents}/{len(class_chunks)} classes linked\n",
                file=sys.stderr,
            )

        return chunks
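
Illustrative sketch (not part of the package source): a toy reproduction of the linking step above, using a simplified stand-in for CodeChunk that carries only the fields the hierarchy pass touches (the real model lives in core/models.py).

    from dataclasses import dataclass, field

    @dataclass
    class ToyChunk:
        chunk_id: str
        chunk_type: str
        class_name: str | None = None
        function_name: str | None = None
        parent_chunk_id: str | None = None
        child_chunk_ids: list[str] = field(default_factory=list)
        chunk_depth: int = 0

    cls_chunk = ToyChunk("c1", "class", class_name="Greeter", chunk_depth=1)
    method = ToyChunk("f1", "method", class_name="Greeter", function_name="hello")

    # Same rule as _build_chunk_hierarchy: a method whose class_name matches a
    # class chunk becomes that class's child, one level deeper.
    method.parent_chunk_id = cls_chunk.chunk_id
    method.chunk_depth = cls_chunk.chunk_depth + 1
    cls_chunk.child_chunk_ids.append(method.chunk_id)

    print(method.parent_chunk_id, method.chunk_depth, cls_chunk.child_chunk_ids)
    # c1 2 ['f1']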

    def _write_indexing_run_header(self) -> None:
        """Write version and timestamp header to error log at start of indexing run."""
        try:
            error_log_path = (
                self.project_root / ".mcp-vector-search" / "indexing_errors.log"
            )
            error_log_path.parent.mkdir(parents=True, exist_ok=True)

            with open(error_log_path, "a", encoding="utf-8") as f:
                timestamp = datetime.now(UTC).isoformat()
                separator = "=" * 80
                f.write(f"\n{separator}\n")
                f.write(
                    f"[{timestamp}] Indexing run started - mcp-vector-search v{__version__}\n"
                )
                f.write(f"{separator}\n")
        except Exception as e:
            logger.debug(f"Failed to write indexing run header: {e}")