mcp-vector-search 0.12.6__py3-none-any.whl → 1.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_vector_search/__init__.py +3 -3
- mcp_vector_search/analysis/__init__.py +111 -0
- mcp_vector_search/analysis/baseline/__init__.py +68 -0
- mcp_vector_search/analysis/baseline/comparator.py +462 -0
- mcp_vector_search/analysis/baseline/manager.py +621 -0
- mcp_vector_search/analysis/collectors/__init__.py +74 -0
- mcp_vector_search/analysis/collectors/base.py +164 -0
- mcp_vector_search/analysis/collectors/cohesion.py +463 -0
- mcp_vector_search/analysis/collectors/complexity.py +743 -0
- mcp_vector_search/analysis/collectors/coupling.py +1162 -0
- mcp_vector_search/analysis/collectors/halstead.py +514 -0
- mcp_vector_search/analysis/collectors/smells.py +325 -0
- mcp_vector_search/analysis/debt.py +516 -0
- mcp_vector_search/analysis/interpretation.py +685 -0
- mcp_vector_search/analysis/metrics.py +414 -0
- mcp_vector_search/analysis/reporters/__init__.py +7 -0
- mcp_vector_search/analysis/reporters/console.py +646 -0
- mcp_vector_search/analysis/reporters/markdown.py +480 -0
- mcp_vector_search/analysis/reporters/sarif.py +377 -0
- mcp_vector_search/analysis/storage/__init__.py +93 -0
- mcp_vector_search/analysis/storage/metrics_store.py +762 -0
- mcp_vector_search/analysis/storage/schema.py +245 -0
- mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
- mcp_vector_search/analysis/trends.py +308 -0
- mcp_vector_search/analysis/visualizer/__init__.py +90 -0
- mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
- mcp_vector_search/analysis/visualizer/exporter.py +484 -0
- mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
- mcp_vector_search/analysis/visualizer/schemas.py +525 -0
- mcp_vector_search/cli/commands/analyze.py +1062 -0
- mcp_vector_search/cli/commands/chat.py +1455 -0
- mcp_vector_search/cli/commands/index.py +621 -5
- mcp_vector_search/cli/commands/index_background.py +467 -0
- mcp_vector_search/cli/commands/init.py +13 -0
- mcp_vector_search/cli/commands/install.py +597 -335
- mcp_vector_search/cli/commands/install_old.py +8 -4
- mcp_vector_search/cli/commands/mcp.py +78 -6
- mcp_vector_search/cli/commands/reset.py +68 -26
- mcp_vector_search/cli/commands/search.py +224 -8
- mcp_vector_search/cli/commands/setup.py +1184 -0
- mcp_vector_search/cli/commands/status.py +339 -5
- mcp_vector_search/cli/commands/uninstall.py +276 -357
- mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
- mcp_vector_search/cli/commands/visualize/cli.py +292 -0
- mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
- mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/graph_builder.py +647 -0
- mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
- mcp_vector_search/cli/commands/visualize/server.py +600 -0
- mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
- mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
- mcp_vector_search/cli/commands/visualize/templates/base.py +234 -0
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +4542 -0
- mcp_vector_search/cli/commands/visualize/templates/styles.py +2522 -0
- mcp_vector_search/cli/didyoumean.py +27 -2
- mcp_vector_search/cli/main.py +127 -160
- mcp_vector_search/cli/output.py +158 -13
- mcp_vector_search/config/__init__.py +4 -0
- mcp_vector_search/config/default_thresholds.yaml +52 -0
- mcp_vector_search/config/settings.py +12 -0
- mcp_vector_search/config/thresholds.py +273 -0
- mcp_vector_search/core/__init__.py +16 -0
- mcp_vector_search/core/auto_indexer.py +3 -3
- mcp_vector_search/core/boilerplate.py +186 -0
- mcp_vector_search/core/config_utils.py +394 -0
- mcp_vector_search/core/database.py +406 -94
- mcp_vector_search/core/embeddings.py +24 -0
- mcp_vector_search/core/exceptions.py +11 -0
- mcp_vector_search/core/git.py +380 -0
- mcp_vector_search/core/git_hooks.py +4 -4
- mcp_vector_search/core/indexer.py +632 -54
- mcp_vector_search/core/llm_client.py +756 -0
- mcp_vector_search/core/models.py +91 -1
- mcp_vector_search/core/project.py +17 -0
- mcp_vector_search/core/relationships.py +473 -0
- mcp_vector_search/core/scheduler.py +11 -11
- mcp_vector_search/core/search.py +179 -29
- mcp_vector_search/mcp/server.py +819 -9
- mcp_vector_search/parsers/python.py +285 -5
- mcp_vector_search/utils/__init__.py +2 -0
- mcp_vector_search/utils/gitignore.py +0 -3
- mcp_vector_search/utils/gitignore_updater.py +212 -0
- mcp_vector_search/utils/monorepo.py +66 -4
- mcp_vector_search/utils/timing.py +10 -6
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +184 -53
- mcp_vector_search-1.1.22.dist-info/RECORD +120 -0
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +1 -1
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +1 -0
- mcp_vector_search/cli/commands/visualize.py +0 -1467
- mcp_vector_search-0.12.6.dist-info/RECORD +0 -68
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/indexer.py

@@ -2,14 +2,20 @@
 
 import asyncio
 import json
+import multiprocessing
 import os
+from concurrent.futures import ProcessPoolExecutor
 from datetime import UTC, datetime
 from pathlib import Path
+from typing import Any
 
 from loguru import logger
 from packaging import version
 
 from .. import __version__
+from ..analysis.collectors.base import MetricCollector
+from ..analysis.metrics import ChunkMetrics
+from ..analysis.trends import TrendTracker
 from ..config.defaults import ALLOWED_DOTFILES, DEFAULT_IGNORE_PATTERNS
 from ..config.settings import ProjectConfig
 from ..parsers.registry import get_parser_registry
@@ -19,6 +25,81 @@ from .database import VectorDatabase
 from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
 from .models import CodeChunk, IndexStats
+from .relationships import RelationshipStore
+
+# Extension to language mapping for metric collection
+EXTENSION_TO_LANGUAGE = {
+    ".py": "python",
+    ".js": "javascript",
+    ".ts": "typescript",
+    ".jsx": "javascript",
+    ".tsx": "typescript",
+    ".java": "java",
+    ".rs": "rust",
+    ".php": "php",
+    ".rb": "ruby",
+}
+
+
+def _parse_file_standalone(
+    args: tuple[Path, str | None],
+) -> tuple[Path, list[CodeChunk], Exception | None]:
+    """Parse a single file - standalone function for multiprocessing.
+
+    This function must be at module level (not a method) to be picklable for
+    multiprocessing. It creates its own parser registry to avoid serialization issues.
+
+    Args:
+        args: Tuple of (file_path, subproject_info_json)
+            - file_path: Path to the file to parse
+            - subproject_info_json: JSON string with subproject info or None
+
+    Returns:
+        Tuple of (file_path, chunks, error)
+            - file_path: The file path that was parsed
+            - chunks: List of parsed CodeChunk objects (empty if error)
+            - error: Exception if parsing failed, None if successful
+    """
+    file_path, subproject_info_json = args
+
+    try:
+        # Create parser registry in this process
+        parser_registry = get_parser_registry()
+
+        # Get appropriate parser
+        parser = parser_registry.get_parser_for_file(file_path)
+
+        # Parse file synchronously (tree-sitter is synchronous anyway)
+        # We need to use the synchronous version of parse_file
+        # Since parsers may have async methods, we'll read and parse directly
+        import asyncio
+
+        # Create event loop for this process if needed
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        # Run the async parse_file in this process's event loop
+        chunks = loop.run_until_complete(parser.parse_file(file_path))
+
+        # Filter out empty chunks
+        valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
+
+        # Apply subproject information if available
+        if subproject_info_json:
+            subproject_info = json.loads(subproject_info_json)
+            for chunk in valid_chunks:
+                chunk.subproject_name = subproject_info.get("name")
+                chunk.subproject_path = subproject_info.get("relative_path")
+
+        return (file_path, valid_chunks, None)
+
+    except Exception as e:
+        # Return error instead of raising to avoid process crashes
+        logger.error(f"Failed to parse file {file_path} in worker process: {e}")
+        return (file_path, [], e)
 
 
 class SemanticIndexer:
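The worker in the hunk above has to live at module level so `ProcessPoolExecutor` can pickle a reference to it, and it returns exceptions instead of raising them so one bad file cannot crash a worker process. Below is a minimal standalone sketch of the same pattern under those assumptions; `fake_parse`, `parse_one`, and the file names are illustrative and not part of mcp-vector-search.

```python
# Sketch of a picklable, module-level worker with a per-process event loop.
import asyncio
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path


async def fake_parse(path: Path) -> list[str]:
    """Stand-in for an async parser: returns non-empty lines as 'chunks'."""
    return [line for line in path.read_text().splitlines() if line.strip()]


def parse_one(path: Path) -> tuple[Path, list[str], Exception | None]:
    """Module-level worker: runs the async parser in this process's own loop
    and reports errors instead of raising, mirroring the pattern above."""
    try:
        loop = asyncio.new_event_loop()
        try:
            chunks = loop.run_until_complete(fake_parse(path))
        finally:
            loop.close()
        return (path, chunks, None)
    except Exception as exc:  # report, don't crash the worker
        return (path, [], exc)


if __name__ == "__main__":
    files = [Path(p) for p in ("a.py", "b.py")]  # hypothetical inputs
    with ProcessPoolExecutor(max_workers=2) as pool:
        for path, chunks, err in pool.map(parse_one, files):
            print(path, len(chunks), err)
```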
@@ -33,6 +114,8 @@ class SemanticIndexer:
         max_workers: int | None = None,
         batch_size: int = 10,
         debug: bool = False,
+        collectors: list[MetricCollector] | None = None,
+        use_multiprocessing: bool = True,
     ) -> None:
         """Initialize semantic indexer.
 
@@ -41,9 +124,11 @@ class SemanticIndexer:
             project_root: Project root directory
             file_extensions: File extensions to index (deprecated, use config)
             config: Project configuration (preferred over file_extensions)
-            max_workers: Maximum number of worker
+            max_workers: Maximum number of worker processes for parallel parsing (ignored if use_multiprocessing=False)
             batch_size: Number of files to process in each batch
             debug: Enable debug output for hierarchy building
+            collectors: Metric collectors to run during indexing (defaults to all complexity collectors)
+            use_multiprocessing: Enable multiprocess parallel parsing (default: True, disable for debugging)
         """
         self.database = database
         self.project_root = project_root
@@ -63,13 +148,23 @@ class SemanticIndexer:
         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
         self.debug = debug
 
-        #
-
-
-
-
-
-
+        # Initialize metric collectors
+        self.collectors = (
+            collectors if collectors is not None else self._default_collectors()
+        )
+
+        # Configure multiprocessing for parallel parsing
+        self.use_multiprocessing = use_multiprocessing
+        if use_multiprocessing:
+            # Use 75% of CPU cores for parsing, but cap at 8 to avoid overhead
+            cpu_count = multiprocessing.cpu_count()
+            self.max_workers = max_workers or min(max(1, int(cpu_count * 0.75)), 8)
+            logger.debug(
+                f"Multiprocessing enabled with {self.max_workers} workers (CPU count: {cpu_count})"
+            )
+        else:
+            self.max_workers = 1
+            logger.debug("Multiprocessing disabled (single-threaded mode)")
 
         self.batch_size = batch_size
         self._index_metadata_file = (
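When `max_workers` is not given, the constructor above sizes the pool as 75% of the CPU count, with a floor of 1 and a cap of 8. A quick worked example of that formula (standalone, not package code):

```python
# Worked example of the worker-count formula used in the constructor above.
def default_workers(cpu_count: int) -> int:
    # 75% of cores, at least 1, capped at 8 to limit process overhead
    return min(max(1, int(cpu_count * 0.75)), 8)

for cpus in (1, 2, 4, 8, 16, 32):
    print(cpus, "->", default_workers(cpus))
# 1 -> 1, 2 -> 1, 4 -> 3, 8 -> 6, 16 -> 8, 32 -> 8
```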
@@ -110,16 +205,162 @@ class SemanticIndexer:
         # Load existing directory index
         self.directory_index.load()
 
+        # Initialize relationship store for pre-computing visualization relationships
+        self.relationship_store = RelationshipStore(project_root)
+
+        # Initialize trend tracker for historical metrics
+        self.trend_tracker = TrendTracker(project_root)
+
+    def _default_collectors(self) -> list[MetricCollector]:
+        """Return default set of metric collectors.
+
+        Returns:
+            List of all complexity collectors (cognitive, cyclomatic, nesting, parameters, methods)
+        """
+        from ..analysis.collectors.complexity import (
+            CognitiveComplexityCollector,
+            CyclomaticComplexityCollector,
+            MethodCountCollector,
+            NestingDepthCollector,
+            ParameterCountCollector,
+        )
+
+        return [
+            CognitiveComplexityCollector(),
+            CyclomaticComplexityCollector(),
+            NestingDepthCollector(),
+            ParameterCountCollector(),
+            MethodCountCollector(),
+        ]
+
+    def _collect_metrics(
+        self, chunk: CodeChunk, source_code: bytes, language: str
+    ) -> ChunkMetrics | None:
+        """Collect metrics for a code chunk.
+
+        This is a simplified version that estimates metrics from chunk content
+        without full TreeSitter traversal. Future implementation will use
+        TreeSitter node traversal for accurate metric collection.
+
+        Args:
+            chunk: The parsed code chunk
+            source_code: Raw source code bytes
+            language: Programming language identifier
+
+        Returns:
+            ChunkMetrics for the chunk, or None if no metrics collected
+        """
+        # For now, create basic metrics from chunk content
+        # TODO: Implement full TreeSitter traversal in Phase 2
+        lines_of_code = chunk.line_count
+
+        # Estimate complexity from simple heuristics
+        content = chunk.content
+        cognitive_complexity = self._estimate_cognitive_complexity(content)
+        cyclomatic_complexity = self._estimate_cyclomatic_complexity(content)
+        max_nesting_depth = self._estimate_nesting_depth(content)
+        parameter_count = len(chunk.parameters) if chunk.parameters else 0
+
+        metrics = ChunkMetrics(
+            cognitive_complexity=cognitive_complexity,
+            cyclomatic_complexity=cyclomatic_complexity,
+            max_nesting_depth=max_nesting_depth,
+            parameter_count=parameter_count,
+            lines_of_code=lines_of_code,
+        )
+
+        return metrics
+
+    def _estimate_cognitive_complexity(self, content: str) -> int:
+        """Estimate cognitive complexity from content (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated cognitive complexity score
+        """
+        # Simple heuristic: count control flow keywords
+        keywords = [
+            "if",
+            "elif",
+            "else",
+            "for",
+            "while",
+            "try",
+            "except",
+            "case",
+            "when",
+        ]
+        complexity = 0
+        for keyword in keywords:
+            complexity += content.count(f" {keyword} ")
+            complexity += content.count(f"\t{keyword} ")
+            complexity += content.count(f"\n{keyword} ")
+        return complexity
+
+    def _estimate_cyclomatic_complexity(self, content: str) -> int:
+        """Estimate cyclomatic complexity from content (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated cyclomatic complexity score (minimum 1)
+        """
+        # Start with baseline of 1
+        complexity = 1
+
+        # Count decision points
+        keywords = [
+            "if",
+            "elif",
+            "for",
+            "while",
+            "case",
+            "when",
+            "&&",
+            "||",
+            "and",
+            "or",
+        ]
+        for keyword in keywords:
+            complexity += content.count(keyword)
+
+        return complexity
+
+    def _estimate_nesting_depth(self, content: str) -> int:
+        """Estimate maximum nesting depth from indentation (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated maximum nesting depth
+        """
+        max_depth = 0
+        for line in content.split("\n"):
+            # Count leading whitespace (4 spaces or 1 tab = 1 level)
+            leading = len(line) - len(line.lstrip())
+            if "\t" in line[:leading]:
+                depth = line[:leading].count("\t")
+            else:
+                depth = leading // 4
+            max_depth = max(max_depth, depth)
+        return max_depth
+
     async def index_project(
         self,
         force_reindex: bool = False,
         show_progress: bool = True,
+        skip_relationships: bool = False,
     ) -> int:
         """Index all files in the project.
 
         Args:
             force_reindex: Whether to reindex existing files
             show_progress: Whether to show progress information
+            skip_relationships: Skip computing relationships for visualization (faster, but visualize will be slower)
 
         Returns:
             Number of files indexed
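The `_estimate_*` helpers added above are plain string heuristics, not AST analysis (the docstrings and the TODO flag TreeSitter traversal as future work), so bare substring counting can over-count: `for` also matches `or`, for example. A self-contained sketch of the same counting applied to a small snippet; the functions and output here come from this sketch, not from the package:

```python
# Standalone sketch of the keyword/indentation heuristics shown in the hunk above.
SNIPPET = """
def f(x):
    if x > 0:
        for i in range(x):
            if i % 2 == 0 and i > 2:
                print(i)
    return x
"""

def cyclomatic(content: str) -> int:
    complexity = 1  # baseline path
    for kw in ["if", "elif", "for", "while", "case", "when", "&&", "||", "and", "or"]:
        complexity += content.count(kw)
    return complexity

def nesting(content: str) -> int:
    max_depth = 0
    for line in content.split("\n"):
        leading = len(line) - len(line.lstrip())
        depth = line[:leading].count("\t") if "\t" in line[:leading] else leading // 4
        max_depth = max(max_depth, depth)
    return max_depth

print(cyclomatic(SNIPPET), nesting(SNIPPET))
# prints: 6 4  (the "or" inside "for" is counted too, illustrating the heuristic's crudeness)
```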
@@ -222,12 +463,134 @@ class SemanticIndexer:
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
 
+        # Mark relationships for background computation (unless skipped)
+        # Default behavior: skip blocking computation, mark for background processing
+        if not skip_relationships and indexed_count > 0:
+            try:
+                logger.info("Marking relationships for background computation...")
+                # Get all chunks from database for relationship computation
+                all_chunks = await self.database.get_all_chunks()
+
+                if len(all_chunks) > 0:
+                    # Mark for background computation (non-blocking)
+                    await self.relationship_store.compute_and_store(
+                        all_chunks, self.database, background=True
+                    )
+                    logger.info("✓ Relationships marked for background computation")
+                    logger.info(
+                        " Use 'mcp-vector-search index relationships' to compute now or wait for background task"
+                    )
+                else:
+                    logger.warning("No chunks found for relationship computation")
+            except Exception as e:
+                logger.warning(f"Failed to mark relationships: {e}")
+                logger.debug("Visualization will compute relationships on demand")
+
+        # Save trend snapshot after successful indexing
+        if indexed_count > 0:
+            try:
+                logger.info("Saving metrics snapshot for trend tracking...")
+                # Get database stats
+                stats = await self.database.get_stats()
+                # Get all chunks for detailed metrics
+                all_chunks = await self.database.get_all_chunks()
+                # Compute metrics from stats and chunks
+                metrics = self.trend_tracker.compute_metrics_from_stats(
+                    stats.to_dict(), all_chunks
+                )
+                # Save snapshot (updates today's entry if exists)
+                self.trend_tracker.save_snapshot(metrics)
+                logger.info(
+                    f"✓ Saved trend snapshot: {metrics['total_files']} files, "
+                    f"{metrics['total_chunks']} chunks, health score {metrics['health_score']}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to save trend snapshot: {e}")
+
         return indexed_count
 
+    async def _parse_and_prepare_file(
+        self, file_path: Path, force_reindex: bool = False
+    ) -> tuple[list[CodeChunk], dict[str, Any] | None]:
+        """Parse file and prepare chunks with metrics (no database insertion).
+
+        This method extracts the parsing and metric collection logic from index_file()
+        to enable batch processing across multiple files.
+
+        Args:
+            file_path: Path to the file to parse
+            force_reindex: Whether to force reindexing (always deletes existing chunks)
+
+        Returns:
+            Tuple of (chunks_with_hierarchy, chunk_metrics)
+
+        Raises:
+            ParsingError: If file parsing fails
+        """
+        # Check if file should be indexed
+        if not self._should_index_file(file_path):
+            return ([], None)
+
+        # Always remove existing chunks when reindexing a file
+        # This prevents duplicate chunks and ensures consistency
+        await self.database.delete_by_file(file_path)
+
+        # Parse file into chunks
+        chunks = await self._parse_file(file_path)
+
+        if not chunks:
+            logger.debug(f"No chunks extracted from {file_path}")
+            return ([], None)
+
+        # Build hierarchical relationships between chunks
+        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+        # Debug: Check if hierarchy was built
+        methods_with_parents = sum(
+            1
+            for c in chunks_with_hierarchy
+            if c.chunk_type in ("method", "function") and c.parent_chunk_id
+        )
+        logger.debug(
+            f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
+        )
+
+        # Collect metrics for chunks (if collectors are enabled)
+        chunk_metrics: dict[str, Any] | None = None
+        if self.collectors:
+            try:
+                # Read source code
+                source_code = file_path.read_bytes()
+
+                # Detect language from file extension
+                language = EXTENSION_TO_LANGUAGE.get(
+                    file_path.suffix.lower(), "unknown"
+                )
+
+                # Collect metrics for each chunk
+                chunk_metrics = {}
+                for chunk in chunks_with_hierarchy:
+                    metrics = self._collect_metrics(chunk, source_code, language)
+                    if metrics:
+                        chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+
+                logger.debug(
+                    f"Collected metrics for {len(chunk_metrics)} chunks from {file_path}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to collect metrics for {file_path}: {e}")
+                chunk_metrics = None
+
+        return (chunks_with_hierarchy, chunk_metrics)
+
     async def _process_file_batch(
         self, file_paths: list[Path], force_reindex: bool = False
     ) -> list[bool]:
-        """Process a batch of files
+        """Process a batch of files and accumulate chunks for batch embedding.
+
+        This method processes multiple files in parallel (using multiprocessing for
+        CPU-bound parsing) and then performs a single database insertion for all chunks,
+        enabling efficient batch embedding generation.
 
         Args:
             file_paths: List of file paths to process
@@ -236,26 +599,166 @@ class SemanticIndexer:
         Returns:
             List of success flags for each file
         """
-
-
+        all_chunks: list[CodeChunk] = []
+        all_metrics: dict[str, Any] = {}
+        file_to_chunks_map: dict[str, tuple[int, int]] = {}
+        success_flags: list[bool] = []
+
+        # Filter files that should be indexed and delete old chunks
+        files_to_parse = []
         for file_path in file_paths:
-
-
+            if not self._should_index_file(file_path):
+                success_flags.append(True)  # Skipped file is not an error
+                continue
+            # Delete old chunks before parsing
+            await self.database.delete_by_file(file_path)
+            files_to_parse.append(file_path)
 
-
-
+        if not files_to_parse:
+            return success_flags
 
-        #
-
-
-
-
+        # Parse files using multiprocessing if enabled
+        if self.use_multiprocessing and len(files_to_parse) > 1:
+            # Use ProcessPoolExecutor for CPU-bound parsing
+            parse_results = await self._parse_files_multiprocess(files_to_parse)
+        else:
+            # Fall back to async processing (for single file or disabled multiprocessing)
+            parse_results = await self._parse_files_async(files_to_parse)
+
+        # Accumulate chunks from all successfully parsed files
+        metadata = self._load_index_metadata()
+        for file_path, chunks, error in parse_results:
+            if error:
+                logger.error(f"Failed to parse {file_path}: {error}")
                 success_flags.append(False)
+                continue
+
+            if chunks:
+                # Build hierarchy and collect metrics for parsed chunks
+                chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+                # Collect metrics if enabled
+                chunk_metrics = None
+                if self.collectors:
+                    try:
+                        source_code = file_path.read_bytes()
+                        language = EXTENSION_TO_LANGUAGE.get(
+                            file_path.suffix.lower(), "unknown"
+                        )
+                        chunk_metrics = {}
+                        for chunk in chunks_with_hierarchy:
+                            metrics = self._collect_metrics(
+                                chunk, source_code, language
+                            )
+                            if metrics:
+                                chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+                    except Exception as e:
+                        logger.warning(
+                            f"Failed to collect metrics for {file_path}: {e}"
+                        )
+
+                # Accumulate chunks
+                start_idx = len(all_chunks)
+                all_chunks.extend(chunks_with_hierarchy)
+                end_idx = len(all_chunks)
+                file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
+
+                # Merge metrics
+                if chunk_metrics:
+                    all_metrics.update(chunk_metrics)
+
+                # Update metadata for successfully parsed file
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
             else:
-
+                # Empty file is not an error
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
+
+        # Single database insertion for entire batch
+        if all_chunks:
+            logger.info(
+                f"Batch inserting {len(all_chunks)} chunks from {len(file_paths)} files"
+            )
+            try:
+                await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                logger.debug(
+                    f"Successfully indexed {len(all_chunks)} chunks from {sum(success_flags)} files"
+                )
+            except Exception as e:
+                logger.error(f"Failed to insert batch of chunks: {e}")
+                # Mark all files in this batch as failed
+                return [False] * len(file_paths)
+
+        # Save updated metadata after successful batch
+        self._save_index_metadata(metadata)
 
         return success_flags
 
+    async def _parse_files_multiprocess(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using multiprocessing for CPU-bound parallelism.
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        # Prepare arguments for worker processes
+        parse_args = []
+        for file_path in file_paths:
+            # Get subproject info if available
+            subproject = self.monorepo_detector.get_subproject_for_file(file_path)
+            subproject_info_json = None
+            if subproject:
+                subproject_info_json = json.dumps(
+                    {
+                        "name": subproject.name,
+                        "relative_path": subproject.relative_path,
+                    }
+                )
+            parse_args.append((file_path, subproject_info_json))
+
+        # Limit workers to avoid overhead
+        max_workers = min(self.max_workers, len(file_paths))
+
+        # Run parsing in ProcessPoolExecutor
+        loop = asyncio.get_running_loop()
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks and wait for results
+            results = await loop.run_in_executor(
+                None, lambda: list(executor.map(_parse_file_standalone, parse_args))
+            )
+
+        logger.debug(
+            f"Multiprocess parsing completed: {len(results)} files parsed with {max_workers} workers"
+        )
+        return results
+
+    async def _parse_files_async(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using async (fallback for single file or disabled multiprocessing).
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        results = []
+        for file_path in file_paths:
+            try:
+                chunks = await self._parse_file(file_path)
+                results.append((file_path, chunks, None))
+            except Exception as e:
+                logger.error(f"Failed to parse {file_path}: {e}")
+                results.append((file_path, [], e))
+
+        return results
+
     def _load_index_metadata(self) -> dict[str, float]:
         """Load file modification times from metadata file.
 
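`_process_file_batch` above accumulates the chunks from every file in the batch and then calls `add_chunks` once, so embeddings can be generated in one batched operation instead of per file. A stripped-down sketch of that accumulate-then-insert-once shape; `FakeDB` and the chunk strings are stand-ins, not the real `VectorDatabase` API:

```python
# Sketch of the accumulate-then-insert-once pattern from _process_file_batch.
import asyncio


class FakeDB:
    def __init__(self) -> None:
        self.chunks: list[str] = []

    async def add_chunks(self, chunks: list[str], metrics: dict | None = None) -> None:
        # One insertion per batch lets the embedding backend batch its work too.
        self.chunks.extend(chunks)


async def process_batch(db: FakeDB, parsed: dict[str, list[str]]) -> dict[str, tuple[int, int]]:
    all_chunks: list[str] = []
    file_to_slice: dict[str, tuple[int, int]] = {}
    for path, chunks in parsed.items():
        start = len(all_chunks)
        all_chunks.extend(chunks)
        file_to_slice[path] = (start, len(all_chunks))  # which slice came from which file
    if all_chunks:
        await db.add_chunks(all_chunks, metrics={})
    return file_to_slice


async def main() -> None:
    db = FakeDB()
    mapping = await process_batch(db, {"a.py": ["c1", "c2"], "b.py": ["c3"]})
    print(mapping, len(db.chunks))  # {'a.py': (0, 2), 'b.py': (2, 3)} 3


asyncio.run(main())
```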
@@ -379,8 +882,34 @@ class SemanticIndexer:
             f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
         )
 
-        #
-
+        # Collect metrics for chunks (if collectors are enabled)
+        chunk_metrics: dict[str, Any] | None = None
+        if self.collectors:
+            try:
+                # Read source code
+                source_code = file_path.read_bytes()
+
+                # Detect language from file extension
+                language = EXTENSION_TO_LANGUAGE.get(
+                    file_path.suffix.lower(), "unknown"
+                )
+
+                # Collect metrics for each chunk
+                chunk_metrics = {}
+                for chunk in chunks_with_hierarchy:
+                    metrics = self._collect_metrics(chunk, source_code, language)
+                    if metrics:
+                        chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+
+                logger.debug(
+                    f"Collected metrics for {len(chunk_metrics)} chunks from {file_path}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to collect metrics for {file_path}: {e}")
+                chunk_metrics = None
+
+        # Add chunks to database with metrics
+        await self.database.add_chunks(chunks_with_hierarchy, metrics=chunk_metrics)
 
         # Update metadata after successful indexing
         metadata = self._load_index_metadata()
@@ -572,8 +1101,10 @@ class SemanticIndexer:
         # Get relative path from project root for checking
         relative_path = file_path.relative_to(self.project_root)
 
-        # 1. Check dotfile filtering (
-
+        # 1. Check dotfile filtering (ENABLED BY DEFAULT)
+        # Skip dotfiles unless config explicitly disables it
+        skip_dotfiles = self.config.skip_dotfiles if self.config else True
+        if skip_dotfiles:
             for part in relative_path.parts:
                 # Skip dotfiles unless they're in the whitelist
                 if part.startswith(".") and part not in ALLOWED_DOTFILES:
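The dotfile check above skips any path containing a dot-prefixed component unless it is whitelisted, and the filter can be disabled via config. A small sketch of that logic; `ALLOWED_DOTFILES` here is an illustrative set, not the package's actual whitelist:

```python
# Sketch of the dotfile filter shown in the hunk above.
from pathlib import Path

ALLOWED_DOTFILES = {".github", ".gitignore"}  # assumption for this example


def skipped_as_dotfile(relative_path: Path, skip_dotfiles: bool = True) -> bool:
    if not skip_dotfiles:
        return False
    return any(
        part.startswith(".") and part not in ALLOWED_DOTFILES
        for part in relative_path.parts
    )


print(skipped_as_dotfile(Path(".venv/lib/x.py")))             # True
print(skipped_as_dotfile(Path(".github/workflows/ci.yml")))   # False (whitelisted)
print(skipped_as_dotfile(Path("src/app.py")))                 # False
```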
@@ -807,6 +1338,9 @@ class SemanticIndexer:
     ):
         """Index files and yield progress updates for each file.
 
+        This method processes files in batches and accumulates chunks across files
+        before performing a single database insertion per batch for better performance.
+
         Args:
             files_to_index: List of file paths to index
             force_reindex: Whether to force reindexing
@@ -817,42 +1351,84 @@ class SemanticIndexer:
         # Write version header to error log at start of indexing run
         self._write_indexing_run_header()
 
-
-
-        # Process files in batches for better memory management
+        # Process files in batches for better memory management and embedding efficiency
         for i in range(0, len(files_to_index), self.batch_size):
             batch = files_to_index[i : i + self.batch_size]
 
-            #
+            # Accumulate chunks from all files in batch
+            all_chunks: list[CodeChunk] = []
+            all_metrics: dict[str, Any] = {}
+            file_to_chunks_map: dict[str, tuple[int, int]] = {}
+            file_results: dict[Path, tuple[int, bool]] = {}
+
+            # Parse all files in parallel
+            tasks = []
             for file_path in batch:
-
-
+                task = asyncio.create_task(
+                    self._parse_and_prepare_file(file_path, force_reindex)
+                )
+                tasks.append(task)
 
-
-                # Always remove existing chunks when reindexing
-                await self.database.delete_by_file(file_path)
+            parse_results = await asyncio.gather(*tasks, return_exceptions=True)
 
-
-
+            # Accumulate chunks from successfully parsed files
+            metadata = self._load_index_metadata()
+            for file_path, result in zip(batch, parse_results, strict=True):
+                if isinstance(result, Exception):
+                    error_msg = f"Failed to index file {file_path}: {type(result).__name__}: {str(result)}"
+                    logger.error(error_msg)
+                    file_results[file_path] = (0, False)
 
-
-
-
+                    # Save error to error log file
+                    try:
+                        error_log_path = (
+                            self.project_root
+                            / ".mcp-vector-search"
+                            / "indexing_errors.log"
+                        )
+                        with open(error_log_path, "a", encoding="utf-8") as f:
+                            timestamp = datetime.now().isoformat()
+                            f.write(f"[{timestamp}] {error_msg}\n")
+                    except Exception as log_err:
+                        logger.debug(f"Failed to write error log: {log_err}")
+                    continue
 
-
-
-
-
+                chunks, metrics = result
+                if chunks:
+                    start_idx = len(all_chunks)
+                    all_chunks.extend(chunks)
+                    end_idx = len(all_chunks)
+                    file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
 
-
+                    # Merge metrics
+                    if metrics:
+                        all_metrics.update(metrics)
 
-            # Update metadata
+                    # Update metadata for successfully parsed file
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (len(chunks), True)
+                    logger.debug(f"Prepared {len(chunks)} chunks from {file_path}")
+                else:
+                    # Empty file is not an error
                     metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (0, True)
 
+            # Single database insertion for entire batch
+            if all_chunks:
+                logger.info(
+                    f"Batch inserting {len(all_chunks)} chunks from {len(batch)} files"
+                )
+                try:
+                    await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                    logger.debug(
+                        f"Successfully indexed {len(all_chunks)} chunks from batch"
+                    )
                 except Exception as e:
-                error_msg = f"Failed to
+                    error_msg = f"Failed to insert batch of chunks: {e}"
                     logger.error(error_msg)
-
+                    # Mark all files with chunks in this batch as failed
+                    for file_path in file_to_chunks_map.keys():
+                        file_results[Path(file_path)] = (0, False)
 
                     # Save error to error log file
                     try:
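The progress hunk above fans out one `asyncio.create_task` per file and collects results with `asyncio.gather(..., return_exceptions=True)`, so a failing file is recorded per-file instead of aborting the batch. A toy sketch of that pattern; `parse` is illustrative, not the package's `_parse_and_prepare_file`:

```python
# Sketch of gather-with-return_exceptions plus per-file result tracking.
import asyncio


async def parse(path: str) -> list[str]:
    if path.endswith(".bad"):
        raise ValueError(f"cannot parse {path}")
    return [f"{path}:chunk"]


async def main() -> None:
    batch = ["a.py", "b.bad", "c.py"]
    tasks = [asyncio.create_task(parse(p)) for p in batch]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    file_results: dict[str, tuple[int, bool]] = {}
    for path, result in zip(batch, results, strict=True):
        if isinstance(result, Exception):
            # A failed file is recorded, not re-raised, so the batch keeps going.
            file_results[path] = (0, False)
        else:
            file_results[path] = (len(result), True)
    print(file_results)  # {'a.py': (1, True), 'b.bad': (0, False), 'c.py': (1, True)}


asyncio.run(main())
```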
@@ -862,18 +1438,18 @@ class SemanticIndexer:
                             / "indexing_errors.log"
                         )
                         with open(error_log_path, "a", encoding="utf-8") as f:
-                            from datetime import datetime
-
                             timestamp = datetime.now().isoformat()
                             f.write(f"[{timestamp}] {error_msg}\n")
                     except Exception as log_err:
                         logger.debug(f"Failed to write error log: {log_err}")
 
-
-
+            # Save metadata after batch
+            self._save_index_metadata(metadata)
 
-
-
+            # Yield progress updates for each file in batch
+            for file_path in batch:
+                chunks_added, success = file_results.get(file_path, (0, False))
+                yield (file_path, chunks_added, success)
 
     def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
         """Build parent-child relationships between chunks.
@@ -895,7 +1471,9 @@ class SemanticIndexer:
             return chunks
 
         # Group chunks by type and name
-
+        # Only actual module chunks (not imports) serve as parents for top-level code
+        # imports chunks should remain siblings of classes/functions, not parents
+        module_chunks = [c for c in chunks if c.chunk_type == "module"]
         class_chunks = [
            c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
         ]