mcp-vector-search 0.12.6__py3-none-any.whl → 1.1.22__py3-none-any.whl
This diff compares the published contents of two package versions as released to a supported public registry. It is provided for informational purposes only.
- mcp_vector_search/__init__.py +3 -3
- mcp_vector_search/analysis/__init__.py +111 -0
- mcp_vector_search/analysis/baseline/__init__.py +68 -0
- mcp_vector_search/analysis/baseline/comparator.py +462 -0
- mcp_vector_search/analysis/baseline/manager.py +621 -0
- mcp_vector_search/analysis/collectors/__init__.py +74 -0
- mcp_vector_search/analysis/collectors/base.py +164 -0
- mcp_vector_search/analysis/collectors/cohesion.py +463 -0
- mcp_vector_search/analysis/collectors/complexity.py +743 -0
- mcp_vector_search/analysis/collectors/coupling.py +1162 -0
- mcp_vector_search/analysis/collectors/halstead.py +514 -0
- mcp_vector_search/analysis/collectors/smells.py +325 -0
- mcp_vector_search/analysis/debt.py +516 -0
- mcp_vector_search/analysis/interpretation.py +685 -0
- mcp_vector_search/analysis/metrics.py +414 -0
- mcp_vector_search/analysis/reporters/__init__.py +7 -0
- mcp_vector_search/analysis/reporters/console.py +646 -0
- mcp_vector_search/analysis/reporters/markdown.py +480 -0
- mcp_vector_search/analysis/reporters/sarif.py +377 -0
- mcp_vector_search/analysis/storage/__init__.py +93 -0
- mcp_vector_search/analysis/storage/metrics_store.py +762 -0
- mcp_vector_search/analysis/storage/schema.py +245 -0
- mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
- mcp_vector_search/analysis/trends.py +308 -0
- mcp_vector_search/analysis/visualizer/__init__.py +90 -0
- mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
- mcp_vector_search/analysis/visualizer/exporter.py +484 -0
- mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
- mcp_vector_search/analysis/visualizer/schemas.py +525 -0
- mcp_vector_search/cli/commands/analyze.py +1062 -0
- mcp_vector_search/cli/commands/chat.py +1455 -0
- mcp_vector_search/cli/commands/index.py +621 -5
- mcp_vector_search/cli/commands/index_background.py +467 -0
- mcp_vector_search/cli/commands/init.py +13 -0
- mcp_vector_search/cli/commands/install.py +597 -335
- mcp_vector_search/cli/commands/install_old.py +8 -4
- mcp_vector_search/cli/commands/mcp.py +78 -6
- mcp_vector_search/cli/commands/reset.py +68 -26
- mcp_vector_search/cli/commands/search.py +224 -8
- mcp_vector_search/cli/commands/setup.py +1184 -0
- mcp_vector_search/cli/commands/status.py +339 -5
- mcp_vector_search/cli/commands/uninstall.py +276 -357
- mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
- mcp_vector_search/cli/commands/visualize/cli.py +292 -0
- mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
- mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/graph_builder.py +647 -0
- mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
- mcp_vector_search/cli/commands/visualize/server.py +600 -0
- mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
- mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
- mcp_vector_search/cli/commands/visualize/templates/base.py +234 -0
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +4542 -0
- mcp_vector_search/cli/commands/visualize/templates/styles.py +2522 -0
- mcp_vector_search/cli/didyoumean.py +27 -2
- mcp_vector_search/cli/main.py +127 -160
- mcp_vector_search/cli/output.py +158 -13
- mcp_vector_search/config/__init__.py +4 -0
- mcp_vector_search/config/default_thresholds.yaml +52 -0
- mcp_vector_search/config/settings.py +12 -0
- mcp_vector_search/config/thresholds.py +273 -0
- mcp_vector_search/core/__init__.py +16 -0
- mcp_vector_search/core/auto_indexer.py +3 -3
- mcp_vector_search/core/boilerplate.py +186 -0
- mcp_vector_search/core/config_utils.py +394 -0
- mcp_vector_search/core/database.py +406 -94
- mcp_vector_search/core/embeddings.py +24 -0
- mcp_vector_search/core/exceptions.py +11 -0
- mcp_vector_search/core/git.py +380 -0
- mcp_vector_search/core/git_hooks.py +4 -4
- mcp_vector_search/core/indexer.py +632 -54
- mcp_vector_search/core/llm_client.py +756 -0
- mcp_vector_search/core/models.py +91 -1
- mcp_vector_search/core/project.py +17 -0
- mcp_vector_search/core/relationships.py +473 -0
- mcp_vector_search/core/scheduler.py +11 -11
- mcp_vector_search/core/search.py +179 -29
- mcp_vector_search/mcp/server.py +819 -9
- mcp_vector_search/parsers/python.py +285 -5
- mcp_vector_search/utils/__init__.py +2 -0
- mcp_vector_search/utils/gitignore.py +0 -3
- mcp_vector_search/utils/gitignore_updater.py +212 -0
- mcp_vector_search/utils/monorepo.py +66 -4
- mcp_vector_search/utils/timing.py +10 -6
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +184 -53
- mcp_vector_search-1.1.22.dist-info/RECORD +120 -0
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +1 -1
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +1 -0
- mcp_vector_search/cli/commands/visualize.py +0 -1467
- mcp_vector_search-0.12.6.dist-info/RECORD +0 -68
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/models.py CHANGED
```diff
@@ -147,6 +147,36 @@ class SearchResult(BaseModel):
     context_before: list[str] = Field(default=[], description="Lines before the match")
     context_after: list[str] = Field(default=[], description="Lines after the match")
     highlights: list[str] = Field(default=[], description="Highlighted terms")
+    file_missing: bool = Field(
+        default=False, description="True if file no longer exists (stale index)"
+    )
+
+    # Quality metrics (from structural analysis)
+    cognitive_complexity: int | None = Field(
+        default=None, description="Cognitive complexity score"
+    )
+    cyclomatic_complexity: int | None = Field(
+        default=None, description="Cyclomatic complexity score"
+    )
+    max_nesting_depth: int | None = Field(
+        default=None, description="Maximum nesting depth"
+    )
+    parameter_count: int | None = Field(
+        default=None, description="Number of function parameters"
+    )
+    lines_of_code: int | None = Field(
+        default=None, description="Lines of code in chunk"
+    )
+    complexity_grade: str | None = Field(
+        default=None, description="Complexity grade (A-F)"
+    )
+    code_smells: list[str] = Field(default=[], description="Detected code smells")
+    smell_count: int | None = Field(
+        default=None, description="Number of code smells detected"
+    )
+    quality_score: int | None = Field(
+        default=None, description="Overall quality score (0-100)"
+    )
 
     class Config:
         arbitrary_types_allowed = True
@@ -161,9 +191,47 @@ class SearchResult(BaseModel):
         """Get a human-readable location string."""
         return f"{self.file_path}:{self.start_line}-{self.end_line}"
 
+    def calculate_quality_score(self) -> int:
+        """Calculate quality score based on complexity grade and code smells.
+
+        Formula:
+        - Base: complexity_grade (A=100, B=80, C=60, D=40, F=20)
+        - Penalty: -10 per code smell
+        - Bonus: +20 if no smells (already factored into base if no smells)
+
+        Returns:
+            Quality score (0-100), or None if no quality metrics available
+        """
+        # If no quality metrics, return None (will be stored in quality_score field)
+        if self.complexity_grade is None:
+            return None
+
+        # Map complexity grade to base score
+        grade_scores = {
+            "A": 100,
+            "B": 80,
+            "C": 60,
+            "D": 40,
+            "F": 20,
+        }
+
+        base_score = grade_scores.get(self.complexity_grade, 0)
+
+        # Apply smell penalty
+        smell_count = self.smell_count or 0
+        penalty = smell_count * 10
+
+        # Calculate final score (with bonus for no smells already in base)
+        # Bonus: +20 if no smells (effectively makes A without smells = 100+20 = 120, capped at 100)
+        bonus = 20 if smell_count == 0 else 0
+        quality_score = base_score - penalty + bonus
+
+        # Clamp to 0-100 range
+        return max(0, min(100, quality_score))
+
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for serialization."""
-        return {
+        result = {
             "content": self.content,
             "file_path": str(self.file_path),
             "start_line": self.start_line,
```
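The grading rule is simple enough to verify by hand. One wrinkle: the docstring (and the body) allow a `None` return when no metrics are available, yet the signature is annotated `-> int`; `-> int | None` would describe the actual behavior. A standalone distillation of the formula, as a hypothetical helper rather than the package's API:

```python
def quality_score(grade: str | None, smell_count: int) -> int | None:
    """Distilled scoring rule: graded base, -10 per smell, +20 if smell-free."""
    if grade is None:
        return None
    base = {"A": 100, "B": 80, "C": 60, "D": 40, "F": 20}.get(grade, 0)
    bonus = 20 if smell_count == 0 else 0
    return max(0, min(100, base - 10 * smell_count + bonus))

assert quality_score("A", 0) == 100  # 100 + 20 = 120, capped at 100
assert quality_score("B", 0) == 100  # 80 + 20: also hits the cap
assert quality_score("B", 3) == 50   # 80 - 30
assert quality_score("F", 0) == 40   # 20 + 20: smell-free code never drops below 40
assert quality_score("D", 5) == 0    # 40 - 50 = -10, clamped to 0
```

A side effect of the +20 bonus plus the cap: smell-free A and B code both score exactly 100.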
```diff
@@ -181,6 +249,28 @@ class SearchResult(BaseModel):
             "line_count": self.line_count,
         }
 
+        # Add quality metrics if available
+        if self.cognitive_complexity is not None:
+            result["cognitive_complexity"] = self.cognitive_complexity
+        if self.cyclomatic_complexity is not None:
+            result["cyclomatic_complexity"] = self.cyclomatic_complexity
+        if self.max_nesting_depth is not None:
+            result["max_nesting_depth"] = self.max_nesting_depth
+        if self.parameter_count is not None:
+            result["parameter_count"] = self.parameter_count
+        if self.lines_of_code is not None:
+            result["lines_of_code"] = self.lines_of_code
+        if self.complexity_grade is not None:
+            result["complexity_grade"] = self.complexity_grade
+        if self.code_smells:
+            result["code_smells"] = self.code_smells
+        if self.smell_count is not None:
+            result["smell_count"] = self.smell_count
+        if self.quality_score is not None:
+            result["quality_score"] = self.quality_score
+
+        return result
+
 
 class IndexStats(BaseModel):
     """Statistics about the search index."""
```
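Because every new key is guarded, a result carrying no analysis data serializes exactly as it did in 0.12.x; populated metrics add only their own keys. The pattern in isolation, as a generic sketch with illustrative names:

```python
def serialize(base: dict, metrics: dict) -> dict:
    """Merge metrics into a payload, dropping unset (None) values."""
    out = dict(base)
    out.update({k: v for k, v in metrics.items() if v is not None})
    return out

payload = serialize(
    {"file_path": "src/app.py", "start_line": 10},
    {"cyclomatic_complexity": 7, "complexity_grade": "B", "quality_score": None},
)
assert payload["complexity_grade"] == "B"
assert "quality_score" not in payload  # None never reaches the payload
```

(The real method differs slightly for `code_smells`, which is gated on truthiness, so an empty list is also omitted.)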
mcp_vector_search/core/project.py CHANGED

```diff
@@ -107,6 +107,23 @@ class ProjectManager:
         index_path = get_default_index_path(self.project_root)
         index_path.mkdir(parents=True, exist_ok=True)
 
+        # Ensure .mcp-vector-search/ is in .gitignore
+        # This is a non-critical operation - failures are logged but don't block initialization
+        try:
+            from ..utils.gitignore_updater import ensure_gitignore_entry
+
+            ensure_gitignore_entry(
+                self.project_root,
+                pattern=".mcp-vector-search/",
+                comment="MCP Vector Search index directory",
+            )
+        except Exception as e:
+            # Log warning but continue initialization
+            logger.warning(f"Could not update .gitignore: {e}")
+            logger.info(
+                "Please manually add '.mcp-vector-search/' to your .gitignore file"
+            )
+
         # Detect languages and files
         detected_languages = self.detect_languages()
         file_count = self.count_indexable_files(
```
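The updater itself lands in the new `utils/gitignore_updater.py` (+212 lines, not shown in this excerpt). Judging only from the call site, it must add the pattern idempotently, with an optional explanatory comment. A minimal sketch of that inferred behavior, not the package's actual implementation:

```python
from pathlib import Path

def ensure_gitignore_entry(project_root: Path, pattern: str, comment: str = "") -> None:
    """Append `pattern` to .gitignore if absent (inferred behavior; see call site)."""
    gitignore = project_root / ".gitignore"
    lines = gitignore.read_text().splitlines() if gitignore.exists() else []
    if pattern in lines:
        return  # already ignored; keep the operation idempotent
    entry = ([f"# {comment}"] if comment else []) + [pattern]
    gitignore.write_text("\n".join(lines + entry) + "\n")
```

Wrapping the call in a broad `try/except` is deliberate: a read-only checkout or odd permissions should degrade to a log message, never a failed `init`.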
mcp_vector_search/core/relationships.py ADDED (473 lines)

```python
"""Pre-computed relationship storage for instant visualization.

This module handles computing and storing code chunk relationships at index time,
eliminating the expensive computation during visualization startup.

Relationships stored:
- Semantic relationships: Which chunks are similar (based on embeddings)
- Caller relationships: Which chunks call which (based on AST analysis)
"""

import ast
import asyncio
import json
import time
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from loguru import logger
from rich.console import Console
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TaskProgressColumn,
    TextColumn,
)

from .models import CodeChunk

console = Console()


def extract_function_calls(code: str) -> set[str]:
    """Extract actual function calls from Python code using AST.

    Returns set of function names that are actually called (not just mentioned).
    Avoids false positives from comments, docstrings, and string literals.

    Args:
        code: Python source code to analyze

    Returns:
        Set of function names that are actually called in the code
    """
    calls = set()
    try:
        tree = ast.parse(code)
        for node in ast.walk(tree):
            if isinstance(node, ast.Call):
                # Handle direct calls: foo()
                if isinstance(node.func, ast.Name):
                    calls.add(node.func.id)
                # Handle method calls: obj.foo() - extract 'foo'
                elif isinstance(node.func, ast.Attribute):
                    calls.add(node.func.attr)
        return calls
    except SyntaxError:
        # If code can't be parsed (incomplete, etc.), fall back to empty set
        # This is safer than false positives from naive substring matching
        return set()
```
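Two behaviors worth noting: attribute calls contribute only the trailing name, so `db.search(...)` and `re.search(...)` both register as `search`, and anything that fails to parse yields an empty set rather than guesses. A quick check:

```python
snippet = '''
result = db.search(query)   # attribute call -> "search"
print(len(result))          # direct calls -> "print", "len"
# commented_out() never parses as a Call node
'''
assert extract_function_calls(snippet) == {"search", "print", "len"}
assert extract_function_calls("def broken(:") == set()  # SyntaxError -> empty set
```

The file continues with a heuristic for labeling chunks that have no function or class name: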
```python
def extract_chunk_name(content: str, fallback: str = "chunk") -> str:
    """Extract first meaningful word from chunk content for labeling.

    Args:
        content: The chunk's code content
        fallback: Fallback name if no meaningful word found

    Returns:
        First meaningful identifier found in the content
    """
    import re

    # Skip common keywords that aren't meaningful as chunk labels
    skip_words = {
        "def", "class", "function", "const", "let", "var", "import", "from",
        "return", "if", "else", "elif", "for", "while", "try", "except",
        "finally", "with", "as", "async", "await", "yield", "self", "this",
        "true", "false", "none", "null", "undefined", "public", "private",
        "protected", "static", "export", "default",
    }

    # Find all words (alphanumeric + underscore, at least 2 chars)
    words = re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]+\b", content)

    for word in words:
        if word.lower() not in skip_words:
            return word

    return fallback
```
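Being regex-based rather than AST-based, this works on source in any language; the label is simply the first identifier not on the keyword list:

```python
assert extract_chunk_name("def compute_totals(rows):") == "compute_totals"
assert extract_chunk_name("export default function renderGraph() {}") == "renderGraph"
assert extract_chunk_name("if (x) { }") == "chunk"  # only keywords/1-char names: fallback
```

The store class itself follows: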
```python
class RelationshipStore:
    """Store and load pre-computed chunk relationships.

    Relationships are stored in .mcp-vector-search/relationships.json and include:
    - Semantic links (similar chunks based on embeddings)
    - Caller links (which chunks call which)
    - Metadata (chunk count, computation time, version)
    """

    def __init__(self, project_root: Path):
        """Initialize relationship store.

        Args:
            project_root: Root directory of the project
        """
        self.project_root = project_root
        self.store_path = project_root / ".mcp-vector-search" / "relationships.json"

    async def compute_and_store(
        self,
        chunks: list[CodeChunk],
        database: Any,
        max_concurrent_queries: int = 50,
        background: bool = False,
    ) -> dict[str, Any]:
        """Compute relationships and save to disk.

        This is called during indexing to pre-compute expensive relationships.
        NOTE: Caller relationships are now lazy-loaded via /api/callers/{chunk_id}
        to avoid the expensive O(n²) computation at startup.

        Args:
            chunks: List of all code chunks
            database: Vector database instance for semantic search
            max_concurrent_queries: Maximum number of concurrent database queries (default: 50)
            background: If True, skip computation and return immediately (for background processing)

        Returns:
            Dictionary with relationship statistics
        """
        logger.info("Computing relationships for visualization...")
        start_time = time.time()

        # Filter to code chunks only
        code_chunks = [
            c for c in chunks if c.chunk_type in ["function", "method", "class"]
        ]

        # If background mode, create empty relationships file and return
        # Actual computation will happen in background task
        if background:
            relationships = {
                "version": "1.1",
                "computed_at": datetime.now(UTC).isoformat(),
                "chunk_count": len(chunks),
                "code_chunk_count": len(code_chunks),
                "computation_time_seconds": 0,
                "semantic": [],
                "callers": {},
                "status": "pending",  # Mark as pending background computation
            }

            # Save empty file
            self.store_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.store_path, "w") as f:
                json.dump(relationships, f, indent=2)

            logger.info("✓ Relationships marked for background computation")
            return {
                "semantic_links": 0,
                "caller_relationships": 0,
                "computation_time": 0,
                "background": True,
            }

        # Compute semantic relationships only
        # Caller relationships are lazy-loaded on-demand via API
        logger.info(
            f"Computing semantic relationships for {len(code_chunks)} chunks "
            f"(max {max_concurrent_queries} concurrent queries)..."
        )
        semantic_links = await self._compute_semantic_relationships(
            code_chunks, database, max_concurrent_queries
        )

        elapsed = time.time() - start_time

        # Build relationship data (no caller_map - it's lazy loaded)
        relationships = {
            "version": "1.1",  # Version bump for lazy callers
            "computed_at": datetime.now(UTC).isoformat(),
            "chunk_count": len(chunks),
            "code_chunk_count": len(code_chunks),
            "computation_time_seconds": elapsed,
            "semantic": semantic_links,
            "callers": {},  # Empty - loaded on-demand via /api/callers/{chunk_id}
            "status": "complete",
        }

        # Save to disk
        self.store_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.store_path, "w") as f:
            json.dump(relationships, f, indent=2)

        logger.info(
            f"✓ Computed {len(semantic_links)} semantic links in {elapsed:.1f}s "
            "(callers lazy-loaded on-demand)"
        )

        return {
            "semantic_links": len(semantic_links),
            "caller_relationships": 0,  # Now lazy-loaded
            "computation_time": elapsed,
        }
```
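A foreground run therefore leaves `.mcp-vector-search/relationships.json` shaped roughly as below (values invented for illustration; chunk identifiers are whatever `chunk_id`/`id` holds):

```python
relationships = {
    "version": "1.1",
    "computed_at": "2025-01-01T00:00:00+00:00",
    "chunk_count": 1200,            # every chunk passed in
    "code_chunk_count": 840,        # functions, methods, classes only
    "computation_time_seconds": 12.4,
    "semantic": [
        {"source": "chunk-a", "target": "chunk-b", "type": "semantic", "similarity": 0.74},
    ],
    "callers": {},                  # always empty in version 1.1; served by /api/callers/{chunk_id}
    "status": "complete",           # or "pending" when written in background mode
}
```

The heavy lifting happens in `_compute_semantic_relationships`: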
```python
    async def _compute_semantic_relationships(
        self,
        code_chunks: list[CodeChunk],
        database: Any,
        max_concurrent_queries: int = 50,
    ) -> list[dict[str, Any]]:
        """Compute semantic similarity relationships between chunks using async parallel processing.

        Args:
            code_chunks: List of code chunks (functions, methods, classes)
            database: Vector database for similarity search
            max_concurrent_queries: Maximum number of concurrent database queries (default: 50)

        Returns:
            List of semantic link dictionaries
        """
        semantic_links = []
        semaphore = asyncio.Semaphore(max_concurrent_queries)
        completed_count = 0
        total_chunks = len(code_chunks)

        # Use Rich progress bar
        with Progress(
            SpinnerColumn(),
            TextColumn("[cyan]Computing semantic relationships...[/cyan]"),
            BarColumn(bar_width=40),
            TaskProgressColumn(),
            TextColumn("[dim]{task.completed}/{task.total} chunks[/dim]"),
            console=console,
            transient=False,
        ) as progress:
            task = progress.add_task("semantic", total=total_chunks)

            async def process_chunk(chunk: CodeChunk) -> list[dict[str, Any]]:
                """Process a single chunk and return its semantic links."""
                nonlocal completed_count

                async with semaphore:
                    try:
                        # Search for similar chunks
                        similar_results = await database.search(
                            query=chunk.content[:500],  # First 500 chars
                            limit=6,  # Get 6 (exclude self = 5)
                            similarity_threshold=0.3,
                        )

                        chunk_links = []
                        source_chunk_id = chunk.chunk_id or chunk.id

                        # Filter out self and create links
                        for result in similar_results:
                            target_chunk = next(
                                (
                                    c
                                    for c in code_chunks
                                    if str(c.file_path) == str(result.file_path)
                                    and c.start_line == result.start_line
                                    and c.end_line == result.end_line
                                ),
                                None,
                            )

                            if not target_chunk:
                                continue

                            target_chunk_id = target_chunk.chunk_id or target_chunk.id

                            # Skip self-references
                            if target_chunk_id == source_chunk_id:
                                continue

                            # Add semantic link
                            if result.similarity_score >= 0.2:
                                chunk_links.append(
                                    {
                                        "source": source_chunk_id,
                                        "target": target_chunk_id,
                                        "type": "semantic",
                                        "similarity": result.similarity_score,
                                    }
                                )

                            # Only keep top 5 per chunk
                            if len(chunk_links) >= 5:
                                break

                        # Update progress
                        completed_count += 1
                        progress.update(task, completed=completed_count)

                        return chunk_links

                    except Exception as e:
                        logger.debug(
                            f"Failed to compute semantic for {chunk.chunk_id}: {e}"
                        )
                        completed_count += 1
                        progress.update(task, completed=completed_count)
                        return []

            # Process all chunks in parallel
            tasks = [process_chunk(chunk) for chunk in code_chunks]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Flatten results and handle exceptions
            for result in results:
                if isinstance(result, Exception):
                    logger.debug(f"Task failed with exception: {result}")
                    continue
                semantic_links.extend(result)

        return semantic_links
```
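The concurrency shape is the standard bounded fan-out: one task per chunk, with the expensive query gated by a semaphore so at most `max_concurrent_queries` are in flight. Stripped of the progress bar and link bookkeeping, the idiom reduces to this generic sketch (not the package's code):

```python
import asyncio

async def bounded_gather(items, do_query, limit: int = 50):
    """Run do_query over items with at most `limit` concurrent executions."""
    semaphore = asyncio.Semaphore(limit)

    async def worker(item):
        async with semaphore:  # blocks while `limit` queries are already running
            return await do_query(item)

    return await asyncio.gather(*(worker(i) for i in items))

async def demo():
    async def do_query(i):
        await asyncio.sleep(0.01)  # stand-in for database.search()
        return i * i

    print(await bounded_gather(range(10), do_query, limit=3))

asyncio.run(demo())
```

The synchronous caller analysis is still present, now served on demand via `/api/callers/{chunk_id}` rather than precomputed: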
```python
    def _compute_caller_relationships(
        self, chunks: list[CodeChunk]
    ) -> dict[str, list[dict[str, Any]]]:
        """Compute which chunks call which other chunks.

        Args:
            chunks: List of all code chunks

        Returns:
            Map of chunk_id -> list of caller info
        """
        caller_map = {}

        code_chunks = [
            c for c in chunks if c.chunk_type in ["function", "method", "class"]
        ]

        logger.debug(f"Processing {len(code_chunks)} code chunks for callers...")

        for chunk_idx, chunk in enumerate(code_chunks):
            if chunk_idx % 50 == 0:  # Progress
                logger.debug(f"Callers: {chunk_idx}/{len(code_chunks)} chunks")

            chunk_id = chunk.chunk_id or chunk.id
            file_path = str(chunk.file_path)
            function_name = chunk.function_name or chunk.class_name

            if not function_name:
                continue

            # Search other chunks that reference this function
            for other_chunk in chunks:
                other_file_path = str(other_chunk.file_path)

                # Only track EXTERNAL callers (different file)
                if other_file_path == file_path:
                    continue

                # Extract actual function calls using AST
                actual_calls = extract_function_calls(other_chunk.content)

                # Check if this function is actually called
                if function_name in actual_calls:
                    other_chunk_id = other_chunk.chunk_id or other_chunk.id

                    # Generate meaningful caller name
                    other_name = other_chunk.function_name or other_chunk.class_name
                    if not other_name:
                        other_name = extract_chunk_name(
                            other_chunk.content,
                            fallback=f"chunk_{other_chunk.start_line}",
                        )

                    # Skip __init__ functions as callers (noise)
                    if other_name == "__init__":
                        continue

                    if chunk_id not in caller_map:
                        caller_map[chunk_id] = []

                    # Store caller information
                    caller_map[chunk_id].append(
                        {
                            "file": other_file_path,
                            "chunk_id": other_chunk_id,
                            "name": other_name,
                            "type": other_chunk.chunk_type,
                        }
                    )

                    logger.debug(
                        f"Found call: {other_name} ({other_file_path}) -> "
                        f"{function_name} ({file_path})"
                    )

        return caller_map
```
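One cost worth flagging: `extract_function_calls(other_chunk.content)` sits inside the nested loop, so each chunk is re-parsed once per named target, roughly O(n·m) `ast.parse` calls for n targets and m chunks; presumably this is why `compute_and_store` no longer invokes it eagerly. A chunk's call set cannot change within a pass, so caching it would cut this to one parse per chunk. A hypothetical tweak, not in the released code:

```python
# Hypothetical per-pass cache: one ast.parse per chunk instead of one per pair.
call_sets: dict[int, set[str]] = {}

def calls_of(chunk) -> set[str]:
    key = id(chunk)  # keyed by object identity; chunks may not be hashable
    if key not in call_sets:
        call_sets[key] = extract_function_calls(chunk.content)
    return call_sets[key]

# ...the inner check then becomes: if function_name in calls_of(other_chunk):
```

The remaining methods handle persistence: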
```python
    def load(self) -> dict[str, Any]:
        """Load pre-computed relationships from disk.

        Returns:
            Dictionary with semantic and caller relationships, or empty structure if not found
        """
        if not self.store_path.exists():
            logger.warning(
                f"No pre-computed relationships found at {self.store_path}. "
                "Run 'mcp-vector-search index' to compute relationships."
            )
            return {"semantic": [], "callers": {}}

        try:
            with open(self.store_path) as f:
                data = json.load(f)

            logger.info(
                f"✓ Loaded {len(data.get('semantic', []))} semantic links and "
                f"{sum(len(callers) for callers in data.get('callers', {}).values())} "
                f"caller relationships (computed {data.get('computed_at', 'unknown')})"
            )

            return data
        except Exception as e:
            logger.error(f"Failed to load relationships: {e}")
            return {"semantic": [], "callers": {}}

    def exists(self) -> bool:
        """Check if pre-computed relationships exist.

        Returns:
            True if relationships file exists
        """
        return self.store_path.exists()

    def invalidate(self) -> None:
        """Delete stored relationships (called when index changes).

        This forces re-computation on next full index.
        """
        if self.store_path.exists():
            self.store_path.unlink()
            logger.debug("Invalidated pre-computed relationships")
```
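End to end, a consumer such as the visualization server would drive the store roughly as follows; `chunks` and `database` stand in for the indexer's real objects:

```python
import asyncio
from pathlib import Path

async def refresh_relationships(chunks, database):
    store = RelationshipStore(Path.cwd())

    store.invalidate()  # the index changed; drop the stale file
    stats = await store.compute_and_store(chunks, database, max_concurrent_queries=50)
    print(f"{stats['semantic_links']} links in {stats['computation_time']:.1f}s")

    if store.exists():
        data = store.load()
        for link in data["semantic"][:5]:
            print(link["source"], "->", link["target"], round(link["similarity"], 2))
```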