mcp-vector-search 1.0.3__py3-none-any.whl → 1.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_vector_search/__init__.py +3 -3
- mcp_vector_search/analysis/__init__.py +48 -1
- mcp_vector_search/analysis/baseline/__init__.py +68 -0
- mcp_vector_search/analysis/baseline/comparator.py +462 -0
- mcp_vector_search/analysis/baseline/manager.py +621 -0
- mcp_vector_search/analysis/collectors/__init__.py +35 -0
- mcp_vector_search/analysis/collectors/cohesion.py +463 -0
- mcp_vector_search/analysis/collectors/coupling.py +1162 -0
- mcp_vector_search/analysis/collectors/halstead.py +514 -0
- mcp_vector_search/analysis/collectors/smells.py +325 -0
- mcp_vector_search/analysis/debt.py +516 -0
- mcp_vector_search/analysis/interpretation.py +685 -0
- mcp_vector_search/analysis/metrics.py +74 -1
- mcp_vector_search/analysis/reporters/__init__.py +3 -1
- mcp_vector_search/analysis/reporters/console.py +424 -0
- mcp_vector_search/analysis/reporters/markdown.py +480 -0
- mcp_vector_search/analysis/reporters/sarif.py +377 -0
- mcp_vector_search/analysis/storage/__init__.py +93 -0
- mcp_vector_search/analysis/storage/metrics_store.py +762 -0
- mcp_vector_search/analysis/storage/schema.py +245 -0
- mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
- mcp_vector_search/analysis/trends.py +308 -0
- mcp_vector_search/analysis/visualizer/__init__.py +90 -0
- mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
- mcp_vector_search/analysis/visualizer/exporter.py +484 -0
- mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
- mcp_vector_search/analysis/visualizer/schemas.py +525 -0
- mcp_vector_search/cli/commands/analyze.py +665 -11
- mcp_vector_search/cli/commands/chat.py +193 -0
- mcp_vector_search/cli/commands/index.py +600 -2
- mcp_vector_search/cli/commands/index_background.py +467 -0
- mcp_vector_search/cli/commands/search.py +194 -1
- mcp_vector_search/cli/commands/setup.py +64 -13
- mcp_vector_search/cli/commands/status.py +302 -3
- mcp_vector_search/cli/commands/visualize/cli.py +26 -10
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +8 -4
- mcp_vector_search/cli/commands/visualize/graph_builder.py +167 -234
- mcp_vector_search/cli/commands/visualize/server.py +304 -15
- mcp_vector_search/cli/commands/visualize/templates/base.py +60 -6
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +2100 -65
- mcp_vector_search/cli/commands/visualize/templates/styles.py +1297 -88
- mcp_vector_search/cli/didyoumean.py +5 -0
- mcp_vector_search/cli/main.py +16 -5
- mcp_vector_search/cli/output.py +134 -5
- mcp_vector_search/config/thresholds.py +89 -1
- mcp_vector_search/core/__init__.py +16 -0
- mcp_vector_search/core/database.py +39 -2
- mcp_vector_search/core/embeddings.py +24 -0
- mcp_vector_search/core/git.py +380 -0
- mcp_vector_search/core/indexer.py +445 -84
- mcp_vector_search/core/llm_client.py +9 -4
- mcp_vector_search/core/models.py +88 -1
- mcp_vector_search/core/relationships.py +473 -0
- mcp_vector_search/core/search.py +1 -1
- mcp_vector_search/mcp/server.py +795 -4
- mcp_vector_search/parsers/python.py +285 -5
- mcp_vector_search/utils/gitignore.py +0 -3
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +3 -2
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/RECORD +62 -39
- mcp_vector_search/cli/commands/visualize.py.original +0 -2536
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +0 -0
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0
|
@@ -15,7 +15,7 @@ from .exceptions import SearchError
|
|
|
15
15
|
LLMProvider = Literal["openai", "openrouter"]
|
|
16
16
|
|
|
17
17
|
# Type alias for intent
|
|
18
|
-
IntentType = Literal["find", "answer"]
|
|
18
|
+
IntentType = Literal["find", "answer", "analyze"]
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class LLMClient:
|
|
@@ -480,7 +480,7 @@ Select the top {top_n} most relevant results:"""
|
|
|
480
480
|
query: User's natural language query
|
|
481
481
|
|
|
482
482
|
Returns:
|
|
483
|
-
Intent type: "find" or "answer"
|
|
483
|
+
Intent type: "find", "answer", or "analyze"
|
|
484
484
|
|
|
485
485
|
Raises:
|
|
486
486
|
SearchError: If API call fails
|
|
@@ -493,7 +493,12 @@ Select the top {top_n} most relevant results:"""
|
|
|
493
493
|
2. "answer" - User wants an explanation/answer about the codebase
|
|
494
494
|
Examples: "what does this do", "how does X work", "explain the architecture", "why is X used"
|
|
495
495
|
|
|
496
|
-
|
|
496
|
+
3. "analyze" - User wants analysis of code quality, metrics, complexity, or smells
|
|
497
|
+
Examples: "what's complex", "code smells", "cognitive complexity", "quality issues",
|
|
498
|
+
"dependencies", "coupling", "circular dependencies", "getting worse", "improving",
|
|
499
|
+
"analyze the complexity", "find the worst code", "most complex functions"
|
|
500
|
+
|
|
501
|
+
Return ONLY the word "find", "answer", or "analyze" with no other text."""
|
|
497
502
|
|
|
498
503
|
user_prompt = f"""Query: {query}
|
|
499
504
|
|
|
@@ -512,7 +517,7 @@ Intent:"""
|
|
|
512
517
|
)
|
|
513
518
|
intent = content.strip().lower()
|
|
514
519
|
|
|
515
|
-
if intent not in ("find", "answer"):
|
|
520
|
+
if intent not in ("find", "answer", "analyze"):
|
|
516
521
|
# Default to find if unclear
|
|
517
522
|
logger.warning(
|
|
518
523
|
f"Unclear intent '{intent}' for query '{query}', defaulting to 'find'"
|
mcp_vector_search/core/models.py
CHANGED
|
@@ -151,6 +151,33 @@ class SearchResult(BaseModel):
|
|
|
151
151
|
default=False, description="True if file no longer exists (stale index)"
|
|
152
152
|
)
|
|
153
153
|
|
|
154
|
+
# Quality metrics (from structural analysis)
|
|
155
|
+
cognitive_complexity: int | None = Field(
|
|
156
|
+
default=None, description="Cognitive complexity score"
|
|
157
|
+
)
|
|
158
|
+
cyclomatic_complexity: int | None = Field(
|
|
159
|
+
default=None, description="Cyclomatic complexity score"
|
|
160
|
+
)
|
|
161
|
+
max_nesting_depth: int | None = Field(
|
|
162
|
+
default=None, description="Maximum nesting depth"
|
|
163
|
+
)
|
|
164
|
+
parameter_count: int | None = Field(
|
|
165
|
+
default=None, description="Number of function parameters"
|
|
166
|
+
)
|
|
167
|
+
lines_of_code: int | None = Field(
|
|
168
|
+
default=None, description="Lines of code in chunk"
|
|
169
|
+
)
|
|
170
|
+
complexity_grade: str | None = Field(
|
|
171
|
+
default=None, description="Complexity grade (A-F)"
|
|
172
|
+
)
|
|
173
|
+
code_smells: list[str] = Field(default=[], description="Detected code smells")
|
|
174
|
+
smell_count: int | None = Field(
|
|
175
|
+
default=None, description="Number of code smells detected"
|
|
176
|
+
)
|
|
177
|
+
quality_score: int | None = Field(
|
|
178
|
+
default=None, description="Overall quality score (0-100)"
|
|
179
|
+
)
|
|
180
|
+
|
|
154
181
|
class Config:
|
|
155
182
|
arbitrary_types_allowed = True
|
|
156
183
|
|
|
@@ -164,9 +191,47 @@ class SearchResult(BaseModel):
|
|
|
164
191
|
"""Get a human-readable location string."""
|
|
165
192
|
return f"{self.file_path}:{self.start_line}-{self.end_line}"
|
|
166
193
|
|
|
194
|
+
def calculate_quality_score(self) -> int:
|
|
195
|
+
"""Calculate quality score based on complexity grade and code smells.
|
|
196
|
+
|
|
197
|
+
Formula:
|
|
198
|
+
- Base: complexity_grade (A=100, B=80, C=60, D=40, F=20)
|
|
199
|
+
- Penalty: -10 per code smell
|
|
200
|
+
- Bonus: +20 if no smells (already factored into base if no smells)
|
|
201
|
+
|
|
202
|
+
Returns:
|
|
203
|
+
Quality score (0-100), or None if no quality metrics available
|
|
204
|
+
"""
|
|
205
|
+
# If no quality metrics, return None (will be stored in quality_score field)
|
|
206
|
+
if self.complexity_grade is None:
|
|
207
|
+
return None
|
|
208
|
+
|
|
209
|
+
# Map complexity grade to base score
|
|
210
|
+
grade_scores = {
|
|
211
|
+
"A": 100,
|
|
212
|
+
"B": 80,
|
|
213
|
+
"C": 60,
|
|
214
|
+
"D": 40,
|
|
215
|
+
"F": 20,
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
base_score = grade_scores.get(self.complexity_grade, 0)
|
|
219
|
+
|
|
220
|
+
# Apply smell penalty
|
|
221
|
+
smell_count = self.smell_count or 0
|
|
222
|
+
penalty = smell_count * 10
|
|
223
|
+
|
|
224
|
+
# Calculate final score (with bonus for no smells already in base)
|
|
225
|
+
# Bonus: +20 if no smells (effectively makes A without smells = 100+20 = 120, capped at 100)
|
|
226
|
+
bonus = 20 if smell_count == 0 else 0
|
|
227
|
+
quality_score = base_score - penalty + bonus
|
|
228
|
+
|
|
229
|
+
# Clamp to 0-100 range
|
|
230
|
+
return max(0, min(100, quality_score))
|
|
231
|
+
|
|
167
232
|
def to_dict(self) -> dict[str, Any]:
|
|
168
233
|
"""Convert to dictionary for serialization."""
|
|
169
|
-
|
|
234
|
+
result = {
|
|
170
235
|
"content": self.content,
|
|
171
236
|
"file_path": str(self.file_path),
|
|
172
237
|
"start_line": self.start_line,
|
|
@@ -184,6 +249,28 @@ class SearchResult(BaseModel):
|
|
|
184
249
|
"line_count": self.line_count,
|
|
185
250
|
}
|
|
186
251
|
|
|
252
|
+
# Add quality metrics if available
|
|
253
|
+
if self.cognitive_complexity is not None:
|
|
254
|
+
result["cognitive_complexity"] = self.cognitive_complexity
|
|
255
|
+
if self.cyclomatic_complexity is not None:
|
|
256
|
+
result["cyclomatic_complexity"] = self.cyclomatic_complexity
|
|
257
|
+
if self.max_nesting_depth is not None:
|
|
258
|
+
result["max_nesting_depth"] = self.max_nesting_depth
|
|
259
|
+
if self.parameter_count is not None:
|
|
260
|
+
result["parameter_count"] = self.parameter_count
|
|
261
|
+
if self.lines_of_code is not None:
|
|
262
|
+
result["lines_of_code"] = self.lines_of_code
|
|
263
|
+
if self.complexity_grade is not None:
|
|
264
|
+
result["complexity_grade"] = self.complexity_grade
|
|
265
|
+
if self.code_smells:
|
|
266
|
+
result["code_smells"] = self.code_smells
|
|
267
|
+
if self.smell_count is not None:
|
|
268
|
+
result["smell_count"] = self.smell_count
|
|
269
|
+
if self.quality_score is not None:
|
|
270
|
+
result["quality_score"] = self.quality_score
|
|
271
|
+
|
|
272
|
+
return result
|
|
273
|
+
|
|
187
274
|
|
|
188
275
|
class IndexStats(BaseModel):
|
|
189
276
|
"""Statistics about the search index."""
|
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
"""Pre-computed relationship storage for instant visualization.
|
|
2
|
+
|
|
3
|
+
This module handles computing and storing code chunk relationships at index time,
|
|
4
|
+
eliminating the expensive computation during visualization startup.
|
|
5
|
+
|
|
6
|
+
Relationships stored:
|
|
7
|
+
- Semantic relationships: Which chunks are similar (based on embeddings)
|
|
8
|
+
- Caller relationships: Which chunks call which (based on AST analysis)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import ast
|
|
12
|
+
import asyncio
|
|
13
|
+
import json
|
|
14
|
+
import time
|
|
15
|
+
from datetime import UTC, datetime
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from loguru import logger
|
|
20
|
+
from rich.console import Console
|
|
21
|
+
from rich.progress import (
|
|
22
|
+
BarColumn,
|
|
23
|
+
Progress,
|
|
24
|
+
SpinnerColumn,
|
|
25
|
+
TaskProgressColumn,
|
|
26
|
+
TextColumn,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
from .models import CodeChunk
|
|
30
|
+
|
|
31
|
+
console = Console()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def extract_function_calls(code: str) -> set[str]:
|
|
35
|
+
"""Extract actual function calls from Python code using AST.
|
|
36
|
+
|
|
37
|
+
Returns set of function names that are actually called (not just mentioned).
|
|
38
|
+
Avoids false positives from comments, docstrings, and string literals.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
code: Python source code to analyze
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
Set of function names that are actually called in the code
|
|
45
|
+
"""
|
|
46
|
+
calls = set()
|
|
47
|
+
try:
|
|
48
|
+
tree = ast.parse(code)
|
|
49
|
+
for node in ast.walk(tree):
|
|
50
|
+
if isinstance(node, ast.Call):
|
|
51
|
+
# Handle direct calls: foo()
|
|
52
|
+
if isinstance(node.func, ast.Name):
|
|
53
|
+
calls.add(node.func.id)
|
|
54
|
+
# Handle method calls: obj.foo() - extract 'foo'
|
|
55
|
+
elif isinstance(node.func, ast.Attribute):
|
|
56
|
+
calls.add(node.func.attr)
|
|
57
|
+
return calls
|
|
58
|
+
except SyntaxError:
|
|
59
|
+
# If code can't be parsed (incomplete, etc.), fall back to empty set
|
|
60
|
+
# This is safer than false positives from naive substring matching
|
|
61
|
+
return set()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Keywords (compared lower-cased) that are not meaningful as chunk labels.
_CHUNK_NAME_SKIP_WORDS = frozenset(
    {
        "def",
        "class",
        "function",
        "const",
        "let",
        "var",
        "import",
        "from",
        "return",
        "if",
        "else",
        "elif",
        "for",
        "while",
        "try",
        "except",
        "finally",
        "with",
        "as",
        "async",
        "await",
        "yield",
        "self",
        "this",
        "true",
        "false",
        "none",
        "null",
        "undefined",
        "public",
        "private",
        "protected",
        "static",
        "export",
        "default",
    }
)


def extract_chunk_name(content: str, fallback: str = "chunk") -> str:
    """Extract first meaningful word from chunk content for labeling.

    Scans *content* left to right for identifier-like words (a letter or
    underscore followed by at least one letter, digit or underscore, so
    single-character names are ignored) and returns the first one whose
    lower-cased form is not a common keyword.

    Args:
        content: The chunk's code content
        fallback: Fallback name if no meaningful word found

    Returns:
        First meaningful identifier found in the content, or *fallback*
    """
    import re

    # finditer is lazy: scanning stops at the first acceptable word instead
    # of collecting every identifier in a potentially large chunk.
    for match in re.finditer(r"\b[a-zA-Z_][a-zA-Z0-9_]+\b", content):
        word = match.group(0)
        if word.lower() not in _CHUNK_NAME_SKIP_WORDS:
            return word

    return fallback
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class RelationshipStore:
    """Store and load pre-computed chunk relationships.

    Relationships are stored in .mcp-vector-search/relationships.json and include:
    - Semantic links (similar chunks based on embeddings)
    - Caller links (which chunks call which)
    - Metadata (chunk count, computation time, version)
    """

    # Chunk types that take part in relationship computation.
    _CODE_CHUNK_TYPES = ("function", "method", "class")
    # On-disk schema version ("1.1" = callers are lazy-loaded).
    _SCHEMA_VERSION = "1.1"
    # Semantic search tuning.
    _MAX_LINKS_PER_CHUNK = 5
    _QUERY_PREFIX_CHARS = 500
    _SEARCH_SIMILARITY_THRESHOLD = 0.3
    _MIN_LINK_SIMILARITY = 0.2

    def __init__(self, project_root: Path):
        """Initialize relationship store.

        Args:
            project_root: Root directory of the project
        """
        self.project_root = project_root
        self.store_path = project_root / ".mcp-vector-search" / "relationships.json"

    def _build_payload(
        self,
        chunk_count: int,
        code_chunk_count: int,
        semantic_links: list[dict[str, Any]],
        elapsed: float,
        status: str,
    ) -> dict[str, Any]:
        """Assemble the JSON document written to ``store_path``."""
        return {
            "version": self._SCHEMA_VERSION,
            "computed_at": datetime.now(UTC).isoformat(),
            "chunk_count": chunk_count,
            "code_chunk_count": code_chunk_count,
            "computation_time_seconds": elapsed,
            "semantic": semantic_links,
            "callers": {},  # Empty - loaded on-demand via /api/callers/{chunk_id}
            "status": status,
        }

    def _save(self, relationships: dict[str, Any]) -> None:
        """Write *relationships* to ``store_path``, creating parent directories."""
        self.store_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.store_path, "w", encoding="utf-8") as f:
            json.dump(relationships, f, indent=2)

    async def compute_and_store(
        self,
        chunks: list[CodeChunk],
        database: Any,
        max_concurrent_queries: int = 50,
        background: bool = False,
    ) -> dict[str, Any]:
        """Compute relationships and save to disk.

        This is called during indexing to pre-compute expensive relationships.
        NOTE: Caller relationships are now lazy-loaded via /api/callers/{chunk_id}
        to avoid the expensive O(n²) computation at startup.

        Args:
            chunks: List of all code chunks
            database: Vector database instance for semantic search
            max_concurrent_queries: Maximum number of concurrent database queries (default: 50)
            background: If True, skip computation, write a "pending"
                placeholder file and return immediately (for background processing)

        Returns:
            Dictionary with relationship statistics
        """
        logger.info("Computing relationships for visualization...")
        start_time = time.time()

        # Filter to code chunks only
        code_chunks = [c for c in chunks if c.chunk_type in self._CODE_CHUNK_TYPES]

        # Background mode: write an empty placeholder marked "pending"; the
        # actual computation is expected to happen in a background task.
        if background:
            self._save(
                self._build_payload(len(chunks), len(code_chunks), [], 0, "pending")
            )
            logger.info("✓ Relationships marked for background computation")
            return {
                "semantic_links": 0,
                "caller_relationships": 0,
                "computation_time": 0,
                "background": True,
            }

        # Compute semantic relationships only; caller relationships are
        # lazy-loaded on-demand via API.
        logger.info(
            f"Computing semantic relationships for {len(code_chunks)} chunks "
            f"(max {max_concurrent_queries} concurrent queries)..."
        )
        semantic_links = await self._compute_semantic_relationships(
            code_chunks, database, max_concurrent_queries
        )

        elapsed = time.time() - start_time

        self._save(
            self._build_payload(
                len(chunks), len(code_chunks), semantic_links, elapsed, "complete"
            )
        )

        logger.info(
            f"✓ Computed {len(semantic_links)} semantic links in {elapsed:.1f}s "
            "(callers lazy-loaded on-demand)"
        )

        return {
            "semantic_links": len(semantic_links),
            "caller_relationships": 0,  # Now lazy-loaded
            "computation_time": elapsed,
        }

    async def _compute_semantic_relationships(
        self,
        code_chunks: list[CodeChunk],
        database: Any,
        max_concurrent_queries: int = 50,
    ) -> list[dict[str, Any]]:
        """Compute semantic similarity relationships between chunks using async parallel processing.

        Args:
            code_chunks: List of code chunks (functions, methods, classes)
            database: Vector database for similarity search
            max_concurrent_queries: Maximum number of concurrent database queries
                (default: 50); values below 1 are treated as 1

        Returns:
            List of semantic link dictionaries with keys ``source``,
            ``target``, ``type`` and ``similarity``
        """
        total_chunks = len(code_chunks)
        # A semaphore of 0 would block every task forever, so clamp to 1.
        semaphore = asyncio.Semaphore(max(1, max_concurrent_queries))
        completed_count = 0

        # Index chunks by location once so each search hit resolves in O(1)
        # instead of scanning every chunk. setdefault keeps the first chunk
        # for a location, matching a first-match linear scan.
        chunks_by_location: dict[tuple[str, Any, Any], CodeChunk] = {}
        for candidate in code_chunks:
            chunks_by_location.setdefault(
                (str(candidate.file_path), candidate.start_line, candidate.end_line),
                candidate,
            )

        # Use Rich progress bar
        with Progress(
            SpinnerColumn(),
            TextColumn("[cyan]Computing semantic relationships...[/cyan]"),
            BarColumn(bar_width=40),
            TaskProgressColumn(),
            TextColumn("[dim]{task.completed}/{task.total} chunks[/dim]"),
            console=console,
            transient=False,
        ) as progress:
            task = progress.add_task("semantic", total=total_chunks)

            async def process_chunk(chunk: CodeChunk) -> list[dict[str, Any]]:
                """Process a single chunk and return its semantic links."""
                nonlocal completed_count

                async with semaphore:
                    try:
                        # Query one more than needed: the chunk itself is
                        # usually among its own nearest neighbours.
                        similar_results = await database.search(
                            query=chunk.content[: self._QUERY_PREFIX_CHARS],
                            limit=self._MAX_LINKS_PER_CHUNK + 1,
                            similarity_threshold=self._SEARCH_SIMILARITY_THRESHOLD,
                        )

                        chunk_links: list[dict[str, Any]] = []
                        source_chunk_id = chunk.chunk_id or chunk.id

                        for result in similar_results:
                            target_chunk = chunks_by_location.get(
                                (
                                    str(result.file_path),
                                    result.start_line,
                                    result.end_line,
                                )
                            )
                            if target_chunk is None:
                                # Hit is not a function/method/class chunk.
                                continue

                            target_chunk_id = target_chunk.chunk_id or target_chunk.id

                            # Skip self-references
                            if target_chunk_id == source_chunk_id:
                                continue

                            if result.similarity_score >= self._MIN_LINK_SIMILARITY:
                                chunk_links.append(
                                    {
                                        "source": source_chunk_id,
                                        "target": target_chunk_id,
                                        "type": "semantic",
                                        "similarity": result.similarity_score,
                                    }
                                )

                            # Only keep top links per chunk
                            if len(chunk_links) >= self._MAX_LINKS_PER_CHUNK:
                                break

                        return chunk_links

                    except Exception as e:
                        # Best effort: one failing query must not abort the run.
                        logger.debug(
                            f"Failed to compute semantic for {chunk.chunk_id}: {e}"
                        )
                        return []

                    finally:
                        # Advance the progress bar on success and failure alike.
                        completed_count += 1
                        progress.update(task, completed=completed_count)

            # Process all chunks in parallel (bounded by the semaphore)
            results = await asyncio.gather(
                *(process_chunk(chunk) for chunk in code_chunks),
                return_exceptions=True,
            )

        # Flatten results. BaseException (not just Exception) is checked
        # because gather can hand back CancelledError instances.
        semantic_links: list[dict[str, Any]] = []
        for result in results:
            if isinstance(result, BaseException):
                logger.debug(f"Task failed with exception: {result}")
                continue
            semantic_links.extend(result)

        return semantic_links

    def _compute_caller_relationships(
        self, chunks: list[CodeChunk]
    ) -> dict[str, list[dict[str, Any]]]:
        """Compute which chunks call which other chunks.

        Only external callers (chunks in a different file than the callee)
        are recorded, and callers named ``__init__`` are skipped as noise.

        Args:
            chunks: List of all code chunks

        Returns:
            Map of chunk_id -> list of caller info dictionaries with keys
            ``file``, ``chunk_id``, ``name`` and ``type``
        """
        caller_map: dict[str, list[dict[str, Any]]] = {}

        code_chunks = [c for c in chunks if c.chunk_type in self._CODE_CHUNK_TYPES]

        logger.debug(f"Processing {len(code_chunks)} code chunks for callers...")

        if not code_chunks:
            return caller_map

        # Parse every chunk exactly once and index potential callers by the
        # names they call, instead of re-parsing each chunk per callee.
        callers_by_name: dict[str, list[dict[str, Any]]] = {}
        for other_chunk in chunks:
            called_names = extract_function_calls(other_chunk.content)
            if not called_names:
                continue

            # Generate meaningful caller name
            other_name = other_chunk.function_name or other_chunk.class_name
            if not other_name:
                other_name = extract_chunk_name(
                    other_chunk.content,
                    fallback=f"chunk_{other_chunk.start_line}",
                )

            # Skip __init__ functions as callers (noise)
            if other_name == "__init__":
                continue

            caller_info = {
                "file": str(other_chunk.file_path),
                "chunk_id": other_chunk.chunk_id or other_chunk.id,
                "name": other_name,
                "type": other_chunk.chunk_type,
            }
            for called in called_names:
                callers_by_name.setdefault(called, []).append(caller_info)

        for chunk_idx, chunk in enumerate(code_chunks):
            if chunk_idx % 50 == 0:  # Progress
                logger.debug(f"Callers: {chunk_idx}/{len(code_chunks)} chunks")

            function_name = chunk.function_name or chunk.class_name
            if not function_name:
                continue

            chunk_id = chunk.chunk_id or chunk.id
            file_path = str(chunk.file_path)

            for caller_info in callers_by_name.get(function_name, ()):
                # Only track EXTERNAL callers (different file)
                if caller_info["file"] == file_path:
                    continue

                # Copy so entries of different callees never share a dict.
                caller_map.setdefault(chunk_id, []).append(dict(caller_info))

                logger.debug(
                    f"Found call: {caller_info['name']} ({caller_info['file']}) -> "
                    f"{function_name} ({file_path})"
                )

        return caller_map

    def load(self) -> dict[str, Any]:
        """Load pre-computed relationships from disk.

        Returns:
            Dictionary with semantic and caller relationships, or an empty
            structure if the file is missing or cannot be read
        """
        if not self.store_path.exists():
            logger.warning(
                f"No pre-computed relationships found at {self.store_path}. "
                "Run 'mcp-vector-search index' to compute relationships."
            )
            return {"semantic": [], "callers": {}}

        try:
            with open(self.store_path, encoding="utf-8") as f:
                data = json.load(f)

            logger.info(
                f"✓ Loaded {len(data.get('semantic', []))} semantic links and "
                f"{sum(len(callers) for callers in data.get('callers', {}).values())} "
                f"caller relationships (computed {data.get('computed_at', 'unknown')})"
            )

            return data
        except Exception as e:
            # Deliberately broad: a corrupt or unexpected file must degrade
            # to "no relationships" rather than crash the caller.
            logger.error(f"Failed to load relationships: {e}")
            return {"semantic": [], "callers": {}}

    def exists(self) -> bool:
        """Check if pre-computed relationships exist.

        Returns:
            True if relationships file exists
        """
        return self.store_path.exists()

    def invalidate(self) -> None:
        """Delete stored relationships (called when index changes).

        This forces re-computation on next full index.
        """
        try:
            self.store_path.unlink()
        except FileNotFoundError:
            # Nothing stored (or removed concurrently): nothing to invalidate.
            return
        logger.debug("Invalidated pre-computed relationships")
|
mcp_vector_search/core/search.py
CHANGED
|
@@ -217,7 +217,7 @@ class SemanticSearchEngine:
|
|
|
217
217
|
|
|
218
218
|
except BaseException as e:
|
|
219
219
|
# Re-raise system exceptions we should never catch
|
|
220
|
-
if isinstance(e, (KeyboardInterrupt, SystemExit, GeneratorExit)):
|
|
220
|
+
if isinstance(e, KeyboardInterrupt | SystemExit | GeneratorExit):
|
|
221
221
|
raise
|
|
222
222
|
|
|
223
223
|
last_error = e
|