mcp-vector-search 0.12.6__py3-none-any.whl → 1.1.22__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- mcp_vector_search/__init__.py +3 -3
- mcp_vector_search/analysis/__init__.py +111 -0
- mcp_vector_search/analysis/baseline/__init__.py +68 -0
- mcp_vector_search/analysis/baseline/comparator.py +462 -0
- mcp_vector_search/analysis/baseline/manager.py +621 -0
- mcp_vector_search/analysis/collectors/__init__.py +74 -0
- mcp_vector_search/analysis/collectors/base.py +164 -0
- mcp_vector_search/analysis/collectors/cohesion.py +463 -0
- mcp_vector_search/analysis/collectors/complexity.py +743 -0
- mcp_vector_search/analysis/collectors/coupling.py +1162 -0
- mcp_vector_search/analysis/collectors/halstead.py +514 -0
- mcp_vector_search/analysis/collectors/smells.py +325 -0
- mcp_vector_search/analysis/debt.py +516 -0
- mcp_vector_search/analysis/interpretation.py +685 -0
- mcp_vector_search/analysis/metrics.py +414 -0
- mcp_vector_search/analysis/reporters/__init__.py +7 -0
- mcp_vector_search/analysis/reporters/console.py +646 -0
- mcp_vector_search/analysis/reporters/markdown.py +480 -0
- mcp_vector_search/analysis/reporters/sarif.py +377 -0
- mcp_vector_search/analysis/storage/__init__.py +93 -0
- mcp_vector_search/analysis/storage/metrics_store.py +762 -0
- mcp_vector_search/analysis/storage/schema.py +245 -0
- mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
- mcp_vector_search/analysis/trends.py +308 -0
- mcp_vector_search/analysis/visualizer/__init__.py +90 -0
- mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
- mcp_vector_search/analysis/visualizer/exporter.py +484 -0
- mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
- mcp_vector_search/analysis/visualizer/schemas.py +525 -0
- mcp_vector_search/cli/commands/analyze.py +1062 -0
- mcp_vector_search/cli/commands/chat.py +1455 -0
- mcp_vector_search/cli/commands/index.py +621 -5
- mcp_vector_search/cli/commands/index_background.py +467 -0
- mcp_vector_search/cli/commands/init.py +13 -0
- mcp_vector_search/cli/commands/install.py +597 -335
- mcp_vector_search/cli/commands/install_old.py +8 -4
- mcp_vector_search/cli/commands/mcp.py +78 -6
- mcp_vector_search/cli/commands/reset.py +68 -26
- mcp_vector_search/cli/commands/search.py +224 -8
- mcp_vector_search/cli/commands/setup.py +1184 -0
- mcp_vector_search/cli/commands/status.py +339 -5
- mcp_vector_search/cli/commands/uninstall.py +276 -357
- mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
- mcp_vector_search/cli/commands/visualize/cli.py +292 -0
- mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
- mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/graph_builder.py +647 -0
- mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
- mcp_vector_search/cli/commands/visualize/server.py +600 -0
- mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
- mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
- mcp_vector_search/cli/commands/visualize/templates/base.py +234 -0
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +4542 -0
- mcp_vector_search/cli/commands/visualize/templates/styles.py +2522 -0
- mcp_vector_search/cli/didyoumean.py +27 -2
- mcp_vector_search/cli/main.py +127 -160
- mcp_vector_search/cli/output.py +158 -13
- mcp_vector_search/config/__init__.py +4 -0
- mcp_vector_search/config/default_thresholds.yaml +52 -0
- mcp_vector_search/config/settings.py +12 -0
- mcp_vector_search/config/thresholds.py +273 -0
- mcp_vector_search/core/__init__.py +16 -0
- mcp_vector_search/core/auto_indexer.py +3 -3
- mcp_vector_search/core/boilerplate.py +186 -0
- mcp_vector_search/core/config_utils.py +394 -0
- mcp_vector_search/core/database.py +406 -94
- mcp_vector_search/core/embeddings.py +24 -0
- mcp_vector_search/core/exceptions.py +11 -0
- mcp_vector_search/core/git.py +380 -0
- mcp_vector_search/core/git_hooks.py +4 -4
- mcp_vector_search/core/indexer.py +632 -54
- mcp_vector_search/core/llm_client.py +756 -0
- mcp_vector_search/core/models.py +91 -1
- mcp_vector_search/core/project.py +17 -0
- mcp_vector_search/core/relationships.py +473 -0
- mcp_vector_search/core/scheduler.py +11 -11
- mcp_vector_search/core/search.py +179 -29
- mcp_vector_search/mcp/server.py +819 -9
- mcp_vector_search/parsers/python.py +285 -5
- mcp_vector_search/utils/__init__.py +2 -0
- mcp_vector_search/utils/gitignore.py +0 -3
- mcp_vector_search/utils/gitignore_updater.py +212 -0
- mcp_vector_search/utils/monorepo.py +66 -4
- mcp_vector_search/utils/timing.py +10 -6
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +184 -53
- mcp_vector_search-1.1.22.dist-info/RECORD +120 -0
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +1 -1
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +1 -0
- mcp_vector_search/cli/commands/visualize.py +0 -1467
- mcp_vector_search-0.12.6.dist-info/RECORD +0 -68
- {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0
Expanded diff for mcp_vector_search/core/database.py (+406 -94):

@@ -44,11 +44,14 @@ class VectorDatabase(ABC):
         ...
 
     @abstractmethod
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database with optional structural metrics.
 
         Args:
             chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
         """
         ...
 
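The new metrics parameter is keyed by chunk ID and merged into each chunk's ChromaDB metadata (see the metadata.update(...) hunks further down). A minimal calling sketch with illustrative metric keys taken from the fields this diff reads back out later; the caller-side plumbing here is an assumption, not code from this release:

# Hypothetical caller-side sketch (not from the package): build a metrics dict
# keyed by chunk_id and pass it alongside the chunks.
metrics = {
    chunk.chunk_id: {
        "cognitive_complexity": 5,
        "cyclomatic_complexity": 3,
        "max_nesting_depth": 2,
        "smell_count": 0,
        "complexity_grade": "A",
    }
    for chunk in chunks
    if chunk.chunk_id
}
await database.add_chunks(chunks, metrics=metrics)
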
@@ -148,6 +151,7 @@ class ChromaVectorDatabase(VectorDatabase):
         self.collection_name = collection_name
         self._client = None
         self._collection = None
+        self._recovery_attempted = False  # Guard against infinite recursion
 
     async def initialize(self) -> None:
         """Initialize ChromaDB client and collection with corruption recovery."""
@@ -157,49 +161,144 @@ class ChromaVectorDatabase(VectorDatabase):
             # Ensure directory exists
             self.persist_directory.mkdir(parents=True, exist_ok=True)
 
-            # Check for corruption before initializing
+            # LAYER 1: Check for corruption before initializing (SQLite + HNSW checks)
             await self._detect_and_recover_corruption()
 
+            # LAYER 2: Wrap ChromaDB initialization with Rust panic detection
+            try:
+                # Create client with new API
+                self._client = chromadb.PersistentClient(
+                    path=str(self.persist_directory),
+                    settings=chromadb.Settings(
+                        anonymized_telemetry=False,
+                        allow_reset=True,
+                    ),
+                )
 
+                # Create or get collection
+                self._collection = self._client.get_or_create_collection(
+                    name=self.collection_name,
+                    embedding_function=self.embedding_function,
+                    metadata={
+                        "description": "Semantic code search collection",
+                    },
+                )
+
+                # Reset recovery flag on successful initialization
+                self._recovery_attempted = False
+
+                logger.debug(f"ChromaDB initialized at {self.persist_directory}")
+
+            except BaseException as init_error:
+                # Re-raise system exceptions we should never catch
+                if isinstance(
+                    init_error, KeyboardInterrupt | SystemExit | GeneratorExit
+                ):
+                    raise
+
+                # LAYER 2: Detect Rust panic patterns during initialization
+                error_msg = str(init_error).lower()
+
+                # Rust panic patterns (common ChromaDB Rust panics)
+                rust_panic_patterns = [
+                    "range start index",
+                    "out of range",
+                    "panic",
+                    "thread panicked",
+                    "slice of length",
+                    "index out of bounds",
+                ]
+
+                if any(pattern in error_msg for pattern in rust_panic_patterns):
+                    logger.warning(
+                        f"Rust panic detected during ChromaDB initialization: {init_error}"
+                    )
+                    logger.info(
+                        "Attempting automatic recovery from database corruption..."
+                    )
+                    await self._recover_from_corruption()
+
+                    # Retry initialization ONCE after recovery
+                    try:
+                        logger.info(
+                            "Retrying ChromaDB initialization after recovery..."
+                        )
+                        self._client = chromadb.PersistentClient(
+                            path=str(self.persist_directory),
+                            settings=chromadb.Settings(
+                                anonymized_telemetry=False,
+                                allow_reset=True,
+                            ),
+                        )
+
+                        self._collection = self._client.get_or_create_collection(
+                            name=self.collection_name,
+                            embedding_function=self.embedding_function,
+                            metadata={
+                                "description": "Semantic code search collection",
+                            },
+                        )
+
+                        logger.info("ChromaDB successfully initialized after recovery")
 
+                    except BaseException as retry_error:
+                        # Re-raise system exceptions
+                        if isinstance(
+                            retry_error, KeyboardInterrupt | SystemExit | GeneratorExit
+                        ):
+                            raise
 
+                        logger.error(
+                            f"Failed to recover from database corruption: {retry_error}"
+                        )
+                        # Mark recovery as attempted to prevent infinite loops
+                        self._recovery_attempted = True
+                        raise DatabaseError(
+                            f"Failed to recover from database corruption. "
+                            f"Please run 'mcp-vector-search reset index' to clear the database. "
+                            f"Error: {retry_error}"
+                        ) from retry_error
+                else:
+                    # Not a Rust panic, re-raise original exception
+                    raise
+
+        except (DatabaseError, DatabaseInitializationError):
+            # Re-raise our own errors without re-processing
+            raise
         except Exception as e:
-            # Check if this is a corruption error
+            # Check if this is a corruption error (legacy detection for backward compatibility)
             error_msg = str(e).lower()
+            corruption_indicators = [
+                "pickle",
+                "unpickling",
+                "eof",
+                "ran out of input",
+                "hnsw",
+                "index",
+                "deserialize",
+                "corrupt",
+                "file is not a database",  # SQLite corruption
+                "database error",  # ChromaDB database errors
+            ]
+
+            if any(indicator in error_msg for indicator in corruption_indicators):
+                # Prevent infinite recursion - only attempt recovery once
+                if self._recovery_attempted:
+                    logger.error(
+                        f"Recovery already attempted but corruption persists: {e}"
+                    )
+                    raise DatabaseInitializationError(
+                        f"Failed to recover from database corruption. "
+                        f"Please run 'mcp-vector-search reset index' to clear and rebuild the database. Error: {e}"
+                    ) from e
+
                 logger.warning(f"Detected index corruption: {e}")
+                self._recovery_attempted = True
+
                 # Try to recover
                 await self._recover_from_corruption()
+
+                # Retry initialization ONE TIME
                 await self.initialize()
             else:
                 logger.error(f"Failed to initialize ChromaDB: {e}")
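The panic handling above is plain substring matching on the exception text, retried once before giving up. A standalone sketch of the same predicate; the pattern list is copied from the hunk, while the helper name is invented for illustration:

def looks_like_rust_panic(exc: BaseException) -> bool:
    # Mirrors the rust_panic_patterns check used during initialization.
    msg = str(exc).lower()
    patterns = (
        "range start index",
        "out of range",
        "panic",
        "thread panicked",
        "slice of length",
        "index out of bounds",
    )
    return any(p in msg for p in patterns)
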
@@ -245,8 +344,16 @@ class ChromaVectorDatabase(VectorDatabase):
         self._collection = None
         logger.debug("ChromaDB connections closed")
 
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database with optional structural metrics.
+
+        Args:
+            chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
+                Example: {"chunk_id_1": {"cognitive_complexity": 5, ...}, ...}
+        """
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
 
@@ -259,11 +366,27 @@ class ChromaVectorDatabase(VectorDatabase):
         ids = []
 
         for chunk in chunks:
+            # Debug: Check first chunk content
+            if len(documents) == 0:
+                import sys
+
+                has_meta = "Language:" in chunk.content and "File:" in chunk.content
+                print("\n[DATABASE] First chunk content check:", file=sys.stderr)
+                print(f" Type: {chunk.chunk_type}", file=sys.stderr)
+                print(f" File: {chunk.file_path.name}", file=sys.stderr)
+                print(
+                    f" Has metadata IN chunk.content: {has_meta}", file=sys.stderr
+                )
+                print(
+                    f" Last 100 chars: {repr(chunk.content[-100:])}",
+                    file=sys.stderr,
+                )
 
+            # Store original content directly in documents (no metadata appended)
+            # The embedding will be created from the original content
+            documents.append(chunk.content)
+
+            # Create metadata (searchable fields as metadata, not appended to content)
             metadata = {
                 "file_path": str(chunk.file_path),
                 "start_line": chunk.start_line,
@@ -288,6 +411,12 @@ class ChromaVectorDatabase(VectorDatabase):
                 "subproject_name": chunk.subproject_name or "",
                 "subproject_path": chunk.subproject_path or "",
             }
+
+            # Merge structural metrics if provided
+            if metrics and chunk.chunk_id and chunk.chunk_id in metrics:
+                chunk_metrics = metrics[chunk.chunk_id]
+                metadata.update(chunk_metrics)
+
             metadatas.append(metadata)
 
             # Use chunk ID
@@ -347,6 +476,34 @@ class ChromaVectorDatabase(VectorDatabase):
                 similarity = max(0.0, 1.0 / (1.0 + distance))
 
                 if similarity >= similarity_threshold:
+                    # Document contains the original content (no metadata appended)
+                    # Parse code smells from JSON if present
+                    code_smells = []
+                    if "code_smells" in metadata:
+                        try:
+                            code_smells = json.loads(metadata["code_smells"])
+                        except (json.JSONDecodeError, TypeError):
+                            code_smells = []
+
+                    # Calculate quality score from metrics (0-100 scale)
+                    quality_score = None
+                    if (
+                        "cognitive_complexity" in metadata
+                        and "smell_count" in metadata
+                    ):
+                        # Simple quality score: penalize complexity and smells
+                        complexity = metadata["cognitive_complexity"]
+                        smells = metadata["smell_count"]
+
+                        # Start with 100, penalize for complexity and smells
+                        score = 100
+                        # Complexity penalty: -2 points per complexity unit
+                        score -= min(50, complexity * 2)
+                        # Smell penalty: -10 points per smell
+                        score -= min(30, smells * 10)
+
+                        quality_score = max(0, score)
+
                     result = SearchResult(
                         content=doc,
                         file_path=Path(metadata["file_path"]),
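Both penalties are capped (50 points for complexity, 30 for smells), so the computed score never drops below 20 and the final max(0, ...) acts only as a safety net. A worked example of the same arithmetic, wrapped in a throwaway helper for clarity:

def quality_score(cognitive_complexity: int, smell_count: int) -> int:
    # Same formula as the hunk above: 100 minus capped penalties.
    score = 100
    score -= min(50, cognitive_complexity * 2)  # -2 per complexity unit, capped at -50
    score -= min(30, smell_count * 10)          # -10 per smell, capped at -30
    return max(0, score)

assert quality_score(0, 0) == 100
assert quality_score(10, 1) == 70   # 100 - 20 - 10
assert quality_score(40, 5) == 20   # both penalties hit their caps
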
@@ -358,6 +515,16 @@ class ChromaVectorDatabase(VectorDatabase):
                         chunk_type=metadata.get("chunk_type", "code"),
                         function_name=metadata.get("function_name") or None,
                         class_name=metadata.get("class_name") or None,
+                        # Quality metrics from structural analysis
+                        cognitive_complexity=metadata.get("cognitive_complexity"),
+                        cyclomatic_complexity=metadata.get("cyclomatic_complexity"),
+                        max_nesting_depth=metadata.get("max_nesting_depth"),
+                        parameter_count=metadata.get("parameter_count"),
+                        lines_of_code=metadata.get("lines_of_code"),
+                        complexity_grade=metadata.get("complexity_grade"),
+                        code_smells=code_smells,
+                        smell_count=metadata.get("smell_count"),
+                        quality_score=quality_score,
                     )
                     search_results.append(result)
 
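With these fields populated, callers can rank or filter hits on structural quality as well as similarity. A hedged usage sketch; the exact search() signature is not shown in this diff, only the SearchResult fields above are taken from it:

results = await database.search("parse configuration file")
risky = [r for r in results if r.quality_score is not None and r.quality_score < 50]
for r in risky:
    print(f"{r.file_path}: grade={r.complexity_grade}, smells={r.smell_count}")
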
@@ -507,6 +674,7 @@ class ChromaVectorDatabase(VectorDatabase):
         if results and results.get("ids"):
             for i, _chunk_id in enumerate(results["ids"]):
                 metadata = results["metadatas"][i]
+                # Document now contains the original content (no metadata appended)
                 content = results["documents"][i]
 
                 # Parse JSON strings back to lists/dicts
@@ -560,6 +728,9 @@ class ChromaVectorDatabase(VectorDatabase):
 
     def _create_searchable_text(self, chunk: CodeChunk) -> str:
         """Create optimized searchable text from code chunk."""
+        import sys
+
+        print("WARNING: _create_searchable_text IS BEING CALLED!", file=sys.stderr)
         parts = [chunk.content]
 
         # Add contextual information
@@ -579,7 +750,24 @@ class ChromaVectorDatabase(VectorDatabase):
         return "\n".join(parts)
 
     def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any]:
-        """Build ChromaDB where clause from filters."""
+        """Build ChromaDB where clause from filters.
+
+        Supports filtering by:
+        - language, file_path, chunk_type (standard fields)
+        - complexity_grade (A, B, C, D, F)
+        - smell_count (0, >0)
+        - cognitive_complexity (range queries using $and)
+
+        Args:
+            filters: Dictionary of filter criteria
+
+        Returns:
+            ChromaDB where clause
+        """
+        # If filters already contain ChromaDB operators ($and, $or), pass through
+        if "$and" in filters or "$or" in filters:
+            return filters
+
         where = {}
 
         for key, value in filters.items():
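Callers can therefore mix flat equality filters, list membership, "!" negation, per-key operator dicts, and pre-built $and/$or clauses. Illustrative inputs and the clauses this method would return, shown as bare calls for brevity (it is an instance method on the class above):

_build_where_clause({"language": "python", "complexity_grade": ["A", "B"]})
# -> {"language": "python", "complexity_grade": {"$in": ["A", "B"]}}

_build_where_clause({"chunk_type": "!test"})
# -> {"chunk_type": {"$ne": "test"}}

_build_where_clause({"cognitive_complexity": {"$gte": 10}})
# -> {"cognitive_complexity": {"$gte": 10}}  (operator dicts pass through per key)

_build_where_clause({"$and": [{"smell_count": {"$gt": 0}}, {"language": "python"}]})
# -> returned unchanged ($and/$or clauses bypass translation)
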
@@ -587,46 +775,140 @@ class ChromaVectorDatabase(VectorDatabase):
                 where[key] = {"$in": value}
             elif isinstance(value, str) and value.startswith("!"):
                 where[key] = {"$ne": value[1:]}
+            elif isinstance(value, dict):
+                # Support operator queries like {"$gte": 10}
+                where[key] = value
             else:
                 where[key] = value
 
         return where
 
     async def _detect_and_recover_corruption(self) -> None:
-        """Detect and recover from index corruption proactively.
+        """Detect and recover from index corruption proactively.
+
+        This method checks for:
+        1. SQLite database corruption (LAYER 1: Pre-initialization check)
+        2. HNSW pickle file corruption
+        3. Metadata/data inconsistencies
+        4. File size anomalies
+        """
+        # LAYER 1: Check SQLite database integrity FIRST (before ChromaDB initialization)
         chroma_db_path = self.persist_directory / "chroma.sqlite3"
 
         # If database doesn't exist yet, nothing to check
         if not chroma_db_path.exists():
             return
 
+        # SQLite integrity check - catches corruption BEFORE Rust panic
+        try:
+            import sqlite3
+
+            logger.debug("Running SQLite integrity check...")
+            conn = sqlite3.connect(str(chroma_db_path))
+            cursor = conn.execute("PRAGMA quick_check")
+            result = cursor.fetchone()[0]
+            conn.close()
+
+            if result != "ok":
+                logger.warning(f"SQLite database corruption detected: {result}")
+                logger.info("Initiating automatic recovery from database corruption...")
+                await self._recover_from_corruption()
+                return
+
+            logger.debug("SQLite integrity check passed")
+
+        except sqlite3.Error as e:
+            logger.warning(f"SQLite database error during integrity check: {e}")
+            logger.info("Initiating automatic recovery from database corruption...")
+            await self._recover_from_corruption()
+            return
+
         # Check for HNSW index files that might be corrupted
-        self.persist_directory / "chroma-collections.parquet"
         index_path = self.persist_directory / "index"
 
         if index_path.exists():
-            # Look for pickle files in the index
+            # Look for pickle files in the index (HNSW metadata)
             pickle_files = list(index_path.glob("**/*.pkl"))
             pickle_files.extend(list(index_path.glob("**/*.pickle")))
+            pickle_files.extend(list(index_path.glob("**/*.bin")))  # Binary HNSW files
+
+            logger.debug(
+                f"Checking {len(pickle_files)} HNSW index files for corruption..."
+            )
 
             for pickle_file in pickle_files:
                 try:
+                    # Check file size - suspiciously small files might be corrupted
+                    file_size = pickle_file.stat().st_size
+                    if file_size == 0:
+                        logger.warning(
+                            f"Empty HNSW index file detected: {pickle_file} (0 bytes)"
+                        )
+                        await self._recover_from_corruption()
+                        return
+
+                    # Only validate pickle files (not binary .bin files)
+                    if pickle_file.suffix in (".pkl", ".pickle"):
+                        # Try to read the pickle file to detect corruption
+                        import pickle  # nosec B403 # Trusted internal index files only
+
+                        with open(pickle_file, "rb") as f:
+                            data = pickle.load(f)  # nosec B301 # Trusted internal index files only
+
+                        # Additional validation: check if data structure is valid
+                        if data is None:
+                            logger.warning(
+                                f"HNSW index file contains None data: {pickle_file}"
+                            )
+                            await self._recover_from_corruption()
+                            return
+
+                        # Check for metadata consistency (if it's a dict)
+                        if isinstance(data, dict):
+                            # Look for known metadata keys that should exist
+                            if "space" in data and "dim" in data:
+                                # Validate dimensions are reasonable
+                                if data.get("dim", 0) <= 0:
+                                    logger.warning(
+                                        f"Invalid dimensions in HNSW index: {pickle_file} (dim={data.get('dim')})"
+                                    )
+                                    await self._recover_from_corruption()
+                                    return
+
+                except (EOFError, pickle.UnpicklingError) as e:
+                    logger.warning(f"Pickle corruption detected in {pickle_file}: {e}")
                     await self._recover_from_corruption()
                     return
+                except Exception as e:
+                    # Check if this is a Rust panic pattern
+                    error_msg = str(e).lower()
+                    if "range start index" in error_msg and "out of range" in error_msg:
+                        logger.warning(
+                            f"Rust panic pattern detected in {pickle_file}: {e}"
+                        )
+                        await self._recover_from_corruption()
+                        return
+                    else:
+                        logger.warning(
+                            f"Error reading HNSW index file {pickle_file}: {e}"
+                        )
+                        # Continue checking other files before deciding to recover
+                        continue
+
+        logger.debug("HNSW index files validation passed")
 
     async def _recover_from_corruption(self) -> None:
-        """Recover from index corruption by rebuilding the index.
+        """Recover from index corruption by rebuilding the index.
+
+        This method:
+        1. Creates a timestamped backup of the corrupted index
+        2. Clears the corrupted index directory
+        3. Recreates the directory structure
+        4. Logs detailed recovery steps and instructions
+        """
+        logger.warning("=" * 80)
+        logger.warning("INDEX CORRUPTION DETECTED - Initiating recovery...")
+        logger.warning("=" * 80)
 
         # Create backup directory
         backup_dir = (
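LAYER 1 is just SQLite's built-in quick_check pragma run against chroma.sqlite3 before ChromaDB ever opens it. A self-contained sketch of the same probe; the function name and return convention are illustrative, not part of the package:

import sqlite3
from pathlib import Path

def sqlite_is_healthy(db_path: Path) -> bool:
    # Returns True when PRAGMA quick_check reports "ok"; a missing file counts as healthy.
    if not db_path.exists():
        return True
    try:
        conn = sqlite3.connect(str(db_path))
        try:
            return conn.execute("PRAGMA quick_check").fetchone()[0] == "ok"
        finally:
            conn.close()
    except sqlite3.Error:
        return False

# e.g. sqlite_is_healthy(persist_directory / "chroma.sqlite3")
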
@@ -634,7 +916,7 @@ class ChromaVectorDatabase(VectorDatabase):
         )
         backup_dir.mkdir(exist_ok=True)
 
-        # Backup current state (in case we need it)
+        # Backup current state (in case we need it for debugging)
         import time
 
         timestamp = int(time.time())
@@ -643,24 +925,41 @@ class ChromaVectorDatabase(VectorDatabase):
         if self.persist_directory.exists():
             try:
                 shutil.copytree(self.persist_directory, backup_path)
-                logger.info(f"Created backup at {backup_path}")
+                logger.info(f"✓ Created backup at {backup_path}")
             except Exception as e:
-                logger.warning(f"Could not create backup: {e}")
+                logger.warning(f"⚠ Could not create backup: {e}")
 
         # Clear the corrupted index
         if self.persist_directory.exists():
             try:
+                # Log what we're about to delete
+                total_size = sum(
+                    f.stat().st_size
+                    for f in self.persist_directory.rglob("*")
+                    if f.is_file()
+                )
+                logger.info(
+                    f"Clearing corrupted index ({total_size / 1024 / 1024:.2f} MB)..."
+                )
+
                 shutil.rmtree(self.persist_directory)
-                logger.info(f"Cleared corrupted index at {self.persist_directory}")
+                logger.info(f"✓ Cleared corrupted index at {self.persist_directory}")
             except Exception as e:
-                logger.error(f"Failed to clear corrupted index: {e}")
+                logger.error(f"✗ Failed to clear corrupted index: {e}")
                 raise IndexCorruptionError(
-                    f"Could not clear corrupted index: {e}"
+                    f"Could not clear corrupted index: {e}. "
+                    f"Please manually delete {self.persist_directory} and try again."
                 ) from e
 
         # Recreate the directory
         self.persist_directory.mkdir(parents=True, exist_ok=True)
-        logger.info("Index directory recreated")
+        logger.info("✓ Index directory recreated")
+
+        logger.warning("=" * 80)
+        logger.warning("RECOVERY COMPLETE - Next steps:")
+        logger.warning(" 1. Run 'mcp-vector-search index' to rebuild the index")
+        logger.warning(f" 2. Backup saved to: {backup_path}")
+        logger.warning("=" * 80)
 
     async def health_check(self) -> bool:
         """Check database health and integrity.
@@ -762,8 +1061,15 @@ class PooledChromaVectorDatabase(VectorDatabase):
         await self._pool.close()
         logger.debug("Pooled ChromaDB connections closed")
 
-    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
+    async def add_chunks(
+        self, chunks: list[CodeChunk], metrics: dict[str, Any] | None = None
+    ) -> None:
+        """Add code chunks to the database using pooled connection with optional metrics.
+
+        Args:
+            chunks: List of code chunks to add
+            metrics: Optional dict mapping chunk IDs to ChunkMetrics.to_metadata() dicts
+        """
         if not chunks:
             return
 
@@ -779,35 +1085,40 @@ class PooledChromaVectorDatabase(VectorDatabase):
         ids = []
 
         for chunk in chunks:
+            # Store original content in documents (no metadata appended)
             documents.append(chunk.content)
+
+            metadata = {
+                "file_path": str(chunk.file_path),
+                "start_line": chunk.start_line,
+                "end_line": chunk.end_line,
+                "language": chunk.language,
+                "chunk_type": chunk.chunk_type,
+                "function_name": chunk.function_name or "",
+                "class_name": chunk.class_name or "",
+                "docstring": chunk.docstring or "",
+                "complexity_score": chunk.complexity_score,
+                # Hierarchy fields (convert lists to JSON strings for ChromaDB)
+                "chunk_id": chunk.chunk_id or "",
+                "parent_chunk_id": chunk.parent_chunk_id or "",
+                "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
+                "chunk_depth": chunk.chunk_depth,
+                # Additional metadata (convert lists/dicts to JSON strings)
+                "decorators": json.dumps(chunk.decorators or []),
+                "parameters": json.dumps(chunk.parameters or []),
+                "return_type": chunk.return_type or "",
+                "type_annotations": json.dumps(chunk.type_annotations or {}),
+                # Monorepo support
+                "subproject_name": chunk.subproject_name or "",
+                "subproject_path": chunk.subproject_path or "",
+            }
+
+            # Merge structural metrics if provided
+            if metrics and chunk.chunk_id and chunk.chunk_id in metrics:
+                chunk_metrics = metrics[chunk.chunk_id]
+                metadata.update(chunk_metrics)
+
+            metadatas.append(metadata)
             ids.append(chunk.id)
 
         # Add to collection
@@ -862,6 +1173,7 @@ class PooledChromaVectorDatabase(VectorDatabase):
                 similarity = max(0.0, 1.0 / (1.0 + distance))
 
                 if similarity >= similarity_threshold:
+                    # Document contains the original content (no metadata appended)
                     result = SearchResult(
                         content=doc,
                         file_path=Path(metadata["file_path"]),