mcp-vector-search 0.15.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +10 -0
- mcp_vector_search/cli/__init__.py +1 -0
- mcp_vector_search/cli/commands/__init__.py +1 -0
- mcp_vector_search/cli/commands/auto_index.py +397 -0
- mcp_vector_search/cli/commands/chat.py +534 -0
- mcp_vector_search/cli/commands/config.py +393 -0
- mcp_vector_search/cli/commands/demo.py +358 -0
- mcp_vector_search/cli/commands/index.py +762 -0
- mcp_vector_search/cli/commands/init.py +658 -0
- mcp_vector_search/cli/commands/install.py +869 -0
- mcp_vector_search/cli/commands/install_old.py +700 -0
- mcp_vector_search/cli/commands/mcp.py +1254 -0
- mcp_vector_search/cli/commands/reset.py +393 -0
- mcp_vector_search/cli/commands/search.py +796 -0
- mcp_vector_search/cli/commands/setup.py +1133 -0
- mcp_vector_search/cli/commands/status.py +584 -0
- mcp_vector_search/cli/commands/uninstall.py +404 -0
- mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
- mcp_vector_search/cli/commands/visualize/cli.py +265 -0
- mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
- mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
- mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
- mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
- mcp_vector_search/cli/commands/visualize/server.py +201 -0
- mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
- mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
- mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
- mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
- mcp_vector_search/cli/commands/visualize.py.original +2536 -0
- mcp_vector_search/cli/commands/watch.py +287 -0
- mcp_vector_search/cli/didyoumean.py +520 -0
- mcp_vector_search/cli/export.py +320 -0
- mcp_vector_search/cli/history.py +295 -0
- mcp_vector_search/cli/interactive.py +342 -0
- mcp_vector_search/cli/main.py +484 -0
- mcp_vector_search/cli/output.py +414 -0
- mcp_vector_search/cli/suggestions.py +375 -0
- mcp_vector_search/config/__init__.py +1 -0
- mcp_vector_search/config/constants.py +24 -0
- mcp_vector_search/config/defaults.py +200 -0
- mcp_vector_search/config/settings.py +146 -0
- mcp_vector_search/core/__init__.py +1 -0
- mcp_vector_search/core/auto_indexer.py +298 -0
- mcp_vector_search/core/config_utils.py +394 -0
- mcp_vector_search/core/connection_pool.py +360 -0
- mcp_vector_search/core/database.py +1237 -0
- mcp_vector_search/core/directory_index.py +318 -0
- mcp_vector_search/core/embeddings.py +294 -0
- mcp_vector_search/core/exceptions.py +89 -0
- mcp_vector_search/core/factory.py +318 -0
- mcp_vector_search/core/git_hooks.py +345 -0
- mcp_vector_search/core/indexer.py +1002 -0
- mcp_vector_search/core/llm_client.py +453 -0
- mcp_vector_search/core/models.py +294 -0
- mcp_vector_search/core/project.py +350 -0
- mcp_vector_search/core/scheduler.py +330 -0
- mcp_vector_search/core/search.py +952 -0
- mcp_vector_search/core/watcher.py +322 -0
- mcp_vector_search/mcp/__init__.py +5 -0
- mcp_vector_search/mcp/__main__.py +25 -0
- mcp_vector_search/mcp/server.py +752 -0
- mcp_vector_search/parsers/__init__.py +8 -0
- mcp_vector_search/parsers/base.py +296 -0
- mcp_vector_search/parsers/dart.py +605 -0
- mcp_vector_search/parsers/html.py +413 -0
- mcp_vector_search/parsers/javascript.py +643 -0
- mcp_vector_search/parsers/php.py +694 -0
- mcp_vector_search/parsers/python.py +502 -0
- mcp_vector_search/parsers/registry.py +223 -0
- mcp_vector_search/parsers/ruby.py +678 -0
- mcp_vector_search/parsers/text.py +186 -0
- mcp_vector_search/parsers/utils.py +265 -0
- mcp_vector_search/py.typed +1 -0
- mcp_vector_search/utils/__init__.py +42 -0
- mcp_vector_search/utils/gitignore.py +250 -0
- mcp_vector_search/utils/gitignore_updater.py +212 -0
- mcp_vector_search/utils/monorepo.py +339 -0
- mcp_vector_search/utils/timing.py +338 -0
- mcp_vector_search/utils/version.py +47 -0
- mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
- mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
- mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
- mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
- mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
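The largest single addition is mcp_vector_search/core/database.py (+1237 lines), reproduced in full below. For orientation, here is a minimal, hypothetical usage sketch of that database layer. It is not part of the wheel: the import paths are taken from the file list above, the method signatures from the diff below, and the CodeChunk keyword arguments are assumed to match the field names used inside database.py (its models module may require more fields than shown).

# Hypothetical caller code -- a sketch only, not shipped in the package.
import asyncio
from pathlib import Path

from mcp_vector_search.core.database import ChromaVectorDatabase
from mcp_vector_search.core.models import CodeChunk


def toy_embeddings(texts: list[str]) -> list[list[float]]:
    """Toy stand-in satisfying the EmbeddingFunction protocol (list[str] -> vectors)."""
    return [[float(len(t)), float(t.count("def")), float(t.count("class"))] for t in texts]


async def main() -> None:
    db = ChromaVectorDatabase(
        persist_directory=Path(".index"),  # assumed location; any writable directory
        embedding_function=toy_embeddings,
        collection_name="code_search",
    )
    async with db:  # calls initialize() on enter and close() on exit
        chunk = CodeChunk(  # field names as used inside database.py; other fields may be required
            content="def hello():\n    return 'world'",
            file_path=Path("example.py"),
            start_line=1,
            end_line=2,
            language="python",
            chunk_type="function",
            function_name="hello",
        )
        await db.add_chunks([chunk])
        for r in await db.search("greeting helper", limit=5, similarity_threshold=0.3):
            print(r.file_path, r.start_line, round(r.similarity_score, 3))


asyncio.run(main())

In practice the package's own mcp_vector_search/core/embeddings.py presumably supplies the production embedding function; anything callable with the EmbeddingFunction signature shown in the diff should slot in the same way.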
mcp_vector_search/core/database.py
@@ -0,0 +1,1237 @@
"""Database abstraction and ChromaDB implementation for MCP Vector Search."""

import asyncio
import json
import shutil
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Protocol, runtime_checkable

from loguru import logger

from .connection_pool import ChromaConnectionPool
from .exceptions import (
    DatabaseError,
    DatabaseInitializationError,
    DatabaseNotInitializedError,
    DocumentAdditionError,
    IndexCorruptionError,
    SearchError,
)
from .models import CodeChunk, IndexStats, SearchResult


@runtime_checkable
class EmbeddingFunction(Protocol):
    """Protocol for embedding functions."""

    def __call__(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for input texts."""
        ...


class VectorDatabase(ABC):
    """Abstract interface for vector database operations."""

    @abstractmethod
    async def initialize(self) -> None:
        """Initialize the database connection and collections."""
        ...

    @abstractmethod
    async def close(self) -> None:
        """Close database connections and cleanup resources."""
        ...

    @abstractmethod
    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
        """Add code chunks to the database.

        Args:
            chunks: List of code chunks to add
        """
        ...

    @abstractmethod
    async def search(
        self,
        query: str,
        limit: int = 10,
        filters: dict[str, Any] | None = None,
        similarity_threshold: float = 0.7,
    ) -> list[SearchResult]:
        """Search for similar code chunks.

        Args:
            query: Search query
            limit: Maximum number of results
            filters: Optional filters to apply
            similarity_threshold: Minimum similarity score

        Returns:
            List of search results
        """
        ...

    @abstractmethod
    async def delete_by_file(self, file_path: Path) -> int:
        """Delete all chunks for a specific file.

        Args:
            file_path: Path to the file

        Returns:
            Number of deleted chunks
        """
        ...

    @abstractmethod
    async def get_stats(self) -> IndexStats:
        """Get database statistics.

        Returns:
            Index statistics
        """
        ...

    @abstractmethod
    async def reset(self) -> None:
        """Reset the database (delete all data)."""
        ...

    @abstractmethod
    async def get_all_chunks(self) -> list[CodeChunk]:
        """Get all chunks from the database.

        Returns:
            List of all code chunks with metadata
        """
        ...

    @abstractmethod
    async def health_check(self) -> bool:
        """Check database health and integrity.

        Returns:
            True if database is healthy, False otherwise
        """
        ...

    async def __aenter__(self) -> "VectorDatabase":
        """Async context manager entry."""
        await self.initialize()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context manager exit."""
        await self.close()


class ChromaVectorDatabase(VectorDatabase):
    """ChromaDB implementation of vector database."""

    def __init__(
        self,
        persist_directory: Path,
        embedding_function: EmbeddingFunction,
        collection_name: str = "code_search",
    ) -> None:
        """Initialize ChromaDB vector database.

        Args:
            persist_directory: Directory to persist database
            embedding_function: Function to generate embeddings
            collection_name: Name of the collection
        """
        self.persist_directory = persist_directory
        self.embedding_function = embedding_function
        self.collection_name = collection_name
        self._client = None
        self._collection = None

    async def initialize(self) -> None:
        """Initialize ChromaDB client and collection with corruption recovery."""
        try:
            import chromadb

            # Ensure directory exists
            self.persist_directory.mkdir(parents=True, exist_ok=True)

            # Check for corruption before initializing
            await self._detect_and_recover_corruption()

            # Create client with new API
            self._client = chromadb.PersistentClient(
                path=str(self.persist_directory),
                settings=chromadb.Settings(
                    anonymized_telemetry=False,
                    allow_reset=True,
                ),
            )

            # Create or get collection
            self._collection = self._client.get_or_create_collection(
                name=self.collection_name,
                embedding_function=self.embedding_function,
                metadata={
                    "description": "Semantic code search collection",
                },
            )

            logger.debug(f"ChromaDB initialized at {self.persist_directory}")

        except Exception as e:
            # Check if this is a corruption error
            error_msg = str(e).lower()
            if any(
                indicator in error_msg
                for indicator in [
                    "pickle",
                    "unpickling",
                    "eof",
                    "ran out of input",
                    "hnsw",
                    "index",
                    "deserialize",
                    "corrupt",
                ]
            ):
                logger.warning(f"Detected index corruption: {e}")
                # Try to recover
                await self._recover_from_corruption()
                # Retry initialization
                await self.initialize()
            else:
                logger.error(f"Failed to initialize ChromaDB: {e}")
                raise DatabaseInitializationError(
                    f"ChromaDB initialization failed: {e}"
                ) from e

    async def remove_file_chunks(self, file_path: str) -> int:
        """Remove all chunks for a specific file.

        Args:
            file_path: Relative path to the file

        Returns:
            Number of chunks removed
        """
        if not self._collection:
            raise DatabaseNotInitializedError("Database not initialized")

        try:
            # Get all chunks for this file
            results = self._collection.get(where={"file_path": file_path})

            if not results["ids"]:
                return 0

            # Delete the chunks
            self._collection.delete(ids=results["ids"])

            removed_count = len(results["ids"])
            logger.debug(f"Removed {removed_count} chunks for file: {file_path}")
            return removed_count

        except Exception as e:
            logger.error(f"Failed to remove chunks for file {file_path}: {e}")
            return 0

    async def close(self) -> None:
        """Close database connections."""
        if self._client:
            # ChromaDB doesn't require explicit closing
            self._client = None
            self._collection = None
            logger.debug("ChromaDB connections closed")

    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
        """Add code chunks to the database."""
        if not self._collection:
            raise DatabaseNotInitializedError("Database not initialized")

        if not chunks:
            return

        try:
            documents = []
            metadatas = []
            ids = []

            for chunk in chunks:
                # Debug: Check first chunk content
                if len(documents) == 0:
                    import sys

                    has_meta = "Language:" in chunk.content and "File:" in chunk.content
                    print("\n[DATABASE] First chunk content check:", file=sys.stderr)
                    print(f"  Type: {chunk.chunk_type}", file=sys.stderr)
                    print(f"  File: {chunk.file_path.name}", file=sys.stderr)
                    print(
                        f"  Has metadata IN chunk.content: {has_meta}", file=sys.stderr
                    )
                    print(
                        f"  Last 100 chars: {repr(chunk.content[-100:])}",
                        file=sys.stderr,
                    )

                # Store original content directly in documents (no metadata appended)
                # The embedding will be created from the original content
                documents.append(chunk.content)

                # Create metadata (searchable fields as metadata, not appended to content)
                metadata = {
                    "file_path": str(chunk.file_path),
                    "start_line": chunk.start_line,
                    "end_line": chunk.end_line,
                    "language": chunk.language,
                    "chunk_type": chunk.chunk_type,
                    "function_name": chunk.function_name or "",
                    "class_name": chunk.class_name or "",
                    "docstring": chunk.docstring or "",
                    "complexity_score": chunk.complexity_score,
                    # Hierarchy fields (convert lists to JSON strings for ChromaDB)
                    "chunk_id": chunk.chunk_id or "",
                    "parent_chunk_id": chunk.parent_chunk_id or "",
                    "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
                    "chunk_depth": chunk.chunk_depth,
                    # Additional metadata (convert lists/dicts to JSON strings)
                    "decorators": json.dumps(chunk.decorators or []),
                    "parameters": json.dumps(chunk.parameters or []),
                    "return_type": chunk.return_type or "",
                    "type_annotations": json.dumps(chunk.type_annotations or {}),
                    # Monorepo support
                    "subproject_name": chunk.subproject_name or "",
                    "subproject_path": chunk.subproject_path or "",
                }
                metadatas.append(metadata)

                # Use chunk ID
                ids.append(chunk.id)

            # Add to collection
            self._collection.add(
                documents=documents,
                metadatas=metadatas,
                ids=ids,
            )

            logger.debug(f"Added {len(chunks)} chunks to database")

        except Exception as e:
            logger.error(f"Failed to add chunks: {e}")
            raise DocumentAdditionError(f"Failed to add chunks: {e}") from e

    async def search(
        self,
        query: str,
        limit: int = 10,
        filters: dict[str, Any] | None = None,
        similarity_threshold: float = 0.7,
    ) -> list[SearchResult]:
        """Search for similar code chunks."""
        if not self._collection:
            raise DatabaseNotInitializedError("Database not initialized")

        try:
            # Build where clause
            where_clause = self._build_where_clause(filters) if filters else None

            # Perform search
            results = self._collection.query(
                query_texts=[query],
                n_results=limit,
                where=where_clause,
                include=["documents", "metadatas", "distances"],
            )

            # Process results
            search_results = []

            if results["documents"] and results["documents"][0]:
                for i, (doc, metadata, distance) in enumerate(
                    zip(
                        results["documents"][0],
                        results["metadatas"][0],
                        results["distances"][0],
                        strict=False,
                    )
                ):
                    # Convert distance to similarity (ChromaDB uses cosine distance)
                    # For cosine distance, use a more permissive conversion that handles distances > 1.0
                    # Convert to a 0-1 similarity score where lower distances = higher similarity
                    similarity = max(0.0, 1.0 / (1.0 + distance))

                    if similarity >= similarity_threshold:
                        # Document contains the original content (no metadata appended)
                        result = SearchResult(
                            content=doc,
                            file_path=Path(metadata["file_path"]),
                            start_line=metadata["start_line"],
                            end_line=metadata["end_line"],
                            language=metadata["language"],
                            similarity_score=similarity,
                            rank=i + 1,
                            chunk_type=metadata.get("chunk_type", "code"),
                            function_name=metadata.get("function_name") or None,
                            class_name=metadata.get("class_name") or None,
                        )
                        search_results.append(result)

            logger.debug(f"Found {len(search_results)} results for query: {query}")
            return search_results

        except Exception as e:
            logger.error(f"Search failed: {e}")
            raise SearchError(f"Search failed: {e}") from e

    async def delete_by_file(self, file_path: Path) -> int:
        """Delete all chunks for a specific file."""
        if not self._collection:
            raise DatabaseNotInitializedError("Database not initialized")

        try:
            # Get all chunks for this file
            results = self._collection.get(
                where={"file_path": str(file_path)},
                include=["metadatas"],
            )

            if results["ids"]:
                self._collection.delete(ids=results["ids"])
                count = len(results["ids"])
                logger.debug(f"Deleted {count} chunks for {file_path}")
                return count

            return 0

        except Exception as e:
            logger.error(f"Failed to delete chunks for {file_path}: {e}")
            raise DatabaseError(f"Failed to delete chunks: {e}") from e

    async def get_stats(self) -> IndexStats:
        """Get database statistics with optimized chunked queries."""
        if not self._collection:
            raise DatabaseNotInitializedError("Database not initialized")

        try:
            # Get total count (fast operation)
            count = self._collection.count()

            if count == 0:
                return IndexStats(
                    total_files=0,
                    total_chunks=0,
                    languages={},
                    file_types={},
                    index_size_mb=0.0,
                    last_updated="N/A",
                    embedding_model="unknown",
                )

            # Process in chunks to avoid loading everything at once
            batch_size_limit = 1000

            files = set()
            language_counts: dict[str, int] = {}
            file_type_counts: dict[str, int] = {}

            offset = 0
            while offset < count:
                # Fetch batch
                batch_size = min(batch_size_limit, count - offset)
                logger.debug(
                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
                    f"{offset}-{offset + batch_size} of {count} chunks"
                )

                results = self._collection.get(
                    include=["metadatas"],
                    limit=batch_size,
                    offset=offset,
                )

                # Process batch metadata
                for metadata in results.get("metadatas", []):
                    # Language stats
                    lang = metadata.get("language", "unknown")
                    language_counts[lang] = language_counts.get(lang, 0) + 1

                    # File stats
                    file_path = metadata.get("file_path", "")
                    if file_path:
                        files.add(file_path)
                        ext = Path(file_path).suffix or "no_extension"
                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1

                offset += batch_size

                # Yield to event loop periodically to prevent blocking
                await asyncio.sleep(0)

            # Estimate index size (rough approximation: ~1KB per chunk)
            index_size_mb = count * 0.001

            return IndexStats(
                total_files=len(files),
                total_chunks=count,
                languages=language_counts,
                file_types=file_type_counts,
                index_size_mb=index_size_mb,
                last_updated="unknown",
                embedding_model="unknown",
            )

        except Exception as e:
            logger.error(f"Failed to get database statistics: {e}")
            # Return empty stats instead of raising
            return IndexStats(
                total_files=0,
                total_chunks=0,
                languages={},
                file_types={},
                index_size_mb=0.0,
                last_updated="error",
                embedding_model="unknown",
            )

    async def reset(self) -> None:
        """Reset the database."""
        if self._client:
            try:
                self._client.reset()
                # Recreate collection
                await self.initialize()
                logger.info("Database reset successfully")
            except Exception as e:
                logger.error(f"Failed to reset database: {e}")
                raise DatabaseError(f"Failed to reset database: {e}") from e

    async def get_all_chunks(self) -> list[CodeChunk]:
        """Get all chunks from the database.

        Returns:
            List of all code chunks with metadata
        """
        if not self._collection:
            raise DatabaseNotInitializedError("Database not initialized")

        try:
            # Get all documents from collection
            results = self._collection.get(include=["metadatas", "documents"])

            chunks = []
            if results and results.get("ids"):
                for i, _chunk_id in enumerate(results["ids"]):
                    metadata = results["metadatas"][i]
                    # Document now contains the original content (no metadata appended)
                    content = results["documents"][i]

                    # Parse JSON strings back to lists/dicts
                    child_chunk_ids = metadata.get("child_chunk_ids", "[]")
                    if isinstance(child_chunk_ids, str):
                        child_chunk_ids = json.loads(child_chunk_ids)

                    decorators = metadata.get("decorators", "[]")
                    if isinstance(decorators, str):
                        decorators = json.loads(decorators)

                    parameters = metadata.get("parameters", "[]")
                    if isinstance(parameters, str):
                        parameters = json.loads(parameters)

                    type_annotations = metadata.get("type_annotations", "{}")
                    if isinstance(type_annotations, str):
                        type_annotations = json.loads(type_annotations)

                    chunk = CodeChunk(
                        content=content,
                        file_path=Path(metadata["file_path"]),
                        start_line=metadata["start_line"],
                        end_line=metadata["end_line"],
                        language=metadata["language"],
                        chunk_type=metadata.get("chunk_type", "code"),
                        function_name=metadata.get("function_name"),
                        class_name=metadata.get("class_name"),
                        docstring=metadata.get("docstring"),
                        imports=metadata.get("imports", []),
                        complexity_score=metadata.get("complexity_score", 0.0),
                        chunk_id=metadata.get("chunk_id"),
                        parent_chunk_id=metadata.get("parent_chunk_id"),
                        child_chunk_ids=child_chunk_ids,
                        chunk_depth=metadata.get("chunk_depth", 0),
                        decorators=decorators,
                        parameters=parameters,
                        return_type=metadata.get("return_type"),
                        type_annotations=type_annotations,
                        subproject_name=metadata.get("subproject_name"),
                        subproject_path=metadata.get("subproject_path"),
                    )
                    chunks.append(chunk)

            logger.debug(f"Retrieved {len(chunks)} chunks from database")
            return chunks

        except Exception as e:
            logger.error(f"Failed to get all chunks: {e}")
            raise DatabaseError(f"Failed to get all chunks: {e}") from e

    def _create_searchable_text(self, chunk: CodeChunk) -> str:
        """Create optimized searchable text from code chunk."""
        import sys

        print("WARNING: _create_searchable_text IS BEING CALLED!", file=sys.stderr)
        parts = [chunk.content]

        # Add contextual information
        if chunk.function_name:
            parts.append(f"Function: {chunk.function_name}")

        if chunk.class_name:
            parts.append(f"Class: {chunk.class_name}")

        if chunk.docstring:
            parts.append(f"Documentation: {chunk.docstring}")

        # Add language and file context
        parts.append(f"Language: {chunk.language}")
        parts.append(f"File: {chunk.file_path.name}")

        return "\n".join(parts)

    def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any]:
        """Build ChromaDB where clause from filters."""
        where = {}

        for key, value in filters.items():
            if isinstance(value, list):
                where[key] = {"$in": value}
            elif isinstance(value, str) and value.startswith("!"):
                where[key] = {"$ne": value[1:]}
            else:
                where[key] = value

        return where

    async def _detect_and_recover_corruption(self) -> None:
        """Detect and recover from index corruption proactively."""
        # Check for common corruption indicators in ChromaDB files
        chroma_db_path = self.persist_directory / "chroma.sqlite3"

        # If database doesn't exist yet, nothing to check
        if not chroma_db_path.exists():
            return

        # Check for HNSW index files that might be corrupted
        self.persist_directory / "chroma-collections.parquet"
        index_path = self.persist_directory / "index"

        if index_path.exists():
            # Look for pickle files in the index
            pickle_files = list(index_path.glob("**/*.pkl"))
            pickle_files.extend(list(index_path.glob("**/*.pickle")))

            for pickle_file in pickle_files:
                try:
                    # Try to read the pickle file to detect corruption
                    import pickle  # nosec B403 # Trusted internal index files only

                    with open(pickle_file, "rb") as f:
                        pickle.load(f)  # nosec B301 # Trusted internal index files only
                except (EOFError, pickle.UnpicklingError, Exception) as e:
                    logger.warning(
                        f"Corrupted index file detected: {pickle_file} - {e}"
                    )
                    await self._recover_from_corruption()
                    return

    async def _recover_from_corruption(self) -> None:
        """Recover from index corruption by rebuilding the index."""
        logger.info("Attempting to recover from index corruption...")

        # Create backup directory
        backup_dir = (
            self.persist_directory.parent / f"{self.persist_directory.name}_backup"
        )
        backup_dir.mkdir(exist_ok=True)

        # Backup current state (in case we need it)
        import time

        timestamp = int(time.time())
        backup_path = backup_dir / f"backup_{timestamp}"

        if self.persist_directory.exists():
            try:
                shutil.copytree(self.persist_directory, backup_path)
                logger.info(f"Created backup at {backup_path}")
            except Exception as e:
                logger.warning(f"Could not create backup: {e}")

        # Clear the corrupted index
        if self.persist_directory.exists():
            try:
                shutil.rmtree(self.persist_directory)
                logger.info(f"Cleared corrupted index at {self.persist_directory}")
            except Exception as e:
                logger.error(f"Failed to clear corrupted index: {e}")
                raise IndexCorruptionError(
                    f"Could not clear corrupted index: {e}"
                ) from e

        # Recreate the directory
        self.persist_directory.mkdir(parents=True, exist_ok=True)
        logger.info("Index directory recreated. Please re-index your codebase.")

    async def health_check(self) -> bool:
        """Check database health and integrity.

        Returns:
            True if database is healthy, False otherwise
        """
        try:
            # First check if client is initialized
            if not self._client or not self._collection:
                logger.warning("Database not initialized")
                return False

            # Try a simple operation to test the connection
            try:
                # Attempt to get count - this will fail if index is corrupted
                count = self._collection.count()
                logger.debug(f"Health check passed: {count} chunks in database")

                # Try a minimal query to ensure search works
                self._collection.query(
                    query_texts=["test"], n_results=1, include=["metadatas"]
                )

                return True

            except Exception as e:
                error_msg = str(e).lower()
                if any(
                    indicator in error_msg
                    for indicator in [
                        "pickle",
                        "unpickling",
                        "eof",
                        "ran out of input",
                        "hnsw",
                        "index",
                        "deserialize",
                        "corrupt",
                    ]
                ):
                    logger.error(f"Index corruption detected during health check: {e}")
                    return False
                else:
                    # Some other error
                    logger.warning(f"Health check failed: {e}")
                    return False

        except Exception as e:
            logger.error(f"Health check error: {e}")
            return False


class PooledChromaVectorDatabase(VectorDatabase):
    """ChromaDB implementation with connection pooling for improved performance."""

    def __init__(
        self,
        persist_directory: Path,
        embedding_function: EmbeddingFunction,
        collection_name: str = "code_search",
        max_connections: int = 10,
        min_connections: int = 2,
        max_idle_time: float = 300.0,
        max_connection_age: float = 3600.0,
    ) -> None:
        """Initialize pooled ChromaDB vector database.

        Args:
            persist_directory: Directory to persist database
            embedding_function: Function to generate embeddings
            collection_name: Name of the collection
            max_connections: Maximum number of connections in pool
            min_connections: Minimum number of connections to maintain
            max_idle_time: Maximum time a connection can be idle (seconds)
            max_connection_age: Maximum age of a connection (seconds)
        """
        self.persist_directory = persist_directory
        self.embedding_function = embedding_function
        self.collection_name = collection_name

        self._pool = ChromaConnectionPool(
            persist_directory=persist_directory,
            embedding_function=embedding_function,
            collection_name=collection_name,
            max_connections=max_connections,
            min_connections=min_connections,
            max_idle_time=max_idle_time,
            max_connection_age=max_connection_age,
        )

    async def initialize(self) -> None:
        """Initialize the connection pool."""
        await self._pool.initialize()
        logger.debug(f"Pooled ChromaDB initialized at {self.persist_directory}")

    async def close(self) -> None:
        """Close the connection pool."""
        await self._pool.close()
        logger.debug("Pooled ChromaDB connections closed")

    async def add_chunks(self, chunks: list[CodeChunk]) -> None:
        """Add code chunks to the database using pooled connection."""
        if not chunks:
            return

        # Ensure pool is initialized
        if not self._pool._initialized:
            await self._pool.initialize()

        try:
            async with self._pool.get_connection() as conn:
                # Prepare data for ChromaDB
                documents = []
                metadatas = []
                ids = []

                for chunk in chunks:
                    # Store original content in documents (no metadata appended)
                    documents.append(chunk.content)
                    metadatas.append(
                        {
                            "file_path": str(chunk.file_path),
                            "start_line": chunk.start_line,
                            "end_line": chunk.end_line,
                            "language": chunk.language,
                            "chunk_type": chunk.chunk_type,
                            "function_name": chunk.function_name or "",
                            "class_name": chunk.class_name or "",
                            "docstring": chunk.docstring or "",
                            "complexity_score": chunk.complexity_score,
                            # Hierarchy fields (convert lists to JSON strings for ChromaDB)
                            "chunk_id": chunk.chunk_id or "",
                            "parent_chunk_id": chunk.parent_chunk_id or "",
                            "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
                            "chunk_depth": chunk.chunk_depth,
                            # Additional metadata (convert lists/dicts to JSON strings)
                            "decorators": json.dumps(chunk.decorators or []),
                            "parameters": json.dumps(chunk.parameters or []),
                            "return_type": chunk.return_type or "",
                            "type_annotations": json.dumps(
                                chunk.type_annotations or {}
                            ),
                            # Monorepo support
                            "subproject_name": chunk.subproject_name or "",
                            "subproject_path": chunk.subproject_path or "",
                        }
                    )
                    ids.append(chunk.id)

                # Add to collection
                conn.collection.add(documents=documents, metadatas=metadatas, ids=ids)

                logger.debug(f"Added {len(chunks)} chunks to database")

        except Exception as e:
            logger.error(f"Failed to add chunks: {e}")
            raise DocumentAdditionError(f"Failed to add chunks: {e}") from e

    async def search(
        self,
        query: str,
        limit: int = 10,
        filters: dict[str, Any] | None = None,
        similarity_threshold: float = 0.7,
    ) -> list[SearchResult]:
        """Search for similar code chunks using pooled connection."""
        # Ensure pool is initialized
        if not self._pool._initialized:
            await self._pool.initialize()

        try:
            async with self._pool.get_connection() as conn:
                # Build where clause
                where_clause = self._build_where_clause(filters) if filters else None

                # Perform search
                results = conn.collection.query(
                    query_texts=[query],
                    n_results=limit,
                    where=where_clause,
                    include=["documents", "metadatas", "distances"],
                )

                # Process results
                search_results = []

                if results["documents"] and results["documents"][0]:
                    for i, (doc, metadata, distance) in enumerate(
                        zip(
                            results["documents"][0],
                            results["metadatas"][0],
                            results["distances"][0],
                            strict=False,
                        )
                    ):
                        # Convert distance to similarity (ChromaDB uses cosine distance)
                        # For cosine distance, use a more permissive conversion that handles distances > 1.0
                        # Convert to a 0-1 similarity score where lower distances = higher similarity
                        similarity = max(0.0, 1.0 / (1.0 + distance))

                        if similarity >= similarity_threshold:
                            # Document contains the original content (no metadata appended)
                            result = SearchResult(
                                content=doc,
                                file_path=Path(metadata["file_path"]),
                                start_line=metadata["start_line"],
                                end_line=metadata["end_line"],
                                language=metadata["language"],
                                similarity_score=similarity,
                                rank=i + 1,
                                chunk_type=metadata.get("chunk_type", "code"),
                                function_name=metadata.get("function_name") or None,
                                class_name=metadata.get("class_name") or None,
                            )
                            search_results.append(result)

                logger.debug(f"Found {len(search_results)} results for query: {query}")
                return search_results

        except Exception as e:
            logger.error(f"Search failed: {e}")
            raise SearchError(f"Search failed: {e}") from e

    async def delete_by_file(self, file_path: Path) -> int:
        """Delete all chunks for a specific file using pooled connection."""
        try:
            async with self._pool.get_connection() as conn:
                # Get all chunks for this file
                results = conn.collection.get(
                    where={"file_path": str(file_path)}, include=["metadatas"]
                )

                if not results["ids"]:
                    return 0

                # Delete the chunks
                conn.collection.delete(ids=results["ids"])

                deleted_count = len(results["ids"])
                logger.debug(f"Deleted {deleted_count} chunks for file: {file_path}")
                return deleted_count

        except Exception as e:
            logger.error(f"Failed to delete chunks for file {file_path}: {e}")
            raise DatabaseError(f"Failed to delete chunks: {e}") from e

    async def get_stats(self) -> IndexStats:
        """Get database statistics with connection pooling and chunked queries."""
        try:
            async with self._pool.get_connection() as conn:
                # Get total count (fast operation)
                count = conn.collection.count()

                if count == 0:
                    return IndexStats(
                        total_files=0,
                        total_chunks=0,
                        languages={},
                        file_types={},
                        index_size_mb=0.0,
                        last_updated="N/A",
                        embedding_model="unknown",
                    )

                # Process in chunks to avoid loading everything at once
                batch_size_limit = 1000

                files = set()
                language_counts: dict[str, int] = {}
                file_type_counts: dict[str, int] = {}

                offset = 0
                while offset < count:
                    # Fetch batch
                    batch_size = min(batch_size_limit, count - offset)
                    logger.debug(
                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
                        f"{offset}-{offset + batch_size} of {count} chunks"
                    )

                    results = conn.collection.get(
                        include=["metadatas"],
                        limit=batch_size,
                        offset=offset,
                    )

                    # Process batch metadata
                    for metadata in results.get("metadatas", []):
                        # Language stats
                        lang = metadata.get("language", "unknown")
                        language_counts[lang] = language_counts.get(lang, 0) + 1

                        # File stats
                        file_path = metadata.get("file_path", "")
                        if file_path:
                            files.add(file_path)
                            ext = Path(file_path).suffix or "no_extension"
                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1

                    offset += batch_size

                    # Yield to event loop periodically to prevent blocking
                    await asyncio.sleep(0)

                # Estimate index size (rough approximation: ~1KB per chunk)
                index_size_mb = count * 0.001

                return IndexStats(
                    total_files=len(files),
                    total_chunks=count,
                    languages=language_counts,
                    file_types=file_type_counts,
                    index_size_mb=index_size_mb,
                    last_updated="unknown",
                    embedding_model="unknown",
                )

        except Exception as e:
            logger.error(f"Failed to get database statistics: {e}")
            # Return empty stats instead of raising
            return IndexStats(
                total_files=0,
                total_chunks=0,
                languages={},
                file_types={},
                index_size_mb=0.0,
                last_updated="error",
                embedding_model="unknown",
            )

    async def remove_file_chunks(self, file_path: str) -> int:
        """Remove all chunks for a specific file using pooled connection."""
        try:
            async with self._pool.get_connection() as conn:
                # Get all chunks for this file
                results = conn.collection.get(where={"file_path": file_path})

                if not results["ids"]:
                    return 0

                # Delete the chunks
                conn.collection.delete(ids=results["ids"])

                return len(results["ids"])

        except Exception as e:
            logger.error(f"Failed to remove chunks for file {file_path}: {e}")
            return 0

    async def reset(self) -> None:
        """Reset the database using pooled connection."""
        try:
            async with self._pool.get_connection() as conn:
                conn.client.reset()
            # Reinitialize the pool after reset
            await self._pool.close()
            await self._pool.initialize()
            logger.info("Database reset successfully")
        except Exception as e:
            logger.error(f"Failed to reset database: {e}")
            raise DatabaseError(f"Failed to reset database: {e}") from e

    async def get_all_chunks(self) -> list[CodeChunk]:
        """Get all chunks from the database using pooled connection.

        Returns:
            List of all code chunks with metadata
        """
        try:
            async with self._pool.get_connection() as conn:
                # Get all documents from collection
                results = conn.collection.get(include=["metadatas", "documents"])

                chunks = []
                if results and results.get("ids"):
                    for i, _chunk_id in enumerate(results["ids"]):
                        metadata = results["metadatas"][i]
                        content = results["documents"][i]

                        # Parse JSON strings back to lists/dicts
                        child_chunk_ids = metadata.get("child_chunk_ids", "[]")
                        if isinstance(child_chunk_ids, str):
                            child_chunk_ids = json.loads(child_chunk_ids)

                        decorators = metadata.get("decorators", "[]")
                        if isinstance(decorators, str):
                            decorators = json.loads(decorators)

                        parameters = metadata.get("parameters", "[]")
                        if isinstance(parameters, str):
                            parameters = json.loads(parameters)

                        type_annotations = metadata.get("type_annotations", "{}")
                        if isinstance(type_annotations, str):
                            type_annotations = json.loads(type_annotations)

                        chunk = CodeChunk(
                            content=content,
                            file_path=Path(metadata["file_path"]),
                            start_line=metadata["start_line"],
                            end_line=metadata["end_line"],
                            language=metadata["language"],
                            chunk_type=metadata.get("chunk_type", "code"),
                            function_name=metadata.get("function_name"),
                            class_name=metadata.get("class_name"),
                            docstring=metadata.get("docstring"),
                            imports=metadata.get("imports", []),
                            complexity_score=metadata.get("complexity_score", 0.0),
                            chunk_id=metadata.get("chunk_id"),
                            parent_chunk_id=metadata.get("parent_chunk_id"),
                            child_chunk_ids=child_chunk_ids,
                            chunk_depth=metadata.get("chunk_depth", 0),
                            decorators=decorators,
                            parameters=parameters,
                            return_type=metadata.get("return_type"),
                            type_annotations=type_annotations,
                            subproject_name=metadata.get("subproject_name"),
                            subproject_path=metadata.get("subproject_path"),
                        )
                        chunks.append(chunk)

                logger.debug(f"Retrieved {len(chunks)} chunks from database")
                return chunks

        except Exception as e:
            logger.error(f"Failed to get all chunks: {e}")
            raise DatabaseError(f"Failed to get all chunks: {e}") from e

    def _build_where_clause(self, filters: dict[str, Any]) -> dict[str, Any] | None:
        """Build ChromaDB where clause from filters."""
        if not filters:
            return None

        conditions = []

        for key, value in filters.items():
            if key == "language" and value:
                conditions.append({"language": {"$eq": value}})
            elif key == "file_path" and value:
                if isinstance(value, list):
                    conditions.append({"file_path": {"$in": [str(p) for p in value]}})
                else:
                    conditions.append({"file_path": {"$eq": str(value)}})
            elif key == "chunk_type" and value:
                conditions.append({"chunk_type": {"$eq": value}})

        if not conditions:
            return None
        elif len(conditions) > 1:
            return {"$and": conditions}
        else:
            return conditions[0]

    def get_pool_stats(self) -> dict[str, Any]:
        """Get connection pool statistics."""
        return self._pool.get_stats()

    async def health_check(self) -> bool:
        """Perform a health check on the database and connection pool."""
        try:
            # Check pool health
            pool_healthy = await self._pool.health_check()
            if not pool_healthy:
                return False

            # Try a simple query to verify database integrity
            try:
                async with self._pool.get_connection() as conn:
                    # Test basic operations
                    conn.collection.count()
                    conn.collection.query(
                        query_texts=["test"], n_results=1, include=["metadatas"]
                    )
                    return True
            except Exception as e:
                error_msg = str(e).lower()
                if any(
                    indicator in error_msg
                    for indicator in [
                        "pickle",
                        "unpickling",
                        "eof",
                        "ran out of input",
                        "hnsw",
                        "index",
                        "deserialize",
                        "corrupt",
                    ]
                ):
                    logger.error(f"Index corruption detected: {e}")
                    # Attempt recovery
                    await self._recover_from_corruption()
                    return False
                else:
                    logger.warning(f"Health check failed: {e}")
                    return False
        except Exception as e:
            logger.error(f"Health check error: {e}")
            return False

    async def _recover_from_corruption(self) -> None:
        """Recover from index corruption by rebuilding the index."""
        logger.info("Attempting to recover from index corruption...")

        # Close the pool first
        await self._pool.close()

        # Create backup directory
        backup_dir = (
            self.persist_directory.parent / f"{self.persist_directory.name}_backup"
        )
        backup_dir.mkdir(exist_ok=True)

        # Backup current state
        import time

        timestamp = int(time.time())
        backup_path = backup_dir / f"backup_{timestamp}"

        if self.persist_directory.exists():
            try:
                shutil.copytree(self.persist_directory, backup_path)
                logger.info(f"Created backup at {backup_path}")
            except Exception as e:
                logger.warning(f"Could not create backup: {e}")

        # Clear the corrupted index
        if self.persist_directory.exists():
            try:
                shutil.rmtree(self.persist_directory)
                logger.info(f"Cleared corrupted index at {self.persist_directory}")
            except Exception as e:
                logger.error(f"Failed to clear corrupted index: {e}")
                raise IndexCorruptionError(
                    f"Could not clear corrupted index: {e}"
                ) from e

        # Recreate the directory
        self.persist_directory.mkdir(parents=True, exist_ok=True)

        # Reinitialize the pool
        await self._pool.initialize()
        logger.info("Index recovered. Please re-index your codebase.")

    async def __aenter__(self):
        """Async context manager entry."""
        await self.initialize()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.close()
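One detail worth calling out from both search() implementations above: ChromaDB returns a distance per hit, and the code maps it to a similarity with 1 / (1 + distance), clamped at zero. A few worked values (my arithmetic, not part of the package):

# Illustration of the distance-to-similarity mapping used by search() above.
for distance in (0.0, 0.5, 1.0, 3.0):
    similarity = max(0.0, 1.0 / (1.0 + distance))
    print(f"distance={distance:.1f} -> similarity={similarity:.3f}")
# distance=0.0 -> similarity=1.000
# distance=0.5 -> similarity=0.667
# distance=1.0 -> similarity=0.500
# distance=3.0 -> similarity=0.250

With the default similarity_threshold of 0.7, only hits with a distance of at most 3/7 (about 0.43) pass the filter; callers wanting more recall can pass a lower threshold.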