mcp-code-indexer 4.2.14 → 4.2.16 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_code_indexer/database/database.py +251 -85
- mcp_code_indexer/database/models.py +66 -24
- mcp_code_indexer/database/retry_executor.py +15 -5
- mcp_code_indexer/file_scanner.py +107 -12
- mcp_code_indexer/main.py +75 -23
- mcp_code_indexer/server/mcp_server.py +191 -1
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
- mcp_code_indexer/vector_mode/config.py +113 -45
- mcp_code_indexer/vector_mode/const.py +24 -0
- mcp_code_indexer/vector_mode/daemon.py +860 -98
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
- mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
- mcp_code_indexer/vector_mode/services/__init__.py +9 -0
- mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
- mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
- mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
- mcp_code_indexer/vector_mode/types.py +46 -0
- mcp_code_indexer/vector_mode/utils.py +50 -0
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
```diff
--- a/mcp_code_indexer/database/database.py
+++ b/mcp_code_indexer/database/database.py
@@ -27,9 +27,11 @@ from mcp_code_indexer.database.exceptions import (
 )
 from mcp_code_indexer.database.models import (
     FileDescription,
+    IndexMeta,
     Project,
     ProjectOverview,
     SearchResult,
+    SyncStatus,
     WordFrequencyResult,
     WordFrequencyTerm,
 )
```
```diff
@@ -315,12 +317,10 @@ class DatabaseManager:
         self, operation_name: str = "write_operation"
     ) -> AsyncIterator[aiosqlite.Connection]:
         """
-        Get a database connection with write serialization
-        retry logic.
+        Get a database connection with write serialization.
 
-
-
-        generator errors.
+        Ensures the write lock is held throughout the duration of the context
+        to prevent race conditions and database locking errors.
 
         Args:
             operation_name: Name of the operation for logging and
```
```diff
@@ -331,43 +331,10 @@ class DatabaseManager:
                 "DatabaseManager not initialized - call initialize() first"
             )
 
-
-
-
-
-        async with self._write_lock:
-            async with self.get_connection() as conn:
-                return conn
-
-        try:
-            # Use retry executor to handle connection acquisition with retries
-            connection = await self._retry_executor.execute_with_retry(
-                get_write_connection, operation_name
-            )
-
-            try:
-                yield connection
-
-                # Success - retry executor handles all failure tracking
-
-            except Exception:
-                # Error handling is managed by the retry executor
-                raise
-
-        except DatabaseError:
-            # Re-raise our custom database errors as-is
-            raise
-        except Exception as e:
-            # Classify and wrap other exceptions
-            classified_error = classify_sqlite_error(e, operation_name)
-            logger.error(
-                (
-                    f"Database operation '{operation_name}' failed: "
-                    f"{classified_error.message}"
-                ),
-                extra={"structured_data": classified_error.to_dict()},
-            )
-            raise classified_error
+        # Acquire lock for exclusive write access - hold it for entire context
+        async with self._write_lock:
+            async with self.get_connection() as conn:
+                yield conn
 
     def get_database_stats(self) -> Dict[str, Any]:
         """
```
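The old implementation acquired the write lock inside a retried helper and released it before the connection was handed back to the caller; the new version holds the lock for the whole `async with` block. A minimal sketch of that pattern, using hypothetical names (`WriteSerializer`, `write_connection`) rather than the package's actual classes:

```python
# Minimal sketch, not the package's implementation: an async context manager
# that keeps the write lock held for the entire time the caller uses the
# connection, mirroring the simplified get_write_connection_with_retry.
import asyncio
from contextlib import asynccontextmanager

import aiosqlite  # third-party dependency, as used by mcp-code-indexer


class WriteSerializer:
    def __init__(self, db_path: str) -> None:
        self.db_path = db_path
        self._write_lock = asyncio.Lock()  # one writer at a time

    @asynccontextmanager
    async def write_connection(self):
        # The lock is acquired before the connection is handed out and released
        # only after the caller's block exits, so no other writer can interleave.
        async with self._write_lock:
            async with aiosqlite.connect(self.db_path) as conn:
                yield conn


async def main() -> None:
    db = WriteSerializer(":memory:")
    async with db.write_connection() as conn:
        await conn.execute("CREATE TABLE IF NOT EXISTS t (x INTEGER)")
        await conn.commit()


asyncio.run(main())
```

Because the lock wraps the yield, a second writer cannot obtain a connection until the first caller's block has finished, which is what the removed retry/classification scaffolding was compensating for.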
```diff
@@ -740,6 +707,25 @@ class DatabaseManager:
             await db.commit()
             logger.debug(f"Updated project: {project.id}")
 
+    async def set_project_vector_mode(self, project_id: str, enabled: bool) -> None:
+        """Set the vector_mode for a specific project."""
+        async with self.get_write_connection_with_retry(
+            "set_project_vector_mode"
+        ) as db:
+            await db.execute(
+                "UPDATE projects SET vector_mode = ? WHERE id = ?",
+                (int(enabled), project_id),
+            )
+
+            # Check if the project was actually updated
+            cursor = await db.execute("SELECT changes()")
+            changes = await cursor.fetchone()
+            if changes[0] == 0:
+                raise ValueError(f"Project not found: {project_id}")
+
+            await db.commit()
+            logger.debug(f"Set vector_mode={enabled} for project: {project_id}")
+
     async def get_all_projects(self) -> List[Project]:
         """Get all projects in the database."""
         async with self.get_connection() as db:
```
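The new `set_project_vector_mode` verifies that the UPDATE matched a row by reading SQLite's `changes()` function on the same connection. A standalone sketch of that check using the stdlib `sqlite3` module (the schema and project ids here are illustrative, not the package's real ones):

```python
# Sketch of the "UPDATE then SELECT changes()" verification pattern.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE projects (id TEXT PRIMARY KEY, vector_mode INTEGER DEFAULT 0)"
)
conn.execute("INSERT INTO projects (id) VALUES ('proj-1')")


def set_vector_mode(project_id: str, enabled: bool) -> None:
    conn.execute(
        "UPDATE projects SET vector_mode = ? WHERE id = ?",
        (int(enabled), project_id),
    )
    # changes() reports the number of rows modified by the most recent
    # statement on this connection; 0 means the project id did not match.
    (changed,) = conn.execute("SELECT changes()").fetchone()
    if changed == 0:
        raise ValueError(f"Project not found: {project_id}")
    conn.commit()


set_vector_mode("proj-1", True)       # succeeds
try:
    set_vector_mode("missing", True)  # raises ValueError
except ValueError as exc:
    print(exc)
```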
```diff
@@ -795,12 +781,19 @@ class DatabaseManager:
         ) as db:
             await db.execute(
                 """
-                INSERT
+                INSERT INTO file_descriptions
                 (
                     project_id, file_path, description, file_hash, last_modified,
                     version, source_project_id, to_be_cleaned
                 )
                 VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                ON CONFLICT(project_id, file_path) DO UPDATE SET
+                    description=excluded.description,
+                    file_hash=excluded.file_hash,
+                    last_modified=excluded.last_modified,
+                    version=excluded.version,
+                    source_project_id=excluded.source_project_id,
+                    to_be_cleaned=excluded.to_be_cleaned
                 """,
                 (
                     file_desc.project_id,
```
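Both description-write paths switch from a plain INSERT to an upsert keyed on `(project_id, file_path)`. A small sketch (illustrative schema, not the package's actual table) of what `ON CONFLICT ... DO UPDATE` buys over replacing the row: columns the statement does not SET keep their existing values:

```python
# Sketch contrasting the upsert with a blind row replacement.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    """
    CREATE TABLE file_descriptions (
        project_id TEXT,
        file_path  TEXT,
        description TEXT,
        version INTEGER DEFAULT 1,
        PRIMARY KEY (project_id, file_path)
    )
    """
)
conn.execute(
    "INSERT INTO file_descriptions (project_id, file_path, description, version) "
    "VALUES ('p1', 'src/app.py', 'initial description', 3)"
)

# ON CONFLICT ... DO UPDATE modifies the existing row in place: only the
# columns named after SET change, everything else is preserved.
conn.execute(
    """
    INSERT INTO file_descriptions (project_id, file_path, description)
    VALUES ('p1', 'src/app.py', 'updated description')
    ON CONFLICT(project_id, file_path) DO UPDATE SET
        description = excluded.description
    """
)
row = conn.execute(
    "SELECT description, version FROM file_descriptions WHERE file_path = 'src/app.py'"
).fetchone()
print(row)  # ('updated description', 3) -- version survived the upsert
```

A blind `INSERT OR REPLACE` would instead delete and re-insert the row, resetting any column not supplied in the statement.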
```diff
@@ -898,12 +891,19 @@ class DatabaseManager:
 
             await conn.executemany(
                 """
-                INSERT
+                INSERT INTO file_descriptions
                 (
                     project_id, file_path, description, file_hash, last_modified,
                     version, source_project_id, to_be_cleaned
                 )
                 VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                ON CONFLICT(project_id, file_path) DO UPDATE SET
+                    description=excluded.description,
+                    file_hash=excluded.file_hash,
+                    last_modified=excluded.last_modified,
+                    version=excluded.version,
+                    source_project_id=excluded.source_project_id,
+                    to_be_cleaned=excluded.to_be_cleaned
                 """,
                 data,
             )
```
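The batch variant drives the same upsert through `executemany`, one parameter tuple per file. A self-contained sketch, again with an illustrative schema; a conflicting tuple later in the batch simply updates the row written earlier:

```python
# Sketch of the batch path: executemany applies the upsert to every tuple.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE file_descriptions ("
    " project_id TEXT, file_path TEXT, description TEXT,"
    " PRIMARY KEY (project_id, file_path))"
)
batch = [
    ("p1", "src/app.py", "app entry point"),
    ("p1", "src/util.py", "shared helpers"),
    ("p1", "src/app.py", "app entry point (revised)"),  # conflicts with row 1
]
conn.executemany(
    """
    INSERT INTO file_descriptions (project_id, file_path, description)
    VALUES (?, ?, ?)
    ON CONFLICT(project_id, file_path) DO UPDATE SET
        description = excluded.description
    """,
    batch,
)
conn.commit()
print(conn.execute("SELECT file_path, description FROM file_descriptions").fetchall())
```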
```diff
@@ -1067,10 +1067,8 @@ class DatabaseManager:
         Returns:
             List of file paths that were marked for cleanup
         """
-
-
-        async def cleanup_operation(conn: aiosqlite.Connection) -> List[str]:
-            # Get all active file descriptions for this project
+        # 1. Get all active file paths (fast DB read)
+        async with self.get_connection() as conn:
             cursor = await conn.execute(
                 (
                     "SELECT file_path FROM file_descriptions WHERE "
```
```diff
@@ -1078,46 +1076,29 @@ class DatabaseManager:
                 ),
                 (project_id,),
             )
-
             rows = await cursor.fetchall()
+            file_paths = [row["file_path"] for row in rows]
 
-
-
-
-
+        # 2. Check existence on disk (blocking IO - run in executor)
+        def find_removed_files() -> List[str]:
+            missing = []
+            for file_path in file_paths:
                 full_path = project_root / file_path
-
                 if not full_path.exists():
-
-
-            # Mark descriptions for cleanup instead of deleting
-            if to_remove:
-                import time
-
-                cleanup_timestamp = int(time.time())
-                await conn.executemany(
-                    (
-                        "UPDATE file_descriptions SET to_be_cleaned = ? WHERE "
-                        "project_id = ? AND file_path = ?"
-                    ),
-                    [(cleanup_timestamp, project_id, path) for path in to_remove],
-                )
-                logger.info(
-                    (
-                        f"Marked {len(to_remove)} missing files for cleanup "
-                        f"from {project_id}"
-                    )
-                )
+                    missing.append(file_path)
+            return missing
 
-
+        loop = asyncio.get_running_loop()
+        to_remove = await loop.run_in_executor(None, find_removed_files)
 
-
-
-
-
-
+        # 3. Mark for cleanup (fast DB write)
+        if to_remove:
+            await self.cleanup_manager.mark_files_for_cleanup(project_id, to_remove)
+            logger.info(
+                f"Marked {len(to_remove)} missing files for cleanup from {project_id}"
+            )
 
-        return
+        return to_remove
 
     async def analyze_word_frequency(
         self, project_id: str, limit: int = 200
```
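The cleanup rewrite splits the work into a fast DB read, a blocking filesystem pass pushed onto the default executor, and a fast DB write. A minimal sketch of the middle step, with hypothetical names (`find_missing`, `check_missing`):

```python
# Sketch of step 2 above: checking many paths on disk is blocking IO, so it is
# dispatched to the default thread pool instead of running on the event loop.
import asyncio
from pathlib import Path
from typing import List


async def find_missing(project_root: Path, file_paths: List[str]) -> List[str]:
    def check_missing() -> List[str]:
        # Runs in a worker thread; slow disks or network filesystems do not
        # stall other coroutines while each path is probed.
        return [p for p in file_paths if not (project_root / p).exists()]

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, check_missing)


async def main() -> None:
    missing = await find_missing(Path("."), ["README.md", "does_not_exist.txt"])
    print(missing)


asyncio.run(main())
```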
```diff
@@ -1139,7 +1120,7 @@ class DatabaseManager:
         stop_words_path = (
             Path(__file__).parent.parent / "data" / "stop_words_english.txt"
         )
-        stop_words = set()
+        stop_words: set = set()
 
         if stop_words_path.exists():
             with open(stop_words_path, "r", encoding="utf-8") as f:
```
```diff
@@ -1186,8 +1167,8 @@ class DatabaseManager:
         }
         stop_words.update(programming_keywords)
 
+        # Get all descriptions for this project (fast DB read)
         async with self.get_connection() as db:
-            # Get all descriptions for this project
             cursor = await db.execute(
                 (
                     "SELECT description FROM file_descriptions WHERE "
```
```diff
@@ -1195,11 +1176,13 @@ class DatabaseManager:
                 ),
                 (project_id,),
             )
-
             rows = await cursor.fetchall()
+            descriptions = [row["description"] for row in rows]
 
+        # Process word frequency in executor (CPU-bound work)
+        def process_word_frequency() -> WordFrequencyResult:
             # Combine all descriptions
-            all_text = " ".join(
+            all_text = " ".join(descriptions)
 
             # Tokenize and filter
             words = re.findall(r"\b[a-zA-Z]{2,}\b", all_text.lower())
```
```diff
@@ -1220,6 +1203,9 @@ class DatabaseManager:
                 total_unique_terms=len(word_counts),
             )
 
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, process_word_frequency)
+
     async def cleanup_empty_projects(self) -> int:
         """
         Remove projects that have no file descriptions and no project overview.
```
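`analyze_word_frequency` gets the same treatment: the descriptions are read first, then the tokenizing and counting closure is dispatched with `run_in_executor`. A compact, self-contained sketch of that shape (the `top_terms` helper is hypothetical, not the package's API):

```python
# Sketch of the same offloading idea applied to the word count: tokenization
# and Counter.most_common run via run_in_executor, as in the hunk above.
import asyncio
import re
from collections import Counter
from typing import List, Tuple


async def top_terms(descriptions: List[str], limit: int = 5) -> List[Tuple[str, int]]:
    def count_words() -> List[Tuple[str, int]]:
        text = " ".join(descriptions).lower()
        words = re.findall(r"\b[a-zA-Z]{2,}\b", text)
        return Counter(words).most_common(limit)

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, count_words)


print(asyncio.run(top_terms(["parses config files", "loads config defaults"])))
```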
```diff
@@ -1320,6 +1306,186 @@ class DatabaseManager:
             "files": file_descriptions,
         }
 
+    # IndexMeta operations
+    async def create_index_meta(self, index_meta: IndexMeta) -> None:
+        """Create or update index metadata for a project."""
+        async with self.get_write_connection_with_retry("create_index_meta") as db:
+            await db.execute(
+                """
+                INSERT OR REPLACE INTO index_meta (
+                    project_id, total_chunks, indexed_chunks, total_files, indexed_files,
+                    last_sync, sync_status, error_message, queue_depth, processing_rate,
+                    estimated_completion, metadata, created, last_modified
+                )
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    index_meta.project_id,
+                    index_meta.total_chunks,
+                    index_meta.indexed_chunks,
+                    index_meta.total_files,
+                    index_meta.indexed_files,
+                    index_meta.last_sync,
+                    index_meta.sync_status.value,
+                    index_meta.error_message,
+                    index_meta.queue_depth,
+                    index_meta.processing_rate,
+                    index_meta.estimated_completion,
+                    json.dumps(index_meta.metadata),
+                    index_meta.created,
+                    index_meta.last_modified,
+                ),
+            )
+            await db.commit()
+            logger.debug(
+                f"Created/updated index metadata for project: {index_meta.project_id}"
+            )
+
+    async def update_index_meta(self, index_meta: IndexMeta) -> None:
+        """Update existing index metadata for a project."""
+        async with self.get_write_connection_with_retry("update_index_meta") as db:
+            await db.execute(
+                """
+                UPDATE index_meta
+                SET total_chunks = ?, indexed_chunks = ?, total_files = ?, indexed_files = ?,
+                    last_sync = ?, sync_status = ?, error_message = ?, queue_depth = ?,
+                    processing_rate = ?, estimated_completion = ?, metadata = ?, last_modified = ?
+                WHERE project_id = ?
+                """,
+                (
+                    index_meta.total_chunks,
+                    index_meta.indexed_chunks,
+                    index_meta.total_files,
+                    index_meta.indexed_files,
+                    index_meta.last_sync,
+                    index_meta.sync_status.value,
+                    index_meta.error_message,
+                    index_meta.queue_depth,
+                    index_meta.processing_rate,
+                    index_meta.estimated_completion,
+                    json.dumps(index_meta.metadata),
+                    index_meta.last_modified,
+                    index_meta.project_id,
+                ),
+            )
+
+            # Check if the project was actually updated
+            cursor = await db.execute("SELECT changes()")
+            changes = await cursor.fetchone()
+            if changes[0] == 0:
+                raise ValueError(
+                    f"Index metadata not found for project: {index_meta.project_id}"
+                )
+
+            await db.commit()
+            logger.debug(f"Updated index metadata for project: {index_meta.project_id}")
+
+    async def get_index_meta(self, project_id: str) -> Optional[IndexMeta]:
+        """Retrieve index metadata for a project."""
+        async with self.get_connection() as db:
+            cursor = await db.execute(
+                "SELECT * FROM index_meta WHERE project_id = ?", (project_id,)
+            )
+            row = await cursor.fetchone()
+
+            if row:
+                # Convert row to dict for easier field access
+                row_dict = dict(row)
+
+                # Parse JSON metadata field
+                metadata = (
+                    json.loads(row_dict["metadata"]) if row_dict["metadata"] else {}
+                )
+
+                # Parse datetime fields
+                created = (
+                    datetime.fromisoformat(row_dict["created"])
+                    if row_dict["created"]
+                    else datetime.utcnow()
+                )
+                last_modified = (
+                    datetime.fromisoformat(row_dict["last_modified"])
+                    if row_dict["last_modified"]
+                    else datetime.utcnow()
+                )
+                last_sync = (
+                    datetime.fromisoformat(row_dict["last_sync"])
+                    if row_dict["last_sync"]
+                    else None
+                )
+                estimated_completion = (
+                    datetime.fromisoformat(row_dict["estimated_completion"])
+                    if row_dict["estimated_completion"]
+                    else None
+                )
+
+                return IndexMeta(
+                    id=row_dict["id"],
+                    project_id=row_dict["project_id"],
+                    total_chunks=row_dict["total_chunks"],
+                    indexed_chunks=row_dict["indexed_chunks"],
+                    total_files=row_dict["total_files"],
+                    indexed_files=row_dict["indexed_files"],
+                    last_sync=last_sync,
+                    sync_status=row_dict["sync_status"],
+                    error_message=row_dict["error_message"],
+                    queue_depth=row_dict["queue_depth"],
+                    processing_rate=row_dict["processing_rate"],
+                    estimated_completion=estimated_completion,
+                    metadata=metadata,
+                    created=created,
+                    last_modified=last_modified,
+                )
+            return None
+
+    async def get_or_create_index_meta(self, project_id: str, **kwargs) -> IndexMeta:
+        """
+        Get existing index metadata or create new one with default values.
+
+        Args:
+            project_id: Project identifier
+            **kwargs: Optional fields to override defaults when creating new metadata
+
+        Returns:
+            IndexMeta object (existing or newly created)
+        """
+        # Try to get existing metadata first
+        existing_meta = await self.get_index_meta(project_id)
+        if existing_meta:
+            return existing_meta
+
+        # Create new metadata with defaults, allowing kwargs to override
+        default_metadata = {
+            "project_id": project_id,
+            "total_chunks": 0,
+            "indexed_chunks": 0,
+            "total_files": 0,
+            "indexed_files": 0,
+            "last_sync": None,
+            "sync_status": SyncStatus.PENDING,
+            "error_message": None,
+            "queue_depth": 0,
+            "processing_rate": 0.0,
+            "estimated_completion": None,
+            "metadata": {},
+        }
+
+        # Override defaults with provided kwargs
+        default_metadata.update(kwargs)
+
+        # Create the IndexMeta object
+        new_meta = IndexMeta(**default_metadata)
+
+        # Store it in the database
+        await self.create_index_meta(new_meta)
+
+        # Return the created metadata (fetch it back to get the assigned ID)
+        result = await self.get_index_meta(project_id)
+        if result is None:
+            raise DatabaseError(f"Failed to create index metadata for project: {project_id}")
+
+        return result
+
     # Cleanup operations
 
     @property
```
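A plausible call sequence for the new IndexMeta helpers, assuming `db` is an already-initialized `DatabaseManager`; the method names come from the hunk above, while the surrounding function is hypothetical:

```python
# Hypothetical usage sketch of the IndexMeta CRUD added in 4.2.16.
from datetime import datetime

from mcp_code_indexer.database.models import SyncStatus


async def record_sync_progress(db, project_id: str) -> None:
    # Fetch existing metadata or create a fresh row with PENDING status.
    meta = await db.get_or_create_index_meta(project_id)

    # Mutate the Pydantic model, then persist it with the UPDATE-based helper.
    meta.sync_status = SyncStatus.IN_PROGRESS
    meta.indexed_files += 1
    meta.last_modified = datetime.utcnow()
    await db.update_index_meta(meta)
```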
```diff
--- a/mcp_code_indexer/database/models.py
+++ b/mcp_code_indexer/database/models.py
@@ -32,7 +32,9 @@ class Project(BaseModel):
     last_accessed: datetime = Field(
         default_factory=datetime.utcnow, description="Last access timestamp"
     )
-    vector_mode: bool = Field(
+    vector_mode: bool = Field(
+        default=False, description="Enable vector search for this project"
+    )
 
 
 class FileDescription(BaseModel):
```
```diff
@@ -189,10 +191,12 @@ class WordFrequencyResult(BaseModel):
 
 # Vector Mode Models
 
+
 class ChunkType(str, Enum):
     """Types of code chunks for semantic analysis."""
+
     FUNCTION = "function"
-    CLASS = "class"
+    CLASS = "class"
     METHOD = "method"
     IMPORT = "import"
     DOCSTRING = "docstring"
```
```diff
@@ -204,27 +208,32 @@ class ChunkType(str, Enum):
     NAMESPACE = "namespace"
     GENERIC = "generic"
 
+
 class NodeType(str, Enum):
     """Types of nodes in Merkle tree."""
+
     FILE = "file"
     DIRECTORY = "directory"
     PROJECT = "project"
 
+
 class SyncStatus(str, Enum):
     """Vector index synchronization status."""
+
     PENDING = "pending"
     IN_PROGRESS = "in_progress"
     COMPLETED = "completed"
     FAILED = "failed"
     PAUSED = "paused"
 
+
 class CodeChunk(BaseModel):
     """
     Represents a semantic chunk of code extracted from a file.
-
+
     Used for embedding generation and vector search operations.
     """
-
+
     id: Optional[int] = Field(None, description="Database ID")
     file_id: int = Field(..., description="Reference to FileDescription")
     project_id: str = Field(..., description="Reference to project")
```
```diff
@@ -235,17 +244,24 @@ class CodeChunk(BaseModel):
     content_hash: str = Field(..., description="SHA-256 hash of chunk content")
     embedding_id: Optional[str] = Field(None, description="Vector database ID")
     redacted: bool = Field(default=False, description="Whether content was redacted")
-    metadata: Dict[str, Any] = Field(
-
-
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict, description="Additional metadata"
+    )
+    created: datetime = Field(
+        default_factory=datetime.utcnow, description="Creation timestamp"
+    )
+    last_modified: datetime = Field(
+        default_factory=datetime.utcnow, description="Last update timestamp"
+    )
+
 
 class MerkleNode(BaseModel):
     """
     Represents a node in the Merkle tree for change detection.
-
+
     Used to efficiently detect file system changes without scanning entire directory trees.
     """
-
+
     id: Optional[int] = Field(None, description="Database ID")
     project_id: str = Field(..., description="Reference to project")
     path: str = Field(..., description="File/directory path relative to project root")
```
```diff
@@ -253,36 +269,56 @@ class MerkleNode(BaseModel):
     node_type: NodeType = Field(..., description="Type of filesystem node")
     parent_path: Optional[str] = Field(None, description="Path to parent directory")
     children_hash: Optional[str] = Field(None, description="Combined hash of children")
-    last_modified: datetime = Field(
+    last_modified: datetime = Field(
+        default_factory=datetime.utcnow, description="Last update timestamp"
+    )
+
 
 class IndexMeta(BaseModel):
     """
     Metadata about vector indexing progress and status for a project.
-
+
     Tracks indexing state, statistics, and synchronization status.
     """
-
+
     id: Optional[int] = Field(None, description="Database ID")
     project_id: str = Field(..., description="Reference to project", unique=True)
     total_chunks: int = Field(default=0, description="Total number of chunks")
-    indexed_chunks: int = Field(
+    indexed_chunks: int = Field(
+        default=0, description="Number of chunks with embeddings"
+    )
     total_files: int = Field(default=0, description="Total number of files")
     indexed_files: int = Field(default=0, description="Number of files processed")
-    last_sync: Optional[datetime] = Field(
-
+    last_sync: Optional[datetime] = Field(
+        None, description="Last successful sync timestamp"
+    )
+    sync_status: SyncStatus = Field(
+        default=SyncStatus.PENDING, description="Current sync status"
+    )
     error_message: Optional[str] = Field(None, description="Last error message")
     queue_depth: int = Field(default=0, description="Number of pending tasks")
-    processing_rate: float = Field(
-
-
-
-
+    processing_rate: float = Field(
+        default=0.0, description="Files per second processing rate"
+    )
+    estimated_completion: Optional[datetime] = Field(
+        None, description="Estimated completion time"
+    )
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict, description="Additional metadata"
+    )
+    created: datetime = Field(
+        default_factory=datetime.utcnow, description="Creation timestamp"
+    )
+    last_modified: datetime = Field(
+        default_factory=datetime.utcnow, description="Last update timestamp"
+    )
+
 
 class VectorSearchResult(BaseModel):
     """
     Represents a vector search result with similarity scoring.
     """
-
+
     file_path: str = Field(..., description="Path to the matching file")
     chunk_name: Optional[str] = Field(None, description="Name of the code chunk")
     chunk_type: ChunkType = Field(..., description="Type of code chunk")
```
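With the defaults now spelled out on each `Field`, an `IndexMeta` can be constructed from just a `project_id`. A small sketch, assuming the models module from the released 4.2.16 wheel:

```python
# Sketch of the model defaults shown above: only project_id is required.
from mcp_code_indexer.database.models import IndexMeta, SyncStatus

meta = IndexMeta(project_id="proj-1")
assert meta.total_chunks == 0
assert meta.sync_status is SyncStatus.PENDING
assert meta.metadata == {}   # default_factory=dict gives a fresh dict per instance
assert meta.last_sync is None
print(meta.created)          # populated by default_factory=datetime.utcnow
```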
```diff
@@ -291,13 +327,16 @@ class VectorSearchResult(BaseModel):
     end_line: int = Field(..., description="Ending line number")
     similarity_score: float = Field(..., description="Cosine similarity score")
     project_id: str = Field(..., description="Project identifier")
-    metadata: Dict[str, Any] = Field(
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict, description="Additional metadata"
+    )
+
 
 class VectorIndexStatus(BaseModel):
     """
     Current status of vector indexing for a project.
     """
-
+
     is_indexing: bool = Field(..., description="Whether indexing is currently active")
     indexed_files: int = Field(..., description="Number of files indexed")
     total_files: int = Field(..., description="Total number of files")
```
```diff
@@ -307,9 +346,12 @@ class VectorIndexStatus(BaseModel):
     sync_status: SyncStatus = Field(..., description="Current sync status")
     queue_depth: int = Field(..., description="Number of pending tasks")
     processing_rate: float = Field(..., description="Processing rate")
-    estimated_completion: Optional[datetime] = Field(
+    estimated_completion: Optional[datetime] = Field(
+        None, description="Estimated completion time"
+    )
     error_message: Optional[str] = Field(None, description="Last error message")
 
+
 # Enable forward references for recursive models
 FolderNode.model_rebuild()
 CodebaseOverview.model_rebuild()
```
```diff
--- a/mcp_code_indexer/database/retry_executor.py
+++ b/mcp_code_indexer/database/retry_executor.py
@@ -279,8 +279,13 @@ class RetryExecutor:
         Yields:
             Database connection
         """
+        import sys
+
+        # Store the context manager so we can properly call __aexit__
+        ctx_manager: Optional[AsyncContextManager[aiosqlite.Connection]] = None
 
         async def acquire_connection() -> aiosqlite.Connection:
+            nonlocal ctx_manager
             # This function will be retried by execute_with_retry
             # Get the async context manager and enter it
             ctx_manager = connection_factory()
```
```diff
@@ -288,15 +293,20 @@ class RetryExecutor:
             return conn
 
         # Use execute_with_retry to handle the retry logic
-        # We create a connection and store it for the context manager
         connection = await self.execute_with_retry(acquire_connection, operation_name)
 
         try:
             yield connection
-
-            #
-
-
+        except BaseException:
+            # Pass actual exception info to __aexit__ for proper rollback/cleanup
+            exc_type, exc, tb = sys.exc_info()
+            if ctx_manager is not None:
+                await ctx_manager.__aexit__(exc_type, exc, tb)
+            raise
+        else:
+            # No exception - call __aexit__ with None values
+            if ctx_manager is not None:
+                await ctx_manager.__aexit__(None, None, None)
 
     def _should_retry_exception(self, retry_state: RetryCallState) -> bool:
         """
```
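The retry-executor change keeps a reference to the connection's async context manager and finishes it by hand, passing the live exception info to `__aexit__` so rollback and cleanup see the real failure. A generic, self-contained sketch of that pattern with illustrative names (`resource`, `use_resource`):

```python
# Sketch of manually driving an async context manager: it is entered inside a
# (retryable) helper and closed later with the actual exception info.
import asyncio
import sys
from contextlib import asynccontextmanager


@asynccontextmanager
async def resource():
    print("enter")
    try:
        yield "connection"
    finally:
        print("exit, exception type:", sys.exc_info()[0])


async def use_resource(fail: bool) -> None:
    ctx = resource()
    conn = await ctx.__aenter__()   # entered here, e.g. inside a retry helper
    try:
        print("using", conn)
        if fail:
            raise RuntimeError("boom")
    except BaseException:
        # Forward the live exception to __aexit__, then re-raise it.
        await ctx.__aexit__(*sys.exc_info())
        raise
    else:
        await ctx.__aexit__(None, None, None)


async def main() -> None:
    await use_resource(fail=False)
    try:
        await use_resource(fail=True)
    except RuntimeError:
        pass


asyncio.run(main())
```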