mcp-code-indexer 4.2.15__py3-none-any.whl → 4.2.17__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (28)
  1. mcp_code_indexer/database/database.py +334 -115
  2. mcp_code_indexer/database/database_factory.py +1 -1
  3. mcp_code_indexer/database/exceptions.py +1 -1
  4. mcp_code_indexer/database/models.py +66 -24
  5. mcp_code_indexer/database/retry_executor.py +15 -5
  6. mcp_code_indexer/file_scanner.py +107 -12
  7. mcp_code_indexer/main.py +43 -30
  8. mcp_code_indexer/server/mcp_server.py +201 -7
  9. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  10. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  11. mcp_code_indexer/vector_mode/config.py +113 -45
  12. mcp_code_indexer/vector_mode/const.py +24 -0
  13. mcp_code_indexer/vector_mode/daemon.py +860 -98
  14. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  15. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  16. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  17. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  18. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  19. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  20. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  21. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  22. mcp_code_indexer/vector_mode/types.py +46 -0
  23. mcp_code_indexer/vector_mode/utils.py +50 -0
  24. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/METADATA +13 -10
  25. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/RECORD +28 -21
  26. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/WHEEL +1 -1
  27. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/entry_points.txt +0 -0
  28. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info/licenses}/LICENSE +0 -0
@@ -14,171 +14,354 @@ import uuid
 from typing import List, Dict, Any, Optional
 import turbopuffer

+
+from turbopuffer.types import Row
+
 from ..config import VectorConfig

 logger = logging.getLogger(__name__)

+
 class TurbopufferClient:
     """Clean Turbopuffer client using official SDK."""
-
+
     def __init__(self, api_key: str, region: str = "gcp-europe-west3"):
         self.api_key = api_key
         self.region = region
-
+
         # Initialize official TurboPuffer client
-        self.client = turbopuffer.Turbopuffer(
-            api_key=api_key,
-            region=region
-        )
-        logger.info(f"Initialized TurboPuffer client with region {region}")
-
+        self.client = turbopuffer.Turbopuffer(api_key=api_key, region=region)
+
     def health_check(self) -> bool:
         """Check if Turbopuffer service is healthy."""
         try:
-            namespaces = self.client.namespaces()
+            self.client.namespaces()
             return True
         except Exception as e:
             logger.warning(f"Turbopuffer health check failed: {e}")
             return False
-
+
+    def validate_api_access(self) -> None:
+        """
+        Validate API key and access to Turbopuffer service.
+
+        Raises:
+            RuntimeError: If API access validation fails with specific error details
+        """
+        logger.info("Validating Turbopuffer API access...")
+        try:
+            self.client.namespaces()
+            logger.debug("Turbopuffer API access validated successfully")
+        except Exception as e:
+            error_msg = str(e).lower()
+
+            if "401" in error_msg or "unauthorized" in error_msg:
+                raise RuntimeError(
+                    f"Turbopuffer API authentication failed: Invalid or expired API key. "
+                    f"Please check your TURBOPUFFER_API_KEY. Error: {e}"
+                )
+            elif "403" in error_msg or "forbidden" in error_msg:
+                raise RuntimeError(
+                    f"Turbopuffer API access denied: API key lacks required permissions. Error: {e}"
+                )
+            elif "429" in error_msg or "rate limit" in error_msg:
+                raise RuntimeError(
+                    f"Turbopuffer API rate limit exceeded: Too many requests. Error: {e}"
+                )
+            elif "5" in error_msg and ("error" in error_msg or "server" in error_msg):
+                raise RuntimeError(
+                    f"Turbopuffer service unavailable: Server error. Error: {e}"
+                )
+            else:
+                raise RuntimeError(f"Turbopuffer API access validation failed: {e}")
+
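For orientation, a minimal sketch of how the new validate_api_access() hook could be called at startup; the call site shown here is hypothetical, only the method itself comes from this diff:

    client = TurbopufferClient(api_key="tpuf_...", region="gcp-europe-west3")
    try:
        client.validate_api_access()
    except RuntimeError as err:
        # err carries the classified message (auth, permissions, rate limit, server)
        print(f"Vector mode unavailable: {err}")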
     def generate_vector_id(self, project_id: str, chunk_id: int) -> str:
         """Generate a unique vector ID."""
         return f"{project_id}_{chunk_id}_{uuid.uuid4().hex[:8]}"
-
+
     def upsert_vectors(
-        self,
-        vectors: List[Dict[str, Any]],
-        namespace: str,
-        **kwargs
+        self, vectors: List[Dict[str, Any]], namespace: str, **kwargs
     ) -> Dict[str, Any]:
         """Store or update vectors in the database."""
         if not vectors:
             return {"upserted": 0}
-
+
         logger.info(f"Upserting {len(vectors)} vectors to namespace '{namespace}'")
-
-        # Format vectors for Turbopuffer SDK
-        formatted_vectors = []
+
+        # Convert row-based data to columnar format for v0.5+ API
+        if not all("id" in vector and "values" in vector for vector in vectors):
+            raise ValueError("Each vector must have 'id' and 'values' fields")
+
+        # Build columnar data structure
+        data = {
+            "id": [str(vector["id"]) for vector in vectors],
+            "vector": [vector["values"] for vector in vectors],
+        }
+
+        # Add metadata attributes as separate columns
+        all_metadata_keys = set()
         for vector in vectors:
-            if "id" not in vector or "values" not in vector:
-                raise ValueError("Each vector must have 'id' and 'values' fields")
-
-            formatted_vector = {
-                "id": str(vector["id"]),
-                "vector": vector["values"],
-                "attributes": vector.get("metadata", {}),
-            }
-            formatted_vectors.append(formatted_vector)
-
+            metadata = vector.get("metadata", {})
+            all_metadata_keys.update(metadata.keys())
+
+        # Add each metadata attribute as a column
+        for key in all_metadata_keys:
+            data[key] = [vector.get("metadata", {}).get(key) for vector in vectors]
+
         try:
+            # Get namespace object and use write() with upsert_columns
             ns = self.client.namespace(namespace)
-            ns.upsert(vectors=formatted_vectors)
-
-            logger.info(f"Successfully upserted {len(vectors)} vectors")
-            return {"upserted": len(vectors)}
-
+            response = ns.write(
+                upsert_columns=data,
+                distance_metric="cosine_distance",  # Default metric TODO: which one to use?
+            )
+            # Log actual results from the response
+            rows_affected = getattr(response, "rows_affected", len(vectors))
+            logger.info(
+                f"Upsert operation completed: for namespace '{namespace}'. Requested {len(vectors)} vectors, "
+                f"actually affected {rows_affected} rows. Response status: {response.status}, response message: {response.message}"
+            )
+
+            return {"upserted": rows_affected}
+
         except Exception as e:
             logger.error(f"Failed to upsert vectors: {e}")
             raise RuntimeError(f"Vector upsert failed: {e}")
-
+
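To make the new columnar layout concrete, a small worked example (input values invented) of what upsert_vectors() now builds and passes to ns.write(upsert_columns=...):

    vectors = [
        {"id": 1, "values": [0.1, 0.2], "metadata": {"file_path": "a.py"}},
        {"id": 2, "values": [0.3, 0.4],
         "metadata": {"file_path": "b.py", "chunk_type": "function"}},
    ]
    # One column per field; a vector missing a metadata key gets None in that column:
    data = {
        "id": ["1", "2"],
        "vector": [[0.1, 0.2], [0.3, 0.4]],
        "file_path": ["a.py", "b.py"],
        "chunk_type": [None, "function"],
    }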
+    def upsert_vectors_batch(
+        self, all_vectors: List[Dict[str, Any]], namespace: str, **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Store or update vectors from multiple files in a single batch operation.
+
+        Args:
+            all_vectors: List of all vector dictionaries from multiple files
+            namespace: Target namespace for storage
+            **kwargs: Additional arguments for vector storage
+
+        Returns:
+            Dictionary with upsert results
+
+        Raises:
+            RuntimeError: If batch upsert fails
+        """
+        if not all_vectors:
+            return {"upserted": 0}
+
+        logger.info(
+            f"Batch upserting {len(all_vectors)} vectors to namespace '{namespace}'"
+        )
+
+        # Validate vector structure
+        if not all("id" in vector and "values" in vector for vector in all_vectors):
+            raise ValueError("Each vector must have 'id' and 'values' fields")
+
+        try:
+            # Process vectors in sub-batches to respect TurboPuffer limits
+            max_batch_size = 1000  # TurboPuffer recommended limit
+            total_upserted = 0
+
+            for i in range(0, len(all_vectors), max_batch_size):
+                sub_batch = all_vectors[i : i + max_batch_size]
+
+                logger.debug(
+                    f"Processing sub-batch {i//max_batch_size + 1}: {len(sub_batch)} vectors"
+                )
+
+                # Build columnar data structure for this sub-batch
+                data = {
+                    "id": [str(vector["id"]) for vector in sub_batch],
+                    "vector": [vector["values"] for vector in sub_batch],
+                }
+
+                # Add metadata attributes as separate columns
+                all_metadata_keys = set()
+                for vector in sub_batch:
+                    metadata = vector.get("metadata", {})
+                    all_metadata_keys.update(metadata.keys())
+
+                # Add each metadata attribute as a column
+                for key in all_metadata_keys:
+                    data[key] = [
+                        vector.get("metadata", {}).get(key) for vector in sub_batch
+                    ]
+
+                # Upsert this sub-batch
+                ns = self.client.namespace(namespace)
+                response = ns.write(
+                    upsert_columns=data,
+                    distance_metric="cosine_distance",
+                )
+
+                rows_affected = getattr(response, "rows_affected", len(sub_batch))
+                total_upserted += rows_affected
+
+                logger.debug(
+                    f"Sub-batch {i//max_batch_size + 1} upserted: "
+                    f"requested {len(sub_batch)}, affected {rows_affected} rows"
+                )
+
+            logger.info(
+                f"Batch upsert operation completed for namespace '{namespace}'. "
+                f"Requested {len(all_vectors)} vectors, actually affected {total_upserted} rows"
+            )
+
+            return {"upserted": total_upserted}
+
+        except Exception as e:
+            logger.error(f"Failed to batch upsert vectors: {e}")
+            raise RuntimeError(f"Batch vector upsert failed: {e}")
+
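A quick worked example of the sub-batching arithmetic above, assuming a client instance and a populated all_vectors list (both hypothetical):

    # 2,500 vectors with max_batch_size = 1000 yield three write() calls:
    # range(0, 2500, 1000) -> i = 0, 1000, 2000
    # sub-batches all_vectors[0:1000], all_vectors[1000:2000], all_vectors[2000:2500]
    result = client.upsert_vectors_batch(all_vectors, namespace="mcp_code_proj42")
    print(result)  # e.g. {"upserted": 2500}, summed from each response's rows_affected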
     def search_vectors(
         self,
         query_vector: List[float],
         top_k: int = 10,
         namespace: str = "default",
-        filters: Optional[Dict[str, Any]] = None,
-        **kwargs
-    ) -> List[Dict[str, Any]]:
+        filters: turbopuffer.types.Filter | turbopuffer.NotGiven = turbopuffer.NotGiven,
+        **kwargs,
+    ) -> List[Row] | None:
         """Search for similar vectors."""
-        logger.debug(f"Searching {top_k} vectors in namespace '{namespace}'")
-
+        logger.info(f"Searching {top_k} vectors in namespace '{namespace}'")
+
         try:
             ns = self.client.namespace(namespace)
-
             results = ns.query(
-                rank_by=[("vector", "ANN", query_vector)],
+                rank_by=("vector", "ANN", query_vector),  # Use tuple format for v0.5+
                 top_k=top_k,
                 filters=filters,
-                include_attributes=True
+                exclude_attributes=["vector"],
             )
-
-            logger.debug(f"Found {len(results)} similar vectors")
-            return results
-
+            # Return only rows if present, otherwise None
+            if hasattr(results, "rows") and results.rows:
+                logger.debug(f"Found {len(results.rows)} similar vectors")
+                return results.rows
+            else:
+                logger.debug("Found 0 similar vectors")
+                return None
+
         except Exception as e:
             logger.error(f"Vector search failed: {e}")
             raise RuntimeError(f"Vector search failed: {e}")
-
+
     def delete_vectors(
-        self,
-        vector_ids: List[str],
-        namespace: str,
-        **kwargs
+        self, vector_ids: List[str], namespace: str, **kwargs
     ) -> Dict[str, Any]:
         """Delete vectors by ID."""
         if not vector_ids:
             return {"deleted": 0}
-
+
         logger.info(f"Deleting {len(vector_ids)} vectors from namespace '{namespace}'")
-
+
         try:
             ns = self.client.namespace(namespace)
-            ns.delete(ids=vector_ids)
-
-            logger.info(f"Successfully deleted vectors")
-            return {"deleted": len(vector_ids)}
-
+
+            # Use the write method with deletes parameter (v0.5+ API)
+            response = ns.write(deletes=vector_ids)
+
+            # Log actual results from the response
+            rows_affected = getattr(response, "rows_affected", 0)
+            logger.info(
+                f"Delete operation completed: requested {len(vector_ids)} vectors, "
+                f"actually affected {rows_affected} rows"
+            )
+
+            return {"deleted": rows_affected}
+
         except Exception as e:
             logger.error(f"Failed to delete vectors: {e}")
             raise RuntimeError(f"Vector deletion failed: {e}")
-
+
     def list_namespaces(self) -> List[str]:
         """List all available namespaces."""
         try:
             namespaces = self.client.namespaces()
-            return [ns.name for ns in namespaces]
-
+            return [ns.id for ns in namespaces.namespaces]
+
         except Exception as e:
             logger.error(f"Failed to list namespaces: {e}")
             raise RuntimeError(f"Namespace listing failed: {e}")
-
-    def create_namespace(self, namespace: str, dimension: int, **kwargs) -> Dict[str, Any]:
-        """Create a new namespace."""
-        logger.info(f"Creating namespace '{namespace}' with dimension {dimension}")
-
-        try:
-            self.client.create_namespace(
-                name=namespace,
-                dimension=dimension
-            )
-
-            logger.info(f"Successfully created namespace '{namespace}'")
-            return {"name": namespace, "dimension": dimension}
-
-        except Exception as e:
-            logger.error(f"Failed to create namespace: {e}")
-            raise RuntimeError(f"Namespace creation failed: {e}")
-
+
     def delete_namespace(self, namespace: str) -> Dict[str, Any]:
         """Delete a namespace and all its vectors."""
         logger.warning(f"Deleting namespace '{namespace}' and all its vectors")
-
         try:
-            self.client.delete_namespace(namespace)
-
-            logger.info(f"Successfully deleted namespace '{namespace}'")
+            ns = self.client.namespace(namespace)
+            # Use delete_all method to delete the namespace (v0.5+ API)
+            response = ns.delete_all()
+
+            logger.info(
+                f"Namespace deletion completed: '{namespace}' deleted, "
+                f"status: {response.status}, "
+            )
             return {"deleted": namespace}
-
+
         except Exception as e:
             logger.error(f"Failed to delete namespace: {e}")
             raise RuntimeError(f"Namespace deletion failed: {e}")
-
+
     def get_namespace_for_project(self, project_id: str) -> str:
         """Get the namespace name for a project."""
         # Use project ID as namespace, with prefix for safety
-        safe_project_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in project_id)
+        safe_project_id = "".join(
+            c if c.isalnum() or c in "-_" else "_" for c in project_id
+        )
         return f"mcp_code_{safe_project_id}".lower()
-
+
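A worked example of the sanitization (input invented): any character outside letters, digits, "-" and "_" becomes an underscore, then the prefixed result is lower-cased:

    client.get_namespace_for_project("My Repo/v2")
    # "My Repo/v2" -> "My_Repo_v2" -> "mcp_code_my_repo_v2"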
+    def delete_vectors_for_file(self, namespace: str, file_path: str) -> Dict[str, Any]:
+        """
+        Delete all vectors associated with a specific file.
+
+        Args:
+            namespace: The namespace to delete from
+            file_path: Path to the source file
+
+        Returns:
+            Dictionary with deletion results
+
+        Raises:
+            RuntimeError: If deletion fails
+        """
+        logger.info(
+            f"Deleting vectors for file '{file_path}' in namespace '{namespace}'"
+        )
+
+        try:
+            ns = self.client.namespace(namespace)
+
+            # First, query for vectors with matching file_path
+            filter_condition = ("file_path", "Eq", file_path)
+            results = ns.query(
+                filters=filter_condition,
+                top_k=1200,  # Set high enough to catch all chunks for a single file. 1200 is max
+                include_attributes=False,  # We only need IDs
+            )
+
+            if not hasattr(results, "rows") or not results.rows:
+                logger.info(
+                    f"No vectors found for file '{file_path}' in namespace '{namespace}'"
+                )
+                return {"deleted": 0, "file_path": file_path}
+
+            # Extract vector IDs to delete
+            ids_to_delete = [row.id for row in results.rows]
+            logger.info(
+                f"Found {len(ids_to_delete)} vectors to delete for file '{file_path}'"
+            )
+
+            # Delete vectors by ID using existing method
+            delete_result = self.delete_vectors(ids_to_delete, namespace)
+
+            logger.info(
+                f"File deletion completed: removed {delete_result['deleted']} vectors "
+                f"for file '{file_path}' from namespace '{namespace}'"
+            )
+
+            return {"deleted": delete_result["deleted"], "file_path": file_path}
+
+        except Exception as e:
+            logger.error(f"Failed to delete vectors for file '{file_path}': {e}")
+            raise RuntimeError(f"File vector deletion failed: {e}")
+
     def search_with_metadata_filter(
         self,
         query_vector: List[float],
@@ -186,31 +369,41 @@ class TurbopufferClient:
         chunk_type: Optional[str] = None,
         file_path: Optional[str] = None,
         top_k: int = 10,
-        **kwargs
-    ) -> List[Dict[str, Any]]:
+        **kwargs,
+    ) -> List[Row] | None:
         """Search vectors with metadata filtering."""
         namespace = self.get_namespace_for_project(project_id)
-
-        # Build metadata filters
-        filters = {"project_id": project_id}
+
+        # Build metadata filters using tuple format (compatible with TurboPuffer v0.5+ API)
+        filter_conditions = [("project_id", "Eq", project_id)]
+
         if chunk_type:
-            filters["chunk_type"] = chunk_type
+            filter_conditions.append(("chunk_type", "Eq", chunk_type))
         if file_path:
-            filters["file_path"] = file_path
-
+            filter_conditions.append(("file_path", "Eq", file_path))
+
+        # Use appropriate filter format based on number of conditions
+        if len(filter_conditions) == 1:
+            # Single condition - use simple tuple format
+            filters = filter_conditions[0]
+        else:
+            # Multiple conditions - use And format
+            filters = ("And", filter_conditions)
+
         return self.search_vectors(
             query_vector=query_vector,
             top_k=top_k,
             namespace=namespace,
             filters=filters,
-            **kwargs
+            **kwargs,
         )

+
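For illustration, the two filter shapes this method can hand to search_vectors() (values invented):

    # Single condition (only project_id known):
    filters = ("project_id", "Eq", "proj42")
    # Multiple conditions are wrapped in an And clause:
    filters = (
        "And",
        [
            ("project_id", "Eq", "proj42"),
            ("chunk_type", "Eq", "function"),
            ("file_path", "Eq", "src/main.py"),
        ],
    )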
 def create_turbopuffer_client(config: VectorConfig) -> TurbopufferClient:
     """Create a Turbopuffer client from configuration."""
     if not config.turbopuffer_api_key:
         raise ValueError("TURBOPUFFER_API_KEY is required for vector storage")
-
+
     return TurbopufferClient(
         api_key=config.turbopuffer_api_key,
         region=config.turbopuffer_region,
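Taken together, a minimal end-to-end sketch of the client surface after this change. The import paths match the file list above; the VectorConfig constructor arguments, the embedding dimension, and the sample IDs are assumptions for illustration:

    from mcp_code_indexer.vector_mode.config import VectorConfig
    from mcp_code_indexer.vector_mode.providers.turbopuffer_client import (
        create_turbopuffer_client,
    )

    # Hypothetical config values; VectorConfig may require additional fields.
    config = VectorConfig(
        turbopuffer_api_key="tpuf_...",
        turbopuffer_region="gcp-europe-west3",
    )
    client = create_turbopuffer_client(config)
    client.validate_api_access()  # raises RuntimeError with a classified message on failure

    ns = client.get_namespace_for_project("proj42")  # -> "mcp_code_proj42"
    client.upsert_vectors(
        [{"id": "proj42_1_ab12cd34",
          "values": [0.1] * 1024,  # dimension is illustrative
          "metadata": {"file_path": "a.py", "project_id": "proj42"}}],
        namespace=ns,
    )
    rows = client.search_with_metadata_filter(
        query_vector=[0.1] * 1024, project_id="proj42", file_path="a.py", top_k=5
    )
    client.delete_vectors_for_file(namespace=ns, file_path="a.py")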