mcp-code-indexer 4.1.0__py3-none-any.whl → 4.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcp_code_indexer/main.py CHANGED
@@ -1019,7 +1019,7 @@ async def main() -> None:
1019
1019
 
1020
1020
  # Check if vector mode is available
1021
1021
  if not is_vector_mode_available():
1022
- logger.error("Vector mode requires additional dependencies. Install with: pip install mcp-code-indexer[vector]")
1022
+ logger.error("Vector mode dependencies not found. Try reinstalling: pip install --upgrade mcp-code-indexer")
1023
1023
  sys.exit(1)
1024
1024
 
1025
1025
  # Check API keys
@@ -18,6 +18,7 @@ class VectorConfig:
18
18
  # API Configuration
19
19
  voyage_api_key: Optional[str] = None
20
20
  turbopuffer_api_key: Optional[str] = None
21
+ turbopuffer_region: str = "gcp-europe-west3"
21
22
 
22
23
  # Embedding Configuration
23
24
  embedding_model: str = "voyage-code-2"
@@ -57,9 +58,10 @@ class VectorConfig:
57
58
  return cls(
58
59
  voyage_api_key=os.getenv("VOYAGE_API_KEY"),
59
60
  turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
60
- embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-2"),
61
+ turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
62
+ embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-3"),
61
63
  batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
62
- max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "1024")),
64
+ max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
63
65
  similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
64
66
  max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
65
67
  enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower() == "true",
@@ -122,6 +124,16 @@ class VectorConfig:
122
124
  if not self.turbopuffer_api_key:
123
125
  errors.append("TURBOPUFFER_API_KEY environment variable required for vector mode")
124
126
 
127
+ # Validate TurboPuffer region
128
+ supported_regions = [
129
+ 'aws-ap-southeast-2', 'aws-eu-central-1', 'aws-us-east-1',
130
+ 'aws-us-east-2', 'aws-us-west-2', 'gcp-us-central1',
131
+ 'gcp-us-west1', 'gcp-us-east4', 'gcp-europe-west3'
132
+ ]
133
+ if self.turbopuffer_region not in supported_regions:
134
+ errors.append(f"turbopuffer_region '{self.turbopuffer_region}' is not supported. " +
135
+ f"Supported regions: {', '.join(supported_regions)}")
136
+
125
137
  if self.batch_size <= 0:
126
138
  errors.append("batch_size must be positive")
127
139
  if self.max_tokens_per_chunk <= 0:
@@ -1,72 +1,17 @@
1
1
  """
2
2
  External service providers for vector mode.
3
3
 
4
- This package provides integrations with external services including:
5
- - Voyage AI for embedding generation
6
- - Turbopuffer for vector storage and search
4
+ This package provides clean integrations with external services using official SDKs:
5
+ - Voyage AI for embedding generation (voyageai SDK)
6
+ - Turbopuffer for vector storage and search (turbopuffer SDK)
7
7
  """
8
8
 
9
- from typing import Protocol, List, Dict, Any, Optional
10
- from abc import abstractmethod
9
+ from .voyage_client import VoyageClient, create_voyage_client
10
+ from .turbopuffer_client import TurbopufferClient, create_turbopuffer_client
11
11
 
12
- class EmbeddingProvider(Protocol):
13
- """Protocol for embedding generation providers."""
14
-
15
- @abstractmethod
16
- async def generate_embeddings(
17
- self,
18
- texts: List[str],
19
- input_type: str = "document",
20
- **kwargs
21
- ) -> List[List[float]]:
22
- """Generate embeddings for a list of texts."""
23
- ...
24
-
25
- @abstractmethod
26
- async def get_embedding_dimension(self) -> int:
27
- """Get the dimension of embeddings produced by this provider."""
28
- ...
29
-
30
- class VectorStoreProvider(Protocol):
31
- """Protocol for vector storage providers."""
32
-
33
- @abstractmethod
34
- async def upsert_vectors(
35
- self,
36
- vectors: List[Dict[str, Any]],
37
- namespace: Optional[str] = None,
38
- **kwargs
39
- ) -> Dict[str, Any]:
40
- """Store or update vectors in the database."""
41
- ...
42
-
43
- @abstractmethod
44
- async def search_vectors(
45
- self,
46
- query_vector: List[float],
47
- top_k: int = 10,
48
- namespace: Optional[str] = None,
49
- filters: Optional[Dict[str, Any]] = None,
50
- **kwargs
51
- ) -> List[Dict[str, Any]]:
52
- """Search for similar vectors."""
53
- ...
54
-
55
- @abstractmethod
56
- async def delete_vectors(
57
- self,
58
- vector_ids: List[str],
59
- namespace: Optional[str] = None,
60
- **kwargs
61
- ) -> Dict[str, Any]:
62
- """Delete vectors by ID."""
63
- ...
64
-
65
- @abstractmethod
66
- async def get_namespace_stats(
67
- self,
68
- namespace: Optional[str] = None,
69
- **kwargs
70
- ) -> Dict[str, Any]:
71
- """Get statistics about a namespace."""
72
- ...
12
+ __all__ = [
13
+ 'VoyageClient',
14
+ 'create_voyage_client',
15
+ 'TurbopufferClient',
16
+ 'create_turbopuffer_client',
17
+ ]
@@ -1,68 +1,63 @@
1
1
  """
2
- Turbopuffer client for vector storage and search.
2
+ Turbopuffer client for vector storage and search using official SDK.
3
3
 
4
- Provides integration with Turbopuffer's vector database for storing
5
- embeddings and performing similarity searches.
4
+ Provides clean integration with Turbopuffer's vector database for storing
5
+ embeddings and performing similarity searches. Supports configurable
6
+ regions for optimal latency and data residency compliance.
7
+
8
+ Default region: gcp-europe-west3 (Frankfurt)
9
+ Configure via TURBOPUFFER_REGION environment variable.
6
10
  """
7
11
 
8
12
  import logging
9
13
  import uuid
10
- from typing import List, Dict, Any, Optional, Union
11
- import json
14
+ from typing import List, Dict, Any, Optional
15
+ import turbopuffer
12
16
 
13
- from .base_provider import BaseProvider, ProviderError
14
17
  from ..config import VectorConfig
15
18
 
16
19
  logger = logging.getLogger(__name__)
17
20
 
18
- class TurbopufferClient(BaseProvider):
19
- """Client for Turbopuffer vector database."""
21
+ class TurbopufferClient:
22
+ """Clean Turbopuffer client using official SDK."""
20
23
 
21
- def __init__(
22
- self,
23
- api_key: str,
24
- base_url: str = "https://api.turbopuffer.com/v1",
25
- **kwargs
26
- ):
27
- super().__init__(api_key, base_url, **kwargs)
24
+ def __init__(self, api_key: str, region: str = "gcp-europe-west3"):
25
+ self.api_key = api_key
26
+ self.region = region
27
+
28
+ # Initialize official TurboPuffer client
29
+ self.client = turbopuffer.Turbopuffer(
30
+ api_key=api_key,
31
+ region=region
32
+ )
33
+ logger.info(f"Initialized TurboPuffer client with region {region}")
28
34
 
29
- async def health_check(self) -> bool:
35
+ def health_check(self) -> bool:
30
36
  """Check if Turbopuffer service is healthy."""
31
37
  try:
32
- # List namespaces to test connectivity
33
- await self.list_namespaces()
38
+ namespaces = self.client.namespaces()
34
39
  return True
35
40
  except Exception as e:
36
41
  logger.warning(f"Turbopuffer health check failed: {e}")
37
42
  return False
38
43
 
39
- def _generate_vector_id(self, project_id: str, chunk_id: int) -> str:
44
+ def generate_vector_id(self, project_id: str, chunk_id: int) -> str:
40
45
  """Generate a unique vector ID."""
41
46
  return f"{project_id}_{chunk_id}_{uuid.uuid4().hex[:8]}"
42
47
 
43
- async def upsert_vectors(
48
+ def upsert_vectors(
44
49
  self,
45
50
  vectors: List[Dict[str, Any]],
46
51
  namespace: str,
47
52
  **kwargs
48
53
  ) -> Dict[str, Any]:
49
- """
50
- Store or update vectors in the database.
51
-
52
- Args:
53
- vectors: List of vector objects with id, values, and metadata
54
- namespace: Turbopuffer namespace to store vectors in
55
- **kwargs: Additional arguments
56
-
57
- Returns:
58
- Response from Turbopuffer API
59
- """
54
+ """Store or update vectors in the database."""
60
55
  if not vectors:
61
56
  return {"upserted": 0}
62
57
 
63
58
  logger.info(f"Upserting {len(vectors)} vectors to namespace '{namespace}'")
64
59
 
65
- # Format vectors for Turbopuffer API
60
+ # Format vectors for Turbopuffer SDK
66
61
  formatted_vectors = []
67
62
  for vector in vectors:
68
63
  if "id" not in vector or "values" not in vector:
@@ -75,210 +70,108 @@ class TurbopufferClient(BaseProvider):
75
70
  }
76
71
  formatted_vectors.append(formatted_vector)
77
72
 
78
- request_data = {
79
- "vectors": formatted_vectors,
80
- }
81
-
82
73
  try:
83
- response = await self._make_request(
84
- method="POST",
85
- endpoint=f"/namespaces/{namespace}/vectors",
86
- data=request_data,
87
- )
74
+ ns = self.client.namespace(namespace)
75
+ ns.upsert(vectors=formatted_vectors)
88
76
 
89
77
  logger.info(f"Successfully upserted {len(vectors)} vectors")
90
- return response
78
+ return {"upserted": len(vectors)}
91
79
 
92
80
  except Exception as e:
93
81
  logger.error(f"Failed to upsert vectors: {e}")
94
- raise ProviderError(f"Vector upsert failed: {e}")
82
+ raise RuntimeError(f"Vector upsert failed: {e}")
95
83
 
96
- async def search_vectors(
84
+ def search_vectors(
97
85
  self,
98
86
  query_vector: List[float],
99
87
  top_k: int = 10,
100
88
  namespace: str = "default",
101
89
  filters: Optional[Dict[str, Any]] = None,
102
- include_attributes: bool = True,
103
90
  **kwargs
104
91
  ) -> List[Dict[str, Any]]:
105
- """
106
- Search for similar vectors.
107
-
108
- Args:
109
- query_vector: Query vector to search with
110
- top_k: Number of results to return
111
- namespace: Turbopuffer namespace to search in
112
- filters: Metadata filters to apply
113
- include_attributes: Whether to include vector attributes in results
114
- **kwargs: Additional arguments
115
-
116
- Returns:
117
- List of search results with id, score, and metadata
118
- """
92
+ """Search for similar vectors."""
119
93
  logger.debug(f"Searching {top_k} vectors in namespace '{namespace}'")
120
94
 
121
- request_data = {
122
- "vector": query_vector,
123
- "top_k": top_k,
124
- "include_attributes": include_attributes,
125
- }
126
-
127
- if filters:
128
- request_data["filters"] = filters
129
-
130
95
  try:
131
- response = await self._make_request(
132
- method="POST",
133
- endpoint=f"/namespaces/{namespace}/search",
134
- data=request_data,
96
+ ns = self.client.namespace(namespace)
97
+
98
+ results = ns.query(
99
+ rank_by=[("vector", "ANN", query_vector)],
100
+ top_k=top_k,
101
+ filters=filters,
102
+ include_attributes=True
135
103
  )
136
104
 
137
- results = response.get("results", [])
138
105
  logger.debug(f"Found {len(results)} similar vectors")
139
-
140
106
  return results
141
107
 
142
108
  except Exception as e:
143
109
  logger.error(f"Vector search failed: {e}")
144
- raise ProviderError(f"Vector search failed: {e}")
110
+ raise RuntimeError(f"Vector search failed: {e}")
145
111
 
146
- async def delete_vectors(
112
+ def delete_vectors(
147
113
  self,
148
114
  vector_ids: List[str],
149
115
  namespace: str,
150
116
  **kwargs
151
117
  ) -> Dict[str, Any]:
152
- """
153
- Delete vectors by ID.
154
-
155
- Args:
156
- vector_ids: List of vector IDs to delete
157
- namespace: Turbopuffer namespace
158
- **kwargs: Additional arguments
159
-
160
- Returns:
161
- Response from Turbopuffer API
162
- """
118
+ """Delete vectors by ID."""
163
119
  if not vector_ids:
164
120
  return {"deleted": 0}
165
121
 
166
122
  logger.info(f"Deleting {len(vector_ids)} vectors from namespace '{namespace}'")
167
123
 
168
- request_data = {
169
- "ids": vector_ids,
170
- }
171
-
172
124
  try:
173
- response = await self._make_request(
174
- method="DELETE",
175
- endpoint=f"/namespaces/{namespace}/vectors",
176
- data=request_data,
177
- )
125
+ ns = self.client.namespace(namespace)
126
+ ns.delete(ids=vector_ids)
178
127
 
179
128
  logger.info(f"Successfully deleted vectors")
180
- return response
129
+ return {"deleted": len(vector_ids)}
181
130
 
182
131
  except Exception as e:
183
132
  logger.error(f"Failed to delete vectors: {e}")
184
- raise ProviderError(f"Vector deletion failed: {e}")
185
-
186
- async def get_namespace_stats(
187
- self,
188
- namespace: str,
189
- **kwargs
190
- ) -> Dict[str, Any]:
191
- """
192
- Get statistics about a namespace.
193
-
194
- Args:
195
- namespace: Turbopuffer namespace
196
- **kwargs: Additional arguments
197
-
198
- Returns:
199
- Namespace statistics
200
- """
201
- try:
202
- response = await self._make_request(
203
- method="GET",
204
- endpoint=f"/namespaces/{namespace}",
205
- )
206
-
207
- return response
208
-
209
- except Exception as e:
210
- logger.error(f"Failed to get namespace stats: {e}")
211
- raise ProviderError(f"Namespace stats failed: {e}")
133
+ raise RuntimeError(f"Vector deletion failed: {e}")
212
134
 
213
- async def list_namespaces(self) -> List[str]:
135
+ def list_namespaces(self) -> List[str]:
214
136
  """List all available namespaces."""
215
137
  try:
216
- response = await self._make_request(
217
- method="GET",
218
- endpoint="/namespaces",
219
- )
220
-
221
- namespaces = response.get("namespaces", [])
222
- return [ns["name"] for ns in namespaces]
138
+ namespaces = self.client.namespaces()
139
+ return [ns.name for ns in namespaces]
223
140
 
224
141
  except Exception as e:
225
142
  logger.error(f"Failed to list namespaces: {e}")
226
- raise ProviderError(f"Namespace listing failed: {e}")
143
+ raise RuntimeError(f"Namespace listing failed: {e}")
227
144
 
228
- async def create_namespace(
229
- self,
230
- namespace: str,
231
- dimension: int,
232
- **kwargs
233
- ) -> Dict[str, Any]:
234
- """
235
- Create a new namespace.
236
-
237
- Args:
238
- namespace: Name of the namespace to create
239
- dimension: Vector dimension for the namespace
240
- **kwargs: Additional arguments
241
-
242
- Returns:
243
- Response from Turbopuffer API
244
- """
145
+ def create_namespace(self, namespace: str, dimension: int, **kwargs) -> Dict[str, Any]:
146
+ """Create a new namespace."""
245
147
  logger.info(f"Creating namespace '{namespace}' with dimension {dimension}")
246
148
 
247
- request_data = {
248
- "name": namespace,
249
- "dimension": dimension,
250
- }
251
-
252
149
  try:
253
- response = await self._make_request(
254
- method="POST",
255
- endpoint="/namespaces",
256
- data=request_data,
150
+ self.client.create_namespace(
151
+ name=namespace,
152
+ dimension=dimension
257
153
  )
258
154
 
259
155
  logger.info(f"Successfully created namespace '{namespace}'")
260
- return response
156
+ return {"name": namespace, "dimension": dimension}
261
157
 
262
158
  except Exception as e:
263
159
  logger.error(f"Failed to create namespace: {e}")
264
- raise ProviderError(f"Namespace creation failed: {e}")
160
+ raise RuntimeError(f"Namespace creation failed: {e}")
265
161
 
266
- async def delete_namespace(self, namespace: str) -> Dict[str, Any]:
162
+ def delete_namespace(self, namespace: str) -> Dict[str, Any]:
267
163
  """Delete a namespace and all its vectors."""
268
164
  logger.warning(f"Deleting namespace '{namespace}' and all its vectors")
269
165
 
270
166
  try:
271
- response = await self._make_request(
272
- method="DELETE",
273
- endpoint=f"/namespaces/{namespace}",
274
- )
167
+ self.client.delete_namespace(namespace)
275
168
 
276
169
  logger.info(f"Successfully deleted namespace '{namespace}'")
277
- return response
170
+ return {"deleted": namespace}
278
171
 
279
172
  except Exception as e:
280
173
  logger.error(f"Failed to delete namespace: {e}")
281
- raise ProviderError(f"Namespace deletion failed: {e}")
174
+ raise RuntimeError(f"Namespace deletion failed: {e}")
282
175
 
283
176
  def get_namespace_for_project(self, project_id: str) -> str:
284
177
  """Get the namespace name for a project."""
@@ -286,7 +179,7 @@ class TurbopufferClient(BaseProvider):
286
179
  safe_project_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in project_id)
287
180
  return f"mcp_code_{safe_project_id}".lower()
288
181
 
289
- async def search_with_metadata_filter(
182
+ def search_with_metadata_filter(
290
183
  self,
291
184
  query_vector: List[float],
292
185
  project_id: str,
@@ -295,20 +188,7 @@ class TurbopufferClient(BaseProvider):
295
188
  top_k: int = 10,
296
189
  **kwargs
297
190
  ) -> List[Dict[str, Any]]:
298
- """
299
- Search vectors with metadata filtering.
300
-
301
- Args:
302
- query_vector: Query vector
303
- project_id: Project to search within
304
- chunk_type: Filter by chunk type (optional)
305
- file_path: Filter by file path (optional)
306
- top_k: Number of results to return
307
- **kwargs: Additional arguments
308
-
309
- Returns:
310
- Filtered search results
311
- """
191
+ """Search vectors with metadata filtering."""
312
192
  namespace = self.get_namespace_for_project(project_id)
313
193
 
314
194
  # Build metadata filters
@@ -318,7 +198,7 @@ class TurbopufferClient(BaseProvider):
318
198
  if file_path:
319
199
  filters["file_path"] = file_path
320
200
 
321
- return await self.search_vectors(
201
+ return self.search_vectors(
322
202
  query_vector=query_vector,
323
203
  top_k=top_k,
324
204
  namespace=namespace,
@@ -333,6 +213,5 @@ def create_turbopuffer_client(config: VectorConfig) -> TurbopufferClient:
333
213
 
334
214
  return TurbopufferClient(
335
215
  api_key=config.turbopuffer_api_key,
336
- timeout=30.0,
337
- max_retries=3,
216
+ region=config.turbopuffer_region,
338
217
  )
@@ -1,164 +1,78 @@
1
1
  """
2
- Voyage AI client for embedding generation.
2
+ Voyage AI client for embedding generation using official SDK.
3
3
 
4
- Provides integration with Voyage AI's embedding API for generating
4
+ Provides clean integration with Voyage AI's embedding API for generating
5
5
  high-quality code embeddings using the voyage-code-2 model.
6
6
  """
7
7
 
8
8
  import logging
9
- from typing import List, Dict, Any, Optional, Union
10
- import tiktoken
9
+ from typing import List, Dict, Any
10
+ import voyageai
11
11
 
12
- from .base_provider import BaseProvider, ProviderError
13
12
  from ..config import VectorConfig
14
13
 
15
14
  logger = logging.getLogger(__name__)
16
15
 
17
- class VoyageClient(BaseProvider):
18
- """Client for Voyage AI embedding generation."""
16
+ class VoyageClient:
17
+ """Clean Voyage AI client using official SDK."""
19
18
 
20
- def __init__(
21
- self,
22
- api_key: str,
23
- model: str = "voyage-code-2",
24
- base_url: str = "https://api.voyageai.com/v1",
25
- **kwargs
26
- ):
27
- super().__init__(api_key, base_url, **kwargs)
19
+ def __init__(self, api_key: str, model: str = "voyage-code-2"):
20
+ self.api_key = api_key
28
21
  self.model = model
29
- self._embedding_dimension: Optional[int] = None
22
+ self._embedding_dimension: int | None = None
30
23
 
31
- # Note: Voyage AI uses proprietary tokenizer, not tiktoken
32
- # We'll use approximate counting and let the API handle truncation
33
- self.tokenizer = None
34
- logger.info("Using approximate token counting - Voyage AI handles tokenization internally")
24
+ # Initialize official Voyage AI client
25
+ self.client = voyageai.Client(api_key=api_key)
26
+ logger.info(f"Initialized Voyage AI client with model {model}")
35
27
 
36
- async def health_check(self) -> bool:
28
+ def health_check(self) -> bool:
37
29
  """Check if Voyage AI service is healthy."""
38
30
  try:
39
- # Make a small test request
40
- await self.generate_embeddings(["test"], input_type="query")
41
- return True
31
+ result = self.client.embed(["test"], model=self.model, input_type="query")
32
+ return len(result.embeddings) > 0
42
33
  except Exception as e:
43
34
  logger.warning(f"Voyage AI health check failed: {e}")
44
35
  return False
45
36
 
46
- def _count_tokens(self, text: str) -> int:
47
- """Approximate token count - Voyage AI handles exact tokenization."""
48
- # Voyage AI uses proprietary tokenizer - this is just for batching estimates
49
- # Rough approximation: 4 characters per token (conservative estimate)
50
- return len(text) // 4
51
-
52
- def _batch_texts_by_tokens(
53
- self,
54
- texts: List[str],
55
- max_tokens_per_batch: int = 120000 # Leave buffer under 128k limit
56
- ) -> List[List[str]]:
57
- """Batch texts to stay under token limits."""
58
- batches = []
59
- current_batch = []
60
- current_tokens = 0
61
-
62
- for text in texts:
63
- text_tokens = self._count_tokens(text)
64
-
65
- # If single text exceeds limit, truncate it (let Voyage API handle exact truncation)
66
- if text_tokens > max_tokens_per_batch:
67
- # Rough character-based truncation - Voyage API will handle exact tokenization
68
- target_chars = (max_tokens_per_batch - 100) * 4 # Conservative estimate
69
- text = text[:target_chars]
70
- text_tokens = self._count_tokens(text)
71
-
72
- logger.warning(f"Pre-truncated text to ~{text_tokens} tokens (Voyage API will handle exact tokenization)")
73
-
74
- # Check if adding this text would exceed the batch limit
75
- if current_tokens + text_tokens > max_tokens_per_batch and current_batch:
76
- batches.append(current_batch)
77
- current_batch = [text]
78
- current_tokens = text_tokens
79
- else:
80
- current_batch.append(text)
81
- current_tokens += text_tokens
82
-
83
- if current_batch:
84
- batches.append(current_batch)
85
-
86
- return batches
87
-
88
- async def generate_embeddings(
37
+ def generate_embeddings(
89
38
  self,
90
39
  texts: List[str],
91
40
  input_type: str = "document",
92
- truncation: bool = True,
93
41
  **kwargs
94
42
  ) -> List[List[float]]:
95
- """
96
- Generate embeddings for a list of texts.
97
-
98
- Args:
99
- texts: List of texts to embed
100
- input_type: Type of input ("document" or "query")
101
- truncation: Whether to enable truncation
102
- **kwargs: Additional arguments
103
-
104
- Returns:
105
- List of embedding vectors
106
- """
43
+ """Generate embeddings for texts using official SDK."""
107
44
  if not texts:
108
45
  return []
109
46
 
110
47
  logger.info(f"Generating embeddings for {len(texts)} texts using {self.model}")
111
48
 
112
- # Batch texts to stay under token limits
113
- batches = self._batch_texts_by_tokens(texts)
114
- all_embeddings = []
115
-
116
- for i, batch in enumerate(batches):
117
- logger.debug(f"Processing batch {i+1}/{len(batches)} with {len(batch)} texts")
49
+ try:
50
+ result = self.client.embed(
51
+ texts=texts,
52
+ model=self.model,
53
+ input_type=input_type,
54
+ truncation=True
55
+ )
118
56
 
119
- request_data = {
120
- "input": batch,
121
- "model": self.model,
122
- "input_type": input_type,
123
- "truncation": truncation,
124
- }
57
+ # Log usage if available
58
+ if hasattr(result, 'usage') and result.usage:
59
+ logger.debug(f"Token usage: {result.usage.total_tokens}")
125
60
 
126
- try:
127
- response = await self._make_request(
128
- method="POST",
129
- endpoint="/embeddings",
130
- data=request_data,
131
- )
132
-
133
- # Extract embeddings from response
134
- if "data" not in response:
135
- raise ProviderError("Invalid response format from Voyage AI")
136
-
137
- batch_embeddings = [item["embedding"] for item in response["data"]]
138
- all_embeddings.extend(batch_embeddings)
139
-
140
- # Log usage information if available
141
- if "usage" in response:
142
- usage = response["usage"]
143
- logger.debug(
144
- f"Batch {i+1} usage: {usage.get('total_tokens', 0)} tokens"
145
- )
146
-
147
- except Exception as e:
148
- logger.error(f"Failed to generate embeddings for batch {i+1}: {e}")
149
- raise ProviderError(f"Embedding generation failed: {e}")
150
-
151
- logger.info(f"Successfully generated {len(all_embeddings)} embeddings")
152
- return all_embeddings
61
+ logger.info(f"Successfully generated {len(result.embeddings)} embeddings")
62
+ return result.embeddings
63
+
64
+ except Exception as e:
65
+ logger.error(f"Failed to generate embeddings: {e}")
66
+ raise RuntimeError(f"Embedding generation failed: {e}")
153
67
 
154
- async def get_embedding_dimension(self) -> int:
68
+ def get_embedding_dimension(self) -> int:
155
69
  """Get the dimension of embeddings produced by this model."""
156
70
  if self._embedding_dimension is not None:
157
71
  return self._embedding_dimension
158
72
 
159
73
  # Generate a test embedding to determine dimension
160
74
  try:
161
- test_embeddings = await self.generate_embeddings(["test"], input_type="query")
75
+ test_embeddings = self.generate_embeddings(["test"], input_type="query")
162
76
  if test_embeddings:
163
77
  self._embedding_dimension = len(test_embeddings[0])
164
78
  logger.info(f"Detected embedding dimension: {self._embedding_dimension}")
@@ -166,27 +80,22 @@ class VoyageClient(BaseProvider):
166
80
  except Exception as e:
167
81
  logger.warning(f"Could not determine embedding dimension: {e}")
168
82
 
169
- # Default dimensions for known Voyage models (as of 2024)
170
- # Note: These may change - verify with Voyage AI documentation
83
+ # Default dimensions for known Voyage models
171
84
  model_dimensions = {
172
- "voyage-code-2": 1536, # Code-optimized model
173
- "voyage-2": 1024, # General purpose
174
- "voyage-large-2": 1536, # Large general purpose
175
- "voyage-3": 1024, # Newer general purpose (if available)
85
+ "voyage-code-2": 1536,
86
+ "voyage-2": 1024,
87
+ "voyage-large-2": 1536,
88
+ "voyage-3": 1024,
176
89
  }
177
90
 
178
91
  self._embedding_dimension = model_dimensions.get(self.model, 1536)
179
- logger.info(f"Using default dimension for {self.model}: {self._embedding_dimension}")
92
+ logger.info(f"Using default embedding dimension: {self._embedding_dimension}")
180
93
  return self._embedding_dimension
181
94
 
182
- async def generate_query_embedding(self, query: str) -> List[float]:
183
- """Generate a single embedding for a search query."""
184
- embeddings = await self.generate_embeddings([query], input_type="query")
185
- return embeddings[0] if embeddings else []
186
-
187
- async def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
95
+ def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
188
96
  """Estimate the cost of embedding generation."""
189
- total_tokens = sum(self._count_tokens(text) for text in texts)
97
+ # Rough token estimation (4 chars per token)
98
+ total_tokens = sum(len(text) // 4 for text in texts)
190
99
 
191
100
  # Voyage AI pricing (approximate, may change)
192
101
  cost_per_1k_tokens = 0.00013 # voyage-code-2 pricing
@@ -207,6 +116,4 @@ def create_voyage_client(config: VectorConfig) -> VoyageClient:
207
116
  return VoyageClient(
208
117
  api_key=config.voyage_api_key,
209
118
  model=config.embedding_model,
210
- timeout=30.0,
211
- max_retries=3,
212
119
  )
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: mcp-code-indexer
3
- Version: 4.1.0
3
+ Version: 4.2.1
4
4
  Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
5
5
  License: MIT
6
6
  Keywords: mcp,model-context-protocol,code-indexer,ai-tools,codebase-navigation,file-descriptions,llm-tools
7
7
  Author: MCP Code Indexer Contributors
8
8
  Maintainer: MCP Code Indexer Contributors
9
- Requires-Python: >=3.9,<4.0
9
+ Requires-Python: >=3.10,<3.13
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Environment :: Console
12
12
  Classifier: Framework :: AsyncIO
@@ -14,16 +14,15 @@ Classifier: Intended Audience :: Developers
14
14
  Classifier: License :: OSI Approved :: MIT License
15
15
  Classifier: Operating System :: OS Independent
16
16
  Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
20
19
  Classifier: Programming Language :: Python :: 3.12
21
20
  Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.9
22
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Software Development
24
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
25
  Classifier: Typing :: Typed
26
- Provides-Extra: vector
27
26
  Requires-Dist: aiofiles (==23.2.0)
28
27
  Requires-Dist: aiohttp (>=3.8.0)
29
28
  Requires-Dist: aiosqlite (==0.19.0)
@@ -33,10 +32,15 @@ Requires-Dist: importlib-metadata (>=1.0.0) ; python_version < "3.8"
33
32
  Requires-Dist: mcp (>=1.9.0)
34
33
  Requires-Dist: pydantic (>=2.8.0)
35
34
  Requires-Dist: python-multipart (>=0.0.6)
35
+ Requires-Dist: pyyaml (>=6.0)
36
36
  Requires-Dist: tenacity (>=8.0.0)
37
37
  Requires-Dist: tiktoken (>=0.9.0)
38
38
  Requires-Dist: tomli (>=1.2.0) ; python_version < "3.11"
39
+ Requires-Dist: tree-sitter (>=0.25.0)
40
+ Requires-Dist: turbopuffer (>=0.6.0)
39
41
  Requires-Dist: uvicorn (>=0.24.0)
42
+ Requires-Dist: voyageai (>=0.3.0)
43
+ Requires-Dist: watchdog (>=6.0.0)
40
44
  Project-URL: Documentation, https://github.com/fluffypony/mcp-code-indexer/blob/main/README.md
41
45
  Project-URL: Homepage, https://github.com/fluffypony/mcp-code-indexer
42
46
  Project-URL: Repository, https://github.com/fluffypony/mcp-code-indexer
@@ -44,8 +48,8 @@ Description-Content-Type: text/markdown
44
48
 
45
49
  # MCP Code Indexer 🚀
46
50
 
47
- [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?46)](https://badge.fury.io/py/mcp-code-indexer)
48
- [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?46)](https://pypi.org/project/mcp-code-indexer/)
51
+ [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?48)](https://badge.fury.io/py/mcp-code-indexer)
52
+ [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?48)](https://pypi.org/project/mcp-code-indexer/)
49
53
  [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
50
54
 
51
55
  A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -215,12 +219,15 @@ Vector Mode transforms how you search and understand codebases by using AI embed
215
219
  ### 🚀 Quick Start
216
220
 
217
221
  ```bash
218
- # Install vector mode dependencies
219
- pip install mcp-code-indexer[vector]
222
+ # Install MCP Code Indexer (includes vector mode)
223
+ pip install mcp-code-indexer
220
224
 
221
225
  # Set required API keys
222
226
  export VOYAGE_API_KEY="pa-your-voyage-api-key"
223
- export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
227
+ export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
228
+
229
+ # Optional: Configure region (default: gcp-europe-west3)
230
+ export TURBOPUFFER_REGION="gcp-europe-west3"
224
231
 
225
232
  # Start with vector mode enabled
226
233
  mcp-code-indexer --vector
@@ -19,7 +19,7 @@ mcp_code_indexer/error_handler.py,sha256=ylciEM-cR7E8Gmd8cfh5olcllJm0FnaYBGH86ya
19
19
  mcp_code_indexer/file_scanner.py,sha256=7Ab34lRQGeh5GBCzcSP96p4YK6LDWFGUHLXqi499UZ4,11838
20
20
  mcp_code_indexer/git_hook_handler.py,sha256=sTtZV3-Yy1Evt06R5NZclELeepM4Ia9OQoR2O6BK3Hk,45517
21
21
  mcp_code_indexer/logging_config.py,sha256=M5eVZ5PwfTROib7ISTQ522n2hUSc4hJ_wUgsrJKsTTg,10030
22
- mcp_code_indexer/main.py,sha256=tdUEcTVLweLmrG49TReGAl1nBf0vnzCIa7NSg6IPPec,37137
22
+ mcp_code_indexer/main.py,sha256=tII1x_LHmD1T951-L1lTzXkR9Vz0z8_pNs-mznfj1CY,37133
23
23
  mcp_code_indexer/middleware/__init__.py,sha256=UCEPzOlZldlqFzYEfrXw1HvCDvY1jpLvyaDGUzVr2aw,368
24
24
  mcp_code_indexer/middleware/auth.py,sha256=4HkHMDZBNsyPA1VE8qF7pRNKbqG4xIDZjllENbgynxI,7258
25
25
  mcp_code_indexer/middleware/error_middleware.py,sha256=0RnKM5fK_n_7AITK2ueAqv30kLBdjU3vaWOTwWd2Xs0,11965
@@ -46,21 +46,20 @@ mcp_code_indexer/vector_mode/chunking/__init__.py,sha256=rjjFMbHsqWIBzL4IajYxXXJ
46
46
  mcp_code_indexer/vector_mode/chunking/ast_chunker.py,sha256=GTl_6U0nSgDRRzKS07tJ7RMX8AmJvvY_IsRn95hvVfA,14623
47
47
  mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py,sha256=xD0zEibjt6FLBFaKHNc63-iKTtCgnOlLL_9Hc8mCrzE,19752
48
48
  mcp_code_indexer/vector_mode/chunking/language_handlers.py,sha256=YEpTVjzyJH445OjniGV05apexsfG5KVR4lwBEl4mGJc,18189
49
- mcp_code_indexer/vector_mode/config.py,sha256=OgjkY-chGIWJCusNA327gm0Jzy_j6U-k4Qdiq70MRBM,6023
49
+ mcp_code_indexer/vector_mode/config.py,sha256=g5p9Q4EAR20DfLv4RxaQnk3_UdysuvWS8rcsjs1vgwI,6680
50
50
  mcp_code_indexer/vector_mode/daemon.py,sha256=le3NkxFD73bKeutruzLY-Bauc-nXzlhlIlDJv4jlxhU,12096
51
51
  mcp_code_indexer/vector_mode/monitoring/__init__.py,sha256=9rNWCvHxRMvYumdIrPjb5K9fpOwe1Aem24hdh8gXoDM,439
52
52
  mcp_code_indexer/vector_mode/monitoring/change_detector.py,sha256=X82e_sKbJJFPhqZFJubLQb8Rs-srRtS7sh0nUOsPCPw,10338
53
53
  mcp_code_indexer/vector_mode/monitoring/file_watcher.py,sha256=AQ6YHSKXPubtprLZngeLb0othJOCNQZ7wwXUvqwphT4,15299
54
54
  mcp_code_indexer/vector_mode/monitoring/merkle_tree.py,sha256=83RLdUj_cgcAlrT9Wev9IBavVEyc8Jo8w--IOJisLOk,14645
55
- mcp_code_indexer/vector_mode/providers/__init__.py,sha256=xZLGtAuaQpEWm5KW5Bdf8fMO92wb7OwOedSKhacjmwY,1908
56
- mcp_code_indexer/vector_mode/providers/base_provider.py,sha256=4lmWUTDwB5CmFhEc004DkniiCuiRfFFTBBB0BOHlsUE,7513
57
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py,sha256=97em_sHGvzEy6h1BI4Ux7IPj8U4d5ayYJyLwzmFRMyM,10758
58
- mcp_code_indexer/vector_mode/providers/voyage_client.py,sha256=12uVi6Hqo2dfoUnbxaXohlsDmfBkeRKEotbvEPzT3n4,8315
55
+ mcp_code_indexer/vector_mode/providers/__init__.py,sha256=0GhPHn7XEBSHa6bLvy8j0Eqvto82o6Bs2hZCrHawLus,514
56
+ mcp_code_indexer/vector_mode/providers/turbopuffer_client.py,sha256=NdBAghmaRUUIGFZOTOZYhYyXvv_QB36lieGQjVlLEno,7599
57
+ mcp_code_indexer/vector_mode/providers/voyage_client.py,sha256=pfm9BOx5Temf0LM-VZ4LH6xwBmZ6XO8XeCSiSZ5LU80,4375
59
58
  mcp_code_indexer/vector_mode/security/__init__.py,sha256=itfeuysSqV-m9xuo-CMkAoucxexVfPgeOU-ieTLvdls,336
60
59
  mcp_code_indexer/vector_mode/security/patterns.py,sha256=0xaiMnZm7YXswq3hVe_DJYePE9MhWuvizApLnmXus9M,11572
61
60
  mcp_code_indexer/vector_mode/security/redactor.py,sha256=tsFzhCJ99bp4EFqQVjZ-4f8Uf3ux9X4ODVR09oJG01U,13380
62
- mcp_code_indexer-4.1.0.dist-info/LICENSE,sha256=JN9dyPPgYwH9C-UjYM7FLNZjQ6BF7kAzpF3_4PwY4rY,1086
63
- mcp_code_indexer-4.1.0.dist-info/METADATA,sha256=_oF0bxlQWX1SczGQb-nUVkNPWHs4Pt0DlqczLacfSPw,27221
64
- mcp_code_indexer-4.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
65
- mcp_code_indexer-4.1.0.dist-info/entry_points.txt,sha256=UABj7HZ0mC6rvF22gxaz2LLNLGQShTrFmp5u00iUtvo,67
66
- mcp_code_indexer-4.1.0.dist-info/RECORD,,
61
+ mcp_code_indexer-4.2.1.dist-info/LICENSE,sha256=JN9dyPPgYwH9C-UjYM7FLNZjQ6BF7kAzpF3_4PwY4rY,1086
62
+ mcp_code_indexer-4.2.1.dist-info/METADATA,sha256=jsPpjmDRZabOKWzLTt0MHzdE-jaNJCiosMx2SBQCtJU,27483
63
+ mcp_code_indexer-4.2.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
64
+ mcp_code_indexer-4.2.1.dist-info/entry_points.txt,sha256=UABj7HZ0mC6rvF22gxaz2LLNLGQShTrFmp5u00iUtvo,67
65
+ mcp_code_indexer-4.2.1.dist-info/RECORD,,
@@ -1,230 +0,0 @@
1
- """
2
- Base provider classes with common functionality.
3
-
4
- Provides retry logic, circuit breaker pattern, and error handling
5
- for external service integrations.
6
- """
7
-
8
- import asyncio
9
- import logging
10
- import time
11
- from typing import Any, Dict, List, Optional, Callable, TypeVar
12
- from abc import ABC, abstractmethod
13
- from contextlib import asynccontextmanager
14
- import aiohttp
15
- from tenacity import (
16
- retry,
17
- stop_after_attempt,
18
- wait_exponential,
19
- retry_if_exception_type,
20
- retry_if_result,
21
- before_sleep_log,
22
- )
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
- T = TypeVar('T')
27
-
28
- class CircuitBreakerError(Exception):
29
- """Raised when circuit breaker is open."""
30
- pass
31
-
32
- class ProviderError(Exception):
33
- """Base exception for provider errors."""
34
- pass
35
-
36
- class RateLimitError(ProviderError):
37
- """Raised when rate limit is exceeded."""
38
- pass
39
-
40
- class AuthenticationError(ProviderError):
41
- """Raised when authentication fails."""
42
- pass
43
-
44
- class CircuitBreaker:
45
- """Circuit breaker implementation for external services."""
46
-
47
- def __init__(
48
- self,
49
- failure_threshold: int = 5,
50
- recovery_timeout: float = 60.0,
51
- expected_exception: type = Exception,
52
- ):
53
- self.failure_threshold = failure_threshold
54
- self.recovery_timeout = recovery_timeout
55
- self.expected_exception = expected_exception
56
-
57
- self.failure_count = 0
58
- self.last_failure_time: Optional[float] = None
59
- self.state = "closed" # closed, open, half-open
60
-
61
- def _should_attempt_reset(self) -> bool:
62
- """Check if we should attempt to reset the circuit breaker."""
63
- return (
64
- self.state == "open"
65
- and self.last_failure_time is not None
66
- and time.time() - self.last_failure_time >= self.recovery_timeout
67
- )
68
-
69
- async def call(self, func: Callable[[], T]) -> T:
70
- """Call a function through the circuit breaker."""
71
- if self.state == "open":
72
- if self._should_attempt_reset():
73
- self.state = "half-open"
74
- logger.info("Circuit breaker attempting reset")
75
- else:
76
- raise CircuitBreakerError("Circuit breaker is open")
77
-
78
- try:
79
- result = await func()
80
- # Success - reset failure count
81
- if self.state == "half-open":
82
- self.state = "closed"
83
- logger.info("Circuit breaker reset to closed")
84
- self.failure_count = 0
85
- return result
86
-
87
- except self.expected_exception as e:
88
- self.failure_count += 1
89
- self.last_failure_time = time.time()
90
-
91
- if self.failure_count >= self.failure_threshold:
92
- self.state = "open"
93
- logger.warning(
94
- f"Circuit breaker opened after {self.failure_count} failures"
95
- )
96
-
97
- raise
98
-
99
- class BaseProvider(ABC):
100
- """Base class for external service providers."""
101
-
102
- def __init__(
103
- self,
104
- api_key: str,
105
- base_url: str,
106
- timeout: float = 30.0,
107
- max_retries: int = 3,
108
- circuit_breaker_enabled: bool = True,
109
- ):
110
- self.api_key = api_key
111
- self.base_url = base_url.rstrip('/')
112
- self.timeout = timeout
113
- self.max_retries = max_retries
114
-
115
- # Circuit breaker for resilience
116
- self.circuit_breaker = CircuitBreaker(
117
- failure_threshold=5,
118
- recovery_timeout=60.0,
119
- expected_exception=(aiohttp.ClientError, ProviderError),
120
- ) if circuit_breaker_enabled else None
121
-
122
- # Rate limiting state
123
- self.last_request_time: Optional[float] = None
124
- self.min_request_interval = 0.1 # 100ms between requests
125
-
126
- # Session will be created lazily
127
- self._session: Optional[aiohttp.ClientSession] = None
128
-
129
- @asynccontextmanager
130
- async def _get_session(self):
131
- """Get or create HTTP session."""
132
- if self._session is None or self._session.closed:
133
- connector = aiohttp.TCPConnector(
134
- limit=100,
135
- limit_per_host=30,
136
- ttl_dns_cache=300,
137
- use_dns_cache=True,
138
- )
139
- timeout = aiohttp.ClientTimeout(total=self.timeout)
140
- self._session = aiohttp.ClientSession(
141
- connector=connector,
142
- timeout=timeout,
143
- headers=self._get_default_headers(),
144
- )
145
-
146
- try:
147
- yield self._session
148
- finally:
149
- # Keep session alive for reuse
150
- pass
151
-
152
- def _get_default_headers(self) -> Dict[str, str]:
153
- """Get default headers for API requests."""
154
- return {
155
- "Authorization": f"Bearer {self.api_key}",
156
- "Content-Type": "application/json",
157
- "User-Agent": "mcp-code-indexer/1.0.0",
158
- }
159
-
160
- async def _rate_limit_wait(self) -> None:
161
- """Wait if necessary to respect rate limits."""
162
- if self.last_request_time is not None:
163
- elapsed = time.time() - self.last_request_time
164
- if elapsed < self.min_request_interval:
165
- await asyncio.sleep(self.min_request_interval - elapsed)
166
-
167
- self.last_request_time = time.time()
168
-
169
- @retry(
170
- stop=stop_after_attempt(3),
171
- wait=wait_exponential(multiplier=1, min=1, max=10),
172
- retry=retry_if_exception_type((aiohttp.ClientError, RateLimitError)),
173
- before_sleep=before_sleep_log(logger, logging.WARNING),
174
- )
175
- async def _make_request(
176
- self,
177
- method: str,
178
- endpoint: str,
179
- data: Optional[Dict[str, Any]] = None,
180
- params: Optional[Dict[str, Any]] = None,
181
- **kwargs
182
- ) -> Dict[str, Any]:
183
- """Make an HTTP request with retry logic."""
184
-
185
- async def _request():
186
- await self._rate_limit_wait()
187
-
188
- url = f"{self.base_url}/{endpoint.lstrip('/')}"
189
-
190
- async with self._get_session() as session:
191
- async with session.request(
192
- method=method,
193
- url=url,
194
- json=data,
195
- params=params,
196
- **kwargs
197
- ) as response:
198
- response_data = await response.json()
199
-
200
- if response.status == 429:
201
- raise RateLimitError("Rate limit exceeded")
202
- elif response.status == 401:
203
- raise AuthenticationError("Authentication failed")
204
- elif response.status >= 400:
205
- raise ProviderError(
206
- f"HTTP {response.status}: {response_data.get('error', 'Unknown error')}"
207
- )
208
-
209
- return response_data
210
-
211
- if self.circuit_breaker:
212
- return await self.circuit_breaker.call(_request)
213
- else:
214
- return await _request()
215
-
216
- async def close(self) -> None:
217
- """Close the HTTP session."""
218
- if self._session and not self._session.closed:
219
- await self._session.close()
220
-
221
- async def __aenter__(self):
222
- return self
223
-
224
- async def __aexit__(self, exc_type, exc_val, exc_tb):
225
- await self.close()
226
-
227
- @abstractmethod
228
- async def health_check(self) -> bool:
229
- """Check if the service is healthy."""
230
- pass