mcp-code-indexer 4.1.0__py3-none-any.whl → 4.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_code_indexer/main.py +1 -1
- mcp_code_indexer/vector_mode/config.py +14 -2
- mcp_code_indexer/vector_mode/providers/__init__.py +11 -66
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +66 -187
- mcp_code_indexer/vector_mode/providers/voyage_client.py +44 -137
- {mcp_code_indexer-4.1.0.dist-info → mcp_code_indexer-4.2.1.dist-info}/METADATA +16 -9
- {mcp_code_indexer-4.1.0.dist-info → mcp_code_indexer-4.2.1.dist-info}/RECORD +10 -11
- mcp_code_indexer/vector_mode/providers/base_provider.py +0 -230
- {mcp_code_indexer-4.1.0.dist-info → mcp_code_indexer-4.2.1.dist-info}/LICENSE +0 -0
- {mcp_code_indexer-4.1.0.dist-info → mcp_code_indexer-4.2.1.dist-info}/WHEEL +0 -0
- {mcp_code_indexer-4.1.0.dist-info → mcp_code_indexer-4.2.1.dist-info}/entry_points.txt +0 -0
mcp_code_indexer/main.py
CHANGED
|
@@ -1019,7 +1019,7 @@ async def main() -> None:
|
|
|
1019
1019
|
|
|
1020
1020
|
# Check if vector mode is available
|
|
1021
1021
|
if not is_vector_mode_available():
|
|
1022
|
-
logger.error("Vector mode
|
|
1022
|
+
logger.error("Vector mode dependencies not found. Try reinstalling: pip install --upgrade mcp-code-indexer")
|
|
1023
1023
|
sys.exit(1)
|
|
1024
1024
|
|
|
1025
1025
|
# Check API keys
|
|
@@ -18,6 +18,7 @@ class VectorConfig:
|
|
|
18
18
|
# API Configuration
|
|
19
19
|
voyage_api_key: Optional[str] = None
|
|
20
20
|
turbopuffer_api_key: Optional[str] = None
|
|
21
|
+
turbopuffer_region: str = "gcp-europe-west3"
|
|
21
22
|
|
|
22
23
|
# Embedding Configuration
|
|
23
24
|
embedding_model: str = "voyage-code-2"
|
|
@@ -57,9 +58,10 @@ class VectorConfig:
|
|
|
57
58
|
return cls(
|
|
58
59
|
voyage_api_key=os.getenv("VOYAGE_API_KEY"),
|
|
59
60
|
turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
|
|
60
|
-
|
|
61
|
+
turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
|
|
62
|
+
embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-3"),
|
|
61
63
|
batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
|
|
62
|
-
max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "
|
|
64
|
+
max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
|
|
63
65
|
similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
|
|
64
66
|
max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
|
|
65
67
|
enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower() == "true",
|
|
@@ -122,6 +124,16 @@ class VectorConfig:
|
|
|
122
124
|
if not self.turbopuffer_api_key:
|
|
123
125
|
errors.append("TURBOPUFFER_API_KEY environment variable required for vector mode")
|
|
124
126
|
|
|
127
|
+
# Validate TurboPuffer region
|
|
128
|
+
supported_regions = [
|
|
129
|
+
'aws-ap-southeast-2', 'aws-eu-central-1', 'aws-us-east-1',
|
|
130
|
+
'aws-us-east-2', 'aws-us-west-2', 'gcp-us-central1',
|
|
131
|
+
'gcp-us-west1', 'gcp-us-east4', 'gcp-europe-west3'
|
|
132
|
+
]
|
|
133
|
+
if self.turbopuffer_region not in supported_regions:
|
|
134
|
+
errors.append(f"turbopuffer_region '{self.turbopuffer_region}' is not supported. " +
|
|
135
|
+
f"Supported regions: {', '.join(supported_regions)}")
|
|
136
|
+
|
|
125
137
|
if self.batch_size <= 0:
|
|
126
138
|
errors.append("batch_size must be positive")
|
|
127
139
|
if self.max_tokens_per_chunk <= 0:
|
|
@@ -1,72 +1,17 @@
|
|
|
1
1
|
"""
|
|
2
2
|
External service providers for vector mode.
|
|
3
3
|
|
|
4
|
-
This package provides integrations with external services
|
|
5
|
-
- Voyage AI for embedding generation
|
|
6
|
-
- Turbopuffer for vector storage and search
|
|
4
|
+
This package provides clean integrations with external services using official SDKs:
|
|
5
|
+
- Voyage AI for embedding generation (voyageai SDK)
|
|
6
|
+
- Turbopuffer for vector storage and search (turbopuffer SDK)
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
from
|
|
10
|
-
from
|
|
9
|
+
from .voyage_client import VoyageClient, create_voyage_client
|
|
10
|
+
from .turbopuffer_client import TurbopufferClient, create_turbopuffer_client
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
texts: List[str],
|
|
19
|
-
input_type: str = "document",
|
|
20
|
-
**kwargs
|
|
21
|
-
) -> List[List[float]]:
|
|
22
|
-
"""Generate embeddings for a list of texts."""
|
|
23
|
-
...
|
|
24
|
-
|
|
25
|
-
@abstractmethod
|
|
26
|
-
async def get_embedding_dimension(self) -> int:
|
|
27
|
-
"""Get the dimension of embeddings produced by this provider."""
|
|
28
|
-
...
|
|
29
|
-
|
|
30
|
-
class VectorStoreProvider(Protocol):
|
|
31
|
-
"""Protocol for vector storage providers."""
|
|
32
|
-
|
|
33
|
-
@abstractmethod
|
|
34
|
-
async def upsert_vectors(
|
|
35
|
-
self,
|
|
36
|
-
vectors: List[Dict[str, Any]],
|
|
37
|
-
namespace: Optional[str] = None,
|
|
38
|
-
**kwargs
|
|
39
|
-
) -> Dict[str, Any]:
|
|
40
|
-
"""Store or update vectors in the database."""
|
|
41
|
-
...
|
|
42
|
-
|
|
43
|
-
@abstractmethod
|
|
44
|
-
async def search_vectors(
|
|
45
|
-
self,
|
|
46
|
-
query_vector: List[float],
|
|
47
|
-
top_k: int = 10,
|
|
48
|
-
namespace: Optional[str] = None,
|
|
49
|
-
filters: Optional[Dict[str, Any]] = None,
|
|
50
|
-
**kwargs
|
|
51
|
-
) -> List[Dict[str, Any]]:
|
|
52
|
-
"""Search for similar vectors."""
|
|
53
|
-
...
|
|
54
|
-
|
|
55
|
-
@abstractmethod
|
|
56
|
-
async def delete_vectors(
|
|
57
|
-
self,
|
|
58
|
-
vector_ids: List[str],
|
|
59
|
-
namespace: Optional[str] = None,
|
|
60
|
-
**kwargs
|
|
61
|
-
) -> Dict[str, Any]:
|
|
62
|
-
"""Delete vectors by ID."""
|
|
63
|
-
...
|
|
64
|
-
|
|
65
|
-
@abstractmethod
|
|
66
|
-
async def get_namespace_stats(
|
|
67
|
-
self,
|
|
68
|
-
namespace: Optional[str] = None,
|
|
69
|
-
**kwargs
|
|
70
|
-
) -> Dict[str, Any]:
|
|
71
|
-
"""Get statistics about a namespace."""
|
|
72
|
-
...
|
|
12
|
+
__all__ = [
|
|
13
|
+
'VoyageClient',
|
|
14
|
+
'create_voyage_client',
|
|
15
|
+
'TurbopufferClient',
|
|
16
|
+
'create_turbopuffer_client',
|
|
17
|
+
]
|
|
@@ -1,68 +1,63 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Turbopuffer client for vector storage and search.
|
|
2
|
+
Turbopuffer client for vector storage and search using official SDK.
|
|
3
3
|
|
|
4
|
-
Provides integration with Turbopuffer's vector database for storing
|
|
5
|
-
embeddings and performing similarity searches.
|
|
4
|
+
Provides clean integration with Turbopuffer's vector database for storing
|
|
5
|
+
embeddings and performing similarity searches. Supports configurable
|
|
6
|
+
regions for optimal latency and data residency compliance.
|
|
7
|
+
|
|
8
|
+
Default region: gcp-europe-west3 (Frankfurt)
|
|
9
|
+
Configure via TURBOPUFFER_REGION environment variable.
|
|
6
10
|
"""
|
|
7
11
|
|
|
8
12
|
import logging
|
|
9
13
|
import uuid
|
|
10
|
-
from typing import List, Dict, Any, Optional
|
|
11
|
-
import
|
|
14
|
+
from typing import List, Dict, Any, Optional
|
|
15
|
+
import turbopuffer
|
|
12
16
|
|
|
13
|
-
from .base_provider import BaseProvider, ProviderError
|
|
14
17
|
from ..config import VectorConfig
|
|
15
18
|
|
|
16
19
|
logger = logging.getLogger(__name__)
|
|
17
20
|
|
|
18
|
-
class TurbopufferClient
|
|
19
|
-
"""
|
|
21
|
+
class TurbopufferClient:
|
|
22
|
+
"""Clean Turbopuffer client using official SDK."""
|
|
20
23
|
|
|
21
|
-
def __init__(
|
|
22
|
-
self
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
24
|
+
def __init__(self, api_key: str, region: str = "gcp-europe-west3"):
|
|
25
|
+
self.api_key = api_key
|
|
26
|
+
self.region = region
|
|
27
|
+
|
|
28
|
+
# Initialize official TurboPuffer client
|
|
29
|
+
self.client = turbopuffer.Turbopuffer(
|
|
30
|
+
api_key=api_key,
|
|
31
|
+
region=region
|
|
32
|
+
)
|
|
33
|
+
logger.info(f"Initialized TurboPuffer client with region {region}")
|
|
28
34
|
|
|
29
|
-
|
|
35
|
+
def health_check(self) -> bool:
|
|
30
36
|
"""Check if Turbopuffer service is healthy."""
|
|
31
37
|
try:
|
|
32
|
-
|
|
33
|
-
await self.list_namespaces()
|
|
38
|
+
namespaces = self.client.namespaces()
|
|
34
39
|
return True
|
|
35
40
|
except Exception as e:
|
|
36
41
|
logger.warning(f"Turbopuffer health check failed: {e}")
|
|
37
42
|
return False
|
|
38
43
|
|
|
39
|
-
def
|
|
44
|
+
def generate_vector_id(self, project_id: str, chunk_id: int) -> str:
|
|
40
45
|
"""Generate a unique vector ID."""
|
|
41
46
|
return f"{project_id}_{chunk_id}_{uuid.uuid4().hex[:8]}"
|
|
42
47
|
|
|
43
|
-
|
|
48
|
+
def upsert_vectors(
|
|
44
49
|
self,
|
|
45
50
|
vectors: List[Dict[str, Any]],
|
|
46
51
|
namespace: str,
|
|
47
52
|
**kwargs
|
|
48
53
|
) -> Dict[str, Any]:
|
|
49
|
-
"""
|
|
50
|
-
Store or update vectors in the database.
|
|
51
|
-
|
|
52
|
-
Args:
|
|
53
|
-
vectors: List of vector objects with id, values, and metadata
|
|
54
|
-
namespace: Turbopuffer namespace to store vectors in
|
|
55
|
-
**kwargs: Additional arguments
|
|
56
|
-
|
|
57
|
-
Returns:
|
|
58
|
-
Response from Turbopuffer API
|
|
59
|
-
"""
|
|
54
|
+
"""Store or update vectors in the database."""
|
|
60
55
|
if not vectors:
|
|
61
56
|
return {"upserted": 0}
|
|
62
57
|
|
|
63
58
|
logger.info(f"Upserting {len(vectors)} vectors to namespace '{namespace}'")
|
|
64
59
|
|
|
65
|
-
# Format vectors for Turbopuffer
|
|
60
|
+
# Format vectors for Turbopuffer SDK
|
|
66
61
|
formatted_vectors = []
|
|
67
62
|
for vector in vectors:
|
|
68
63
|
if "id" not in vector or "values" not in vector:
|
|
@@ -75,210 +70,108 @@ class TurbopufferClient(BaseProvider):
|
|
|
75
70
|
}
|
|
76
71
|
formatted_vectors.append(formatted_vector)
|
|
77
72
|
|
|
78
|
-
request_data = {
|
|
79
|
-
"vectors": formatted_vectors,
|
|
80
|
-
}
|
|
81
|
-
|
|
82
73
|
try:
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
endpoint=f"/namespaces/{namespace}/vectors",
|
|
86
|
-
data=request_data,
|
|
87
|
-
)
|
|
74
|
+
ns = self.client.namespace(namespace)
|
|
75
|
+
ns.upsert(vectors=formatted_vectors)
|
|
88
76
|
|
|
89
77
|
logger.info(f"Successfully upserted {len(vectors)} vectors")
|
|
90
|
-
return
|
|
78
|
+
return {"upserted": len(vectors)}
|
|
91
79
|
|
|
92
80
|
except Exception as e:
|
|
93
81
|
logger.error(f"Failed to upsert vectors: {e}")
|
|
94
|
-
raise
|
|
82
|
+
raise RuntimeError(f"Vector upsert failed: {e}")
|
|
95
83
|
|
|
96
|
-
|
|
84
|
+
def search_vectors(
|
|
97
85
|
self,
|
|
98
86
|
query_vector: List[float],
|
|
99
87
|
top_k: int = 10,
|
|
100
88
|
namespace: str = "default",
|
|
101
89
|
filters: Optional[Dict[str, Any]] = None,
|
|
102
|
-
include_attributes: bool = True,
|
|
103
90
|
**kwargs
|
|
104
91
|
) -> List[Dict[str, Any]]:
|
|
105
|
-
"""
|
|
106
|
-
Search for similar vectors.
|
|
107
|
-
|
|
108
|
-
Args:
|
|
109
|
-
query_vector: Query vector to search with
|
|
110
|
-
top_k: Number of results to return
|
|
111
|
-
namespace: Turbopuffer namespace to search in
|
|
112
|
-
filters: Metadata filters to apply
|
|
113
|
-
include_attributes: Whether to include vector attributes in results
|
|
114
|
-
**kwargs: Additional arguments
|
|
115
|
-
|
|
116
|
-
Returns:
|
|
117
|
-
List of search results with id, score, and metadata
|
|
118
|
-
"""
|
|
92
|
+
"""Search for similar vectors."""
|
|
119
93
|
logger.debug(f"Searching {top_k} vectors in namespace '{namespace}'")
|
|
120
94
|
|
|
121
|
-
request_data = {
|
|
122
|
-
"vector": query_vector,
|
|
123
|
-
"top_k": top_k,
|
|
124
|
-
"include_attributes": include_attributes,
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
if filters:
|
|
128
|
-
request_data["filters"] = filters
|
|
129
|
-
|
|
130
95
|
try:
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
96
|
+
ns = self.client.namespace(namespace)
|
|
97
|
+
|
|
98
|
+
results = ns.query(
|
|
99
|
+
rank_by=[("vector", "ANN", query_vector)],
|
|
100
|
+
top_k=top_k,
|
|
101
|
+
filters=filters,
|
|
102
|
+
include_attributes=True
|
|
135
103
|
)
|
|
136
104
|
|
|
137
|
-
results = response.get("results", [])
|
|
138
105
|
logger.debug(f"Found {len(results)} similar vectors")
|
|
139
|
-
|
|
140
106
|
return results
|
|
141
107
|
|
|
142
108
|
except Exception as e:
|
|
143
109
|
logger.error(f"Vector search failed: {e}")
|
|
144
|
-
raise
|
|
110
|
+
raise RuntimeError(f"Vector search failed: {e}")
|
|
145
111
|
|
|
146
|
-
|
|
112
|
+
def delete_vectors(
|
|
147
113
|
self,
|
|
148
114
|
vector_ids: List[str],
|
|
149
115
|
namespace: str,
|
|
150
116
|
**kwargs
|
|
151
117
|
) -> Dict[str, Any]:
|
|
152
|
-
"""
|
|
153
|
-
Delete vectors by ID.
|
|
154
|
-
|
|
155
|
-
Args:
|
|
156
|
-
vector_ids: List of vector IDs to delete
|
|
157
|
-
namespace: Turbopuffer namespace
|
|
158
|
-
**kwargs: Additional arguments
|
|
159
|
-
|
|
160
|
-
Returns:
|
|
161
|
-
Response from Turbopuffer API
|
|
162
|
-
"""
|
|
118
|
+
"""Delete vectors by ID."""
|
|
163
119
|
if not vector_ids:
|
|
164
120
|
return {"deleted": 0}
|
|
165
121
|
|
|
166
122
|
logger.info(f"Deleting {len(vector_ids)} vectors from namespace '{namespace}'")
|
|
167
123
|
|
|
168
|
-
request_data = {
|
|
169
|
-
"ids": vector_ids,
|
|
170
|
-
}
|
|
171
|
-
|
|
172
124
|
try:
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
endpoint=f"/namespaces/{namespace}/vectors",
|
|
176
|
-
data=request_data,
|
|
177
|
-
)
|
|
125
|
+
ns = self.client.namespace(namespace)
|
|
126
|
+
ns.delete(ids=vector_ids)
|
|
178
127
|
|
|
179
128
|
logger.info(f"Successfully deleted vectors")
|
|
180
|
-
return
|
|
129
|
+
return {"deleted": len(vector_ids)}
|
|
181
130
|
|
|
182
131
|
except Exception as e:
|
|
183
132
|
logger.error(f"Failed to delete vectors: {e}")
|
|
184
|
-
raise
|
|
185
|
-
|
|
186
|
-
async def get_namespace_stats(
|
|
187
|
-
self,
|
|
188
|
-
namespace: str,
|
|
189
|
-
**kwargs
|
|
190
|
-
) -> Dict[str, Any]:
|
|
191
|
-
"""
|
|
192
|
-
Get statistics about a namespace.
|
|
193
|
-
|
|
194
|
-
Args:
|
|
195
|
-
namespace: Turbopuffer namespace
|
|
196
|
-
**kwargs: Additional arguments
|
|
197
|
-
|
|
198
|
-
Returns:
|
|
199
|
-
Namespace statistics
|
|
200
|
-
"""
|
|
201
|
-
try:
|
|
202
|
-
response = await self._make_request(
|
|
203
|
-
method="GET",
|
|
204
|
-
endpoint=f"/namespaces/{namespace}",
|
|
205
|
-
)
|
|
206
|
-
|
|
207
|
-
return response
|
|
208
|
-
|
|
209
|
-
except Exception as e:
|
|
210
|
-
logger.error(f"Failed to get namespace stats: {e}")
|
|
211
|
-
raise ProviderError(f"Namespace stats failed: {e}")
|
|
133
|
+
raise RuntimeError(f"Vector deletion failed: {e}")
|
|
212
134
|
|
|
213
|
-
|
|
135
|
+
def list_namespaces(self) -> List[str]:
|
|
214
136
|
"""List all available namespaces."""
|
|
215
137
|
try:
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
endpoint="/namespaces",
|
|
219
|
-
)
|
|
220
|
-
|
|
221
|
-
namespaces = response.get("namespaces", [])
|
|
222
|
-
return [ns["name"] for ns in namespaces]
|
|
138
|
+
namespaces = self.client.namespaces()
|
|
139
|
+
return [ns.name for ns in namespaces]
|
|
223
140
|
|
|
224
141
|
except Exception as e:
|
|
225
142
|
logger.error(f"Failed to list namespaces: {e}")
|
|
226
|
-
raise
|
|
143
|
+
raise RuntimeError(f"Namespace listing failed: {e}")
|
|
227
144
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
namespace: str,
|
|
231
|
-
dimension: int,
|
|
232
|
-
**kwargs
|
|
233
|
-
) -> Dict[str, Any]:
|
|
234
|
-
"""
|
|
235
|
-
Create a new namespace.
|
|
236
|
-
|
|
237
|
-
Args:
|
|
238
|
-
namespace: Name of the namespace to create
|
|
239
|
-
dimension: Vector dimension for the namespace
|
|
240
|
-
**kwargs: Additional arguments
|
|
241
|
-
|
|
242
|
-
Returns:
|
|
243
|
-
Response from Turbopuffer API
|
|
244
|
-
"""
|
|
145
|
+
def create_namespace(self, namespace: str, dimension: int, **kwargs) -> Dict[str, Any]:
|
|
146
|
+
"""Create a new namespace."""
|
|
245
147
|
logger.info(f"Creating namespace '{namespace}' with dimension {dimension}")
|
|
246
148
|
|
|
247
|
-
request_data = {
|
|
248
|
-
"name": namespace,
|
|
249
|
-
"dimension": dimension,
|
|
250
|
-
}
|
|
251
|
-
|
|
252
149
|
try:
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
data=request_data,
|
|
150
|
+
self.client.create_namespace(
|
|
151
|
+
name=namespace,
|
|
152
|
+
dimension=dimension
|
|
257
153
|
)
|
|
258
154
|
|
|
259
155
|
logger.info(f"Successfully created namespace '{namespace}'")
|
|
260
|
-
return
|
|
156
|
+
return {"name": namespace, "dimension": dimension}
|
|
261
157
|
|
|
262
158
|
except Exception as e:
|
|
263
159
|
logger.error(f"Failed to create namespace: {e}")
|
|
264
|
-
raise
|
|
160
|
+
raise RuntimeError(f"Namespace creation failed: {e}")
|
|
265
161
|
|
|
266
|
-
|
|
162
|
+
def delete_namespace(self, namespace: str) -> Dict[str, Any]:
|
|
267
163
|
"""Delete a namespace and all its vectors."""
|
|
268
164
|
logger.warning(f"Deleting namespace '{namespace}' and all its vectors")
|
|
269
165
|
|
|
270
166
|
try:
|
|
271
|
-
|
|
272
|
-
method="DELETE",
|
|
273
|
-
endpoint=f"/namespaces/{namespace}",
|
|
274
|
-
)
|
|
167
|
+
self.client.delete_namespace(namespace)
|
|
275
168
|
|
|
276
169
|
logger.info(f"Successfully deleted namespace '{namespace}'")
|
|
277
|
-
return
|
|
170
|
+
return {"deleted": namespace}
|
|
278
171
|
|
|
279
172
|
except Exception as e:
|
|
280
173
|
logger.error(f"Failed to delete namespace: {e}")
|
|
281
|
-
raise
|
|
174
|
+
raise RuntimeError(f"Namespace deletion failed: {e}")
|
|
282
175
|
|
|
283
176
|
def get_namespace_for_project(self, project_id: str) -> str:
|
|
284
177
|
"""Get the namespace name for a project."""
|
|
@@ -286,7 +179,7 @@ class TurbopufferClient(BaseProvider):
|
|
|
286
179
|
safe_project_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in project_id)
|
|
287
180
|
return f"mcp_code_{safe_project_id}".lower()
|
|
288
181
|
|
|
289
|
-
|
|
182
|
+
def search_with_metadata_filter(
|
|
290
183
|
self,
|
|
291
184
|
query_vector: List[float],
|
|
292
185
|
project_id: str,
|
|
@@ -295,20 +188,7 @@ class TurbopufferClient(BaseProvider):
|
|
|
295
188
|
top_k: int = 10,
|
|
296
189
|
**kwargs
|
|
297
190
|
) -> List[Dict[str, Any]]:
|
|
298
|
-
"""
|
|
299
|
-
Search vectors with metadata filtering.
|
|
300
|
-
|
|
301
|
-
Args:
|
|
302
|
-
query_vector: Query vector
|
|
303
|
-
project_id: Project to search within
|
|
304
|
-
chunk_type: Filter by chunk type (optional)
|
|
305
|
-
file_path: Filter by file path (optional)
|
|
306
|
-
top_k: Number of results to return
|
|
307
|
-
**kwargs: Additional arguments
|
|
308
|
-
|
|
309
|
-
Returns:
|
|
310
|
-
Filtered search results
|
|
311
|
-
"""
|
|
191
|
+
"""Search vectors with metadata filtering."""
|
|
312
192
|
namespace = self.get_namespace_for_project(project_id)
|
|
313
193
|
|
|
314
194
|
# Build metadata filters
|
|
@@ -318,7 +198,7 @@ class TurbopufferClient(BaseProvider):
|
|
|
318
198
|
if file_path:
|
|
319
199
|
filters["file_path"] = file_path
|
|
320
200
|
|
|
321
|
-
return
|
|
201
|
+
return self.search_vectors(
|
|
322
202
|
query_vector=query_vector,
|
|
323
203
|
top_k=top_k,
|
|
324
204
|
namespace=namespace,
|
|
@@ -333,6 +213,5 @@ def create_turbopuffer_client(config: VectorConfig) -> TurbopufferClient:
|
|
|
333
213
|
|
|
334
214
|
return TurbopufferClient(
|
|
335
215
|
api_key=config.turbopuffer_api_key,
|
|
336
|
-
|
|
337
|
-
max_retries=3,
|
|
216
|
+
region=config.turbopuffer_region,
|
|
338
217
|
)
|
|
@@ -1,164 +1,78 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Voyage AI client for embedding generation.
|
|
2
|
+
Voyage AI client for embedding generation using official SDK.
|
|
3
3
|
|
|
4
|
-
Provides integration with Voyage AI's embedding API for generating
|
|
4
|
+
Provides clean integration with Voyage AI's embedding API for generating
|
|
5
5
|
high-quality code embeddings using the voyage-code-2 model.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import logging
|
|
9
|
-
from typing import List, Dict, Any
|
|
10
|
-
import
|
|
9
|
+
from typing import List, Dict, Any
|
|
10
|
+
import voyageai
|
|
11
11
|
|
|
12
|
-
from .base_provider import BaseProvider, ProviderError
|
|
13
12
|
from ..config import VectorConfig
|
|
14
13
|
|
|
15
14
|
logger = logging.getLogger(__name__)
|
|
16
15
|
|
|
17
|
-
class VoyageClient
|
|
18
|
-
"""
|
|
16
|
+
class VoyageClient:
|
|
17
|
+
"""Clean Voyage AI client using official SDK."""
|
|
19
18
|
|
|
20
|
-
def __init__(
|
|
21
|
-
self
|
|
22
|
-
api_key: str,
|
|
23
|
-
model: str = "voyage-code-2",
|
|
24
|
-
base_url: str = "https://api.voyageai.com/v1",
|
|
25
|
-
**kwargs
|
|
26
|
-
):
|
|
27
|
-
super().__init__(api_key, base_url, **kwargs)
|
|
19
|
+
def __init__(self, api_key: str, model: str = "voyage-code-2"):
|
|
20
|
+
self.api_key = api_key
|
|
28
21
|
self.model = model
|
|
29
|
-
self._embedding_dimension:
|
|
22
|
+
self._embedding_dimension: int | None = None
|
|
30
23
|
|
|
31
|
-
#
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
logger.info("Using approximate token counting - Voyage AI handles tokenization internally")
|
|
24
|
+
# Initialize official Voyage AI client
|
|
25
|
+
self.client = voyageai.Client(api_key=api_key)
|
|
26
|
+
logger.info(f"Initialized Voyage AI client with model {model}")
|
|
35
27
|
|
|
36
|
-
|
|
28
|
+
def health_check(self) -> bool:
|
|
37
29
|
"""Check if Voyage AI service is healthy."""
|
|
38
30
|
try:
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
return True
|
|
31
|
+
result = self.client.embed(["test"], model=self.model, input_type="query")
|
|
32
|
+
return len(result.embeddings) > 0
|
|
42
33
|
except Exception as e:
|
|
43
34
|
logger.warning(f"Voyage AI health check failed: {e}")
|
|
44
35
|
return False
|
|
45
36
|
|
|
46
|
-
def
|
|
47
|
-
"""Approximate token count - Voyage AI handles exact tokenization."""
|
|
48
|
-
# Voyage AI uses proprietary tokenizer - this is just for batching estimates
|
|
49
|
-
# Rough approximation: 4 characters per token (conservative estimate)
|
|
50
|
-
return len(text) // 4
|
|
51
|
-
|
|
52
|
-
def _batch_texts_by_tokens(
|
|
53
|
-
self,
|
|
54
|
-
texts: List[str],
|
|
55
|
-
max_tokens_per_batch: int = 120000 # Leave buffer under 128k limit
|
|
56
|
-
) -> List[List[str]]:
|
|
57
|
-
"""Batch texts to stay under token limits."""
|
|
58
|
-
batches = []
|
|
59
|
-
current_batch = []
|
|
60
|
-
current_tokens = 0
|
|
61
|
-
|
|
62
|
-
for text in texts:
|
|
63
|
-
text_tokens = self._count_tokens(text)
|
|
64
|
-
|
|
65
|
-
# If single text exceeds limit, truncate it (let Voyage API handle exact truncation)
|
|
66
|
-
if text_tokens > max_tokens_per_batch:
|
|
67
|
-
# Rough character-based truncation - Voyage API will handle exact tokenization
|
|
68
|
-
target_chars = (max_tokens_per_batch - 100) * 4 # Conservative estimate
|
|
69
|
-
text = text[:target_chars]
|
|
70
|
-
text_tokens = self._count_tokens(text)
|
|
71
|
-
|
|
72
|
-
logger.warning(f"Pre-truncated text to ~{text_tokens} tokens (Voyage API will handle exact tokenization)")
|
|
73
|
-
|
|
74
|
-
# Check if adding this text would exceed the batch limit
|
|
75
|
-
if current_tokens + text_tokens > max_tokens_per_batch and current_batch:
|
|
76
|
-
batches.append(current_batch)
|
|
77
|
-
current_batch = [text]
|
|
78
|
-
current_tokens = text_tokens
|
|
79
|
-
else:
|
|
80
|
-
current_batch.append(text)
|
|
81
|
-
current_tokens += text_tokens
|
|
82
|
-
|
|
83
|
-
if current_batch:
|
|
84
|
-
batches.append(current_batch)
|
|
85
|
-
|
|
86
|
-
return batches
|
|
87
|
-
|
|
88
|
-
async def generate_embeddings(
|
|
37
|
+
def generate_embeddings(
|
|
89
38
|
self,
|
|
90
39
|
texts: List[str],
|
|
91
40
|
input_type: str = "document",
|
|
92
|
-
truncation: bool = True,
|
|
93
41
|
**kwargs
|
|
94
42
|
) -> List[List[float]]:
|
|
95
|
-
"""
|
|
96
|
-
Generate embeddings for a list of texts.
|
|
97
|
-
|
|
98
|
-
Args:
|
|
99
|
-
texts: List of texts to embed
|
|
100
|
-
input_type: Type of input ("document" or "query")
|
|
101
|
-
truncation: Whether to enable truncation
|
|
102
|
-
**kwargs: Additional arguments
|
|
103
|
-
|
|
104
|
-
Returns:
|
|
105
|
-
List of embedding vectors
|
|
106
|
-
"""
|
|
43
|
+
"""Generate embeddings for texts using official SDK."""
|
|
107
44
|
if not texts:
|
|
108
45
|
return []
|
|
109
46
|
|
|
110
47
|
logger.info(f"Generating embeddings for {len(texts)} texts using {self.model}")
|
|
111
48
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
49
|
+
try:
|
|
50
|
+
result = self.client.embed(
|
|
51
|
+
texts=texts,
|
|
52
|
+
model=self.model,
|
|
53
|
+
input_type=input_type,
|
|
54
|
+
truncation=True
|
|
55
|
+
)
|
|
118
56
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
"
|
|
122
|
-
"input_type": input_type,
|
|
123
|
-
"truncation": truncation,
|
|
124
|
-
}
|
|
57
|
+
# Log usage if available
|
|
58
|
+
if hasattr(result, 'usage') and result.usage:
|
|
59
|
+
logger.debug(f"Token usage: {result.usage.total_tokens}")
|
|
125
60
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
# Extract embeddings from response
|
|
134
|
-
if "data" not in response:
|
|
135
|
-
raise ProviderError("Invalid response format from Voyage AI")
|
|
136
|
-
|
|
137
|
-
batch_embeddings = [item["embedding"] for item in response["data"]]
|
|
138
|
-
all_embeddings.extend(batch_embeddings)
|
|
139
|
-
|
|
140
|
-
# Log usage information if available
|
|
141
|
-
if "usage" in response:
|
|
142
|
-
usage = response["usage"]
|
|
143
|
-
logger.debug(
|
|
144
|
-
f"Batch {i+1} usage: {usage.get('total_tokens', 0)} tokens"
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
except Exception as e:
|
|
148
|
-
logger.error(f"Failed to generate embeddings for batch {i+1}: {e}")
|
|
149
|
-
raise ProviderError(f"Embedding generation failed: {e}")
|
|
150
|
-
|
|
151
|
-
logger.info(f"Successfully generated {len(all_embeddings)} embeddings")
|
|
152
|
-
return all_embeddings
|
|
61
|
+
logger.info(f"Successfully generated {len(result.embeddings)} embeddings")
|
|
62
|
+
return result.embeddings
|
|
63
|
+
|
|
64
|
+
except Exception as e:
|
|
65
|
+
logger.error(f"Failed to generate embeddings: {e}")
|
|
66
|
+
raise RuntimeError(f"Embedding generation failed: {e}")
|
|
153
67
|
|
|
154
|
-
|
|
68
|
+
def get_embedding_dimension(self) -> int:
|
|
155
69
|
"""Get the dimension of embeddings produced by this model."""
|
|
156
70
|
if self._embedding_dimension is not None:
|
|
157
71
|
return self._embedding_dimension
|
|
158
72
|
|
|
159
73
|
# Generate a test embedding to determine dimension
|
|
160
74
|
try:
|
|
161
|
-
test_embeddings =
|
|
75
|
+
test_embeddings = self.generate_embeddings(["test"], input_type="query")
|
|
162
76
|
if test_embeddings:
|
|
163
77
|
self._embedding_dimension = len(test_embeddings[0])
|
|
164
78
|
logger.info(f"Detected embedding dimension: {self._embedding_dimension}")
|
|
@@ -166,27 +80,22 @@ class VoyageClient(BaseProvider):
|
|
|
166
80
|
except Exception as e:
|
|
167
81
|
logger.warning(f"Could not determine embedding dimension: {e}")
|
|
168
82
|
|
|
169
|
-
# Default dimensions for known Voyage models
|
|
170
|
-
# Note: These may change - verify with Voyage AI documentation
|
|
83
|
+
# Default dimensions for known Voyage models
|
|
171
84
|
model_dimensions = {
|
|
172
|
-
"voyage-code-2": 1536,
|
|
173
|
-
"voyage-2": 1024,
|
|
174
|
-
"voyage-large-2": 1536,
|
|
175
|
-
"voyage-3": 1024,
|
|
85
|
+
"voyage-code-2": 1536,
|
|
86
|
+
"voyage-2": 1024,
|
|
87
|
+
"voyage-large-2": 1536,
|
|
88
|
+
"voyage-3": 1024,
|
|
176
89
|
}
|
|
177
90
|
|
|
178
91
|
self._embedding_dimension = model_dimensions.get(self.model, 1536)
|
|
179
|
-
logger.info(f"Using default dimension
|
|
92
|
+
logger.info(f"Using default embedding dimension: {self._embedding_dimension}")
|
|
180
93
|
return self._embedding_dimension
|
|
181
94
|
|
|
182
|
-
|
|
183
|
-
"""Generate a single embedding for a search query."""
|
|
184
|
-
embeddings = await self.generate_embeddings([query], input_type="query")
|
|
185
|
-
return embeddings[0] if embeddings else []
|
|
186
|
-
|
|
187
|
-
async def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
|
|
95
|
+
def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
|
|
188
96
|
"""Estimate the cost of embedding generation."""
|
|
189
|
-
|
|
97
|
+
# Rough token estimation (4 chars per token)
|
|
98
|
+
total_tokens = sum(len(text) // 4 for text in texts)
|
|
190
99
|
|
|
191
100
|
# Voyage AI pricing (approximate, may change)
|
|
192
101
|
cost_per_1k_tokens = 0.00013 # voyage-code-2 pricing
|
|
@@ -207,6 +116,4 @@ def create_voyage_client(config: VectorConfig) -> VoyageClient:
|
|
|
207
116
|
return VoyageClient(
|
|
208
117
|
api_key=config.voyage_api_key,
|
|
209
118
|
model=config.embedding_model,
|
|
210
|
-
timeout=30.0,
|
|
211
|
-
max_retries=3,
|
|
212
119
|
)
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: mcp-code-indexer
|
|
3
|
-
Version: 4.1
|
|
3
|
+
Version: 4.2.1
|
|
4
4
|
Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: mcp,model-context-protocol,code-indexer,ai-tools,codebase-navigation,file-descriptions,llm-tools
|
|
7
7
|
Author: MCP Code Indexer Contributors
|
|
8
8
|
Maintainer: MCP Code Indexer Contributors
|
|
9
|
-
Requires-Python: >=3.
|
|
9
|
+
Requires-Python: >=3.10,<3.13
|
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
|
11
11
|
Classifier: Environment :: Console
|
|
12
12
|
Classifier: Framework :: AsyncIO
|
|
@@ -14,16 +14,15 @@ Classifier: Intended Audience :: Developers
|
|
|
14
14
|
Classifier: License :: OSI Approved :: MIT License
|
|
15
15
|
Classifier: Operating System :: OS Independent
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
21
20
|
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
23
|
Classifier: Topic :: Software Development
|
|
24
24
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
25
|
Classifier: Typing :: Typed
|
|
26
|
-
Provides-Extra: vector
|
|
27
26
|
Requires-Dist: aiofiles (==23.2.0)
|
|
28
27
|
Requires-Dist: aiohttp (>=3.8.0)
|
|
29
28
|
Requires-Dist: aiosqlite (==0.19.0)
|
|
@@ -33,10 +32,15 @@ Requires-Dist: importlib-metadata (>=1.0.0) ; python_version < "3.8"
|
|
|
33
32
|
Requires-Dist: mcp (>=1.9.0)
|
|
34
33
|
Requires-Dist: pydantic (>=2.8.0)
|
|
35
34
|
Requires-Dist: python-multipart (>=0.0.6)
|
|
35
|
+
Requires-Dist: pyyaml (>=6.0)
|
|
36
36
|
Requires-Dist: tenacity (>=8.0.0)
|
|
37
37
|
Requires-Dist: tiktoken (>=0.9.0)
|
|
38
38
|
Requires-Dist: tomli (>=1.2.0) ; python_version < "3.11"
|
|
39
|
+
Requires-Dist: tree-sitter (>=0.25.0)
|
|
40
|
+
Requires-Dist: turbopuffer (>=0.6.0)
|
|
39
41
|
Requires-Dist: uvicorn (>=0.24.0)
|
|
42
|
+
Requires-Dist: voyageai (>=0.3.0)
|
|
43
|
+
Requires-Dist: watchdog (>=6.0.0)
|
|
40
44
|
Project-URL: Documentation, https://github.com/fluffypony/mcp-code-indexer/blob/main/README.md
|
|
41
45
|
Project-URL: Homepage, https://github.com/fluffypony/mcp-code-indexer
|
|
42
46
|
Project-URL: Repository, https://github.com/fluffypony/mcp-code-indexer
|
|
@@ -44,8 +48,8 @@ Description-Content-Type: text/markdown
|
|
|
44
48
|
|
|
45
49
|
# MCP Code Indexer 🚀
|
|
46
50
|
|
|
47
|
-
[PyPI version](https://badge.fury.io/py/mcp-code-indexer)
|
|
52
|
+
[PyPI version](https://pypi.org/project/mcp-code-indexer/)
|
|
49
53
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
50
54
|
|
|
51
55
|
A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
|
|
@@ -215,12 +219,15 @@ Vector Mode transforms how you search and understand codebases by using AI embed
|
|
|
215
219
|
### 🚀 Quick Start
|
|
216
220
|
|
|
217
221
|
```bash
|
|
218
|
-
# Install vector mode
|
|
219
|
-
pip install mcp-code-indexer
|
|
222
|
+
# Install MCP Code Indexer (includes vector mode)
|
|
223
|
+
pip install mcp-code-indexer
|
|
220
224
|
|
|
221
225
|
# Set required API keys
|
|
222
226
|
export VOYAGE_API_KEY="pa-your-voyage-api-key"
|
|
223
|
-
export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
|
|
227
|
+
export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
|
|
228
|
+
|
|
229
|
+
# Optional: Configure region (default: gcp-europe-west3)
|
|
230
|
+
export TURBOPUFFER_REGION="gcp-europe-west3"
|
|
224
231
|
|
|
225
232
|
# Start with vector mode enabled
|
|
226
233
|
mcp-code-indexer --vector
|
|
@@ -19,7 +19,7 @@ mcp_code_indexer/error_handler.py,sha256=ylciEM-cR7E8Gmd8cfh5olcllJm0FnaYBGH86ya
|
|
|
19
19
|
mcp_code_indexer/file_scanner.py,sha256=7Ab34lRQGeh5GBCzcSP96p4YK6LDWFGUHLXqi499UZ4,11838
|
|
20
20
|
mcp_code_indexer/git_hook_handler.py,sha256=sTtZV3-Yy1Evt06R5NZclELeepM4Ia9OQoR2O6BK3Hk,45517
|
|
21
21
|
mcp_code_indexer/logging_config.py,sha256=M5eVZ5PwfTROib7ISTQ522n2hUSc4hJ_wUgsrJKsTTg,10030
|
|
22
|
-
mcp_code_indexer/main.py,sha256=
|
|
22
|
+
mcp_code_indexer/main.py,sha256=tII1x_LHmD1T951-L1lTzXkR9Vz0z8_pNs-mznfj1CY,37133
|
|
23
23
|
mcp_code_indexer/middleware/__init__.py,sha256=UCEPzOlZldlqFzYEfrXw1HvCDvY1jpLvyaDGUzVr2aw,368
|
|
24
24
|
mcp_code_indexer/middleware/auth.py,sha256=4HkHMDZBNsyPA1VE8qF7pRNKbqG4xIDZjllENbgynxI,7258
|
|
25
25
|
mcp_code_indexer/middleware/error_middleware.py,sha256=0RnKM5fK_n_7AITK2ueAqv30kLBdjU3vaWOTwWd2Xs0,11965
|
|
@@ -46,21 +46,20 @@ mcp_code_indexer/vector_mode/chunking/__init__.py,sha256=rjjFMbHsqWIBzL4IajYxXXJ
|
|
|
46
46
|
mcp_code_indexer/vector_mode/chunking/ast_chunker.py,sha256=GTl_6U0nSgDRRzKS07tJ7RMX8AmJvvY_IsRn95hvVfA,14623
|
|
47
47
|
mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py,sha256=xD0zEibjt6FLBFaKHNc63-iKTtCgnOlLL_9Hc8mCrzE,19752
|
|
48
48
|
mcp_code_indexer/vector_mode/chunking/language_handlers.py,sha256=YEpTVjzyJH445OjniGV05apexsfG5KVR4lwBEl4mGJc,18189
|
|
49
|
-
mcp_code_indexer/vector_mode/config.py,sha256=
|
|
49
|
+
mcp_code_indexer/vector_mode/config.py,sha256=g5p9Q4EAR20DfLv4RxaQnk3_UdysuvWS8rcsjs1vgwI,6680
|
|
50
50
|
mcp_code_indexer/vector_mode/daemon.py,sha256=le3NkxFD73bKeutruzLY-Bauc-nXzlhlIlDJv4jlxhU,12096
|
|
51
51
|
mcp_code_indexer/vector_mode/monitoring/__init__.py,sha256=9rNWCvHxRMvYumdIrPjb5K9fpOwe1Aem24hdh8gXoDM,439
|
|
52
52
|
mcp_code_indexer/vector_mode/monitoring/change_detector.py,sha256=X82e_sKbJJFPhqZFJubLQb8Rs-srRtS7sh0nUOsPCPw,10338
|
|
53
53
|
mcp_code_indexer/vector_mode/monitoring/file_watcher.py,sha256=AQ6YHSKXPubtprLZngeLb0othJOCNQZ7wwXUvqwphT4,15299
|
|
54
54
|
mcp_code_indexer/vector_mode/monitoring/merkle_tree.py,sha256=83RLdUj_cgcAlrT9Wev9IBavVEyc8Jo8w--IOJisLOk,14645
|
|
55
|
-
mcp_code_indexer/vector_mode/providers/__init__.py,sha256=
|
|
56
|
-
mcp_code_indexer/vector_mode/providers/
|
|
57
|
-
mcp_code_indexer/vector_mode/providers/
|
|
58
|
-
mcp_code_indexer/vector_mode/providers/voyage_client.py,sha256=12uVi6Hqo2dfoUnbxaXohlsDmfBkeRKEotbvEPzT3n4,8315
|
|
55
|
+
mcp_code_indexer/vector_mode/providers/__init__.py,sha256=0GhPHn7XEBSHa6bLvy8j0Eqvto82o6Bs2hZCrHawLus,514
|
|
56
|
+
mcp_code_indexer/vector_mode/providers/turbopuffer_client.py,sha256=NdBAghmaRUUIGFZOTOZYhYyXvv_QB36lieGQjVlLEno,7599
|
|
57
|
+
mcp_code_indexer/vector_mode/providers/voyage_client.py,sha256=pfm9BOx5Temf0LM-VZ4LH6xwBmZ6XO8XeCSiSZ5LU80,4375
|
|
59
58
|
mcp_code_indexer/vector_mode/security/__init__.py,sha256=itfeuysSqV-m9xuo-CMkAoucxexVfPgeOU-ieTLvdls,336
|
|
60
59
|
mcp_code_indexer/vector_mode/security/patterns.py,sha256=0xaiMnZm7YXswq3hVe_DJYePE9MhWuvizApLnmXus9M,11572
|
|
61
60
|
mcp_code_indexer/vector_mode/security/redactor.py,sha256=tsFzhCJ99bp4EFqQVjZ-4f8Uf3ux9X4ODVR09oJG01U,13380
|
|
62
|
-
mcp_code_indexer-4.1.
|
|
63
|
-
mcp_code_indexer-4.1.
|
|
64
|
-
mcp_code_indexer-4.1.
|
|
65
|
-
mcp_code_indexer-4.1.
|
|
66
|
-
mcp_code_indexer-4.1.
|
|
61
|
+
mcp_code_indexer-4.2.1.dist-info/LICENSE,sha256=JN9dyPPgYwH9C-UjYM7FLNZjQ6BF7kAzpF3_4PwY4rY,1086
|
|
62
|
+
mcp_code_indexer-4.2.1.dist-info/METADATA,sha256=jsPpjmDRZabOKWzLTt0MHzdE-jaNJCiosMx2SBQCtJU,27483
|
|
63
|
+
mcp_code_indexer-4.2.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
64
|
+
mcp_code_indexer-4.2.1.dist-info/entry_points.txt,sha256=UABj7HZ0mC6rvF22gxaz2LLNLGQShTrFmp5u00iUtvo,67
|
|
65
|
+
mcp_code_indexer-4.2.1.dist-info/RECORD,,
|
|
@@ -1,230 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Base provider classes with common functionality.
|
|
3
|
-
|
|
4
|
-
Provides retry logic, circuit breaker pattern, and error handling
|
|
5
|
-
for external service integrations.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import asyncio
|
|
9
|
-
import logging
|
|
10
|
-
import time
|
|
11
|
-
from typing import Any, Dict, List, Optional, Callable, TypeVar
|
|
12
|
-
from abc import ABC, abstractmethod
|
|
13
|
-
from contextlib import asynccontextmanager
|
|
14
|
-
import aiohttp
|
|
15
|
-
from tenacity import (
|
|
16
|
-
retry,
|
|
17
|
-
stop_after_attempt,
|
|
18
|
-
wait_exponential,
|
|
19
|
-
retry_if_exception_type,
|
|
20
|
-
retry_if_result,
|
|
21
|
-
before_sleep_log,
|
|
22
|
-
)
|
|
23
|
-
|
|
24
|
-
logger = logging.getLogger(__name__)
|
|
25
|
-
|
|
26
|
-
T = TypeVar('T')
|
|
27
|
-
|
|
28
|
-
class CircuitBreakerError(Exception):
|
|
29
|
-
"""Raised when circuit breaker is open."""
|
|
30
|
-
pass
|
|
31
|
-
|
|
32
|
-
class ProviderError(Exception):
|
|
33
|
-
"""Base exception for provider errors."""
|
|
34
|
-
pass
|
|
35
|
-
|
|
36
|
-
class RateLimitError(ProviderError):
|
|
37
|
-
"""Raised when rate limit is exceeded."""
|
|
38
|
-
pass
|
|
39
|
-
|
|
40
|
-
class AuthenticationError(ProviderError):
|
|
41
|
-
"""Raised when authentication fails."""
|
|
42
|
-
pass
|
|
43
|
-
|
|
44
|
-
class CircuitBreaker:
|
|
45
|
-
"""Circuit breaker implementation for external services."""
|
|
46
|
-
|
|
47
|
-
def __init__(
|
|
48
|
-
self,
|
|
49
|
-
failure_threshold: int = 5,
|
|
50
|
-
recovery_timeout: float = 60.0,
|
|
51
|
-
expected_exception: type = Exception,
|
|
52
|
-
):
|
|
53
|
-
self.failure_threshold = failure_threshold
|
|
54
|
-
self.recovery_timeout = recovery_timeout
|
|
55
|
-
self.expected_exception = expected_exception
|
|
56
|
-
|
|
57
|
-
self.failure_count = 0
|
|
58
|
-
self.last_failure_time: Optional[float] = None
|
|
59
|
-
self.state = "closed" # closed, open, half-open
|
|
60
|
-
|
|
61
|
-
def _should_attempt_reset(self) -> bool:
|
|
62
|
-
"""Check if we should attempt to reset the circuit breaker."""
|
|
63
|
-
return (
|
|
64
|
-
self.state == "open"
|
|
65
|
-
and self.last_failure_time is not None
|
|
66
|
-
and time.time() - self.last_failure_time >= self.recovery_timeout
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
async def call(self, func: Callable[[], T]) -> T:
|
|
70
|
-
"""Call a function through the circuit breaker."""
|
|
71
|
-
if self.state == "open":
|
|
72
|
-
if self._should_attempt_reset():
|
|
73
|
-
self.state = "half-open"
|
|
74
|
-
logger.info("Circuit breaker attempting reset")
|
|
75
|
-
else:
|
|
76
|
-
raise CircuitBreakerError("Circuit breaker is open")
|
|
77
|
-
|
|
78
|
-
try:
|
|
79
|
-
result = await func()
|
|
80
|
-
# Success - reset failure count
|
|
81
|
-
if self.state == "half-open":
|
|
82
|
-
self.state = "closed"
|
|
83
|
-
logger.info("Circuit breaker reset to closed")
|
|
84
|
-
self.failure_count = 0
|
|
85
|
-
return result
|
|
86
|
-
|
|
87
|
-
except self.expected_exception as e:
|
|
88
|
-
self.failure_count += 1
|
|
89
|
-
self.last_failure_time = time.time()
|
|
90
|
-
|
|
91
|
-
if self.failure_count >= self.failure_threshold:
|
|
92
|
-
self.state = "open"
|
|
93
|
-
logger.warning(
|
|
94
|
-
f"Circuit breaker opened after {self.failure_count} failures"
|
|
95
|
-
)
|
|
96
|
-
|
|
97
|
-
raise
|
|
98
|
-
|
|
99
|
-
class BaseProvider(ABC):
|
|
100
|
-
"""Base class for external service providers."""
|
|
101
|
-
|
|
102
|
-
def __init__(
|
|
103
|
-
self,
|
|
104
|
-
api_key: str,
|
|
105
|
-
base_url: str,
|
|
106
|
-
timeout: float = 30.0,
|
|
107
|
-
max_retries: int = 3,
|
|
108
|
-
circuit_breaker_enabled: bool = True,
|
|
109
|
-
):
|
|
110
|
-
self.api_key = api_key
|
|
111
|
-
self.base_url = base_url.rstrip('/')
|
|
112
|
-
self.timeout = timeout
|
|
113
|
-
self.max_retries = max_retries
|
|
114
|
-
|
|
115
|
-
# Circuit breaker for resilience
|
|
116
|
-
self.circuit_breaker = CircuitBreaker(
|
|
117
|
-
failure_threshold=5,
|
|
118
|
-
recovery_timeout=60.0,
|
|
119
|
-
expected_exception=(aiohttp.ClientError, ProviderError),
|
|
120
|
-
) if circuit_breaker_enabled else None
|
|
121
|
-
|
|
122
|
-
# Rate limiting state
|
|
123
|
-
self.last_request_time: Optional[float] = None
|
|
124
|
-
self.min_request_interval = 0.1 # 100ms between requests
|
|
125
|
-
|
|
126
|
-
# Session will be created lazily
|
|
127
|
-
self._session: Optional[aiohttp.ClientSession] = None
|
|
128
|
-
|
|
129
|
-
@asynccontextmanager
|
|
130
|
-
async def _get_session(self):
|
|
131
|
-
"""Get or create HTTP session."""
|
|
132
|
-
if self._session is None or self._session.closed:
|
|
133
|
-
connector = aiohttp.TCPConnector(
|
|
134
|
-
limit=100,
|
|
135
|
-
limit_per_host=30,
|
|
136
|
-
ttl_dns_cache=300,
|
|
137
|
-
use_dns_cache=True,
|
|
138
|
-
)
|
|
139
|
-
timeout = aiohttp.ClientTimeout(total=self.timeout)
|
|
140
|
-
self._session = aiohttp.ClientSession(
|
|
141
|
-
connector=connector,
|
|
142
|
-
timeout=timeout,
|
|
143
|
-
headers=self._get_default_headers(),
|
|
144
|
-
)
|
|
145
|
-
|
|
146
|
-
try:
|
|
147
|
-
yield self._session
|
|
148
|
-
finally:
|
|
149
|
-
# Keep session alive for reuse
|
|
150
|
-
pass
|
|
151
|
-
|
|
152
|
-
def _get_default_headers(self) -> Dict[str, str]:
|
|
153
|
-
"""Get default headers for API requests."""
|
|
154
|
-
return {
|
|
155
|
-
"Authorization": f"Bearer {self.api_key}",
|
|
156
|
-
"Content-Type": "application/json",
|
|
157
|
-
"User-Agent": "mcp-code-indexer/1.0.0",
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
async def _rate_limit_wait(self) -> None:
|
|
161
|
-
"""Wait if necessary to respect rate limits."""
|
|
162
|
-
if self.last_request_time is not None:
|
|
163
|
-
elapsed = time.time() - self.last_request_time
|
|
164
|
-
if elapsed < self.min_request_interval:
|
|
165
|
-
await asyncio.sleep(self.min_request_interval - elapsed)
|
|
166
|
-
|
|
167
|
-
self.last_request_time = time.time()
|
|
168
|
-
|
|
169
|
-
@retry(
|
|
170
|
-
stop=stop_after_attempt(3),
|
|
171
|
-
wait=wait_exponential(multiplier=1, min=1, max=10),
|
|
172
|
-
retry=retry_if_exception_type((aiohttp.ClientError, RateLimitError)),
|
|
173
|
-
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
174
|
-
)
|
|
175
|
-
async def _make_request(
|
|
176
|
-
self,
|
|
177
|
-
method: str,
|
|
178
|
-
endpoint: str,
|
|
179
|
-
data: Optional[Dict[str, Any]] = None,
|
|
180
|
-
params: Optional[Dict[str, Any]] = None,
|
|
181
|
-
**kwargs
|
|
182
|
-
) -> Dict[str, Any]:
|
|
183
|
-
"""Make an HTTP request with retry logic."""
|
|
184
|
-
|
|
185
|
-
async def _request():
|
|
186
|
-
await self._rate_limit_wait()
|
|
187
|
-
|
|
188
|
-
url = f"{self.base_url}/{endpoint.lstrip('/')}"
|
|
189
|
-
|
|
190
|
-
async with self._get_session() as session:
|
|
191
|
-
async with session.request(
|
|
192
|
-
method=method,
|
|
193
|
-
url=url,
|
|
194
|
-
json=data,
|
|
195
|
-
params=params,
|
|
196
|
-
**kwargs
|
|
197
|
-
) as response:
|
|
198
|
-
response_data = await response.json()
|
|
199
|
-
|
|
200
|
-
if response.status == 429:
|
|
201
|
-
raise RateLimitError("Rate limit exceeded")
|
|
202
|
-
elif response.status == 401:
|
|
203
|
-
raise AuthenticationError("Authentication failed")
|
|
204
|
-
elif response.status >= 400:
|
|
205
|
-
raise ProviderError(
|
|
206
|
-
f"HTTP {response.status}: {response_data.get('error', 'Unknown error')}"
|
|
207
|
-
)
|
|
208
|
-
|
|
209
|
-
return response_data
|
|
210
|
-
|
|
211
|
-
if self.circuit_breaker:
|
|
212
|
-
return await self.circuit_breaker.call(_request)
|
|
213
|
-
else:
|
|
214
|
-
return await _request()
|
|
215
|
-
|
|
216
|
-
async def close(self) -> None:
|
|
217
|
-
"""Close the HTTP session."""
|
|
218
|
-
if self._session and not self._session.closed:
|
|
219
|
-
await self._session.close()
|
|
220
|
-
|
|
221
|
-
async def __aenter__(self):
|
|
222
|
-
return self
|
|
223
|
-
|
|
224
|
-
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
225
|
-
await self.close()
|
|
226
|
-
|
|
227
|
-
@abstractmethod
|
|
228
|
-
async def health_check(self) -> bool:
|
|
229
|
-
"""Check if the service is healthy."""
|
|
230
|
-
pass
|
|
File without changes
|
|
File without changes
|
|
File without changes
|