mcp-code-indexer 4.1.0__tar.gz → 4.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/PKG-INFO +16 -9
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/README.md +8 -5
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/pyproject.toml +8 -11
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/main.py +1 -1
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/config.py +14 -2
- mcp_code_indexer-4.2.1/src/mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
- mcp_code_indexer-4.2.1/src/mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
- mcp_code_indexer-4.2.1/src/mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
- mcp_code_indexer-4.1.0/src/mcp_code_indexer/vector_mode/providers/__init__.py +0 -72
- mcp_code_indexer-4.1.0/src/mcp_code_indexer/vector_mode/providers/base_provider.py +0 -230
- mcp_code_indexer-4.1.0/src/mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +0 -338
- mcp_code_indexer-4.1.0/src/mcp_code_indexer/vector_mode/providers/voyage_client.py +0 -212
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/LICENSE +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/__main__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/ask_handler.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/claude_api_handler.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/cleanup_manager.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/commands/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/commands/makelocal.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/data/stop_words_english.txt +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/connection_health.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/database.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/database_factory.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/exceptions.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/models.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/path_resolver.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/retry_executor.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/deepask_handler.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/error_handler.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/file_scanner.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/git_hook_handler.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/logging_config.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/auth.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/error_middleware.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/logging.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/security.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/001_initial.sql +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/002_performance_indexes.sql +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/003_project_overviews.sql +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/004_remove_branch_dependency.sql +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/005_remove_git_remotes.sql +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/006_vector_mode.sql +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/query_preprocessor.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/server/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/server/mcp_server.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/token_counter.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/tools/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/transport/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/transport/base.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/transport/http_transport.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/transport/stdio_transport.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/chunking/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/chunking/ast_chunker.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/chunking/language_handlers.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/daemon.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/monitoring/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/monitoring/change_detector.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/monitoring/file_watcher.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/security/__init__.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/security/patterns.py +0 -0
- {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/security/redactor.py +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: mcp-code-indexer
|
|
3
|
-
Version: 4.1.0
|
|
3
|
+
Version: 4.2.1
|
|
4
4
|
Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: mcp,model-context-protocol,code-indexer,ai-tools,codebase-navigation,file-descriptions,llm-tools
|
|
7
7
|
Author: MCP Code Indexer Contributors
|
|
8
8
|
Maintainer: MCP Code Indexer Contributors
|
|
9
|
-
Requires-Python: >=3.
|
|
9
|
+
Requires-Python: >=3.10,<3.13
|
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
|
11
11
|
Classifier: Environment :: Console
|
|
12
12
|
Classifier: Framework :: AsyncIO
|
|
@@ -14,16 +14,15 @@ Classifier: Intended Audience :: Developers
|
|
|
14
14
|
Classifier: License :: OSI Approved :: MIT License
|
|
15
15
|
Classifier: Operating System :: OS Independent
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
21
20
|
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
23
|
Classifier: Topic :: Software Development
|
|
24
24
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
25
|
Classifier: Typing :: Typed
|
|
26
|
-
Provides-Extra: vector
|
|
27
26
|
Requires-Dist: aiofiles (==23.2.0)
|
|
28
27
|
Requires-Dist: aiohttp (>=3.8.0)
|
|
29
28
|
Requires-Dist: aiosqlite (==0.19.0)
|
|
@@ -33,10 +32,15 @@ Requires-Dist: importlib-metadata (>=1.0.0) ; python_version < "3.8"
|
|
|
33
32
|
Requires-Dist: mcp (>=1.9.0)
|
|
34
33
|
Requires-Dist: pydantic (>=2.8.0)
|
|
35
34
|
Requires-Dist: python-multipart (>=0.0.6)
|
|
35
|
+
Requires-Dist: pyyaml (>=6.0)
|
|
36
36
|
Requires-Dist: tenacity (>=8.0.0)
|
|
37
37
|
Requires-Dist: tiktoken (>=0.9.0)
|
|
38
38
|
Requires-Dist: tomli (>=1.2.0) ; python_version < "3.11"
|
|
39
|
+
Requires-Dist: tree-sitter (>=0.25.0)
|
|
40
|
+
Requires-Dist: turbopuffer (>=0.6.0)
|
|
39
41
|
Requires-Dist: uvicorn (>=0.24.0)
|
|
42
|
+
Requires-Dist: voyageai (>=0.3.0)
|
|
43
|
+
Requires-Dist: watchdog (>=6.0.0)
|
|
40
44
|
Project-URL: Documentation, https://github.com/fluffypony/mcp-code-indexer/blob/main/README.md
|
|
41
45
|
Project-URL: Homepage, https://github.com/fluffypony/mcp-code-indexer
|
|
42
46
|
Project-URL: Repository, https://github.com/fluffypony/mcp-code-indexer
|
|
@@ -44,8 +48,8 @@ Description-Content-Type: text/markdown
|
|
|
44
48
|
|
|
45
49
|
# MCP Code Indexer 🚀
|
|
46
50
|
|
|
47
|
-
[PyPI version](https://badge.fury.io/py/mcp-code-indexer)
|
|
52
|
+
[PyPI version](https://pypi.org/project/mcp-code-indexer/)
|
|
49
53
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
50
54
|
|
|
51
55
|
A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
|
|
@@ -215,12 +219,15 @@ Vector Mode transforms how you search and understand codebases by using AI embed
|
|
|
215
219
|
### 🚀 Quick Start
|
|
216
220
|
|
|
217
221
|
```bash
|
|
218
|
-
# Install vector mode
|
|
219
|
-
pip install mcp-code-indexer
|
|
222
|
+
# Install MCP Code Indexer (includes vector mode)
|
|
223
|
+
pip install mcp-code-indexer
|
|
220
224
|
|
|
221
225
|
# Set required API keys
|
|
222
226
|
export VOYAGE_API_KEY="pa-your-voyage-api-key"
|
|
223
|
-
export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
|
|
227
|
+
export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
|
|
228
|
+
|
|
229
|
+
# Optional: Configure region (default: gcp-europe-west3)
|
|
230
|
+
export TURBOPUFFER_REGION="gcp-europe-west3"
|
|
224
231
|
|
|
225
232
|
# Start with vector mode enabled
|
|
226
233
|
mcp-code-indexer --vector
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# MCP Code Indexer 🚀
|
|
2
2
|
|
|
3
|
-
[PyPI version](https://badge.fury.io/py/mcp-code-indexer)
|
|
4
|
+
[PyPI version](https://pypi.org/project/mcp-code-indexer/)
|
|
5
5
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
6
|
|
|
7
7
|
A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
|
|
@@ -171,12 +171,15 @@ Vector Mode transforms how you search and understand codebases by using AI embed
|
|
|
171
171
|
### 🚀 Quick Start
|
|
172
172
|
|
|
173
173
|
```bash
|
|
174
|
-
# Install vector mode
|
|
175
|
-
pip install mcp-code-indexer
|
|
174
|
+
# Install MCP Code Indexer (includes vector mode)
|
|
175
|
+
pip install mcp-code-indexer
|
|
176
176
|
|
|
177
177
|
# Set required API keys
|
|
178
178
|
export VOYAGE_API_KEY="pa-your-voyage-api-key"
|
|
179
|
-
export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
|
|
179
|
+
export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
|
|
180
|
+
|
|
181
|
+
# Optional: Configure region (default: gcp-europe-west3)
|
|
182
|
+
export TURBOPUFFER_REGION="gcp-europe-west3"
|
|
180
183
|
|
|
181
184
|
# Start with vector mode enabled
|
|
182
185
|
mcp-code-indexer --vector
|
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
4
4
|
|
|
5
5
|
[tool.poetry]
|
|
6
6
|
name = "mcp-code-indexer"
|
|
7
|
-
version = "4.1.0"
|
|
7
|
+
version = "4.2.1"
|
|
8
8
|
description = "MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews."
|
|
9
9
|
authors = ["MCP Code Indexer Contributors"]
|
|
10
10
|
maintainers = ["MCP Code Indexer Contributors"]
|
|
@@ -43,7 +43,7 @@ classifiers = [
|
|
|
43
43
|
packages = [{include = "mcp_code_indexer", from = "src"}]
|
|
44
44
|
|
|
45
45
|
[tool.poetry.dependencies]
|
|
46
|
-
python = "
|
|
46
|
+
python = ">=3.10,<3.13"
|
|
47
47
|
tiktoken = ">=0.9.0"
|
|
48
48
|
mcp = ">=1.9.0"
|
|
49
49
|
gitignore-parser = "0.1.11"
|
|
@@ -58,15 +58,12 @@ importlib-metadata = {version = ">=1.0.0", markers = "python_version < '3.8'"}
|
|
|
58
58
|
fastapi = ">=0.104.0"
|
|
59
59
|
uvicorn = ">=0.24.0"
|
|
60
60
|
python-multipart = ">=0.0.6"
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
"watchdog",
|
|
68
|
-
"pyyaml"
|
|
69
|
-
]
|
|
61
|
+
# Vector mode dependencies (always included)
|
|
62
|
+
voyageai = ">=0.3.0"
|
|
63
|
+
turbopuffer = ">=0.6.0"
|
|
64
|
+
tree-sitter = ">=0.25.0"
|
|
65
|
+
watchdog = ">=6.0.0"
|
|
66
|
+
pyyaml = ">=6.0"
|
|
70
67
|
|
|
71
68
|
[tool.poetry.group.dev.dependencies]
|
|
72
69
|
pytest = ">=8.0.0"
|
|
@@ -1019,7 +1019,7 @@ async def main() -> None:
|
|
|
1019
1019
|
|
|
1020
1020
|
# Check if vector mode is available
|
|
1021
1021
|
if not is_vector_mode_available():
|
|
1022
|
-
logger.error("Vector mode
|
|
1022
|
+
logger.error("Vector mode dependencies not found. Try reinstalling: pip install --upgrade mcp-code-indexer")
|
|
1023
1023
|
sys.exit(1)
|
|
1024
1024
|
|
|
1025
1025
|
# Check API keys
|
{mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/config.py
RENAMED
|
@@ -18,6 +18,7 @@ class VectorConfig:
|
|
|
18
18
|
# API Configuration
|
|
19
19
|
voyage_api_key: Optional[str] = None
|
|
20
20
|
turbopuffer_api_key: Optional[str] = None
|
|
21
|
+
turbopuffer_region: str = "gcp-europe-west3"
|
|
21
22
|
|
|
22
23
|
# Embedding Configuration
|
|
23
24
|
embedding_model: str = "voyage-code-2"
|
|
@@ -57,9 +58,10 @@ class VectorConfig:
|
|
|
57
58
|
return cls(
|
|
58
59
|
voyage_api_key=os.getenv("VOYAGE_API_KEY"),
|
|
59
60
|
turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
|
|
60
|
-
|
|
61
|
+
turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
|
|
62
|
+
embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-3"),
|
|
61
63
|
batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
|
|
62
|
-
max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "
|
|
64
|
+
max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
|
|
63
65
|
similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
|
|
64
66
|
max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
|
|
65
67
|
enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower() == "true",
|
|
@@ -122,6 +124,16 @@ class VectorConfig:
|
|
|
122
124
|
if not self.turbopuffer_api_key:
|
|
123
125
|
errors.append("TURBOPUFFER_API_KEY environment variable required for vector mode")
|
|
124
126
|
|
|
127
|
+
# Validate TurboPuffer region
|
|
128
|
+
supported_regions = [
|
|
129
|
+
'aws-ap-southeast-2', 'aws-eu-central-1', 'aws-us-east-1',
|
|
130
|
+
'aws-us-east-2', 'aws-us-west-2', 'gcp-us-central1',
|
|
131
|
+
'gcp-us-west1', 'gcp-us-east4', 'gcp-europe-west3'
|
|
132
|
+
]
|
|
133
|
+
if self.turbopuffer_region not in supported_regions:
|
|
134
|
+
errors.append(f"turbopuffer_region '{self.turbopuffer_region}' is not supported. " +
|
|
135
|
+
f"Supported regions: {', '.join(supported_regions)}")
|
|
136
|
+
|
|
125
137
|
if self.batch_size <= 0:
|
|
126
138
|
errors.append("batch_size must be positive")
|
|
127
139
|
if self.max_tokens_per_chunk <= 0:
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
External service providers for vector mode.
|
|
3
|
+
|
|
4
|
+
This package provides clean integrations with external services using official SDKs:
|
|
5
|
+
- Voyage AI for embedding generation (voyageai SDK)
|
|
6
|
+
- Turbopuffer for vector storage and search (turbopuffer SDK)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .voyage_client import VoyageClient, create_voyage_client
|
|
10
|
+
from .turbopuffer_client import TurbopufferClient, create_turbopuffer_client
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
'VoyageClient',
|
|
14
|
+
'create_voyage_client',
|
|
15
|
+
'TurbopufferClient',
|
|
16
|
+
'create_turbopuffer_client',
|
|
17
|
+
]
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Turbopuffer client for vector storage and search using official SDK.
|
|
3
|
+
|
|
4
|
+
Provides clean integration with Turbopuffer's vector database for storing
|
|
5
|
+
embeddings and performing similarity searches. Supports configurable
|
|
6
|
+
regions for optimal latency and data residency compliance.
|
|
7
|
+
|
|
8
|
+
Default region: gcp-europe-west3 (Frankfurt)
|
|
9
|
+
Configure via TURBOPUFFER_REGION environment variable.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import uuid
|
|
14
|
+
from typing import List, Dict, Any, Optional
|
|
15
|
+
import turbopuffer
|
|
16
|
+
|
|
17
|
+
from ..config import VectorConfig
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
class TurbopufferClient:
    """Thin wrapper around the official Turbopuffer SDK client.

    Every public method delegates to ``self.client`` (a
    ``turbopuffer.Turbopuffer`` instance) and converts any SDK failure into
    a ``RuntimeError`` that is chained to the original exception, so the
    root cause stays visible in tracebacks.

    NOTE(review): the SDK method names used below (``namespaces``,
    ``namespace(...).upsert/query/delete``, ``create_namespace``,
    ``delete_namespace``) are kept exactly as written; confirm them against
    the turbopuffer SDK version actually installed.
    """

    def __init__(self, api_key: str, region: str = "gcp-europe-west3"):
        """Create the underlying SDK client.

        Args:
            api_key: Turbopuffer API key.
            region: Turbopuffer region identifier passed to the SDK.
        """
        self.api_key = api_key
        self.region = region

        # Initialize official TurboPuffer client
        self.client = turbopuffer.Turbopuffer(
            api_key=api_key,
            region=region
        )
        logger.info(f"Initialized TurboPuffer client with region {region}")

    def health_check(self) -> bool:
        """Return True if listing namespaces succeeds, False otherwise.

        Failures are logged at warning level and never raised.
        """
        try:
            # Only success/failure of the call matters; the result is unused.
            self.client.namespaces()
            return True
        except Exception as e:
            logger.warning(f"Turbopuffer health check failed: {e}")
            return False

    def generate_vector_id(self, project_id: str, chunk_id: int) -> str:
        """Build a vector ID of the form ``<project>_<chunk>_<8 hex chars>``.

        The suffix comes from a fresh ``uuid4`` on every call, so two calls
        with the same arguments return different IDs.
        """
        return f"{project_id}_{chunk_id}_{uuid.uuid4().hex[:8]}"

    def upsert_vectors(
        self,
        vectors: List[Dict[str, Any]],
        namespace: str,
        **kwargs
    ) -> Dict[str, Any]:
        """Store or update vectors in ``namespace``.

        Args:
            vectors: Dicts with required ``"id"`` and ``"values"`` keys and
                an optional ``"metadata"`` dict.
            namespace: Target namespace name.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            ``{"upserted": <count>}``; the count is 0 for empty input.

        Raises:
            ValueError: If a vector lacks ``"id"`` or ``"values"``.
            RuntimeError: If the SDK call fails (chained to the cause).
        """
        if not vectors:
            return {"upserted": 0}

        logger.info(f"Upserting {len(vectors)} vectors to namespace '{namespace}'")

        # Format vectors for Turbopuffer SDK
        formatted_vectors = []
        for vector in vectors:
            if "id" not in vector or "values" not in vector:
                raise ValueError("Each vector must have 'id' and 'values' fields")

            formatted_vectors.append(
                {
                    "id": str(vector["id"]),
                    "vector": vector["values"],
                    "attributes": vector.get("metadata", {}),
                }
            )

        try:
            ns = self.client.namespace(namespace)
            ns.upsert(vectors=formatted_vectors)

            logger.info(f"Successfully upserted {len(vectors)} vectors")
            return {"upserted": len(vectors)}

        except Exception as e:
            logger.error(f"Failed to upsert vectors: {e}")
            raise RuntimeError(f"Vector upsert failed: {e}") from e

    def search_vectors(
        self,
        query_vector: List[float],
        top_k: int = 10,
        namespace: str = "default",
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Run an ANN query against ``namespace``.

        Args:
            query_vector: Embedding to search with.
            top_k: Maximum number of results requested.
            namespace: Namespace to query.
            filters: Optional filter object forwarded unchanged to the SDK.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            Whatever the SDK ``query`` call returns.

        Raises:
            RuntimeError: If the SDK call fails (chained to the cause).
        """
        logger.debug(f"Searching {top_k} vectors in namespace '{namespace}'")

        try:
            ns = self.client.namespace(namespace)

            results = ns.query(
                rank_by=[("vector", "ANN", query_vector)],
                top_k=top_k,
                filters=filters,
                include_attributes=True
            )

            logger.debug(f"Found {len(results)} similar vectors")
            return results

        except Exception as e:
            logger.error(f"Vector search failed: {e}")
            raise RuntimeError(f"Vector search failed: {e}") from e

    def delete_vectors(
        self,
        vector_ids: List[str],
        namespace: str,
        **kwargs
    ) -> Dict[str, Any]:
        """Delete vectors by ID from ``namespace``.

        Returns:
            ``{"deleted": <count>}``; the count is 0 for empty input.

        Raises:
            RuntimeError: If the SDK call fails (chained to the cause).
        """
        if not vector_ids:
            return {"deleted": 0}

        logger.info(f"Deleting {len(vector_ids)} vectors from namespace '{namespace}'")

        try:
            ns = self.client.namespace(namespace)
            ns.delete(ids=vector_ids)

            logger.info("Successfully deleted vectors")
            return {"deleted": len(vector_ids)}

        except Exception as e:
            logger.error(f"Failed to delete vectors: {e}")
            raise RuntimeError(f"Vector deletion failed: {e}") from e

    def list_namespaces(self) -> List[str]:
        """Return the ``name`` of every namespace reported by the SDK.

        Raises:
            RuntimeError: If the SDK call fails (chained to the cause).
        """
        try:
            namespaces = self.client.namespaces()
            return [ns.name for ns in namespaces]

        except Exception as e:
            logger.error(f"Failed to list namespaces: {e}")
            raise RuntimeError(f"Namespace listing failed: {e}") from e

    def create_namespace(self, namespace: str, dimension: int, **kwargs) -> Dict[str, Any]:
        """Create a namespace with the given vector dimension.

        Returns:
            ``{"name": namespace, "dimension": dimension}``.

        Raises:
            RuntimeError: If the SDK call fails (chained to the cause).
        """
        logger.info(f"Creating namespace '{namespace}' with dimension {dimension}")

        try:
            self.client.create_namespace(
                name=namespace,
                dimension=dimension
            )

            logger.info(f"Successfully created namespace '{namespace}'")
            return {"name": namespace, "dimension": dimension}

        except Exception as e:
            logger.error(f"Failed to create namespace: {e}")
            raise RuntimeError(f"Namespace creation failed: {e}") from e

    def delete_namespace(self, namespace: str) -> Dict[str, Any]:
        """Delete a namespace and all its vectors.

        Returns:
            ``{"deleted": namespace}``.

        Raises:
            RuntimeError: If the SDK call fails (chained to the cause).
        """
        logger.warning(f"Deleting namespace '{namespace}' and all its vectors")

        try:
            self.client.delete_namespace(namespace)

            logger.info(f"Successfully deleted namespace '{namespace}'")
            return {"deleted": namespace}

        except Exception as e:
            logger.error(f"Failed to delete namespace: {e}")
            raise RuntimeError(f"Namespace deletion failed: {e}") from e

    def get_namespace_for_project(self, project_id: str) -> str:
        """Return the namespace name derived from ``project_id``.

        Characters other than alphanumerics, ``-`` and ``_`` are replaced
        with ``_``; the result is prefixed with ``mcp_code_`` and lowercased.
        """
        # Use project ID as namespace, with prefix for safety
        safe_project_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in project_id)
        return f"mcp_code_{safe_project_id}".lower()

    def search_with_metadata_filter(
        self,
        query_vector: List[float],
        project_id: str,
        chunk_type: Optional[str] = None,
        file_path: Optional[str] = None,
        top_k: int = 10,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Search the project's namespace, filtered by metadata.

        The filter always contains ``project_id``; ``chunk_type`` and
        ``file_path`` are added only when truthy. The search itself is
        delegated to ``search_vectors``.

        NOTE(review): the filter is passed as a plain dict; confirm this is
        a filter shape the turbopuffer SDK accepts.
        """
        namespace = self.get_namespace_for_project(project_id)

        # Build metadata filters
        filters = {"project_id": project_id}
        if chunk_type:
            filters["chunk_type"] = chunk_type
        if file_path:
            filters["file_path"] = file_path

        return self.search_vectors(
            query_vector=query_vector,
            top_k=top_k,
            namespace=namespace,
            filters=filters,
            **kwargs
        )
|
|
208
|
+
|
|
209
|
+
def create_turbopuffer_client(config: VectorConfig) -> TurbopufferClient:
    """Build a ``TurbopufferClient`` from a vector-mode configuration.

    Reads ``turbopuffer_api_key`` and ``turbopuffer_region`` from ``config``.

    Raises:
        ValueError: If the configuration has no Turbopuffer API key.
    """
    api_key = config.turbopuffer_api_key
    if api_key:
        return TurbopufferClient(api_key=api_key, region=config.turbopuffer_region)

    raise ValueError("TURBOPUFFER_API_KEY is required for vector storage")
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Voyage AI client for embedding generation using official SDK.
|
|
3
|
+
|
|
4
|
+
Provides clean integration with Voyage AI's embedding API for generating
|
|
5
|
+
high-quality code embeddings using the voyage-code-2 model.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import List, Dict, Any
|
|
10
|
+
import voyageai
|
|
11
|
+
|
|
12
|
+
from ..config import VectorConfig
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
class VoyageClient:
    """Thin wrapper around the official Voyage AI SDK client.

    Embedding calls go through ``self.client`` (a ``voyageai.Client``).
    SDK failures in ``generate_embeddings`` are re-raised as ``RuntimeError``
    chained to the original exception.
    """

    # Default output dimensions for known Voyage models. Used only when the
    # dimension cannot be detected by embedding a probe text.
    _KNOWN_DIMENSIONS: Dict[str, int] = {
        "voyage-code-2": 1536,
        "voyage-code-3": 1024,
        "voyage-2": 1024,
        "voyage-large-2": 1536,
        "voyage-3": 1024,
        "voyage-3-lite": 512,
        "voyage-3-large": 1024,
    }

    # Dimension assumed for models missing from the table above.
    _FALLBACK_DIMENSION = 1536

    def __init__(self, api_key: str, model: str = "voyage-code-2"):
        """Create the underlying SDK client.

        Args:
            api_key: Voyage AI API key.
            model: Embedding model name used for every request.
        """
        self.api_key = api_key
        self.model = model
        # Filled in lazily by get_embedding_dimension().
        self._embedding_dimension: int | None = None

        # Initialize official Voyage AI client
        self.client = voyageai.Client(api_key=api_key)
        logger.info(f"Initialized Voyage AI client with model {model}")

    def health_check(self) -> bool:
        """Return True if a one-text probe embedding succeeds.

        Failures are logged at warning level and never raised.
        """
        try:
            result = self.client.embed(["test"], model=self.model, input_type="query")
            return len(result.embeddings) > 0
        except Exception as e:
            logger.warning(f"Voyage AI health check failed: {e}")
            return False

    def generate_embeddings(
        self,
        texts: List[str],
        input_type: str = "document",
        **kwargs
    ) -> List[List[float]]:
        """Generate embeddings for ``texts`` with the configured model.

        Args:
            texts: Texts to embed; an empty list returns ``[]`` without
                calling the SDK.
            input_type: Forwarded to the SDK ``embed`` call.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The ``embeddings`` attribute of the SDK result.

        Raises:
            RuntimeError: If the SDK call fails (chained to the cause).
        """
        if not texts:
            return []

        logger.info(f"Generating embeddings for {len(texts)} texts using {self.model}")

        try:
            result = self.client.embed(
                texts=texts,
                model=self.model,
                input_type=input_type,
                truncation=True
            )

            # Log usage if available
            if hasattr(result, 'usage') and result.usage:
                logger.debug(f"Token usage: {result.usage.total_tokens}")

            logger.info(f"Successfully generated {len(result.embeddings)} embeddings")
            return result.embeddings

        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise RuntimeError(f"Embedding generation failed: {e}") from e

    def get_embedding_dimension(self) -> int:
        """Return the embedding dimension for the configured model.

        The value is cached after the first call. It is detected by
        embedding a probe text; if that fails or returns nothing, the
        per-model table is consulted, and ``_FALLBACK_DIMENSION`` is used
        for models the table does not list.
        """
        if self._embedding_dimension is not None:
            return self._embedding_dimension

        # Generate a test embedding to determine dimension
        try:
            test_embeddings = self.generate_embeddings(["test"], input_type="query")
            if test_embeddings:
                self._embedding_dimension = len(test_embeddings[0])
                logger.info(f"Detected embedding dimension: {self._embedding_dimension}")
                return self._embedding_dimension
        except Exception as e:
            logger.warning(f"Could not determine embedding dimension: {e}")

        # Detection failed: fall back to the table of known models.
        self._embedding_dimension = self._KNOWN_DIMENSIONS.get(
            self.model, self._FALLBACK_DIMENSION
        )
        logger.info(f"Using default embedding dimension: {self._embedding_dimension}")
        return self._embedding_dimension

    def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
        """Estimate the cost of embedding ``texts``.

        Tokens are approximated as ``len(text) // 4`` per text, and a single
        hard-coded per-1k-token price is applied whatever the model.

        Returns:
            Dict with ``total_tokens``, ``total_texts``,
            ``estimated_cost_usd`` (rounded to 6 decimals) and ``model``.
        """
        # Rough token estimation (4 chars per token)
        total_tokens = sum(len(text) // 4 for text in texts)

        # Voyage AI pricing (approximate, may change)
        cost_per_1k_tokens = 0.00013  # voyage-code-2 pricing
        estimated_cost = (total_tokens / 1000) * cost_per_1k_tokens

        return {
            "total_tokens": total_tokens,
            "total_texts": len(texts),
            "estimated_cost_usd": round(estimated_cost, 6),
            "model": self.model,
        }
|
|
110
|
+
|
|
111
|
+
def create_voyage_client(config: VectorConfig) -> VoyageClient:
    """Build a ``VoyageClient`` from a vector-mode configuration.

    Reads ``voyage_api_key`` and ``embedding_model`` from ``config``.

    Raises:
        ValueError: If the configuration has no Voyage API key.
    """
    api_key = config.voyage_api_key
    if api_key:
        return VoyageClient(api_key=api_key, model=config.embedding_model)

    raise ValueError("VOYAGE_API_KEY is required for embedding generation")
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
External service providers for vector mode.
|
|
3
|
-
|
|
4
|
-
This package provides integrations with external services including:
|
|
5
|
-
- Voyage AI for embedding generation
|
|
6
|
-
- Turbopuffer for vector storage and search
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from typing import Protocol, List, Dict, Any, Optional
|
|
10
|
-
from abc import abstractmethod
|
|
11
|
-
|
|
12
|
-
class EmbeddingProvider(Protocol):
    """Structural interface every embedding backend must satisfy."""

    @abstractmethod
    async def generate_embeddings(
        self, texts: List[str], input_type: str = "document", **kwargs
    ) -> List[List[float]]:
        """Return one embedding vector per entry in ``texts``."""
        ...

    @abstractmethod
    async def get_embedding_dimension(self) -> int:
        """Return the length of the vectors this provider produces."""
        ...
|
|
29
|
-
|
|
30
|
-
class VectorStoreProvider(Protocol):
    """Structural interface every vector-storage backend must satisfy."""

    @abstractmethod
    async def upsert_vectors(
        self, vectors: List[Dict[str, Any]], namespace: Optional[str] = None, **kwargs
    ) -> Dict[str, Any]:
        """Insert new vectors or overwrite existing ones."""
        ...

    @abstractmethod
    async def search_vectors(
        self,
        query_vector: List[float],
        top_k: int = 10,
        namespace: Optional[str] = None,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Return the stored vectors most similar to ``query_vector``."""
        ...

    @abstractmethod
    async def delete_vectors(
        self, vector_ids: List[str], namespace: Optional[str] = None, **kwargs
    ) -> Dict[str, Any]:
        """Remove the vectors whose IDs are listed in ``vector_ids``."""
        ...

    @abstractmethod
    async def get_namespace_stats(
        self, namespace: Optional[str] = None, **kwargs
    ) -> Dict[str, Any]:
        """Return statistics describing a namespace."""
        ...
|