mcp-code-indexer 4.1.0__tar.gz → 4.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68):
  1. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/PKG-INFO +16 -9
  2. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/README.md +8 -5
  3. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/pyproject.toml +8 -11
  4. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/main.py +1 -1
  5. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/config.py +14 -2
  6. mcp_code_indexer-4.2.1/src/mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  7. mcp_code_indexer-4.2.1/src/mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  8. mcp_code_indexer-4.2.1/src/mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  9. mcp_code_indexer-4.1.0/src/mcp_code_indexer/vector_mode/providers/__init__.py +0 -72
  10. mcp_code_indexer-4.1.0/src/mcp_code_indexer/vector_mode/providers/base_provider.py +0 -230
  11. mcp_code_indexer-4.1.0/src/mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +0 -338
  12. mcp_code_indexer-4.1.0/src/mcp_code_indexer/vector_mode/providers/voyage_client.py +0 -212
  13. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/LICENSE +0 -0
  14. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/__init__.py +0 -0
  15. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/__main__.py +0 -0
  16. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/ask_handler.py +0 -0
  17. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/claude_api_handler.py +0 -0
  18. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/cleanup_manager.py +0 -0
  19. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/commands/__init__.py +0 -0
  20. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/commands/makelocal.py +0 -0
  21. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/data/stop_words_english.txt +0 -0
  22. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/__init__.py +0 -0
  23. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/connection_health.py +0 -0
  24. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/database.py +0 -0
  25. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/database_factory.py +0 -0
  26. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/exceptions.py +0 -0
  27. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/models.py +0 -0
  28. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/path_resolver.py +0 -0
  29. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/database/retry_executor.py +0 -0
  30. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/deepask_handler.py +0 -0
  31. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/error_handler.py +0 -0
  32. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/file_scanner.py +0 -0
  33. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/git_hook_handler.py +0 -0
  34. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/logging_config.py +0 -0
  35. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/__init__.py +0 -0
  36. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/auth.py +0 -0
  37. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/error_middleware.py +0 -0
  38. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/logging.py +0 -0
  39. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/middleware/security.py +0 -0
  40. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/001_initial.sql +0 -0
  41. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/002_performance_indexes.sql +0 -0
  42. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/003_project_overviews.sql +0 -0
  43. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/004_remove_branch_dependency.sql +0 -0
  44. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/005_remove_git_remotes.sql +0 -0
  45. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/migrations/006_vector_mode.sql +0 -0
  46. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/query_preprocessor.py +0 -0
  47. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/server/__init__.py +0 -0
  48. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/server/mcp_server.py +0 -0
  49. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 +0 -0
  50. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/token_counter.py +0 -0
  51. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/tools/__init__.py +0 -0
  52. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/transport/__init__.py +0 -0
  53. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/transport/base.py +0 -0
  54. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/transport/http_transport.py +0 -0
  55. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/transport/stdio_transport.py +0 -0
  56. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/__init__.py +0 -0
  57. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/chunking/__init__.py +0 -0
  58. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/chunking/ast_chunker.py +0 -0
  59. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +0 -0
  60. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/chunking/language_handlers.py +0 -0
  61. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/daemon.py +0 -0
  62. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/monitoring/__init__.py +0 -0
  63. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/monitoring/change_detector.py +0 -0
  64. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/monitoring/file_watcher.py +0 -0
  65. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +0 -0
  66. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/security/__init__.py +0 -0
  67. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/security/patterns.py +0 -0
  68. {mcp_code_indexer-4.1.0 → mcp_code_indexer-4.2.1}/src/mcp_code_indexer/vector_mode/security/redactor.py +0 -0
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: mcp-code-indexer
3
- Version: 4.1.0
3
+ Version: 4.2.1
4
4
  Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
5
5
  License: MIT
6
6
  Keywords: mcp,model-context-protocol,code-indexer,ai-tools,codebase-navigation,file-descriptions,llm-tools
7
7
  Author: MCP Code Indexer Contributors
8
8
  Maintainer: MCP Code Indexer Contributors
9
- Requires-Python: >=3.9,<4.0
9
+ Requires-Python: >=3.10,<3.13
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Environment :: Console
12
12
  Classifier: Framework :: AsyncIO
@@ -14,16 +14,15 @@ Classifier: Intended Audience :: Developers
14
14
  Classifier: License :: OSI Approved :: MIT License
15
15
  Classifier: Operating System :: OS Independent
16
16
  Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
20
19
  Classifier: Programming Language :: Python :: 3.12
21
20
  Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.9
22
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Software Development
24
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
25
  Classifier: Typing :: Typed
26
- Provides-Extra: vector
27
26
  Requires-Dist: aiofiles (==23.2.0)
28
27
  Requires-Dist: aiohttp (>=3.8.0)
29
28
  Requires-Dist: aiosqlite (==0.19.0)
@@ -33,10 +32,15 @@ Requires-Dist: importlib-metadata (>=1.0.0) ; python_version < "3.8"
33
32
  Requires-Dist: mcp (>=1.9.0)
34
33
  Requires-Dist: pydantic (>=2.8.0)
35
34
  Requires-Dist: python-multipart (>=0.0.6)
35
+ Requires-Dist: pyyaml (>=6.0)
36
36
  Requires-Dist: tenacity (>=8.0.0)
37
37
  Requires-Dist: tiktoken (>=0.9.0)
38
38
  Requires-Dist: tomli (>=1.2.0) ; python_version < "3.11"
39
+ Requires-Dist: tree-sitter (>=0.25.0)
40
+ Requires-Dist: turbopuffer (>=0.6.0)
39
41
  Requires-Dist: uvicorn (>=0.24.0)
42
+ Requires-Dist: voyageai (>=0.3.0)
43
+ Requires-Dist: watchdog (>=6.0.0)
40
44
  Project-URL: Documentation, https://github.com/fluffypony/mcp-code-indexer/blob/main/README.md
41
45
  Project-URL: Homepage, https://github.com/fluffypony/mcp-code-indexer
42
46
  Project-URL: Repository, https://github.com/fluffypony/mcp-code-indexer
@@ -44,8 +48,8 @@ Description-Content-Type: text/markdown
44
48
 
45
49
  # MCP Code Indexer 🚀
46
50
 
47
- [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?46)](https://badge.fury.io/py/mcp-code-indexer)
48
- [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?46)](https://pypi.org/project/mcp-code-indexer/)
51
+ [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?48)](https://badge.fury.io/py/mcp-code-indexer)
52
+ [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?48)](https://pypi.org/project/mcp-code-indexer/)
49
53
  [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
50
54
 
51
55
  A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -215,12 +219,15 @@ Vector Mode transforms how you search and understand codebases by using AI embed
215
219
  ### 🚀 Quick Start
216
220
 
217
221
  ```bash
218
- # Install vector mode dependencies
219
- pip install mcp-code-indexer[vector]
222
+ # Install MCP Code Indexer (includes vector mode)
223
+ pip install mcp-code-indexer
220
224
 
221
225
  # Set required API keys
222
226
  export VOYAGE_API_KEY="pa-your-voyage-api-key"
223
- export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
227
+ export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
228
+
229
+ # Optional: Configure region (default: gcp-europe-west3)
230
+ export TURBOPUFFER_REGION="gcp-europe-west3"
224
231
 
225
232
  # Start with vector mode enabled
226
233
  mcp-code-indexer --vector
@@ -1,7 +1,7 @@
1
1
  # MCP Code Indexer 🚀
2
2
 
3
- [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?46)](https://badge.fury.io/py/mcp-code-indexer)
4
- [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?46)](https://pypi.org/project/mcp-code-indexer/)
3
+ [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?48)](https://badge.fury.io/py/mcp-code-indexer)
4
+ [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?48)](https://pypi.org/project/mcp-code-indexer/)
5
5
  [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
6
6
 
7
7
  A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -171,12 +171,15 @@ Vector Mode transforms how you search and understand codebases by using AI embed
171
171
  ### 🚀 Quick Start
172
172
 
173
173
  ```bash
174
- # Install vector mode dependencies
175
- pip install mcp-code-indexer[vector]
174
+ # Install MCP Code Indexer (includes vector mode)
175
+ pip install mcp-code-indexer
176
176
 
177
177
  # Set required API keys
178
178
  export VOYAGE_API_KEY="pa-your-voyage-api-key"
179
- export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
179
+ export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
180
+
181
+ # Optional: Configure region (default: gcp-europe-west3)
182
+ export TURBOPUFFER_REGION="gcp-europe-west3"
180
183
 
181
184
  # Start with vector mode enabled
182
185
  mcp-code-indexer --vector
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "mcp-code-indexer"
7
- version = "4.1.0"
7
+ version = "4.2.1"
8
8
  description = "MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews."
9
9
  authors = ["MCP Code Indexer Contributors"]
10
10
  maintainers = ["MCP Code Indexer Contributors"]
@@ -43,7 +43,7 @@ classifiers = [
43
43
  packages = [{include = "mcp_code_indexer", from = "src"}]
44
44
 
45
45
  [tool.poetry.dependencies]
46
- python = "^3.9"
46
+ python = ">=3.10,<3.13"
47
47
  tiktoken = ">=0.9.0"
48
48
  mcp = ">=1.9.0"
49
49
  gitignore-parser = "0.1.11"
@@ -58,15 +58,12 @@ importlib-metadata = {version = ">=1.0.0", markers = "python_version < '3.8'"}
58
58
  fastapi = ">=0.104.0"
59
59
  uvicorn = ">=0.24.0"
60
60
  python-multipart = ">=0.0.6"
61
-
62
- [tool.poetry.extras]
63
- vector = [
64
- "voyageai",
65
- "turbopuffer",
66
- "tree-sitter",
67
- "watchdog",
68
- "pyyaml"
69
- ]
61
+ # Vector mode dependencies (always included)
62
+ voyageai = ">=0.3.0"
63
+ turbopuffer = ">=0.6.0"
64
+ tree-sitter = ">=0.25.0"
65
+ watchdog = ">=6.0.0"
66
+ pyyaml = ">=6.0"
70
67
 
71
68
  [tool.poetry.group.dev.dependencies]
72
69
  pytest = ">=8.0.0"
@@ -1019,7 +1019,7 @@ async def main() -> None:
1019
1019
 
1020
1020
  # Check if vector mode is available
1021
1021
  if not is_vector_mode_available():
1022
- logger.error("Vector mode requires additional dependencies. Install with: pip install mcp-code-indexer[vector]")
1022
+ logger.error("Vector mode dependencies not found. Try reinstalling: pip install --upgrade mcp-code-indexer")
1023
1023
  sys.exit(1)
1024
1024
 
1025
1025
  # Check API keys
@@ -18,6 +18,7 @@ class VectorConfig:
18
18
  # API Configuration
19
19
  voyage_api_key: Optional[str] = None
20
20
  turbopuffer_api_key: Optional[str] = None
21
+ turbopuffer_region: str = "gcp-europe-west3"
21
22
 
22
23
  # Embedding Configuration
23
24
  embedding_model: str = "voyage-code-2"
@@ -57,9 +58,10 @@ class VectorConfig:
57
58
  return cls(
58
59
  voyage_api_key=os.getenv("VOYAGE_API_KEY"),
59
60
  turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
60
- embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-2"),
61
+ turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
62
+ embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-3"),
61
63
  batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
62
- max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "1024")),
64
+ max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
63
65
  similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
64
66
  max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
65
67
  enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower() == "true",
@@ -122,6 +124,16 @@ class VectorConfig:
122
124
  if not self.turbopuffer_api_key:
123
125
  errors.append("TURBOPUFFER_API_KEY environment variable required for vector mode")
124
126
 
127
+ # Validate TurboPuffer region
128
+ supported_regions = [
129
+ 'aws-ap-southeast-2', 'aws-eu-central-1', 'aws-us-east-1',
130
+ 'aws-us-east-2', 'aws-us-west-2', 'gcp-us-central1',
131
+ 'gcp-us-west1', 'gcp-us-east4', 'gcp-europe-west3'
132
+ ]
133
+ if self.turbopuffer_region not in supported_regions:
134
+ errors.append(f"turbopuffer_region '{self.turbopuffer_region}' is not supported. " +
135
+ f"Supported regions: {', '.join(supported_regions)}")
136
+
125
137
  if self.batch_size <= 0:
126
138
  errors.append("batch_size must be positive")
127
139
  if self.max_tokens_per_chunk <= 0:
@@ -0,0 +1,17 @@
1
+ """
2
+ External service providers for vector mode.
3
+
4
+ This package provides clean integrations with external services using official SDKs:
5
+ - Voyage AI for embedding generation (voyageai SDK)
6
+ - Turbopuffer for vector storage and search (turbopuffer SDK)
7
+ """
8
+
9
+ from .voyage_client import VoyageClient, create_voyage_client
10
+ from .turbopuffer_client import TurbopufferClient, create_turbopuffer_client
11
+
12
+ __all__ = [
13
+ 'VoyageClient',
14
+ 'create_voyage_client',
15
+ 'TurbopufferClient',
16
+ 'create_turbopuffer_client',
17
+ ]
@@ -0,0 +1,217 @@
1
+ """
2
+ Turbopuffer client for vector storage and search using official SDK.
3
+
4
+ Provides clean integration with Turbopuffer's vector database for storing
5
+ embeddings and performing similarity searches. Supports configurable
6
+ regions for optimal latency and data residency compliance.
7
+
8
+ Default region: gcp-europe-west3 (Frankfurt)
9
+ Configure via TURBOPUFFER_REGION environment variable.
10
+ """
11
+
12
+ import logging
13
+ import uuid
14
+ from typing import List, Dict, Any, Optional
15
+ import turbopuffer
16
+
17
+ from ..config import VectorConfig
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ class TurbopufferClient:
22
+ """Clean Turbopuffer client using official SDK."""
23
+
24
+ def __init__(self, api_key: str, region: str = "gcp-europe-west3"):
25
+ self.api_key = api_key
26
+ self.region = region
27
+
28
+ # Initialize official TurboPuffer client
29
+ self.client = turbopuffer.Turbopuffer(
30
+ api_key=api_key,
31
+ region=region
32
+ )
33
+ logger.info(f"Initialized TurboPuffer client with region {region}")
34
+
35
+ def health_check(self) -> bool:
36
+ """Check if Turbopuffer service is healthy."""
37
+ try:
38
+ namespaces = self.client.namespaces()
39
+ return True
40
+ except Exception as e:
41
+ logger.warning(f"Turbopuffer health check failed: {e}")
42
+ return False
43
+
44
+ def generate_vector_id(self, project_id: str, chunk_id: int) -> str:
45
+ """Generate a unique vector ID."""
46
+ return f"{project_id}_{chunk_id}_{uuid.uuid4().hex[:8]}"
47
+
48
+ def upsert_vectors(
49
+ self,
50
+ vectors: List[Dict[str, Any]],
51
+ namespace: str,
52
+ **kwargs
53
+ ) -> Dict[str, Any]:
54
+ """Store or update vectors in the database."""
55
+ if not vectors:
56
+ return {"upserted": 0}
57
+
58
+ logger.info(f"Upserting {len(vectors)} vectors to namespace '{namespace}'")
59
+
60
+ # Format vectors for Turbopuffer SDK
61
+ formatted_vectors = []
62
+ for vector in vectors:
63
+ if "id" not in vector or "values" not in vector:
64
+ raise ValueError("Each vector must have 'id' and 'values' fields")
65
+
66
+ formatted_vector = {
67
+ "id": str(vector["id"]),
68
+ "vector": vector["values"],
69
+ "attributes": vector.get("metadata", {}),
70
+ }
71
+ formatted_vectors.append(formatted_vector)
72
+
73
+ try:
74
+ ns = self.client.namespace(namespace)
75
+ ns.upsert(vectors=formatted_vectors)
76
+
77
+ logger.info(f"Successfully upserted {len(vectors)} vectors")
78
+ return {"upserted": len(vectors)}
79
+
80
+ except Exception as e:
81
+ logger.error(f"Failed to upsert vectors: {e}")
82
+ raise RuntimeError(f"Vector upsert failed: {e}")
83
+
84
+ def search_vectors(
85
+ self,
86
+ query_vector: List[float],
87
+ top_k: int = 10,
88
+ namespace: str = "default",
89
+ filters: Optional[Dict[str, Any]] = None,
90
+ **kwargs
91
+ ) -> List[Dict[str, Any]]:
92
+ """Search for similar vectors."""
93
+ logger.debug(f"Searching {top_k} vectors in namespace '{namespace}'")
94
+
95
+ try:
96
+ ns = self.client.namespace(namespace)
97
+
98
+ results = ns.query(
99
+ rank_by=[("vector", "ANN", query_vector)],
100
+ top_k=top_k,
101
+ filters=filters,
102
+ include_attributes=True
103
+ )
104
+
105
+ logger.debug(f"Found {len(results)} similar vectors")
106
+ return results
107
+
108
+ except Exception as e:
109
+ logger.error(f"Vector search failed: {e}")
110
+ raise RuntimeError(f"Vector search failed: {e}")
111
+
112
+ def delete_vectors(
113
+ self,
114
+ vector_ids: List[str],
115
+ namespace: str,
116
+ **kwargs
117
+ ) -> Dict[str, Any]:
118
+ """Delete vectors by ID."""
119
+ if not vector_ids:
120
+ return {"deleted": 0}
121
+
122
+ logger.info(f"Deleting {len(vector_ids)} vectors from namespace '{namespace}'")
123
+
124
+ try:
125
+ ns = self.client.namespace(namespace)
126
+ ns.delete(ids=vector_ids)
127
+
128
+ logger.info(f"Successfully deleted vectors")
129
+ return {"deleted": len(vector_ids)}
130
+
131
+ except Exception as e:
132
+ logger.error(f"Failed to delete vectors: {e}")
133
+ raise RuntimeError(f"Vector deletion failed: {e}")
134
+
135
+ def list_namespaces(self) -> List[str]:
136
+ """List all available namespaces."""
137
+ try:
138
+ namespaces = self.client.namespaces()
139
+ return [ns.name for ns in namespaces]
140
+
141
+ except Exception as e:
142
+ logger.error(f"Failed to list namespaces: {e}")
143
+ raise RuntimeError(f"Namespace listing failed: {e}")
144
+
145
+ def create_namespace(self, namespace: str, dimension: int, **kwargs) -> Dict[str, Any]:
146
+ """Create a new namespace."""
147
+ logger.info(f"Creating namespace '{namespace}' with dimension {dimension}")
148
+
149
+ try:
150
+ self.client.create_namespace(
151
+ name=namespace,
152
+ dimension=dimension
153
+ )
154
+
155
+ logger.info(f"Successfully created namespace '{namespace}'")
156
+ return {"name": namespace, "dimension": dimension}
157
+
158
+ except Exception as e:
159
+ logger.error(f"Failed to create namespace: {e}")
160
+ raise RuntimeError(f"Namespace creation failed: {e}")
161
+
162
+ def delete_namespace(self, namespace: str) -> Dict[str, Any]:
163
+ """Delete a namespace and all its vectors."""
164
+ logger.warning(f"Deleting namespace '{namespace}' and all its vectors")
165
+
166
+ try:
167
+ self.client.delete_namespace(namespace)
168
+
169
+ logger.info(f"Successfully deleted namespace '{namespace}'")
170
+ return {"deleted": namespace}
171
+
172
+ except Exception as e:
173
+ logger.error(f"Failed to delete namespace: {e}")
174
+ raise RuntimeError(f"Namespace deletion failed: {e}")
175
+
176
+ def get_namespace_for_project(self, project_id: str) -> str:
177
+ """Get the namespace name for a project."""
178
+ # Use project ID as namespace, with prefix for safety
179
+ safe_project_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in project_id)
180
+ return f"mcp_code_{safe_project_id}".lower()
181
+
182
+ def search_with_metadata_filter(
183
+ self,
184
+ query_vector: List[float],
185
+ project_id: str,
186
+ chunk_type: Optional[str] = None,
187
+ file_path: Optional[str] = None,
188
+ top_k: int = 10,
189
+ **kwargs
190
+ ) -> List[Dict[str, Any]]:
191
+ """Search vectors with metadata filtering."""
192
+ namespace = self.get_namespace_for_project(project_id)
193
+
194
+ # Build metadata filters
195
+ filters = {"project_id": project_id}
196
+ if chunk_type:
197
+ filters["chunk_type"] = chunk_type
198
+ if file_path:
199
+ filters["file_path"] = file_path
200
+
201
+ return self.search_vectors(
202
+ query_vector=query_vector,
203
+ top_k=top_k,
204
+ namespace=namespace,
205
+ filters=filters,
206
+ **kwargs
207
+ )
208
+
209
+ def create_turbopuffer_client(config: VectorConfig) -> TurbopufferClient:
210
+ """Create a Turbopuffer client from configuration."""
211
+ if not config.turbopuffer_api_key:
212
+ raise ValueError("TURBOPUFFER_API_KEY is required for vector storage")
213
+
214
+ return TurbopufferClient(
215
+ api_key=config.turbopuffer_api_key,
216
+ region=config.turbopuffer_region,
217
+ )
@@ -0,0 +1,119 @@
1
+ """
2
+ Voyage AI client for embedding generation using official SDK.
3
+
4
+ Provides clean integration with Voyage AI's embedding API for generating
5
+ high-quality code embeddings using the voyage-code-2 model.
6
+ """
7
+
8
+ import logging
9
+ from typing import List, Dict, Any
10
+ import voyageai
11
+
12
+ from ..config import VectorConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ class VoyageClient:
17
+ """Clean Voyage AI client using official SDK."""
18
+
19
+ def __init__(self, api_key: str, model: str = "voyage-code-2"):
20
+ self.api_key = api_key
21
+ self.model = model
22
+ self._embedding_dimension: int | None = None
23
+
24
+ # Initialize official Voyage AI client
25
+ self.client = voyageai.Client(api_key=api_key)
26
+ logger.info(f"Initialized Voyage AI client with model {model}")
27
+
28
+ def health_check(self) -> bool:
29
+ """Check if Voyage AI service is healthy."""
30
+ try:
31
+ result = self.client.embed(["test"], model=self.model, input_type="query")
32
+ return len(result.embeddings) > 0
33
+ except Exception as e:
34
+ logger.warning(f"Voyage AI health check failed: {e}")
35
+ return False
36
+
37
+ def generate_embeddings(
38
+ self,
39
+ texts: List[str],
40
+ input_type: str = "document",
41
+ **kwargs
42
+ ) -> List[List[float]]:
43
+ """Generate embeddings for texts using official SDK."""
44
+ if not texts:
45
+ return []
46
+
47
+ logger.info(f"Generating embeddings for {len(texts)} texts using {self.model}")
48
+
49
+ try:
50
+ result = self.client.embed(
51
+ texts=texts,
52
+ model=self.model,
53
+ input_type=input_type,
54
+ truncation=True
55
+ )
56
+
57
+ # Log usage if available
58
+ if hasattr(result, 'usage') and result.usage:
59
+ logger.debug(f"Token usage: {result.usage.total_tokens}")
60
+
61
+ logger.info(f"Successfully generated {len(result.embeddings)} embeddings")
62
+ return result.embeddings
63
+
64
+ except Exception as e:
65
+ logger.error(f"Failed to generate embeddings: {e}")
66
+ raise RuntimeError(f"Embedding generation failed: {e}")
67
+
68
+ def get_embedding_dimension(self) -> int:
69
+ """Get the dimension of embeddings produced by this model."""
70
+ if self._embedding_dimension is not None:
71
+ return self._embedding_dimension
72
+
73
+ # Generate a test embedding to determine dimension
74
+ try:
75
+ test_embeddings = self.generate_embeddings(["test"], input_type="query")
76
+ if test_embeddings:
77
+ self._embedding_dimension = len(test_embeddings[0])
78
+ logger.info(f"Detected embedding dimension: {self._embedding_dimension}")
79
+ return self._embedding_dimension
80
+ except Exception as e:
81
+ logger.warning(f"Could not determine embedding dimension: {e}")
82
+
83
+ # Default dimensions for known Voyage models
84
+ model_dimensions = {
85
+ "voyage-code-2": 1536,
86
+ "voyage-2": 1024,
87
+ "voyage-large-2": 1536,
88
+ "voyage-3": 1024,
89
+ }
90
+
91
+ self._embedding_dimension = model_dimensions.get(self.model, 1536)
92
+ logger.info(f"Using default embedding dimension: {self._embedding_dimension}")
93
+ return self._embedding_dimension
94
+
95
+ def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
96
+ """Estimate the cost of embedding generation."""
97
+ # Rough token estimation (4 chars per token)
98
+ total_tokens = sum(len(text) // 4 for text in texts)
99
+
100
+ # Voyage AI pricing (approximate, may change)
101
+ cost_per_1k_tokens = 0.00013 # voyage-code-2 pricing
102
+ estimated_cost = (total_tokens / 1000) * cost_per_1k_tokens
103
+
104
+ return {
105
+ "total_tokens": total_tokens,
106
+ "total_texts": len(texts),
107
+ "estimated_cost_usd": round(estimated_cost, 6),
108
+ "model": self.model,
109
+ }
110
+
111
+ def create_voyage_client(config: VectorConfig) -> VoyageClient:
112
+ """Create a Voyage client from configuration."""
113
+ if not config.voyage_api_key:
114
+ raise ValueError("VOYAGE_API_KEY is required for embedding generation")
115
+
116
+ return VoyageClient(
117
+ api_key=config.voyage_api_key,
118
+ model=config.embedding_model,
119
+ )
@@ -1,72 +0,0 @@
1
- """
2
- External service providers for vector mode.
3
-
4
- This package provides integrations with external services including:
5
- - Voyage AI for embedding generation
6
- - Turbopuffer for vector storage and search
7
- """
8
-
9
- from typing import Protocol, List, Dict, Any, Optional
10
- from abc import abstractmethod
11
-
12
- class EmbeddingProvider(Protocol):
13
- """Protocol for embedding generation providers."""
14
-
15
- @abstractmethod
16
- async def generate_embeddings(
17
- self,
18
- texts: List[str],
19
- input_type: str = "document",
20
- **kwargs
21
- ) -> List[List[float]]:
22
- """Generate embeddings for a list of texts."""
23
- ...
24
-
25
- @abstractmethod
26
- async def get_embedding_dimension(self) -> int:
27
- """Get the dimension of embeddings produced by this provider."""
28
- ...
29
-
30
- class VectorStoreProvider(Protocol):
31
- """Protocol for vector storage providers."""
32
-
33
- @abstractmethod
34
- async def upsert_vectors(
35
- self,
36
- vectors: List[Dict[str, Any]],
37
- namespace: Optional[str] = None,
38
- **kwargs
39
- ) -> Dict[str, Any]:
40
- """Store or update vectors in the database."""
41
- ...
42
-
43
- @abstractmethod
44
- async def search_vectors(
45
- self,
46
- query_vector: List[float],
47
- top_k: int = 10,
48
- namespace: Optional[str] = None,
49
- filters: Optional[Dict[str, Any]] = None,
50
- **kwargs
51
- ) -> List[Dict[str, Any]]:
52
- """Search for similar vectors."""
53
- ...
54
-
55
- @abstractmethod
56
- async def delete_vectors(
57
- self,
58
- vector_ids: List[str],
59
- namespace: Optional[str] = None,
60
- **kwargs
61
- ) -> Dict[str, Any]:
62
- """Delete vectors by ID."""
63
- ...
64
-
65
- @abstractmethod
66
- async def get_namespace_stats(
67
- self,
68
- namespace: Optional[str] = None,
69
- **kwargs
70
- ) -> Dict[str, Any]:
71
- """Get statistics about a namespace."""
72
- ...