mcp-code-indexer 4.2.14__py3-none-any.whl → 4.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_code_indexer/database/database.py +251 -85
- mcp_code_indexer/database/models.py +66 -24
- mcp_code_indexer/database/retry_executor.py +15 -5
- mcp_code_indexer/file_scanner.py +107 -12
- mcp_code_indexer/main.py +75 -23
- mcp_code_indexer/server/mcp_server.py +191 -1
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
- mcp_code_indexer/vector_mode/config.py +113 -45
- mcp_code_indexer/vector_mode/const.py +24 -0
- mcp_code_indexer/vector_mode/daemon.py +860 -98
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
- mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
- mcp_code_indexer/vector_mode/services/__init__.py +9 -0
- mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
- mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
- mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
- mcp_code_indexer/vector_mode/types.py +46 -0
- mcp_code_indexer/vector_mode/utils.py +50 -0
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
- {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
|
@@ -11,47 +11,78 @@ from pathlib import Path
|
|
|
11
11
|
from typing import Optional
|
|
12
12
|
import yaml
|
|
13
13
|
|
|
14
|
+
from .const import MODEL_DIMENSIONS
|
|
15
|
+
|
|
16
|
+
DEFAULT_EMBEDDING_MODEL = "voyage-code-2"
|
|
17
|
+
|
|
18
|
+
|
|
14
19
|
@dataclass
|
|
15
20
|
class VectorConfig:
|
|
16
21
|
"""Configuration for vector mode operations."""
|
|
17
|
-
|
|
22
|
+
|
|
18
23
|
# API Configuration
|
|
19
24
|
voyage_api_key: Optional[str] = None
|
|
20
25
|
turbopuffer_api_key: Optional[str] = None
|
|
21
26
|
turbopuffer_region: str = "gcp-europe-west3"
|
|
22
|
-
|
|
23
|
-
# Embedding Configuration
|
|
24
|
-
embedding_model: str =
|
|
27
|
+
|
|
28
|
+
# Embedding Configuration
|
|
29
|
+
embedding_model: str = DEFAULT_EMBEDDING_MODEL
|
|
25
30
|
batch_size: int = 128
|
|
26
31
|
max_tokens_per_chunk: int = 1024
|
|
27
|
-
|
|
32
|
+
voyage_batch_size_limit: int = 1000
|
|
33
|
+
voyage_max_tokens_per_batch: int = 120000
|
|
34
|
+
|
|
28
35
|
# Search Configuration
|
|
29
36
|
similarity_threshold: float = 0.5
|
|
30
37
|
max_search_results: int = 20
|
|
31
38
|
enable_recency_boost: bool = True
|
|
32
|
-
|
|
39
|
+
|
|
33
40
|
# Chunking Configuration
|
|
34
41
|
max_chunk_size: int = 1500
|
|
35
42
|
chunk_overlap: int = 100
|
|
36
43
|
prefer_semantic_chunks: bool = True
|
|
37
|
-
|
|
44
|
+
|
|
38
45
|
# File Monitoring Configuration
|
|
39
46
|
watch_debounce_ms: int = 100
|
|
40
|
-
ignore_patterns: list[str] = field(
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
47
|
+
ignore_patterns: list[str] = field(
|
|
48
|
+
default_factory=lambda: [
|
|
49
|
+
"*.log",
|
|
50
|
+
"*.tmp",
|
|
51
|
+
"*~",
|
|
52
|
+
".git/*",
|
|
53
|
+
"__pycache__/*",
|
|
54
|
+
"node_modules/*",
|
|
55
|
+
"*.pyc",
|
|
56
|
+
"*.pyo",
|
|
57
|
+
".DS_Store",
|
|
58
|
+
"Thumbs.db",
|
|
59
|
+
".vscode/*",
|
|
60
|
+
".coverage",
|
|
61
|
+
".ruff_cache/*",
|
|
62
|
+
".mypy_cache/*",
|
|
63
|
+
".pytest_cache/*",
|
|
64
|
+
".import_linter_cache/*",
|
|
65
|
+
"*/tiktoken_cache/*",
|
|
66
|
+
".code-index/*",
|
|
67
|
+
".hypothesis/*",
|
|
68
|
+
"poetry.lock",
|
|
69
|
+
"venv/*",
|
|
70
|
+
"htmlcov/*",
|
|
71
|
+
]
|
|
72
|
+
)
|
|
73
|
+
|
|
45
74
|
# Daemon Configuration
|
|
46
75
|
daemon_enabled: bool = True
|
|
47
76
|
daemon_poll_interval: int = 5
|
|
48
77
|
max_queue_size: int = 1000
|
|
49
78
|
worker_count: int = 3
|
|
50
|
-
|
|
79
|
+
max_concurrent_files: int = 5
|
|
80
|
+
max_concurrent_batches: int = 5
|
|
81
|
+
|
|
51
82
|
# Security Configuration
|
|
52
83
|
redact_secrets: bool = True
|
|
53
84
|
redaction_patterns_file: Optional[str] = None
|
|
54
|
-
|
|
85
|
+
|
|
55
86
|
@classmethod
|
|
56
87
|
def from_env(cls) -> "VectorConfig":
|
|
57
88
|
"""Create config from environment variables."""
|
|
@@ -59,12 +90,19 @@ class VectorConfig:
|
|
|
59
90
|
voyage_api_key=os.getenv("VOYAGE_API_KEY"),
|
|
60
91
|
turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
|
|
61
92
|
turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
|
|
62
|
-
embedding_model=os.getenv(
|
|
93
|
+
embedding_model=os.getenv(
|
|
94
|
+
"VECTOR_EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL
|
|
95
|
+
),
|
|
96
|
+
voyage_batch_size_limit=int(os.getenv("VOYAGE_BATCH_SIZE_LIMIT", "1000")),
|
|
97
|
+
voyage_max_tokens_per_batch=int(
|
|
98
|
+
os.getenv("VOYAGE_MAX_TOKENS_PER_BATCH", "120000")
|
|
99
|
+
),
|
|
63
100
|
batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
|
|
64
101
|
max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
|
|
65
102
|
similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
|
|
66
103
|
max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
|
|
67
|
-
enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower()
|
|
104
|
+
enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower()
|
|
105
|
+
== "true",
|
|
68
106
|
max_chunk_size=int(os.getenv("VECTOR_CHUNK_SIZE", "1500")),
|
|
69
107
|
chunk_overlap=int(os.getenv("VECTOR_CHUNK_OVERLAP", "100")),
|
|
70
108
|
watch_debounce_ms=int(os.getenv("VECTOR_DEBOUNCE_MS", "100")),
|
|
@@ -72,68 +110,81 @@ class VectorConfig:
|
|
|
72
110
|
daemon_poll_interval=int(os.getenv("VECTOR_POLL_INTERVAL", "5")),
|
|
73
111
|
max_queue_size=int(os.getenv("VECTOR_MAX_QUEUE", "1000")),
|
|
74
112
|
worker_count=int(os.getenv("VECTOR_WORKERS", "3")),
|
|
113
|
+
max_concurrent_files=int(os.getenv("VECTOR_MAX_CONCURRENT_FILES", "5")),
|
|
114
|
+
max_concurrent_batches=int(os.getenv("VECTOR_MAX_CONCURRENT_BATCHES", "5")),
|
|
75
115
|
redact_secrets=os.getenv("VECTOR_REDACT_SECRETS", "true").lower() == "true",
|
|
76
116
|
)
|
|
77
|
-
|
|
117
|
+
|
|
78
118
|
@classmethod
|
|
79
119
|
def from_file(cls, config_path: Path) -> "VectorConfig":
|
|
80
120
|
"""Load config from YAML file."""
|
|
81
121
|
if not config_path.exists():
|
|
82
122
|
return cls.from_env()
|
|
83
|
-
|
|
123
|
+
|
|
84
124
|
try:
|
|
85
125
|
with open(config_path, "r") as f:
|
|
86
126
|
data = yaml.safe_load(f) or {}
|
|
87
|
-
|
|
127
|
+
|
|
88
128
|
# Merge with environment variables (env takes precedence)
|
|
89
129
|
env_config = cls.from_env()
|
|
90
|
-
|
|
130
|
+
|
|
91
131
|
# Update with file values only if env variable not set
|
|
92
132
|
for key, value in data.items():
|
|
93
133
|
if hasattr(env_config, key):
|
|
94
134
|
env_value = getattr(env_config, key)
|
|
95
135
|
# Use file value if env value is None or default
|
|
96
|
-
if env_value is None or (
|
|
136
|
+
if env_value is None or (
|
|
137
|
+
key == "voyage_api_key" and env_value is None
|
|
138
|
+
):
|
|
97
139
|
setattr(env_config, key, value)
|
|
98
|
-
|
|
140
|
+
|
|
99
141
|
return env_config
|
|
100
|
-
|
|
142
|
+
|
|
101
143
|
except Exception as e:
|
|
102
144
|
raise ValueError(f"Failed to load config from {config_path}: {e}")
|
|
103
|
-
|
|
145
|
+
|
|
104
146
|
def to_file(self, config_path: Path) -> None:
|
|
105
147
|
"""Save config to YAML file."""
|
|
106
148
|
config_path.parent.mkdir(parents=True, exist_ok=True)
|
|
107
|
-
|
|
149
|
+
|
|
108
150
|
# Don't save API keys to file for security
|
|
109
|
-
data = {
|
|
110
|
-
|
|
111
|
-
if not k.endswith("_api_key")
|
|
112
|
-
}
|
|
113
|
-
|
|
151
|
+
data = {k: v for k, v in self.__dict__.items() if not k.endswith("_api_key")}
|
|
152
|
+
|
|
114
153
|
with open(config_path, "w") as f:
|
|
115
154
|
yaml.dump(data, f, default_flow_style=False, sort_keys=True)
|
|
116
|
-
|
|
155
|
+
|
|
117
156
|
def validate(self) -> list[str]:
|
|
118
157
|
"""Validate configuration and return list of errors."""
|
|
119
158
|
errors = []
|
|
120
|
-
|
|
159
|
+
|
|
121
160
|
if self.daemon_enabled:
|
|
122
161
|
if not self.voyage_api_key:
|
|
123
|
-
errors.append(
|
|
162
|
+
errors.append(
|
|
163
|
+
"VOYAGE_API_KEY environment variable required for vector mode"
|
|
164
|
+
)
|
|
124
165
|
if not self.turbopuffer_api_key:
|
|
125
|
-
errors.append(
|
|
126
|
-
|
|
166
|
+
errors.append(
|
|
167
|
+
"TURBOPUFFER_API_KEY environment variable required for vector mode"
|
|
168
|
+
)
|
|
169
|
+
|
|
127
170
|
# Validate TurboPuffer region
|
|
128
171
|
supported_regions = [
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
172
|
+
"aws-ap-southeast-2",
|
|
173
|
+
"aws-eu-central-1",
|
|
174
|
+
"aws-us-east-1",
|
|
175
|
+
"aws-us-east-2",
|
|
176
|
+
"aws-us-west-2",
|
|
177
|
+
"gcp-us-central1",
|
|
178
|
+
"gcp-us-west1",
|
|
179
|
+
"gcp-us-east4",
|
|
180
|
+
"gcp-europe-west3",
|
|
132
181
|
]
|
|
133
182
|
if self.turbopuffer_region not in supported_regions:
|
|
134
|
-
errors.append(
|
|
135
|
-
|
|
136
|
-
|
|
183
|
+
errors.append(
|
|
184
|
+
f"turbopuffer_region '{self.turbopuffer_region}' is not supported. "
|
|
185
|
+
+ f"Supported regions: {', '.join(supported_regions)}"
|
|
186
|
+
)
|
|
187
|
+
|
|
137
188
|
if self.batch_size <= 0:
|
|
138
189
|
errors.append("batch_size must be positive")
|
|
139
190
|
if self.max_tokens_per_chunk <= 0:
|
|
@@ -148,20 +199,37 @@ class VectorConfig:
|
|
|
148
199
|
errors.append("chunk_overlap cannot be negative")
|
|
149
200
|
if self.worker_count <= 0:
|
|
150
201
|
errors.append("worker_count must be positive")
|
|
151
|
-
|
|
202
|
+
if self.max_concurrent_files <= 0 or self.max_concurrent_files > 50:
|
|
203
|
+
errors.append("max_concurrent_files must be between 1 and 50")
|
|
204
|
+
if self.voyage_batch_size_limit <= 0 or self.voyage_batch_size_limit > 1000:
|
|
205
|
+
errors.append("voyage_batch_size_limit must be between 1 and 1000")
|
|
206
|
+
if (
|
|
207
|
+
self.voyage_max_tokens_per_batch <= 0
|
|
208
|
+
or self.voyage_max_tokens_per_batch > 120000
|
|
209
|
+
):
|
|
210
|
+
errors.append("voyage_max_tokens_per_batch must be between 1 and 120000")
|
|
211
|
+
|
|
152
212
|
return errors
|
|
153
213
|
|
|
214
|
+
def get_embedding_dimensions(self) -> int:
|
|
215
|
+
"""Get the vector dimensions for the current embedding model."""
|
|
216
|
+
return MODEL_DIMENSIONS.get(
|
|
217
|
+
self.embedding_model, 1536
|
|
218
|
+
) # Default to 1536 if model not found
|
|
219
|
+
|
|
220
|
+
|
|
154
221
|
def load_vector_config(config_path: Optional[Path] = None) -> VectorConfig:
|
|
155
222
|
"""Load vector configuration from file or environment."""
|
|
156
223
|
if config_path is None:
|
|
157
224
|
from . import get_vector_config_path
|
|
225
|
+
|
|
158
226
|
config_path = get_vector_config_path()
|
|
159
|
-
|
|
227
|
+
|
|
160
228
|
config = VectorConfig.from_file(config_path)
|
|
161
|
-
|
|
229
|
+
|
|
162
230
|
# Validate configuration
|
|
163
231
|
errors = config.validate()
|
|
164
232
|
if errors:
|
|
165
233
|
raise ValueError(f"Invalid vector configuration: {'; '.join(errors)}")
|
|
166
|
-
|
|
234
|
+
|
|
167
235
|
return config
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constants for Voyage AI providers.
|
|
3
|
+
|
|
4
|
+
Contains model names, dimensions, and other constants used across
|
|
5
|
+
Voyage AI provider implementations.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VoyageModel:
|
|
10
|
+
"""Voyage AI model names."""
|
|
11
|
+
|
|
12
|
+
VOYAGE_CODE_2 = "voyage-code-2"
|
|
13
|
+
VOYAGE_2 = "voyage-2"
|
|
14
|
+
VOYAGE_LARGE_2 = "voyage-large-2"
|
|
15
|
+
VOYAGE_3 = "voyage-3"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Model dimensions mapping
|
|
19
|
+
MODEL_DIMENSIONS = {
|
|
20
|
+
VoyageModel.VOYAGE_CODE_2: 1536,
|
|
21
|
+
VoyageModel.VOYAGE_2: 1024,
|
|
22
|
+
VoyageModel.VOYAGE_LARGE_2: 1536,
|
|
23
|
+
VoyageModel.VOYAGE_3: 1024,
|
|
24
|
+
}
|