mcp-code-indexer: 4.2.14 → 4.2.16 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/database.py +251 -85
  2. mcp_code_indexer/database/models.py +66 -24
  3. mcp_code_indexer/database/retry_executor.py +15 -5
  4. mcp_code_indexer/file_scanner.py +107 -12
  5. mcp_code_indexer/main.py +75 -23
  6. mcp_code_indexer/server/mcp_server.py +191 -1
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  9. mcp_code_indexer/vector_mode/config.py +113 -45
  10. mcp_code_indexer/vector_mode/const.py +24 -0
  11. mcp_code_indexer/vector_mode/daemon.py +860 -98
  12. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  13. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  14. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  15. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  16. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  17. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  18. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  19. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  20. mcp_code_indexer/vector_mode/types.py +46 -0
  21. mcp_code_indexer/vector_mode/utils.py +50 -0
  22. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
  23. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
  24. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
  25. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
  26. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
mcp_code_indexer/vector_mode/config.py

@@ -11,47 +11,78 @@ from pathlib import Path
 from typing import Optional
 import yaml

+from .const import MODEL_DIMENSIONS
+
+DEFAULT_EMBEDDING_MODEL = "voyage-code-2"
+
+
 @dataclass
 class VectorConfig:
     """Configuration for vector mode operations."""
-
+
     # API Configuration
     voyage_api_key: Optional[str] = None
     turbopuffer_api_key: Optional[str] = None
     turbopuffer_region: str = "gcp-europe-west3"
-
-    # Embedding Configuration
-    embedding_model: str = "voyage-code-2"
+
+    # Embedding Configuration
+    embedding_model: str = DEFAULT_EMBEDDING_MODEL
     batch_size: int = 128
     max_tokens_per_chunk: int = 1024
-
+    voyage_batch_size_limit: int = 1000
+    voyage_max_tokens_per_batch: int = 120000
+
     # Search Configuration
     similarity_threshold: float = 0.5
     max_search_results: int = 20
     enable_recency_boost: bool = True
-
+
     # Chunking Configuration
     max_chunk_size: int = 1500
     chunk_overlap: int = 100
     prefer_semantic_chunks: bool = True
-
+
     # File Monitoring Configuration
     watch_debounce_ms: int = 100
-    ignore_patterns: list[str] = field(default_factory=lambda: [
-        "*.log", "*.tmp", "*~", ".git/*", "__pycache__/*", "node_modules/*",
-        "*.pyc", "*.pyo", ".DS_Store", "Thumbs.db"
-    ])
-
+    ignore_patterns: list[str] = field(
+        default_factory=lambda: [
+            "*.log",
+            "*.tmp",
+            "*~",
+            ".git/*",
+            "__pycache__/*",
+            "node_modules/*",
+            "*.pyc",
+            "*.pyo",
+            ".DS_Store",
+            "Thumbs.db",
+            ".vscode/*",
+            ".coverage",
+            ".ruff_cache/*",
+            ".mypy_cache/*",
+            ".pytest_cache/*",
+            ".import_linter_cache/*",
+            "*/tiktoken_cache/*",
+            ".code-index/*",
+            ".hypothesis/*",
+            "poetry.lock",
+            "venv/*",
+            "htmlcov/*",
+        ]
+    )
+
     # Daemon Configuration
     daemon_enabled: bool = True
     daemon_poll_interval: int = 5
     max_queue_size: int = 1000
     worker_count: int = 3
-
+    max_concurrent_files: int = 5
+    max_concurrent_batches: int = 5
+
     # Security Configuration
     redact_secrets: bool = True
     redaction_patterns_file: Optional[str] = None
-
+
     @classmethod
     def from_env(cls) -> "VectorConfig":
         """Create config from environment variables."""
@@ -59,12 +90,19 @@ class VectorConfig:
             voyage_api_key=os.getenv("VOYAGE_API_KEY"),
             turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
             turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
-            embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-3"),
+            embedding_model=os.getenv(
+                "VECTOR_EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL
+            ),
+            voyage_batch_size_limit=int(os.getenv("VOYAGE_BATCH_SIZE_LIMIT", "1000")),
+            voyage_max_tokens_per_batch=int(
+                os.getenv("VOYAGE_MAX_TOKENS_PER_BATCH", "120000")
+            ),
             batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
             max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
             similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
             max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
-            enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower() == "true",
+            enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower()
+            == "true",
             max_chunk_size=int(os.getenv("VECTOR_CHUNK_SIZE", "1500")),
             chunk_overlap=int(os.getenv("VECTOR_CHUNK_OVERLAP", "100")),
             watch_debounce_ms=int(os.getenv("VECTOR_DEBOUNCE_MS", "100")),
@@ -72,68 +110,81 @@ class VectorConfig:
             daemon_poll_interval=int(os.getenv("VECTOR_POLL_INTERVAL", "5")),
             max_queue_size=int(os.getenv("VECTOR_MAX_QUEUE", "1000")),
             worker_count=int(os.getenv("VECTOR_WORKERS", "3")),
+            max_concurrent_files=int(os.getenv("VECTOR_MAX_CONCURRENT_FILES", "5")),
+            max_concurrent_batches=int(os.getenv("VECTOR_MAX_CONCURRENT_BATCHES", "5")),
             redact_secrets=os.getenv("VECTOR_REDACT_SECRETS", "true").lower() == "true",
         )
-
+
     @classmethod
     def from_file(cls, config_path: Path) -> "VectorConfig":
         """Load config from YAML file."""
         if not config_path.exists():
             return cls.from_env()
-
+
         try:
             with open(config_path, "r") as f:
                 data = yaml.safe_load(f) or {}
-
+
             # Merge with environment variables (env takes precedence)
             env_config = cls.from_env()
-
+
             # Update with file values only if env variable not set
             for key, value in data.items():
                 if hasattr(env_config, key):
                     env_value = getattr(env_config, key)
                     # Use file value if env value is None or default
-                    if env_value is None or (key == "voyage_api_key" and env_value is None):
+                    if env_value is None or (
+                        key == "voyage_api_key" and env_value is None
+                    ):
                         setattr(env_config, key, value)
-
+
             return env_config
-
+
         except Exception as e:
             raise ValueError(f"Failed to load config from {config_path}: {e}")
-
+
     def to_file(self, config_path: Path) -> None:
         """Save config to YAML file."""
         config_path.parent.mkdir(parents=True, exist_ok=True)
-
+
         # Don't save API keys to file for security
-        data = {
-            k: v for k, v in self.__dict__.items()
-            if not k.endswith("_api_key")
-        }
-
+        data = {k: v for k, v in self.__dict__.items() if not k.endswith("_api_key")}
+
         with open(config_path, "w") as f:
             yaml.dump(data, f, default_flow_style=False, sort_keys=True)
-
+
     def validate(self) -> list[str]:
         """Validate configuration and return list of errors."""
         errors = []
-
+
         if self.daemon_enabled:
             if not self.voyage_api_key:
-                errors.append("VOYAGE_API_KEY environment variable required for vector mode")
+                errors.append(
+                    "VOYAGE_API_KEY environment variable required for vector mode"
+                )
             if not self.turbopuffer_api_key:
-                errors.append("TURBOPUFFER_API_KEY environment variable required for vector mode")
-
+                errors.append(
+                    "TURBOPUFFER_API_KEY environment variable required for vector mode"
+                )
+
         # Validate TurboPuffer region
         supported_regions = [
-            'aws-ap-southeast-2', 'aws-eu-central-1', 'aws-us-east-1',
-            'aws-us-east-2', 'aws-us-west-2', 'gcp-us-central1',
-            'gcp-us-west1', 'gcp-us-east4', 'gcp-europe-west3'
+            "aws-ap-southeast-2",
+            "aws-eu-central-1",
+            "aws-us-east-1",
+            "aws-us-east-2",
+            "aws-us-west-2",
+            "gcp-us-central1",
+            "gcp-us-west1",
+            "gcp-us-east4",
+            "gcp-europe-west3",
         ]
         if self.turbopuffer_region not in supported_regions:
-            errors.append(f"turbopuffer_region '{self.turbopuffer_region}' is not supported. " +
-                          f"Supported regions: {', '.join(supported_regions)}")
-
+            errors.append(
+                f"turbopuffer_region '{self.turbopuffer_region}' is not supported. "
+                + f"Supported regions: {', '.join(supported_regions)}"
+            )
+
         if self.batch_size <= 0:
             errors.append("batch_size must be positive")
         if self.max_tokens_per_chunk <= 0:
@@ -148,20 +199,37 @@ class VectorConfig:
             errors.append("chunk_overlap cannot be negative")
         if self.worker_count <= 0:
             errors.append("worker_count must be positive")
-
+        if self.max_concurrent_files <= 0 or self.max_concurrent_files > 50:
+            errors.append("max_concurrent_files must be between 1 and 50")
+        if self.voyage_batch_size_limit <= 0 or self.voyage_batch_size_limit > 1000:
+            errors.append("voyage_batch_size_limit must be between 1 and 1000")
+        if (
+            self.voyage_max_tokens_per_batch <= 0
+            or self.voyage_max_tokens_per_batch > 120000
+        ):
+            errors.append("voyage_max_tokens_per_batch must be between 1 and 120000")
+
         return errors

+    def get_embedding_dimensions(self) -> int:
+        """Get the vector dimensions for the current embedding model."""
+        return MODEL_DIMENSIONS.get(
+            self.embedding_model, 1536
+        )  # Default to 1536 if model not found
+
+
 def load_vector_config(config_path: Optional[Path] = None) -> VectorConfig:
     """Load vector configuration from file or environment."""
     if config_path is None:
         from . import get_vector_config_path
+
         config_path = get_vector_config_path()
-
+
     config = VectorConfig.from_file(config_path)
-
+
     # Validate configuration
     errors = config.validate()
     if errors:
         raise ValueError(f"Invalid vector configuration: {'; '.join(errors)}")
-
+
     return config
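For orientation, here is a minimal sketch of how the new configuration surface in 4.2.16 might be exercised. It uses only names visible in this diff (VectorConfig, DEFAULT_EMBEDDING_MODEL, the VOYAGE_*/VECTOR_* environment variables, validate(), get_embedding_dimensions()); the placeholder API keys and the specific override values are illustrative, not recommendations.

import os

from mcp_code_indexer.vector_mode.config import VectorConfig

# Placeholder credentials purely for illustration; validate() requires both
# keys whenever daemon_enabled is True (the default).
os.environ["VOYAGE_API_KEY"] = "vk-placeholder"
os.environ["TURBOPUFFER_API_KEY"] = "tp-placeholder"

# New in 4.2.16: Voyage batching limits and daemon concurrency caps
# can be tuned from the environment.
os.environ["VOYAGE_BATCH_SIZE_LIMIT"] = "500"
os.environ["VOYAGE_MAX_TOKENS_PER_BATCH"] = "60000"
os.environ["VECTOR_MAX_CONCURRENT_FILES"] = "4"
os.environ["VECTOR_MAX_CONCURRENT_BATCHES"] = "2"

config = VectorConfig.from_env()

# validate() collects error strings rather than raising, so bad regions,
# out-of-range limits, and missing keys all surface at once.
errors = config.validate()
if errors:
    raise SystemExit("; ".join(errors))

# get_embedding_dimensions() resolves the vector width for the configured
# model via MODEL_DIMENSIONS, falling back to 1536 for unknown names.
print(config.embedding_model, config.get_embedding_dimensions())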
mcp_code_indexer/vector_mode/const.py (new file)

@@ -0,0 +1,24 @@
+"""
+Constants for Voyage AI providers.
+
+Contains model names, dimensions, and other constants used across
+Voyage AI provider implementations.
+"""
+
+
+class VoyageModel:
+    """Voyage AI model names."""
+
+    VOYAGE_CODE_2 = "voyage-code-2"
+    VOYAGE_2 = "voyage-2"
+    VOYAGE_LARGE_2 = "voyage-large-2"
+    VOYAGE_3 = "voyage-3"
+
+
+# Model dimensions mapping
+MODEL_DIMENSIONS = {
+    VoyageModel.VOYAGE_CODE_2: 1536,
+    VoyageModel.VOYAGE_2: 1024,
+    VoyageModel.VOYAGE_LARGE_2: 1536,
+    VoyageModel.VOYAGE_3: 1024,
+}
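A short sketch of how this table is consumed, mirroring the fallback used by VectorConfig.get_embedding_dimensions() above; the lookup of "voyage-code-3" (the old env default that 4.2.16 drops) is only there to show the 1536 default for names outside the mapping.

from mcp_code_indexer.vector_mode.const import MODEL_DIMENSIONS, VoyageModel

# Known models resolve to their native embedding width.
assert MODEL_DIMENSIONS[VoyageModel.VOYAGE_CODE_2] == 1536
assert MODEL_DIMENSIONS[VoyageModel.VOYAGE_3] == 1024

# Names missing from the mapping fall back to 1536, the same default
# VectorConfig.get_embedding_dimensions() uses in config.py.
print(MODEL_DIMENSIONS.get("voyage-code-3", 1536))  # 1536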