mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/models.py +125 -1
  2. mcp_code_indexer/main.py +60 -0
  3. mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  4. mcp_code_indexer/server/mcp_server.py +3 -0
  5. mcp_code_indexer/vector_mode/__init__.py +36 -0
  6. mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  9. mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  10. mcp_code_indexer/vector_mode/config.py +167 -0
  11. mcp_code_indexer/vector_mode/daemon.py +335 -0
  12. mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  13. mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  14. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  15. mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  16. mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  17. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  18. mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  19. mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  20. mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  21. mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  22. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
  23. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
  24. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
  25. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
  26. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,119 @@
1
+ """
2
+ Voyage AI client for embedding generation using official SDK.
3
+
4
+ Provides clean integration with Voyage AI's embedding API for generating
5
+ high-quality code embeddings using the voyage-code-2 model.
6
+ """
7
+
8
+ import logging
9
+ from typing import List, Dict, Any
10
+ import voyageai
11
+
12
+ from ..config import VectorConfig
13
+
14
logger = logging.getLogger(__name__)


class VoyageClient:
    """Wrapper around the official ``voyageai`` SDK client for embeddings.

    The instance stores the API key and model name, owns a
    ``voyageai.Client`` and caches the embedding dimension once it has been
    determined.
    """

    # Fallback embedding sizes, used only when probing the service fails.
    _DEFAULT_DIMENSIONS = {
        "voyage-code-2": 1536,
        "voyage-2": 1024,
        "voyage-large-2": 1536,
        "voyage-3": 1024,
    }
    # Dimension assumed for models missing from ``_DEFAULT_DIMENSIONS``.
    _FALLBACK_DIMENSION = 1536

    def __init__(self, api_key: str, model: str = "voyage-code-2"):
        """Store the credentials and create the underlying SDK client.

        Args:
            api_key: Voyage AI API key handed to ``voyageai.Client``.
            model: Embedding model name used for every request.
        """
        self.api_key = api_key
        self.model = model
        # Filled lazily by get_embedding_dimension().
        self._embedding_dimension: int | None = None

        # Initialize official Voyage AI client
        self.client = voyageai.Client(api_key=api_key)
        logger.info("Initialized Voyage AI client with model %s", model)

    def health_check(self) -> bool:
        """Return True when a one-item test embedding request succeeds.

        Any exception raised by the request is logged as a warning and
        reported as ``False`` instead of propagating.
        """
        try:
            result = self.client.embed(["test"], model=self.model, input_type="query")
            return len(result.embeddings) > 0
        except Exception as e:
            logger.warning("Voyage AI health check failed: %s", e)
            return False

    def generate_embeddings(
        self,
        texts: List[str],
        input_type: str = "document",
        **kwargs
    ) -> List[List[float]]:
        """Generate one embedding per input text.

        Args:
            texts: Texts to embed. An empty list returns ``[]`` without
                contacting the service.
            input_type: Forwarded to the SDK as ``input_type``.
            **kwargs: Accepted for call-site compatibility; currently
                ignored and not forwarded to the SDK.

        Returns:
            The ``embeddings`` attribute of the SDK response.

        Raises:
            RuntimeError: If the request or response handling fails. The
                original exception is attached as ``__cause__``.
        """
        if not texts:
            return []

        logger.info("Generating embeddings for %s texts using %s", len(texts), self.model)

        try:
            result = self.client.embed(
                texts=texts,
                model=self.model,
                input_type=input_type,
                truncation=True,
            )

            # Log usage if available
            if hasattr(result, 'usage') and result.usage:
                logger.debug("Token usage: %s", result.usage.total_tokens)

            logger.info("Successfully generated %s embeddings", len(result.embeddings))
            return result.embeddings

        except Exception as e:
            logger.error("Failed to generate embeddings: %s", e)
            # Chain the cause so the SDK traceback is not lost to callers.
            raise RuntimeError(f"Embedding generation failed: {e}") from e

    def get_embedding_dimension(self) -> int:
        """Return the embedding vector length for the configured model.

        The value is cached after the first call. It is determined by
        embedding a single test string; if that fails or yields nothing,
        a per-model default (1536 for unknown models) is used instead.
        """
        if self._embedding_dimension is not None:
            return self._embedding_dimension

        # Generate a test embedding to determine dimension
        try:
            test_embeddings = self.generate_embeddings(["test"], input_type="query")
            if test_embeddings:
                self._embedding_dimension = len(test_embeddings[0])
                logger.info("Detected embedding dimension: %s", self._embedding_dimension)
                return self._embedding_dimension
        except Exception as e:
            logger.warning("Could not determine embedding dimension: %s", e)

        # Probing failed: fall back to the table of known model sizes.
        self._embedding_dimension = self._DEFAULT_DIMENSIONS.get(
            self.model, self._FALLBACK_DIMENSION
        )
        logger.info("Using default embedding dimension: %s", self._embedding_dimension)
        return self._embedding_dimension

    def estimate_cost(
        self,
        texts: List[str],
        cost_per_1k_tokens: float = 0.00013,
    ) -> Dict[str, Any]:
        """Estimate the cost of embedding ``texts`` without calling the API.

        Args:
            texts: Texts that would be embedded.
            cost_per_1k_tokens: Price in USD per 1000 tokens. The default is
                the approximate voyage-code-2 price and may be out of date;
                pass the current price for other models.

        Returns:
            A dict with ``total_tokens``, ``total_texts``,
            ``estimated_cost_usd`` (rounded to 6 decimals) and ``model``.
        """
        # Rough token estimation (4 chars per token)
        total_tokens = sum(len(text) // 4 for text in texts)
        estimated_cost = (total_tokens / 1000) * cost_per_1k_tokens

        return {
            "total_tokens": total_tokens,
            "total_texts": len(texts),
            "estimated_cost_usd": round(estimated_cost, 6),
            "model": self.model,
        }
110
+
111
def create_voyage_client(config: VectorConfig) -> VoyageClient:
    """Build a VoyageClient from the vector-mode configuration.

    Args:
        config: Configuration object; ``voyage_api_key`` and
            ``embedding_model`` are read from it.

    Returns:
        A client initialised with the configured key and model.

    Raises:
        ValueError: If ``config.voyage_api_key`` is empty or missing.
    """
    if config.voyage_api_key:
        return VoyageClient(
            api_key=config.voyage_api_key,
            model=config.embedding_model,
        )

    # No usable key: embeddings cannot be generated at all.
    raise ValueError("VOYAGE_API_KEY is required for embedding generation")
@@ -0,0 +1,11 @@
1
+ """
2
+ Security module for vector mode.
3
+
4
+ Provides secret redaction capabilities to prevent sensitive information
5
+ from being sent to external APIs for embedding generation.
6
+ """
7
+
8
+ from .redactor import SecretRedactor, RedactionResult
9
+ from .patterns import SecurityPatterns
10
+
11
+ __all__ = ["SecretRedactor", "RedactionResult", "SecurityPatterns"]
@@ -0,0 +1,297 @@
1
+ """
2
+ Security patterns for detecting secrets in code.
3
+
4
+ Comprehensive collection of regex patterns to identify API keys, tokens,
5
+ passwords, connection strings, and other sensitive information.
6
+ """
7
+
8
+ import re
9
+ from typing import List, Dict, Pattern, NamedTuple
10
+ from dataclasses import dataclass
11
+
12
class PatternMatch(NamedTuple):
    """One span of scanned text that matched a security pattern."""

    pattern_name: str   # ``name`` of the SecurityPattern that matched
    pattern_type: str   # ``pattern_type`` of that pattern
    start_pos: int      # match start offset in the scanned text
    end_pos: int        # match end offset (exclusive, as from ``re``)
    matched_text: str   # the full matched substring
    confidence: float   # confidence copied from the pattern


@dataclass
class SecurityPattern:
    """A compiled detection regex together with its metadata."""

    name: str
    pattern: Pattern[str]
    pattern_type: str
    description: str
    confidence: float = 1.0
    # When True, find_matches() keeps a hit only if a suspicious keyword
    # appears in the text around (and including) the match.
    context_required: bool = False


class SecurityPatterns:
    """Collection of security patterns for secret detection."""

    # Substrings that suggest the surrounding text deals with credentials.
    _SUSPICIOUS_KEYWORDS = (
        'password', 'passwd', 'pwd', 'secret', 'token', 'key', 'api',
        'auth', 'credential', 'login', 'access', 'private', 'confidential',
        'env', 'config', 'setting', 'var', 'export', 'process.env',
    )

    def __init__(self):
        """Build the pattern list once for this instance."""
        self.patterns = self._build_patterns()

    def _build_patterns(self) -> List[SecurityPattern]:
        """Build the ordered list of security patterns.

        The order matters: find_matches() sorts stably by start offset, so
        hits starting at the same position keep this declaration order.
        """
        ignore_case = re.IGNORECASE
        pem_flags = re.MULTILINE | re.DOTALL

        return [
            # API Keys and Tokens
            SecurityPattern(
                name="aws_access_key",
                pattern=re.compile(r'AKIA[0-9A-Z]{16}', ignore_case),
                pattern_type="api_key",
                description="AWS Access Key ID",
                confidence=0.95,
            ),
            SecurityPattern(
                name="aws_secret_key",
                pattern=re.compile(r'[A-Za-z0-9/+=]{40}', ignore_case),
                pattern_type="api_key",
                description="AWS Secret Access Key",
                confidence=0.7,
                context_required=True,
            ),
            SecurityPattern(
                name="github_token",
                pattern=re.compile(r'gh[pousr]_[A-Za-z0-9_]{36,}', ignore_case),
                pattern_type="api_key",
                description="GitHub Token",
                confidence=0.95,
            ),
            SecurityPattern(
                name="google_api_key",
                pattern=re.compile(r'AIza[0-9A-Za-z\-_]{35}', ignore_case),
                pattern_type="api_key",
                description="Google API Key",
                confidence=0.95,
            ),
            SecurityPattern(
                name="slack_token",
                # Slack tokens consist of several dash-separated segments.
                # Consume all of them so the trailing segments are part of
                # the match instead of being left outside it.
                pattern=re.compile(
                    r'xox[baprs]-([0-9a-zA-Z]{10,48}(?:-[0-9a-zA-Z]+)*)',
                    ignore_case,
                ),
                pattern_type="api_key",
                description="Slack Token",
                confidence=0.95,
            ),
            SecurityPattern(
                name="stripe_key",
                # Keys may be longer than 24 characters; match the whole
                # run so no tail of the key is left outside the match.
                pattern=re.compile(r'[rs]k_(test|live)_[0-9a-zA-Z]{24,}', ignore_case),
                pattern_type="api_key",
                description="Stripe API Key",
                confidence=0.95,
            ),
            SecurityPattern(
                name="openai_api_key",
                pattern=re.compile(r'sk-[a-zA-Z0-9]{48}', ignore_case),
                pattern_type="api_key",
                description="OpenAI API Key",
                confidence=0.95,
            ),
            SecurityPattern(
                name="anthropic_api_key",
                pattern=re.compile(r'sk-ant-api03-[a-zA-Z0-9\-_]{95}', ignore_case),
                pattern_type="api_key",
                description="Anthropic API Key",
                confidence=0.95,
            ),
            SecurityPattern(
                name="voyage_api_key",
                pattern=re.compile(r'pa-[a-zA-Z0-9]{32}', ignore_case),
                pattern_type="api_key",
                description="Voyage AI API Key",
                confidence=0.95,
            ),
            # JWT Tokens
            SecurityPattern(
                name="jwt_token",
                pattern=re.compile(
                    r'eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*',
                    ignore_case,
                ),
                pattern_type="token",
                description="JWT Token",
                confidence=0.9,
            ),
            # Database Connection Strings
            SecurityPattern(
                name="postgres_url",
                pattern=re.compile(r'postgres(?:ql)?://[^\s]+', ignore_case),
                pattern_type="connection_string",
                description="PostgreSQL Connection String",
                confidence=0.85,
            ),
            SecurityPattern(
                name="mysql_url",
                pattern=re.compile(r'mysql://[^\s]+', ignore_case),
                pattern_type="connection_string",
                description="MySQL Connection String",
                confidence=0.85,
            ),
            SecurityPattern(
                name="mongodb_url",
                pattern=re.compile(r'mongodb(?:\+srv)?://[^\s]+', ignore_case),
                pattern_type="connection_string",
                description="MongoDB Connection String",
                confidence=0.85,
            ),
            SecurityPattern(
                name="redis_url",
                pattern=re.compile(r'redis://[^\s]+', ignore_case),
                pattern_type="connection_string",
                description="Redis Connection String",
                confidence=0.85,
            ),
            # Private Keys
            SecurityPattern(
                name="rsa_private_key",
                pattern=re.compile(
                    r'-----BEGIN RSA PRIVATE KEY-----[^-]+-----END RSA PRIVATE KEY-----',
                    pem_flags,
                ),
                pattern_type="private_key",
                description="RSA Private Key",
                confidence=1.0,
            ),
            SecurityPattern(
                name="ssh_private_key",
                pattern=re.compile(
                    r'-----BEGIN OPENSSH PRIVATE KEY-----[^-]+-----END OPENSSH PRIVATE KEY-----',
                    pem_flags,
                ),
                pattern_type="private_key",
                description="SSH Private Key",
                confidence=1.0,
            ),
            SecurityPattern(
                name="ec_private_key",
                pattern=re.compile(
                    r'-----BEGIN EC PRIVATE KEY-----[^-]+-----END EC PRIVATE KEY-----',
                    pem_flags,
                ),
                pattern_type="private_key",
                description="EC Private Key",
                confidence=1.0,
            ),
            # Environment Variable Patterns
            SecurityPattern(
                name="env_password",
                pattern=re.compile(
                    r'(?i)(password|passwd|pwd)\s*[=:]\s*["\']?[^\s"\']+["\']?',
                    ignore_case,
                ),
                pattern_type="password",
                description="Environment Variable Password",
                confidence=0.7,
                context_required=True,
            ),
            SecurityPattern(
                name="env_secret",
                pattern=re.compile(
                    r'(?i)(secret|token|key)\s*[=:]\s*["\']?[^\s"\']+["\']?',
                    ignore_case,
                ),
                pattern_type="secret",
                description="Environment Variable Secret",
                confidence=0.6,
                context_required=True,
            ),
            # Generic Patterns (lower confidence)
            SecurityPattern(
                name="base64_encoded",
                pattern=re.compile(r'[A-Za-z0-9+/]{32,}={0,2}', ignore_case),
                pattern_type="encoded_data",
                description="Base64 Encoded Data",
                confidence=0.3,
                context_required=True,
            ),
            SecurityPattern(
                name="hex_encoded",
                pattern=re.compile(r'[a-fA-F0-9]{32,}', ignore_case),
                pattern_type="encoded_data",
                description="Hex Encoded Data",
                confidence=0.3,
                context_required=True,
            ),
            SecurityPattern(
                name="uuid",
                pattern=re.compile(
                    r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
                    ignore_case,
                ),
                pattern_type="identifier",
                description="UUID",
                confidence=0.2,
                context_required=True,
            ),
            # URLs with embedded credentials
            SecurityPattern(
                name="url_with_credentials",
                pattern=re.compile(r'https?://[^:/\s]+:[^@/\s]+@[^\s]+', ignore_case),
                pattern_type="credential_url",
                description="URL with embedded credentials",
                confidence=0.9,
            ),
        ]

    def get_patterns_by_type(self, pattern_type: str) -> List[SecurityPattern]:
        """Return all patterns whose ``pattern_type`` equals the argument."""
        return [p for p in self.patterns if p.pattern_type == pattern_type]

    def get_high_confidence_patterns(self, min_confidence: float = 0.8) -> List[SecurityPattern]:
        """Return patterns whose confidence is at least ``min_confidence``."""
        return [p for p in self.patterns if p.confidence >= min_confidence]

    def get_context_sensitive_patterns(self) -> List[SecurityPattern]:
        """Return patterns flagged with ``context_required``."""
        return [p for p in self.patterns if p.context_required]

    def find_matches(self, text: str, min_confidence: float = 0.5) -> List[PatternMatch]:
        """Find all pattern matches in ``text`` at or above a confidence.

        Args:
            text: Text to scan.
            min_confidence: Patterns with a lower confidence are skipped.

        Returns:
            Matches sorted by start offset. Hits from different patterns
            may overlap; no de-duplication is performed here.
        """
        matches: List[PatternMatch] = []

        for pattern in self.patterns:
            if pattern.confidence < min_confidence:
                continue

            for match in pattern.pattern.finditer(text):
                # Context-sensitive patterns only count when a suspicious
                # keyword occurs in the surrounding text.
                if pattern.context_required and not self._has_suspicious_context(
                    text, match.start(), match.end()
                ):
                    continue

                matches.append(PatternMatch(
                    pattern_name=pattern.name,
                    pattern_type=pattern.pattern_type,
                    start_pos=match.start(),
                    end_pos=match.end(),
                    matched_text=match.group(),
                    confidence=pattern.confidence,
                ))

        # Sort by position for consistent output
        return sorted(matches, key=lambda m: m.start_pos)

    def _has_suspicious_context(self, text: str, start: int, end: int, context_size: int = 50) -> bool:
        """Report whether a suspicious keyword appears around a match.

        The inspected window runs from ``context_size`` characters before
        ``start`` to ``context_size`` characters after ``end``, clamped to
        the text bounds. It includes the matched text itself, and the
        comparison is done on the lower-cased window.
        """
        context_start = max(0, start - context_size)
        context_end = min(len(text), end + context_size)
        context = text[context_start:context_end].lower()

        return any(keyword in context for keyword in self._SUSPICIOUS_KEYWORDS)

    def get_pattern_summary(self) -> Dict[str, int]:
        """Return the number of patterns per ``pattern_type``."""
        summary: Dict[str, int] = {}
        for pattern in self.patterns:
            summary[pattern.pattern_type] = summary.get(pattern.pattern_type, 0) + 1
        return summary