mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in its public registry. It is provided for informational purposes only.
- mcp_code_indexer/database/models.py +125 -1
- mcp_code_indexer/main.py +60 -0
- mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
- mcp_code_indexer/server/mcp_server.py +3 -0
- mcp_code_indexer/vector_mode/__init__.py +36 -0
- mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
- mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
- mcp_code_indexer/vector_mode/config.py +167 -0
- mcp_code_indexer/vector_mode/daemon.py +335 -0
- mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
- mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
- mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
- mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
- mcp_code_indexer/vector_mode/security/__init__.py +11 -0
- mcp_code_indexer/vector_mode/security/patterns.py +297 -0
- mcp_code_indexer/vector_mode/security/redactor.py +368 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
mcp_code_indexer/vector_mode/security/redactor.py
@@ -0,0 +1,368 @@
+"""
+Secret redaction engine for vector mode.
+
+Detects and redacts sensitive information from code before sending
+to external APIs for embedding generation.
+"""
+
+import hashlib
+import logging
+import re
+from typing import List, Dict, Any, Optional, Set, NamedTuple
+from dataclasses import dataclass
+from pathlib import Path
+
+from .patterns import SecurityPatterns, PatternMatch
+
+logger = logging.getLogger(__name__)
+
+class RedactionResult(NamedTuple):
+    """Result of secret redaction process."""
+    original_hash: str
+    redacted_content: str
+    redaction_count: int
+    patterns_matched: List[str]
+    confidence_scores: List[float]
+    was_redacted: bool
+
+@dataclass
+class RedactionStats:
+    """Statistics about redaction operations."""
+    total_files_processed: int = 0
+    total_redactions: int = 0
+    redactions_by_type: Dict[str, int] = None
+    redactions_by_pattern: Dict[str, int] = None
+    high_confidence_redactions: int = 0
+
+    def __post_init__(self):
+        if self.redactions_by_type is None:
+            self.redactions_by_type = {}
+        if self.redactions_by_pattern is None:
+            self.redactions_by_pattern = {}
+
+class SecretRedactor:
+    """
+    Main secret redaction engine.
+
+    Scans code content for secrets and replaces them with safe placeholders
+    while preserving code structure and semantics.
+    """
+
+    def __init__(
+        self,
+        min_confidence: float = 0.5,
+        preserve_structure: bool = True,
+        redaction_marker: str = "[REDACTED]",
+        custom_patterns_file: Optional[Path] = None,
+    ):
+        """
+        Initialize the secret redactor.
+
+        Args:
+            min_confidence: Minimum confidence threshold for redaction
+            preserve_structure: Whether to preserve code structure
+            redaction_marker: Marker to use for redacted content
+            custom_patterns_file: Path to custom patterns file
+        """
+        self.min_confidence = min_confidence
+        self.preserve_structure = preserve_structure
+        self.redaction_marker = redaction_marker
+
+        # Load security patterns
+        self.patterns = SecurityPatterns()
+
+        # Load custom patterns if provided
+        if custom_patterns_file and custom_patterns_file.exists():
+            self._load_custom_patterns(custom_patterns_file)
+
+        # Statistics tracking
+        self.stats = RedactionStats()
+
+        # Common safe file extensions (no redaction needed)
+        self.safe_extensions = {
+            '.md', '.txt', '.rst', '.json', '.yaml', '.yml',
+            '.xml', '.html', '.css', '.svg', '.license'
+        }
+
+        # Cache for performance
+        self._pattern_cache: Dict[str, List[PatternMatch]] = {}
+
+    def _load_custom_patterns(self, patterns_file: Path) -> None:
+        """Load custom security patterns from file."""
+        try:
+            # TODO: Implement custom pattern loading
+            logger.info(f"Custom patterns file specified but not yet implemented: {patterns_file}")
+        except Exception as e:
+            logger.warning(f"Failed to load custom patterns from {patterns_file}: {e}")
+
+    def _should_process_file(self, file_path: str) -> bool:
+        """Determine if a file should be processed for redaction."""
+        path = Path(file_path)
+
+        # Skip safe file types
+        if path.suffix.lower() in self.safe_extensions:
+            return False
+
+        # Skip test files (often contain mock data)
+        if any(test_marker in path.name.lower() for test_marker in ['test', 'spec', 'mock']):
+            return False
+
+        # Skip documentation directories
+        if any(doc_dir in path.parts for doc_dir in ['docs', 'documentation', 'examples']):
+            return False
+
+        return True
+
+    def _generate_content_hash(self, content: str) -> str:
+        """Generate SHA-256 hash of content."""
+        return hashlib.sha256(content.encode('utf-8')).hexdigest()
+
+    def _create_redaction_marker(
+        self,
+        pattern_type: str,
+        pattern_name: str,
+        original_length: int
+    ) -> str:
+        """Create a redaction marker that preserves structure."""
+        if not self.preserve_structure:
+            return self.redaction_marker
+
+        # Create a marker that maintains similar length
+        base_marker = f"[REDACTED:{pattern_type.upper()}]"
+
+        if original_length <= len(base_marker):
+            return base_marker[:original_length]
+
+        # Pad with safe characters to maintain length
+        padding = 'X' * (original_length - len(base_marker))
+        return base_marker + padding
+
+    def _redact_matches(self, content: str, matches: List[PatternMatch]) -> str:
+        """Apply redactions to content based on matches."""
+        if not matches:
+            return content
+
+        # Sort matches by position (reverse order for safe replacement)
+        sorted_matches = sorted(matches, key=lambda m: m.start_pos, reverse=True)
+
+        redacted_content = content
+
+        for match in sorted_matches:
+            # Create appropriate redaction marker
+            marker = self._create_redaction_marker(
+                match.pattern_type,
+                match.pattern_name,
+                len(match.matched_text)
+            )
+
+            # Replace the matched text
+            redacted_content = (
+                redacted_content[:match.start_pos] +
+                marker +
+                redacted_content[match.end_pos:]
+            )
+
+            # Update statistics
+            self.stats.total_redactions += 1
+            self.stats.redactions_by_type[match.pattern_type] = (
+                self.stats.redactions_by_type.get(match.pattern_type, 0) + 1
+            )
+            self.stats.redactions_by_pattern[match.pattern_name] = (
+                self.stats.redactions_by_pattern.get(match.pattern_name, 0) + 1
+            )
+
+            if match.confidence >= 0.8:
+                self.stats.high_confidence_redactions += 1
+
+            logger.debug(
+                f"Redacted {match.pattern_name} (confidence: {match.confidence:.2f}): "
+                f"{match.matched_text[:20]}..."
+            )
+
+        return redacted_content
+
+    def _filter_false_positives(
+        self,
+        matches: List[PatternMatch],
+        content: str,
+        file_path: Optional[str] = None
+    ) -> List[PatternMatch]:
+        """Filter out likely false positives based on context."""
+        filtered_matches = []
+
+        for match in matches:
+            # Skip very short matches for low-confidence patterns
+            if match.confidence < 0.7 and len(match.matched_text) < 16:
+                continue
+
+            # Skip matches that look like placeholders
+            if self._looks_like_placeholder(match.matched_text):
+                continue
+
+            # Skip matches in comments (for code files)
+            if file_path and self._is_in_comment(content, match.start_pos, file_path):
+                continue
+
+            # Skip matches that are likely examples or documentation
+            if self._is_example_content(content, match.start_pos):
+                continue
+
+            filtered_matches.append(match)
+
+        return filtered_matches
+
+    def _looks_like_placeholder(self, text: str) -> bool:
+        """Check if text looks like a placeholder rather than real secret."""
+        placeholder_indicators = [
+            'example', 'sample', 'test', 'demo', 'placeholder', 'your',
+            'xxxxx', '11111', '00000', 'aaaaa', 'abcdef',
+            'replace', 'insert', 'enter', 'put'
+        ]
+
+        text_lower = text.lower()
+        return any(indicator in text_lower for indicator in placeholder_indicators)
+
+    def _is_in_comment(self, content: str, position: int, file_path: str) -> bool:
+        """Check if position is within a comment."""
+        # Get file extension for comment style detection
+        ext = Path(file_path).suffix.lower()
+
+        # Find line containing the position
+        lines = content[:position].split('\n')
+        current_line = content.split('\n')[len(lines) - 1] if lines else ""
+
+        # Check for common comment patterns
+        comment_patterns = {
+            '.py': [r'#.*', r'""".*?"""', r"'''.*?'''"],
+            '.js': [r'//.*', r'/\*.*?\*/'],
+            '.java': [r'//.*', r'/\*.*?\*/'],
+            '.cpp': [r'//.*', r'/\*.*?\*/'],
+            '.c': [r'//.*', r'/\*.*?\*/'],
+            '.go': [r'//.*', r'/\*.*?\*/'],
+            '.rs': [r'//.*', r'/\*.*?\*/'],
+            '.sh': [r'#.*'],
+            '.sql': [r'--.*', r'/\*.*?\*/'],
+        }
+
+        patterns = comment_patterns.get(ext, [])
+        for pattern in patterns:
+            if re.search(pattern, current_line, re.DOTALL):
+                return True
+
+        return False
+
+    def _is_example_content(self, content: str, position: int, context_size: int = 100) -> bool:
+        """Check if content around position suggests it's example/documentation."""
+        start = max(0, position - context_size)
+        end = min(len(content), position + context_size)
+        context = content[start:end].lower()
+
+        example_indicators = [
+            'example', 'sample', 'demo', 'tutorial', 'documentation',
+            'readme', 'how to', 'getting started', 'quickstart'
+        ]
+
+        return any(indicator in context for indicator in example_indicators)
+
+    def redact_content(
+        self,
+        content: str,
+        file_path: Optional[str] = None,
+        cache_key: Optional[str] = None
+    ) -> RedactionResult:
+        """
+        Redact secrets from content.
+
+        Args:
+            content: Content to redact
+            file_path: Path to file (for context)
+            cache_key: Optional cache key for performance
+
+        Returns:
+            RedactionResult with original hash and redacted content
+        """
+        # Generate hash of original content
+        original_hash = self._generate_content_hash(content)
+
+        # Check if file should be processed
+        if file_path and not self._should_process_file(file_path):
+            logger.debug(f"Skipping redaction for safe file: {file_path}")
+            return RedactionResult(
+                original_hash=original_hash,
+                redacted_content=content,
+                redaction_count=0,
+                patterns_matched=[],
+                confidence_scores=[],
+                was_redacted=False
+            )
+
+        # Check cache if available
+        if cache_key and cache_key in self._pattern_cache:
+            matches = self._pattern_cache[cache_key]
+        else:
+            # Find all pattern matches
+            matches = self.patterns.find_matches(content, self.min_confidence)
+
+            # Filter false positives
+            matches = self._filter_false_positives(matches, content, file_path)
+
+            # Cache results
+            if cache_key:
+                self._pattern_cache[cache_key] = matches
+
+        # Apply redactions
+        redacted_content = self._redact_matches(content, matches)
+
+        # Update statistics
+        self.stats.total_files_processed += 1
+
+        # Log redaction summary
+        if matches:
+            logger.info(
+                f"Redacted {len(matches)} secrets from {file_path or 'content'} "
+                f"(confidence range: {min(m.confidence for m in matches):.2f}-"
+                f"{max(m.confidence for m in matches):.2f})"
+            )
+
+        return RedactionResult(
+            original_hash=original_hash,
+            redacted_content=redacted_content,
+            redaction_count=len(matches),
+            patterns_matched=[m.pattern_name for m in matches],
+            confidence_scores=[m.confidence for m in matches],
+            was_redacted=len(matches) > 0
+        )
+
+    def redact_file(self, file_path: Path) -> RedactionResult:
+        """Redact secrets from a file."""
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+
+            return self.redact_content(content, str(file_path))
+
+        except Exception as e:
+            logger.error(f"Failed to redact file {file_path}: {e}")
+            return RedactionResult(
+                original_hash="",
+                redacted_content="",
+                redaction_count=0,
+                patterns_matched=[],
+                confidence_scores=[],
+                was_redacted=False
+            )
+
+    def get_redaction_stats(self) -> RedactionStats:
+        """Get redaction statistics."""
+        return self.stats
+
+    def clear_cache(self) -> None:
+        """Clear the pattern match cache."""
+        self._pattern_cache.clear()
+
+    def set_confidence_threshold(self, threshold: float) -> None:
+        """Update the confidence threshold for redaction."""
+        if 0.0 <= threshold <= 1.0:
+            self.min_confidence = threshold
+        else:
+            raise ValueError("Confidence threshold must be between 0.0 and 1.0")
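For reviewers unfamiliar with the new module, here is a minimal usage sketch of the `SecretRedactor` API added above. It uses only names visible in this diff; the sample string, token, file path, and threshold are illustrative, and whether a given string is actually flagged depends on the patterns shipped in `SecurityPatterns`.

```python
# Illustrative only: exercise the SecretRedactor added in this release.
# The fake token and file path below are made up for the example.
from mcp_code_indexer.vector_mode.security.redactor import SecretRedactor

source = 'API_TOKEN = "sk-FAKE-20250101deadbeefcafef00d4242"\nprint("hello")\n'

redactor = SecretRedactor(min_confidence=0.6, preserve_structure=True)
result = redactor.redact_content(source, file_path="app/settings.py")

print("redacted?", result.was_redacted, "matches:", result.redaction_count)
if result.was_redacted:
    # Structure-preserving markers look like "[REDACTED:<TYPE>]" padded with
    # 'X' characters so the replacement keeps the original secret's length.
    print(result.redacted_content)
    print("patterns:", result.patterns_matched)
print("original sha256:", result.original_hash)
```

Note that `_should_process_file()` deliberately skips test, spec, and mock files as well as documentation directories, so a path such as `tests/test_settings.py` is returned unredacted.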
{mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA
@@ -1,12 +1,12 @@
 Metadata-Version: 2.3
 Name: mcp-code-indexer
-Version: 4.0
+Version: 4.2.0
 Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
 License: MIT
 Keywords: mcp,model-context-protocol,code-indexer,ai-tools,codebase-navigation,file-descriptions,llm-tools
 Author: MCP Code Indexer Contributors
 Maintainer: MCP Code Indexer Contributors
-Requires-Python: >=3.
+Requires-Python: >=3.10,<4.0
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Console
 Classifier: Framework :: AsyncIO
@@ -14,15 +14,16 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.9
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Typing :: Typed
+Provides-Extra: vector
 Requires-Dist: aiofiles (==23.2.0)
 Requires-Dist: aiohttp (>=3.8.0)
 Requires-Dist: aiosqlite (==0.19.0)
@@ -43,8 +44,8 @@ Description-Content-Type: text/markdown
 
 # MCP Code Indexer 🚀
 
-[](https://badge.fury.io/py/mcp-code-indexer)
+[](https://pypi.org/project/mcp-code-indexer/)
 [](https://opensource.org/licenses/MIT)
 
 A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -197,6 +198,66 @@ The git hook integration provides intelligent automation:
 
 **Learn More**: See [Git Hook Setup Guide](docs/git-hook-setup.md) for complete configuration options and troubleshooting.
 
+## 🧠 Vector Mode (BETA)
+
+🚀 **NEW Feature**: Semantic code search with vector embeddings! Experience AI-powered code discovery that understands context and meaning, not just keywords.
+
+### 🎯 What is Vector Mode?
+
+Vector Mode transforms how you search and understand codebases by using AI embeddings:
+
+- **🔍 Semantic Search**: Find code by meaning, not just text matching
+- **⚡ Real-time Indexing**: Automatic embedding generation as code changes
+- **🛡️ Secure by Default**: Comprehensive secret redaction before API calls
+- **🌐 Multi-language**: Python, JavaScript, TypeScript with AST-based chunking
+- **📊 Smart Chunking**: Context-aware code segmentation for optimal embeddings
+
+### 🚀 Quick Start
+
+```bash
+# Install vector mode dependencies
+pip install mcp-code-indexer[vector]
+
+# Set required API keys
+export VOYAGE_API_KEY="pa-your-voyage-api-key"
+export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
+
+# Optional: Configure region (default: gcp-europe-west3)
+export TURBOPUFFER_REGION="gcp-europe-west3"
+
+# Start with vector mode enabled
+mcp-code-indexer --vector
+
+# The daemon automatically starts and begins indexing your projects
+```
+
+### 💡 Key Features
+
+- **🔐 Secret Redaction**: 20+ pattern types automatically detected and redacted
+- **🌳 Merkle Trees**: Efficient change detection without full directory scans
+- **🎛️ Circuit Breakers**: Resilient API integration with automatic retry logic
+- **📈 Production Ready**: Built for high-concurrency with comprehensive monitoring
+
+### 🔧 Advanced Configuration
+
+```bash
+# Custom configuration
+mcp-code-indexer --vector --vector-config /path/to/config.yaml
+
+# HTTP mode with vector search
+mcp-code-indexer --vector --http --port 8080
+```
+
+### 🛠️ Architecture
+
+Vector Mode adds powerful new MCP tools:
+- `vector_search` - Semantic code search across projects
+- `similarity_search` - Find similar code patterns
+- `dependency_search` - Discover code relationships
+- `vector_status` - Monitor indexing progress
+
+**Status**: Currently in BETA - foundations implemented, full pipeline in development.
+
 ## 🔧 Development Setup
 
 ### 👨‍💻 For Contributors
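The Merkle-tree change detection called out in that README section is implemented in `mcp_code_indexer/vector_mode/monitoring/merkle_tree.py`, which this diff lists but does not expand. As orientation only, the following is a small sketch of the general technique, not of the package's implementation; every helper name here is hypothetical.

```python
"""Illustrative sketch of Merkle-style change detection (not the package's code)."""
import hashlib
from pathlib import Path


def file_hash(path: Path) -> str:
    """Hash a file's bytes; this becomes a leaf of the tree."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(65536), b""):
            h.update(block)
    return h.hexdigest()


def tree_hashes(root: Path) -> dict[str, str]:
    """Map every file and directory under root to its Merkle hash."""
    hashes: dict[str, str] = {}

    def walk(directory: Path) -> str:
        entries = []
        for child in sorted(directory.iterdir()):
            digest = walk(child) if child.is_dir() else file_hash(child)
            hashes[str(child)] = digest
            entries.append(f"{child.name}:{digest}")
        # A directory's hash covers the sorted (name, hash) pairs of its children.
        digest = hashlib.sha256("\n".join(entries).encode()).hexdigest()
        hashes[str(directory)] = digest
        return digest

    walk(root)
    return hashes


def changed_paths(old: dict[str, str], new: dict[str, str]) -> set[str]:
    """Paths that were added, removed, or modified between two snapshots."""
    return {path for path in old.keys() | new.keys() if old.get(path) != new.get(path)}
```

The appeal of the structure is that two snapshots with identical root hashes need no further work, and when roots differ only the subtrees whose digests changed have to be rewalked; comparing a stored snapshot against a fresh one yields exactly the paths that need re-chunking and re-embedding, which is how a watcher can avoid full directory scans.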
{mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD
@@ -11,7 +11,7 @@ mcp_code_indexer/database/connection_health.py,sha256=jZr3tCbfjUJujdXe_uxtm1N4c3
 mcp_code_indexer/database/database.py,sha256=bBjQoa1cH8iAHZxnv157229yBCZhcZre4g6QtBvl6Dk,47321
 mcp_code_indexer/database/database_factory.py,sha256=zm942m72mqCYTGh1GFyVw-hBsbZcZnx3znJ2ZQPwISM,4316
 mcp_code_indexer/database/exceptions.py,sha256=bamoC-ssw_TMRA5-6lzX6d_1DlcXXrcmiCMBdUEQ9dI,10479
-mcp_code_indexer/database/models.py,sha256=
+mcp_code_indexer/database/models.py,sha256=OBIHmggY7BK-b9xak616YwttGS8pr8fY2DnAdP5JqYY,13181
 mcp_code_indexer/database/path_resolver.py,sha256=1Ubx6Ly5F2dnvhbdN3tqyowBHslABXpoA6wgL4BQYGo,3461
 mcp_code_indexer/database/retry_executor.py,sha256=6Hb0BM2BO6fl7sTIHtHFcwgV93W22eOrFvexYtFpa0k,13966
 mcp_code_indexer/deepask_handler.py,sha256=qI9h_Me5WQAbt3hzzDG8XDBMZlnvx-I9R7OsmO_o8aA,18497
@@ -19,7 +19,7 @@ mcp_code_indexer/error_handler.py,sha256=ylciEM-cR7E8Gmd8cfh5olcllJm0FnaYBGH86ya
 mcp_code_indexer/file_scanner.py,sha256=7Ab34lRQGeh5GBCzcSP96p4YK6LDWFGUHLXqi499UZ4,11838
 mcp_code_indexer/git_hook_handler.py,sha256=sTtZV3-Yy1Evt06R5NZclELeepM4Ia9OQoR2O6BK3Hk,45517
 mcp_code_indexer/logging_config.py,sha256=M5eVZ5PwfTROib7ISTQ522n2hUSc4hJ_wUgsrJKsTTg,10030
-mcp_code_indexer/main.py,sha256=
+mcp_code_indexer/main.py,sha256=tdUEcTVLweLmrG49TReGAl1nBf0vnzCIa7NSg6IPPec,37137
 mcp_code_indexer/middleware/__init__.py,sha256=UCEPzOlZldlqFzYEfrXw1HvCDvY1jpLvyaDGUzVr2aw,368
 mcp_code_indexer/middleware/auth.py,sha256=4HkHMDZBNsyPA1VE8qF7pRNKbqG4xIDZjllENbgynxI,7258
 mcp_code_indexer/middleware/error_middleware.py,sha256=0RnKM5fK_n_7AITK2ueAqv30kLBdjU3vaWOTwWd2Xs0,11965
@@ -30,9 +30,10 @@ mcp_code_indexer/migrations/002_performance_indexes.sql,sha256=-J6Ce3nyF8pJ2hN5Z
 mcp_code_indexer/migrations/003_project_overviews.sql,sha256=pPzn7UmJ_Bda9mJ1nYTN1GeuYwdQHC7Fva6PvWaucUw,891
 mcp_code_indexer/migrations/004_remove_branch_dependency.sql,sha256=WrWl3_17_1s9zVEN4dESLhDr6ezVF18dpGaeab4eJGs,6451
 mcp_code_indexer/migrations/005_remove_git_remotes.sql,sha256=vT84AaV1hyN4zq5W67hR14TgAwhW7_RNtBHrCoksxA4,1299
+mcp_code_indexer/migrations/006_vector_mode.sql,sha256=kN-UBPGoagqtpxpGEjdz-V3hevPAXxAdNmxF4iIPsY8,7448
 mcp_code_indexer/query_preprocessor.py,sha256=vi23sK2ffs4T5PGY7lHrbCBDL421AlPz2dldqX_3JKA,5491
 mcp_code_indexer/server/__init__.py,sha256=16xMcuriUOBlawRqWNBk6niwrvtv_JD5xvI36X1Vsmk,41
-mcp_code_indexer/server/mcp_server.py,sha256=
+mcp_code_indexer/server/mcp_server.py,sha256=wFfsnrd5OOdzi4gmkVjggKBMIT6CYGcjWFClUguR9zo,75935
 mcp_code_indexer/tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4,sha256=Ijkht27pm96ZW3_3OFE-7xAPtR0YyTWXoRO8_-hlsqc,1681126
 mcp_code_indexer/token_counter.py,sha256=e6WsyCEWMMSkMwLbcVtr5e8vEqh-kFqNmiJErCNdqHE,8220
 mcp_code_indexer/tools/__init__.py,sha256=m01mxML2UdD7y5rih_XNhNSCMzQTz7WQ_T1TeOcYlnE,49
@@ -40,8 +41,25 @@ mcp_code_indexer/transport/__init__.py,sha256=OrdabRjO3EChnTZ06IIFxO6imRQ3PwtRtJ
 mcp_code_indexer/transport/base.py,sha256=Lb3IrL8wr1QvUQpx1GyBQW5bvDpJoUFyXWNAw0dbOK4,3258
 mcp_code_indexer/transport/http_transport.py,sha256=iDxW8CXEIPlpyOtSFkU1qw1FtbKbXgCvJXfVtlXbzIo,13291
 mcp_code_indexer/transport/stdio_transport.py,sha256=a-Pu3usx_NwkHsN2VU8Qe0EwcA7PGL54Gbu2Ee8e0lU,4792
-mcp_code_indexer
-mcp_code_indexer
-mcp_code_indexer
-mcp_code_indexer
-mcp_code_indexer
+mcp_code_indexer/vector_mode/__init__.py,sha256=78RDdsmZGMcCBqaXxRslgzSrY-A9pLbmlb0bnnMXuVo,1064
+mcp_code_indexer/vector_mode/chunking/__init__.py,sha256=rjjFMbHsqWIBzL4IajYxXXJud_RvBrpFNjVcxnRIWCE,490
+mcp_code_indexer/vector_mode/chunking/ast_chunker.py,sha256=GTl_6U0nSgDRRzKS07tJ7RMX8AmJvvY_IsRn95hvVfA,14623
+mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py,sha256=xD0zEibjt6FLBFaKHNc63-iKTtCgnOlLL_9Hc8mCrzE,19752
+mcp_code_indexer/vector_mode/chunking/language_handlers.py,sha256=YEpTVjzyJH445OjniGV05apexsfG5KVR4lwBEl4mGJc,18189
+mcp_code_indexer/vector_mode/config.py,sha256=g5p9Q4EAR20DfLv4RxaQnk3_UdysuvWS8rcsjs1vgwI,6680
+mcp_code_indexer/vector_mode/daemon.py,sha256=le3NkxFD73bKeutruzLY-Bauc-nXzlhlIlDJv4jlxhU,12096
+mcp_code_indexer/vector_mode/monitoring/__init__.py,sha256=9rNWCvHxRMvYumdIrPjb5K9fpOwe1Aem24hdh8gXoDM,439
+mcp_code_indexer/vector_mode/monitoring/change_detector.py,sha256=X82e_sKbJJFPhqZFJubLQb8Rs-srRtS7sh0nUOsPCPw,10338
+mcp_code_indexer/vector_mode/monitoring/file_watcher.py,sha256=AQ6YHSKXPubtprLZngeLb0othJOCNQZ7wwXUvqwphT4,15299
+mcp_code_indexer/vector_mode/monitoring/merkle_tree.py,sha256=83RLdUj_cgcAlrT9Wev9IBavVEyc8Jo8w--IOJisLOk,14645
+mcp_code_indexer/vector_mode/providers/__init__.py,sha256=0GhPHn7XEBSHa6bLvy8j0Eqvto82o6Bs2hZCrHawLus,514
+mcp_code_indexer/vector_mode/providers/turbopuffer_client.py,sha256=NdBAghmaRUUIGFZOTOZYhYyXvv_QB36lieGQjVlLEno,7599
+mcp_code_indexer/vector_mode/providers/voyage_client.py,sha256=pfm9BOx5Temf0LM-VZ4LH6xwBmZ6XO8XeCSiSZ5LU80,4375
+mcp_code_indexer/vector_mode/security/__init__.py,sha256=itfeuysSqV-m9xuo-CMkAoucxexVfPgeOU-ieTLvdls,336
+mcp_code_indexer/vector_mode/security/patterns.py,sha256=0xaiMnZm7YXswq3hVe_DJYePE9MhWuvizApLnmXus9M,11572
+mcp_code_indexer/vector_mode/security/redactor.py,sha256=tsFzhCJ99bp4EFqQVjZ-4f8Uf3ux9X4ODVR09oJG01U,13380
+mcp_code_indexer-4.2.0.dist-info/LICENSE,sha256=JN9dyPPgYwH9C-UjYM7FLNZjQ6BF7kAzpF3_4PwY4rY,1086
+mcp_code_indexer-4.2.0.dist-info/METADATA,sha256=mm_AhqMQmDSxtvmImTb7i9oCNuXlTjPduo10_nyBREI,27325
+mcp_code_indexer-4.2.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+mcp_code_indexer-4.2.0.dist-info/entry_points.txt,sha256=UABj7HZ0mC6rvF22gxaz2LLNLGQShTrFmp5u00iUtvo,67
+mcp_code_indexer-4.2.0.dist-info/RECORD,,

{mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE: File without changes
{mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL: File without changes
{mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt: File without changes