mcp-code-indexer 4.0.1__py3-none-any.whl → 4.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_code_indexer/__init__.py +7 -5
- mcp_code_indexer/ask_handler.py +2 -2
- mcp_code_indexer/claude_api_handler.py +10 -5
- mcp_code_indexer/cleanup_manager.py +20 -12
- mcp_code_indexer/commands/makelocal.py +85 -63
- mcp_code_indexer/data/stop_words_english.txt +1 -1
- mcp_code_indexer/database/connection_health.py +29 -20
- mcp_code_indexer/database/database.py +44 -31
- mcp_code_indexer/database/database_factory.py +19 -20
- mcp_code_indexer/database/exceptions.py +10 -10
- mcp_code_indexer/database/models.py +126 -1
- mcp_code_indexer/database/path_resolver.py +22 -21
- mcp_code_indexer/database/retry_executor.py +37 -19
- mcp_code_indexer/deepask_handler.py +3 -3
- mcp_code_indexer/error_handler.py +46 -20
- mcp_code_indexer/file_scanner.py +15 -12
- mcp_code_indexer/git_hook_handler.py +71 -76
- mcp_code_indexer/logging_config.py +13 -5
- mcp_code_indexer/main.py +85 -22
- mcp_code_indexer/middleware/__init__.py +1 -1
- mcp_code_indexer/middleware/auth.py +47 -43
- mcp_code_indexer/middleware/error_middleware.py +15 -15
- mcp_code_indexer/middleware/logging.py +44 -42
- mcp_code_indexer/middleware/security.py +84 -76
- mcp_code_indexer/migrations/002_performance_indexes.sql +1 -1
- mcp_code_indexer/migrations/004_remove_branch_dependency.sql +14 -14
- mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
- mcp_code_indexer/query_preprocessor.py +2 -2
- mcp_code_indexer/server/mcp_server.py +158 -94
- mcp_code_indexer/transport/__init__.py +1 -1
- mcp_code_indexer/transport/base.py +19 -17
- mcp_code_indexer/transport/http_transport.py +89 -76
- mcp_code_indexer/transport/stdio_transport.py +12 -8
- mcp_code_indexer/vector_mode/__init__.py +36 -0
- mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
- mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
- mcp_code_indexer/vector_mode/config.py +155 -0
- mcp_code_indexer/vector_mode/daemon.py +335 -0
- mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
- mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
- mcp_code_indexer/vector_mode/providers/__init__.py +72 -0
- mcp_code_indexer/vector_mode/providers/base_provider.py +230 -0
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +338 -0
- mcp_code_indexer/vector_mode/providers/voyage_client.py +212 -0
- mcp_code_indexer/vector_mode/security/__init__.py +11 -0
- mcp_code_indexer/vector_mode/security/patterns.py +297 -0
- mcp_code_indexer/vector_mode/security/redactor.py +368 -0
- {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/METADATA +82 -24
- mcp_code_indexer-4.1.0.dist-info/RECORD +66 -0
- mcp_code_indexer-4.0.1.dist-info/RECORD +0 -47
- {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/LICENSE +0 -0
- {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/WHEEL +0 -0
- {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Voyage AI client for embedding generation.
|
|
3
|
+
|
|
4
|
+
Provides integration with Voyage AI's embedding API for generating
|
|
5
|
+
high-quality code embeddings using the voyage-code-2 model.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import List, Dict, Any, Optional, Union
|
|
10
|
+
import tiktoken
|
|
11
|
+
|
|
12
|
+
from .base_provider import BaseProvider, ProviderError
|
|
13
|
+
from ..config import VectorConfig
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
class VoyageClient(BaseProvider):
    """Client for Voyage AI embedding generation.

    Submits texts to the Voyage AI ``/embeddings`` endpoint, batching them
    so each request stays under the API token limit, and caches the
    detected embedding dimension after the first successful probe.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "voyage-code-2",
        base_url: str = "https://api.voyageai.com/v1",
        **kwargs
    ):
        """Initialize the client.

        Args:
            api_key: Voyage AI API key.
            model: Embedding model to use (default "voyage-code-2").
            base_url: Base URL of the Voyage AI API.
            **kwargs: Forwarded to BaseProvider (e.g. timeout, max_retries).
        """
        super().__init__(api_key, base_url, **kwargs)
        self.model = model
        # Cached result of get_embedding_dimension(); populated lazily.
        self._embedding_dimension: Optional[int] = None

        # Note: Voyage AI uses a proprietary tokenizer, not tiktoken.
        # We use approximate counting and let the API handle truncation.
        self.tokenizer = None
        logger.info(
            "Using approximate token counting - Voyage AI handles tokenization internally"
        )

    async def health_check(self) -> bool:
        """Check if Voyage AI service is healthy.

        Returns:
            True when a minimal embedding request succeeds, False otherwise.
        """
        try:
            # Make a small test request; any failure means "unhealthy".
            await self.generate_embeddings(["test"], input_type="query")
            return True
        except Exception as e:
            logger.warning("Voyage AI health check failed: %s", e)
            return False

    def _count_tokens(self, text: str) -> int:
        """Approximate token count - Voyage AI handles exact tokenization.

        Rough heuristic of 4 characters per token (conservative); used only
        for batching estimates, never for billing or hard limits.
        """
        return len(text) // 4

    def _batch_texts_by_tokens(
        self,
        texts: List[str],
        max_tokens_per_batch: int = 120000  # Leave buffer under 128k limit
    ) -> List[List[str]]:
        """Batch texts to stay under token limits.

        Texts whose own estimate exceeds the budget are pre-truncated by
        character count; the Voyage API performs exact truncation
        server-side.

        Args:
            texts: Texts to split into batches (input order is preserved).
            max_tokens_per_batch: Approximate token budget per batch.

        Returns:
            List of batches, each a non-empty list of texts.
        """
        batches: List[List[str]] = []
        current_batch: List[str] = []
        current_tokens = 0

        for text in texts:
            text_tokens = self._count_tokens(text)

            # If a single text exceeds the limit, truncate it roughly by
            # characters and let the Voyage API do exact tokenization.
            if text_tokens > max_tokens_per_batch:
                target_chars = (max_tokens_per_batch - 100) * 4  # Conservative estimate
                text = text[:target_chars]
                text_tokens = self._count_tokens(text)

                logger.warning(
                    "Pre-truncated text to ~%d tokens (Voyage API will handle exact tokenization)",
                    text_tokens,
                )

            # Start a new batch when adding this text would overflow it.
            if current_tokens + text_tokens > max_tokens_per_batch and current_batch:
                batches.append(current_batch)
                current_batch = [text]
                current_tokens = text_tokens
            else:
                current_batch.append(text)
                current_tokens += text_tokens

        if current_batch:
            batches.append(current_batch)

        return batches

    async def generate_embeddings(
        self,
        texts: List[str],
        input_type: str = "document",
        truncation: bool = True,
        **kwargs
    ) -> List[List[float]]:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of texts to embed
            input_type: Type of input ("document" or "query")
            truncation: Whether to enable truncation
            **kwargs: Additional arguments

        Returns:
            List of embedding vectors, in the same order as ``texts``.

        Raises:
            ProviderError: If a request fails or the response is malformed.
        """
        if not texts:
            return []

        logger.info("Generating embeddings for %d texts using %s", len(texts), self.model)

        # Batch texts to stay under token limits.
        batches = self._batch_texts_by_tokens(texts)
        all_embeddings: List[List[float]] = []

        for i, batch in enumerate(batches):
            logger.debug("Processing batch %d/%d with %d texts", i + 1, len(batches), len(batch))

            request_data = {
                "input": batch,
                "model": self.model,
                "input_type": input_type,
                "truncation": truncation,
            }

            try:
                response = await self._make_request(
                    method="POST",
                    endpoint="/embeddings",
                    data=request_data,
                )

                # Extract embeddings from the response payload.
                if "data" not in response:
                    raise ProviderError("Invalid response format from Voyage AI")

                batch_embeddings = [item["embedding"] for item in response["data"]]
                all_embeddings.extend(batch_embeddings)

                # Log usage information if available.
                if "usage" in response:
                    usage = response["usage"]
                    logger.debug(
                        "Batch %d usage: %s tokens", i + 1, usage.get("total_tokens", 0)
                    )

            except ProviderError:
                # Already a meaningful provider error (e.g. bad response
                # shape) - re-raise as-is instead of double-wrapping it.
                logger.error("Failed to generate embeddings for batch %d", i + 1)
                raise
            except Exception as e:
                logger.error("Failed to generate embeddings for batch %d: %s", i + 1, e)
                # Chain the original exception so the root cause survives.
                raise ProviderError(f"Embedding generation failed: {e}") from e

        logger.info("Successfully generated %d embeddings", len(all_embeddings))
        return all_embeddings

    async def get_embedding_dimension(self) -> int:
        """Get the dimension of embeddings produced by this model.

        Probes the API once with a tiny request and caches the result;
        falls back to a table of known model dimensions when the probe
        fails or returns nothing.
        """
        if self._embedding_dimension is not None:
            return self._embedding_dimension

        # Generate a test embedding to determine the dimension empirically.
        try:
            test_embeddings = await self.generate_embeddings(["test"], input_type="query")
            if test_embeddings:
                self._embedding_dimension = len(test_embeddings[0])
                logger.info("Detected embedding dimension: %d", self._embedding_dimension)
                return self._embedding_dimension
        except Exception as e:
            logger.warning("Could not determine embedding dimension: %s", e)

        # Default dimensions for known Voyage models (as of 2024).
        # Note: these may change - verify with Voyage AI documentation.
        model_dimensions = {
            "voyage-code-2": 1536,  # Code-optimized model
            "voyage-2": 1024,  # General purpose
            "voyage-large-2": 1536,  # Large general purpose
            "voyage-3": 1024,  # Newer general purpose (if available)
        }

        self._embedding_dimension = model_dimensions.get(self.model, 1536)
        logger.info(
            "Using default dimension for %s: %d", self.model, self._embedding_dimension
        )
        return self._embedding_dimension

    async def generate_query_embedding(self, query: str) -> List[float]:
        """Generate a single embedding for a search query.

        Returns an empty list if the API returns no embeddings.
        """
        embeddings = await self.generate_embeddings([query], input_type="query")
        return embeddings[0] if embeddings else []

    async def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
        """Estimate the cost of embedding generation.

        Uses the approximate token count and hard-coded voyage-code-2
        pricing; treat the result as a rough estimate only.
        """
        total_tokens = sum(self._count_tokens(text) for text in texts)

        # Voyage AI pricing (approximate, may change).
        cost_per_1k_tokens = 0.00013  # voyage-code-2 pricing
        estimated_cost = (total_tokens / 1000) * cost_per_1k_tokens

        return {
            "total_tokens": total_tokens,
            "total_texts": len(texts),
            "estimated_cost_usd": round(estimated_cost, 6),
            "model": self.model,
        }
|
|
202
|
+
def create_voyage_client(config: VectorConfig) -> VoyageClient:
    """Build a VoyageClient from the vector-mode configuration.

    Args:
        config: Configuration carrying the API key and model selection.

    Returns:
        A configured VoyageClient with a 30s timeout and 3 retries.

    Raises:
        ValueError: If no Voyage API key is present in the configuration.
    """
    api_key = config.voyage_api_key
    if not api_key:
        raise ValueError("VOYAGE_API_KEY is required for embedding generation")

    client = VoyageClient(
        api_key=api_key,
        model=config.embedding_model,
        timeout=30.0,
        max_retries=3,
    )
    return client
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Security module for vector mode.
|
|
3
|
+
|
|
4
|
+
Provides secret redaction capabilities to prevent sensitive information
|
|
5
|
+
from being sent to external APIs for embedding generation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .redactor import SecretRedactor, RedactionResult
|
|
9
|
+
from .patterns import SecurityPatterns
|
|
10
|
+
|
|
11
|
+
__all__ = ["SecretRedactor", "RedactionResult", "SecurityPatterns"]
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Security patterns for detecting secrets in code.
|
|
3
|
+
|
|
4
|
+
Comprehensive collection of regex patterns to identify API keys, tokens,
|
|
5
|
+
passwords, connection strings, and other sensitive information.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from typing import List, Dict, Pattern, NamedTuple
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
# A single secret detected in scanned text: which pattern fired, its
# category, where in the input it matched, the matched substring, and the
# confidence the pattern assigns to a raw match.
PatternMatch = NamedTuple(
    "PatternMatch",
    [
        ("pattern_name", str),
        ("pattern_type", str),
        ("start_pos", int),
        ("end_pos", int),
        ("matched_text", str),
        ("confidence", float),
    ],
)
PatternMatch.__doc__ = """Represents a detected secret pattern match."""
|
|
21
|
+
@dataclass
class SecurityPattern:
    """Represents a security pattern with metadata."""
    # Unique identifier for the pattern, e.g. "aws_access_key".
    name: str
    # Pre-compiled regex used to scan text for this secret type.
    pattern: Pattern[str]
    # Category of the secret, e.g. "api_key", "connection_string",
    # "private_key", "password".
    pattern_type: str
    # Human-readable description of what the pattern detects.
    description: str
    # How likely a raw regex match is a real secret, in [0.0, 1.0].
    confidence: float = 1.0
    # When True, a match must also have suspicious surrounding context
    # (credential-related keywords nearby) before it is reported; used for
    # generic patterns that would otherwise over-match.
    context_required: bool = False
|
31
|
+
class SecurityPatterns:
    """Collection of security patterns for secret detection.

    Builds a fixed catalog of regex-based detectors (API keys, tokens,
    connection strings, private keys, env-style credentials, and generic
    encoded data) and offers filtered views plus a text scanner
    (:meth:`find_matches`) over that catalog.
    """

    def __init__(self):
        # Catalog is built once per instance; patterns are pre-compiled.
        self.patterns = self._build_patterns()

    def _build_patterns(self) -> List[SecurityPattern]:
        """Build comprehensive list of security patterns."""
        patterns: List[SecurityPattern] = []

        # API Keys and Tokens
        # NOTE(review): re.IGNORECASE is applied even to formats whose real
        # keys are case-sensitive (e.g. AWS "AKIA...", Google "AIza...") -
        # this broadens recall but may add false positives; confirm intended.
        patterns.extend([
            SecurityPattern(
                name="aws_access_key",
                pattern=re.compile(r'AKIA[0-9A-Z]{16}', re.IGNORECASE),
                pattern_type="api_key",
                description="AWS Access Key ID",
                confidence=0.95
            ),
            SecurityPattern(
                name="aws_secret_key",
                # Very generic 40-char pattern - context_required keeps it
                # from firing on arbitrary base64-like strings.
                pattern=re.compile(r'[A-Za-z0-9/+=]{40}', re.IGNORECASE),
                pattern_type="api_key",
                description="AWS Secret Access Key",
                confidence=0.7,
                context_required=True
            ),
            SecurityPattern(
                name="github_token",
                # Covers ghp_/gho_/ghu_/ghs_/ghr_ token prefixes.
                pattern=re.compile(r'gh[pousr]_[A-Za-z0-9_]{36,}', re.IGNORECASE),
                pattern_type="api_key",
                description="GitHub Token",
                confidence=0.95
            ),
            SecurityPattern(
                name="google_api_key",
                pattern=re.compile(r'AIza[0-9A-Za-z\-_]{35}', re.IGNORECASE),
                pattern_type="api_key",
                description="Google API Key",
                confidence=0.95
            ),
            SecurityPattern(
                name="slack_token",
                # xoxb-/xoxa-/xoxp-/xoxr-/xoxs- token families.
                pattern=re.compile(r'xox[baprs]-([0-9a-zA-Z]{10,48})', re.IGNORECASE),
                pattern_type="api_key",
                description="Slack Token",
                confidence=0.95
            ),
            SecurityPattern(
                name="stripe_key",
                # Matches secret (sk_) and restricted (rk_) keys, both
                # test and live mode.
                pattern=re.compile(r'[rs]k_(test|live)_[0-9a-zA-Z]{24}', re.IGNORECASE),
                pattern_type="api_key",
                description="Stripe API Key",
                confidence=0.95
            ),
            SecurityPattern(
                name="openai_api_key",
                pattern=re.compile(r'sk-[a-zA-Z0-9]{48}', re.IGNORECASE),
                pattern_type="api_key",
                description="OpenAI API Key",
                confidence=0.95
            ),
            SecurityPattern(
                name="anthropic_api_key",
                pattern=re.compile(r'sk-ant-api03-[a-zA-Z0-9\-_]{95}', re.IGNORECASE),
                pattern_type="api_key",
                description="Anthropic API Key",
                confidence=0.95
            ),
            SecurityPattern(
                name="voyage_api_key",
                pattern=re.compile(r'pa-[a-zA-Z0-9]{32}', re.IGNORECASE),
                pattern_type="api_key",
                description="Voyage AI API Key",
                confidence=0.95
            ),
        ])

        # JWT Tokens: three base64url segments, the first two starting with
        # "eyJ" (base64 of '{"').
        patterns.append(
            SecurityPattern(
                name="jwt_token",
                pattern=re.compile(r'eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*', re.IGNORECASE),
                pattern_type="token",
                description="JWT Token",
                confidence=0.9
            )
        )

        # Database Connection Strings - matched by URL scheme through the
        # next whitespace, so embedded credentials are captured too.
        patterns.extend([
            SecurityPattern(
                name="postgres_url",
                pattern=re.compile(r'postgres(?:ql)?://[^\s]+', re.IGNORECASE),
                pattern_type="connection_string",
                description="PostgreSQL Connection String",
                confidence=0.85
            ),
            SecurityPattern(
                name="mysql_url",
                pattern=re.compile(r'mysql://[^\s]+', re.IGNORECASE),
                pattern_type="connection_string",
                description="MySQL Connection String",
                confidence=0.85
            ),
            SecurityPattern(
                name="mongodb_url",
                pattern=re.compile(r'mongodb(?:\+srv)?://[^\s]+', re.IGNORECASE),
                pattern_type="connection_string",
                description="MongoDB Connection String",
                confidence=0.85
            ),
            SecurityPattern(
                name="redis_url",
                pattern=re.compile(r'redis://[^\s]+', re.IGNORECASE),
                pattern_type="connection_string",
                description="Redis Connection String",
                confidence=0.85
            ),
        ])

        # Private Keys - PEM blocks matched across lines via DOTALL; the
        # [^-]+ body stops at the first dash of the END marker.
        patterns.extend([
            SecurityPattern(
                name="rsa_private_key",
                pattern=re.compile(r'-----BEGIN RSA PRIVATE KEY-----[^-]+-----END RSA PRIVATE KEY-----', re.MULTILINE | re.DOTALL),
                pattern_type="private_key",
                description="RSA Private Key",
                confidence=1.0
            ),
            SecurityPattern(
                name="ssh_private_key",
                # NOTE(review): OpenSSH key bodies are base64 and may contain
                # no '-', which [^-]+ relies on; verify against real keys.
                pattern=re.compile(r'-----BEGIN OPENSSH PRIVATE KEY-----[^-]+-----END OPENSSH PRIVATE KEY-----', re.MULTILINE | re.DOTALL),
                pattern_type="private_key",
                description="SSH Private Key",
                confidence=1.0
            ),
            SecurityPattern(
                name="ec_private_key",
                pattern=re.compile(r'-----BEGIN EC PRIVATE KEY-----[^-]+-----END EC PRIVATE KEY-----', re.MULTILINE | re.DOTALL),
                pattern_type="private_key",
                description="EC Private Key",
                confidence=1.0
            ),
        ])

        # Environment Variable Patterns - "NAME = value" style assignments;
        # both use context_required to suppress benign matches.
        patterns.extend([
            SecurityPattern(
                name="env_password",
                pattern=re.compile(r'(?i)(password|passwd|pwd)\s*[=:]\s*["\']?[^\s"\']+["\']?', re.IGNORECASE),
                pattern_type="password",
                description="Environment Variable Password",
                confidence=0.7,
                context_required=True
            ),
            SecurityPattern(
                name="env_secret",
                pattern=re.compile(r'(?i)(secret|token|key)\s*[=:]\s*["\']?[^\s"\']+["\']?', re.IGNORECASE),
                pattern_type="secret",
                description="Environment Variable Secret",
                confidence=0.6,
                context_required=True
            ),
        ])

        # Generic Patterns (lower confidence) - deliberately broad; all are
        # gated by context_required and a low confidence score.
        patterns.extend([
            SecurityPattern(
                name="base64_encoded",
                pattern=re.compile(r'[A-Za-z0-9+/]{32,}={0,2}', re.IGNORECASE),
                pattern_type="encoded_data",
                description="Base64 Encoded Data",
                confidence=0.3,
                context_required=True
            ),
            SecurityPattern(
                name="hex_encoded",
                pattern=re.compile(r'[a-fA-F0-9]{32,}', re.IGNORECASE),
                pattern_type="encoded_data",
                description="Hex Encoded Data",
                confidence=0.3,
                context_required=True
            ),
            SecurityPattern(
                name="uuid",
                pattern=re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.IGNORECASE),
                pattern_type="identifier",
                description="UUID",
                confidence=0.2,
                context_required=True
            ),
        ])

        # URLs with embedded credentials (user:pass@host).
        patterns.append(
            SecurityPattern(
                name="url_with_credentials",
                pattern=re.compile(r'https?://[^:/\s]+:[^@/\s]+@[^\s]+', re.IGNORECASE),
                pattern_type="credential_url",
                description="URL with embedded credentials",
                confidence=0.9
            )
        )

        return patterns

    def get_patterns_by_type(self, pattern_type: str) -> List[SecurityPattern]:
        """Get all patterns of a specific type (e.g. "api_key")."""
        return [p for p in self.patterns if p.pattern_type == pattern_type]

    def get_high_confidence_patterns(self, min_confidence: float = 0.8) -> List[SecurityPattern]:
        """Get patterns with confidence above threshold (inclusive)."""
        return [p for p in self.patterns if p.confidence >= min_confidence]

    def get_context_sensitive_patterns(self) -> List[SecurityPattern]:
        """Get patterns that require context for accurate detection."""
        return [p for p in self.patterns if p.context_required]

    def find_matches(self, text: str, min_confidence: float = 0.5) -> List[PatternMatch]:
        """Find all pattern matches in text above confidence threshold.

        Context-sensitive patterns are only reported when credential-like
        keywords appear near the match (see _has_suspicious_context).
        Results are sorted by start position; overlapping matches from
        different patterns are all kept.
        """
        matches: List[PatternMatch] = []

        for pattern in self.patterns:
            # Skip patterns below the caller's confidence threshold.
            if pattern.confidence < min_confidence:
                continue

            for match in pattern.pattern.finditer(text):
                # For context-sensitive patterns, check surrounding context
                if pattern.context_required:
                    if not self._has_suspicious_context(text, match.start(), match.end()):
                        continue

                matches.append(PatternMatch(
                    pattern_name=pattern.name,
                    pattern_type=pattern.pattern_type,
                    start_pos=match.start(),
                    end_pos=match.end(),
                    matched_text=match.group(),
                    confidence=pattern.confidence
                ))

        # Sort by position for consistent output
        return sorted(matches, key=lambda m: m.start_pos)

    def _has_suspicious_context(self, text: str, start: int, end: int, context_size: int = 50) -> bool:
        """Check if match has suspicious context indicating it's likely a secret.

        Looks for credential-related keywords within ``context_size``
        characters on either side of the match (case-insensitive).
        """
        # Get surrounding context, clamped to the text bounds.
        context_start = max(0, start - context_size)
        context_end = min(len(text), end + context_size)
        context = text[context_start:context_end].lower()

        # Keywords that suggest secret/credential usage
        suspicious_keywords = [
            'password', 'passwd', 'pwd', 'secret', 'token', 'key', 'api',
            'auth', 'credential', 'login', 'access', 'private', 'confidential',
            'env', 'config', 'setting', 'var', 'export', 'process.env'
        ]

        return any(keyword in context for keyword in suspicious_keywords)

    def get_pattern_summary(self) -> Dict[str, int]:
        """Get summary of patterns by type (pattern_type -> count)."""
        summary: Dict[str, int] = {}
        for pattern in self.patterns:
            summary[pattern.pattern_type] = summary.get(pattern.pattern_type, 0) + 1
        return summary