mcp-code-indexer: mcp_code_indexer-4.2.15-py3-none-any.whl → mcp_code_indexer-4.2.17-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- mcp_code_indexer/database/database.py +334 -115
- mcp_code_indexer/database/database_factory.py +1 -1
- mcp_code_indexer/database/exceptions.py +1 -1
- mcp_code_indexer/database/models.py +66 -24
- mcp_code_indexer/database/retry_executor.py +15 -5
- mcp_code_indexer/file_scanner.py +107 -12
- mcp_code_indexer/main.py +43 -30
- mcp_code_indexer/server/mcp_server.py +201 -7
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
- mcp_code_indexer/vector_mode/config.py +113 -45
- mcp_code_indexer/vector_mode/const.py +24 -0
- mcp_code_indexer/vector_mode/daemon.py +860 -98
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
- mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
- mcp_code_indexer/vector_mode/services/__init__.py +9 -0
- mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
- mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
- mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
- mcp_code_indexer/vector_mode/types.py +46 -0
- mcp_code_indexer/vector_mode/utils.py +50 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/METADATA +13 -10
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/RECORD +28 -21
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/WHEEL +1 -1
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/entry_points.txt +0 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info/licenses}/LICENSE +0 -0
mcp_code_indexer/vector_mode/providers/voyage_client.py

@@ -6,25 +6,29 @@ high-quality code embeddings using the voyage-code-2 model.
 """
 
 import logging
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Tuple
 import voyageai
 
-
+
+
+from ..config import VectorConfig, DEFAULT_EMBEDDING_MODEL
+from ..const import MODEL_DIMENSIONS
 
 logger = logging.getLogger(__name__)
 
+
 class VoyageClient:
     """Clean Voyage AI client using official SDK."""
-
-    def __init__(self, api_key: str, model: str =
+
+    def __init__(self, api_key: str, model: str = DEFAULT_EMBEDDING_MODEL):
         self.api_key = api_key
         self.model = model
         self._embedding_dimension: int | None = None
-
+
         # Initialize official Voyage AI client
         self.client = voyageai.Client(api_key=api_key)
         logger.info(f"Initialized Voyage AI client with model {model}")
-
+
     def health_check(self) -> bool:
         """Check if Voyage AI service is healthy."""
         try:
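The constructor default and the dimension lookup now come from two shared modules: DEFAULT_EMBEDDING_MODEL in vector_mode/config.py and MODEL_DIMENSIONS in the new vector_mode/const.py (+24 lines, not shown in this diff). Based on the model_dimensions dictionary that the second hunk below removes from get_embedding_dimension, a plausible sketch of that mapping is:

    # Hypothetical sketch of vector_mode/const.py; the real module is not shown in this diff.
    # Values mirror the model_dimensions dict removed from get_embedding_dimension below.
    MODEL_DIMENSIONS = {
        "voyage-code-2": 1536,
        "voyage-2": 1024,
        "voyage-large-2": 1536,
        "voyage-3": 1024,
    }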
@@ -33,74 +37,108 @@ class VoyageClient:
         except Exception as e:
             logger.warning(f"Voyage AI health check failed: {e}")
             return False
-
+
+    def validate_api_access(self) -> None:
+        """
+        Validate API key and access to Voyage AI service.
+
+        Raises:
+            RuntimeError: If API access validation fails with specific error details
+        """
+        logger.info("Validating Voyage AI API access...")
+        try:
+            result = self.client.embed(["test"], model=self.model, input_type="query")
+            if not result or not result.embeddings:
+                raise RuntimeError("Voyage AI API returned empty response")
+            logger.debug("Voyage AI API access validated successfully")
+        except Exception as e:
+            error_msg = str(e).lower()
+
+            if (
+                "401" in error_msg
+                or "unauthorized" in error_msg
+                or "api key" in error_msg
+            ):
+                raise RuntimeError(
+                    f"Voyage AI API authentication failed: Invalid or expired API key. "
+                    f"Please check your VOYAGE_API_KEY. Error: {e}"
+                )
+            elif "403" in error_msg or "forbidden" in error_msg:
+                raise RuntimeError(
+                    f"Voyage AI API access denied: API key lacks required permissions. Error: {e}"
+                )
+            elif "429" in error_msg or "rate limit" in error_msg:
+                raise RuntimeError(
+                    f"Voyage AI API rate limit exceeded: Too many requests. Error: {e}"
+                )
+            elif "quota" in error_msg or "usage" in error_msg:
+                raise RuntimeError(
+                    f"Voyage AI API quota exceeded: Usage limit reached. Error: {e}"
+                )
+            elif "5" in error_msg and ("error" in error_msg or "server" in error_msg):
+                raise RuntimeError(
+                    f"Voyage AI service unavailable: Server error. Error: {e}"
+                )
+            else:
+                raise RuntimeError(f"Voyage AI API access validation failed: {e}")
+
+        logger.info("Voyage AI API access validated successfully")
+
     def generate_embeddings(
-        self,
-        texts: List[str],
-        input_type: str = "document",
-        **kwargs
+        self, texts: List[str], input_type: str = "document", **kwargs
     ) -> List[List[float]]:
         """Generate embeddings for texts using official SDK."""
         if not texts:
             return []
-
+
         logger.info(f"Generating embeddings for {len(texts)} texts using {self.model}")
-
+
         try:
             result = self.client.embed(
-                texts=texts,
-                model=self.model,
-                input_type=input_type,
-                truncation=True
+                texts=texts, model=self.model, input_type=input_type, truncation=True
             )
-
+
             # Log usage if available
-            if hasattr(result,
+            if hasattr(result, "usage") and result.usage:
                 logger.debug(f"Token usage: {result.usage.total_tokens}")
-
+
             logger.info(f"Successfully generated {len(result.embeddings)} embeddings")
             return result.embeddings
-
+
         except Exception as e:
             logger.error(f"Failed to generate embeddings: {e}")
             raise RuntimeError(f"Embedding generation failed: {e}")
-
+
     def get_embedding_dimension(self) -> int:
         """Get the dimension of embeddings produced by this model."""
         if self._embedding_dimension is not None:
             return self._embedding_dimension
-
+
         # Generate a test embedding to determine dimension
         try:
             test_embeddings = self.generate_embeddings(["test"], input_type="query")
             if test_embeddings:
                 self._embedding_dimension = len(test_embeddings[0])
-                logger.info(
+                logger.info(
+                    f"Detected embedding dimension: {self._embedding_dimension}"
+                )
                 return self._embedding_dimension
         except Exception as e:
             logger.warning(f"Could not determine embedding dimension: {e}")
-
-
-        model_dimensions = {
-            "voyage-code-2": 1536,
-            "voyage-2": 1024,
-            "voyage-large-2": 1536,
-            "voyage-3": 1024,
-        }
-
-        self._embedding_dimension = model_dimensions.get(self.model, 1536)
+
+        self._embedding_dimension = MODEL_DIMENSIONS[self.model]
         logger.info(f"Using default embedding dimension: {self._embedding_dimension}")
         return self._embedding_dimension
-
+
     def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
         """Estimate the cost of embedding generation."""
         # Rough token estimation (4 chars per token)
         total_tokens = sum(len(text) // 4 for text in texts)
-
+
         # Voyage AI pricing (approximate, may change)
         cost_per_1k_tokens = 0.00013  # voyage-code-2 pricing
         estimated_cost = (total_tokens / 1000) * cost_per_1k_tokens
-
+
         return {
             "total_tokens": total_tokens,
             "total_texts": len(texts),
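The new validate_api_access method makes a single one-text embed call and rewrites common failures (401/unauthorized, 403, 429, quota, server errors) into descriptive RuntimeErrors, so a misconfigured key surfaces at startup rather than mid-index. A minimal, hypothetical caller sketch (the surrounding setup code is illustrative, not part of this package):

    import logging
    import os

    # Illustrative only: fail fast if the Voyage AI key is missing or unusable.
    client = VoyageClient(api_key=os.environ["VOYAGE_API_KEY"])
    try:
        client.validate_api_access()
    except RuntimeError as err:
        # The message already states whether the key, permissions, rate limit, or quota failed.
        logging.getLogger(__name__).error("Voyage AI validation failed: %s", err)
        raise

Note also that get_embedding_dimension now falls back to MODEL_DIMENSIONS[self.model], which raises KeyError for a model missing from the table, whereas the removed code defaulted unknown models to 1536.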
@@ -108,11 +146,75 @@
             "model": self.model,
         }
 
+    def count_tokens(self, texts: List[str]) -> int:
+        return self.client.count_tokens(texts, self.model)
+
+    def generate_embeddings_batch(
+        self,
+        all_texts: List[str],
+        file_boundaries: List[Tuple[str, int, int]],
+        input_type: str = "document",
+        **kwargs,
+    ) -> Dict[str, List[List[float]]]:
+        """
+        Generate embeddings for texts from multiple files in a single batch call.
+
+        Note: Token limits should be handled at the calling level. This method
+        assumes the provided texts fit within API limits.
+
+        Args:
+            all_texts: Flattened list of all text chunks from all files
+            file_boundaries: List of (file_path, start_idx, end_idx) tuples indicating
+                which embeddings belong to which file
+            input_type: Type of input for embedding generation
+            **kwargs: Additional arguments for embedding generation
+
+        Returns:
+            Dictionary mapping file paths to their corresponding embeddings
+
+        Raises:
+            RuntimeError: If embedding generation fails
+        """
+        if not all_texts:
+            return {}
+
+        if not file_boundaries:
+            raise ValueError(
+                "file_boundaries cannot be empty when all_texts is provided"
+            )
+
+        logger.info(
+            f"Generating batch embeddings for {len(all_texts)} texts from {len(file_boundaries)} files using {self.model}"
+        )
+
+        try:
+            # Generate embeddings for all texts in a single API call
+            all_embeddings = self.generate_embeddings(
+                all_texts, input_type=input_type, **kwargs
+            )
+
+            # Group embeddings by file using boundaries
+            file_embeddings = {}
+            for file_path, start_idx, end_idx in file_boundaries:
+                file_embeddings[file_path] = all_embeddings[start_idx:end_idx]
+
+            logger.info(
+                f"Successfully generated batch embeddings for {len(file_boundaries)} files "
+                f"({len(all_embeddings)} total embeddings)"
+            )
+
+            return file_embeddings
+
+        except Exception as e:
+            logger.error(f"Failed to generate batch embeddings: {e}")
+            raise RuntimeError(f"Batch embedding generation failed: {e}")
+
+
 def create_voyage_client(config: VectorConfig) -> VoyageClient:
     """Create a Voyage client from configuration."""
     if not config.voyage_api_key:
         raise ValueError("VOYAGE_API_KEY is required for embedding generation")
-
+
     return VoyageClient(
         api_key=config.voyage_api_key,
         model=config.embedding_model,
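generate_embeddings_batch embeds a flattened list of texts in one API call and then slices the result back per file using the (file_path, start_idx, end_idx) boundaries. A hypothetical illustration of how a caller builds those boundaries (file names and texts are made up):

    # Two files' chunk texts flattened into a single list.
    texts_a = ["def parse(source): ...", "class Lexer: ..."]
    texts_b = ["def main(): ..."]

    all_texts = texts_a + texts_b
    file_boundaries = [
        ("src/parser.py", 0, len(texts_a)),                         # embeddings[0:2]
        ("src/cli.py", len(texts_a), len(texts_a) + len(texts_b)),  # embeddings[2:3]
    ]

    by_file = client.generate_embeddings_batch(all_texts, file_boundaries)
    # by_file["src/parser.py"] holds two vectors, by_file["src/cli.py"] one.

The new count_tokens simply delegates to the SDK's token counter for the configured model, and the new EmbeddingService below uses it to keep each batch under the configured token budget.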
mcp_code_indexer/vector_mode/services/embedding_service.py (new file)

@@ -0,0 +1,389 @@
+"""
+Embedding Service for converting code chunks to vector embeddings.
+
+Provides a clean abstraction layer between chunked code and embedding providers,
+handling text preparation, batching, and provider communication.
+"""
+
+import asyncio
+import logging
+from pathlib import Path
+from typing import List, Dict, Tuple
+
+
+
+from ..chunking.ast_chunker import CodeChunk
+from ..providers.voyage_client import VoyageClient
+from ..config import VectorConfig
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingService:
+    """
+    Service for converting code chunks to embeddings.
+
+    Handles text preparation, context enhancement, batching, and provider
+    communication while maintaining separation from daemon orchestration logic.
+    """
+
+    def __init__(self, embedding_client: VoyageClient, config: VectorConfig):
+        """Initialize embedding service with client and configuration."""
+        self.embedding_client = embedding_client
+        self.config = config
+
+        # Validate API access immediately during initialization
+        self.embedding_client.validate_api_access()
+
+    async def generate_embeddings_for_chunks(
+        self, chunks: List[CodeChunk], project_name: str, file_path: Path
+    ) -> List[List[float]]:
+        """
+        Generate embeddings for a list of code chunks.
+
+        Args:
+            chunks: List of code chunks to embed
+            project_name: Name of the project (for logging)
+            file_path: Path to source file (for logging)
+
+        Returns:
+            List of embedding vectors (one per chunk)
+
+        Raises:
+            RuntimeError: If embedding generation fails
+        """
+        if not chunks:
+            logger.debug(f"No chunks provided for {file_path}")
+            return []
+
+        logger.info(
+            f"Generating embeddings for {len(chunks)} chunks from {file_path}",
+            extra={
+                "structured_data": {
+                    "project_name": project_name,
+                    "file_path": str(file_path),
+                    "chunk_count": len(chunks),
+                    "embedding_model": self.config.embedding_model,
+                }
+            },
+        )
+
+        try:
+            # Extract text content from chunks with context enhancement
+            texts = self._prepare_chunk_texts(chunks)
+
+            # Process chunks in batches to respect API limits
+            all_embeddings = []
+            batch_size = self.config.batch_size
+
+            for i in range(0, len(texts), batch_size):
+                batch_texts = texts[i : i + batch_size]
+                batch_chunks = chunks[i : i + batch_size]
+
+                logger.debug(
+                    f"Processing embedding batch {i//batch_size + 1} "
+                    f"({len(batch_texts)} chunks) for {file_path}"
+                )
+
+                # Generate embeddings using async/sync bridge
+                embeddings = await asyncio.get_event_loop().run_in_executor(
+                    None,
+                    lambda: self.embedding_client.generate_embeddings(
+                        batch_texts, input_type="document"  # Code chunks are documents
+                    ),
+                )
+
+                all_embeddings.extend(embeddings)
+
+                # Log batch statistics
+                self._log_batch_stats(
+                    batch_chunks, i // batch_size + 1, len(embeddings)
+                )
+
+            logger.info(
+                f"Successfully generated {len(all_embeddings)} embeddings for {file_path}",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "file_path": str(file_path),
+                        "embedding_count": len(all_embeddings),
+                    }
+                },
+            )
+
+            return all_embeddings
+
+        except Exception as e:
+            logger.error(
+                f"Failed to generate embeddings for {file_path}: {e}",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "file_path": str(file_path),
+                        "chunk_count": len(chunks),
+                        "error": str(e),
+                    }
+                },
+                exc_info=True,
+            )
+            raise
+
+    def _prepare_chunk_texts(self, chunks: List[CodeChunk]) -> List[str]:
+        """
+        Prepare text content from chunks with context enhancement.
+
+        Args:
+            chunks: List of code chunks
+
+        Returns:
+            List of prepared text strings ready for embedding
+        """
+        texts = []
+        for chunk in chunks:
+            # Include chunk context for better embeddings
+            text_content = chunk.content
+            if chunk.name:
+                # Prefix with chunk name for context
+                text_content = f"# {chunk.name}\n{chunk.content}"
+            texts.append(text_content)
+        return texts
+
+    def _log_batch_stats(
+        self, batch_chunks: List[CodeChunk], batch_num: int, embedding_count: int
+    ) -> None:
+        """Log statistics for a processed batch."""
+        chunk_types = {}
+        redacted_count = 0
+
+        for chunk in batch_chunks:
+            chunk_type = chunk.chunk_type.value
+            chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
+            if chunk.redacted:
+                redacted_count += 1
+
+        logger.debug(
+            f"Batch {batch_num} complete: "
+            f"{embedding_count} embeddings generated, "
+            f"chunk types: {chunk_types}, "
+            f"redacted: {redacted_count}"
+        )
+
+    async def generate_embeddings_for_multiple_files(
+        self, file_chunks: Dict[str, List[CodeChunk]], project_name: str
+    ) -> Dict[str, List[List[float]]]:
+        """
+        Generate embeddings for chunks from multiple files in a single batch operation.
+
+        Args:
+            file_chunks: Dictionary mapping file paths to their code chunks
+            project_name: Name of the project (for logging)
+
+        Returns:
+            Dictionary mapping file paths to their corresponding embeddings
+
+        Raises:
+            RuntimeError: If embedding generation fails
+        """
+        if not file_chunks:
+            logger.debug("No file chunks provided for batch processing")
+            return {}
+
+        total_chunks = sum(len(chunks) for chunks in file_chunks.values())
+        logger.info(
+            f"Generating batch embeddings for {len(file_chunks)} files "
+            f"({total_chunks} total chunks)",
+            extra={
+                "structured_data": {
+                    "project_name": project_name,
+                    "file_count": len(file_chunks),
+                    "chunk_count": total_chunks,
+                    "embedding_model": self.config.embedding_model,
+                }
+            },
+        )
+
+        try:
+            # Build batches from the start based on actual token counts and text limits
+            batches = await self._build_token_aware_batches(file_chunks, project_name)
+
+            if not batches:
+                logger.debug("No valid chunks found after text preparation")
+                return {}
+
+            # Process each batch
+            logger.info(
+                f"Processing {len(batches)} token-aware batches for project {project_name}"
+            )
+            all_file_embeddings = {}
+
+            for i, (batch_texts, batch_boundaries) in enumerate(batches):
+                logger.debug(
+                    f"Processing batch {i + 1}/{len(batches)}: {len(batch_texts)} texts"
+                )
+
+                # Generate embeddings for this batch
+                batch_file_embeddings = await asyncio.get_event_loop().run_in_executor(
+                    None,
+                    lambda texts=batch_texts, boundaries=batch_boundaries: self.embedding_client.generate_embeddings_batch(
+                        all_texts=texts,
+                        file_boundaries=boundaries,
+                        input_type="document",
+                    ),
+                )
+
+                # Merge results
+                all_file_embeddings.update(batch_file_embeddings)
+
+            file_embeddings = all_file_embeddings
+
+            # Log batch statistics
+            self._log_batch_embedding_stats(file_chunks, file_embeddings)
+
+            logger.info(
+                f"Successfully generated batch embeddings for {len(file_embeddings)} files",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "files_processed": len(file_embeddings),
+                        "total_embeddings": sum(
+                            len(embs) for embs in file_embeddings.values()
+                        ),
+                    }
+                },
+            )
+
+            return file_embeddings
+
+        except Exception as e:
+            logger.error(
+                f"Failed to generate batch embeddings: {e}",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "file_count": len(file_chunks),
+                        "chunk_count": total_chunks,
+                        "error": str(e),
+                    }
+                },
+                exc_info=True,
+            )
+            raise
+
+    async def _build_token_aware_batches(
+        self, file_chunks: Dict[str, List[CodeChunk]], project_name: str
+    ) -> List[Tuple[List[str], List[Tuple[str, int, int]]]]:
+        """
+        Build batches from file chunks respecting both token and text count limits.
+
+        Args:
+            file_chunks: Dictionary mapping file paths to their code chunks
+            project_name: Name of the project (for logging)
+
+        Returns:
+            List of tuples: (batch_texts, batch_file_boundaries)
+        """
+        batches = []  # List of (batch_texts, batch_file_boundaries)
+        current_batch_texts = []
+        current_batch_boundaries = []
+        current_batch_tokens = 0
+        batch_idx = 0
+
+        for file_path, chunks in file_chunks.items():
+            if not chunks:
+                continue
+
+            # Prepare texts for this file
+            file_texts = self._prepare_chunk_texts(chunks)
+            if not file_texts:
+                continue
+
+            # Count tokens for this file using accurate Voyage API
+            file_tokens = await asyncio.get_event_loop().run_in_executor(
+                None, lambda texts=file_texts: self.embedding_client.count_tokens(texts)
+            )
+
+            logger.debug(
+                f"File {file_path}: {len(file_texts)} texts, {file_tokens} tokens"
+            )
+
+            # If adding this file would exceed token limit OR text count limit, finalize current batch
+            if (
+                current_batch_tokens + file_tokens
+                > self.config.voyage_max_tokens_per_batch
+                or len(current_batch_texts) + len(file_texts)
+                > self.config.voyage_batch_size_limit
+            ) and current_batch_texts:
+
+                # Determine which limit was exceeded for logging
+                token_exceeded = (
+                    current_batch_tokens + file_tokens
+                    > self.config.voyage_max_tokens_per_batch
+                )
+                count_exceeded = (
+                    len(current_batch_texts) + len(file_texts)
+                    > self.config.voyage_batch_size_limit
+                )
+
+                logger.info(
+                    f"Finalizing batch {len(batches) + 1}: {len(current_batch_texts)} texts, "
+                    f"{current_batch_tokens} tokens (limit exceeded: "
+                    f"tokens={token_exceeded}, count={count_exceeded})"
+                )
+
+                batches.append((current_batch_texts, current_batch_boundaries))
+
+                # Start new batch
+                current_batch_texts = []
+                current_batch_boundaries = []
+                current_batch_tokens = 0
+                batch_idx = 0
+
+            # Add this file to current batch
+            start_idx = batch_idx
+            end_idx = batch_idx + len(file_texts)
+            current_batch_texts.extend(file_texts)
+            current_batch_boundaries.append((file_path, start_idx, end_idx))
+            current_batch_tokens += file_tokens
+            batch_idx = end_idx
+
+        # Add final batch if it has content
+        if current_batch_texts:
+            logger.info(
+                f"Finalizing final batch {len(batches) + 1}: {len(current_batch_texts)} texts, "
+                f"{current_batch_tokens} tokens"
+            )
+            batches.append((current_batch_texts, current_batch_boundaries))
+
+        return batches
+
+    def _log_batch_embedding_stats(
+        self,
+        file_chunks: Dict[str, List[CodeChunk]],
+        file_embeddings: Dict[str, List[List[float]]],
+    ) -> None:
+        """Log statistics for batch embedding processing."""
+        total_chunks = 0
+        chunk_types = {}
+        redacted_count = 0
+        languages = set()
+
+        for file_path, chunks in file_chunks.items():
+            total_chunks += len(chunks)
+            for chunk in chunks:
+                chunk_type = chunk.chunk_type.value
+                chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
+                if chunk.redacted:
+                    redacted_count += 1
+                languages.add(chunk.language)
+
+        total_embeddings = sum(len(embs) for embs in file_embeddings.values())
+
+        logger.debug(
+            f"Batch embedding complete: "
+            f"{len(file_chunks)} files, "
+            f"{total_chunks} chunks, "
+            f"{total_embeddings} embeddings generated, "
+            f"chunk types: {chunk_types}, "
+            f"languages: {sorted(languages)}, "
+            f"redacted: {redacted_count}"
+        )
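EmbeddingService sits between the AST chunker and the Voyage client. Per-file indexing goes through generate_embeddings_for_chunks, which batches by config.batch_size; bulk indexing goes through generate_embeddings_for_multiple_files, which pre-builds batches that respect both config.voyage_max_tokens_per_batch and config.voyage_batch_size_limit using the client's count_tokens, then fans the results back out per file. A hedged usage sketch (the entry point and project name are illustrative; VectorConfig construction and the chunker output are assumed to come from the rest of the package):

    import asyncio

    async def index_project(config, file_chunks):
        # file_chunks: Dict[str, List[CodeChunk]] as produced by the AST chunker.
        client = create_voyage_client(config)       # requires config.voyage_api_key
        service = EmbeddingService(client, config)  # validates API access immediately
        # Batches are grouped per the token/text limits, then mapped back to files.
        return await service.generate_embeddings_for_multiple_files(
            file_chunks, project_name="example-project"
        )

    # asyncio.run(index_project(config, file_chunks))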