mcp-code-indexer 4.2.15-py3-none-any.whl → 4.2.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/database.py +251 -85
  2. mcp_code_indexer/database/models.py +66 -24
  3. mcp_code_indexer/database/retry_executor.py +15 -5
  4. mcp_code_indexer/file_scanner.py +107 -12
  5. mcp_code_indexer/main.py +43 -30
  6. mcp_code_indexer/server/mcp_server.py +191 -1
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  9. mcp_code_indexer/vector_mode/config.py +113 -45
  10. mcp_code_indexer/vector_mode/const.py +24 -0
  11. mcp_code_indexer/vector_mode/daemon.py +860 -98
  12. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  13. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  14. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  15. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  16. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  17. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  18. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  19. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  20. mcp_code_indexer/vector_mode/types.py +46 -0
  21. mcp_code_indexer/vector_mode/utils.py +50 -0
  22. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
  23. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
  24. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
  25. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
  26. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
mcp_code_indexer/vector_mode/providers/voyage_client.py
@@ -6,25 +6,29 @@ high-quality code embeddings using the voyage-code-2 model.
 """
 
 import logging
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Tuple
 import voyageai
 
-from ..config import VectorConfig
+
+
+from ..config import VectorConfig, DEFAULT_EMBEDDING_MODEL
+from ..const import MODEL_DIMENSIONS
 
 logger = logging.getLogger(__name__)
 
+
 class VoyageClient:
     """Clean Voyage AI client using official SDK."""
-
-    def __init__(self, api_key: str, model: str = "voyage-code-2"):
+
+    def __init__(self, api_key: str, model: str = DEFAULT_EMBEDDING_MODEL):
         self.api_key = api_key
         self.model = model
         self._embedding_dimension: int | None = None
-
+
         # Initialize official Voyage AI client
         self.client = voyageai.Client(api_key=api_key)
         logger.info(f"Initialized Voyage AI client with model {model}")
-
+
     def health_check(self) -> bool:
         """Check if Voyage AI service is healthy."""
         try:
@@ -33,74 +37,108 @@ class VoyageClient:
         except Exception as e:
             logger.warning(f"Voyage AI health check failed: {e}")
             return False
-
+
+    def validate_api_access(self) -> None:
+        """
+        Validate API key and access to Voyage AI service.
+
+        Raises:
+            RuntimeError: If API access validation fails with specific error details
+        """
+        logger.info("Validating Voyage AI API access...")
+        try:
+            result = self.client.embed(["test"], model=self.model, input_type="query")
+            if not result or not result.embeddings:
+                raise RuntimeError("Voyage AI API returned empty response")
+            logger.debug("Voyage AI API access validated successfully")
+        except Exception as e:
+            error_msg = str(e).lower()
+
+            if (
+                "401" in error_msg
+                or "unauthorized" in error_msg
+                or "api key" in error_msg
+            ):
+                raise RuntimeError(
+                    f"Voyage AI API authentication failed: Invalid or expired API key. "
+                    f"Please check your VOYAGE_API_KEY. Error: {e}"
+                )
+            elif "403" in error_msg or "forbidden" in error_msg:
+                raise RuntimeError(
+                    f"Voyage AI API access denied: API key lacks required permissions. Error: {e}"
+                )
+            elif "429" in error_msg or "rate limit" in error_msg:
+                raise RuntimeError(
+                    f"Voyage AI API rate limit exceeded: Too many requests. Error: {e}"
+                )
+            elif "quota" in error_msg or "usage" in error_msg:
+                raise RuntimeError(
+                    f"Voyage AI API quota exceeded: Usage limit reached. Error: {e}"
+                )
+            elif "5" in error_msg and ("error" in error_msg or "server" in error_msg):
+                raise RuntimeError(
+                    f"Voyage AI service unavailable: Server error. Error: {e}"
+                )
+            else:
+                raise RuntimeError(f"Voyage AI API access validation failed: {e}")
+
+        logger.info("Voyage AI API access validated successfully")
+
     def generate_embeddings(
-        self,
-        texts: List[str],
-        input_type: str = "document",
-        **kwargs
+        self, texts: List[str], input_type: str = "document", **kwargs
     ) -> List[List[float]]:
         """Generate embeddings for texts using official SDK."""
         if not texts:
             return []
-
+
         logger.info(f"Generating embeddings for {len(texts)} texts using {self.model}")
-
+
         try:
             result = self.client.embed(
-                texts=texts,
-                model=self.model,
-                input_type=input_type,
-                truncation=True
+                texts=texts, model=self.model, input_type=input_type, truncation=True
             )
-
+
             # Log usage if available
-            if hasattr(result, 'usage') and result.usage:
+            if hasattr(result, "usage") and result.usage:
                 logger.debug(f"Token usage: {result.usage.total_tokens}")
-
+
             logger.info(f"Successfully generated {len(result.embeddings)} embeddings")
             return result.embeddings
-
+
         except Exception as e:
             logger.error(f"Failed to generate embeddings: {e}")
             raise RuntimeError(f"Embedding generation failed: {e}")
-
+
     def get_embedding_dimension(self) -> int:
         """Get the dimension of embeddings produced by this model."""
         if self._embedding_dimension is not None:
             return self._embedding_dimension
-
+
         # Generate a test embedding to determine dimension
         try:
             test_embeddings = self.generate_embeddings(["test"], input_type="query")
             if test_embeddings:
                 self._embedding_dimension = len(test_embeddings[0])
-                logger.info(f"Detected embedding dimension: {self._embedding_dimension}")
+                logger.info(
+                    f"Detected embedding dimension: {self._embedding_dimension}"
+                )
                 return self._embedding_dimension
         except Exception as e:
             logger.warning(f"Could not determine embedding dimension: {e}")
-
-        # Default dimensions for known Voyage models
-        model_dimensions = {
-            "voyage-code-2": 1536,
-            "voyage-2": 1024,
-            "voyage-large-2": 1536,
-            "voyage-3": 1024,
-        }
-
-        self._embedding_dimension = model_dimensions.get(self.model, 1536)
+
+        self._embedding_dimension = MODEL_DIMENSIONS[self.model]
         logger.info(f"Using default embedding dimension: {self._embedding_dimension}")
         return self._embedding_dimension
-
+
     def estimate_cost(self, texts: List[str]) -> Dict[str, Any]:
         """Estimate the cost of embedding generation."""
         # Rough token estimation (4 chars per token)
         total_tokens = sum(len(text) // 4 for text in texts)
-
+
         # Voyage AI pricing (approximate, may change)
         cost_per_1k_tokens = 0.00013  # voyage-code-2 pricing
         estimated_cost = (total_tokens / 1000) * cost_per_1k_tokens
-
+
         return {
             "total_tokens": total_tokens,
             "total_texts": len(texts),
@@ -108,11 +146,75 @@ class VoyageClient:
             "model": self.model,
         }
 
+    def count_tokens(self, texts: List[str]) -> int:
+        return self.client.count_tokens(texts, self.model)
+
+    def generate_embeddings_batch(
+        self,
+        all_texts: List[str],
+        file_boundaries: List[Tuple[str, int, int]],
+        input_type: str = "document",
+        **kwargs,
+    ) -> Dict[str, List[List[float]]]:
+        """
+        Generate embeddings for texts from multiple files in a single batch call.
+
+        Note: Token limits should be handled at the calling level. This method
+        assumes the provided texts fit within API limits.
+
+        Args:
+            all_texts: Flattened list of all text chunks from all files
+            file_boundaries: List of (file_path, start_idx, end_idx) tuples indicating
+                which embeddings belong to which file
+            input_type: Type of input for embedding generation
+            **kwargs: Additional arguments for embedding generation
+
+        Returns:
+            Dictionary mapping file paths to their corresponding embeddings
+
+        Raises:
+            RuntimeError: If embedding generation fails
+        """
+        if not all_texts:
+            return {}
+
+        if not file_boundaries:
+            raise ValueError(
+                "file_boundaries cannot be empty when all_texts is provided"
+            )
+
+        logger.info(
+            f"Generating batch embeddings for {len(all_texts)} texts from {len(file_boundaries)} files using {self.model}"
+        )
+
+        try:
+            # Generate embeddings for all texts in a single API call
+            all_embeddings = self.generate_embeddings(
+                all_texts, input_type=input_type, **kwargs
+            )
+
+            # Group embeddings by file using boundaries
+            file_embeddings = {}
+            for file_path, start_idx, end_idx in file_boundaries:
+                file_embeddings[file_path] = all_embeddings[start_idx:end_idx]
+
+            logger.info(
+                f"Successfully generated batch embeddings for {len(file_boundaries)} files "
+                f"({len(all_embeddings)} total embeddings)"
+            )
+
+            return file_embeddings
+
+        except Exception as e:
+            logger.error(f"Failed to generate batch embeddings: {e}")
+            raise RuntimeError(f"Batch embedding generation failed: {e}")
+
+
 def create_voyage_client(config: VectorConfig) -> VoyageClient:
     """Create a Voyage client from configuration."""
     if not config.voyage_api_key:
         raise ValueError("VOYAGE_API_KEY is required for embedding generation")
-
+
     return VoyageClient(
         api_key=config.voyage_api_key,
         model=config.embedding_model,
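
The file_boundaries protocol added above is easiest to see in a small sketch (illustrative values only, not part of the released code): chunks from several files are flattened into one embed call, then sliced back per file by index.

    # Hypothetical usage of generate_embeddings_batch; paths and texts are made up.
    client = VoyageClient(api_key="your-key", model=DEFAULT_EMBEDDING_MODEL)

    all_texts = ["def load(): ...", "def save(): ...", "class Cache: ..."]
    file_boundaries = [
        ("src/io.py", 0, 2),     # embeddings[0:2] -> src/io.py
        ("src/cache.py", 2, 3),  # embeddings[2:3] -> src/cache.py
    ]

    per_file = client.generate_embeddings_batch(all_texts, file_boundaries)
    assert len(per_file["src/io.py"]) == 2 and len(per_file["src/cache.py"]) == 1

The boundaries are half-open (start, end) indices into the flattened list, so a caller must keep each file's chunks contiguous — which is exactly what the new EmbeddingService below does.
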
mcp_code_indexer/vector_mode/services/__init__.py (new file)
@@ -0,0 +1,9 @@
+"""
+Vector mode services module.
+
+Contains service classes that handle specialized vector operations.
+"""
+
+from .embedding_service import EmbeddingService
+
+__all__ = ["EmbeddingService"]
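
With this re-export in place, downstream code can import the service directly from the package; a minimal sketch, assuming the 4.2.16 wheel is installed:

    from mcp_code_indexer.vector_mode.services import EmbeddingService
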
mcp_code_indexer/vector_mode/services/embedding_service.py (new file)
@@ -0,0 +1,389 @@
+"""
+Embedding Service for converting code chunks to vector embeddings.
+
+Provides a clean abstraction layer between chunked code and embedding providers,
+handling text preparation, batching, and provider communication.
+"""
+
+import asyncio
+import logging
+from pathlib import Path
+from typing import List, Dict, Tuple
+
+
+
+from ..chunking.ast_chunker import CodeChunk
+from ..providers.voyage_client import VoyageClient
+from ..config import VectorConfig
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingService:
+    """
+    Service for converting code chunks to embeddings.
+
+    Handles text preparation, context enhancement, batching, and provider
+    communication while maintaining separation from daemon orchestration logic.
+    """
+
+    def __init__(self, embedding_client: VoyageClient, config: VectorConfig):
+        """Initialize embedding service with client and configuration."""
+        self.embedding_client = embedding_client
+        self.config = config
+
+        # Validate API access immediately during initialization
+        self.embedding_client.validate_api_access()
+
+    async def generate_embeddings_for_chunks(
+        self, chunks: List[CodeChunk], project_name: str, file_path: Path
+    ) -> List[List[float]]:
+        """
+        Generate embeddings for a list of code chunks.
+
+        Args:
+            chunks: List of code chunks to embed
+            project_name: Name of the project (for logging)
+            file_path: Path to source file (for logging)
+
+        Returns:
+            List of embedding vectors (one per chunk)
+
+        Raises:
+            RuntimeError: If embedding generation fails
+        """
+        if not chunks:
+            logger.debug(f"No chunks provided for {file_path}")
+            return []
+
+        logger.info(
+            f"Generating embeddings for {len(chunks)} chunks from {file_path}",
+            extra={
+                "structured_data": {
+                    "project_name": project_name,
+                    "file_path": str(file_path),
+                    "chunk_count": len(chunks),
+                    "embedding_model": self.config.embedding_model,
+                }
+            },
+        )
+
+        try:
+            # Extract text content from chunks with context enhancement
+            texts = self._prepare_chunk_texts(chunks)
+
+            # Process chunks in batches to respect API limits
+            all_embeddings = []
+            batch_size = self.config.batch_size
+
+            for i in range(0, len(texts), batch_size):
+                batch_texts = texts[i : i + batch_size]
+                batch_chunks = chunks[i : i + batch_size]
+
+                logger.debug(
+                    f"Processing embedding batch {i//batch_size + 1} "
+                    f"({len(batch_texts)} chunks) for {file_path}"
+                )
+
+                # Generate embeddings using async/sync bridge
+                embeddings = await asyncio.get_event_loop().run_in_executor(
+                    None,
+                    lambda: self.embedding_client.generate_embeddings(
+                        batch_texts, input_type="document"  # Code chunks are documents
+                    ),
+                )
+
+                all_embeddings.extend(embeddings)
+
+                # Log batch statistics
+                self._log_batch_stats(
+                    batch_chunks, i // batch_size + 1, len(embeddings)
+                )
+
+            logger.info(
+                f"Successfully generated {len(all_embeddings)} embeddings for {file_path}",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "file_path": str(file_path),
+                        "embedding_count": len(all_embeddings),
+                    }
+                },
+            )
+
+            return all_embeddings
+
+        except Exception as e:
+            logger.error(
+                f"Failed to generate embeddings for {file_path}: {e}",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "file_path": str(file_path),
+                        "chunk_count": len(chunks),
+                        "error": str(e),
+                    }
+                },
+                exc_info=True,
+            )
+            raise
+
+    def _prepare_chunk_texts(self, chunks: List[CodeChunk]) -> List[str]:
+        """
+        Prepare text content from chunks with context enhancement.
+
+        Args:
+            chunks: List of code chunks
+
+        Returns:
+            List of prepared text strings ready for embedding
+        """
+        texts = []
+        for chunk in chunks:
+            # Include chunk context for better embeddings
+            text_content = chunk.content
+            if chunk.name:
+                # Prefix with chunk name for context
+                text_content = f"# {chunk.name}\n{chunk.content}"
+            texts.append(text_content)
+        return texts
+
+    def _log_batch_stats(
+        self, batch_chunks: List[CodeChunk], batch_num: int, embedding_count: int
+    ) -> None:
+        """Log statistics for a processed batch."""
+        chunk_types = {}
+        redacted_count = 0
+
+        for chunk in batch_chunks:
+            chunk_type = chunk.chunk_type.value
+            chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
+            if chunk.redacted:
+                redacted_count += 1
+
+        logger.debug(
+            f"Batch {batch_num} complete: "
+            f"{embedding_count} embeddings generated, "
+            f"chunk types: {chunk_types}, "
+            f"redacted: {redacted_count}"
+        )
+
+    async def generate_embeddings_for_multiple_files(
+        self, file_chunks: Dict[str, List[CodeChunk]], project_name: str
+    ) -> Dict[str, List[List[float]]]:
+        """
+        Generate embeddings for chunks from multiple files in a single batch operation.
+
+        Args:
+            file_chunks: Dictionary mapping file paths to their code chunks
+            project_name: Name of the project (for logging)
+
+        Returns:
+            Dictionary mapping file paths to their corresponding embeddings
+
+        Raises:
+            RuntimeError: If embedding generation fails
+        """
+        if not file_chunks:
+            logger.debug("No file chunks provided for batch processing")
+            return {}
+
+        total_chunks = sum(len(chunks) for chunks in file_chunks.values())
+        logger.info(
+            f"Generating batch embeddings for {len(file_chunks)} files "
+            f"({total_chunks} total chunks)",
+            extra={
+                "structured_data": {
+                    "project_name": project_name,
+                    "file_count": len(file_chunks),
+                    "chunk_count": total_chunks,
+                    "embedding_model": self.config.embedding_model,
+                }
+            },
+        )
+
+        try:
+            # Build batches from the start based on actual token counts and text limits
+            batches = await self._build_token_aware_batches(file_chunks, project_name)
+
+            if not batches:
+                logger.debug("No valid chunks found after text preparation")
+                return {}
+
+            # Process each batch
+            logger.info(
+                f"Processing {len(batches)} token-aware batches for project {project_name}"
+            )
+            all_file_embeddings = {}
+
+            for i, (batch_texts, batch_boundaries) in enumerate(batches):
+                logger.debug(
+                    f"Processing batch {i + 1}/{len(batches)}: {len(batch_texts)} texts"
+                )
+
+                # Generate embeddings for this batch
+                batch_file_embeddings = await asyncio.get_event_loop().run_in_executor(
+                    None,
+                    lambda texts=batch_texts, boundaries=batch_boundaries: self.embedding_client.generate_embeddings_batch(
+                        all_texts=texts,
+                        file_boundaries=boundaries,
+                        input_type="document",
+                    ),
+                )
+
+                # Merge results
+                all_file_embeddings.update(batch_file_embeddings)
+
+            file_embeddings = all_file_embeddings
+
+            # Log batch statistics
+            self._log_batch_embedding_stats(file_chunks, file_embeddings)
+
+            logger.info(
+                f"Successfully generated batch embeddings for {len(file_embeddings)} files",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "files_processed": len(file_embeddings),
+                        "total_embeddings": sum(
+                            len(embs) for embs in file_embeddings.values()
+                        ),
+                    }
+                },
+            )
+
+            return file_embeddings
+
+        except Exception as e:
+            logger.error(
+                f"Failed to generate batch embeddings: {e}",
+                extra={
+                    "structured_data": {
+                        "project_name": project_name,
+                        "file_count": len(file_chunks),
+                        "chunk_count": total_chunks,
+                        "error": str(e),
+                    }
+                },
+                exc_info=True,
+            )
+            raise
+
+    async def _build_token_aware_batches(
+        self, file_chunks: Dict[str, List[CodeChunk]], project_name: str
+    ) -> List[Tuple[List[str], List[Tuple[str, int, int]]]]:
+        """
+        Build batches from file chunks respecting both token and text count limits.
+
+        Args:
+            file_chunks: Dictionary mapping file paths to their code chunks
+            project_name: Name of the project (for logging)
+
+        Returns:
+            List of tuples: (batch_texts, batch_file_boundaries)
+        """
+        batches = []  # List of (batch_texts, batch_file_boundaries)
+        current_batch_texts = []
+        current_batch_boundaries = []
+        current_batch_tokens = 0
+        batch_idx = 0
+
+        for file_path, chunks in file_chunks.items():
+            if not chunks:
+                continue
+
+            # Prepare texts for this file
+            file_texts = self._prepare_chunk_texts(chunks)
+            if not file_texts:
+                continue
+
+            # Count tokens for this file using accurate Voyage API
+            file_tokens = await asyncio.get_event_loop().run_in_executor(
+                None, lambda texts=file_texts: self.embedding_client.count_tokens(texts)
+            )
+
+            logger.debug(
+                f"File {file_path}: {len(file_texts)} texts, {file_tokens} tokens"
+            )
+
+            # If adding this file would exceed token limit OR text count limit, finalize current batch
+            if (
+                current_batch_tokens + file_tokens
+                > self.config.voyage_max_tokens_per_batch
+                or len(current_batch_texts) + len(file_texts)
+                > self.config.voyage_batch_size_limit
+            ) and current_batch_texts:
+
+                # Determine which limit was exceeded for logging
+                token_exceeded = (
+                    current_batch_tokens + file_tokens
+                    > self.config.voyage_max_tokens_per_batch
+                )
+                count_exceeded = (
+                    len(current_batch_texts) + len(file_texts)
+                    > self.config.voyage_batch_size_limit
+                )
+
+                logger.info(
+                    f"Finalizing batch {len(batches) + 1}: {len(current_batch_texts)} texts, "
+                    f"{current_batch_tokens} tokens (limit exceeded: "
+                    f"tokens={token_exceeded}, count={count_exceeded})"
+                )
+
+                batches.append((current_batch_texts, current_batch_boundaries))
+
+                # Start new batch
+                current_batch_texts = []
+                current_batch_boundaries = []
+                current_batch_tokens = 0
+                batch_idx = 0
+
+            # Add this file to current batch
+            start_idx = batch_idx
+            end_idx = batch_idx + len(file_texts)
+            current_batch_texts.extend(file_texts)
+            current_batch_boundaries.append((file_path, start_idx, end_idx))
+            current_batch_tokens += file_tokens
+            batch_idx = end_idx
+
+        # Add final batch if it has content
+        if current_batch_texts:
+            logger.info(
+                f"Finalizing final batch {len(batches) + 1}: {len(current_batch_texts)} texts, "
+                f"{current_batch_tokens} tokens"
+            )
+            batches.append((current_batch_texts, current_batch_boundaries))
+
+        return batches
+
+    def _log_batch_embedding_stats(
+        self,
+        file_chunks: Dict[str, List[CodeChunk]],
+        file_embeddings: Dict[str, List[List[float]]],
+    ) -> None:
+        """Log statistics for batch embedding processing."""
+        total_chunks = 0
+        chunk_types = {}
+        redacted_count = 0
+        languages = set()
+
+        for file_path, chunks in file_chunks.items():
+            total_chunks += len(chunks)
+            for chunk in chunks:
+                chunk_type = chunk.chunk_type.value
+                chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
+                if chunk.redacted:
+                    redacted_count += 1
+                languages.add(chunk.language)
+
+        total_embeddings = sum(len(embs) for embs in file_embeddings.values())
+
+        logger.debug(
+            f"Batch embedding complete: "
+            f"{len(file_chunks)} files, "
+            f"{total_chunks} chunks, "
+            f"{total_embeddings} embeddings generated, "
+            f"chunk types: {chunk_types}, "
+            f"languages: {sorted(languages)}, "
+            f"redacted: {redacted_count}"
+        )
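
The batching rule in _build_token_aware_batches is a greedy, file-atomic bin-pack: a file's chunks never straddle batches, and the current batch is flushed as soon as adding the next whole file would cross either voyage_max_tokens_per_batch or voyage_batch_size_limit. A standalone sketch of just that rule, with hypothetical limits standing in for the config values:

    # Greedy, file-atomic grouping sketch; the limits are hypothetical stand-ins.
    MAX_TOKENS = 120_000   # stand-in for config.voyage_max_tokens_per_batch
    MAX_TEXTS = 128        # stand-in for config.voyage_batch_size_limit

    def group(files):
        """files: list of (path, n_texts, n_tokens); returns batches of paths."""
        batches, cur, cur_texts, cur_tokens = [], [], 0, 0
        for path, n_texts, n_tokens in files:
            over = cur_tokens + n_tokens > MAX_TOKENS or cur_texts + n_texts > MAX_TEXTS
            if cur and over:
                batches.append(cur)  # flush before the file that would cross a limit
                cur, cur_texts, cur_tokens = [], 0, 0
            cur.append(path)         # a whole file always lands in exactly one batch
            cur_texts += n_texts
            cur_tokens += n_tokens
        if cur:
            batches.append(cur)      # final partial batch
        return batches

    # 100 + 40 texts would exceed MAX_TEXTS, so the two files end up in separate batches.
    print(group([("a.py", 100, 50_000), ("b.py", 40, 20_000)]))  # [['a.py'], ['b.py']]

As in the shipped code, a single file larger than either limit still becomes its own oversized batch; splitting within a file is left to the caller, and token counts come from the provider's count_tokens rather than a character heuristic.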