mcp-code-indexer 4.2.15__py3-none-any.whl → 4.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. mcp_code_indexer/database/database.py +334 -115
  2. mcp_code_indexer/database/database_factory.py +1 -1
  3. mcp_code_indexer/database/exceptions.py +1 -1
  4. mcp_code_indexer/database/models.py +66 -24
  5. mcp_code_indexer/database/retry_executor.py +15 -5
  6. mcp_code_indexer/file_scanner.py +107 -12
  7. mcp_code_indexer/main.py +43 -30
  8. mcp_code_indexer/server/mcp_server.py +201 -7
  9. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  10. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  11. mcp_code_indexer/vector_mode/config.py +113 -45
  12. mcp_code_indexer/vector_mode/const.py +24 -0
  13. mcp_code_indexer/vector_mode/daemon.py +860 -98
  14. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  15. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  16. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  17. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  18. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  19. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  20. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  21. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  22. mcp_code_indexer/vector_mode/types.py +46 -0
  23. mcp_code_indexer/vector_mode/utils.py +50 -0
  24. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/METADATA +13 -10
  25. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/RECORD +28 -21
  26. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/WHEEL +1 -1
  27. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/entry_points.txt +0 -0
  28. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,580 @@
+ """
+ VectorStorageService for vector storage and retrieval.
+
+ Provides a clean abstraction layer between code chunks/embeddings and the
+ vector storage backend, handling namespace management, vector formatting,
+ and error handling.
+ """
+
+ import asyncio
+ import logging
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional
+
+ from turbopuffer.types import Row
+
+ from ..chunking.ast_chunker import CodeChunk
+ from ..providers.turbopuffer_client import TurbopufferClient
+ from ..config import VectorConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class VectorStorageService:
+     """
+     Service for storing code embeddings in a vector database.
+
+     Handles namespace management, vector formatting, and provides a clean
+     abstraction layer between the daemon orchestration and vector storage.
+     """
+
+     def __init__(
+         self,
+         turbopuffer_client: TurbopufferClient,
+         embedding_dimension: int,
+         config: VectorConfig,
+     ):
+         """Initialize VectorStorageService with client, embedding dimension, and configuration."""
+         self.turbopuffer_client = turbopuffer_client
+         self.embedding_dimension = embedding_dimension
+         self.config = config
+         self._namespace_cache: Dict[str, bool] = {}  # Cache for namespace existence
+         self._api_validated = False  # Lazy validation flag
+
+     async def _ensure_api_validated(self) -> None:
+         """Validate API access lazily on first use (avoids blocking the event loop at init)."""
+         if self._api_validated:
+             return
+
+         loop = asyncio.get_running_loop()
+         await loop.run_in_executor(
+             None, self.turbopuffer_client.validate_api_access
+         )
+         self._api_validated = True
+
+     async def store_embeddings(
+         self,
+         embeddings: List[List[float]],
+         chunks: List[CodeChunk],
+         project_name: str,
+         file_path: str,
+     ) -> None:
+         """
+         Store embeddings for code chunks in Turbopuffer.
+
+         Args:
+             embeddings: List of embedding vectors
+             chunks: List of code chunks corresponding to embeddings
+             project_name: Name of the project
+             file_path: Path to the source file
+
+         Raises:
+             ValueError: If embeddings and chunks count mismatch
+             RuntimeError: If Turbopuffer operations fail
+         """
+         if not embeddings:
+             logger.debug(f"No embeddings to store for {file_path}")
+             return
+
+         if len(embeddings) != len(chunks):
+             raise ValueError(
+                 f"Embeddings and chunks count mismatch: "
+                 f"{len(embeddings)} embeddings vs {len(chunks)} chunks"
+             )
+
+         try:
+             await self._ensure_api_validated()
+             loop = asyncio.get_running_loop()
+             # Get namespace name (will be created implicitly by upsert_vectors)
+             namespace = self.turbopuffer_client.get_namespace_for_project(project_name)
+
+             # Clear existing vectors for this file to prevent redundant entries
+             logger.info(
+                 f"Clearing existing vectors for file {file_path} before upserting new ones"
+             )
+             try:
+                 delete_result = await loop.run_in_executor(
+                     None,
+                     self.turbopuffer_client.delete_vectors_for_file,
+                     namespace,
+                     file_path,
+                 )
+                 logger.info(
+                     f"Cleared {delete_result['deleted']} existing vectors for {file_path}"
+                 )
+             except Exception as e:
+                 logger.warning(f"Failed to clear existing vectors for {file_path}: {e}")
+                 # Continue with upsert even if deletion fails - better to have duplicates than no data
+
+             # Format vectors for storage
+             vectors = self._format_vectors_for_storage(
+                 embeddings, chunks, project_name, file_path
+             )
+
+             # Store in Turbopuffer
+             result = await loop.run_in_executor(
+                 None,
+                 self.turbopuffer_client.upsert_vectors,
+                 vectors,
+                 namespace,
+             )
+
+             logger.info(
+                 f"Stored {result['upserted']} vectors for {file_path} "
+                 f"in namespace {namespace}"
+             )
+
+         except Exception as e:
+             logger.error(f"Failed to store embeddings for {file_path}: {e}")
+             raise RuntimeError(f"Vector storage failed: {e}")
+
+     async def store_embeddings_batch(
+         self,
+         file_embeddings: Dict[str, List[List[float]]],
+         file_chunks: Dict[str, List[CodeChunk]],
+         project_name: str,
+     ) -> None:
+         """
+         Store embeddings for multiple files in a single batch operation.
+
+         Args:
+             file_embeddings: Dictionary mapping file paths to their embeddings
+             file_chunks: Dictionary mapping file paths to their code chunks
+             project_name: Name of the project
+
+         Raises:
+             ValueError: If embeddings and chunks count mismatch for any file
+             RuntimeError: If Turbopuffer operations fail
+         """
+         if not file_embeddings:
+             logger.debug("No embeddings to store in batch")
+             return
+
+         if set(file_embeddings.keys()) != set(file_chunks.keys()):
+             raise ValueError("file_embeddings and file_chunks must have matching keys")
+
+         # Validate embeddings and chunks counts for each file
+         for file_path in file_embeddings.keys():
+             embeddings = file_embeddings[file_path]
+             chunks = file_chunks[file_path]
+             if len(embeddings) != len(chunks):
+                 raise ValueError(
+                     f"Embeddings and chunks count mismatch for {file_path}: "
+                     f"{len(embeddings)} embeddings vs {len(chunks)} chunks"
+                 )
+
+         total_vectors = sum(len(embs) for embs in file_embeddings.values())
+         logger.info(
+             f"Batch storing {total_vectors} vectors from {len(file_embeddings)} files "
+             f"for project {project_name}"
+         )
+
+         try:
+             await self._ensure_api_validated()
+             loop = asyncio.get_running_loop()
+             # Get namespace name (will be created implicitly by batch upsert)
+             namespace = self.turbopuffer_client.get_namespace_for_project(project_name)
+
+             # Delete existing vectors for all files in a single batch operation
+             files_to_clear = list(file_embeddings.keys())
+             try:
+                 total_cleared = await self.batch_delete_vectors_for_files(
+                     project_name, files_to_clear
+                 )
+                 if total_cleared > 0:
+                     logger.info(
+                         f"Cleared {total_cleared} existing vectors from {len(files_to_clear)} files"
+                     )
+             except Exception as e:
+                 logger.warning(f"Failed to batch clear existing vectors: {e}")
+                 # Continue with batch upsert even if deletion fails
+
+             # Format all vectors for batch storage
+             all_vectors = []
+             for file_path in file_embeddings.keys():
+                 embeddings = file_embeddings[file_path]
+                 chunks = file_chunks[file_path]
+
+                 file_vectors = self._format_vectors_for_storage(
+                     embeddings, chunks, project_name, file_path
+                 )
+                 all_vectors.extend(file_vectors)
+
+             # Perform batch upsert
+             result = await loop.run_in_executor(
+                 None,
+                 self.turbopuffer_client.upsert_vectors_batch,
+                 all_vectors,
+                 namespace,
+             )
+
+             logger.info(
+                 f"Batch stored {result['upserted']} vectors from {len(file_embeddings)} files "
+                 f"in namespace {namespace}"
+             )
+
+         except Exception as e:
+             logger.error(f"Failed to batch store embeddings: {e}")
+             raise RuntimeError(f"Batch vector storage failed: {e}")
+
+     async def _ensure_namespace_exists(self, project_name: str) -> str | None:
+         """
+         Check whether the namespace for a project exists.
+
+         Args:
+             project_name: Name of the project
+
+         Returns:
+             The namespace name if it exists, None otherwise
+
+         Raises:
+             RuntimeError: If namespace operations fail
+         """
+         namespace = self.turbopuffer_client.get_namespace_for_project(project_name)
+         # Check cache first
+         if namespace in self._namespace_cache:
+             return namespace
+
+         try:
+             loop = asyncio.get_running_loop()
+             # List existing namespaces
+             existing_namespaces = await loop.run_in_executor(
+                 None, self.turbopuffer_client.list_namespaces
+             )
+             if namespace not in existing_namespaces:
+                 # Namespace doesn't exist and will be created implicitly on first write
+                 logger.info(
+                     f"Namespace '{namespace}' for project '{project_name}' does not exist"
+                 )
+                 return None
+
+             # Cache the result for existing namespace
+             self._namespace_cache[namespace] = True
+             return namespace
+
+         except Exception as e:
+             logger.error(f"Failed to ensure namespace {namespace}: {e}")
+             raise RuntimeError(f"Namespace operation failed: {e}")
+
+     def _format_vectors_for_storage(
+         self,
+         embeddings: List[List[float]],
+         chunks: List[CodeChunk],
+         project_name: str,
+         file_path: str,
+     ) -> List[Dict[str, Any]]:
+         """
+         Format embeddings and chunks into vectors suitable for Turbopuffer storage.
+
+         Args:
+             embeddings: List of embedding vectors
+             chunks: List of corresponding code chunks
+             project_name: Name of the project
+             file_path: Path to the source file
+
+         Returns:
+             List of formatted vector dictionaries
+         """
+         from datetime import datetime
+
+         vectors = []
+
+         # Get file modification time once for all chunks
+         try:
+             file_path_obj = Path(file_path)
+             mtime_unix = file_path_obj.stat().st_mtime
+             mtime_iso = datetime.fromtimestamp(mtime_unix).isoformat()
+         except (OSError, FileNotFoundError) as e:
+             logger.warning(f"Failed to get mtime for {file_path}: {e}")
+             mtime_unix = 0.0
+             mtime_iso = datetime.fromtimestamp(0.0).isoformat()
+
+         for i, (embedding, chunk) in enumerate(zip(embeddings, chunks)):
+             # Generate unique vector ID
+             vector_id = self.turbopuffer_client.generate_vector_id(project_name, i)
+
+             # Prepare metadata
+             metadata = {
+                 "project_id": project_name,
+                 "project_name": project_name,
+                 "file_path": file_path,
+                 "chunk_type": chunk.chunk_type.value,
+                 "chunk_name": chunk.name,
+                 "start_line": chunk.start_line,
+                 "end_line": chunk.end_line,
+                 "content_hash": chunk.content_hash,
+                 "language": chunk.language,
+                 "redacted": chunk.redacted,
+                 "chunk_index": i,
+                 "imports": ",".join(chunk.imports) if chunk.imports else "",
+                 "file_mtime": str(mtime_unix),
+                 "file_mtime_iso": mtime_iso,
+             }
+
+             # Add custom metadata if present
+             if chunk.metadata:
+                 metadata.update(chunk.metadata)
+
+             vector = {
+                 "id": vector_id,
+                 "values": embedding,
+                 "metadata": metadata,
+             }
+             vectors.append(vector)
+
+         logger.debug(f"Formatted {len(vectors)} vectors for storage")
+         return vectors
+
+     async def delete_vectors_for_file(self, project_name: str, file_path: str) -> None:
+         """
+         Delete all vectors associated with a specific file.
+
+         Args:
+             project_name: Name of the project
+             file_path: Path to the source file
+
+         Raises:
+             RuntimeError: If deletion fails
+         """
+         try:
+             await self._ensure_api_validated()
+             loop = asyncio.get_running_loop()
+             namespace = self.turbopuffer_client.get_namespace_for_project(project_name)
+
+             logger.info(
+                 f"Deleting vectors for file {file_path} in namespace {namespace}"
+             )
+
+             # Use the TurbopufferClient method to delete by file_path filter
+             result = await loop.run_in_executor(
+                 None,
+                 self.turbopuffer_client.delete_vectors_for_file,
+                 namespace,
+                 file_path,
+             )
+
+             logger.info(
+                 f"Successfully deleted {result['deleted']} vectors for file {file_path}"
+             )
+
+         except Exception as e:
+             logger.error(f"Failed to delete vectors for {file_path}: {e}")
+             raise RuntimeError(f"Vector deletion failed: {e}")
+
+     async def batch_delete_vectors_for_files(
+         self, project_name: str, file_paths: list[str]
+     ) -> int:
+         """
+         Delete all vectors associated with multiple files in a single batch operation.
+
+         Args:
+             project_name: Name of the project
+             file_paths: List of file paths to delete vectors for
+
+         Returns:
+             Number of vectors deleted
+
+         Raises:
+             RuntimeError: If deletion fails
+         """
+         if not file_paths:
+             return 0
+
+         try:
+             await self._ensure_api_validated()
+             loop = asyncio.get_running_loop()
+             namespace = self.turbopuffer_client.get_namespace_for_project(project_name)
+
+             logger.info(
+                 f"Batch deleting vectors for {len(file_paths)} files in namespace {namespace}"
+             )
+
+             # Create dummy vector with correct dimensions for search
+             dummy_vector = [0.0] * self.embedding_dimension
+
+             # Find all vectors for the specified files
+             def search_for_files() -> list:
+                 return self.turbopuffer_client.search_vectors(
+                     query_vector=dummy_vector,
+                     top_k=1200,  # Set high enough to catch all chunks for multiple files
+                     namespace=namespace,
+                     filters=(
+                         "And",
+                         [
+                             ("project_id", "Eq", project_name),
+                             ("file_path", "In", file_paths),
+                         ],
+                     ),
+                 )
+
+             rows = await loop.run_in_executor(None, search_for_files)
+
+             if not rows:
+                 logger.info(
+                     f"No vectors found for {len(file_paths)} files in namespace {namespace}"
+                 )
+                 return 0
+
+             # Extract vector IDs to delete
+             ids_to_delete = [row.id for row in rows]
+             logger.info(
+                 f"Found {len(ids_to_delete)} vectors to delete for {len(file_paths)} files"
+             )
+
+             # Delete vectors by ID in batch
+             delete_result = await loop.run_in_executor(
+                 None,
+                 self.turbopuffer_client.delete_vectors,
+                 ids_to_delete,
+                 namespace,
+             )
+
+             deleted_count = delete_result["deleted"]
+             logger.info(
+                 f"Batch deletion completed: removed {deleted_count} vectors "
+                 f"for {len(file_paths)} files from namespace {namespace}"
+             )
+
+             return deleted_count
+
+         except Exception as e:
+             logger.error(
+                 f"Failed to batch delete vectors for {len(file_paths)} files: {e}"
+             )
+             raise RuntimeError(f"Batch vector deletion failed: {e}")
+
+     async def search_similar_chunks(
+         self,
+         query_embedding: List[float],
+         project_name: str,
+         top_k: int = 10,
+         chunk_type: Optional[str] = None,
+         file_path: Optional[str] = None,
+     ) -> List[Row] | None:
+         """
+         Search for similar code chunks using embedding similarity.
+
+         Args:
+             query_embedding: The query embedding vector
+             project_name: Name of the project to search in
+             top_k: Number of results to return
+             chunk_type: Optional filter by chunk type
+             file_path: Optional filter by file path
+
+         Returns:
+             List of similar chunks with metadata and similarity scores
+
+         Raises:
+             RuntimeError: If search fails
+         """
+         try:
+             await self._ensure_api_validated()
+             loop = asyncio.get_running_loop()
+
+             def do_search() -> list:
+                 return self.turbopuffer_client.search_with_metadata_filter(
+                     query_vector=query_embedding,
+                     project_id=project_name,
+                     chunk_type=chunk_type,
+                     file_path=file_path,
+                     top_k=top_k,
+                 )
+
+             results = await loop.run_in_executor(None, do_search)
+
+             logger.debug(f"Found {len(results)} similar chunks in {project_name}")
+             return results
+
+         except Exception as e:
+             logger.error(f"Failed to search similar chunks: {e}")
+             raise RuntimeError(f"Vector search failed: {e}")
+
+     async def get_file_metadata(
+         self, project_name: str, file_paths: list[str] | None = None
+     ) -> dict[str, float]:
+         """
+         Retrieve file modification times from stored vector metadata.
+
+         Args:
+             project_name: Name of the project
+             file_paths: Optional list of specific file paths to query. If None, queries all files.
+
+         Returns:
+             Dictionary mapping file_path to mtime (Unix timestamp)
+
+         Raises:
+             RuntimeError: If metadata query fails
+         """
+         try:
+             await self._ensure_api_validated()
+             loop = asyncio.get_running_loop()
+             namespace = await self._ensure_namespace_exists(project_name)
+
+             # If namespace doesn't exist, return empty metadata
+             if namespace is None:
+                 logger.debug(
+                     f"No namespace found for project {project_name}, returning empty metadata"
+                 )
+                 return {}
+
+             # Create dummy vector with correct dimensions
+             dummy_vector = [0.0] * self.embedding_dimension
+
+             if file_paths is None or len(file_paths) == 0:
+                 # Query all files for the project
+                 def search_all_files() -> list:
+                     return self.turbopuffer_client.search_vectors(
+                         query_vector=dummy_vector,  # Dummy vector since we only want metadata
+                         top_k=1200,  # High limit to get all files (warning: results may still be truncated)
+                         namespace=namespace,
+                         filters=("project_id", "Eq", project_name),
+                     )
+
+                 rows = await loop.run_in_executor(None, search_all_files)
+             else:
+                 # Query specific files
+                 def search_specific_files() -> list:
+                     return self.turbopuffer_client.search_vectors(
+                         query_vector=dummy_vector,
+                         top_k=1200,
+                         namespace=namespace,
+                         filters=(
+                             "And",
+                             [
+                                 ("project_id", "Eq", project_name),
+                                 ("file_path", "In", file_paths),
+                             ],
+                         ),
+                     )
+
+                 rows = await loop.run_in_executor(None, search_specific_files)
+
+             if not rows:
+                 logger.debug(
+                     f"No rows found for project {project_name}, returning empty metadata"
+                 )
+                 return {}
+
+             # Extract file metadata, keeping only the most recent mtime per file
+             file_metadata = {}
+             for row in rows:
+                 # Check if row has file_path and file_mtime attributes
+                 if hasattr(row, "file_path") and hasattr(row, "file_mtime"):
+                     file_path = row.file_path
+                     mtime = float(row.file_mtime)
+
+                     # Keep the most recent mtime for each file
+                     if (
+                         file_path not in file_metadata
+                         or mtime > file_metadata[file_path]
+                     ):
+                         file_metadata[file_path] = mtime
+
+             logger.debug(
+                 f"Retrieved metadata for {len(file_metadata)} files from {project_name}"
+             )
+             return file_metadata
+
+         except Exception as e:
+             logger.error(f"Failed to get file metadata for {project_name}: {e}")
+             return {}
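Illustrative usage sketch (not part of the diff): the service wraps every blocking Turbopuffer call in run_in_executor and validates API access lazily on first use. The wiring below is an assumption for illustration only; the VectorConfig and TurbopufferClient constructor arguments and the embedding dimension are placeholders, not the package's documented defaults.

    import asyncio

    from mcp_code_indexer.vector_mode.config import VectorConfig
    from mcp_code_indexer.vector_mode.providers.turbopuffer_client import TurbopufferClient
    from mcp_code_indexer.vector_mode.services.vector_storage_service import VectorStorageService

    async def main() -> None:
        config = VectorConfig()                    # assumed: default-constructible config
        client = TurbopufferClient(config=config)  # assumed constructor signature
        storage = VectorStorageService(
            turbopuffer_client=client,
            embedding_dimension=1024,              # must match the embedding model in use
            config=config,
        )
        # Query for similar chunks; a real query embedding would come from the
        # embedding provider rather than this zero vector.
        results = await storage.search_similar_chunks(
            query_embedding=[0.0] * 1024,
            project_name="example-project",
            top_k=5,
        )
        print(results)

    asyncio.run(main())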
@@ -0,0 +1,46 @@
+ """Type definitions for vector mode daemon tasks."""
+
+ from enum import Enum
+ from typing import TypedDict, Literal
+
+ from mcp_code_indexer.vector_mode.monitoring.change_detector import FileChange
+
+
+ class VectorDaemonTaskType(Enum):
+     """Task types for daemon processing queue."""
+
+     SCAN_PROJECT = "scan_project"
+     PROCESS_FILE_CHANGE = "process_file_change"
+     INITIAL_PROJECT_EMBEDDING = "initial_project_embedding"
+
+
+ class BaseTask(TypedDict):
+     """Base task with common fields."""
+
+     project_name: str
+     timestamp: float
+
+
+ class ScanProjectTask(BaseTask):
+     """Task for scanning and indexing a project."""
+
+     type: Literal[VectorDaemonTaskType.SCAN_PROJECT]
+     folder_path: str
+
+
+ class ProcessFileChangeTask(BaseTask):
+     """Task for processing file changes."""
+
+     type: Literal[VectorDaemonTaskType.PROCESS_FILE_CHANGE]
+     change: "FileChange"  # Forward reference to avoid circular import
+
+
+ class InitialProjectEmbeddingTask(BaseTask):
+     """Task for performing initial project embedding."""
+
+     type: Literal[VectorDaemonTaskType.INITIAL_PROJECT_EMBEDDING]
+     folder_path: str
+
+
+ # Union type for all task types
+ TaskItem = ScanProjectTask | ProcessFileChangeTask | InitialProjectEmbeddingTask
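Illustrative usage sketch (not part of the diff): each TypedDict ties a payload shape to one VectorDaemonTaskType literal, so a queue consumer can branch on task["type"]. The handle_task function below is a hypothetical consumer, not part of the package.

    import time

    from mcp_code_indexer.vector_mode.types import (
        ScanProjectTask,
        TaskItem,
        VectorDaemonTaskType,
    )

    def handle_task(task: TaskItem) -> None:
        # Branching on the shared "type" key selects the matching task shape.
        if task["type"] is VectorDaemonTaskType.SCAN_PROJECT:
            print(f"scan {task['folder_path']} for {task['project_name']}")
        elif task["type"] is VectorDaemonTaskType.PROCESS_FILE_CHANGE:
            print(f"file change in {task['project_name']}: {task['change']}")
        else:
            print(f"initial embedding for {task['project_name']}")

    task: ScanProjectTask = {
        "type": VectorDaemonTaskType.SCAN_PROJECT,
        "project_name": "example-project",
        "folder_path": "/path/to/project",
        "timestamp": time.time(),
    }
    handle_task(task)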
@@ -0,0 +1,50 @@
+ """
+ Utility functions for vector mode operations.
+
+ Common utilities shared across vector mode components for file processing,
+ path handling, and pattern matching operations.
+ """
+
+ import re
+ import fnmatch
+ from pathlib import Path
+ from typing import List
+
+
+ def should_ignore_path(
+     path: Path, project_root: Path, ignore_patterns: List[str]
+ ) -> bool:
+     """
+     Check if a path should be ignored based on ignore patterns.
+
+     Args:
+         path: Path to check for ignoring
+         project_root: Root path of the project
+         ignore_patterns: List of glob patterns to match against
+
+     Returns:
+         True if the path should be ignored, False otherwise
+
+     Note:
+         Paths that are not relative to project_root are treated as ignored.
+     """
+     try:
+         relative_path = path.relative_to(project_root)
+         path_str = str(relative_path)
+
+         # Compile ignore patterns for matching
+         compiled_patterns = [fnmatch.translate(pattern) for pattern in ignore_patterns]
+
+         # Check if path matches any ignore pattern
+         for pattern in compiled_patterns:
+             if re.match(pattern, path_str):
+                 return True
+
+         return False
+
+     except ValueError:
+         # Path is not relative to project root - should be ignored
+         return True
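Illustrative usage sketch (not part of the diff): should_ignore_path matches a path relative to the project root against translated glob patterns and treats anything outside the root as ignored. The patterns and paths below are examples, not the package's defaults.

    from pathlib import Path

    from mcp_code_indexer.vector_mode.utils import should_ignore_path

    root = Path("/repo")
    patterns = ["*.pyc", "node_modules/*", ".git/*"]  # example patterns only

    print(should_ignore_path(Path("/repo/src/app.py"), root, patterns))                 # False
    print(should_ignore_path(Path("/repo/node_modules/lib/index.js"), root, patterns))  # True
    print(should_ignore_path(Path("/tmp/outside.py"), root, patterns))                  # True (outside project root)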