crackerjack-0.38.14-py3-none-any.whl → crackerjack-0.39.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of crackerjack might be problematic.

Files changed (39)
  1. crackerjack/__main__.py +134 -13
  2. crackerjack/agents/__init__.py +2 -0
  3. crackerjack/agents/base.py +1 -0
  4. crackerjack/agents/claude_code_bridge.py +319 -0
  5. crackerjack/agents/coordinator.py +6 -3
  6. crackerjack/agents/dry_agent.py +187 -3
  7. crackerjack/agents/enhanced_coordinator.py +279 -0
  8. crackerjack/agents/enhanced_proactive_agent.py +185 -0
  9. crackerjack/agents/performance_agent.py +324 -3
  10. crackerjack/agents/refactoring_agent.py +254 -5
  11. crackerjack/agents/semantic_agent.py +479 -0
  12. crackerjack/agents/semantic_helpers.py +356 -0
  13. crackerjack/cli/options.py +27 -0
  14. crackerjack/cli/semantic_handlers.py +290 -0
  15. crackerjack/core/async_workflow_orchestrator.py +9 -8
  16. crackerjack/core/enhanced_container.py +1 -1
  17. crackerjack/core/phase_coordinator.py +1 -1
  18. crackerjack/core/proactive_workflow.py +1 -1
  19. crackerjack/core/workflow_orchestrator.py +9 -6
  20. crackerjack/documentation/ai_templates.py +1 -1
  21. crackerjack/interactive.py +1 -1
  22. crackerjack/mcp/server_core.py +2 -0
  23. crackerjack/mcp/tools/__init__.py +2 -0
  24. crackerjack/mcp/tools/semantic_tools.py +584 -0
  25. crackerjack/models/semantic_models.py +271 -0
  26. crackerjack/plugins/loader.py +2 -2
  27. crackerjack/py313.py +4 -1
  28. crackerjack/services/embeddings.py +444 -0
  29. crackerjack/services/initialization.py +1 -1
  30. crackerjack/services/quality_intelligence.py +11 -1
  31. crackerjack/services/smart_scheduling.py +1 -1
  32. crackerjack/services/status_authentication.py +3 -3
  33. crackerjack/services/vector_store.py +681 -0
  34. crackerjack/slash_commands/run.md +84 -50
  35. {crackerjack-0.38.14.dist-info → crackerjack-0.39.0.dist-info}/METADATA +7 -2
  36. {crackerjack-0.38.14.dist-info → crackerjack-0.39.0.dist-info}/RECORD +39 -29
  37. {crackerjack-0.38.14.dist-info → crackerjack-0.39.0.dist-info}/WHEEL +0 -0
  38. {crackerjack-0.38.14.dist-info → crackerjack-0.39.0.dist-info}/entry_points.txt +0 -0
  39. {crackerjack-0.38.14.dist-info → crackerjack-0.39.0.dist-info}/licenses/LICENSE +0 -0
crackerjack/services/vector_store.py (new file)
@@ -0,0 +1,681 @@
+"""Core vector store service for semantic search functionality."""
+
+import json
+import logging
+import sqlite3
+import tempfile
+import typing as t
+from contextlib import contextmanager
+from datetime import datetime
+from pathlib import Path
+
+from ..models.semantic_models import (
+    EmbeddingVector,
+    IndexingProgress,
+    IndexStats,
+    SearchQuery,
+    SearchResult,
+    SemanticConfig,
+)
+from .embeddings import EmbeddingService
+
+logger = logging.getLogger(__name__)
+
+
+class VectorStore:
+    """Core vector store for managing embeddings and semantic search."""
+
+    def __init__(
+        self,
+        config: SemanticConfig,
+        db_path: Path | None = None,
+        embedding_service: EmbeddingService | None = None,
+    ) -> None:
+        """Initialize the vector store.
+
+        Args:
+            config: Semantic search configuration
+            db_path: Optional path to SQLite database (uses temp file if None)
+            embedding_service: Optional embedding service (creates new if None)
+        """
+        self.config = config
+        self.embedding_service = embedding_service or EmbeddingService(config)
+
+        # Database setup
+        if db_path is None:
+            # Create temporary database file
+            self._temp_db = tempfile.NamedTemporaryFile(
+                suffix=".db", delete=False, prefix="crackerjack_vectors_"
+            )
+            self.db_path = Path(self._temp_db.name)
+        else:
+            self.db_path = db_path
+            self._temp_db = None
+
+        self._initialize_database()
+
+    def _initialize_database(self) -> None:
+        """Initialize SQLite database with required tables."""
+        with self._get_connection() as conn:
+            # Create embeddings table
+            conn.execute("""
+                CREATE TABLE IF NOT EXISTS embeddings (
+                    chunk_id TEXT PRIMARY KEY,
+                    file_path TEXT NOT NULL,
+                    content TEXT NOT NULL,
+                    embedding BLOB NOT NULL,
+                    created_at TEXT NOT NULL,
+                    file_hash TEXT NOT NULL,
+                    start_line INTEGER NOT NULL,
+                    end_line INTEGER NOT NULL,
+                    file_type TEXT NOT NULL
+                )
+            """)
+
+            # Create indexes for performance
+            conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_file_path ON embeddings(file_path)
+            """)
+            conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_file_hash ON embeddings(file_hash)
+            """)
+            conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_file_type ON embeddings(file_type)
+            """)
+
+            # Create file tracking table
+            conn.execute("""
+                CREATE TABLE IF NOT EXISTS file_tracking (
+                    file_path TEXT PRIMARY KEY,
+                    file_hash TEXT NOT NULL,
+                    last_indexed TEXT NOT NULL,
+                    chunk_count INTEGER NOT NULL DEFAULT 0
+                )
+            """)
+
+            conn.commit()
+
+    @contextmanager
+    def _get_connection(self) -> t.Iterator[sqlite3.Connection]:
+        """Get a database connection with proper error handling."""
+        conn = None
+        try:
+            conn = sqlite3.connect(self.db_path)
+            conn.row_factory = sqlite3.Row
+            yield conn
+        except Exception as e:
+            if conn:
+                conn.rollback()
+            logger.error(f"Database error: {e}")
+            raise
+        finally:
+            if conn:
+                conn.close()
+
+    def index_file(
+        self,
+        file_path: Path,
+        progress_callback: t.Callable[[IndexingProgress], None] | None = None,
+    ) -> list[EmbeddingVector]:
+        """Index a single file and return created embeddings.
+
+        Args:
+            file_path: Path to file to index
+            progress_callback: Optional callback for progress updates
+
+        Returns:
+            List of embedding vectors created for the file
+
+        Raises:
+            OSError: If file cannot be read
+            ValueError: If file is too large or has unsupported extension
+        """
+        # Validate file and check if reindexing is needed
+        current_hash = self._prepare_file_for_indexing(file_path)
+        if current_hash is None:  # File up to date
+            return self._get_existing_embeddings(file_path)
+
+        logger.info(f"Indexing file: {file_path}")
+
+        try:
+            # Process file content into chunks and metadata
+            chunk_data = self._process_file_content(file_path, current_hash)
+            if not chunk_data["chunks"]:
+                logger.warning(f"No chunks generated for file: {file_path}")
+                return []
+
+            # Generate embeddings and create vector objects
+            embeddings = self._create_embedding_vectors(
+                file_path, current_hash, chunk_data, progress_callback
+            )
+
+            # Store results and update tracking
+            self._store_embeddings(embeddings)
+            self._update_file_tracking(file_path, current_hash, len(embeddings))
+
+            logger.info(
+                f"Successfully indexed {len(embeddings)} chunks from {file_path}"
+            )
+            return embeddings
+
+        except Exception as e:
+            logger.error(f"Failed to index file {file_path}: {e}")
+            raise
+
+    def _prepare_file_for_indexing(self, file_path: Path) -> str | None:
+        """Prepare file for indexing and return hash if reindexing needed.
+
+        Args:
+            file_path: Path to prepare for indexing
+
+        Returns:
+            File hash if reindexing needed, None if file is up to date
+        """
+        self._validate_file_for_indexing(file_path)
+
+        current_hash = self.embedding_service.get_file_hash(file_path)
+        if not self._needs_reindexing(file_path, current_hash):
+            logger.debug(f"File {file_path} is up to date, skipping")
+            return None
+
+        return current_hash
+
+    def _process_file_content(
+        self, file_path: Path, current_hash: str
+    ) -> dict[str, t.Any]:
+        """Process file content into chunks and metadata.
+
+        Args:
+            file_path: Path to process
+            current_hash: File hash for chunk IDs
+
+        Returns:
+            Dictionary with chunks, texts, and metadata
+        """
+        content = file_path.read_text(encoding="utf-8")
+        chunks = self.embedding_service.chunk_text(content)
+
+        chunk_texts = []
+        chunk_metadata = []
+
+        for i, chunk_content in enumerate(chunks):
+            chunk_id = f"{file_path.stem}_{current_hash[:8]}_{i}"
+            start_line = i * (self.config.chunk_size // 50) + 1  # Rough estimate
+            end_line = start_line + (len(chunk_content.split("\n")) - 1)
+
+            chunk_texts.append(chunk_content)
+            chunk_metadata.append(
+                {
+                    "chunk_id": chunk_id,
+                    "start_line": start_line,
+                    "end_line": end_line,
+                }
+            )
+
+        return {
+            "chunks": chunks,
+            "chunk_texts": chunk_texts,
+            "chunk_metadata": chunk_metadata,
+        }
+
+    def _create_embedding_vectors(
+        self,
+        file_path: Path,
+        current_hash: str,
+        chunk_data: dict[str, t.Any],
+        progress_callback: t.Callable[[IndexingProgress], None] | None = None,
+    ) -> list[EmbeddingVector]:
+        """Create embedding vectors from chunk data.
+
+        Args:
+            file_path: Path being indexed
+            current_hash: File hash
+            chunk_data: Processed chunk data
+            progress_callback: Optional progress callback
+
+        Returns:
+            List of embedding vectors
+        """
+        chunk_texts = chunk_data["chunk_texts"]
+        chunk_metadata = chunk_data["chunk_metadata"]
+
+        # Generate embeddings in batch for efficiency
+        embedding_vectors = self.embedding_service.generate_embeddings_batch(
+            chunk_texts
+        )
+
+        embeddings = []
+        for i, (embedding_vector, metadata) in enumerate(
+            zip(embedding_vectors, chunk_metadata)
+        ):
+            if not embedding_vector:  # Skip empty embeddings
+                continue
+
+            # Progress callback
+            if progress_callback:
+                progress = IndexingProgress(
+                    current_file=file_path,
+                    files_processed=0,
+                    total_files=1,
+                    chunks_created=i,
+                    elapsed_time=0.0,
+                )
+                progress_callback(progress)
+
+            embedding = EmbeddingVector(
+                file_path=file_path,
+                chunk_id=metadata["chunk_id"],
+                content=chunk_texts[i],
+                embedding=embedding_vector,
+                created_at=datetime.now(),
+                file_hash=current_hash,
+                start_line=metadata["start_line"],
+                end_line=metadata["end_line"],
+                file_type=file_path.suffix,
+            )
+            embeddings.append(embedding)
+
+        return embeddings
+
+    def _validate_file_for_indexing(self, file_path: Path) -> None:
+        """Validate that a file can be indexed.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ValueError: If file cannot be indexed
+            OSError: If file cannot be accessed
+        """
+        if not file_path.exists():
+            raise OSError(f"File does not exist: {file_path}")
+
+        if not file_path.is_file():
+            raise ValueError(f"Path is not a file: {file_path}")
+
+        # Check file size
+        file_size_mb = file_path.stat().st_size / (1024 * 1024)
+        if file_size_mb > self.config.max_file_size_mb:
+            raise ValueError(
+                f"File too large: {file_size_mb:.1f}MB > {self.config.max_file_size_mb}MB"
+            )
+
+        # Check file extension
+        if (
+            self.config.included_extensions
+            and file_path.suffix not in self.config.included_extensions
+        ):
+            raise ValueError(f"File extension not included: {file_path.suffix}")
+
+        # Check exclusion patterns
+        file_str = str(file_path)
+        for pattern in self.config.excluded_patterns:
+            if self._matches_pattern(file_str, pattern):
+                raise ValueError(f"File matches exclusion pattern: {pattern}")
+
+    def _matches_pattern(self, file_path: str, pattern: str) -> bool:
+        """Check if file path matches exclusion pattern."""
+        import fnmatch
+
+        return fnmatch.fnmatch(file_path, pattern)
+
+    def _needs_reindexing(self, file_path: Path, current_hash: str) -> bool:
+        """Check if file needs to be reindexed.
+
+        Args:
+            file_path: Path to check
+            current_hash: Current file hash
+
+        Returns:
+            True if file needs reindexing
+        """
+        with self._get_connection() as conn:
+            cursor = conn.execute(
+                "SELECT file_hash FROM file_tracking WHERE file_path = ?",
+                (str(file_path),),
+            )
+            row = cursor.fetchone()
+
+            if row is None:
+                return True  # File not indexed yet
+
+            return row["file_hash"] != current_hash
+
+    def _get_existing_embeddings(self, file_path: Path) -> list[EmbeddingVector]:
+        """Get existing embeddings for a file.
+
+        Args:
+            file_path: Path to get embeddings for
+
+        Returns:
+            List of existing embeddings
+        """
+        embeddings = []
+
+        with self._get_connection() as conn:
+            cursor = conn.execute(
+                """SELECT chunk_id, file_path, content, embedding, created_at,
+                          file_hash, start_line, end_line, file_type
+                   FROM embeddings WHERE file_path = ?""",
+                (str(file_path),),
+            )
+
+            for row in cursor.fetchall():
+                # Deserialize embedding
+                embedding_data = json.loads(row["embedding"])
+
+                embedding = EmbeddingVector(
+                    file_path=Path(row["file_path"]),
+                    chunk_id=row["chunk_id"],
+                    content=row["content"],
+                    embedding=embedding_data,
+                    created_at=datetime.fromisoformat(row["created_at"]),
+                    file_hash=row["file_hash"],
+                    start_line=row["start_line"],
+                    end_line=row["end_line"],
+                    file_type=row["file_type"],
+                )
+                embeddings.append(embedding)
+
+        return embeddings
+
+    def _store_embeddings(self, embeddings: list[EmbeddingVector]) -> None:
+        """Store embeddings in database.
+
+        Args:
+            embeddings: List of embeddings to store
+        """
+        if not embeddings:
+            return
+
+        with self._get_connection() as conn:
+            # Remove existing embeddings for these files
+            file_paths = {str(emb.file_path) for emb in embeddings}
+            for file_path in file_paths:
+                conn.execute("DELETE FROM embeddings WHERE file_path = ?", (file_path,))
+
+            # Insert new embeddings
+            for embedding in embeddings:
+                conn.execute(
+                    """
+                    INSERT INTO embeddings
+                    (chunk_id, file_path, content, embedding, created_at,
+                     file_hash, start_line, end_line, file_type)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    """,
+                    (
+                        embedding.chunk_id,
+                        str(embedding.file_path),
+                        embedding.content,
+                        json.dumps(embedding.embedding),
+                        embedding.created_at.isoformat(),
+                        embedding.file_hash,
+                        embedding.start_line,
+                        embedding.end_line,
+                        embedding.file_type,
+                    ),
+                )
+
+            conn.commit()
+
+    def _update_file_tracking(
+        self, file_path: Path, file_hash: str, chunk_count: int
+    ) -> None:
+        """Update file tracking information.
+
+        Args:
+            file_path: Path of indexed file
+            file_hash: Hash of file content
+            chunk_count: Number of chunks created
+        """
+        with self._get_connection() as conn:
+            conn.execute(
+                """
+                INSERT OR REPLACE INTO file_tracking
+                (file_path, file_hash, last_indexed, chunk_count)
+                VALUES (?, ?, ?, ?)
+                """,
+                (str(file_path), file_hash, datetime.now().isoformat(), chunk_count),
+            )
+            conn.commit()
+
+    def search(self, query: SearchQuery) -> list[SearchResult]:
+        """Perform semantic search and return results.
+
+        Args:
+            query: Search query with parameters
+
+        Returns:
+            List of search results sorted by similarity score
+        """
+        # Generate embedding for query
+        query_embedding = self.embedding_service.generate_embedding(query.query)
+
+        # Get all embeddings from database
+        embeddings_data = self._get_all_embeddings(query.file_types)
+
+        if not embeddings_data:
+            return []
+
+        # Calculate similarities
+        similarities = self.embedding_service.calculate_similarities_batch(
+            query_embedding, [data["embedding"] for data in embeddings_data]
+        )
+
+        # Create search results
+        results = []
+        for i, (data, similarity) in enumerate(zip(embeddings_data, similarities)):
+            if similarity >= query.min_similarity:
+                # Get context lines if requested
+                context_lines = []
+                if query.include_context:
+                    context_lines = self._get_context_lines(
+                        Path(data["file_path"]),
+                        data["start_line"],
+                        data["end_line"],
+                        query.context_lines,
+                    )
+
+                result = SearchResult(
+                    file_path=Path(data["file_path"]),
+                    chunk_id=data["chunk_id"],
+                    content=data["content"],
+                    similarity_score=similarity,
+                    start_line=data["start_line"],
+                    end_line=data["end_line"],
+                    file_type=data["file_type"],
+                    context_lines=context_lines,
+                )
+                results.append(result)
+
+        # Sort by similarity score (descending) and limit results
+        results.sort(key=lambda x: x.similarity_score, reverse=True)
+        return results[: query.max_results]
+
+    def _get_all_embeddings(
+        self, file_types: list[str] | None = None
+    ) -> list[dict[str, t.Any]]:
+        """Get all embeddings from database with optional file type filtering.
+
+        Args:
+            file_types: Optional list of file types to filter by
+
+        Returns:
+            List of embedding data dictionaries
+        """
+        embeddings_data = []
+
+        with self._get_connection() as conn:
+            if file_types:
+                # Build parameterized query safely with proper placeholders
+                placeholders = ",".join("?" * len(file_types))
+                # Use static query template with placeholders - safe from injection
+                query_template = (
+                    "SELECT chunk_id, file_path, content, embedding, start_line, end_line, file_type "
+                    "FROM embeddings WHERE file_type IN ({})"
+                )
+                query_sql = query_template.format(placeholders)  # nosec B608
+                cursor = conn.execute(query_sql, file_types)
+            else:
+                cursor = conn.execute("""
+                    SELECT chunk_id, file_path, content, embedding, start_line, end_line, file_type
+                    FROM embeddings
+                """)
+
+            for row in cursor.fetchall():
+                data = {
+                    "chunk_id": row["chunk_id"],
+                    "file_path": row["file_path"],
+                    "content": row["content"],
+                    "embedding": json.loads(row["embedding"]),
+                    "start_line": row["start_line"],
+                    "end_line": row["end_line"],
+                    "file_type": row["file_type"],
+                }
+                embeddings_data.append(data)
+
+        return embeddings_data
+
+    def _get_context_lines(
+        self, file_path: Path, start_line: int, end_line: int, context_count: int
+    ) -> list[str]:
+        """Get context lines around a text chunk.
+
+        Args:
+            file_path: Path to source file
+            start_line: Starting line of chunk
+            end_line: Ending line of chunk
+            context_count: Number of context lines to include
+
+        Returns:
+            List of context lines
+        """
+        try:
+            if not file_path.exists():
+                return []
+
+            lines = file_path.read_text(encoding="utf-8").splitlines()
+
+            # Calculate context range
+            context_start = max(0, start_line - context_count - 1)
+            context_end = min(len(lines), end_line + context_count)
+
+            return lines[context_start:context_end]
+
+        except Exception as e:
+            logger.warning(f"Failed to get context lines for {file_path}: {e}")
+            return []
+
+    def get_stats(self) -> IndexStats:
+        """Get statistics about the vector store index.
+
+        Returns:
+            Index statistics
+        """
+        with self._get_connection() as conn:
+            # Get total counts
+            cursor = conn.execute("SELECT COUNT(*) as total_chunks FROM embeddings")
+            total_chunks = cursor.fetchone()["total_chunks"]
+
+            cursor = conn.execute(
+                "SELECT COUNT(DISTINCT file_path) as total_files FROM embeddings"
+            )
+            total_files = cursor.fetchone()["total_files"]
+
+            # Get file type distribution
+            cursor = conn.execute("""
+                SELECT file_type, COUNT(*) as count
+                FROM embeddings
+                GROUP BY file_type
+            """)
+            file_types = {row["file_type"]: row["count"] for row in cursor.fetchall()}
+
+            # Get last update time
+            cursor = conn.execute(
+                "SELECT MAX(created_at) as last_updated FROM embeddings"
+            )
+            last_updated_str = cursor.fetchone()["last_updated"]
+            last_updated = (
+                datetime.fromisoformat(last_updated_str)
+                if last_updated_str
+                else datetime.now()
+            )
+
+            # Calculate average chunk size
+            cursor = conn.execute(
+                "SELECT AVG(LENGTH(content)) as avg_size FROM embeddings"
+            )
+            avg_chunk_size = cursor.fetchone()["avg_size"] or 0.0
+
+            # Estimate index size (rough approximation)
+            index_size_mb = (total_chunks * 384 * 4) / (
+                1024 * 1024
+            )  # Assuming 384-dim embeddings
+
+            return IndexStats(
+                total_files=total_files,
+                total_chunks=total_chunks,
+                index_size_mb=index_size_mb,
+                last_updated=last_updated,
+                file_types=file_types,
+                embedding_model=self.config.embedding_model,
+                avg_chunk_size=avg_chunk_size,
+            )
+
+    def remove_file(self, file_path: Path) -> bool:
+        """Remove a file's embeddings from the index.
+
+        Args:
+            file_path: Path of file to remove
+
+        Returns:
+            True if file was removed, False if not found
+        """
+        with self._get_connection() as conn:
+            # Check if file exists in index
+            cursor = conn.execute(
+                "SELECT COUNT(*) as count FROM embeddings WHERE file_path = ?",
+                (str(file_path),),
+            )
+            count = cursor.fetchone()["count"]
+
+            if count == 0:
+                return False
+
+            # Remove embeddings
+            conn.execute(
+                "DELETE FROM embeddings WHERE file_path = ?", (str(file_path),)
+            )
+
+            # Remove from file tracking
+            conn.execute(
+                "DELETE FROM file_tracking WHERE file_path = ?", (str(file_path),)
+            )
+
+            conn.commit()
+            logger.info(f"Removed {count} embeddings for file: {file_path}")
+            return True
+
+    def clear_index(self) -> None:
+        """Clear all embeddings from the index."""
+        with self._get_connection() as conn:
+            conn.execute("DELETE FROM embeddings")
+            conn.execute("DELETE FROM file_tracking")
+            conn.commit()
+            logger.info("Cleared all embeddings from index")
+
+    def close(self) -> None:
+        """Clean up resources."""
+        if self._temp_db:
+            self._temp_db.close()
+            if self.db_path.exists():
+                self.db_path.unlink()
+            logger.debug("Cleaned up temporary database")
+
+    def __enter__(self) -> "VectorStore":
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type: t.Any, exc_val: t.Any, exc_tb: t.Any) -> None:
+        """Context manager exit."""
+        self.close()
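
For orientation, here is a minimal usage sketch of the new VectorStore shown above. It is not taken from the package itself: the keyword arguments to SemanticConfig and SearchQuery are assumptions inferred from the attributes this file reads (chunk_size, max_file_size_mb, included_extensions, excluded_patterns, embedding_model, min_similarity, max_results, include_context, context_lines, file_types); the real constructors live in crackerjack/models/semantic_models.py, which is not reproduced in this diff.

    from pathlib import Path

    from crackerjack.models.semantic_models import SearchQuery, SemanticConfig
    from crackerjack.services.vector_store import VectorStore

    # Hypothetical configuration values; field names mirror the attributes
    # referenced in vector_store.py, not a confirmed SemanticConfig signature.
    config = SemanticConfig(
        embedding_model="all-MiniLM-L6-v2",  # placeholder model name
        chunk_size=500,
        max_file_size_mb=5,
        included_extensions=[".py", ".md"],
        excluded_patterns=["*/.venv/*", "*/node_modules/*"],
    )

    # Used as a context manager so the temporary SQLite database is removed
    # on exit (see __enter__/__exit__ and close() above).
    with VectorStore(config) as store:
        store.index_file(Path("crackerjack/services/embeddings.py"))

        query = SearchQuery(
            query="cosine similarity between embeddings",
            max_results=5,
            min_similarity=0.3,
            include_context=True,
            context_lines=2,
            file_types=[".py"],
        )
        for result in store.search(query):
            print(f"{result.file_path}:{result.start_line} "
                  f"score={result.similarity_score:.2f}")

Note the design visible in the code: embeddings are JSON-serialized into a plain SQLite table, and search() does a brute-force similarity scan over every stored chunk rather than using an approximate-nearest-neighbor index, which keeps the store dependency-free but scales linearly with index size.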