crackerjack 0.38.15__py3-none-any.whl → 0.39.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crackerjack might be problematic.
- crackerjack/__main__.py +134 -13
- crackerjack/agents/__init__.py +2 -0
- crackerjack/agents/base.py +1 -0
- crackerjack/agents/claude_code_bridge.py +319 -0
- crackerjack/agents/coordinator.py +6 -3
- crackerjack/agents/dry_agent.py +187 -3
- crackerjack/agents/enhanced_coordinator.py +279 -0
- crackerjack/agents/enhanced_proactive_agent.py +185 -0
- crackerjack/agents/performance_agent.py +324 -3
- crackerjack/agents/refactoring_agent.py +254 -5
- crackerjack/agents/semantic_agent.py +479 -0
- crackerjack/agents/semantic_helpers.py +356 -0
- crackerjack/cli/options.py +27 -0
- crackerjack/cli/semantic_handlers.py +290 -0
- crackerjack/core/async_workflow_orchestrator.py +9 -8
- crackerjack/core/enhanced_container.py +1 -1
- crackerjack/core/phase_coordinator.py +1 -1
- crackerjack/core/proactive_workflow.py +1 -1
- crackerjack/core/workflow_orchestrator.py +9 -6
- crackerjack/documentation/ai_templates.py +1 -1
- crackerjack/interactive.py +1 -1
- crackerjack/mcp/server_core.py +2 -0
- crackerjack/mcp/tools/__init__.py +2 -0
- crackerjack/mcp/tools/semantic_tools.py +584 -0
- crackerjack/models/semantic_models.py +271 -0
- crackerjack/plugins/loader.py +2 -2
- crackerjack/py313.py +4 -1
- crackerjack/services/embeddings.py +444 -0
- crackerjack/services/quality_intelligence.py +11 -1
- crackerjack/services/smart_scheduling.py +1 -1
- crackerjack/services/vector_store.py +681 -0
- crackerjack/slash_commands/run.md +84 -50
- {crackerjack-0.38.15.dist-info → crackerjack-0.39.1.dist-info}/METADATA +7 -2
- {crackerjack-0.38.15.dist-info → crackerjack-0.39.1.dist-info}/RECORD +37 -27
- {crackerjack-0.38.15.dist-info → crackerjack-0.39.1.dist-info}/WHEEL +0 -0
- {crackerjack-0.38.15.dist-info → crackerjack-0.39.1.dist-info}/entry_points.txt +0 -0
- {crackerjack-0.38.15.dist-info → crackerjack-0.39.1.dist-info}/licenses/LICENSE +0 -0
crackerjack/services/vector_store.py (new file)

@@ -0,0 +1,681 @@

```python
"""Core vector store service for semantic search functionality."""

import json
import logging
import sqlite3
import tempfile
import typing as t
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path

from ..models.semantic_models import (
    EmbeddingVector,
    IndexingProgress,
    IndexStats,
    SearchQuery,
    SearchResult,
    SemanticConfig,
)
from .embeddings import EmbeddingService

logger = logging.getLogger(__name__)


class VectorStore:
    """Core vector store for managing embeddings and semantic search."""

    def __init__(
        self,
        config: SemanticConfig,
        db_path: Path | None = None,
        embedding_service: EmbeddingService | None = None,
    ) -> None:
        """Initialize the vector store.

        Args:
            config: Semantic search configuration
            db_path: Optional path to SQLite database (uses temp file if None)
            embedding_service: Optional embedding service (creates new if None)
        """
        self.config = config
        self.embedding_service = embedding_service or EmbeddingService(config)

        # Database setup
        if db_path is None:
            # Create temporary database file
            self._temp_db = tempfile.NamedTemporaryFile(
                suffix=".db", delete=False, prefix="crackerjack_vectors_"
            )
            self.db_path = Path(self._temp_db.name)
        else:
            self.db_path = db_path
            self._temp_db = None

        self._initialize_database()

    def _initialize_database(self) -> None:
        """Initialize SQLite database with required tables."""
        with self._get_connection() as conn:
            # Create embeddings table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS embeddings (
                    chunk_id TEXT PRIMARY KEY,
                    file_path TEXT NOT NULL,
                    content TEXT NOT NULL,
                    embedding BLOB NOT NULL,
                    created_at TEXT NOT NULL,
                    file_hash TEXT NOT NULL,
                    start_line INTEGER NOT NULL,
                    end_line INTEGER NOT NULL,
                    file_type TEXT NOT NULL
                )
            """)

            # Create indexes for performance
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_file_path ON embeddings(file_path)
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_file_hash ON embeddings(file_hash)
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_file_type ON embeddings(file_type)
            """)

            # Create file tracking table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS file_tracking (
                    file_path TEXT PRIMARY KEY,
                    file_hash TEXT NOT NULL,
                    last_indexed TEXT NOT NULL,
                    chunk_count INTEGER NOT NULL DEFAULT 0
                )
            """)

            conn.commit()

    @contextmanager
    def _get_connection(self) -> t.Iterator[sqlite3.Connection]:
        """Get a database connection with proper error handling."""
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            conn.row_factory = sqlite3.Row
            yield conn
        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"Database error: {e}")
            raise
        finally:
            if conn:
                conn.close()

    def index_file(
        self,
        file_path: Path,
        progress_callback: t.Callable[[IndexingProgress], None] | None = None,
    ) -> list[EmbeddingVector]:
        """Index a single file and return created embeddings.

        Args:
            file_path: Path to file to index
            progress_callback: Optional callback for progress updates

        Returns:
            List of embedding vectors created for the file

        Raises:
            OSError: If file cannot be read
            ValueError: If file is too large or has unsupported extension
        """
        # Validate file and check if reindexing is needed
        current_hash = self._prepare_file_for_indexing(file_path)
        if current_hash is None:  # File up to date
            return self._get_existing_embeddings(file_path)

        logger.info(f"Indexing file: {file_path}")

        try:
            # Process file content into chunks and metadata
            chunk_data = self._process_file_content(file_path, current_hash)
            if not chunk_data["chunks"]:
                logger.warning(f"No chunks generated for file: {file_path}")
                return []

            # Generate embeddings and create vector objects
            embeddings = self._create_embedding_vectors(
                file_path, current_hash, chunk_data, progress_callback
            )

            # Store results and update tracking
            self._store_embeddings(embeddings)
            self._update_file_tracking(file_path, current_hash, len(embeddings))

            logger.info(
                f"Successfully indexed {len(embeddings)} chunks from {file_path}"
            )
            return embeddings

        except Exception as e:
            logger.error(f"Failed to index file {file_path}: {e}")
            raise

    def _prepare_file_for_indexing(self, file_path: Path) -> str | None:
        """Prepare file for indexing and return hash if reindexing needed.

        Args:
            file_path: Path to prepare for indexing

        Returns:
            File hash if reindexing needed, None if file is up to date
        """
        self._validate_file_for_indexing(file_path)

        current_hash = self.embedding_service.get_file_hash(file_path)
        if not self._needs_reindexing(file_path, current_hash):
            logger.debug(f"File {file_path} is up to date, skipping")
            return None

        return current_hash

    def _process_file_content(
        self, file_path: Path, current_hash: str
    ) -> dict[str, t.Any]:
        """Process file content into chunks and metadata.

        Args:
            file_path: Path to process
            current_hash: File hash for chunk IDs

        Returns:
            Dictionary with chunks, texts, and metadata
        """
        content = file_path.read_text(encoding="utf-8")
        chunks = self.embedding_service.chunk_text(content)

        chunk_texts = []
        chunk_metadata = []

        for i, chunk_content in enumerate(chunks):
            chunk_id = f"{file_path.stem}_{current_hash[:8]}_{i}"
            start_line = i * (self.config.chunk_size // 50) + 1  # Rough estimate
            end_line = start_line + (len(chunk_content.split("\n")) - 1)

            chunk_texts.append(chunk_content)
            chunk_metadata.append(
                {
                    "chunk_id": chunk_id,
                    "start_line": start_line,
                    "end_line": end_line,
                }
            )

        return {
            "chunks": chunks,
            "chunk_texts": chunk_texts,
            "chunk_metadata": chunk_metadata,
        }

    def _create_embedding_vectors(
        self,
        file_path: Path,
        current_hash: str,
        chunk_data: dict[str, t.Any],
        progress_callback: t.Callable[[IndexingProgress], None] | None = None,
    ) -> list[EmbeddingVector]:
        """Create embedding vectors from chunk data.

        Args:
            file_path: Path being indexed
            current_hash: File hash
            chunk_data: Processed chunk data
            progress_callback: Optional progress callback

        Returns:
            List of embedding vectors
        """
        chunk_texts = chunk_data["chunk_texts"]
        chunk_metadata = chunk_data["chunk_metadata"]

        # Generate embeddings in batch for efficiency
        embedding_vectors = self.embedding_service.generate_embeddings_batch(
            chunk_texts
        )

        embeddings = []
        for i, (embedding_vector, metadata) in enumerate(
            zip(embedding_vectors, chunk_metadata)
        ):
            if not embedding_vector:  # Skip empty embeddings
                continue

            # Progress callback
            if progress_callback:
                progress = IndexingProgress(
                    current_file=file_path,
                    files_processed=0,
                    total_files=1,
                    chunks_created=i,
                    elapsed_time=0.0,
                )
                progress_callback(progress)

            embedding = EmbeddingVector(
                file_path=file_path,
                chunk_id=metadata["chunk_id"],
                content=chunk_texts[i],
                embedding=embedding_vector,
                created_at=datetime.now(),
                file_hash=current_hash,
                start_line=metadata["start_line"],
                end_line=metadata["end_line"],
                file_type=file_path.suffix,
            )
            embeddings.append(embedding)

        return embeddings

    def _validate_file_for_indexing(self, file_path: Path) -> None:
        """Validate that a file can be indexed.

        Args:
            file_path: Path to validate

        Raises:
            ValueError: If file cannot be indexed
            OSError: If file cannot be accessed
        """
        if not file_path.exists():
            raise OSError(f"File does not exist: {file_path}")

        if not file_path.is_file():
            raise ValueError(f"Path is not a file: {file_path}")

        # Check file size
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > self.config.max_file_size_mb:
            raise ValueError(
                f"File too large: {file_size_mb:.1f}MB > {self.config.max_file_size_mb}MB"
            )

        # Check file extension
        if (
            self.config.included_extensions
            and file_path.suffix not in self.config.included_extensions
        ):
            raise ValueError(f"File extension not included: {file_path.suffix}")

        # Check exclusion patterns
        file_str = str(file_path)
        for pattern in self.config.excluded_patterns:
            if self._matches_pattern(file_str, pattern):
                raise ValueError(f"File matches exclusion pattern: {pattern}")

    def _matches_pattern(self, file_path: str, pattern: str) -> bool:
        """Check if file path matches exclusion pattern."""
        import fnmatch

        return fnmatch.fnmatch(file_path, pattern)

    def _needs_reindexing(self, file_path: Path, current_hash: str) -> bool:
        """Check if file needs to be reindexed.

        Args:
            file_path: Path to check
            current_hash: Current file hash

        Returns:
            True if file needs reindexing
        """
        with self._get_connection() as conn:
            cursor = conn.execute(
                "SELECT file_hash FROM file_tracking WHERE file_path = ?",
                (str(file_path),),
            )
            row = cursor.fetchone()

            if row is None:
                return True  # File not indexed yet

            return row["file_hash"] != current_hash

    def _get_existing_embeddings(self, file_path: Path) -> list[EmbeddingVector]:
        """Get existing embeddings for a file.

        Args:
            file_path: Path to get embeddings for

        Returns:
            List of existing embeddings
        """
        embeddings = []

        with self._get_connection() as conn:
            cursor = conn.execute(
                """SELECT chunk_id, file_path, content, embedding, created_at,
                          file_hash, start_line, end_line, file_type
                   FROM embeddings WHERE file_path = ?""",
                (str(file_path),),
            )

            for row in cursor.fetchall():
                # Deserialize embedding
                embedding_data = json.loads(row["embedding"])

                embedding = EmbeddingVector(
                    file_path=Path(row["file_path"]),
                    chunk_id=row["chunk_id"],
                    content=row["content"],
                    embedding=embedding_data,
                    created_at=datetime.fromisoformat(row["created_at"]),
                    file_hash=row["file_hash"],
                    start_line=row["start_line"],
                    end_line=row["end_line"],
                    file_type=row["file_type"],
                )
                embeddings.append(embedding)

        return embeddings

    def _store_embeddings(self, embeddings: list[EmbeddingVector]) -> None:
        """Store embeddings in database.

        Args:
            embeddings: List of embeddings to store
        """
        if not embeddings:
            return

        with self._get_connection() as conn:
            # Remove existing embeddings for these files
            file_paths = {str(emb.file_path) for emb in embeddings}
            for file_path in file_paths:
                conn.execute("DELETE FROM embeddings WHERE file_path = ?", (file_path,))

            # Insert new embeddings
            for embedding in embeddings:
                conn.execute(
                    """
                    INSERT INTO embeddings
                    (chunk_id, file_path, content, embedding, created_at,
                     file_hash, start_line, end_line, file_type)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
                        embedding.chunk_id,
                        str(embedding.file_path),
                        embedding.content,
                        json.dumps(embedding.embedding),
                        embedding.created_at.isoformat(),
                        embedding.file_hash,
                        embedding.start_line,
                        embedding.end_line,
                        embedding.file_type,
                    ),
                )

            conn.commit()

    def _update_file_tracking(
        self, file_path: Path, file_hash: str, chunk_count: int
    ) -> None:
        """Update file tracking information.

        Args:
            file_path: Path of indexed file
            file_hash: Hash of file content
            chunk_count: Number of chunks created
        """
        with self._get_connection() as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO file_tracking
                (file_path, file_hash, last_indexed, chunk_count)
                VALUES (?, ?, ?, ?)
                """,
                (str(file_path), file_hash, datetime.now().isoformat(), chunk_count),
            )
            conn.commit()

    def search(self, query: SearchQuery) -> list[SearchResult]:
        """Perform semantic search and return results.

        Args:
            query: Search query with parameters

        Returns:
            List of search results sorted by similarity score
        """
        # Generate embedding for query
        query_embedding = self.embedding_service.generate_embedding(query.query)

        # Get all embeddings from database
        embeddings_data = self._get_all_embeddings(query.file_types)

        if not embeddings_data:
            return []

        # Calculate similarities
        similarities = self.embedding_service.calculate_similarities_batch(
            query_embedding, [data["embedding"] for data in embeddings_data]
        )

        # Create search results
        results = []
        for i, (data, similarity) in enumerate(zip(embeddings_data, similarities)):
            if similarity >= query.min_similarity:
                # Get context lines if requested
                context_lines = []
                if query.include_context:
                    context_lines = self._get_context_lines(
                        Path(data["file_path"]),
                        data["start_line"],
                        data["end_line"],
                        query.context_lines,
                    )

                result = SearchResult(
                    file_path=Path(data["file_path"]),
                    chunk_id=data["chunk_id"],
                    content=data["content"],
                    similarity_score=similarity,
                    start_line=data["start_line"],
                    end_line=data["end_line"],
                    file_type=data["file_type"],
                    context_lines=context_lines,
                )
                results.append(result)

        # Sort by similarity score (descending) and limit results
        results.sort(key=lambda x: x.similarity_score, reverse=True)
        return results[: query.max_results]

    def _get_all_embeddings(
        self, file_types: list[str] | None = None
    ) -> list[dict[str, t.Any]]:
        """Get all embeddings from database with optional file type filtering.

        Args:
            file_types: Optional list of file types to filter by

        Returns:
            List of embedding data dictionaries
        """
        embeddings_data = []

        with self._get_connection() as conn:
            if file_types:
                # Build parameterized query safely with proper placeholders
                placeholders = ",".join("?" * len(file_types))
                # Use static query template with placeholders - safe from injection
                query_template = (
                    "SELECT chunk_id, file_path, content, embedding, start_line, end_line, file_type "
                    "FROM embeddings WHERE file_type IN ({})"
                )
                query_sql = query_template.format(placeholders)  # nosec B608
                cursor = conn.execute(query_sql, file_types)
            else:
                cursor = conn.execute("""
                    SELECT chunk_id, file_path, content, embedding, start_line, end_line, file_type
                    FROM embeddings
                """)

            for row in cursor.fetchall():
                data = {
                    "chunk_id": row["chunk_id"],
                    "file_path": row["file_path"],
                    "content": row["content"],
                    "embedding": json.loads(row["embedding"]),
                    "start_line": row["start_line"],
                    "end_line": row["end_line"],
                    "file_type": row["file_type"],
                }
                embeddings_data.append(data)

        return embeddings_data

    def _get_context_lines(
        self, file_path: Path, start_line: int, end_line: int, context_count: int
    ) -> list[str]:
        """Get context lines around a text chunk.

        Args:
            file_path: Path to source file
            start_line: Starting line of chunk
            end_line: Ending line of chunk
            context_count: Number of context lines to include

        Returns:
            List of context lines
        """
        try:
            if not file_path.exists():
                return []

            lines = file_path.read_text(encoding="utf-8").splitlines()

            # Calculate context range
            context_start = max(0, start_line - context_count - 1)
            context_end = min(len(lines), end_line + context_count)

            return lines[context_start:context_end]

        except Exception as e:
            logger.warning(f"Failed to get context lines for {file_path}: {e}")
            return []

    def get_stats(self) -> IndexStats:
        """Get statistics about the vector store index.

        Returns:
            Index statistics
        """
        with self._get_connection() as conn:
            # Get total counts
            cursor = conn.execute("SELECT COUNT(*) as total_chunks FROM embeddings")
            total_chunks = cursor.fetchone()["total_chunks"]

            cursor = conn.execute(
                "SELECT COUNT(DISTINCT file_path) as total_files FROM embeddings"
            )
            total_files = cursor.fetchone()["total_files"]

            # Get file type distribution
            cursor = conn.execute("""
                SELECT file_type, COUNT(*) as count
                FROM embeddings
                GROUP BY file_type
            """)
            file_types = {row["file_type"]: row["count"] for row in cursor.fetchall()}

            # Get last update time
            cursor = conn.execute(
                "SELECT MAX(created_at) as last_updated FROM embeddings"
            )
            last_updated_str = cursor.fetchone()["last_updated"]
            last_updated = (
                datetime.fromisoformat(last_updated_str)
                if last_updated_str
                else datetime.now()
            )

            # Calculate average chunk size
            cursor = conn.execute(
                "SELECT AVG(LENGTH(content)) as avg_size FROM embeddings"
            )
            avg_chunk_size = cursor.fetchone()["avg_size"] or 0.0

            # Estimate index size (rough approximation)
            index_size_mb = (total_chunks * 384 * 4) / (
                1024 * 1024
            )  # Assuming 384-dim embeddings

            return IndexStats(
                total_files=total_files,
                total_chunks=total_chunks,
                index_size_mb=index_size_mb,
                last_updated=last_updated,
                file_types=file_types,
                embedding_model=self.config.embedding_model,
                avg_chunk_size=avg_chunk_size,
            )

    def remove_file(self, file_path: Path) -> bool:
        """Remove a file's embeddings from the index.

        Args:
            file_path: Path of file to remove

        Returns:
            True if file was removed, False if not found
        """
        with self._get_connection() as conn:
            # Check if file exists in index
            cursor = conn.execute(
                "SELECT COUNT(*) as count FROM embeddings WHERE file_path = ?",
                (str(file_path),),
            )
            count = cursor.fetchone()["count"]

            if count == 0:
                return False

            # Remove embeddings
            conn.execute(
                "DELETE FROM embeddings WHERE file_path = ?", (str(file_path),)
            )

            # Remove from file tracking
            conn.execute(
                "DELETE FROM file_tracking WHERE file_path = ?", (str(file_path),)
            )

            conn.commit()
            logger.info(f"Removed {count} embeddings for file: {file_path}")
            return True

    def clear_index(self) -> None:
        """Clear all embeddings from the index."""
        with self._get_connection() as conn:
            conn.execute("DELETE FROM embeddings")
            conn.execute("DELETE FROM file_tracking")
            conn.commit()
            logger.info("Cleared all embeddings from index")

    def close(self) -> None:
        """Clean up resources."""
        if self._temp_db:
            self._temp_db.close()
            if self.db_path.exists():
                self.db_path.unlink()
            logger.debug("Cleaned up temporary database")

    def __enter__(self) -> "VectorStore":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type: t.Any, exc_val: t.Any, exc_tb: t.Any) -> None:
        """Context manager exit."""
        self.close()
```
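For orientation, here is a minimal usage sketch of the new `VectorStore` API, inferred only from the signatures visible in the diff above. The `SemanticConfig` and `SearchQuery` constructor calls are assumptions (their full field lists live in `crackerjack/models/semantic_models.py`, which is not shown here), so treat this as illustrative rather than the released API.

```python
from pathlib import Path

from crackerjack.models.semantic_models import SearchQuery, SemanticConfig
from crackerjack.services.vector_store import VectorStore

# Assumption: SemanticConfig can be constructed with defaults; the diff only
# shows that it carries fields such as chunk_size, max_file_size_mb,
# included_extensions, excluded_patterns and embedding_model.
config = SemanticConfig()

# VectorStore is a context manager; without an explicit db_path it creates a
# temporary SQLite file and removes it again in close()/__exit__.
with VectorStore(config, db_path=Path("vectors.db")) as store:
    # Index a source file (skipped automatically if its hash is unchanged).
    store.index_file(Path("crackerjack/services/embeddings.py"))

    # Assumption: SearchQuery accepts these keyword fields; search() reads
    # query, file_types, min_similarity, include_context, context_lines and
    # max_results from the query object.
    results = store.search(SearchQuery(query="cosine similarity", max_results=5))
    for result in results:
        print(result.file_path, result.similarity_score, result.start_line)
```

Note that `search()` loads every stored embedding out of SQLite and scores it in Python, so query time grows linearly with the number of indexed chunks; the JSON-serialized vectors and the 384-dimension size estimate in `get_stats()` point to a small local index rather than a dedicated approximate-nearest-neighbour store.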