mcp-sqlite-memory-bank 1.2.4-py3-none-any.whl → 1.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,398 @@
+ """
+ Semantic search functionality for SQLite Memory Bank using sentence-transformers.
+
+ This module provides vector embeddings and semantic similarity search capabilities
+ to enhance the memory bank's knowledge discovery features.
+
+ Author: Robert Meisner
+ """
+
+ import json
+ import logging
+ from typing import List, Dict, Any, Optional, Tuple
+ import numpy as np
+
+ # Optional imports with graceful fallback
+ try:
+     from sentence_transformers import SentenceTransformer, util
+
+     SENTENCE_TRANSFORMERS_AVAILABLE = True
+ except ImportError:
+     SENTENCE_TRANSFORMERS_AVAILABLE = False
+     SentenceTransformer = None
+     util = None
+     logging.warning("sentence-transformers not available. Install with: pip install sentence-transformers")
+
+ try:
+     import torch
+
+     TORCH_AVAILABLE = True
+ except ImportError:
+     TORCH_AVAILABLE = False
+     torch = None
+     logging.warning("torch not available. Install with: pip install torch")
+
+ from .types import ValidationError, DatabaseError
+
+
+ class SemanticSearchEngine:
+     """
+     Handles semantic search using sentence-transformers.
+
+     Features:
+     - Vector embeddings for text content
+     - Semantic similarity search
+     - Hybrid search combining semantic + keyword matching
+     - Caching for performance
+     """
+
+     def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
+         """Initialize the semantic search engine."""
+         self.model_name = model_name
+         self._model = None
+         self._embedding_cache = {}
+
+         if not SENTENCE_TRANSFORMERS_AVAILABLE:
+             raise ValueError(
+                 "sentence-transformers is not available. Please install with: pip install sentence-transformers"
+             )
+
+     @property
+     def model(self):
+         """Lazy load the sentence transformer model."""
+         if self._model is None:
+             if not SENTENCE_TRANSFORMERS_AVAILABLE or SentenceTransformer is None:
+                 raise ValueError("sentence-transformers is not available")
+             try:
+                 self._model = SentenceTransformer(self.model_name)
+                 logging.info(f"Loaded semantic search model: {self.model_name}")
+             except Exception as e:
+                 raise DatabaseError(f"Failed to load semantic search model {self.model_name}: {e}")
+         return self._model
+
+     def get_embedding_dimensions(self) -> Optional[int]:
+         """Get the embedding dimension size for the current model."""
+         return self.model.get_sentence_embedding_dimension()
+
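Since the model property is lazy, constructing the engine is cheap; the transformer is only loaded (and, on first run, downloaded) when embeddings are first requested. A minimal sketch, assuming sentence-transformers is installed:

    engine = SemanticSearchEngine()           # no model loaded yet
    dims = engine.get_embedding_dimensions()  # first access loads the model
    print(dims)                               # 384 for all-MiniLM-L6-v2
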
+     def generate_embedding(self, text: str) -> List[float]:
+         """
+         Generate embedding for a single text.
+
+         Args:
+             text: Text to embed
+
+         Returns:
+             List of floats representing the embedding vector
+         """
+         if not text or not text.strip():
+             raise ValidationError("Text cannot be empty for embedding generation")
+
+         # Check cache first
+         cache_key = f"{self.model_name}:{hash(text)}"
+         if cache_key in self._embedding_cache:
+             return self._embedding_cache[cache_key]
+
+         try:
+             # Generate embedding
+             embedding = self.model.encode([text], convert_to_tensor=False)[0]
+             embedding_list = embedding.tolist()
+
+             # Cache for future use
+             self._embedding_cache[cache_key] = embedding_list
+
+             return embedding_list
+         except Exception as e:
+             raise DatabaseError(f"Failed to generate embedding: {e}")
+
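Embeddings are memoized per (model, hash(text)) pair, so repeated calls for identical text hit the in-memory cache. A sketch of the expected behavior (note that Python's hash() for strings is salted per process, so the cache is only meaningful within a single run):

    vec1 = engine.generate_embedding("hello world")   # computed, then cached
    vec2 = engine.generate_embedding("hello world")   # returned from cache
    assert vec1 == vec2
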
+     def generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
+         """
+         Generate embeddings for multiple texts efficiently.
+
+         Args:
+             texts: List of texts to embed
+
+         Returns:
+             List of embedding vectors
+         """
+         if not texts:
+             return []
+
+         # Filter out empty texts
+         valid_texts = [text for text in texts if text and text.strip()]
+         if not valid_texts:
+             raise ValidationError("No valid texts provided for embedding generation")
+
+         try:
+             embeddings = self.model.encode(
+                 valid_texts, convert_to_tensor=False, show_progress_bar=len(valid_texts) > 10
+             )
+             return [emb.tolist() for emb in embeddings]
+         except Exception as e:
+             raise DatabaseError(f"Failed to generate batch embeddings: {e}")
+
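For many texts, the batch method makes a single encode() call (showing a progress bar past 10 items). Note that blank entries are silently dropped, so the output aligns with the filtered inputs rather than the original list:

    vectors = engine.generate_embeddings_batch(["first note", "", "second note"])
    assert len(vectors) == 2  # the empty string was filtered out
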
+     def calculate_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
+         """
+         Calculate cosine similarity between two embeddings.
+
+         Args:
+             embedding1: First embedding vector
+             embedding2: Second embedding vector
+
+         Returns:
+             Similarity score between 0 and 1
+         """
+         if not TORCH_AVAILABLE or torch is None:
+             # Fallback to numpy implementation
+             norm1 = np.linalg.norm(embedding1)
+             norm2 = np.linalg.norm(embedding2)
+             if norm1 == 0 or norm2 == 0:
+                 return 0.0
+             similarity = float(np.dot(embedding1, embedding2) / (norm1 * norm2))
+             return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
+
+         if not SENTENCE_TRANSFORMERS_AVAILABLE or util is None:
+             # Fallback to numpy implementation
+             norm1 = np.linalg.norm(embedding1)
+             norm2 = np.linalg.norm(embedding2)
+             if norm1 == 0 or norm2 == 0:
+                 return 0.0
+             similarity = float(np.dot(embedding1, embedding2) / (norm1 * norm2))
+             return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
+
+         try:
+             # Convert to tensors for efficient computation
+             emb1_tensor = torch.tensor(embedding1)
+             emb2_tensor = torch.tensor(embedding2)
+
+             # Calculate cosine similarity
+             similarity = util.cos_sim(emb1_tensor, emb2_tensor).item()
+             return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
+         except Exception as e:
+             raise DatabaseError(f"Failed to calculate similarity: {e}")
+
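Both branches compute cosine similarity and clamp it to [0, 1], so vectors pointing in opposite directions score 0.0 rather than -1.0. A standalone numpy check of the fallback arithmetic:

    import numpy as np

    a, b = np.array([1.0, 0.0]), np.array([1.0, 1.0])
    cos = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
    print(round(max(0.0, min(1.0, cos)), 3))  # 0.707, i.e. cos(45°)
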
+     def find_similar_embeddings(
+         self,
+         query_embedding: List[float],
+         candidate_embeddings: List[List[float]],
+         similarity_threshold: float = 0.5,
+         top_k: int = 10,
+     ) -> List[Tuple[int, float]]:
+         """
+         Find the most similar embeddings to a query embedding.
+
+         Args:
+             query_embedding: Query vector
+             candidate_embeddings: List of candidate vectors
+             similarity_threshold: Minimum similarity score
+             top_k: Maximum number of results
+
+         Returns:
+             List of (index, similarity_score) tuples, sorted by similarity descending
+         """
+         if not candidate_embeddings:
+             return []
+
+         # Use efficient torch/sentence-transformers if available
+         if TORCH_AVAILABLE and torch is not None and SENTENCE_TRANSFORMERS_AVAILABLE and util is not None:
+             try:
+                 # Convert to tensors
+                 query_tensor = torch.tensor(query_embedding).unsqueeze(0)
+                 candidate_tensor = torch.tensor(candidate_embeddings)
+
+                 # Calculate similarities
+                 similarities = util.cos_sim(query_tensor, candidate_tensor)[0]
+
+                 # Find matches above threshold
+                 results = []
+                 for idx, sim in enumerate(similarities):
+                     sim_score = sim.item()
+                     if sim_score >= similarity_threshold:
+                         results.append((idx, sim_score))
+
+                 # Sort by similarity descending and limit to top_k
+                 results.sort(key=lambda x: x[1], reverse=True)
+                 return results[:top_k]
+             except Exception as e:
+                 logging.warning(f"Torch similarity search failed, using numpy fallback: {e}")
+
+         # Fallback to numpy implementation
+         results = []
+         for idx, candidate_emb in enumerate(candidate_embeddings):
+             similarity = self.calculate_similarity(query_embedding, candidate_emb)
+             if similarity >= similarity_threshold:
+                 results.append((idx, similarity))
+
+         # Sort by similarity descending and limit to top_k
+         results.sort(key=lambda x: x[1], reverse=True)
+         return results[:top_k]
+
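The return value is (index, score) pairs into candidate_embeddings, filtered by the threshold and sorted descending. A toy sketch with 2-D vectors (real embeddings have the model's dimensionality):

    hits = engine.find_similar_embeddings(
        query_embedding=[1.0, 0.0],
        candidate_embeddings=[[1.0, 0.1], [0.0, 1.0], [0.9, 0.2]],
        similarity_threshold=0.5,
        top_k=2,
    )
    # e.g. [(0, 0.995), (2, 0.976)]; the orthogonal candidate at index 1 is filtered out
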
+     def semantic_search(
+         self,
+         query: str,
+         content_data: List[Dict[str, Any]],
+         embedding_column: str = "embedding",
+         content_columns: Optional[List[str]] = None,
+         similarity_threshold: float = 0.5,
+         top_k: int = 10,
+     ) -> List[Dict[str, Any]]:
+         """
+         Perform semantic search on content data.
+
+         Args:
+             query: Natural language search query
+             content_data: List of rows with embeddings and content
+             embedding_column: Column name containing embeddings
+             content_columns: Columns to search in (for highlighting)
+             similarity_threshold: Minimum similarity score
+             top_k: Maximum number of results
+
+         Returns:
+             List of search results with similarity scores
+         """
+         if not query.strip():
+             raise ValidationError("Search query cannot be empty")
+
+         if not content_data:
+             return []
+
+         try:
+             # Generate query embedding
+             query_embedding = self.generate_embedding(query)
+
+             # Extract embeddings from content data
+             candidate_embeddings = []
+             valid_indices = []
+
+             for idx, row in enumerate(content_data):
+                 if embedding_column in row and row[embedding_column]:
+                     try:
+                         # Parse embedding from JSON string or use directly if already a list
+                         embedding = row[embedding_column]
+                         if isinstance(embedding, str):
+                             embedding = json.loads(embedding)
+
+                         candidate_embeddings.append(embedding)
+                         valid_indices.append(idx)
+                     except (json.JSONDecodeError, TypeError) as e:
+                         logging.warning(f"Invalid embedding data in row {idx}: {e}")
+                         continue
+
+             if not candidate_embeddings:
+                 return []
+
+             # Find similar embeddings
+             similar_indices = self.find_similar_embeddings(
+                 query_embedding, candidate_embeddings, similarity_threshold, top_k
+             )
+
+             # Build results
+             results = []
+             for candidate_idx, similarity_score in similar_indices:
+                 original_idx = valid_indices[candidate_idx]
+                 row = content_data[original_idx].copy()
+
+                 # Add similarity score
+                 row["similarity_score"] = round(similarity_score, 3)
+
+                 # Add matched content highlighting if specified
+                 if content_columns:
+                     matched_content = []
+                     for col in content_columns:
+                         if col in row and row[col] and query.lower() in str(row[col]).lower():
+                             matched_content.append(f"{col}: {row[col]}")
+                     if matched_content:
+                         row["matched_content"] = matched_content
+
+                 results.append(row)
+
+             return results
+
+         except Exception as e:
+             raise DatabaseError(f"Semantic search failed: {e}")
+
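Rows can carry embeddings either as Python lists or as JSON strings, which is how they would come back from a SQLite TEXT column. A hedged end-to-end sketch with hypothetical rows, reusing the module's json import:

    rows = [
        {"id": 1, "content": "How to reset a password",
         "embedding": engine.generate_embedding("How to reset a password")},
        {"id": 2, "content": "Quarterly sales report",
         "embedding": json.dumps(engine.generate_embedding("Quarterly sales report"))},
    ]
    results = engine.semantic_search(
        "forgot my login credentials", rows,
        content_columns=["content"], similarity_threshold=0.3,
    )
    # each hit is a row copy plus "similarity_score" (and "matched_content" on substring matches)
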
+     def hybrid_search(
+         self,
+         query: str,
+         content_data: List[Dict[str, Any]],
+         text_columns: Optional[List[str]] = None,
+         embedding_column: str = "embedding",
+         semantic_weight: float = 0.7,
+         text_weight: float = 0.3,
+         top_k: int = 10,
+     ) -> List[Dict[str, Any]]:
+         """
+         Combine semantic search with keyword matching for better results.
+
+         Args:
+             query: Search query
+             content_data: Content to search
+             text_columns: Columns to perform text search on
+             embedding_column: Column containing embeddings
+             semantic_weight: Weight for semantic similarity (0-1)
+             text_weight: Weight for text matching (0-1)
+             top_k: Maximum results
+
+         Returns:
+             Ranked search results
+         """
+         if not content_data:
+             return []
+
+         # Normalize weights
+         total_weight = semantic_weight + text_weight
+         if total_weight > 0:
+             semantic_weight /= total_weight
+             text_weight /= total_weight
+
+         # Get semantic search results (over-fetch for reranking)
+         semantic_results = self.semantic_search(
+             query, content_data, embedding_column, similarity_threshold=0.3, top_k=top_k * 2
+         )
+
+         # Add text matching scores
+         query_lower = query.lower()
+         for result in semantic_results:
+             text_score = 0.0
+
+             if text_columns:
+                 for col in text_columns:
+                     if col in result and result[col]:
+                         content = str(result[col]).lower()
+                         if query_lower in content:
+                             # Simple frequency-based text scoring
+                             text_score += content.count(query_lower) / len(content.split())
+
+             # Combine scores
+             semantic_score = result.get("similarity_score", 0.0)
+             combined_score = (semantic_score * semantic_weight) + (text_score * text_weight)
+             result["combined_score"] = round(combined_score, 3)
+             result["text_score"] = round(text_score, 3)
+
+         # Sort by combined score
+         semantic_results.sort(key=lambda x: x.get("combined_score", 0), reverse=True)
+
+         return semantic_results[:top_k]
+
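Weights are normalized to sum to 1 before blending, so (0.7, 0.3) and (7, 3) rank identically. A worked combination with hypothetical scores:

    # semantic_score = 0.8, text_score = 0.2, weights 0.7 / 0.3
    combined = (0.8 * 0.7) + (0.2 * 0.3)  # 0.56 + 0.06 = 0.62
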
+     def clear_cache(self):
+         """Clear the embedding cache."""
+         self._embedding_cache.clear()
+         logging.info("Semantic search cache cleared")
+
+
+ # Global instance
+ _semantic_engine: Optional[SemanticSearchEngine] = None
+
+
+ def get_semantic_engine(model_name: str = "all-MiniLM-L6-v2") -> SemanticSearchEngine:
+     """Get or create the global semantic search engine."""
+     global _semantic_engine
+
+     if _semantic_engine is None or _semantic_engine.model_name != model_name:
+         _semantic_engine = SemanticSearchEngine(model_name)
+
+     return _semantic_engine
+
+
+ def is_semantic_search_available() -> bool:
+     """Check if semantic search is available."""
+     return SENTENCE_TRANSFORMERS_AVAILABLE
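
Callers are expected to probe availability before requesting the singleton, since the constructor raises when sentence-transformers is missing. A usage sketch (rows is a hypothetical list of dicts as above):

    if is_semantic_search_available():
        engine = get_semantic_engine()  # cached and reused while model_name matches
        results = engine.semantic_search("database migration notes", rows)
    else:
        results = []  # fall back to keyword-only search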