mcp-sqlite-memory-bank 1.2.3__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,386 @@
1
+ """
2
+ Semantic search functionality for SQLite Memory Bank using sentence-transformers.
3
+
4
+ This module provides vector embeddings and semantic similarity search capabilities
5
+ to enhance the memory bank's knowledge discovery features.
6
+
7
+ Author: Robert Meisner
8
+ """
9
+
10
+ import json
11
+ import logging
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ import numpy as np
14
+ from functools import lru_cache
15
+
16
+ # Optional imports with graceful fallback
17
+ try:
18
+ from sentence_transformers import SentenceTransformer, util
19
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
20
+ except ImportError:
21
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
22
+ SentenceTransformer = None
23
+ util = None
24
+ logging.warning("sentence-transformers not available. Install with: pip install sentence-transformers")
25
+
26
+ try:
27
+ import torch
28
+ TORCH_AVAILABLE = True
29
+ except ImportError:
30
+ TORCH_AVAILABLE = False
31
+ torch = None
32
+ logging.warning("torch not available. Install with: pip install torch")
33
+
34
+ from .types import ToolResponse, ValidationError, DatabaseError
35
+
36
+
37
class SemanticSearchEngine:
    """
    Handles semantic search using sentence-transformers.

    Features:
    - Vector embeddings for text content
    - Semantic similarity search
    - Hybrid search combining semantic + keyword matching
    - Caching for performance

    The transformer model is loaded lazily the first time ``model`` is read.
    Single-text embeddings are memoised in an in-memory dict that only
    shrinks when ``clear_cache`` is called.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the semantic search engine.

        Args:
            model_name: Name handed to ``SentenceTransformer`` when the model
                is first needed.

        Raises:
            ValueError: If sentence-transformers could not be imported.
        """
        self.model_name = model_name
        self._model = None
        self._embedding_cache = {}

        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            raise ValueError("sentence-transformers is not available. Please install with: pip install sentence-transformers")

    @property
    def model(self):
        """
        Lazy load the sentence transformer model.

        Returns:
            The ``SentenceTransformer`` instance, created on first access and
            reused afterwards.

        Raises:
            ValueError: If sentence-transformers is not importable.
            DatabaseError: If constructing the model fails for any reason.
        """
        if self._model is None:
            if not SENTENCE_TRANSFORMERS_AVAILABLE or SentenceTransformer is None:
                raise ValueError("sentence-transformers is not available")
            try:
                self._model = SentenceTransformer(self.model_name)
            except Exception as e:
                raise DatabaseError(f"Failed to load semantic search model {self.model_name}: {e}") from e
            logging.info(f"Loaded semantic search model: {self.model_name}")
        return self._model

    def get_embedding_dimensions(self) -> Optional[int]:
        """Get the embedding dimension size for the current model (loads the model if needed)."""
        return self.model.get_sentence_embedding_dimension()

    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate embedding for a single text.

        Args:
            text: Text to embed

        Returns:
            List of floats representing the embedding vector. The list is a
            fresh copy, so callers may mutate it without affecting the cache.

        Raises:
            ValidationError: If ``text`` is empty or whitespace only.
            DatabaseError: If the model cannot be loaded or encoding fails.
        """
        if not text or not text.strip():
            raise ValidationError("Text cannot be empty for embedding generation")

        # Key on the text itself rather than hash(text): two different texts
        # can share a hash value, which would hand back the wrong embedding.
        cache_key = (self.model_name, text)
        cached = self._embedding_cache.get(cache_key)
        if cached is not None:
            return list(cached)

        try:
            embedding = self.model.encode([text], convert_to_tensor=False)[0]
            embedding_list = embedding.tolist()
        except Exception as e:
            raise DatabaseError(f"Failed to generate embedding: {e}") from e

        self._embedding_cache[cache_key] = embedding_list
        return list(embedding_list)

    def generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for multiple texts efficiently.

        Empty and whitespace-only entries are dropped before encoding, so the
        returned list can be shorter than ``texts`` and its positions then no
        longer line up with the input positions. Results are not cached.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors, one per non-empty input text

        Raises:
            ValidationError: If ``texts`` is non-empty but holds no usable text.
            DatabaseError: If the model cannot be loaded or encoding fails.
        """
        if not texts:
            return []

        valid_texts = [text for text in texts if text and text.strip()]
        if not valid_texts:
            raise ValidationError("No valid texts provided for embedding generation")

        try:
            # The progress bar is only switched on for batches larger than 10.
            embeddings = self.model.encode(
                valid_texts,
                convert_to_tensor=False,
                show_progress_bar=len(valid_texts) > 10,
            )
            return [emb.tolist() for emb in embeddings]
        except Exception as e:
            raise DatabaseError(f"Failed to generate batch embeddings: {e}") from e

    @staticmethod
    def _numpy_cosine(embedding1, embedding2) -> float:
        """Cosine similarity via numpy, clamped to [0, 1]; 0.0 when either vector has zero norm."""
        norm1 = np.linalg.norm(embedding1)
        norm2 = np.linalg.norm(embedding2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        similarity = float(np.dot(embedding1, embedding2) / (norm1 * norm2))
        return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]

    def calculate_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
        """
        Calculate cosine similarity between two embeddings.

        Uses torch plus ``sentence_transformers.util`` when both are
        importable, otherwise a numpy implementation.

        Args:
            embedding1: First embedding vector
            embedding2: Second embedding vector

        Returns:
            Similarity score between 0 and 1 (negative cosines are reported as 0)

        Raises:
            DatabaseError: If the torch-based computation fails.
        """
        torch_ready = TORCH_AVAILABLE and torch is not None
        st_ready = SENTENCE_TRANSFORMERS_AVAILABLE and util is not None
        if not (torch_ready and st_ready):
            return self._numpy_cosine(embedding1, embedding2)

        try:
            emb1_tensor = torch.tensor(embedding1)
            emb2_tensor = torch.tensor(embedding2)
            similarity = util.cos_sim(emb1_tensor, emb2_tensor).item()
            return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
        except Exception as e:
            raise DatabaseError(f"Failed to calculate similarity: {e}") from e

    def find_similar_embeddings(self, query_embedding: List[float],
                                candidate_embeddings: List[List[float]],
                                similarity_threshold: float = 0.5,
                                top_k: int = 10) -> List[Tuple[int, float]]:
        """
        Find the most similar embeddings to a query embedding.

        Args:
            query_embedding: Query vector
            candidate_embeddings: List of candidate vectors
            similarity_threshold: Minimum similarity score
            top_k: Maximum number of results

        Returns:
            List of (index, similarity_score) tuples, sorted by similarity
            descending. ``index`` is the position in ``candidate_embeddings``.
        """
        if not candidate_embeddings:
            return []

        # Use efficient torch/sentence-transformers if available
        if TORCH_AVAILABLE and torch is not None and SENTENCE_TRANSFORMERS_AVAILABLE and util is not None:
            try:
                query_tensor = torch.tensor(query_embedding).unsqueeze(0)
                candidate_tensor = torch.tensor(candidate_embeddings)

                # One vectorised call scores every candidate against the query.
                raw_scores = util.cos_sim(query_tensor, candidate_tensor)[0].tolist()

                results = []
                for idx, raw_score in enumerate(raw_scores):
                    if raw_score >= similarity_threshold:
                        # Report the score clamped to [0, 1] so this path
                        # agrees with calculate_similarity and the fallback.
                        results.append((idx, max(0.0, min(1.0, raw_score))))

                results.sort(key=lambda x: x[1], reverse=True)
                return results[:top_k]
            except Exception as e:
                # Best effort: fall through to the per-candidate loop below.
                logging.warning(f"Torch similarity search failed, using numpy fallback: {e}")

        # Fallback: score candidates one at a time
        results = []
        for idx, candidate_emb in enumerate(candidate_embeddings):
            similarity = self.calculate_similarity(query_embedding, candidate_emb)
            if similarity >= similarity_threshold:
                results.append((idx, similarity))

        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k]

    @staticmethod
    def _collect_candidate_embeddings(content_data: List[Dict[str, Any]],
                                      embedding_column: str) -> Tuple[List[Any], List[int]]:
        """Gather usable embeddings from rows, returning them with their original row indices."""
        candidate_embeddings = []
        valid_indices = []

        for idx, row in enumerate(content_data):
            if embedding_column not in row or not row[embedding_column]:
                continue
            try:
                # Parse embedding from JSON string or use directly if already a list
                embedding = row[embedding_column]
                if isinstance(embedding, str):
                    embedding = json.loads(embedding)
            except (json.JSONDecodeError, TypeError) as e:
                logging.warning(f"Invalid embedding data in row {idx}: {e}")
                continue

            candidate_embeddings.append(embedding)
            valid_indices.append(idx)

        return candidate_embeddings, valid_indices

    def semantic_search(self, query: str, content_data: List[Dict[str, Any]],
                        embedding_column: str = "embedding",
                        content_columns: Optional[List[str]] = None,
                        similarity_threshold: float = 0.5,
                        top_k: int = 10) -> List[Dict[str, Any]]:
        """
        Perform semantic search on content data.

        Args:
            query: Natural language search query
            content_data: List of rows with embeddings and content
            embedding_column: Column name containing embeddings (a list, or a
                JSON string that decodes to one)
            content_columns: Columns to search in (for highlighting)
            similarity_threshold: Minimum similarity score
            top_k: Maximum number of results

        Returns:
            List of search results with similarity scores. Each result is a
            shallow copy of the matching row with ``similarity_score`` added
            (rounded to 3 decimals) and, when the literal query text occurs in
            one of ``content_columns``, a ``matched_content`` list.

        Raises:
            ValidationError: If ``query`` is empty or whitespace only.
            DatabaseError: If anything fails while embedding or ranking.
        """
        # Guard against None as well as blank strings, as generate_embedding does.
        if not query or not query.strip():
            raise ValidationError("Search query cannot be empty")

        if not content_data:
            return []

        try:
            query_embedding = self.generate_embedding(query)

            candidate_embeddings, valid_indices = self._collect_candidate_embeddings(
                content_data, embedding_column
            )
            if not candidate_embeddings:
                return []

            similar_indices = self.find_similar_embeddings(
                query_embedding, candidate_embeddings,
                similarity_threshold, top_k
            )

            query_lower = query.lower()
            results = []
            for candidate_idx, similarity_score in similar_indices:
                # Map the position among usable embeddings back to the input row.
                original_idx = valid_indices[candidate_idx]
                row = content_data[original_idx].copy()

                row['similarity_score'] = round(similarity_score, 3)

                # Add matched content highlighting if specified
                if content_columns:
                    matched_content = [
                        f"{col}: {row[col]}"
                        for col in content_columns
                        if col in row and row[col] and query_lower in str(row[col]).lower()
                    ]
                    if matched_content:
                        row['matched_content'] = matched_content

                results.append(row)

            return results

        except Exception as e:
            raise DatabaseError(f"Semantic search failed: {e}") from e

    def hybrid_search(self, query: str, content_data: List[Dict[str, Any]],
                      text_columns: Optional[List[str]] = None,
                      embedding_column: str = "embedding",
                      semantic_weight: float = 0.7,
                      text_weight: float = 0.3,
                      top_k: int = 10) -> List[Dict[str, Any]]:
        """
        Combine semantic search with keyword matching for better results.

        Candidates come from ``semantic_search`` with a threshold of 0.3 and
        ``top_k * 2`` results, and are then re-ranked by a weighted sum of the
        semantic score and a text score.

        Args:
            query: Search query
            content_data: Content to search
            text_columns: Columns to perform text search on
            embedding_column: Column containing embeddings
            semantic_weight: Weight for semantic similarity (0-1)
            text_weight: Weight for text matching (0-1)
            top_k: Maximum results

        Returns:
            Ranked search results; each row gains ``combined_score`` and
            ``text_score`` (both rounded to 3 decimals).

        Raises:
            ValidationError: If ``query`` is empty (raised by ``semantic_search``).
            DatabaseError: If the underlying semantic search fails.
        """
        if not content_data:
            return []

        # Normalize weights so they sum to 1 (left untouched if the sum is not positive)
        total_weight = semantic_weight + text_weight
        if total_weight > 0:
            semantic_weight /= total_weight
            text_weight /= total_weight

        semantic_results = self.semantic_search(
            query, content_data, embedding_column,
            similarity_threshold=0.3, top_k=top_k * 2  # Get more for reranking
        )

        query_lower = query.lower()
        for result in semantic_results:
            text_score = 0.0

            if text_columns:
                for col in text_columns:
                    if col in result and result[col]:
                        content = str(result[col]).lower()
                        if query_lower in content:
                            # Occurrences of the whole query string divided by
                            # the whitespace-separated word count of the value.
                            text_score += content.count(query_lower) / len(content.split())

            semantic_score = result.get('similarity_score', 0.0)
            combined_score = (semantic_score * semantic_weight) + (text_score * text_weight)
            result['combined_score'] = round(combined_score, 3)
            result['text_score'] = round(text_score, 3)

        semantic_results.sort(key=lambda x: x.get('combined_score', 0), reverse=True)

        return semantic_results[:top_k]

    def clear_cache(self):
        """Clear the embedding cache."""
        self._embedding_cache.clear()
        logging.info("Semantic search cache cleared")
368
+
369
+
370
# Module-wide shared engine; stays None until get_semantic_engine() builds one.
_semantic_engine: Optional[SemanticSearchEngine] = None


def get_semantic_engine(model_name: str = "all-MiniLM-L6-v2") -> SemanticSearchEngine:
    """
    Return the shared semantic search engine, creating it when necessary.

    A new engine replaces the stored one whenever none exists yet or the
    stored engine was built for a different ``model_name``.

    Args:
        model_name: Model the returned engine must be configured with.

    Returns:
        The module-level ``SemanticSearchEngine`` instance.
    """
    global _semantic_engine

    current = _semantic_engine
    if current is not None and current.model_name == model_name:
        return current

    _semantic_engine = SemanticSearchEngine(model_name)
    return _semantic_engine
382
+
383
+
384
def is_semantic_search_available() -> bool:
    """
    Report whether semantic search can be used.

    Returns:
        The module flag recorded when the sentence-transformers import was
        attempted: True if it succeeded, False if it raised ImportError.
    """
    available = SENTENCE_TRANSFORMERS_AVAILABLE
    return available