mcp-sqlite-memory-bank 1.3.0-py3-none-any.whl → 1.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,11 +11,11 @@ import json
 import logging
 from typing import List, Dict, Any, Optional, Tuple
 import numpy as np
-from functools import lru_cache
 
 # Optional imports with graceful fallback
 try:
     from sentence_transformers import SentenceTransformer, util
+
     SENTENCE_TRANSFORMERS_AVAILABLE = True
 except ImportError:
     SENTENCE_TRANSFORMERS_AVAILABLE = False
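
Note on the pattern above: the imports are wrapped in try/except so the module loads even without its optional dependencies, and module-level flags record what is available. A minimal sketch of how downstream code can consume such a flag (the helper name is illustrative, not part of the package):

    # Hypothetical caller-side use of the availability flag.
    from typing import List, Optional

    def embed_or_none(text: str) -> Optional[List[float]]:
        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            return None  # caller falls back to keyword-only search
        model = SentenceTransformer("all-MiniLM-L6-v2")
        return model.encode([text])[0].tolist()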
@@ -25,36 +25,39 @@ except ImportError:
 
 try:
     import torch
+
     TORCH_AVAILABLE = True
 except ImportError:
     TORCH_AVAILABLE = False
     torch = None
     logging.warning("torch not available. Install with: pip install torch")
 
-from .types import ToolResponse, ValidationError, DatabaseError
+from .types import ValidationError, DatabaseError
 
 
 class SemanticSearchEngine:
     """
     Handles semantic search using sentence-transformers.
-
+
     Features:
     - Vector embeddings for text content
     - Semantic similarity search
     - Hybrid search combining semantic + keyword matching
     - Caching for performance
     """
-
+
     def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
         """Initialize the semantic search engine."""
         self.model_name = model_name
         self._model = None
         self._embedding_cache = {}
-
+
         if not SENTENCE_TRANSFORMERS_AVAILABLE:
-            raise ValueError("sentence-transformers is not available. Please install with: pip install sentence-transformers")
-
-    @property
+            raise ValueError(
+                "sentence-transformers is not available. Please install with: pip install sentence-transformers"
+            )
+
+    @property
     def model(self):
         """Lazy load the sentence transformer model."""
         if self._model is None:
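
The property above defers model loading until first access, so constructing the engine is cheap and the (potentially large) model download happens only when an embedding is actually requested. A brief usage sketch, assuming sentence-transformers is installed:

    engine = SemanticSearchEngine("all-MiniLM-L6-v2")  # no model loaded yet
    dims = engine.get_embedding_dimensions()           # first .model access loads it
    print(dims)  # 384 for all-MiniLM-L6-v2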
@@ -66,73 +69,75 @@ class SemanticSearchEngine:
             except Exception as e:
                 raise DatabaseError(f"Failed to load semantic search model {self.model_name}: {e}")
         return self._model
-
+
     def get_embedding_dimensions(self) -> Optional[int]:
         """Get the embedding dimension size for the current model."""
         return self.model.get_sentence_embedding_dimension()
-
+
     def generate_embedding(self, text: str) -> List[float]:
         """
         Generate embedding for a single text.
-
+
         Args:
             text: Text to embed
-
+
         Returns:
             List of floats representing the embedding vector
         """
         if not text or not text.strip():
             raise ValidationError("Text cannot be empty for embedding generation")
-
+
         # Check cache first
         cache_key = f"{self.model_name}:{hash(text)}"
         if cache_key in self._embedding_cache:
             return self._embedding_cache[cache_key]
-
+
         try:
             # Generate embedding
             embedding = self.model.encode([text], convert_to_tensor=False)[0]
             embedding_list = embedding.tolist()
-
+
             # Cache for future use
             self._embedding_cache[cache_key] = embedding_list
-
+
             return embedding_list
         except Exception as e:
             raise DatabaseError(f"Failed to generate embedding: {e}")
-
+
     def generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
         """
         Generate embeddings for multiple texts efficiently.
-
+
         Args:
             texts: List of texts to embed
-
+
         Returns:
             List of embedding vectors
         """
         if not texts:
             return []
-
+
         # Filter out empty texts
         valid_texts = [text for text in texts if text and text.strip()]
         if not valid_texts:
             raise ValidationError("No valid texts provided for embedding generation")
-
+
         try:
-            embeddings = self.model.encode(valid_texts, convert_to_tensor=False, show_progress_bar=len(valid_texts) > 10)
+            embeddings = self.model.encode(
+                valid_texts, convert_to_tensor=False, show_progress_bar=len(valid_texts) > 10
+            )
             return [emb.tolist() for emb in embeddings]
         except Exception as e:
             raise DatabaseError(f"Failed to generate batch embeddings: {e}")
-
+
     def calculate_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
         """
         Calculate cosine similarity between two embeddings.
-
+
         Args:
             embedding1: First embedding vector
             embedding2: Second embedding vector
-
+
         Returns:
             Similarity score between 0 and 1
         """
@@ -144,7 +149,7 @@ class SemanticSearchEngine:
             return 0.0
         similarity = float(np.dot(embedding1, embedding2) / (norm1 * norm2))
         return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
-
+
         if not SENTENCE_TRANSFORMERS_AVAILABLE or util is None:
             # Fallback to numpy implementation
             norm1 = np.linalg.norm(embedding1)
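
Both numpy branches above compute the same clamped cosine similarity, sim(a, b) = (a · b) / (|a| |b|) clipped to [0, 1], with zero vectors scoring 0.0. An equivalent standalone sketch:

    import numpy as np

    def cosine_clamped(a: list, b: list) -> float:
        # Mirrors the fallback: zero-norm vectors -> 0.0, negative cosines clamped to 0.0.
        a, b = np.asarray(a), np.asarray(b)
        denom = float(np.linalg.norm(a) * np.linalg.norm(b))
        if denom == 0:
            return 0.0
        return max(0.0, min(1.0, float(np.dot(a, b)) / denom))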
@@ -153,78 +158,86 @@ class SemanticSearchEngine:
             return 0.0
         similarity = float(np.dot(embedding1, embedding2) / (norm1 * norm2))
         return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
-
+
         try:
             # Convert to tensors for efficient computation
             emb1_tensor = torch.tensor(embedding1)
             emb2_tensor = torch.tensor(embedding2)
-
+
             # Calculate cosine similarity
             similarity = util.cos_sim(emb1_tensor, emb2_tensor).item()
             return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
         except Exception as e:
             raise DatabaseError(f"Failed to calculate similarity: {e}")
-
-    def find_similar_embeddings(self, query_embedding: List[float],
-                                candidate_embeddings: List[List[float]],
-                                similarity_threshold: float = 0.5,
-                                top_k: int = 10) -> List[Tuple[int, float]]:
+
+    def find_similar_embeddings(
+        self,
+        query_embedding: List[float],
+        candidate_embeddings: List[List[float]],
+        similarity_threshold: float = 0.5,
+        top_k: int = 10,
+    ) -> List[Tuple[int, float]]:
         """
         Find the most similar embeddings to a query embedding.
-
+
         Args:
             query_embedding: Query vector
             candidate_embeddings: List of candidate vectors
             similarity_threshold: Minimum similarity score
             top_k: Maximum number of results
-
+
         Returns:
             List of (index, similarity_score) tuples, sorted by similarity descending
         """
         if not candidate_embeddings:
             return []
-
+
         # Use efficient torch/sentence-transformers if available
         if TORCH_AVAILABLE and torch is not None and SENTENCE_TRANSFORMERS_AVAILABLE and util is not None:
             try:
                 # Convert to tensors
                 query_tensor = torch.tensor(query_embedding).unsqueeze(0)
                 candidate_tensor = torch.tensor(candidate_embeddings)
-
+
                 # Calculate similarities
                 similarities = util.cos_sim(query_tensor, candidate_tensor)[0]
-
+
                 # Find matches above threshold
                 results = []
                 for idx, sim in enumerate(similarities):
                     sim_score = sim.item()
                     if sim_score >= similarity_threshold:
                         results.append((idx, sim_score))
-
+
                 # Sort by similarity descending and limit to top_k
                 results.sort(key=lambda x: x[1], reverse=True)
                 return results[:top_k]
-            except Exception as e: logging.warning(f"Torch similarity search failed, using numpy fallback: {e}")
-
+            except Exception as e:
+                logging.warning(f"Torch similarity search failed, using numpy fallback: {e}")
+
         # Fallback to numpy implementation
         results = []
         for idx, candidate_emb in enumerate(candidate_embeddings):
             similarity = self.calculate_similarity(query_embedding, candidate_emb)
             if similarity >= similarity_threshold:
                 results.append((idx, similarity))
-
+
         # Sort by similarity descending and limit to top_k
         results.sort(key=lambda x: x[1], reverse=True)
         return results[:top_k]
-
-    def semantic_search(self, query: str, content_data: List[Dict[str, Any]],
-                        embedding_column: str = "embedding",
-                        content_columns: Optional[List[str]] = None,
-                        similarity_threshold: float = 0.5,
-                        top_k: int = 10) -> List[Dict[str, Any]]:
+
+    def semantic_search(
+        self,
+        query: str,
+        content_data: List[Dict[str, Any]],
+        embedding_column: str = "embedding",
+        content_columns: Optional[List[str]] = None,
+        similarity_threshold: float = 0.5,
+        top_k: int = 10,
+    ) -> List[Dict[str, Any]]:
         """
         Perform semantic search on content data.
-
+
         Args:
             query: Natural language search query
             content_data: List of rows with embeddings and content
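
find_similar_embeddings returns (index, score) pairs, where the index refers to the position in the candidate list passed in, filtered by the threshold and truncated to top_k. A sketch of the expected shape (scores are illustrative):

    query_vec = engine.generate_embedding("database schema")
    candidates = engine.generate_embeddings_batch(["schema notes", "cat photos"])
    hits = engine.find_similar_embeddings(query_vec, candidates, similarity_threshold=0.5, top_k=10)
    # e.g. [(0, 0.71)] -- the second candidate falls below the 0.5 threshold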
@@ -232,24 +245,24 @@ class SemanticSearchEngine:
             content_columns: Columns to search in (for highlighting)
             similarity_threshold: Minimum similarity score
             top_k: Maximum number of results
-
+
         Returns:
             List of search results with similarity scores
         """
         if not query.strip():
             raise ValidationError("Search query cannot be empty")
-
+
         if not content_data:
             return []
-
+
         try:
             # Generate query embedding
             query_embedding = self.generate_embedding(query)
-
+
             # Extract embeddings from content data
             candidate_embeddings = []
             valid_indices = []
-
+
             for idx, row in enumerate(content_data):
                 if embedding_column in row and row[embedding_column]:
                     try:
@@ -257,31 +270,30 @@ class SemanticSearchEngine:
                         embedding = row[embedding_column]
                         if isinstance(embedding, str):
                             embedding = json.loads(embedding)
-
+
                         candidate_embeddings.append(embedding)
                         valid_indices.append(idx)
                     except (json.JSONDecodeError, TypeError) as e:
                         logging.warning(f"Invalid embedding data in row {idx}: {e}")
                         continue
-
+
             if not candidate_embeddings:
                 return []
-
+
             # Find similar embeddings
             similar_indices = self.find_similar_embeddings(
-                query_embedding, candidate_embeddings,
-                similarity_threshold, top_k
+                query_embedding, candidate_embeddings, similarity_threshold, top_k
             )
-
+
             # Build results
             results = []
             for candidate_idx, similarity_score in similar_indices:
                 original_idx = valid_indices[candidate_idx]
                 row = content_data[original_idx].copy()
-
+
                 # Add similarity score
-                row['similarity_score'] = round(similarity_score, 3)
-
+                row["similarity_score"] = round(similarity_score, 3)
+
                 # Add matched content highlighting if specified
                 if content_columns:
                     matched_content = []
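
As the hunk above shows, semantic_search accepts embeddings either as lists or as JSON strings (the form they take coming out of a SQLite TEXT column), and rows whose embeddings fail to parse are skipped with a warning. A hedged sketch with hypothetical rows:

    import json

    # vec1 is an embedding list, e.g. engine.generate_embedding("notes on indexing")
    rows = [
        {"id": 1, "content": "notes on indexing", "embedding": json.dumps(vec1)},
        {"id": 2, "content": "unrelated text", "embedding": "not valid json"},  # skipped
    ]
    results = engine.semantic_search("how are indexes stored?", rows, content_columns=["content"])
    # each result is a row copy plus "similarity_score", rounded to 3 decimal places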
@@ -289,24 +301,28 @@
                         if col in row and row[col] and query.lower() in str(row[col]).lower():
                             matched_content.append(f"{col}: {row[col]}")
                     if matched_content:
-                        row['matched_content'] = matched_content
-
+                        row["matched_content"] = matched_content
+
                 results.append(row)
-
+
             return results
-
+
         except Exception as e:
             raise DatabaseError(f"Semantic search failed: {e}")
-
-    def hybrid_search(self, query: str, content_data: List[Dict[str, Any]],
-                      text_columns: Optional[List[str]] = None,
-                      embedding_column: str = "embedding",
-                      semantic_weight: float = 0.7,
-                      text_weight: float = 0.3,
-                      top_k: int = 10) -> List[Dict[str, Any]]:
+
+    def hybrid_search(
+        self,
+        query: str,
+        content_data: List[Dict[str, Any]],
+        text_columns: Optional[List[str]] = None,
+        embedding_column: str = "embedding",
+        semantic_weight: float = 0.7,
+        text_weight: float = 0.3,
+        top_k: int = 10,
+    ) -> List[Dict[str, Any]]:
         """
         Combine semantic search with keyword matching for better results.
-
+
         Args:
             query: Search query
             content_data: Content to search
@@ -315,33 +331,29 @@ class SemanticSearchEngine:
             semantic_weight: Weight for semantic similarity (0-1)
             text_weight: Weight for text matching (0-1)
             top_k: Maximum results
-
+
         Returns:
             Ranked search results
         """
         if not content_data:
             return []
-
+
         # Normalize weights
         total_weight = semantic_weight + text_weight
         if total_weight > 0:
             semantic_weight /= total_weight
             text_weight /= total_weight
-
+
         # Get semantic search results
         semantic_results = self.semantic_search(
-            query, content_data, embedding_column,
-            similarity_threshold=0.3, top_k=top_k * 2  # Get more for reranking
+            query, content_data, embedding_column, similarity_threshold=0.3, top_k=top_k * 2  # Get more for reranking
         )
-
-        # Create result map for reranking
-        result_map = {id(result): result for result in semantic_results}
-
+
         # Add text matching scores
         query_lower = query.lower()
         for result in semantic_results:
             text_score = 0.0
-
+
             if text_columns:
                 for col in text_columns:
                     if col in result and result[col]:
@@ -349,18 +361,18 @@ class SemanticSearchEngine:
                         if query_lower in content:
                             # Simple frequency-based text scoring
                             text_score += content.count(query_lower) / len(content.split())
-
+
             # Combine scores
-            semantic_score = result.get('similarity_score', 0.0)
+            semantic_score = result.get("similarity_score", 0.0)
             combined_score = (semantic_score * semantic_weight) + (text_score * text_weight)
-            result['combined_score'] = round(combined_score, 3)
-            result['text_score'] = round(text_score, 3)
-
+            result["combined_score"] = round(combined_score, 3)
+            result["text_score"] = round(text_score, 3)
+
         # Sort by combined score
-        semantic_results.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
-
+        semantic_results.sort(key=lambda x: x.get("combined_score", 0), reverse=True)
+
         return semantic_results[:top_k]
-
+
     def clear_cache(self):
         """Clear the embedding cache."""
         self._embedding_cache.clear()
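
For the scoring in this hunk: the two weights are first normalized to sum to 1, the text score is query occurrences divided by word count, and the blend is semantic_weight * similarity + text_weight * text_score. Worked arithmetic under the 0.7/0.3 defaults:

    # A row with similarity 0.8 whose text contains the query once in 20 words:
    semantic_score, text_score = 0.8, 1 / 20
    combined = semantic_score * 0.7 + text_score * 0.3  # 0.56 + 0.015
    assert round(combined, 3) == 0.575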
@@ -374,10 +386,10 @@ _semantic_engine: Optional[SemanticSearchEngine] = None
 def get_semantic_engine(model_name: str = "all-MiniLM-L6-v2") -> SemanticSearchEngine:
     """Get or create the global semantic search engine."""
     global _semantic_engine
-
+
     if _semantic_engine is None or _semantic_engine.model_name != model_name:
         _semantic_engine = SemanticSearchEngine(model_name)
-
+
     return _semantic_engine
 
 
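
get_semantic_engine keeps one engine per process and only rebuilds it when a different model name is requested, so the embedding cache survives across calls. A brief sketch (the alternate model name is just an example):

    e1 = get_semantic_engine()                      # creates the engine
    e2 = get_semantic_engine()                      # returns the same object
    assert e1 is e2
    e3 = get_semantic_engine("all-mpnet-base-v2")   # new model name -> new engine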