mcp-sqlite-memory-bank 1.3.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- mcp_sqlite_memory_bank/database.py +247 -160
- mcp_sqlite_memory_bank/prompts.py +252 -0
- mcp_sqlite_memory_bank/resources.py +164 -0
- mcp_sqlite_memory_bank/semantic.py +107 -95
- mcp_sqlite_memory_bank/server.py +183 -33
- mcp_sqlite_memory_bank/types.py +6 -0
- mcp_sqlite_memory_bank/utils.py +5 -2
- {mcp_sqlite_memory_bank-1.3.0.dist-info → mcp_sqlite_memory_bank-1.4.1.dist-info}/METADATA +168 -4
- mcp_sqlite_memory_bank-1.4.1.dist-info/RECORD +15 -0
- mcp_sqlite_memory_bank-1.3.0.dist-info/RECORD +0 -13
- {mcp_sqlite_memory_bank-1.3.0.dist-info → mcp_sqlite_memory_bank-1.4.1.dist-info}/WHEEL +0 -0
- {mcp_sqlite_memory_bank-1.3.0.dist-info → mcp_sqlite_memory_bank-1.4.1.dist-info}/entry_points.txt +0 -0
- {mcp_sqlite_memory_bank-1.3.0.dist-info → mcp_sqlite_memory_bank-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {mcp_sqlite_memory_bank-1.3.0.dist-info → mcp_sqlite_memory_bank-1.4.1.dist-info}/top_level.txt +0 -0
The diff body below covers `mcp_sqlite_memory_bank/semantic.py` (+107 −95). Paired blank-line removals and additions are whitespace-only changes (trailing whitespace stripped), and removed lines that the diff viewer truncated are shown empty.

```diff
--- a/mcp_sqlite_memory_bank/semantic.py
+++ b/mcp_sqlite_memory_bank/semantic.py
@@ -11,11 +11,11 @@ import json
 import logging
 from typing import List, Dict, Any, Optional, Tuple
 import numpy as np
-from functools import lru_cache
 
 # Optional imports with graceful fallback
 try:
     from sentence_transformers import SentenceTransformer, util
+
     SENTENCE_TRANSFORMERS_AVAILABLE = True
 except ImportError:
     SENTENCE_TRANSFORMERS_AVAILABLE = False
```
```diff
@@ -25,36 +25,39 @@ except ImportError:
 
 try:
     import torch
+
     TORCH_AVAILABLE = True
 except ImportError:
     TORCH_AVAILABLE = False
     torch = None
     logging.warning("torch not available. Install with: pip install torch")
 
-from .types import
+from .types import ValidationError, DatabaseError
 
 
 class SemanticSearchEngine:
     """
     Handles semantic search using sentence-transformers.
-
+
     Features:
     - Vector embeddings for text content
     - Semantic similarity search
     - Hybrid search combining semantic + keyword matching
     - Caching for performance
     """
-
+
     def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
         """Initialize the semantic search engine."""
         self.model_name = model_name
         self._model = None
         self._embedding_cache = {}
-
+
         if not SENTENCE_TRANSFORMERS_AVAILABLE:
-            raise ValueError(
-
-
+            raise ValueError(
+                "sentence-transformers is not available. Please install with: pip install sentence-transformers"
+            )
+
+    @property
     def model(self):
         """Lazy load the sentence transformer model."""
         if self._model is None:
```
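The hunks above leave `SENTENCE_TRANSFORMERS_AVAILABLE` and `TORCH_AVAILABLE` as module-level flags, and the constructor raises `ValueError` when sentence-transformers is missing. A minimal sketch of feature-detecting before construction (import path inferred from the package layout above; behavior otherwise as the diff shows):

```python
# Sketch: probe the availability flag before building the engine,
# falling back to keyword-only search when embeddings are unavailable.
from mcp_sqlite_memory_bank.semantic import (
    SENTENCE_TRANSFORMERS_AVAILABLE,
    SemanticSearchEngine,
)

engine = None
if SENTENCE_TRANSFORMERS_AVAILABLE:
    engine = SemanticSearchEngine("all-MiniLM-L6-v2")  # would raise ValueError otherwise
```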
```diff
@@ -66,73 +69,75 @@ class SemanticSearchEngine:
             except Exception as e:
                 raise DatabaseError(f"Failed to load semantic search model {self.model_name}: {e}")
         return self._model
-
+
     def get_embedding_dimensions(self) -> Optional[int]:
         """Get the embedding dimension size for the current model."""
         return self.model.get_sentence_embedding_dimension()
-
+
     def generate_embedding(self, text: str) -> List[float]:
         """
         Generate embedding for a single text.
-
+
         Args:
             text: Text to embed
-
+
         Returns:
             List of floats representing the embedding vector
         """
         if not text or not text.strip():
             raise ValidationError("Text cannot be empty for embedding generation")
-
+
         # Check cache first
         cache_key = f"{self.model_name}:{hash(text)}"
         if cache_key in self._embedding_cache:
             return self._embedding_cache[cache_key]
-
+
         try:
             # Generate embedding
             embedding = self.model.encode([text], convert_to_tensor=False)[0]
             embedding_list = embedding.tolist()
-
+
             # Cache for future use
             self._embedding_cache[cache_key] = embedding_list
-
+
             return embedding_list
         except Exception as e:
             raise DatabaseError(f"Failed to generate embedding: {e}")
-
+
     def generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
         """
         Generate embeddings for multiple texts efficiently.
-
+
         Args:
             texts: List of texts to embed
-
+
         Returns:
             List of embedding vectors
         """
         if not texts:
             return []
-
+
         # Filter out empty texts
         valid_texts = [text for text in texts if text and text.strip()]
         if not valid_texts:
             raise ValidationError("No valid texts provided for embedding generation")
-
+
         try:
-            embeddings = self.model.encode(
+            embeddings = self.model.encode(
+                valid_texts, convert_to_tensor=False, show_progress_bar=len(valid_texts) > 10
+            )
             return [emb.tolist() for emb in embeddings]
         except Exception as e:
             raise DatabaseError(f"Failed to generate batch embeddings: {e}")
-
+
     def calculate_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
         """
         Calculate cosine similarity between two embeddings.
-
+
         Args:
             embedding1: First embedding vector
             embedding2: Second embedding vector
-
+
         Returns:
             Similarity score between 0 and 1
         """
```
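To make the caching and batch behavior in this hunk concrete, a short usage sketch (assumes sentence-transformers is installed; the dimension comment reflects the default all-MiniLM-L6-v2 model):

```python
from mcp_sqlite_memory_bank.semantic import SemanticSearchEngine

engine = SemanticSearchEngine()  # defaults to "all-MiniLM-L6-v2"

vec = engine.generate_embedding("hello world")           # encodes, then caches
assert vec == engine.generate_embedding("hello world")   # repeat call is a cache hit
assert len(vec) == engine.get_embedding_dimensions()     # 384 for all-MiniLM-L6-v2

# The batch path filters out empty strings before encoding:
batch = engine.generate_embeddings_batch(["alpha", "", "beta"])
assert len(batch) == 2
```

Note that the cache key is `f"{self.model_name}:{hash(text)}"`, so cached vectors are only reusable within one interpreter process: Python salts `str` hashing per run.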
```diff
@@ -144,7 +149,7 @@ class SemanticSearchEngine:
                 return 0.0
             similarity = float(np.dot(embedding1, embedding2) / (norm1 * norm2))
             return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
-
+
         if not SENTENCE_TRANSFORMERS_AVAILABLE or util is None:
             # Fallback to numpy implementation
             norm1 = np.linalg.norm(embedding1)
```
```diff
@@ -153,78 +158,86 @@ class SemanticSearchEngine:
                 return 0.0
             similarity = float(np.dot(embedding1, embedding2) / (norm1 * norm2))
             return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
-
+
         try:
             # Convert to tensors for efficient computation
             emb1_tensor = torch.tensor(embedding1)
             emb2_tensor = torch.tensor(embedding2)
-
+
             # Calculate cosine similarity
             similarity = util.cos_sim(emb1_tensor, emb2_tensor).item()
             return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]
         except Exception as e:
             raise DatabaseError(f"Failed to calculate similarity: {e}")
-
-    def find_similar_embeddings(
-
-
-
+
+    def find_similar_embeddings(
+        self,
+        query_embedding: List[float],
+        candidate_embeddings: List[List[float]],
+        similarity_threshold: float = 0.5,
+        top_k: int = 10,
+    ) -> List[Tuple[int, float]]:
         """
         Find the most similar embeddings to a query embedding.
-
+
         Args:
             query_embedding: Query vector
             candidate_embeddings: List of candidate vectors
             similarity_threshold: Minimum similarity score
             top_k: Maximum number of results
-
+
         Returns:
             List of (index, similarity_score) tuples, sorted by similarity descending
         """
         if not candidate_embeddings:
             return []
-
+
         # Use efficient torch/sentence-transformers if available
         if TORCH_AVAILABLE and torch is not None and SENTENCE_TRANSFORMERS_AVAILABLE and util is not None:
             try:
                 # Convert to tensors
                 query_tensor = torch.tensor(query_embedding).unsqueeze(0)
                 candidate_tensor = torch.tensor(candidate_embeddings)
-
+
                 # Calculate similarities
                 similarities = util.cos_sim(query_tensor, candidate_tensor)[0]
-
+
                 # Find matches above threshold
                 results = []
                 for idx, sim in enumerate(similarities):
                     sim_score = sim.item()
                     if sim_score >= similarity_threshold:
                         results.append((idx, sim_score))
-
+
                 # Sort by similarity descending and limit to top_k
                 results.sort(key=lambda x: x[1], reverse=True)
                 return results[:top_k]
-            except Exception as e:
-
+            except Exception as e:
+                logging.warning(f"Torch similarity search failed, using numpy fallback: {e}")
+
         # Fallback to numpy implementation
         results = []
         for idx, candidate_emb in enumerate(candidate_embeddings):
             similarity = self.calculate_similarity(query_embedding, candidate_emb)
             if similarity >= similarity_threshold:
                 results.append((idx, similarity))
-
+
         # Sort by similarity descending and limit to top_k
         results.sort(key=lambda x: x[1], reverse=True)
         return results[:top_k]
-
-    def semantic_search(
-
-
-
-
+
+    def semantic_search(
+        self,
+        query: str,
+        content_data: List[Dict[str, Any]],
+        embedding_column: str = "embedding",
+        content_columns: Optional[List[str]] = None,
+        similarity_threshold: float = 0.5,
+        top_k: int = 10,
+    ) -> List[Dict[str, Any]]:
         """
         Perform semantic search on content data.
-
+
         Args:
             query: Natural language search query
             content_data: List of rows with embeddings and content
```
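A toy sketch of `find_similar_embeddings` with hand-made 2-D vectors, to show the threshold and ordering behavior; real callers pass vectors from the embedding methods above, and constructing the engine still requires sentence-transformers:

```python
from mcp_sqlite_memory_bank.semantic import SemanticSearchEngine

engine = SemanticSearchEngine()

query = [1.0, 0.0]
candidates = [
    [1.0, 0.0],  # identical vector   -> cosine similarity 1.0
    [0.0, 1.0],  # orthogonal         -> 0.0, dropped by the 0.5 threshold
    [1.0, 1.0],  # 45 degrees apart   -> ~0.707
]

matches = engine.find_similar_embeddings(query, candidates, similarity_threshold=0.5, top_k=10)
# -> [(0, 1.0), (2, 0.707...)]: (candidate index, score) tuples, best first
```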
```diff
@@ -232,24 +245,24 @@ class SemanticSearchEngine:
             content_columns: Columns to search in (for highlighting)
             similarity_threshold: Minimum similarity score
             top_k: Maximum number of results
-
+
         Returns:
             List of search results with similarity scores
         """
         if not query.strip():
             raise ValidationError("Search query cannot be empty")
-
+
         if not content_data:
             return []
-
+
         try:
             # Generate query embedding
             query_embedding = self.generate_embedding(query)
-
+
             # Extract embeddings from content data
             candidate_embeddings = []
             valid_indices = []
-
+
             for idx, row in enumerate(content_data):
                 if embedding_column in row and row[embedding_column]:
                     try:
```
|
|
257
270
|
embedding = row[embedding_column]
|
258
271
|
if isinstance(embedding, str):
|
259
272
|
embedding = json.loads(embedding)
|
260
|
-
|
273
|
+
|
261
274
|
candidate_embeddings.append(embedding)
|
262
275
|
valid_indices.append(idx)
|
263
276
|
except (json.JSONDecodeError, TypeError) as e:
|
264
277
|
logging.warning(f"Invalid embedding data in row {idx}: {e}")
|
265
278
|
continue
|
266
|
-
|
279
|
+
|
267
280
|
if not candidate_embeddings:
|
268
281
|
return []
|
269
|
-
|
282
|
+
|
270
283
|
# Find similar embeddings
|
271
284
|
similar_indices = self.find_similar_embeddings(
|
272
|
-
query_embedding, candidate_embeddings,
|
273
|
-
similarity_threshold, top_k
|
285
|
+
query_embedding, candidate_embeddings, similarity_threshold, top_k
|
274
286
|
)
|
275
|
-
|
287
|
+
|
276
288
|
# Build results
|
277
289
|
results = []
|
278
290
|
for candidate_idx, similarity_score in similar_indices:
|
279
291
|
original_idx = valid_indices[candidate_idx]
|
280
292
|
row = content_data[original_idx].copy()
|
281
|
-
|
293
|
+
|
282
294
|
# Add similarity score
|
283
|
-
row[
|
284
|
-
|
295
|
+
row["similarity_score"] = round(similarity_score, 3)
|
296
|
+
|
285
297
|
# Add matched content highlighting if specified
|
286
298
|
if content_columns:
|
287
299
|
matched_content = []
|
@@ -289,24 +301,28 @@ class SemanticSearchEngine:
|
|
289
301
|
if col in row and row[col] and query.lower() in str(row[col]).lower():
|
290
302
|
matched_content.append(f"{col}: {row[col]}")
|
291
303
|
if matched_content:
|
292
|
-
row[
|
293
|
-
|
304
|
+
row["matched_content"] = matched_content
|
305
|
+
|
294
306
|
results.append(row)
|
295
|
-
|
307
|
+
|
296
308
|
return results
|
297
|
-
|
309
|
+
|
298
310
|
except Exception as e:
|
299
311
|
raise DatabaseError(f"Semantic search failed: {e}")
|
300
|
-
|
301
|
-
def hybrid_search(
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
312
|
+
|
313
|
+
def hybrid_search(
|
314
|
+
self,
|
315
|
+
query: str,
|
316
|
+
content_data: List[Dict[str, Any]],
|
317
|
+
text_columns: Optional[List[str]] = None,
|
318
|
+
embedding_column: str = "embedding",
|
319
|
+
semantic_weight: float = 0.7,
|
320
|
+
text_weight: float = 0.3,
|
321
|
+
top_k: int = 10,
|
322
|
+
) -> List[Dict[str, Any]]:
|
307
323
|
"""
|
308
324
|
Combine semantic search with keyword matching for better results.
|
309
|
-
|
325
|
+
|
310
326
|
Args:
|
311
327
|
query: Search query
|
312
328
|
content_data: Content to search
|
@@ -315,33 +331,29 @@ class SemanticSearchEngine:
|
|
315
331
|
semantic_weight: Weight for semantic similarity (0-1)
|
316
332
|
text_weight: Weight for text matching (0-1)
|
317
333
|
top_k: Maximum results
|
318
|
-
|
334
|
+
|
319
335
|
Returns:
|
320
336
|
Ranked search results
|
321
337
|
"""
|
322
338
|
if not content_data:
|
323
339
|
return []
|
324
|
-
|
340
|
+
|
325
341
|
# Normalize weights
|
326
342
|
total_weight = semantic_weight + text_weight
|
327
343
|
if total_weight > 0:
|
328
344
|
semantic_weight /= total_weight
|
329
345
|
text_weight /= total_weight
|
330
|
-
|
346
|
+
|
331
347
|
# Get semantic search results
|
332
348
|
semantic_results = self.semantic_search(
|
333
|
-
query, content_data, embedding_column,
|
334
|
-
similarity_threshold=0.3, top_k=top_k * 2 # Get more for reranking
|
349
|
+
query, content_data, embedding_column, similarity_threshold=0.3, top_k=top_k * 2 # Get more for reranking
|
335
350
|
)
|
336
|
-
|
337
|
-
# Create result map for reranking
|
338
|
-
result_map = {id(result): result for result in semantic_results}
|
339
|
-
|
351
|
+
|
340
352
|
# Add text matching scores
|
341
353
|
query_lower = query.lower()
|
342
354
|
for result in semantic_results:
|
343
355
|
text_score = 0.0
|
344
|
-
|
356
|
+
|
345
357
|
if text_columns:
|
346
358
|
for col in text_columns:
|
347
359
|
if col in result and result[col]:
|
@@ -349,18 +361,18 @@ class SemanticSearchEngine:
|
|
349
361
|
if query_lower in content:
|
350
362
|
# Simple frequency-based text scoring
|
351
363
|
text_score += content.count(query_lower) / len(content.split())
|
352
|
-
|
364
|
+
|
353
365
|
# Combine scores
|
354
|
-
semantic_score = result.get(
|
366
|
+
semantic_score = result.get("similarity_score", 0.0)
|
355
367
|
combined_score = (semantic_score * semantic_weight) + (text_score * text_weight)
|
356
|
-
result[
|
357
|
-
result[
|
358
|
-
|
368
|
+
result["combined_score"] = round(combined_score, 3)
|
369
|
+
result["text_score"] = round(text_score, 3)
|
370
|
+
|
359
371
|
# Sort by combined score
|
360
|
-
semantic_results.sort(key=lambda x: x.get(
|
361
|
-
|
372
|
+
semantic_results.sort(key=lambda x: x.get("combined_score", 0), reverse=True)
|
373
|
+
|
362
374
|
return semantic_results[:top_k]
|
363
|
-
|
375
|
+
|
364
376
|
def clear_cache(self):
|
365
377
|
"""Clear the embedding cache."""
|
366
378
|
self._embedding_cache.clear()
|
@@ -374,10 +386,10 @@ _semantic_engine: Optional[SemanticSearchEngine] = None
|
|
374
386
|
def get_semantic_engine(model_name: str = "all-MiniLM-L6-v2") -> SemanticSearchEngine:
|
375
387
|
"""Get or create the global semantic search engine."""
|
376
388
|
global _semantic_engine
|
377
|
-
|
389
|
+
|
378
390
|
if _semantic_engine is None or _semantic_engine.model_name != model_name:
|
379
391
|
_semantic_engine = SemanticSearchEngine(model_name)
|
380
|
-
|
392
|
+
|
381
393
|
return _semantic_engine
|
382
394
|
|
383
395
|
|
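Taken together, the module exposes a memoized engine via `get_semantic_engine()`, re-created only when the requested `model_name` changes. An end-to-end sketch (row contents invented for illustration; embeddings may be stored as JSON strings, which `semantic_search` decodes via `json.loads` as the earlier hunk shows):

```python
import json

from mcp_sqlite_memory_bank.semantic import get_semantic_engine

engine = get_semantic_engine()  # singleton; same instance on repeat calls

rows = [
    {"id": 1, "content": "how to reset a password",
     "embedding": json.dumps(engine.generate_embedding("how to reset a password"))},
    {"id": 2, "content": "quarterly revenue report",
     "embedding": json.dumps(engine.generate_embedding("quarterly revenue report"))},
]

hits = engine.semantic_search(
    "forgot my login", rows,
    content_columns=["content"], similarity_threshold=0.3, top_k=5,
)
for hit in hits:
    print(hit["id"], hit["similarity_score"])  # rows above threshold, best first
```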