marqeta_diva_mcp-0.2.0-py3-none-any.whl
This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- marqeta_diva_mcp/__init__.py +3 -0
- marqeta_diva_mcp/__main__.py +6 -0
- marqeta_diva_mcp/client.py +471 -0
- marqeta_diva_mcp/embeddings.py +131 -0
- marqeta_diva_mcp/local_storage.py +348 -0
- marqeta_diva_mcp/rag_tools.py +366 -0
- marqeta_diva_mcp/server.py +940 -0
- marqeta_diva_mcp/vector_store.py +274 -0
- marqeta_diva_mcp-0.2.0.dist-info/METADATA +515 -0
- marqeta_diva_mcp-0.2.0.dist-info/RECORD +13 -0
- marqeta_diva_mcp-0.2.0.dist-info/WHEEL +4 -0
- marqeta_diva_mcp-0.2.0.dist-info/entry_points.txt +2 -0
- marqeta_diva_mcp-0.2.0.dist-info/licenses/LICENSE +21 -0
marqeta_diva_mcp/vector_store.py

@@ -0,0 +1,274 @@
"""Vector store management for transaction embeddings using ChromaDB."""

import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

import chromadb
from chromadb.config import Settings


class TransactionVectorStore:
    """Manages vector storage and retrieval for transaction embeddings."""

    def __init__(self, persist_directory: str = "./chroma_db"):
        """
        Initialize the vector store.

        Args:
            persist_directory: Directory where ChromaDB will persist data
        """
        self.persist_directory = Path(persist_directory)
        self.persist_directory.mkdir(parents=True, exist_ok=True)

        print(f"[VectorStore] Initializing ChromaDB at {self.persist_directory}...", file=sys.stderr)

        # Initialize ChromaDB client with persistence
        self.client = chromadb.PersistentClient(
            path=str(self.persist_directory),
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        # Collection for transactions
        self.collection_name = "transactions"
        self.collection = None

        print(f"[VectorStore] ChromaDB initialized successfully", file=sys.stderr)

    def create_collection(self, embedding_dimension: int = 384) -> None:
        """
        Create or get the transactions collection.

        Args:
            embedding_dimension: Dimension of embedding vectors (default: 384 for MiniLM)
        """
        try:
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"hnsw:space": "cosine"}  # Use cosine similarity
            )
            print(f"[VectorStore] Collection '{self.collection_name}' ready", file=sys.stderr)
        except Exception as e:
            print(f"[VectorStore] Error creating collection: {e}", file=sys.stderr)
            raise

    def add_transactions(
        self,
        transactions: List[Dict[str, Any]],
        embeddings: List[List[float]]
    ) -> int:
        """
        Add transactions and their embeddings to the vector store.

        Args:
            transactions: List of transaction dictionaries
            embeddings: List of embedding vectors

        Returns:
            Number of transactions added
        """
        if self.collection is None:
            self.create_collection()

        if len(transactions) != len(embeddings):
            raise ValueError(f"Mismatch: {len(transactions)} transactions, {len(embeddings)} embeddings")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents = []

        for txn in transactions:
            # Use transaction_token as ID
            txn_id = txn.get("transaction_token")
            if not txn_id:
                print(f"[VectorStore] Warning: Transaction missing token, skipping", file=sys.stderr)
                continue

            # ChromaDB requires string IDs - convert if needed
            if not isinstance(txn_id, str):
                txn_id = str(txn_id)

            ids.append(txn_id)

            # Store essential metadata for filtering
            metadata = {
                "merchant_name": txn.get("merchant_name", ""),
                "transaction_amount": float(txn.get("transaction_amount", 0.0)),
                "transaction_type": txn.get("transaction_type", ""),
                "state": txn.get("state", txn.get("transaction_status", "")),
                "user_token": txn.get("user_token", txn.get("acting_user_token", "")),
                "card_token": txn.get("card_token", ""),
                "created_time": txn.get("created_time", txn.get("transaction_timestamp", "")),
                "network": txn.get("network", ""),
            }

            # Remove empty values to save space
            metadata = {k: v for k, v in metadata.items() if v}

            metadatas.append(metadata)

            # Store human-readable document text
            doc_parts = []
            if metadata.get("merchant_name"):
                doc_parts.append(f"Merchant: {metadata['merchant_name']}")
            if metadata.get("transaction_amount"):
                doc_parts.append(f"Amount: ${metadata['transaction_amount']:.2f}")
            if metadata.get("transaction_type"):
                doc_parts.append(f"Type: {metadata['transaction_type']}")

            documents.append(" | ".join(doc_parts) if doc_parts else txn_id)

        # Add to collection
        self.collection.add(
            ids=ids,
            embeddings=embeddings,
            metadatas=metadatas,
            documents=documents
        )

        print(f"[VectorStore] Added {len(ids)} transactions to vector store", file=sys.stderr)
        return len(ids)

    def search(
        self,
        query_embedding: List[float],
        n_results: int = 10,
        where: Optional[Dict[str, Any]] = None,
        where_document: Optional[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Search for similar transactions.

        Args:
            query_embedding: Query embedding vector
            n_results: Number of results to return
            where: Metadata filter conditions (e.g., {"transaction_amount": {"$gt": 100}})
            where_document: Document text filter conditions

        Returns:
            Dictionary with ids, distances, metadatas, and documents
        """
        if self.collection is None:
            self.create_collection()

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            where=where,
            where_document=where_document
        )

        # Reformat results for easier consumption
        if results["ids"] and results["ids"][0]:
            formatted_results = {
                "count": len(results["ids"][0]),
                "transactions": []
            }

            for i in range(len(results["ids"][0])):
                formatted_results["transactions"].append({
                    "transaction_token": results["ids"][0][i],
                    "similarity_score": 1 - results["distances"][0][i],  # Convert distance to similarity
                    "metadata": results["metadatas"][0][i] if results["metadatas"] else {},
                    "document": results["documents"][0][i] if results["documents"] else ""
                })

            return formatted_results
        else:
            return {"count": 0, "transactions": []}

    def get_by_id(self, transaction_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a transaction by its ID.

        Args:
            transaction_id: Transaction token

        Returns:
            Transaction data or None if not found
        """
        if self.collection is None:
            self.create_collection()

        try:
            result = self.collection.get(ids=[transaction_id], include=["embeddings", "metadatas", "documents"])

            if result["ids"]:
                return {
                    "transaction_token": result["ids"][0],
                    "embedding": result["embeddings"][0] if result["embeddings"] else None,
                    "metadata": result["metadatas"][0] if result["metadatas"] else {},
                    "document": result["documents"][0] if result["documents"] else ""
                }
        except Exception as e:
            print(f"[VectorStore] Error retrieving transaction {transaction_id}: {e}", file=sys.stderr)

        return None

    def delete_transactions(self, transaction_ids: List[str]) -> int:
        """
        Delete transactions from the vector store.

        Args:
            transaction_ids: List of transaction tokens to delete

        Returns:
            Number of transactions deleted
        """
        if self.collection is None:
            return 0

        try:
            self.collection.delete(ids=transaction_ids)
            print(f"[VectorStore] Deleted {len(transaction_ids)} transactions", file=sys.stderr)
            return len(transaction_ids)
        except Exception as e:
            print(f"[VectorStore] Error deleting transactions: {e}", file=sys.stderr)
            return 0

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the vector store.

        Returns:
            Dictionary with collection statistics
        """
        if self.collection is None:
            return {
                "collection_name": self.collection_name,
                "count": 0,
                "persist_directory": str(self.persist_directory),
                "status": "not_initialized"
            }

        count = self.collection.count()

        return {
            "collection_name": self.collection_name,
            "count": count,
            "persist_directory": str(self.persist_directory),
            "status": "initialized"
        }

    def clear(self) -> None:
        """Clear all data from the collection."""
        if self.collection is not None:
            print(f"[VectorStore] Clearing collection '{self.collection_name}'...", file=sys.stderr)
            self.client.delete_collection(name=self.collection_name)
            self.collection = None
            print(f"[VectorStore] Collection cleared", file=sys.stderr)


# Global vector store instance (lazy-loaded)
_vector_store: TransactionVectorStore | None = None


def get_vector_store(persist_directory: str = "./chroma_db") -> TransactionVectorStore:
    """Get or create the global vector store instance."""
    global _vector_store
    if _vector_store is None:
        _vector_store = TransactionVectorStore(persist_directory=persist_directory)
    return _vector_store
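For orientation, a minimal usage sketch of the module above follows. It is illustrative only and not part of the wheel: the transaction dict and the 384-dimensional vectors are placeholder values, and in the published package the vectors would presumably come from marqeta_diva_mcp/embeddings.py.

# Illustrative sketch only -- not part of the package. All data below is placeholder.
from marqeta_diva_mcp.vector_store import get_vector_store

store = get_vector_store(persist_directory="./chroma_db")
store.create_collection()

# Hypothetical transaction and a placeholder 384-dimensional embedding.
transactions = [
    {
        "transaction_token": "txn_demo_001",
        "merchant_name": "Example Coffee",
        "transaction_amount": 4.50,
        "transaction_type": "authorization",
        "state": "COMPLETION",
    }
]
embeddings = [[0.1] * 384]

store.add_transactions(transactions, embeddings)

# Query with a metadata filter using the ChromaDB "where" syntax noted in the
# search() docstring; similarity_score is 1 minus the cosine distance.
results = store.search(
    query_embedding=[0.1] * 384,
    n_results=5,
    where={"transaction_amount": {"$gt": 1.0}},
)
for hit in results["transactions"]:
    print(hit["transaction_token"], hit["similarity_score"], hit["document"])

print(store.get_stats())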