marqeta-diva-mcp 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ """Vector store management for transaction embeddings using ChromaDB."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional
6
+ import chromadb
7
+ from chromadb.config import Settings
8
+
9
+
10
class TransactionVectorStore:
    """Manages vector storage and retrieval for transaction embeddings.

    Wraps a persistent ChromaDB client and a single collection named
    ``"transactions"``. The collection handle is resolved lazily: most
    methods call :meth:`create_collection` on first use. Progress and error
    messages are written to stderr.
    """

    def __init__(self, persist_directory: str = "./chroma_db"):
        """
        Initialize the vector store.

        Creates ``persist_directory`` (including parents) if it does not
        exist and opens a persistent ChromaDB client on it. The collection
        itself is not opened here.

        Args:
            persist_directory: Directory where ChromaDB will persist data
        """
        self.persist_directory = Path(persist_directory)
        self.persist_directory.mkdir(parents=True, exist_ok=True)

        print(f"[VectorStore] Initializing ChromaDB at {self.persist_directory}...", file=sys.stderr)

        # Initialize ChromaDB client with persistence
        self.client = chromadb.PersistentClient(
            path=str(self.persist_directory),
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        # Collection for transactions; the handle stays None until
        # create_collection() is called (directly or lazily).
        self.collection_name = "transactions"
        self.collection = None

        print("[VectorStore] ChromaDB initialized successfully", file=sys.stderr)

    def create_collection(self, embedding_dimension: int = 384) -> None:
        """
        Create or get the transactions collection.

        Args:
            embedding_dimension: Dimension of embedding vectors (default: 384
                for MiniLM). Currently not used by this method; it is kept
                so existing callers keep working.

        Raises:
            Exception: Whatever the ChromaDB client raises; the error is
                logged to stderr and re-raised unchanged.
        """
        try:
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"hnsw:space": "cosine"}  # Use cosine similarity
            )
            print(f"[VectorStore] Collection '{self.collection_name}' ready", file=sys.stderr)
        except Exception as e:
            print(f"[VectorStore] Error creating collection: {e}", file=sys.stderr)
            raise

    def add_transactions(
        self,
        transactions: List[Dict[str, Any]],
        embeddings: List[List[float]]
    ) -> int:
        """
        Add transactions and their embeddings to the vector store.

        Transactions without a ``transaction_token`` are skipped together
        with their embedding, so the ids and vectors handed to ChromaDB
        always stay aligned.

        Args:
            transactions: List of transaction dictionaries
            embeddings: List of embedding vectors, one per transaction and in
                the same order

        Returns:
            Number of transactions added (0 if nothing was eligible)

        Raises:
            ValueError: If ``transactions`` and ``embeddings`` differ in length
        """
        if self.collection is None:
            self.create_collection()

        if len(transactions) != len(embeddings):
            raise ValueError(f"Mismatch: {len(transactions)} transactions, {len(embeddings)} embeddings")

        # Prepare data for ChromaDB
        ids: List[str] = []
        kept_embeddings: List[List[float]] = []
        metadatas: List[Optional[Dict[str, Any]]] = []
        documents: List[str] = []

        for txn, embedding in zip(transactions, embeddings):
            # Use transaction_token as ID
            txn_id = txn.get("transaction_token")
            if not txn_id:
                print("[VectorStore] Warning: Transaction missing token, skipping", file=sys.stderr)
                continue

            # ChromaDB requires string IDs - convert if needed
            txn_id = str(txn_id)

            # An explicit None amount is treated like a missing one instead
            # of crashing in float().
            raw_amount = txn.get("transaction_amount")
            amount = float(raw_amount) if raw_amount is not None else 0.0

            # Store essential metadata for filtering
            metadata = {
                "merchant_name": txn.get("merchant_name", ""),
                "transaction_amount": amount,
                "transaction_type": txn.get("transaction_type", ""),
                "state": txn.get("state", txn.get("transaction_status", "")),
                "user_token": txn.get("user_token", txn.get("acting_user_token", "")),
                "card_token": txn.get("card_token", ""),
                "created_time": txn.get("created_time", txn.get("transaction_timestamp", "")),
                "network": txn.get("network", ""),
            }

            # Remove empty values to save space (this also drops None values
            # and a zero amount).
            metadata = {k: v for k, v in metadata.items() if v}

            # Store human-readable document text
            doc_parts = []
            if metadata.get("merchant_name"):
                doc_parts.append(f"Merchant: {metadata['merchant_name']}")
            if metadata.get("transaction_amount"):
                doc_parts.append(f"Amount: ${metadata['transaction_amount']:.2f}")
            if metadata.get("transaction_type"):
                doc_parts.append(f"Type: {metadata['transaction_type']}")

            ids.append(txn_id)
            kept_embeddings.append(embedding)
            # NOTE(review): ChromaDB rejects an empty metadata dict but
            # accepts None for a record without metadata - confirm against
            # the pinned chromadb version.
            metadatas.append(metadata or None)
            documents.append(" | ".join(doc_parts) if doc_parts else txn_id)

        if not ids:
            # Nothing eligible: avoid calling add() with empty lists.
            print("[VectorStore] Added 0 transactions to vector store", file=sys.stderr)
            return 0

        # Add to collection
        self.collection.add(
            ids=ids,
            embeddings=kept_embeddings,
            metadatas=metadatas,
            documents=documents
        )

        print(f"[VectorStore] Added {len(ids)} transactions to vector store", file=sys.stderr)
        return len(ids)

    def search(
        self,
        query_embedding: List[float],
        n_results: int = 10,
        where: Optional[Dict[str, Any]] = None,
        where_document: Optional[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Search for similar transactions.

        Args:
            query_embedding: Query embedding vector
            n_results: Number of results to return
            where: Metadata filter conditions (e.g., {"transaction_amount": {"$gt": 100}})
            where_document: Document text filter conditions

        Returns:
            Dictionary with ``count`` and ``transactions``; each transaction
            has ``transaction_token``, ``similarity_score`` (1 - distance),
            ``metadata`` and ``document``. ``{"count": 0, "transactions": []}``
            when there are no matches.
        """
        if self.collection is None:
            self.create_collection()

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            where=where,
            where_document=where_document
        )

        # Only one query embedding is sent, so only the first result row
        # is relevant.
        if not (results["ids"] and results["ids"][0]):
            return {"count": 0, "transactions": []}

        matched_ids = results["ids"][0]
        distances = results["distances"][0]
        metadatas = results["metadatas"][0] if results["metadatas"] else None
        documents = results["documents"][0] if results["documents"] else None

        # Reformat results for easier consumption
        matches = []
        for i, txn_id in enumerate(matched_ids):
            matches.append({
                "transaction_token": txn_id,
                "similarity_score": 1 - distances[i],  # Convert distance to similarity
                # Records stored without metadata come back as None.
                "metadata": (metadatas[i] or {}) if metadatas else {},
                "document": documents[i] if documents else ""
            })

        return {"count": len(matched_ids), "transactions": matches}

    def get_by_id(self, transaction_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a transaction by its ID.

        Lookup errors are logged to stderr and reported as "not found"
        rather than raised.

        Args:
            transaction_id: Transaction token

        Returns:
            Dictionary with ``transaction_token``, ``embedding``, ``metadata``
            and ``document``, or None if not found
        """
        if self.collection is None:
            self.create_collection()

        try:
            result = self.collection.get(ids=[transaction_id], include=["embeddings", "metadatas", "documents"])

            if result["ids"]:
                embeddings = result["embeddings"]
                # Embeddings may be a NumPy array, whose truth value is
                # ambiguous; test for presence explicitly.
                has_embedding = embeddings is not None and len(embeddings) > 0
                metadatas = result["metadatas"]
                documents = result["documents"]
                return {
                    "transaction_token": result["ids"][0],
                    "embedding": embeddings[0] if has_embedding else None,
                    "metadata": (metadatas[0] or {}) if metadatas else {},
                    "document": documents[0] if documents else ""
                }
        except Exception as e:
            print(f"[VectorStore] Error retrieving transaction {transaction_id}: {e}", file=sys.stderr)

        return None

    def delete_transactions(self, transaction_ids: List[str]) -> int:
        """
        Delete transactions from the vector store.

        Does nothing when the collection handle has not been opened yet or
        when ``transaction_ids`` is empty. Errors are logged to stderr and
        reported as 0.

        Args:
            transaction_ids: List of transaction tokens to delete

        Returns:
            Number of ids submitted for deletion (it is not checked whether
            they existed), or 0 if nothing was deleted
        """
        if self.collection is None or not transaction_ids:
            return 0

        try:
            self.collection.delete(ids=transaction_ids)
            print(f"[VectorStore] Deleted {len(transaction_ids)} transactions", file=sys.stderr)
            return len(transaction_ids)
        except Exception as e:
            print(f"[VectorStore] Error deleting transactions: {e}", file=sys.stderr)
            return 0

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the vector store.

        Returns:
            Dictionary with ``collection_name``, ``count``,
            ``persist_directory`` and ``status``. While the collection handle
            has not been opened, ``status`` is ``"not_initialized"`` and
            ``count`` is reported as 0 without querying ChromaDB.
        """
        stats = {
            "collection_name": self.collection_name,
            "count": 0,
            "persist_directory": str(self.persist_directory),
            "status": "not_initialized"
        }

        if self.collection is not None:
            stats["count"] = self.collection.count()
            stats["status"] = "initialized"

        return stats

    def clear(self) -> None:
        """Clear all data from the collection.

        Deletes the whole collection and resets the handle; this is a no-op
        when the collection handle has not been opened.
        """
        if self.collection is not None:
            print(f"[VectorStore] Clearing collection '{self.collection_name}'...", file=sys.stderr)
            self.client.delete_collection(name=self.collection_name)
            self.collection = None
            print("[VectorStore] Collection cleared", file=sys.stderr)
263
+
264
+
265
# Global vector store instance (lazy-loaded on the first get_vector_store() call).
# Optional[...] is used instead of "X | None" because this annotation is
# evaluated at import time and the union operator on classes needs Python 3.10+.
_vector_store: Optional[TransactionVectorStore] = None
267
+
268
+
269
def get_vector_store(persist_directory: str = "./chroma_db") -> TransactionVectorStore:
    """Return the process-wide vector store, creating it on first use.

    Args:
        persist_directory: Directory handed to ``TransactionVectorStore`` when
            the shared instance is first created. Once an instance exists
            this argument is ignored and the existing instance is returned.

    Returns:
        The shared ``TransactionVectorStore`` instance
    """
    global _vector_store
    store = _vector_store
    if store is None:
        store = TransactionVectorStore(persist_directory=persist_directory)
        _vector_store = store
    return store