roampal 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. roampal/__init__.py +29 -0
  2. roampal/__main__.py +6 -0
  3. roampal/backend/__init__.py +1 -0
  4. roampal/backend/modules/__init__.py +1 -0
  5. roampal/backend/modules/memory/__init__.py +43 -0
  6. roampal/backend/modules/memory/chromadb_adapter.py +623 -0
  7. roampal/backend/modules/memory/config.py +102 -0
  8. roampal/backend/modules/memory/content_graph.py +543 -0
  9. roampal/backend/modules/memory/context_service.py +455 -0
  10. roampal/backend/modules/memory/embedding_service.py +96 -0
  11. roampal/backend/modules/memory/knowledge_graph_service.py +1052 -0
  12. roampal/backend/modules/memory/memory_bank_service.py +433 -0
  13. roampal/backend/modules/memory/memory_types.py +296 -0
  14. roampal/backend/modules/memory/outcome_service.py +400 -0
  15. roampal/backend/modules/memory/promotion_service.py +473 -0
  16. roampal/backend/modules/memory/routing_service.py +444 -0
  17. roampal/backend/modules/memory/scoring_service.py +324 -0
  18. roampal/backend/modules/memory/search_service.py +646 -0
  19. roampal/backend/modules/memory/tests/__init__.py +1 -0
  20. roampal/backend/modules/memory/tests/conftest.py +12 -0
  21. roampal/backend/modules/memory/tests/unit/__init__.py +1 -0
  22. roampal/backend/modules/memory/tests/unit/conftest.py +7 -0
  23. roampal/backend/modules/memory/tests/unit/test_knowledge_graph_service.py +517 -0
  24. roampal/backend/modules/memory/tests/unit/test_memory_bank_service.py +504 -0
  25. roampal/backend/modules/memory/tests/unit/test_outcome_service.py +485 -0
  26. roampal/backend/modules/memory/tests/unit/test_scoring_service.py +255 -0
  27. roampal/backend/modules/memory/tests/unit/test_search_service.py +413 -0
  28. roampal/backend/modules/memory/tests/unit/test_unified_memory_system.py +418 -0
  29. roampal/backend/modules/memory/unified_memory_system.py +1277 -0
  30. roampal/cli.py +638 -0
  31. roampal/hooks/__init__.py +16 -0
  32. roampal/hooks/session_manager.py +587 -0
  33. roampal/hooks/stop_hook.py +176 -0
  34. roampal/hooks/user_prompt_submit_hook.py +103 -0
  35. roampal/mcp/__init__.py +7 -0
  36. roampal/mcp/server.py +611 -0
  37. roampal/server/__init__.py +7 -0
  38. roampal/server/main.py +744 -0
  39. roampal-0.1.4.dist-info/METADATA +179 -0
  40. roampal-0.1.4.dist-info/RECORD +44 -0
  41. roampal-0.1.4.dist-info/WHEEL +5 -0
  42. roampal-0.1.4.dist-info/entry_points.txt +2 -0
  43. roampal-0.1.4.dist-info/licenses/LICENSE +190 -0
  44. roampal-0.1.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,623 @@
1
import logging
import os
import shutil
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

import chromadb
from chromadb.config import Settings as ChromaSettings

# BM25 for hybrid search (v2.1 Enhanced Retrieval). Optional dependency:
# when rank-bm25/nltk are missing we degrade gracefully to vector-only search.
try:
    from rank_bm25 import BM25Okapi
    import nltk
    # Download punkt tokenizer silently if needed
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)
    BM25_AVAILABLE = True
except ImportError:
    BM25_AVAILABLE = False
    logger = logging.getLogger(__name__)
    logger.warning("BM25 not available (pip install rank-bm25 nltk)")

# Add the backend directory to sys.path if not already there, so absolute
# imports of sibling backend modules resolve when run outside the package.
backend_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
if backend_dir not in sys.path:
    sys.path.insert(0, backend_dir)

# NOTE(review): EmbeddingService is not referenced in this module's visible
# code - presumably kept as a re-export for callers; confirm before removing.
from .embedding_service import EmbeddingService  # noqa: E402
33
# Simple collection naming for single user
def get_loopsmith_collection():
    """Return the fixed collection name used in single-user mode."""
    return "loopsmith_memories"
36
+
37
def get_user_chromadb_collection(user_id: str, shard_id: str = "default") -> str:
    """Build the per-user, per-shard ChromaDB collection name (multi-user support)."""
    return "user_{}_{}_memories".format(user_id, shard_id)
40
+
41
# Module-level logger (also assigned inside the BM25 ImportError branch above,
# so the warning there can be emitted before this line runs).
logger = logging.getLogger(__name__)

# Fallback collection name when neither a user id nor an explicit name is given.
DEFAULT_COLLECTION_NAME = "loopsmith_memories"
44
+
45
class ChromaDBAdapter:
    """
    Concrete implementation of the VectorDBInterface using ChromaDB.
    Handles persistent local vector storage and retrieval for Roampal memories.
    Supports collection-specific persistence directories.
    """

    def __init__(
        self,
        persistence_directory: Optional[str] = None,
        use_server: Optional[bool] = None,  # None = auto-detect
        user_id: Optional[str] = None,
        # Backwards compatibility with old API
        collection_name: Optional[str] = None,
        persist_directory: Optional[str] = None
    ):
        """Configure the adapter; no ChromaDB connection is made until initialize().

        Args:
            persistence_directory: Directory for embedded-mode storage.
            use_server: True to talk to a ChromaDB server; None reads the
                ROAMPAL_USE_SERVER environment variable.
            user_id: Optional user context for collection isolation.
            collection_name: Collection to open on (auto-)initialize.
            persist_directory: Old-API alias for persistence_directory.
        """
        # Handle old API: ChromaDBAdapter(collection_name=..., persist_directory=...)
        if persist_directory is not None:
            persistence_directory = persist_directory
        if persistence_directory is None:
            persistence_directory = "./chromadb"  # Default fallback

        self.db_path = str(persistence_directory)  # Keep for compatibility but not used in server mode

        # Auto-detect server mode: use local embedded mode for benchmarks/tests
        if use_server is None:
            use_server = os.environ.get("ROAMPAL_USE_SERVER", "").lower() == "true"
        self.use_server = use_server
        self.client = None
        self.collection: Optional[chromadb.Collection] = None
        self.collection_name: Optional[str] = collection_name  # Store for auto-init
        # Re-entrancy guard for _ensure_initialized (a plain bool, not a real lock).
        self._auto_init_lock = False
        self._current_path = None
        self.user_id = user_id  # Add user context
        self._pending_collection_name = collection_name  # For backwards compat auto-init

        # BM25 index state for hybrid search (v2.1); built lazily, invalidated on upsert.
        self.bm25_index = None
        self.bm25_docs = []
        self.bm25_ids = []
        self.bm25_metadatas = []
        self._bm25_needs_rebuild = True

        # Only create local dirs if not using server
        if not self.use_server:
            os.makedirs(self.db_path, exist_ok=True)
            # Disabled automatic cleanup - use cleanup_chromadb.py utility instead
            # self._clean_old_folders()  # Can cause lock issues on Windows
93
+
94
+ def _clean_old_folders(self):
95
+ """Delete old UUID folders in the vector store directory with retry on lock."""
96
+ # Cleanup debug removed
97
+ logger.info(f"ChromaDB cleanup called for: {self.db_path}")
98
+
99
+ for entry in os.scandir(self.db_path):
100
+ if entry.is_dir() and len(entry.name) == 36 and entry.name.count('-') == 4: # UUID pattern
101
+ folder_path = Path(self.db_path) / entry.name
102
+
103
+ for attempt in range(3): # Retry 3 times
104
+ try:
105
+ shutil.rmtree(folder_path)
106
+ logger.info(f"Deleted old folder: {entry.name}")
107
+ break
108
+ except PermissionError as e:
109
+ if attempt < 2: # Wait and retry
110
+ time.sleep(1)
111
+ continue
112
+ logger.warning(f"Failed to delete old folder {entry.name} after retries: {e}")
113
+ except Exception as e:
114
+ logger.warning(f"Failed to delete old folder {entry.name}: {e}")
115
+ break
116
+
117
    async def initialize(
        self,
        collection_name: str = DEFAULT_COLLECTION_NAME,
        fragment_id: Optional[str] = None,
        embedding_model_name: Optional[str] = None,
        user_id: Optional[str] = None
    ):
        """Create/connect the ChromaDB client and open the target collection.

        Args:
            collection_name: Collection to open; overridden when user_id is set.
            fragment_id: Used as the shard id for user-isolated collections.
            embedding_model_name: Accepted for interface compatibility; not
                referenced in this method (embeddings are supplied manually).
            user_id: When set, switches to a per-user collection name.
        """
        if self.client is None:
            if self.use_server:
                # Connect to ChromaDB server
                self.client = chromadb.HttpClient(
                    host="localhost",
                    port=8003,
                    settings=ChromaSettings(anonymized_telemetry=False)
                )
                logger.info(f"ChromaDB client connected to server at localhost:8003")
            else:
                # Use local embedded mode (for testing only)
                self.client = chromadb.PersistentClient(path=self.db_path)
                logger.info(f"ChromaDB client initialized for local path: {self.db_path}")

        # Use user-isolated collection if user_id provided
        if user_id:
            self.user_id = user_id
            # Create user-specific collection name
            shard_id = fragment_id or "roampal"
            collection_name = get_user_chromadb_collection(user_id, shard_id)
            logger.info(f"Using user-isolated collection: {collection_name}")

        # Store collection name for reference
        self.collection_name = collection_name

        # Use get_or_create to reuse existing collection
        # Don't use ChromaDB's default embedding function - Roampal provides embeddings manually
        # This prevents dimension mismatch (ChromaDB default is 384d, Roampal uses 768d)
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=None  # Manual embeddings from EmbeddingService
        )
        logger.info(f"ChromaDB collection '{collection_name}' ready (lazy loaded)")

        # No need to force file creation in server mode
        if not self.use_server and self.collection.count() == 0:
            logger.info(f"Empty collection '{collection_name}' initialized in embedded mode")
161
+
162
+ async def _ensure_initialized(self):
163
+ if self.collection is None and not self._auto_init_lock:
164
+ self._auto_init_lock = True
165
+ logger.warning("ChromaDBAdapter auto-initializing collection on demand (explicit .initialize() was not called).")
166
+ await self.initialize(collection_name=self.collection_name or DEFAULT_COLLECTION_NAME)
167
+ self._auto_init_lock = False
168
+
169
+ async def upsert_vectors(
170
+ self,
171
+ ids: List[str],
172
+ vectors: List[List[float]],
173
+ metadatas: List[Dict[str, Any]]
174
+ ):
175
+ await self._ensure_initialized()
176
+ if not (len(ids) == len(vectors) == len(metadatas)):
177
+ error_msg = (
178
+ f"Length of ids ({len(ids)}), "
179
+ f"vectors ({len(vectors)}), and "
180
+ f"metadatas ({len(metadatas)}) must be the same."
181
+ )
182
+ logger.error(error_msg)
183
+ raise ValueError(error_msg)
184
+
185
+ logger.info(f"Upserting {len(ids)} vectors into collection '{self.collection.name}'...")
186
+ try:
187
+ # Extract documents from metadata for ChromaDB persistence
188
+ documents = []
189
+ for metadata in metadatas:
190
+ # Use the content or text field as the document
191
+ doc = metadata.get('content', metadata.get('text', metadata.get('original_text', '')))
192
+ documents.append(str(doc))
193
+
194
+ self.collection.upsert(
195
+ ids=ids,
196
+ embeddings=vectors,
197
+ metadatas=metadatas,
198
+ documents=documents # ChromaDB needs documents to persist properly
199
+ )
200
+
201
+ # Mark BM25 index as needing rebuild (v2.1 hybrid search)
202
+ self._bm25_needs_rebuild = True
203
+
204
+ # ChromaDB now handles persistence automatically in both modes
205
+ # The reconnection workaround has been removed as of 2024-09-17
206
+ # Data is persisted on write with proper transaction handling
207
+
208
+ logger.info(f"Successfully upserted {len(ids)} vectors.")
209
+ except Exception as e:
210
+ logger.error(f"Failed to upsert vectors into ChromaDB: {e}", exc_info=True)
211
+ raise
212
+
213
+ async def query_vectors(self, query_vector: List[float], top_k: int = 5, filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
214
+ """Query vectors with comprehensive error handling."""
215
+ try:
216
+ # Check if collection is empty first
217
+ await self._ensure_initialized()
218
+
219
+ # v0.2.4: Refresh collection to see changes from other processes (e.g., UI uploads)
220
+ # ChromaDB's PersistentClient caches collection state; re-fetching syncs with disk
221
+ if self.client and self.collection_name:
222
+ self.collection = self.client.get_or_create_collection(
223
+ name=self.collection_name,
224
+ metadata={"hnsw:space": "l2"}
225
+ )
226
+
227
+ if self.collection and self.collection.count() == 0:
228
+ logger.debug(f"[ChromaDB] Collection '{self.collection_name}' is empty, returning empty results")
229
+ return []
230
+
231
+ # Validate query vector
232
+ if not query_vector or not isinstance(query_vector, list):
233
+ logger.warning(f"[ChromaDB] Invalid query vector: {type(query_vector)}")
234
+ return []
235
+
236
+ # Check for None values in query vector
237
+ if any(v is None for v in query_vector):
238
+ logger.warning("[ChromaDB] Query vector contains None values")
239
+ return []
240
+
241
+ # Ensure query vector is numeric
242
+ try:
243
+ query_vector = [float(v) for v in query_vector]
244
+ except (ValueError, TypeError) as e:
245
+ logger.warning(f"[ChromaDB] Failed to convert query vector to floats: {e}")
246
+ return []
247
+
248
+ logger.info(f"Querying for top {top_k} vectors in collection '{self.collection_name}'...")
249
+
250
+ # Perform query with error handling
251
+ try:
252
+ results = self.collection.query(
253
+ query_embeddings=[query_vector],
254
+ n_results=top_k,
255
+ where=filters
256
+ )
257
+ except Exception as e:
258
+ logger.error(f"[ChromaDB] Query failed: {e}")
259
+ return []
260
+
261
+ # Process results with comprehensive error handling
262
+ processed_results = []
263
+
264
+ try:
265
+ # Extract data from results
266
+ ids = results.get('ids', [[]])[0] if results.get('ids') else []
267
+ embeddings = results.get('embeddings', [[]])[0] if results.get('embeddings') else []
268
+ documents = results.get('documents', [[]])[0] if results.get('documents') else []
269
+ metadatas = results.get('metadatas', [[]])[0] if results.get('metadatas') else []
270
+ distances = results.get('distances', [[]])[0] if results.get('distances') else []
271
+
272
+ # Process each result
273
+ for i in range(len(ids)):
274
+ try:
275
+ # Safe data extraction
276
+ result_id = ids[i] if i < len(ids) else f"unknown_{i}"
277
+ result_embedding = embeddings[i] if i < len(embeddings) else []
278
+ result_document = documents[i] if i < len(documents) else ""
279
+ result_metadata = metadatas[i] if i < len(metadatas) else {}
280
+ result_distance = distances[i] if i < len(distances) else 2.0
281
+
282
+ # Validate embedding
283
+ if result_embedding is None:
284
+ logger.warning(f"[ChromaDB] Unexpected embeddings type: {type(result_embedding)}")
285
+ continue
286
+
287
+ # Create safe result object
288
+ result = {
289
+ 'id': str(result_id),
290
+ 'text': str(result_document) if result_document else "",
291
+ 'metadata': result_metadata if isinstance(result_metadata, dict) else {},
292
+ 'distance': float(result_distance) if result_distance is not None else 2.0,
293
+ 'embedding': result_embedding if isinstance(result_embedding, list) else []
294
+ }
295
+
296
+ processed_results.append(result)
297
+
298
+ except Exception as e:
299
+ logger.warning(f"[ChromaDB] Error processing result {i}: {e}")
300
+ continue
301
+
302
+ except Exception as e:
303
+ logger.error(f"[ChromaDB] Error processing query results: {e}")
304
+ return []
305
+
306
+ logger.info(f"Query returned {len(processed_results)} results.")
307
+ return processed_results
308
+
309
+ except Exception as e:
310
+ logger.error(f"[ChromaDB] Critical error in query_vectors: {e}")
311
+ return []
312
+
313
+ async def _build_bm25_index(self):
314
+ """Build BM25 index from all documents (v2.1 Hybrid Search)"""
315
+ if not BM25_AVAILABLE:
316
+ return
317
+
318
+ await self._ensure_initialized()
319
+
320
+ if self.collection.count() == 0:
321
+ logger.debug("[BM25] Collection empty, skipping index build")
322
+ return
323
+
324
+ try:
325
+ # Get all documents
326
+ all_data = self.collection.get(include=["documents", "metadatas"])
327
+ self.bm25_ids = all_data.get("ids", [])
328
+ self.bm25_docs = all_data.get("documents", [])
329
+ self.bm25_metadatas = all_data.get("metadatas", [])
330
+
331
+ # Tokenize documents for BM25
332
+ tokenized_docs = [doc.lower().split() for doc in self.bm25_docs]
333
+
334
+ # Build BM25 index
335
+ self.bm25_index = BM25Okapi(tokenized_docs)
336
+ self._bm25_needs_rebuild = False
337
+
338
+ logger.debug(f"[BM25] Index built with {len(self.bm25_docs)} documents")
339
+ except Exception as e:
340
+ logger.warning(f"[BM25] Index build failed: {e}")
341
+ self.bm25_index = None
342
+
343
+ async def hybrid_query(
344
+ self,
345
+ query_vector: List[float],
346
+ query_text: str,
347
+ top_k: int = 5,
348
+ filters: Optional[Dict] = None
349
+ ) -> List[Dict[str, Any]]:
350
+ """
351
+ Hybrid search combining vector (semantic) + BM25 (lexical) with RRF fusion.
352
+ Based on industry best practices (Elastic, Weaviate, Microsoft Azure, 2025).
353
+
354
+ Args:
355
+ query_vector: Dense embedding for semantic search
356
+ query_text: Text query for BM25 lexical search
357
+ top_k: Number of results to return
358
+ filters: Optional metadata filters
359
+
360
+ Returns:
361
+ Fused results ranked by Reciprocal Rank Fusion (RRF)
362
+ """
363
+ # 1. Vector search (semantic similarity)
364
+ vector_results = await self.query_vectors(query_vector, top_k=top_k*2, filters=filters)
365
+
366
+ # If BM25 not available, fall back to pure vector search
367
+ if not BM25_AVAILABLE or not self.bm25_index:
368
+ return vector_results[:top_k]
369
+
370
+ try:
371
+ # 2. Rebuild BM25 index if needed
372
+ if self._bm25_needs_rebuild:
373
+ await self._build_bm25_index()
374
+
375
+ if not self.bm25_index:
376
+ # BM25 build failed, use vector only
377
+ return vector_results[:top_k]
378
+
379
+ # 3. BM25 search (lexical matching)
380
+ tokenized_query = query_text.lower().split()
381
+ bm25_scores = self.bm25_index.get_scores(tokenized_query)
382
+
383
+ # Get top BM25 results
384
+ top_bm25_indices = sorted(
385
+ range(len(bm25_scores)),
386
+ key=lambda i: bm25_scores[i],
387
+ reverse=True
388
+ )[:top_k*2]
389
+
390
+ bm25_results = []
391
+ for idx in top_bm25_indices:
392
+ if idx < len(self.bm25_ids):
393
+ bm25_results.append({
394
+ "id": self.bm25_ids[idx],
395
+ "text": self.bm25_docs[idx],
396
+ "metadata": self.bm25_metadatas[idx] if idx < len(self.bm25_metadatas) else {},
397
+ "bm25_score": float(bm25_scores[idx]),
398
+ "distance": max(0.0, 1.0 - (bm25_scores[idx] / 100.0)) # Normalize to distance
399
+ })
400
+
401
+ # 4. Reciprocal Rank Fusion (RRF) with k=60 (research-backed constant)
402
+ rrf_scores = {}
403
+
404
+ # Add vector search rankings
405
+ for rank, result in enumerate(vector_results):
406
+ doc_id = result["id"]
407
+ rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1.0 / (rank + 60)
408
+
409
+ # Add BM25 rankings
410
+ for rank, result in enumerate(bm25_results):
411
+ doc_id = result["id"]
412
+ rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1.0 / (rank + 60)
413
+
414
+ # 5. Merge results and add RRF scores
415
+ merged = {}
416
+ for r in vector_results + bm25_results:
417
+ doc_id = r["id"]
418
+ if doc_id not in merged:
419
+ merged[doc_id] = r
420
+ merged[doc_id]["rrf_score"] = rrf_scores.get(doc_id, 0.0)
421
+ # Keep original distance, but add RRF for ranking
422
+
423
+ # 6. Sort by RRF score and return top-k
424
+ final_results = sorted(
425
+ merged.values(),
426
+ key=lambda x: x.get("rrf_score", 0.0),
427
+ reverse=True
428
+ )[:top_k]
429
+
430
+ logger.debug(f"[HYBRID] Merged {len(vector_results)} vector + {len(bm25_results)} BM25 → {len(final_results)} results")
431
+ return final_results
432
+
433
+ except Exception as e:
434
+ logger.warning(f"[HYBRID] Hybrid search failed, falling back to vector only: {e}")
435
+ return vector_results[:top_k]
436
+
437
+ async def get_collection_count(self) -> int:
438
+ """Get the total number of items in the collection"""
439
+ await self._ensure_initialized()
440
+ try:
441
+ count = self.collection.count()
442
+ logger.debug(f"Collection '{self.collection_name}' contains {count} items")
443
+ return count
444
+ except Exception as e:
445
+ logger.error(f"Error getting collection count: {e}")
446
+ return 0
447
+
448
+ async def get_vectors_by_ids(
449
+ self,
450
+ ids: List[str]
451
+ ) -> Dict[str, Any]:
452
+ await self._ensure_initialized()
453
+ try:
454
+ result = self.collection.get(ids=ids, include=["documents", "embeddings", "metadatas"])
455
+ return result
456
+ except Exception as e:
457
+ logger.error(f"Failed to get vectors by ids: {e}", exc_info=True)
458
+ return {}
459
+
460
+ def list_all_ids(self) -> List[str]:
461
+ if self.collection is None:
462
+ raise RuntimeError("ChromaDB collection not initialized")
463
+ result = self.collection.get(include=[])
464
+ return result.get('ids', [])
465
+
466
+ def delete_vectors(self, ids: List[str]):
467
+ if self.collection is None:
468
+ raise RuntimeError("ChromaDB collection not initialized")
469
+ self.collection.delete(ids=ids)
470
+
471
+ def get_all_vectors(self) -> List[Dict[str, Any]]:
472
+ if self.collection is None:
473
+ raise RuntimeError("ChromaDB collection not initialized")
474
+ results = self.collection.get(include=["embeddings", "metadatas"])
475
+ ids = results.get("ids", [])
476
+ vectors = results.get("embeddings", [])
477
+ metadatas = results.get("metadatas", [])
478
+ out = []
479
+ for i in range(len(ids)):
480
+ vector = None
481
+ if isinstance(vectors, (list, tuple)) and len(vectors) > i:
482
+ vector = vectors[i]
483
+ elif hasattr(vectors, '__len__') and hasattr(vectors, '__getitem__') and len(vectors) > i:
484
+ vector = vectors[i]
485
+ metadata = metadatas[i] if isinstance(metadatas, (list, tuple)) and len(metadatas) > i else {}
486
+ out.append({
487
+ "id": ids[i],
488
+ "vector": vector,
489
+ "metadata": metadata,
490
+ })
491
+ return out
492
+
493
+ def get_fragment(self, fragment_id: str) -> Optional[Dict[str, Any]]:
494
+ if self.collection is None:
495
+ raise RuntimeError("ChromaDB collection not initialized. Cannot get fragment.")
496
+ result = self.collection.get(ids=[fragment_id], include=["embeddings", "metadatas", "documents"])
497
+ if not result or not result.get("ids"):
498
+ return None
499
+ embeddings = result.get("embeddings", [])
500
+ vector = None
501
+ if isinstance(embeddings, (list, tuple)) and len(embeddings) > 0:
502
+ vector = embeddings[0]
503
+ elif hasattr(embeddings, '__len__') and hasattr(embeddings, '__getitem__') and len(embeddings) > 0:
504
+ vector = embeddings[0]
505
+ metadatas = result.get("metadatas", [])
506
+ metadata = metadatas[0] if isinstance(metadatas, (list, tuple)) and len(metadatas) > 0 else {}
507
+ documents = result.get("documents", [])
508
+ content = documents[0] if isinstance(documents, (list, tuple)) and len(documents) > 0 else ""
509
+ return {
510
+ "id": result["ids"][0],
511
+ "vector": vector,
512
+ "metadata": metadata,
513
+ "content": content,
514
+ }
515
+
516
+ def update_fragment_metadata(self, fragment_id: str, metadata_updates: Dict[str, Any]):
517
+ if self.collection is None:
518
+ raise RuntimeError("ChromaDB collection not initialized")
519
+ frag = self.get_fragment(fragment_id)
520
+ if not frag:
521
+ logger.warning(f"update_fragment_metadata: No fragment with id={fragment_id}")
522
+ return
523
+ if frag.get("vector") is None:
524
+ logger.warning(
525
+ f"Skipping metadata update for fragment {fragment_id} "
526
+ "because it has no associated vector."
527
+ )
528
+ return
529
+ metadata = frag.get("metadata", {}) or {}
530
+ metadata.update(metadata_updates)
531
+ self.collection.upsert(
532
+ ids=[fragment_id],
533
+ embeddings=[frag.get("vector")],
534
+ metadatas=[metadata]
535
+ )
536
+ logger.info(f"Fragment {fragment_id} metadata updated with {metadata_updates}")
537
+
538
+ def update_fragment_score(self, fragment_id: str, new_score: float):
539
+ if self.collection is None:
540
+ raise RuntimeError("ChromaDB collection not initialized")
541
+ frag = self.get_fragment(fragment_id)
542
+ if not frag:
543
+ logger.warning(f"update_fragment_score: No fragment with id={fragment_id}")
544
+ return
545
+ if frag.get("vector") is None:
546
+ logger.warning(
547
+ f"Skipping score update for fragment {fragment_id} "
548
+ "because it has no associated vector."
549
+ )
550
+ return
551
+ metadata = frag.get("metadata", {}) or {}
552
+ metadata["composite_score"] = new_score
553
+ self.collection.upsert(
554
+ ids=[fragment_id],
555
+ embeddings=[frag.get("vector")],
556
+ metadatas=[metadata]
557
+ )
558
+ logger.info(f"Fragment {fragment_id} composite_score updated to {new_score}")
559
+
560
    async def update_metadata(self, doc_id: str, metadata: Dict[str, Any]):
        """Update metadata for existing document without re-embedding.

        Used by deduplication system to increment counters (e.g., mentioned_count)
        or update quality metrics without regenerating embeddings.

        NOTE(review): the get_fragment() call below serves only as an
        existence / has-vector guard; the actual write goes through
        collection.update(), which does not take the embedding at all.
        Presumably collection.update() replaces the stored metadata dict for
        the id rather than merging into it, so callers should pass the
        complete metadata - verify against the chromadb docs.
        """
        await self._ensure_initialized()
        if self.collection is None:
            raise RuntimeError("ChromaDB collection not initialized")

        try:
            # Get existing document to preserve embedding
            frag = self.get_fragment(doc_id)
            if not frag:
                logger.warning(f"update_metadata: No document with id={doc_id}")
                return

            if frag.get("vector") is None:
                logger.warning(
                    f"Skipping metadata update for {doc_id} "
                    "because it has no associated vector."
                )
                return

            # Update with new metadata while preserving embedding
            self.collection.update(
                ids=[doc_id],
                metadatas=[metadata]
            )
            logger.info(f"Metadata updated for document {doc_id}")

        except Exception as e:
            logger.error(f"Failed to update metadata for {doc_id}: {e}")
            raise
594
+
595
+ async def cleanup(self):
596
+ """Gracefully cleanup ChromaDB connections"""
597
+ try:
598
+ if self.collection:
599
+ # Persist any pending writes
600
+ if hasattr(self.collection, 'persist'):
601
+ self.collection.persist()
602
+ self.collection = None
603
+
604
+ if self.client:
605
+ # Close the client connection
606
+ if hasattr(self.client, 'close'):
607
+ self.client.close()
608
+ self.client = None
609
+
610
+ logger.info(f"ChromaDB adapter cleaned up for {self.collection_name}")
611
+ except Exception as e:
612
+ logger.warning(f"Error during ChromaDB cleanup: {e}")
613
+
614
+ def __del__(self):
615
+ """Cleanup on deletion"""
616
+ try:
617
+ if self.client or self.collection:
618
+ import asyncio
619
+ loop = asyncio.new_event_loop()
620
+ loop.run_until_complete(self.cleanup())
621
+ loop.close()
622
+ except:
623
+ pass # Ignore errors in destructor