roampal-0.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- roampal/__init__.py +29 -0
- roampal/__main__.py +6 -0
- roampal/backend/__init__.py +1 -0
- roampal/backend/modules/__init__.py +1 -0
- roampal/backend/modules/memory/__init__.py +43 -0
- roampal/backend/modules/memory/chromadb_adapter.py +623 -0
- roampal/backend/modules/memory/config.py +102 -0
- roampal/backend/modules/memory/content_graph.py +543 -0
- roampal/backend/modules/memory/context_service.py +455 -0
- roampal/backend/modules/memory/embedding_service.py +96 -0
- roampal/backend/modules/memory/knowledge_graph_service.py +1052 -0
- roampal/backend/modules/memory/memory_bank_service.py +433 -0
- roampal/backend/modules/memory/memory_types.py +296 -0
- roampal/backend/modules/memory/outcome_service.py +400 -0
- roampal/backend/modules/memory/promotion_service.py +473 -0
- roampal/backend/modules/memory/routing_service.py +444 -0
- roampal/backend/modules/memory/scoring_service.py +324 -0
- roampal/backend/modules/memory/search_service.py +646 -0
- roampal/backend/modules/memory/tests/__init__.py +1 -0
- roampal/backend/modules/memory/tests/conftest.py +12 -0
- roampal/backend/modules/memory/tests/unit/__init__.py +1 -0
- roampal/backend/modules/memory/tests/unit/conftest.py +7 -0
- roampal/backend/modules/memory/tests/unit/test_knowledge_graph_service.py +517 -0
- roampal/backend/modules/memory/tests/unit/test_memory_bank_service.py +504 -0
- roampal/backend/modules/memory/tests/unit/test_outcome_service.py +485 -0
- roampal/backend/modules/memory/tests/unit/test_scoring_service.py +255 -0
- roampal/backend/modules/memory/tests/unit/test_search_service.py +413 -0
- roampal/backend/modules/memory/tests/unit/test_unified_memory_system.py +418 -0
- roampal/backend/modules/memory/unified_memory_system.py +1277 -0
- roampal/cli.py +638 -0
- roampal/hooks/__init__.py +16 -0
- roampal/hooks/session_manager.py +587 -0
- roampal/hooks/stop_hook.py +176 -0
- roampal/hooks/user_prompt_submit_hook.py +103 -0
- roampal/mcp/__init__.py +7 -0
- roampal/mcp/server.py +611 -0
- roampal/server/__init__.py +7 -0
- roampal/server/main.py +744 -0
- roampal-0.1.4.dist-info/METADATA +179 -0
- roampal-0.1.4.dist-info/RECORD +44 -0
- roampal-0.1.4.dist-info/WHEEL +5 -0
- roampal-0.1.4.dist-info/entry_points.txt +2 -0
- roampal-0.1.4.dist-info/licenses/LICENSE +190 -0
- roampal-0.1.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,623 @@
import logging
from typing import List, Dict, Any, Optional
import chromadb
import sys
import os
from chromadb.config import Settings as ChromaSettings

# BM25 for hybrid search (v2.1 Enhanced Retrieval)
try:
    from rank_bm25 import BM25Okapi
    import nltk
    # Download punkt tokenizer silently if needed
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)
    BM25_AVAILABLE = True
except ImportError:
    BM25_AVAILABLE = False
    logger = logging.getLogger(__name__)
    logger.warning("BM25 not available (pip install rank-bm25 nltk)")

# Add the backend directory to sys.path if not already there
backend_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
if backend_dir not in sys.path:
    sys.path.insert(0, backend_dir)

from .embedding_service import EmbeddingService
import shutil
from pathlib import Path
import time

# Simple collection naming for single user
def get_loopsmith_collection():
    return "loopsmith_memories"

def get_user_chromadb_collection(user_id: str, shard_id: str = "default") -> str:
    """Generate user-specific collection name for multi-user support"""
    return f"user_{user_id}_{shard_id}_memories"

logger = logging.getLogger(__name__)

DEFAULT_COLLECTION_NAME = "loopsmith_memories"

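# A minimal sketch of the rank_bm25 API relied on below (corpus and query are
# illustrative): BM25Okapi is built from a tokenized corpus, and get_scores()
# returns one lexical-relevance score per corpus document.
#
#   corpus = [["hybrid", "search"], ["vector", "store"]]
#   index = BM25Okapi(corpus)
#   scores = index.get_scores(["hybrid", "search"])  # one float per document
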
class ChromaDBAdapter:
    """
    Concrete implementation of the VectorDBInterface using ChromaDB.
    Handles persistent local vector storage and retrieval for Roampal memories.
    Supports collection-specific persistence directories.
    """

    def __init__(
        self,
        persistence_directory: str = None,
        use_server: bool = None,  # None = auto-detect
        user_id: Optional[str] = None,
        # Backwards compatibility with old API
        collection_name: str = None,
        persist_directory: str = None
    ):
        # Handle old API: ChromaDBAdapter(collection_name=..., persist_directory=...)
        if persist_directory is not None:
            persistence_directory = persist_directory
        if persistence_directory is None:
            persistence_directory = "./chromadb"  # Default fallback

        self.db_path = str(persistence_directory)  # Keep for compatibility but not used in server mode

        # Auto-detect server mode: use local embedded mode for benchmarks/tests
        if use_server is None:
            use_server = os.environ.get("ROAMPAL_USE_SERVER", "").lower() == "true"
        self.use_server = use_server
        self.client = None
        self.collection: Optional[chromadb.Collection] = None
        self.collection_name: Optional[str] = collection_name  # Store for auto-init
        self._auto_init_lock = False
        self._current_path = None
        self.user_id = user_id  # Add user context
        self._pending_collection_name = collection_name  # For backwards-compat auto-init

        # BM25 index for hybrid search (v2.1)
        self.bm25_index = None
        self.bm25_docs = []
        self.bm25_ids = []
        self.bm25_metadatas = []
        self._bm25_needs_rebuild = True

        # Only create local dirs if not using server
        if not self.use_server:
            os.makedirs(self.db_path, exist_ok=True)
        # Disabled automatic cleanup - use the cleanup_chromadb.py utility instead
        # self._clean_old_folders()  # Can cause lock issues on Windows

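    # A minimal usage sketch, assuming embedded (non-server) mode; the path
    # and collection name here are illustrative, not a prescribed setup:
    #
    #   adapter = ChromaDBAdapter(persist_directory="./chromadb")
    #   await adapter.initialize(collection_name="loopsmith_memories")
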
    def _clean_old_folders(self):
        """Delete old UUID folders in the vector store directory with retry on lock."""
        logger.info(f"ChromaDB cleanup called for: {self.db_path}")

        for entry in os.scandir(self.db_path):
            if entry.is_dir() and len(entry.name) == 36 and entry.name.count('-') == 4:  # UUID pattern
                folder_path = Path(self.db_path) / entry.name

                for attempt in range(3):  # Retry 3 times
                    try:
                        shutil.rmtree(folder_path)
                        logger.info(f"Deleted old folder: {entry.name}")
                        break
                    except PermissionError as e:
                        if attempt < 2:  # Wait and retry
                            time.sleep(1)
                            continue
                        logger.warning(f"Failed to delete old folder {entry.name} after retries: {e}")
                    except Exception as e:
                        logger.warning(f"Failed to delete old folder {entry.name}: {e}")
                        break

    async def initialize(
        self,
        collection_name: str = DEFAULT_COLLECTION_NAME,
        fragment_id: Optional[str] = None,
        embedding_model_name: Optional[str] = None,
        user_id: Optional[str] = None
    ):
        if self.client is None:
            if self.use_server:
                # Connect to ChromaDB server
                self.client = chromadb.HttpClient(
                    host="localhost",
                    port=8003,
                    settings=ChromaSettings(anonymized_telemetry=False)
                )
                logger.info("ChromaDB client connected to server at localhost:8003")
            else:
                # Use local embedded mode (for testing only)
                self.client = chromadb.PersistentClient(path=self.db_path)
                logger.info(f"ChromaDB client initialized for local path: {self.db_path}")

        # Use user-isolated collection if user_id provided
        if user_id:
            self.user_id = user_id
            # Create user-specific collection name
            shard_id = fragment_id or "roampal"
            collection_name = get_user_chromadb_collection(user_id, shard_id)
            logger.info(f"Using user-isolated collection: {collection_name}")

        # Store collection name for reference
        self.collection_name = collection_name

        # Use get_or_create to reuse an existing collection.
        # Don't use ChromaDB's default embedding function - Roampal provides embeddings manually.
        # This prevents dimension mismatch (ChromaDB default is 384d, Roampal uses 768d).
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=None  # Manual embeddings from EmbeddingService
        )
        logger.info(f"ChromaDB collection '{collection_name}' ready (lazy loaded)")

        # No need to force file creation in server mode
        if not self.use_server and self.collection.count() == 0:
            logger.info(f"Empty collection '{collection_name}' initialized in embedded mode")

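    # Because the collection is created with embedding_function=None, callers
    # own the embedding step and every write must use one consistent
    # dimensionality. A sketch of the invariant (the embed() call is
    # hypothetical, standing in for EmbeddingService):
    #
    #   vec = embedding_service.embed("some memory text")
    #   assert len(vec) == 768  # mixing 384d and 768d vectors would fail
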
    async def _ensure_initialized(self):
        if self.collection is None and not self._auto_init_lock:
            self._auto_init_lock = True
            logger.warning("ChromaDBAdapter auto-initializing collection on demand (explicit .initialize() was not called).")
            await self.initialize(collection_name=self.collection_name or DEFAULT_COLLECTION_NAME)
            self._auto_init_lock = False

    async def upsert_vectors(
        self,
        ids: List[str],
        vectors: List[List[float]],
        metadatas: List[Dict[str, Any]]
    ):
        await self._ensure_initialized()
        if not (len(ids) == len(vectors) == len(metadatas)):
            error_msg = (
                f"Length of ids ({len(ids)}), "
                f"vectors ({len(vectors)}), and "
                f"metadatas ({len(metadatas)}) must be the same."
            )
            logger.error(error_msg)
            raise ValueError(error_msg)

        logger.info(f"Upserting {len(ids)} vectors into collection '{self.collection.name}'...")
        try:
            # Extract documents from metadata for ChromaDB persistence
            documents = []
            for metadata in metadatas:
                # Use the content or text field as the document
                doc = metadata.get('content', metadata.get('text', metadata.get('original_text', '')))
                documents.append(str(doc))

            self.collection.upsert(
                ids=ids,
                embeddings=vectors,
                metadatas=metadatas,
                documents=documents  # ChromaDB needs documents to persist properly
            )

            # Mark BM25 index as needing rebuild (v2.1 hybrid search)
            self._bm25_needs_rebuild = True

            # ChromaDB now handles persistence automatically in both modes.
            # The reconnection workaround has been removed as of 2024-09-17.
            # Data is persisted on write with proper transaction handling.

            logger.info(f"Successfully upserted {len(ids)} vectors.")
        except Exception as e:
            logger.error(f"Failed to upsert vectors into ChromaDB: {e}", exc_info=True)
            raise

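    # Upsert sketch (ids, vectors, and metadata values are illustrative): the
    # three lists must be parallel, and the "content" field doubles as the
    # stored document:
    #
    #   await adapter.upsert_vectors(
    #       ids=["mem-1"],
    #       vectors=[[0.1] * 768],
    #       metadatas=[{"content": "note text", "composite_score": 0.5}],
    #   )
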
    async def query_vectors(self, query_vector: List[float], top_k: int = 5, filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
        """Query vectors with comprehensive error handling."""
        try:
            # Check if collection is empty first
            await self._ensure_initialized()

            # v0.2.4: Refresh collection to see changes from other processes (e.g., UI uploads).
            # ChromaDB's PersistentClient caches collection state; re-fetching syncs with disk.
            if self.client and self.collection_name:
                self.collection = self.client.get_or_create_collection(
                    name=self.collection_name,
                    metadata={"hnsw:space": "l2"}
                )

            if self.collection and self.collection.count() == 0:
                logger.debug(f"[ChromaDB] Collection '{self.collection_name}' is empty, returning empty results")
                return []

            # Validate query vector
            if not query_vector or not isinstance(query_vector, list):
                logger.warning(f"[ChromaDB] Invalid query vector: {type(query_vector)}")
                return []

            # Check for None values in query vector
            if any(v is None for v in query_vector):
                logger.warning("[ChromaDB] Query vector contains None values")
                return []

            # Ensure query vector is numeric
            try:
                query_vector = [float(v) for v in query_vector]
            except (ValueError, TypeError) as e:
                logger.warning(f"[ChromaDB] Failed to convert query vector to floats: {e}")
                return []

            logger.info(f"Querying for top {top_k} vectors in collection '{self.collection_name}'...")

            # Perform query with error handling
            try:
                results = self.collection.query(
                    query_embeddings=[query_vector],
                    n_results=top_k,
                    where=filters
                )
            except Exception as e:
                logger.error(f"[ChromaDB] Query failed: {e}")
                return []

            # Process results with comprehensive error handling
            processed_results = []

            try:
                # Extract data from results
                ids = results.get('ids', [[]])[0] if results.get('ids') else []
                embeddings = results.get('embeddings', [[]])[0] if results.get('embeddings') else []
                documents = results.get('documents', [[]])[0] if results.get('documents') else []
                metadatas = results.get('metadatas', [[]])[0] if results.get('metadatas') else []
                distances = results.get('distances', [[]])[0] if results.get('distances') else []

                # Process each result
                for i in range(len(ids)):
                    try:
                        # Safe data extraction
                        result_id = ids[i] if i < len(ids) else f"unknown_{i}"
                        result_embedding = embeddings[i] if i < len(embeddings) else []
                        result_document = documents[i] if i < len(documents) else ""
                        result_metadata = metadatas[i] if i < len(metadatas) else {}
                        result_distance = distances[i] if i < len(distances) else 2.0

                        # Validate embedding
                        if result_embedding is None:
                            logger.warning(f"[ChromaDB] Unexpected embeddings type: {type(result_embedding)}")
                            continue

                        # Create safe result object
                        result = {
                            'id': str(result_id),
                            'text': str(result_document) if result_document else "",
                            'metadata': result_metadata if isinstance(result_metadata, dict) else {},
                            'distance': float(result_distance) if result_distance is not None else 2.0,
                            'embedding': result_embedding if isinstance(result_embedding, list) else []
                        }

                        processed_results.append(result)

                    except Exception as e:
                        logger.warning(f"[ChromaDB] Error processing result {i}: {e}")
                        continue

            except Exception as e:
                logger.error(f"[ChromaDB] Error processing query results: {e}")
                return []

            logger.info(f"Query returned {len(processed_results)} results.")
            return processed_results

        except Exception as e:
            logger.error(f"[ChromaDB] Critical error in query_vectors: {e}")
            return []

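    # Shape of each query_vectors() result, as assembled above:
    #
    #   {"id": str, "text": str, "metadata": dict,
    #    "distance": float,   # L2 distance; 2.0 is the safe fallback
    #    "embedding": list}
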
    async def _build_bm25_index(self):
        """Build BM25 index from all documents (v2.1 Hybrid Search)"""
        if not BM25_AVAILABLE:
            return

        await self._ensure_initialized()

        if self.collection.count() == 0:
            logger.debug("[BM25] Collection empty, skipping index build")
            return

        try:
            # Get all documents
            all_data = self.collection.get(include=["documents", "metadatas"])
            self.bm25_ids = all_data.get("ids", [])
            self.bm25_docs = all_data.get("documents", [])
            self.bm25_metadatas = all_data.get("metadatas", [])

            # Tokenize documents for BM25
            tokenized_docs = [doc.lower().split() for doc in self.bm25_docs]

            # Build BM25 index
            self.bm25_index = BM25Okapi(tokenized_docs)
            self._bm25_needs_rebuild = False

            logger.debug(f"[BM25] Index built with {len(self.bm25_docs)} documents")
        except Exception as e:
            logger.warning(f"[BM25] Index build failed: {e}")
            self.bm25_index = None

    async def hybrid_query(
        self,
        query_vector: List[float],
        query_text: str,
        top_k: int = 5,
        filters: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Hybrid search combining vector (semantic) + BM25 (lexical) with RRF fusion.
        Based on industry best practices (Elastic, Weaviate, Microsoft Azure, 2025).

        Args:
            query_vector: Dense embedding for semantic search
            query_text: Text query for BM25 lexical search
            top_k: Number of results to return
            filters: Optional metadata filters

        Returns:
            Fused results ranked by Reciprocal Rank Fusion (RRF)
        """
        # 1. Vector search (semantic similarity)
        vector_results = await self.query_vectors(query_vector, top_k=top_k * 2, filters=filters)

        # If BM25 is not available, fall back to pure vector search
        if not BM25_AVAILABLE or not self.bm25_index:
            return vector_results[:top_k]

        try:
            # 2. Rebuild BM25 index if needed
            if self._bm25_needs_rebuild:
                await self._build_bm25_index()

            if not self.bm25_index:
                # BM25 build failed, use vector only
                return vector_results[:top_k]

            # 3. BM25 search (lexical matching)
            tokenized_query = query_text.lower().split()
            bm25_scores = self.bm25_index.get_scores(tokenized_query)

            # Get top BM25 results
            top_bm25_indices = sorted(
                range(len(bm25_scores)),
                key=lambda i: bm25_scores[i],
                reverse=True
            )[:top_k * 2]

            bm25_results = []
            for idx in top_bm25_indices:
                if idx < len(self.bm25_ids):
                    bm25_results.append({
                        "id": self.bm25_ids[idx],
                        "text": self.bm25_docs[idx],
                        "metadata": self.bm25_metadatas[idx] if idx < len(self.bm25_metadatas) else {},
                        "bm25_score": float(bm25_scores[idx]),
                        "distance": max(0.0, 1.0 - (bm25_scores[idx] / 100.0))  # Normalize to distance
                    })

            # 4. Reciprocal Rank Fusion (RRF) with k=60 (research-backed constant)
            rrf_scores = {}

            # Add vector search rankings
            for rank, result in enumerate(vector_results):
                doc_id = result["id"]
                rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1.0 / (rank + 60)

            # Add BM25 rankings
            for rank, result in enumerate(bm25_results):
                doc_id = result["id"]
                rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1.0 / (rank + 60)

            # 5. Merge results and add RRF scores
            merged = {}
            for r in vector_results + bm25_results:
                doc_id = r["id"]
                if doc_id not in merged:
                    merged[doc_id] = r
                merged[doc_id]["rrf_score"] = rrf_scores.get(doc_id, 0.0)
                # Keep original distance, but add RRF for ranking

            # 6. Sort by RRF score and return top-k
            final_results = sorted(
                merged.values(),
                key=lambda x: x.get("rrf_score", 0.0),
                reverse=True
            )[:top_k]

            logger.debug(f"[HYBRID] Merged {len(vector_results)} vector + {len(bm25_results)} BM25 → {len(final_results)} results")
            return final_results

        except Exception as e:
            logger.warning(f"[HYBRID] Hybrid search failed, falling back to vector only: {e}")
            return vector_results[:top_k]

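    # Worked example of the k=60 RRF constant above: a document ranked first
    # by both retrievers scores 1/60 + 1/60 ≈ 0.033, while a document ranked
    # first by only one scores 1/60 ≈ 0.017; agreement between the semantic
    # and lexical lists therefore outranks any single strong signal.
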
    async def get_collection_count(self) -> int:
        """Get the total number of items in the collection"""
        await self._ensure_initialized()
        try:
            count = self.collection.count()
            logger.debug(f"Collection '{self.collection_name}' contains {count} items")
            return count
        except Exception as e:
            logger.error(f"Error getting collection count: {e}")
            return 0

    async def get_vectors_by_ids(
        self,
        ids: List[str]
    ) -> Dict[str, Any]:
        await self._ensure_initialized()
        try:
            result = self.collection.get(ids=ids, include=["documents", "embeddings", "metadatas"])
            return result
        except Exception as e:
            logger.error(f"Failed to get vectors by ids: {e}", exc_info=True)
            return {}

    def list_all_ids(self) -> List[str]:
        if self.collection is None:
            raise RuntimeError("ChromaDB collection not initialized")
        result = self.collection.get(include=[])
        return result.get('ids', [])

    def delete_vectors(self, ids: List[str]):
        if self.collection is None:
            raise RuntimeError("ChromaDB collection not initialized")
        self.collection.delete(ids=ids)

    def get_all_vectors(self) -> List[Dict[str, Any]]:
        if self.collection is None:
            raise RuntimeError("ChromaDB collection not initialized")
        results = self.collection.get(include=["embeddings", "metadatas"])
        ids = results.get("ids", [])
        vectors = results.get("embeddings", [])
        metadatas = results.get("metadatas", [])
        out = []
        for i in range(len(ids)):
            vector = None
            # Embeddings may come back as a list or an array-like; both branches
            # cover objects that support len() and indexing
            if isinstance(vectors, (list, tuple)) and len(vectors) > i:
                vector = vectors[i]
            elif hasattr(vectors, '__len__') and hasattr(vectors, '__getitem__') and len(vectors) > i:
                vector = vectors[i]
            metadata = metadatas[i] if isinstance(metadatas, (list, tuple)) and len(metadatas) > i else {}
            out.append({
                "id": ids[i],
                "vector": vector,
                "metadata": metadata,
            })
        return out

    def get_fragment(self, fragment_id: str) -> Optional[Dict[str, Any]]:
        if self.collection is None:
            raise RuntimeError("ChromaDB collection not initialized. Cannot get fragment.")
        result = self.collection.get(ids=[fragment_id], include=["embeddings", "metadatas", "documents"])
        if not result or not result.get("ids"):
            return None
        embeddings = result.get("embeddings", [])
        vector = None
        if isinstance(embeddings, (list, tuple)) and len(embeddings) > 0:
            vector = embeddings[0]
        elif hasattr(embeddings, '__len__') and hasattr(embeddings, '__getitem__') and len(embeddings) > 0:
            vector = embeddings[0]
        metadatas = result.get("metadatas", [])
        metadata = metadatas[0] if isinstance(metadatas, (list, tuple)) and len(metadatas) > 0 else {}
        documents = result.get("documents", [])
        content = documents[0] if isinstance(documents, (list, tuple)) and len(documents) > 0 else ""
        return {
            "id": result["ids"][0],
            "vector": vector,
            "metadata": metadata,
            "content": content,
        }

    def update_fragment_metadata(self, fragment_id: str, metadata_updates: Dict[str, Any]):
        if self.collection is None:
            raise RuntimeError("ChromaDB collection not initialized")
        frag = self.get_fragment(fragment_id)
        if not frag:
            logger.warning(f"update_fragment_metadata: No fragment with id={fragment_id}")
            return
        if frag.get("vector") is None:
            logger.warning(
                f"Skipping metadata update for fragment {fragment_id} "
                "because it has no associated vector."
            )
            return
        metadata = frag.get("metadata", {}) or {}
        metadata.update(metadata_updates)
        self.collection.upsert(
            ids=[fragment_id],
            embeddings=[frag.get("vector")],
            metadatas=[metadata]
        )
        logger.info(f"Fragment {fragment_id} metadata updated with {metadata_updates}")

    def update_fragment_score(self, fragment_id: str, new_score: float):
        if self.collection is None:
            raise RuntimeError("ChromaDB collection not initialized")
        frag = self.get_fragment(fragment_id)
        if not frag:
            logger.warning(f"update_fragment_score: No fragment with id={fragment_id}")
            return
        if frag.get("vector") is None:
            logger.warning(
                f"Skipping score update for fragment {fragment_id} "
                "because it has no associated vector."
            )
            return
        metadata = frag.get("metadata", {}) or {}
        metadata["composite_score"] = new_score
        self.collection.upsert(
            ids=[fragment_id],
            embeddings=[frag.get("vector")],
            metadatas=[metadata]
        )
        logger.info(f"Fragment {fragment_id} composite_score updated to {new_score}")

    async def update_metadata(self, doc_id: str, metadata: Dict[str, Any]):
        """Update metadata for an existing document without re-embedding.

        Used by the deduplication system to increment counters (e.g., mentioned_count)
        or update quality metrics without regenerating embeddings.
        """
        await self._ensure_initialized()
        if self.collection is None:
            raise RuntimeError("ChromaDB collection not initialized")

        try:
            # Get existing document to preserve embedding
            frag = self.get_fragment(doc_id)
            if not frag:
                logger.warning(f"update_metadata: No document with id={doc_id}")
                return

            if frag.get("vector") is None:
                logger.warning(
                    f"Skipping metadata update for {doc_id} "
                    "because it has no associated vector."
                )
                return

            # Update with new metadata while preserving the embedding
            self.collection.update(
                ids=[doc_id],
                metadatas=[metadata]
            )
            logger.info(f"Metadata updated for document {doc_id}")

        except Exception as e:
            logger.error(f"Failed to update metadata for {doc_id}: {e}")
            raise

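    # Deduplication sketch using the mentioned_count field named in the
    # docstring above (the surrounding counter logic is illustrative):
    #
    #   frag = adapter.get_fragment(doc_id)
    #   meta = dict(frag["metadata"])
    #   meta["mentioned_count"] = int(meta.get("mentioned_count", 0)) + 1
    #   await adapter.update_metadata(doc_id, meta)
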
    async def cleanup(self):
        """Gracefully clean up ChromaDB connections"""
        try:
            if self.collection:
                # Persist any pending writes
                if hasattr(self.collection, 'persist'):
                    self.collection.persist()
                self.collection = None

            if self.client:
                # Close the client connection
                if hasattr(self.client, 'close'):
                    self.client.close()
                self.client = None

            logger.info(f"ChromaDB adapter cleaned up for {self.collection_name}")
        except Exception as e:
            logger.warning(f"Error during ChromaDB cleanup: {e}")

    def __del__(self):
        """Cleanup on deletion"""
        try:
            if self.client or self.collection:
                import asyncio
                # Run the async cleanup on a throwaway event loop; this raises
                # (and is swallowed below) if another loop is already running
                # in this thread
                loop = asyncio.new_event_loop()
                loop.run_until_complete(self.cleanup())
                loop.close()
        except:
            pass  # Ignore errors in destructor