agent-brain-rag 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,320 @@
1
+ """Chroma vector store manager with thread-safe operations."""
2
+
3
+ import asyncio
4
+ import logging
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Optional
8
+
9
+ import chromadb
10
+ from chromadb.config import Settings as ChromaSettings
11
+
12
+ from doc_serve_server.config import settings
13
+
14
# Module-level logger named after this module; the vector store code below
# uses it for lifecycle (info/warning/error) and debug messages.
logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class SearchResult:
    """A single hit produced by a similarity search.

    Instances are plain value objects: the fields are stored exactly as
    given and compared field-by-field by the generated ``__eq__``.
    """

    # Text content of the matched chunk.
    text: str
    # Metadata stored alongside the chunk (empty dict when none was stored).
    metadata: dict[str, Any]
    # Similarity score of the hit; higher means closer to the query.
    score: float
    # Identifier of the matched chunk in the vector store.
    chunk_id: str
25
+
26
+
27
class VectorStoreManager:
    """
    Manages Chroma vector store operations behind a single asyncio lock.

    This class provides a high-level interface for storing and retrieving
    document embeddings using Chroma as the backend.

    Every access to the underlying collection happens while holding one
    ``asyncio.Lock``. That serializes coroutines running on the same event
    loop; it does not by itself make an instance safe to share between OS
    threads.
    """

    def __init__(
        self,
        persist_dir: Optional[str] = None,
        collection_name: Optional[str] = None,
    ):
        """
        Initialize the vector store manager.

        No Chroma resources are created here; call ``initialize()`` before
        using the store.

        Args:
            persist_dir: Directory for persistent storage. Defaults to config value.
            collection_name: Name of the collection. Defaults to config value.
        """
        self.persist_dir = persist_dir or settings.CHROMA_PERSIST_DIR
        self.collection_name = collection_name or settings.COLLECTION_NAME
        self._client: Optional[chromadb.PersistentClient] = None  # type: ignore[valid-type]
        self._collection: Optional[chromadb.Collection] = None
        self._lock = asyncio.Lock()
        self._initialized = False

    @property
    def is_initialized(self) -> bool:
        """Check if the vector store is initialized."""
        return self._initialized and self._collection is not None

    def _require_collection(self) -> "chromadb.Collection":
        """Return the live collection or raise ``RuntimeError`` if there is none."""
        if not self.is_initialized:
            raise RuntimeError("Vector store not initialized. Call initialize() first.")
        return self._collection  # type: ignore[return-value]

    @staticmethod
    def _validate_batch(
        ids: list[str],
        embeddings: list[list[float]],
        documents: list[str],
        metadatas: Optional[list[dict[str, Any]]],
    ) -> Optional[list[dict[str, Any]]]:
        """Check that the batch columns line up and normalize ``metadatas``.

        Returns the metadata list to hand to Chroma, or ``None`` when the
        caller supplied no metadata.

        Raises:
            ValueError: If the column lengths differ.
        """
        if not (len(ids) == len(embeddings) == len(documents)):
            raise ValueError("ids, embeddings, and documents must have the same length")
        if not metadatas:
            # Pass None rather than a list of empty dicts: empty metadata
            # dicts are rejected by Chroma's validation, and `[{}] * n` would
            # also alias one dict object across every row.
            return None
        if len(metadatas) != len(ids):
            raise ValueError("metadatas must have the same length as ids")
        return metadatas

    async def initialize(self) -> None:
        """
        Initialize the Chroma client and collection.

        Creates the persistence directory if it doesn't exist and
        initializes or loads the existing collection. Calling this on an
        already initialized store is a no-op.
        """
        async with self._lock:
            if self._initialized:
                return

            # Ensure persistence directory exists
            persist_path = Path(self.persist_dir)
            persist_path.mkdir(parents=True, exist_ok=True)

            # Initialize Chroma client
            self._client = chromadb.PersistentClient(
                path=str(persist_path),
                settings=ChromaSettings(
                    anonymized_telemetry=False,
                    allow_reset=True,
                ),
            )

            # Get or create collection; distances are cosine distances, which
            # similarity_search() converts to similarities.
            self._collection = self._client.get_or_create_collection(
                name=self.collection_name,
                metadata={"hnsw:space": "cosine"},
            )

            self._initialized = True
            logger.info(
                f"Vector store initialized: {self.collection_name} "
                f"({self._collection.count()} existing documents)"
            )

    async def add_documents(
        self,
        ids: list[str],
        embeddings: list[list[float]],
        documents: list[str],
        metadatas: Optional[list[dict[str, Any]]] = None,
    ) -> int:
        """
        Add documents with embeddings to the vector store.

        Args:
            ids: Unique identifiers for each document.
            embeddings: Embedding vectors for each document.
            documents: Text content of each document.
            metadatas: Optional metadata for each document.

        Returns:
            Number of documents added (0 for an empty batch).

        Raises:
            RuntimeError: If the store is not initialized.
            ValueError: If the argument lists differ in length.
        """
        # Fail fast before waiting on the lock.
        self._require_collection()
        checked_metadatas = self._validate_batch(ids, embeddings, documents, metadatas)
        if not ids:
            # Nothing to store; Chroma rejects empty batches.
            return 0

        async with self._lock:
            # Re-check under the lock: close()/delete_collection() may have
            # run while this coroutine was waiting.
            collection = self._require_collection()
            collection.add(
                ids=ids,
                embeddings=embeddings,  # type: ignore[arg-type]
                documents=documents,
                metadatas=checked_metadatas,  # type: ignore[arg-type]
            )

        logger.debug(f"Added {len(ids)} documents to vector store")
        return len(ids)

    async def upsert_documents(
        self,
        ids: list[str],
        embeddings: list[list[float]],
        documents: list[str],
        metadatas: Optional[list[dict[str, Any]]] = None,
    ) -> int:
        """
        Upsert documents with embeddings to the vector store.
        If IDs already exist, the content and embeddings will be updated.

        Args:
            ids: Unique identifiers for each document.
            embeddings: Embedding vectors for each document.
            documents: Text content of each document.
            metadatas: Optional metadata for each document.

        Returns:
            Number of documents upserted (0 for an empty batch).

        Raises:
            RuntimeError: If the store is not initialized.
            ValueError: If the argument lists differ in length.
        """
        # Fail fast before waiting on the lock.
        self._require_collection()
        checked_metadatas = self._validate_batch(ids, embeddings, documents, metadatas)
        if not ids:
            # Nothing to store; Chroma rejects empty batches.
            return 0

        async with self._lock:
            # Re-check under the lock: close()/delete_collection() may have
            # run while this coroutine was waiting.
            collection = self._require_collection()
            collection.upsert(
                ids=ids,
                embeddings=embeddings,  # type: ignore[arg-type]
                documents=documents,
                metadatas=checked_metadatas,  # type: ignore[arg-type]
            )

        logger.debug(f"Upserted {len(ids)} documents to vector store")
        return len(ids)

    async def similarity_search(
        self,
        query_embedding: list[float],
        top_k: int = 5,
        similarity_threshold: float = 0.0,
        where: Optional[dict[str, Any]] = None,
    ) -> "list[SearchResult]":
        """
        Perform similarity search on the vector store.

        Args:
            query_embedding: Embedding vector to search for.
            top_k: Maximum number of results to return.
            similarity_threshold: Minimum similarity score (0-1).
            where: Optional metadata filter.

        Returns:
            List of SearchResult objects sorted by score descending.

        Raises:
            RuntimeError: If the store is not initialized.
        """
        # Fail fast before waiting on the lock.
        self._require_collection()

        async with self._lock:
            # Re-check under the lock: close()/delete_collection() may have
            # run while this coroutine was waiting.
            collection = self._require_collection()
            results = collection.query(
                query_embeddings=[query_embedding],  # type: ignore[arg-type]
                n_results=top_k,
                where=where,
                include=["documents", "metadatas", "distances"],  # type: ignore[list-item]
            )

        # Convert Chroma results to SearchResult objects. Results are nested
        # per query embedding; only one query was sent, hence the [0] rows.
        search_results: list[SearchResult] = []

        id_rows = results["ids"]
        if id_rows and id_rows[0]:
            distances = results["distances"]
            documents = results["documents"]
            metadatas = results["metadatas"]

            for idx, chunk_id in enumerate(id_rows[0]):
                distance = distances[0][idx] if distances else 0.0
                similarity = 1 - distance  # Cosine distance to similarity
                if similarity < similarity_threshold:
                    continue

                # A stored row may have no document text; keep `text` a str.
                text_val = (documents[0][idx] if documents else "") or ""
                meta_val: dict[str, Any] = {}
                if metadatas and metadatas[0][idx]:
                    meta_val = dict(metadatas[0][idx])

                search_results.append(
                    SearchResult(
                        text=text_val,
                        metadata=meta_val,
                        score=similarity,
                        chunk_id=chunk_id,
                    )
                )

        # Sort by score descending
        search_results.sort(key=lambda x: x.score, reverse=True)

        logger.debug(
            f"Similarity search returned {len(search_results)} results "
            f"(threshold: {similarity_threshold})"
        )
        return search_results

    async def get_count(self, where: Optional[dict[str, Any]] = None) -> int:
        """
        Get the number of documents in the collection, optionally filtered.

        Args:
            where: Optional metadata filter.

        Returns:
            Number of documents stored, or 0 when the store is not initialized.
        """
        if not self.is_initialized:
            return 0

        async with self._lock:
            # The store may have been closed while waiting for the lock.
            if not self.is_initialized:
                return 0
            collection = self._collection
            if where:
                # get() is the only way to filter for counts in some Chroma versions
                # include=[] to minimize data transfer
                results = collection.get(where=where, include=[])  # type: ignore[union-attr]
                if results and "ids" in results:
                    return len(results["ids"])
                return 0
            return collection.count()  # type: ignore[union-attr]

    async def delete_collection(self) -> None:
        """
        Delete the entire collection.

        Does nothing when no client has been created (or it was closed).

        Warning: This permanently removes all stored documents and embeddings.
        """
        if not self._client:
            return

        async with self._lock:
            client = self._client
            if client is None:
                # close() ran while this coroutine was waiting for the lock.
                return
            try:
                client.delete_collection(self.collection_name)
                self._collection = None
                self._initialized = False
                logger.warning(f"Deleted collection: {self.collection_name}")
            except Exception as e:
                logger.error(f"Failed to delete collection: {e}")
                raise

    async def reset(self) -> None:
        """
        Reset the vector store by deleting and recreating the collection.
        """
        await self.delete_collection()
        # Force initialize() to run even if delete_collection() was a no-op.
        self._initialized = False
        await self.initialize()

    async def close(self) -> None:
        """
        Close the vector store connection.

        Drops the references to the client and collection and marks the
        store as uninitialized. Should be called during application shutdown.
        """
        async with self._lock:
            self._collection = None
            self._client = None
            self._initialized = False
            logger.info("Vector store connection closed")
302
+
303
+
304
# Global singleton instance.
# Starts as None; get_vector_store() creates the VectorStoreManager on first
# use and stores it here so later calls return the same object.
_vector_store: Optional[VectorStoreManager] = None
306
+
307
+
308
def get_vector_store() -> VectorStoreManager:
    """Return the process-wide VectorStoreManager, creating it on first use.

    The manager is built with its default arguments and is NOT initialized
    here; callers still have to await ``initialize()`` on it.
    """
    global _vector_store
    if _vector_store is not None:
        return _vector_store
    _vector_store = VectorStoreManager()
    return _vector_store
314
+
315
+
316
async def initialize_vector_store() -> VectorStoreManager:
    """Fetch the global vector store, await its initialization, and return it."""
    manager = get_vector_store()
    await manager.initialize()
    return manager
@@ -0,0 +1,72 @@
1
+ """State directory and storage path resolution."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Relative location of the per-project state directory; resolve_state_dir()
# joins it onto the resolved project root.
STATE_DIR_NAME = ".claude/doc-serve"

# Subdirectory layout of a state directory, relative to the state directory.
# NOTE(review): not referenced by the functions visible in this module;
# resolve_storage_paths() spells out the same layout in its own mapping, so
# the two must be kept in sync by hand.
SUBDIRECTORIES = [
    "data",
    "data/chroma_db",
    "data/bm25_index",
    "data/llamaindex",
    "logs",
]
17
+
18
+
19
def resolve_state_dir(project_root: Path) -> Path:
    """Compute the state directory that belongs to a project.

    The result is ``<project_root>/.claude/doc-serve`` with the project root
    made absolute via ``Path.resolve()``. Nothing is created on disk.

    Args:
        project_root: Project root path.

    Returns:
        Path to the state directory.
    """
    return project_root.resolve() / STATE_DIR_NAME
32
+
33
+
34
def resolve_storage_paths(state_dir: Path) -> dict[str, Path]:
    """Build the storage layout under a state directory and create it on disk.

    Every directory in the layout (including ``state_dir`` itself) is created
    if it does not exist yet; existing directories are left untouched.

    Args:
        state_dir: Path to the state directory.

    Returns:
        Dictionary mapping storage names (``state_dir``, ``data``,
        ``chroma_db``, ``bm25_index``, ``llamaindex``, ``logs``) to paths.
    """
    data_dir = state_dir / "data"
    layout: dict[str, Path] = {"state_dir": state_dir, "data": data_dir}
    for name in ("chroma_db", "bm25_index", "llamaindex"):
        layout[name] = data_dir / name
    layout["logs"] = state_dir / "logs"

    # Materialize the whole tree; parents first because of insertion order,
    # although parents=True would cover any order.
    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)

    return layout
59
+
60
+
61
def resolve_shared_project_dir(project_id: str) -> Path:
    """Resolve per-project storage under shared daemon.

    The directory is ``~/.doc-serve/projects/<project_id>/data`` and is
    created if it does not exist.

    Args:
        project_id: Unique project identifier. Must be a single, non-empty
            path component.

    Returns:
        Path to shared project data directory.

    Raises:
        ValueError: If ``project_id`` is empty, is ``.`` or ``..``, or
            contains a path separator or NUL character.
    """
    # project_id becomes a directory name. Without this check an empty id
    # collapses onto the shared "projects" directory, ".." or an embedded
    # separator escapes it, and an absolute path replaces the whole prefix
    # when joined with "/".
    is_single_component = (
        bool(project_id)
        and project_id not in (".", "..")
        and not any(ch in project_id for ch in ("/", "\\", "\0"))
    )
    if not is_single_component:
        raise ValueError(f"Invalid project_id: {project_id!r}")

    shared_dir = Path.home() / ".doc-serve" / "projects" / project_id / "data"
    shared_dir.mkdir(parents=True, exist_ok=True)
    return shared_dir