agent_brain_rag-1.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- agent_brain_rag-1.1.0.dist-info/METADATA +202 -0
- agent_brain_rag-1.1.0.dist-info/RECORD +31 -0
- agent_brain_rag-1.1.0.dist-info/WHEEL +4 -0
- agent_brain_rag-1.1.0.dist-info/entry_points.txt +3 -0
- doc_serve_server/__init__.py +3 -0
- doc_serve_server/api/__init__.py +5 -0
- doc_serve_server/api/main.py +332 -0
- doc_serve_server/api/routers/__init__.py +11 -0
- doc_serve_server/api/routers/health.py +100 -0
- doc_serve_server/api/routers/index.py +208 -0
- doc_serve_server/api/routers/query.py +96 -0
- doc_serve_server/config/__init__.py +5 -0
- doc_serve_server/config/settings.py +92 -0
- doc_serve_server/indexing/__init__.py +19 -0
- doc_serve_server/indexing/bm25_index.py +166 -0
- doc_serve_server/indexing/chunking.py +831 -0
- doc_serve_server/indexing/document_loader.py +506 -0
- doc_serve_server/indexing/embedding.py +274 -0
- doc_serve_server/locking.py +133 -0
- doc_serve_server/models/__init__.py +18 -0
- doc_serve_server/models/health.py +126 -0
- doc_serve_server/models/index.py +157 -0
- doc_serve_server/models/query.py +191 -0
- doc_serve_server/project_root.py +85 -0
- doc_serve_server/runtime.py +112 -0
- doc_serve_server/services/__init__.py +11 -0
- doc_serve_server/services/indexing_service.py +476 -0
- doc_serve_server/services/query_service.py +414 -0
- doc_serve_server/storage/__init__.py +5 -0
- doc_serve_server/storage/vector_store.py +320 -0
- doc_serve_server/storage_paths.py +72 -0
doc_serve_server/storage/vector_store.py
@@ -0,0 +1,320 @@
+"""Chroma vector store manager with thread-safe operations."""
+
+import asyncio
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Optional
+
+import chromadb
+from chromadb.config import Settings as ChromaSettings
+
+from doc_serve_server.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SearchResult:
+    """Result from a similarity search."""
+
+    text: str
+    metadata: dict[str, Any]
+    score: float
+    chunk_id: str
+
+
+class VectorStoreManager:
+    """
+    Manages Chroma vector store operations with thread-safe access.
+
+    This class provides a high-level interface for storing and retrieving
+    document embeddings using Chroma as the backend.
+    """
+
+    def __init__(
+        self,
+        persist_dir: Optional[str] = None,
+        collection_name: Optional[str] = None,
+    ):
+        """
+        Initialize the vector store manager.
+
+        Args:
+            persist_dir: Directory for persistent storage. Defaults to config value.
+            collection_name: Name of the collection. Defaults to config value.
+        """
+        self.persist_dir = persist_dir or settings.CHROMA_PERSIST_DIR
+        self.collection_name = collection_name or settings.COLLECTION_NAME
+        self._client: Optional[chromadb.PersistentClient] = None  # type: ignore[valid-type]
+        self._collection: Optional[chromadb.Collection] = None
+        self._lock = asyncio.Lock()
+        self._initialized = False
+
+    @property
+    def is_initialized(self) -> bool:
+        """Check if the vector store is initialized."""
+        return self._initialized and self._collection is not None
+
+    async def initialize(self) -> None:
+        """
+        Initialize the Chroma client and collection.
+
+        Creates the persistence directory if it doesn't exist and
+        initializes or loads the existing collection.
+        """
+        async with self._lock:
+            if self._initialized:
+                return
+
+            # Ensure persistence directory exists
+            persist_path = Path(self.persist_dir)
+            persist_path.mkdir(parents=True, exist_ok=True)
+
+            # Initialize Chroma client
+            self._client = chromadb.PersistentClient(
+                path=str(persist_path),
+                settings=ChromaSettings(
+                    anonymized_telemetry=False,
+                    allow_reset=True,
+                ),
+            )
+
+            # Get or create collection
+            self._collection = self._client.get_or_create_collection(
+                name=self.collection_name,
+                metadata={"hnsw:space": "cosine"},
+            )
+
+            self._initialized = True
+            logger.info(
+                f"Vector store initialized: {self.collection_name} "
+                f"({self._collection.count()} existing documents)"
+            )
+
+    async def add_documents(
+        self,
+        ids: list[str],
+        embeddings: list[list[float]],
+        documents: list[str],
+        metadatas: Optional[list[dict[str, Any]]] = None,
+    ) -> int:
+        """
+        Add documents with embeddings to the vector store.
+
+        Args:
+            ids: Unique identifiers for each document.
+            embeddings: Embedding vectors for each document.
+            documents: Text content of each document.
+            metadatas: Optional metadata for each document.
+
+        Returns:
+            Number of documents added.
+        """
+        if not self.is_initialized:
+            raise RuntimeError("Vector store not initialized. Call initialize() first.")
+
+        if not (len(ids) == len(embeddings) == len(documents)):
+            raise ValueError("ids, embeddings, and documents must have the same length")
+
+        async with self._lock:
+            assert self._collection is not None
+            self._collection.add(
+                ids=ids,
+                embeddings=embeddings,  # type: ignore[arg-type]
+                documents=documents,
+                metadatas=metadatas or [{}] * len(ids),  # type: ignore[arg-type]
+            )
+
+        logger.debug(f"Added {len(ids)} documents to vector store")
+        return len(ids)
+
+    async def upsert_documents(
+        self,
+        ids: list[str],
+        embeddings: list[list[float]],
+        documents: list[str],
+        metadatas: Optional[list[dict[str, Any]]] = None,
+    ) -> int:
+        """
+        Upsert documents with embeddings to the vector store.
+        If IDs already exist, the content and embeddings will be updated.
+
+        Args:
+            ids: Unique identifiers for each document.
+            embeddings: Embedding vectors for each document.
+            documents: Text content of each document.
+            metadatas: Optional metadata for each document.
+
+        Returns:
+            Number of documents upserted.
+        """
+        if not self.is_initialized:
+            raise RuntimeError("Vector store not initialized. Call initialize() first.")
+
+        if not (len(ids) == len(embeddings) == len(documents)):
+            raise ValueError("ids, embeddings, and documents must have the same length")
+
+        async with self._lock:
+            assert self._collection is not None
+            self._collection.upsert(
+                ids=ids,
+                embeddings=embeddings,  # type: ignore[arg-type]
+                documents=documents,
+                metadatas=metadatas or [{}] * len(ids),  # type: ignore[arg-type]
+            )
+
+        logger.debug(f"Upserted {len(ids)} documents to vector store")
+        return len(ids)
+
+    async def similarity_search(
+        self,
+        query_embedding: list[float],
+        top_k: int = 5,
+        similarity_threshold: float = 0.0,
+        where: Optional[dict[str, Any]] = None,
+    ) -> list[SearchResult]:
+        """
+        Perform similarity search on the vector store.
+
+        Args:
+            query_embedding: Embedding vector to search for.
+            top_k: Maximum number of results to return.
+            similarity_threshold: Minimum similarity score (0-1).
+            where: Optional metadata filter.
+
+        Returns:
+            List of SearchResult objects sorted by score descending.
+
+        Raises:
+            RuntimeError: If the store is not initialized.
+        """
+        if not self.is_initialized:
+            raise RuntimeError("Vector store not initialized. Call initialize() first.")
+
+        async with self._lock:
+            assert self._collection is not None
+            results = self._collection.query(
+                query_embeddings=[query_embedding],  # type: ignore[arg-type]
+                n_results=top_k,
+                where=where,
+                include=["documents", "metadatas", "distances"],  # type: ignore[list-item]
+            )
+
+        # Convert Chroma results to SearchResult objects
+        search_results: list[SearchResult] = []
+
+        if results["ids"] and results["ids"][0]:
+            for idx, chunk_id in enumerate(results["ids"][0]):
+                # Chroma returns distances, convert to similarity (cosine)
+                distances = results["distances"]
+                distance = distances[0][idx] if distances else 0.0
+                similarity = 1 - distance  # Cosine distance to similarity
+
+                if similarity >= similarity_threshold:
+                    documents = results["documents"]
+                    metadatas = results["metadatas"]
+                    text_val = documents[0][idx] if documents else ""
+                    meta_val: dict[str, Any] = {}
+                    if metadatas and metadatas[0][idx]:
+                        meta_val = dict(metadatas[0][idx])
+                    search_results.append(
+                        SearchResult(
+                            text=text_val,
+                            metadata=meta_val,
+                            score=similarity,
+                            chunk_id=chunk_id,
+                        )
+                    )
+
+        # Sort by score descending
+        search_results.sort(key=lambda x: x.score, reverse=True)
+
+        logger.debug(
+            f"Similarity search returned {len(search_results)} results "
+            f"(threshold: {similarity_threshold})"
+        )
+        return search_results
+
+    async def get_count(self, where: Optional[dict[str, Any]] = None) -> int:
+        """
+        Get the number of documents in the collection, optionally filtered.
+
+        Args:
+            where: Optional metadata filter.
+
+        Returns:
+            Number of documents stored.
+        """
+        if not self.is_initialized:
+            return 0
+
+        async with self._lock:
+            assert self._collection is not None
+            if where:
+                # get() is the only way to filter for counts in some Chroma versions
+                # include=[] to minimize data transfer
+                results = self._collection.get(where=where, include=[])
+                if results and "ids" in results:
+                    return len(results["ids"])
+                return 0
+            return self._collection.count()
+
+    async def delete_collection(self) -> None:
+        """
+        Delete the entire collection.
+
+        Warning: This permanently removes all stored documents and embeddings.
+        """
+        if not self._client:
+            return
+
+        async with self._lock:
+            try:
+                assert self._client is not None
+                self._client.delete_collection(self.collection_name)
+                self._collection = None
+                self._initialized = False
+                logger.warning(f"Deleted collection: {self.collection_name}")
+            except Exception as e:
+                logger.error(f"Failed to delete collection: {e}")
+                raise
+
+    async def reset(self) -> None:
+        """
+        Reset the vector store by deleting and recreating the collection.
+        """
+        await self.delete_collection()
+        self._initialized = False
+        await self.initialize()
+
+    async def close(self) -> None:
+        """
+        Close the vector store connection.
+
+        Should be called during application shutdown.
+        """
+        async with self._lock:
+            self._collection = None
+            self._client = None
+            self._initialized = False
+            logger.info("Vector store connection closed")
+
+
+# Global singleton instance
+_vector_store: Optional[VectorStoreManager] = None
+
+
+def get_vector_store() -> VectorStoreManager:
+    """Get the global vector store instance."""
+    global _vector_store
+    if _vector_store is None:
+        _vector_store = VectorStoreManager()
+    return _vector_store
+
+
+async def initialize_vector_store() -> VectorStoreManager:
+    """Initialize and return the global vector store instance."""
+    store = get_vector_store()
+    await store.initialize()
+    return store
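Taken together, here is a minimal usage sketch of this module's public surface, assuming the package's default settings resolve a writable persist directory. The three-dimensional embeddings are placeholders; in the package the vectors would come from the embedding layer in doc_serve_server/indexing/embedding.py, whose API is not shown in this diff.

import asyncio

from doc_serve_server.storage.vector_store import initialize_vector_store


async def main() -> None:
    # Creates the persist directory and gets/creates the cosine-space collection.
    store = await initialize_vector_store()

    # Upsert two chunks; re-running with the same IDs updates them in place.
    await store.upsert_documents(
        ids=["chunk-1", "chunk-2"],
        embeddings=[[0.1, 0.2, 0.3], [0.3, 0.2, 0.1]],
        documents=["first chunk text", "second chunk text"],
        metadatas=[{"source": "a.md"}, {"source": "b.md"}],
    )

    # Query with a same-dimensionality embedding; results arrive sorted by score.
    for result in await store.similarity_search(
        query_embedding=[0.1, 0.2, 0.3], top_k=2, similarity_threshold=0.5
    ):
        print(result.chunk_id, round(result.score, 3), result.metadata)

    await store.close()


asyncio.run(main())

Because get_vector_store() hands out a process-wide singleton guarded by an asyncio.Lock, concurrent request handlers share one Chroma client instead of opening competing handles on the persist directory.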
doc_serve_server/storage_paths.py
@@ -0,0 +1,72 @@
+"""State directory and storage path resolution."""
+
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+STATE_DIR_NAME = ".claude/doc-serve"
+
+SUBDIRECTORIES = [
+    "data",
+    "data/chroma_db",
+    "data/bm25_index",
+    "data/llamaindex",
+    "logs",
+]
+
+
+def resolve_state_dir(project_root: Path) -> Path:
+    """Resolve the state directory for a project.
+
+    Returns <project_root>/.claude/doc-serve/
+
+    Args:
+        project_root: Resolved project root path.
+
+    Returns:
+        Path to the state directory.
+    """
+    state_dir = project_root.resolve() / STATE_DIR_NAME
+    return state_dir
+
+
+def resolve_storage_paths(state_dir: Path) -> dict[str, Path]:
+    """Resolve all storage paths relative to state directory.
+
+    Creates directories if they don't exist.
+
+    Args:
+        state_dir: Path to the state directory.
+
+    Returns:
+        Dictionary mapping storage names to paths.
+    """
+    paths: dict[str, Path] = {
+        "state_dir": state_dir,
+        "data": state_dir / "data",
+        "chroma_db": state_dir / "data" / "chroma_db",
+        "bm25_index": state_dir / "data" / "bm25_index",
+        "llamaindex": state_dir / "data" / "llamaindex",
+        "logs": state_dir / "logs",
+    }
+
+    # Create directories
+    for path in paths.values():
+        path.mkdir(parents=True, exist_ok=True)
+
+    return paths
+
+
+def resolve_shared_project_dir(project_id: str) -> Path:
+    """Resolve per-project storage under shared daemon.
+
+    Args:
+        project_id: Unique project identifier.
+
+    Returns:
+        Path to shared project data directory.
+    """
+    shared_dir = Path.home() / ".doc-serve" / "projects" / project_id / "data"
+    shared_dir.mkdir(parents=True, exist_ok=True)
+    return shared_dir