swarmauri_vectorstore_persistentchromadb 0.6.0.dev154__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarmauri_vectorstore_persistentchromadb-0.6.0.dev154/PKG-INFO +21 -0
- swarmauri_vectorstore_persistentchromadb-0.6.0.dev154/README.md +1 -0
- swarmauri_vectorstore_persistentchromadb-0.6.0.dev154/pyproject.toml +58 -0
- swarmauri_vectorstore_persistentchromadb-0.6.0.dev154/swarmauri_vectorstore_persistentchromadb/PersistentChromaDBVectorStore.py +188 -0
- swarmauri_vectorstore_persistentchromadb-0.6.0.dev154/swarmauri_vectorstore_persistentchromadb/__init__.py +14 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: swarmauri_vectorstore_persistentchromadb
|
|
3
|
+
Version: 0.6.0.dev154
|
|
4
|
+
Summary: A Persistent ChromaDB based Vector Store
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Author: Jacob Stewart
|
|
7
|
+
Author-email: jacob@swarmauri.com
|
|
8
|
+
Requires-Python: >=3.10,<3.13
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Dist: chromadb (>=0.5.17,<0.6.0)
|
|
15
|
+
Requires-Dist: swarmauri_base (>=0.6.0.dev154,<0.7.0)
|
|
16
|
+
Requires-Dist: swarmauri_core (>=0.6.0.dev154,<0.7.0)
|
|
17
|
+
Requires-Dist: swarmauri_embedding_doc2vec (>=0.6.0.dev154,<0.7.0)
|
|
18
|
+
Project-URL: Repository, http://github.com/swarmauri/swarmauri-sdk
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# Swarmauri Example Community Package
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Swarmauri Example Community Package
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "swarmauri_vectorstore_persistentchromadb"
|
|
3
|
+
version = "0.6.0.dev154"
|
|
4
|
+
description = "A Persistent ChromaDB based Vector Store"
|
|
5
|
+
authors = ["Jacob Stewart <jacob@swarmauri.com>"]
|
|
6
|
+
license = "Apache-2.0"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
repository = "http://github.com/swarmauri/swarmauri-sdk"
|
|
9
|
+
classifiers = [
|
|
10
|
+
"License :: OSI Approved :: Apache Software License",
|
|
11
|
+
"Programming Language :: Python :: 3.10",
|
|
12
|
+
"Programming Language :: Python :: 3.11",
|
|
13
|
+
"Programming Language :: Python :: 3.12"
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[tool.poetry.dependencies]
|
|
17
|
+
python = ">=3.10,<3.13"
|
|
18
|
+
|
|
19
|
+
# Swarmauri
|
|
20
|
+
swarmauri_core = {version = "^0.6.0.dev154"}
|
|
21
|
+
swarmauri_base = {version = "^0.6.0.dev154"}
|
|
22
|
+
swarmauri_embedding_doc2vec = {version = "^0.6.0.dev154"}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Dependencies
|
|
26
|
+
chromadb = "^0.5.17"
|
|
27
|
+
|
|
28
|
+
[tool.poetry.group.dev.dependencies]
|
|
29
|
+
flake8 = "^7.0"
|
|
30
|
+
pytest = "^8.0"
|
|
31
|
+
pytest-asyncio = ">=0.24.0"
|
|
32
|
+
pytest-xdist = "^3.6.1"
|
|
33
|
+
pytest-json-report = "^1.5.0"
|
|
34
|
+
python-dotenv = "*"
|
|
35
|
+
requests = "^2.32.3"
|
|
36
|
+
|
|
37
|
+
[build-system]
|
|
38
|
+
requires = ["poetry-core>=1.0.0"]
|
|
39
|
+
build-backend = "poetry.core.masonry.api"
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
norecursedirs = ["combined", "scripts"]
|
|
43
|
+
|
|
44
|
+
markers = [
|
|
45
|
+
"test: standard test",
|
|
46
|
+
"unit: Unit tests",
|
|
47
|
+
"integration: Integration tests",
|
|
48
|
+
"acceptance: Acceptance tests",
|
|
49
|
+
"experimental: Experimental tests"
|
|
50
|
+
]
|
|
51
|
+
log_cli = true
|
|
52
|
+
log_cli_level = "INFO"
|
|
53
|
+
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
|
|
54
|
+
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
|
|
55
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
56
|
+
|
|
57
|
+
[tool.poetry.plugins."swarmauri.vector_stores"]
|
|
58
|
+
PersistentChromaDBVectorStore = "swarmauri_vectorstore_persistentchromadb.PersistentChromaDBVectorStore:PersistentChromaDBVectorStore"
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import chromadb
|
|
3
|
+
|
|
4
|
+
from typing import List, Union, Literal
|
|
5
|
+
|
|
6
|
+
from swarmauri_standard.documents.concrete.Document import Document
|
|
7
|
+
from swarmauri_embedding_doc2vec.Doc2VecEmbedding import Doc2VecEmbedding
|
|
8
|
+
from swarmauri.distances.concrete.CosineDistance import CosineDistance
|
|
9
|
+
|
|
10
|
+
from swarmauri.vector_stores.base.VectorStoreBase import VectorStoreBase
|
|
11
|
+
from swarmauri.vector_stores.base.VectorStoreRetrieveMixin import (
|
|
12
|
+
VectorStoreRetrieveMixin,
|
|
13
|
+
)
|
|
14
|
+
from swarmauri.vector_stores.base.VectorStoreSaveLoadMixin import (
|
|
15
|
+
VectorStoreSaveLoadMixin,
|
|
16
|
+
)
|
|
17
|
+
from swarmauri.vector_stores.base.VectorStorePersistentMixin import (
|
|
18
|
+
VectorStorePersistentMixin,
|
|
19
|
+
)
|
|
20
|
+
from swarmauri_core.ComponentBase import ComponentBase
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@ComponentBase.register_type(VectorStoreBase, "PersistentChromaDBVectorStore")
|
|
24
|
+
class PersistentChromaDBVectorStore(
|
|
25
|
+
VectorStoreSaveLoadMixin,
|
|
26
|
+
VectorStoreRetrieveMixin,
|
|
27
|
+
VectorStorePersistentMixin,
|
|
28
|
+
VectorStoreBase,
|
|
29
|
+
):
|
|
30
|
+
type: Literal["PersistentChromaDBVectorStore"] = "PersistentChromaDBVectorStore"
|
|
31
|
+
|
|
32
|
+
def __init__(self, **kwargs):
    """
    Build the store and attach its embedding and distance helpers.

    Args:
        **kwargs: keyword arguments forwarded unchanged to the parent
            initializer.
    """
    super().__init__(**kwargs)

    # The embedder is sized from this instance's ``vector_size``.
    embedding_dim = self.vector_size
    self._embedder = Doc2VecEmbedding(vector_size=embedding_dim)
    self._distance = CosineDistance()
|
|
44
|
+
|
|
45
|
+
def connect(self) -> None:
    """
    Open a ChromaDB client and get or create the configured collection.

    When ``self.path`` is set, a ``chromadb.PersistentClient`` rooted at
    that path is used, so the location reported in the log message is the
    one actually backing the collection. When ``self.path`` is empty the
    previous behaviour is kept and the default ``chromadb.Client()`` is
    created.

    Sets ``self.client`` and ``self.collection``.
    """
    if self.path:
        self.client = chromadb.PersistentClient(path=self.path)
    else:
        # No storage location configured: fall back to the default client.
        self.client = chromadb.Client()

    self.collection = self.client.get_or_create_collection(
        name=self.collection_name
    )
    logging.info(
        f"Connected to ChromaDB at {self.path}, collection: {self.collection_name}"
    )
|
|
66
|
+
|
|
67
|
+
def disconnect(self) -> None:
    """
    Drop the references to the ChromaDB client and collection.

    Nothing happens when no client is currently held.
    """
    if not self.client:
        return

    # No explicit close call is made; the handles are simply released.
    self.collection = None
    self.client = None
|
|
75
|
+
|
|
76
|
+
def add_document(self, document: Document) -> None:
    """
    Store a single document, embedding it first when it has no embedding.

    Args:
        document: the document to add. Its ``id``, ``content``,
            ``metadata`` and (optional) ``embedding`` are read.
    """
    if document.embedding:
        embedding = document.embedding
    else:
        # The embedder is fitted on this document's content on every call
        # that needs a fresh embedding, then used to transform it.
        self._embedder.fit([document.content])
        embedding = (
            self._embedder.transform([document.content])[0].to_numpy().tolist()
        )

    record = {
        "ids": [document.id],
        "documents": [document.content],
        "embeddings": [embedding],
    }
    # Send metadata only when there is some, mirroring add_documents.
    # NOTE(review): ChromaDB's metadata validation is expected to reject an
    # empty dict - confirm against the pinned chromadb version.
    if document.metadata:
        record["metadatas"] = [document.metadata]

    self.collection.add(**record)
|
|
92
|
+
|
|
93
|
+
def add_documents(self, documents: List[Document]) -> None:
    """
    Embed and store a batch of documents in one ``collection.add`` call.

    Args:
        documents: the documents to add. An empty list is a no-op.
    """
    if not documents:
        # Nothing to embed or store; this also avoids indexing an empty
        # metadata list below.
        return

    ids = [doc.id for doc in documents]
    texts = [doc.content for doc in documents]

    # The embedder is fitted on each document's content in turn before any
    # vector is inferred.
    for text in texts:
        self._embedder.fit([text])

    embeddings = [self._embedder.infer_vector(text).value for text in texts]
    metadatas = [doc.metadata for doc in documents]

    # NOTE(review): only the first document decides whether metadata is sent
    # for the whole batch - confirm mixed batches are not expected.
    if metadatas[0]:
        self.collection.add(
            ids=ids, documents=texts, embeddings=embeddings, metadatas=metadatas
        )
    else:
        self.collection.add(ids=ids, documents=texts, embeddings=embeddings)
|
|
110
|
+
|
|
111
|
+
def get_document(self, doc_id: str) -> Union[Document, None]:
    """
    Fetch one document by id.

    Args:
        doc_id: identifier of the document to look up.

    Returns:
        The matching document, or ``None`` when the collection returns no
        entry for ``doc_id``.
    """
    results = self.collection.get(ids=[doc_id])
    if not results["ids"]:
        # Check for a miss before touching the per-document lists: indexing
        # an empty result list would raise IndexError.
        return None

    return Document(
        id=results["ids"][0],
        content=results["documents"][0],
        # Missing metadata is normalised to an empty dict.
        metadata=results["metadatas"][0] or {},
    )
|
|
123
|
+
|
|
124
|
+
def get_all_documents(self) -> List[Document]:
    """
    Return every document held in the collection.

    Returns:
        One document per stored entry, built from the ids, contents and
        metadata returned by ``collection.get()``. Missing metadata is
        normalised to an empty dict, as ``retrieve`` does for its results.
    """
    results = self.collection.get()
    return [
        Document(id=doc_id, content=content, metadata=metadata or {})
        for doc_id, content, metadata in zip(
            results["ids"], results["documents"], results["metadatas"]
        )
    ]
|
|
135
|
+
|
|
136
|
+
def delete_document(self, doc_id: str) -> None:
    """
    Remove the document with the given id from the collection.

    Args:
        doc_id: identifier of the document to delete.
    """
    target_ids = [doc_id]
    self.collection.delete(ids=target_ids)
|
|
138
|
+
|
|
139
|
+
def update_document(self, doc_id: str, updated_document: Document) -> None:
    """
    Replace the stored document ``doc_id`` with ``updated_document``.

    The embedding is resolved first: the document's own embedding is reused
    when present, otherwise its content is transformed without refitting
    the embedder. The result is converted to a plain list and written back
    onto ``updated_document``. The old entry is then deleted and the
    updated document added.

    Args:
        doc_id: identifier of the entry to remove.
        updated_document: the replacement document; its ``embedding``
            attribute is overwritten by this call.
    """
    existing = updated_document.embedding
    vector = (
        existing
        if existing
        else self._embedder.transform([updated_document.content])[0]
    )
    updated_document.embedding = vector.to_numpy().tolist()

    self.delete_document(doc_id)
    self.add_document(updated_document)
|
|
154
|
+
|
|
155
|
+
def clear_documents(self) -> None:
    """
    Delete every document in the collection.

    Only the stored ids are requested from the collection; documents are
    not rebuilt just to be discarded. When the collection holds no ids, no
    delete request is issued at all.
    """
    # An empty ``include`` asks for no payload fields; the ids are still
    # read from the result.
    doc_ids = self.collection.get(include=[])["ids"]
    if doc_ids:
        self.collection.delete(ids=doc_ids)
|
|
159
|
+
|
|
160
|
+
def document_count(self) -> int:
    """
    Return the number of documents in the collection.

    Uses the collection's own ``count()`` instead of loading and rebuilding
    every document just to measure the resulting list.
    """
    return self.collection.count()
|
|
162
|
+
|
|
163
|
+
def retrieve(self, query: str, top_k: int = 5) -> List[Document]:
    """
    Query the collection with the embedding of ``query``.

    Args:
        query: text to embed and search with.
        top_k: number of results requested from the collection.

    Returns:
        Documents built from the first result set of ``collection.query``,
        with missing metadata replaced by an empty dict.
    """
    query_vector = self._embedder.infer_vector(query).value
    results = self.collection.query(
        query_embeddings=query_vector, n_results=top_k
    )

    found = []
    for position, doc_id in enumerate(results["ids"][0]):
        metadata = results["metadatas"][0][position]
        found.append(
            Document(
                id=doc_id,
                content=results["documents"][0][position],
                metadata=metadata if metadata else {},
            )
        )
    return found
|
|
181
|
+
|
|
182
|
+
# Override the model_dump_json method
|
|
183
|
+
def model_dump_json(self, *args, **kwargs) -> str:
    """
    Serialize the store to JSON after disconnecting it.

    ``disconnect()`` runs first, before the parent implementation produces
    the JSON. All arguments are forwarded unchanged.

    Returns:
        The JSON string produced by the parent ``model_dump_json``.
    """
    self.disconnect()
    serialized = super().model_dump_json(*args, **kwargs)
    return serialized
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .PersistentChromaDBVectorStore import PersistentChromaDBVectorStore
|
|
2
|
+
|
|
3
|
+
# Kept in step with the version declared in the package metadata
# (pyproject.toml / PKG-INFO).
__version__ = "0.6.0.dev154"
__long_desc__ = """

# Swarmauri PersistentChromaDB Plugin

This repository includes an PersistentChromaDB of a Swarmauri Plugin.

Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk

"""
|