swarmauri_vectorstore_duckdb 0.6.0.dev154__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarmauri_vectorstore_duckdb-0.6.0.dev154/PKG-INFO +21 -0
- swarmauri_vectorstore_duckdb-0.6.0.dev154/README.md +1 -0
- swarmauri_vectorstore_duckdb-0.6.0.dev154/pyproject.toml +58 -0
- swarmauri_vectorstore_duckdb-0.6.0.dev154/swarmauri_vectorstore_duckdb/DuckDBVectorStore.py +283 -0
- swarmauri_vectorstore_duckdb-0.6.0.dev154/swarmauri_vectorstore_duckdb/__init__.py +14 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: swarmauri_vectorstore_duckdb
|
|
3
|
+
Version: 0.6.0.dev154
|
|
4
|
+
Summary: A DuckDB based Vector Store
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Author: Jacob Stewart
|
|
7
|
+
Author-email: jacob@swarmauri.com
|
|
8
|
+
Requires-Python: >=3.10,<3.13
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Dist: duckdb (>=1.1.1,<2.0.0)
|
|
15
|
+
Requires-Dist: swarmauri_base (>=0.6.0.dev154,<0.7.0)
|
|
16
|
+
Requires-Dist: swarmauri_core (>=0.6.0.dev154,<0.7.0)
|
|
17
|
+
Requires-Dist: swarmauri_embedding_doc2vec (>=0.6.0.dev154,<0.7.0)
|
|
18
|
+
Project-URL: Repository, http://github.com/swarmauri/swarmauri-sdk
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# Swarmauri DuckDB Vector Store
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Swarmauri DuckDB Vector Store
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Poetry packaging metadata for the DuckDB-backed vector store plugin.
[tool.poetry]
name = "swarmauri_vectorstore_duckdb"
version = "0.6.0.dev154"
description = "A DuckDB based Vector Store"
authors = ["Jacob Stewart <jacob@swarmauri.com>"]
license = "Apache-2.0"
readme = "README.md"
repository = "http://github.com/swarmauri/swarmauri-sdk"
classifiers = [
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12"
]

[tool.poetry.dependencies]
# Upper bound mirrors the supported classifiers above (3.10-3.12).
python = ">=3.10,<3.13"

# Swarmauri
swarmauri_core = {version = "^0.6.0.dev154"}
swarmauri_base = {version = "^0.6.0.dev154"}
swarmauri_embedding_doc2vec = {version = "^0.6.0.dev154"}

# Dependencies
duckdb = "^1.1.1"


# Tooling used only for linting and running the test suite.
[tool.poetry.group.dev.dependencies]
flake8 = "^7.0"
pytest = "^8.0"
pytest-asyncio = ">=0.24.0"
pytest-xdist = "^3.6.1"
pytest-json-report = "^1.5.0"
python-dotenv = "*"
requests = "^2.32.3"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
norecursedirs = ["combined", "scripts"]

markers = [
    "test: standard test",
    "unit: Unit tests",
    "integration: Integration tests",
    "acceptance: Acceptance tests",
    "experimental: Experimental tests"
]
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
asyncio_default_fixture_loop_scope = "function"

# Entry point so the swarmauri SDK can discover this store as a plugin.
[tool.poetry.plugins."swarmauri.vector_stores"]
DuckDBVectorStore = "swarmauri_vectorstore_duckdb.DuckDBVectorStore:DuckDBVectorStore"
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
from typing import List, Literal, Dict, Any, Optional
|
|
2
|
+
import os
|
|
3
|
+
import json
|
|
4
|
+
import duckdb
|
|
5
|
+
from pydantic import Field, PrivateAttr
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from swarmauri_standard.vectors.Vector import Vector
|
|
9
|
+
from swarmauri_standard.documents.Document import Document
|
|
10
|
+
from swarmauri_embedding_doc2vec.Doc2VecEmbedding import Doc2VecEmbedding
|
|
11
|
+
from swarmauri_standard.distances.CosineDistance import CosineDistance
|
|
12
|
+
|
|
13
|
+
from swarmauri_base.vector_stores.VectorStoreBase import VectorStoreBase
|
|
14
|
+
from swarmauri_base.vector_stores.VectorStoreRetrieveMixin import (
|
|
15
|
+
VectorStoreRetrieveMixin,
|
|
16
|
+
)
|
|
17
|
+
from swarmauri_base.vector_stores.VectorStoreSaveLoadMixin import (
|
|
18
|
+
VectorStoreSaveLoadMixin,
|
|
19
|
+
)
|
|
20
|
+
from swarmauri_core.ComponentBase import ComponentBase
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@ComponentBase.register_type(VectorStoreBase, "DuckDBVectorStore")
class DuckDBVectorStore(
    VectorStoreSaveLoadMixin, VectorStoreRetrieveMixin, VectorStoreBase
):
    """A vector store implementation using DuckDB as the backend.

    Documents live in a single DuckDB table with columns ``id``, ``content``,
    ``embedding`` and ``metadata`` (JSON).  When ``database_name`` is
    ``":memory:"`` one long-lived connection is kept on the instance;
    otherwise a short-lived connection to the persisted database file is
    opened per operation, so no file handle is held between calls.
    """

    type: Literal["DuckDBVectorStore"] = "DuckDBVectorStore"
    database_name: str = Field(
        default=":memory:", description="Name of the DuckDB database"
    )
    table_name: str = Field(
        default="documents", description="Name of the table to store documents"
    )
    embed_dim: Optional[int] = Field(
        default=None, description="Dimension of the embedding vectors"
    )
    persist_dir: str = Field(
        default="./storage", description="Directory to persist the database"
    )

    # Private runtime state -- excluded from the pydantic model schema.
    _conn: Any = PrivateAttr(default=None)
    _is_initialized: bool = PrivateAttr(default=False)
    _database_path: Optional[str] = PrivateAttr(default=None)

    def __init__(self, **data):
        """Create the store and eagerly initialize the backing table."""
        super().__init__(**data)
        self._embedder = Doc2VecEmbedding()
        self._distance = CosineDistance()

        # Created even for in-memory databases so later file-backed use of
        # the same persist_dir cannot fail on a missing directory.
        if not os.path.exists(self.persist_dir):
            os.makedirs(self.persist_dir)

        if self.database_name == ":memory:":
            self._conn = duckdb.connect(self.database_name)
            self._setup_extensions(self._conn)
            self._initialize_table(self._conn)
        else:
            self._database_path = os.path.join(self.persist_dir, self.database_name)
            with duckdb.connect(self._database_path) as conn:
                self._setup_extensions(conn)
                self._initialize_table(conn)

    @staticmethod
    def _setup_extensions(conn) -> None:
        """Install and load the DuckDB extensions this store relies on."""
        conn.install_extension("json")
        conn.load_extension("json")
        conn.install_extension("fts")
        conn.load_extension("fts")

    @staticmethod
    def _cosine_similarity(vec1, vec2) -> float:
        """Return the cosine similarity of two vectors.

        Returns ``0.0`` when either vector has zero magnitude instead of
        letting the division produce a NaN (which would poison the sort
        in :meth:`retrieve`).
        """
        dot_product = np.dot(vec1, vec2)
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return 0.0
        return dot_product / norm_product

    def connect(self) -> None:
        """Connect to the DuckDB database and initialize if necessary.

        Safe to call after :meth:`disconnect`: for in-memory databases the
        closed connection is re-established (the original implementation
        would pass ``None`` to ``_initialize_table`` and crash).
        """
        if not self._is_initialized:
            if self.database_name == ":memory:":
                if self._conn is None:
                    # disconnect() closed and dropped the connection; a
                    # closed DuckDB connection cannot be reused.
                    self._conn = duckdb.connect(self.database_name)
                    self._setup_extensions(self._conn)
                self._initialize_table(self._conn)
            else:
                with duckdb.connect(self._database_path) as conn:
                    self._setup_extensions(conn)
                    self._initialize_table(conn)
            self._is_initialized = True

    def _initialize_table(self, conn) -> None:
        """Create the backing table if it does not already exist.

        ``embed_dim`` pins the embedding column to a fixed-size FLOAT array;
        otherwise a variable-length FLOAT[] column is used.
        """
        embed_dim_str = f"[{self.embed_dim}]" if self.embed_dim else "[]"
        conn.execute(
            f"""
        CREATE TABLE IF NOT EXISTS {self.table_name} (
            id VARCHAR PRIMARY KEY,
            content TEXT,
            embedding FLOAT{embed_dim_str},
            metadata JSON
        )"""
        )

    def disconnect(self) -> None:
        """Disconnect from the DuckDB database.

        Only in-memory stores hold a persistent connection; file-backed
        stores open connections per operation and need no teardown here.
        """
        if self._conn and self.database_name == ":memory:":
            self._conn.close()
            self._conn = None
        self._is_initialized = False

    def _prepare_document(self, document: Document) -> Dict[str, Any]:
        """Return a row dict (id/content/embedding/metadata) for *document*.

        If the document carries no embedding one is computed on the fly.
        NOTE(review): this fits the embedder on just this one document,
        which re-trains the shared Doc2Vec model -- embeddings produced
        earlier may no longer be comparable; confirm intended behavior.
        """
        if not document.embedding:
            self._embedder.fit([document.content])
            embedding = (
                self._embedder.transform([document.content])[0].to_numpy().tolist()
            )
        else:
            # Accept either a Vector wrapper or a raw list of floats.
            embedding = (
                document.embedding.value
                if isinstance(document.embedding, Vector)
                else document.embedding
            )

        return {
            "id": document.id,
            "content": document.content,
            "embedding": embedding,
            "metadata": json.dumps(document.metadata or {}),
        }

    def add_document(self, document: Document) -> None:
        """Insert *document*, replacing any existing row with the same id."""
        data = self._prepare_document(document)
        query = f"""
        INSERT OR REPLACE INTO {self.table_name} (id, content, embedding, metadata)
        VALUES (?, ?, ?, ?)
        """
        params = [data["id"], data["content"], data["embedding"], data["metadata"]]
        if self.database_name == ":memory:":
            self._conn.execute(query, params)
        else:
            with duckdb.connect(self._database_path) as conn:
                conn.execute(query, params)

    def add_documents(self, documents: List[Document]) -> None:
        """Batch-insert *documents*, replacing rows with duplicate ids.

        The embedder is fit once over all contents so the resulting
        embeddings share one vector space.
        """
        ids = [doc.id for doc in documents]
        contents = [doc.content for doc in documents]

        self._embedder.fit(contents)  # Fit the embedder once with all contents

        embeddings = [
            self._embedder.transform([doc.content])[0].to_numpy().tolist()
            for doc in documents
        ]
        metadatas = [json.dumps(doc.metadata or {}) for doc in documents]

        data_list = list(zip(ids, contents, embeddings, metadatas))

        query = f"""
        INSERT OR REPLACE INTO {self.table_name} (id, content, embedding, metadata)
        VALUES (?, ?, ?, ?)
        """

        if self.database_name == ":memory:":
            self._conn.executemany(query, data_list)
        else:
            with duckdb.connect(self._database_path) as conn:
                conn.executemany(query, data_list)

    def get_document(self, id: str) -> Optional[Document]:
        """Fetch the document with *id*, or ``None`` if absent.

        The stored embedding is not loaded back into the Document.
        """
        query = f"SELECT id, content, metadata FROM {self.table_name} WHERE id = ?"
        if self.database_name == ":memory:":
            result = self._conn.execute(query, [id]).fetchone()
        else:
            with duckdb.connect(self._database_path) as conn:
                result = conn.execute(query, [id]).fetchone()

        if result:
            return Document(
                id=result[0], content=result[1], metadata=json.loads(result[2])
            )
        return None

    def retrieve(self, query: str, top_k: int = 5) -> List[Document]:
        """Return the *top_k* documents most similar to *query*.

        Similarity is cosine similarity between the query embedding and
        every stored embedding, computed in Python (full table scan).
        NOTE(review): assumes the embedder has already been fit via
        add_document(s); calling this on a fresh store may fail upstream.
        """
        query_embedding = self._embedder.transform([query])[0].to_numpy().tolist()
        select_query = f"""
        SELECT id, content, metadata, embedding
        FROM {self.table_name}
        """

        if self.database_name == ":memory:":
            results = self._conn.execute(select_query).fetchall()
        else:
            with duckdb.connect(self._database_path) as conn:
                results = conn.execute(select_query).fetchall()

        # Score every row: (id, content, metadata, embedding, similarity).
        similarities = [
            (
                row[0],
                row[1],
                row[2],
                row[3],
                self._cosine_similarity(query_embedding, row[3]),
            )
            for row in results
        ]

        # Keep only the top-k rows by descending similarity.
        top_results = sorted(similarities, key=lambda x: x[4], reverse=True)[:top_k]

        return [
            Document(
                id=row[0],
                content=row[1],
                metadata=json.loads(row[2]),
                embedding=Vector(value=row[3]),
            )
            for row in top_results
        ]

    def delete_document(self, id: str) -> None:
        """Delete the document with *id*; a missing id is a silent no-op."""
        query = f"DELETE FROM {self.table_name} WHERE id = ?"
        if self.database_name == ":memory:":
            self._conn.execute(query, [id])
        else:
            with duckdb.connect(self._database_path) as conn:
                conn.execute(query, [id])

    def get_all_documents(self) -> List[Document]:
        """Return every stored document (embeddings are not loaded)."""
        query = f"SELECT id, content, metadata FROM {self.table_name}"
        if self.database_name == ":memory:":
            results = self._conn.execute(query).fetchall()
        else:
            with duckdb.connect(self._database_path) as conn:
                results = conn.execute(query).fetchall()

        return [
            Document(id=row[0], content=row[1], metadata=json.loads(row[2]))
            for row in results
        ]

    def update_document(self, id: str, new_document: Document) -> None:
        """Update an existing document in the DuckDB store.

        Raises:
            RuntimeError: if preparing the document or executing the
                UPDATE fails; the original exception is chained as the
                cause for easier debugging.
        """
        try:
            data = self._prepare_document(new_document)
            query = f"""
            UPDATE {self.table_name}
            SET content = ?, embedding = ?, metadata = ?
            WHERE id = ?
            """
            params = [data["content"], data["embedding"], data["metadata"], id]
            if self.database_name == ":memory:":
                self._conn.execute(query, params)
            else:
                with duckdb.connect(self._database_path) as conn:
                    conn.execute(query, params)
        except Exception as e:
            # Chain the cause so the underlying DuckDB error is preserved.
            raise RuntimeError(f"Failed to update document {id}: {str(e)}") from e

    @classmethod
    def from_local(cls, database_path: str, table_name: str = "documents", **kwargs):
        """Load a DuckDBVectorStore from a local file path.

        Splits *database_path* into persist_dir + database_name and
        delegates to the normal constructor.
        """
        database_name = os.path.basename(database_path)
        persist_dir = os.path.dirname(database_path)
        return cls(
            database_name=database_name,
            table_name=table_name,
            persist_dir=persist_dir,
            **kwargs,
        )

    def model_dump_json(self, *args, **kwargs) -> str:
        """Override model_dump_json to ensure connection is closed before serialization."""
        self.disconnect()
        return super().model_dump_json(*args, **kwargs)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .DuckDBVectorStore import DuckDBVectorStore

# Explicit public API of the package.
__all__ = ["DuckDBVectorStore"]

# Keep in sync with pyproject.toml (was "0.6.0.dev26", contradicting the
# "0.6.0.dev154" version declared in the package metadata).
__version__ = "0.6.0.dev154"
__long_desc__ = """

# Swarmauri DuckDB Plugin

This repository includes a DuckDB vector store implementation of a Swarmauri Plugin.

Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk

"""
|