swarmauri_vectorstore_duckdb 0.6.0.dev154__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.3
2
+ Name: swarmauri_vectorstore_duckdb
3
+ Version: 0.6.0.dev154
4
+ Summary: A DuckDB based Vector Store
5
+ License: Apache-2.0
6
+ Author: Jacob Stewart
7
+ Author-email: jacob@swarmauri.com
8
+ Requires-Python: >=3.10,<3.13
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Requires-Dist: duckdb (>=1.1.1,<2.0.0)
15
+ Requires-Dist: swarmauri_base (>=0.6.0.dev154,<0.7.0)
16
+ Requires-Dist: swarmauri_core (>=0.6.0.dev154,<0.7.0)
17
+ Requires-Dist: swarmauri_embedding_doc2vec (>=0.6.0.dev154,<0.7.0)
18
+ Project-URL: Repository, https://github.com/swarmauri/swarmauri-sdk
19
+ Description-Content-Type: text/markdown
20
+
21
+ # Swarmauri DuckDB Vector Store
@@ -0,0 +1 @@
1
+ # Swarmauri DuckDB Vector Store
@@ -0,0 +1,58 @@
1
+ [tool.poetry]
2
+ name = "swarmauri_vectorstore_duckdb"
3
+ version = "0.6.0.dev154"
4
+ description = "A DuckDB based Vector Store"
5
+ authors = ["Jacob Stewart <jacob@swarmauri.com>"]
6
+ license = "Apache-2.0"
7
+ readme = "README.md"
8
+ repository = "https://github.com/swarmauri/swarmauri-sdk"
9
+ classifiers = [
10
+ "License :: OSI Approved :: Apache Software License",
11
+ "Programming Language :: Python :: 3.10",
12
+ "Programming Language :: Python :: 3.11",
13
+ "Programming Language :: Python :: 3.12"
14
+ ]
15
+
16
+ [tool.poetry.dependencies]
17
+ python = ">=3.10,<3.13"
18
+
19
+ # Swarmauri
20
+ swarmauri_core = {version = "^0.6.0.dev154"}
21
+ swarmauri_base = {version = "^0.6.0.dev154"}
22
+ swarmauri_embedding_doc2vec = {version = "^0.6.0.dev154"}
23
+
24
+ # Dependencies
25
+ duckdb = "^1.1.1"
26
+
27
+
28
+ [tool.poetry.group.dev.dependencies]
29
+ flake8 = "^7.0"
30
+ pytest = "^8.0"
31
+ pytest-asyncio = ">=0.24.0"
32
+ pytest-xdist = "^3.6.1"
33
+ pytest-json-report = "^1.5.0"
34
+ python-dotenv = "*"
35
+ requests = "^2.32.3"
36
+
37
+ [build-system]
38
+ requires = ["poetry-core>=1.0.0"]
39
+ build-backend = "poetry.core.masonry.api"
40
+
41
+ [tool.pytest.ini_options]
42
+ norecursedirs = ["combined", "scripts"]
43
+
44
+ markers = [
45
+ "test: standard test",
46
+ "unit: Unit tests",
47
+ "integration: Integration tests",
48
+ "acceptance: Acceptance tests",
49
+ "experimental: Experimental tests"
50
+ ]
51
+ log_cli = true
52
+ log_cli_level = "INFO"
53
+ log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
54
+ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
55
+ asyncio_default_fixture_loop_scope = "function"
56
+
57
+ [tool.poetry.plugins."swarmauri.vector_stores"]
58
+ DuckDBVectorStore = "swarmauri_vectorstore_duckdb.DuckDBVectorStore:DuckDBVectorStore"
@@ -0,0 +1,283 @@
1
+ from typing import List, Literal, Dict, Any, Optional
2
+ import os
3
+ import json
4
+ import duckdb
5
+ from pydantic import Field, PrivateAttr
6
+ import numpy as np
7
+
8
+ from swarmauri_standard.vectors.Vector import Vector
9
+ from swarmauri_standard.documents.Document import Document
10
+ from swarmauri_embedding_doc2vec.Doc2VecEmbedding import Doc2VecEmbedding
11
+ from swarmauri_standard.distances.CosineDistance import CosineDistance
12
+
13
+ from swarmauri_base.vector_stores.VectorStoreBase import VectorStoreBase
14
+ from swarmauri_base.vector_stores.VectorStoreRetrieveMixin import (
15
+ VectorStoreRetrieveMixin,
16
+ )
17
+ from swarmauri_base.vector_stores.VectorStoreSaveLoadMixin import (
18
+ VectorStoreSaveLoadMixin,
19
+ )
20
+ from swarmauri_core.ComponentBase import ComponentBase
21
+
22
+
23
@ComponentBase.register_type(VectorStoreBase, "DuckDBVectorStore")
class DuckDBVectorStore(
    VectorStoreSaveLoadMixin, VectorStoreRetrieveMixin, VectorStoreBase
):
    """A vector store implementation using DuckDB as the backend.

    Documents are stored in a single table (``table_name``) holding the
    document id, raw content, embedding vector, and JSON-serialized
    metadata.  Two connection strategies are used:

    - ``database_name == ":memory:"``: one long-lived connection is kept in
      ``_conn`` for the life of the store (an in-memory DB vanishes when its
      connection closes).
    - file-backed database: a short-lived connection is opened per operation
      so the database file is never left locked between calls.
    """

    type: Literal["DuckDBVectorStore"] = "DuckDBVectorStore"
    database_name: str = Field(
        default=":memory:", description="Name of the DuckDB database"
    )
    table_name: str = Field(
        default="documents", description="Name of the table to store documents"
    )
    embed_dim: Optional[int] = Field(
        default=None, description="Dimension of the embedding vectors"
    )
    persist_dir: str = Field(
        default="./storage", description="Directory to persist the database"
    )

    # Private runtime state, excluded from the pydantic model schema.
    _conn: Any = PrivateAttr(default=None)  # long-lived conn for :memory: only
    _is_initialized: bool = PrivateAttr(default=False)
    _database_path: Optional[str] = PrivateAttr(default=None)

    def __init__(self, **data):
        """Initialize the store, create the persist directory, and set up the table."""
        super().__init__(**data)
        self._embedder = Doc2VecEmbedding()
        self._distance = CosineDistance()

        # exist_ok avoids a race between an existence check and creation.
        os.makedirs(self.persist_dir, exist_ok=True)

        if self.database_name == ":memory:":
            self._conn = duckdb.connect(self.database_name)
            self._setup_extensions(self._conn)
            self._initialize_table(self._conn)
        else:
            self._database_path = os.path.join(self.persist_dir, self.database_name)
            with duckdb.connect(self._database_path) as conn:
                self._setup_extensions(conn)
                self._initialize_table(conn)

    @staticmethod
    def _setup_extensions(conn):
        """Install and load the DuckDB extensions (json, fts) this store relies on."""
        conn.install_extension("json")
        conn.load_extension("json")
        conn.install_extension("fts")
        conn.load_extension("fts")

    @staticmethod
    def _cosine_similarity(vec1, vec2):
        """Return the cosine similarity between two vectors.

        Returns 0.0 when either vector has zero magnitude, instead of
        raising a division-by-zero error or producing NaN.
        """
        dot_product = np.dot(vec1, vec2)
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return 0.0
        return dot_product / norm_product

    def connect(self) -> None:
        """Connect to the DuckDB database and initialize the table if necessary."""
        if not self._is_initialized:
            if self.database_name == ":memory:":
                # disconnect() may have closed and cleared the in-memory
                # connection; re-open it before use.  A fresh in-memory DB is
                # empty, so the table is (re)created below.
                if self._conn is None:
                    self._conn = duckdb.connect(self.database_name)
                    self._setup_extensions(self._conn)
                self._initialize_table(self._conn)
            else:
                with duckdb.connect(self._database_path) as conn:
                    self._setup_extensions(conn)
                    self._initialize_table(conn)
            self._is_initialized = True

    def _initialize_table(self, conn) -> None:
        """Create the documents table if it does not exist.

        The embedding column is a fixed-size array when ``embed_dim`` is set,
        otherwise a variable-length FLOAT list.
        """
        embed_dim_str = f"[{self.embed_dim}]" if self.embed_dim else "[]"
        conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self.table_name} (
                id VARCHAR PRIMARY KEY,
                content TEXT,
                embedding FLOAT{embed_dim_str},
                metadata JSON
            )"""
        )

    def disconnect(self) -> None:
        """Disconnect from the DuckDB database.

        Only the in-memory connection is held open between calls, so only it
        needs closing; file-backed connections are per-operation.
        """
        if self._conn and self.database_name == ":memory:":
            self._conn.close()
            self._conn = None
        self._is_initialized = False

    def _prepare_document(self, document: Document) -> Dict[str, Any]:
        """Return a row dict (id/content/embedding/metadata) ready for insertion.

        Computes an embedding from the document content when none is present;
        otherwise unwraps a ``Vector`` or uses the raw embedding as-is.
        """
        if not document.embedding:
            self._embedder.fit([document.content])
            embedding = (
                self._embedder.transform([document.content])[0].to_numpy().tolist()
            )
        else:
            embedding = (
                document.embedding.value
                if isinstance(document.embedding, Vector)
                else document.embedding
            )

        return {
            "id": document.id,
            "content": document.content,
            "embedding": embedding,
            "metadata": json.dumps(document.metadata or {}),
        }

    def add_document(self, document: Document) -> None:
        """Insert a single document, replacing any existing row with the same id."""
        data = self._prepare_document(document)
        query = f"""
            INSERT OR REPLACE INTO {self.table_name} (id, content, embedding, metadata)
            VALUES (?, ?, ?, ?)
        """
        if self.database_name == ":memory:":
            self._conn.execute(
                query,
                [data["id"], data["content"], data["embedding"], data["metadata"]],
            )
        else:
            with duckdb.connect(self._database_path) as conn:
                conn.execute(
                    query,
                    [data["id"], data["content"], data["embedding"], data["metadata"]],
                )

    def add_documents(self, documents: List[Document]) -> None:
        """Insert a batch of documents, replacing rows with matching ids.

        The embedder is fitted once on the full corpus so all embeddings come
        from the same model, then applied per document.
        """
        ids = [doc.id for doc in documents]
        contents = [doc.content for doc in documents]

        self._embedder.fit(contents)  # fit once with all contents

        embeddings = [
            self._embedder.transform([doc.content])[0].to_numpy().tolist()
            for doc in documents
        ]
        metadatas = [json.dumps(doc.metadata or {}) for doc in documents]

        data_list = list(zip(ids, contents, embeddings, metadatas))

        query = f"""
            INSERT OR REPLACE INTO {self.table_name} (id, content, embedding, metadata)
            VALUES (?, ?, ?, ?)
        """

        if self.database_name == ":memory:":
            self._conn.executemany(query, data_list)
        else:
            with duckdb.connect(self._database_path) as conn:
                conn.executemany(query, data_list)

    def get_document(self, id: str) -> Optional[Document]:
        """Return the document with the given id, or None if absent."""
        query = f"SELECT id, content, metadata FROM {self.table_name} WHERE id = ?"
        if self.database_name == ":memory:":
            result = self._conn.execute(query, [id]).fetchone()
        else:
            with duckdb.connect(self._database_path) as conn:
                result = conn.execute(query, [id]).fetchone()

        if result:
            return Document(
                id=result[0], content=result[1], metadata=json.loads(result[2])
            )
        return None

    def retrieve(self, query: str, top_k: int = 5) -> List[Document]:
        """Return the ``top_k`` documents most similar to ``query``.

        Similarity is computed in Python (cosine similarity against every
        stored embedding) rather than in SQL, so all rows are fetched.
        """
        query_embedding = self._embedder.transform([query])[0].to_numpy().tolist()
        select_query = f"""
            SELECT id, content, metadata, embedding
            FROM {self.table_name}
        """

        if self.database_name == ":memory:":
            results = self._conn.execute(select_query).fetchall()
        else:
            with duckdb.connect(self._database_path) as conn:
                results = conn.execute(select_query).fetchall()

        # Score every stored row against the query embedding.
        similarities = [
            (
                row[0],
                row[1],
                row[2],
                row[3],
                self._cosine_similarity(query_embedding, row[3]),
            )
            for row in results
        ]

        # Keep the top-k rows by descending similarity.
        top_results = sorted(similarities, key=lambda x: x[4], reverse=True)[:top_k]

        return [
            Document(
                id=row[0],
                content=row[1],
                metadata=json.loads(row[2]),
                embedding=Vector(value=row[3]),
            )
            for row in top_results
        ]

    def delete_document(self, id: str) -> None:
        """Delete the document with the given id (no-op if it does not exist)."""
        query = f"DELETE FROM {self.table_name} WHERE id = ?"
        if self.database_name == ":memory:":
            self._conn.execute(query, [id])
        else:
            with duckdb.connect(self._database_path) as conn:
                conn.execute(query, [id])

    def get_all_documents(self) -> List[Document]:
        """Return every stored document (without embeddings)."""
        query = f"SELECT id, content, metadata FROM {self.table_name}"
        if self.database_name == ":memory:":
            results = self._conn.execute(query).fetchall()
        else:
            with duckdb.connect(self._database_path) as conn:
                results = conn.execute(query).fetchall()

        return [
            Document(id=row[0], content=row[1], metadata=json.loads(row[2]))
            for row in results
        ]

    def update_document(self, id: str, new_document: Document) -> None:
        """Update an existing document in the DuckDB store.

        Raises:
            RuntimeError: if preparation or the UPDATE fails; the original
                exception is chained as the cause.
        """
        try:
            data = self._prepare_document(new_document)
            query = f"""
                UPDATE {self.table_name}
                SET content = ?, embedding = ?, metadata = ?
                WHERE id = ?
            """
            if self.database_name == ":memory:":
                self._conn.execute(
                    query, [data["content"], data["embedding"], data["metadata"], id]
                )
            else:
                with duckdb.connect(self._database_path) as conn:
                    conn.execute(
                        query,
                        [data["content"], data["embedding"], data["metadata"], id],
                    )
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise RuntimeError(f"Failed to update document {id}: {str(e)}") from e

    @classmethod
    def from_local(cls, database_path: str, table_name: str = "documents", **kwargs):
        """Load a DuckDBVectorStore from a local file path.

        Splits ``database_path`` into ``persist_dir`` and ``database_name``.
        """
        database_name = os.path.basename(database_path)
        persist_dir = os.path.dirname(database_path)
        return cls(
            database_name=database_name,
            table_name=table_name,
            persist_dir=persist_dir,
            **kwargs,
        )

    def model_dump_json(self, *args, **kwargs) -> str:
        """Override model_dump_json to ensure the connection is closed before serialization."""
        self.disconnect()
        return super().model_dump_json(*args, **kwargs)
@@ -0,0 +1,14 @@
1
from .DuckDBVectorStore import DuckDBVectorStore

# Public API of this package.
__all__ = ["DuckDBVectorStore"]

# Keep in sync with the version declared in pyproject.toml (0.6.0.dev154);
# the previous value (0.6.0.dev26) had drifted from the released version.
__version__ = "0.6.0.dev154"
__long_desc__ = """

# Swarmauri DuckDB Plugin

This repository includes a DuckDB based Vector Store plugin for Swarmauri.

Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk

"""