swarmauri_vectorstore_pinecone 0.6.0.dev154__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.3
2
+ Name: swarmauri_vectorstore_pinecone
3
+ Version: 0.6.0.dev154
4
+ Summary: Swarmauri Pinecone Vector Store
5
+ License: Apache-2.0
6
+ Author: Jacob Stewart
7
+ Author-email: jacob@swarmauri.com
8
+ Requires-Python: >=3.10,<3.13
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Requires-Dist: pinecone-client[grpc] (>=5.0.1,<6.0.0)
15
+ Requires-Dist: swarmauri_base (>=0.6.0.dev154,<0.7.0)
16
+ Requires-Dist: swarmauri_core (>=0.6.0.dev154,<0.7.0)
17
+ Requires-Dist: swarmauri_embedding_doc2vec (>=0.6.0.dev154,<0.7.0)
18
+ Project-URL: Repository, http://github.com/swarmauri/swarmauri-sdk
19
+ Description-Content-Type: text/markdown
20
+
21
+ # Swarmauri Pinecone Vector Store
@@ -0,0 +1 @@
1
+ # Swarmauri Pinecone Vector Store
@@ -0,0 +1,57 @@
1
+ [tool.poetry]
2
+ name = "swarmauri_vectorstore_pinecone"
3
+ version = "0.6.0.dev154"
4
+ description = "Swarmauri Pinecone Vector Store"
5
+ authors = ["Jacob Stewart <jacob@swarmauri.com>"]
6
+ license = "Apache-2.0"
7
+ readme = "README.md"
8
+ repository = "http://github.com/swarmauri/swarmauri-sdk"
9
+ classifiers = [
10
+ "License :: OSI Approved :: Apache Software License",
11
+ "Programming Language :: Python :: 3.10",
12
+ "Programming Language :: Python :: 3.11",
13
+ "Programming Language :: Python :: 3.12"
14
+ ]
15
+
16
+ [tool.poetry.dependencies]
17
+ python = ">=3.10,<3.13"
18
+
19
+ # Swarmauri
20
+ swarmauri_core = {version = "^0.6.0.dev154"}
21
+ swarmauri_base = {version = "^0.6.0.dev154"}
22
+ swarmauri_embedding_doc2vec = {version = "^0.6.0.dev154"}
23
+
24
+ # Dependencies
25
+ pinecone-client = { version = "^5.0.1", extras = ["grpc"] }
26
+
27
+ [tool.poetry.group.dev.dependencies]
28
+ flake8 = "^7.0"
29
+ pytest = "^8.0"
30
+ pytest-asyncio = ">=0.24.0"
31
+ pytest-xdist = "^3.6.1"
32
+ pytest-json-report = "^1.5.0"
33
+ python-dotenv = "*"
34
+ requests = "^2.32.3"
35
+
36
+ [build-system]
37
+ requires = ["poetry-core>=1.0.0"]
38
+ build-backend = "poetry.core.masonry.api"
39
+
40
+ [tool.pytest.ini_options]
41
+ norecursedirs = ["combined", "scripts"]
42
+
43
+ markers = [
44
+ "test: standard test",
45
+ "unit: Unit tests",
46
+ "integration: Integration tests",
47
+ "acceptance: Acceptance tests",
48
+ "experimental: Experimental tests"
49
+ ]
50
+ log_cli = true
51
+ log_cli_level = "INFO"
52
+ log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
53
+ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
54
+ asyncio_default_fixture_loop_scope = "function"
55
+
56
+ [tool.poetry.plugins."swarmauri.vector_stores"]
57
+ PineconeVectorStore = "swarmauri_vectorstore_pinecone.PineconeVectorStore:PineconeVectorStore"
@@ -0,0 +1,372 @@
1
+ import numpy as np
2
+ from typing import List, Union, Literal, Dict, Any, Optional
3
+
4
+ from pinecone.grpc import PineconeGRPC as Pinecone
5
+ from pinecone import ServerlessSpec
6
+
7
+ from swarmauri.documents.concrete.Document import Document
8
+ from swarmauri_embedding_doc2vec.Doc2VecEmbedding import Doc2VecEmbedding
9
+ from swarmauri.distances.concrete.CosineDistance import CosineDistance
10
+
11
+ from swarmauri.vector_stores.base.VectorStoreBase import VectorStoreBase
12
+ from swarmauri.vector_stores.base.VectorStoreRetrieveMixin import (
13
+ VectorStoreRetrieveMixin,
14
+ )
15
+ from swarmauri.vector_stores.base.VectorStoreCloudMixin import (
16
+ VectorStoreCloudMixin,
17
+ )
18
+ from swarmauri.vector_stores.base.VectorStoreSaveLoadMixin import (
19
+ VectorStoreSaveLoadMixin,
20
+ )
21
+ from swarmauri_core.ComponentBase import ComponentBase
22
+
23
+
24
+ @ComponentBase.register_type(VectorStoreBase, "PineconeVectorStore")
25
+ class PineconeVectorStore(
26
+ VectorStoreRetrieveMixin,
27
+ VectorStoreCloudMixin,
28
+ VectorStoreSaveLoadMixin,
29
+ VectorStoreBase,
30
+ ):
31
+ """
32
+ A vector store implementation using Pinecone as the backend.
33
+
34
+ This class provides methods to interact with a Pinecone index, including
35
+ adding, retrieving, updating, and deleting documents, as well as performing
36
+ similarity searches.
37
+ """
38
+
39
+ type: Literal["PineconeVectorStore"] = "PineconeVectorStore"
40
+
41
+ def __init__(self, **kwargs):
42
+ """
43
+ Initialize the PineconeVectorStore.
44
+ Args:
45
+ **kwargs: Additional keyword arguments.
46
+ """
47
+ super().__init__(**kwargs)
48
+ self._embedder = Doc2VecEmbedding(vector_size=self.vector_size)
49
+ self._distance = CosineDistance()
50
+
51
+ def delete(self):
52
+ """
53
+ Delete the Pinecone index if it exists.
54
+
55
+ """
56
+ try:
57
+ pc = Pinecone(api_key=self.api_key)
58
+ pc.delete_index(self.collection_name)
59
+ self.client = None
60
+ except Exception as e:
61
+ raise RuntimeError(
62
+ f"Failed to delete index {self.collection_name}: {str(e)}"
63
+ )
64
+
65
+ def connect(self, metric: Optional[str] = "cosine", cloud: Optional[str] = "aws", region: Optional[str] = "us-east-1"):
66
+ """
67
+ Connect to the Pinecone index, creating it if it doesn't exist.
68
+
69
+ Args:
70
+ metric (Optional[str]): The distance metric to use. Defaults to "cosine".
71
+ cloud (Optional[str]): The cloud provider to use. Defaults to "aws".
72
+ region (Optional[str]): The region to use. Defaults to "us-east-1".
73
+
74
+ """
75
+ try:
76
+ pc = Pinecone(api_key=self.api_key)
77
+ if not pc.has_index(self.collection_name):
78
+ pc.create_index(
79
+ name=self.collection_name,
80
+ dimension=self.vector_size,
81
+ metric=metric,
82
+ spec=ServerlessSpec(
83
+ cloud=cloud,
84
+ region=region,
85
+ ),
86
+ )
87
+ self.client = pc.Index(self.collection_name)
88
+ except Exception as e:
89
+ raise RuntimeError(
90
+ f"Failed to connect to Pinecone index {self.collection_name}: {str(e)}"
91
+ )
92
+
93
+ def disconnect(self):
94
+ """
95
+ Disconnect from the Pinecone index.
96
+
97
+ """
98
+ try:
99
+ self.client = None
100
+ except Exception as e:
101
+ raise RuntimeError(f"Error during disconnecting: {str(e)}")
102
+
103
+ def _prepare_vector(self, document: Document) -> Dict[str, Any]:
104
+ """
105
+ Prepare a vector for insertion into the Pinecone index.
106
+
107
+ Args:
108
+ document (Document): The document to prepare.
109
+
110
+ Returns:
111
+ Dict[str, Any]: A dictionary containing the prepared vector data.
112
+ """
113
+ embedding = None
114
+ if not document.embedding:
115
+ self._embedder.fit([document.content])
116
+ embedding = (
117
+ self._embedder.transform([document.content])[0].to_numpy().tolist()
118
+ )
119
+ else:
120
+ embedding = document.embedding
121
+
122
+ document.metadata["content"] = document.content
123
+ return {"id": document.id, "values": embedding, "metadata": document.metadata}
124
+
125
+ def add_document(self, document: Document, namespace: Optional[str] = "") -> None:
126
+ """
127
+ Add a single document to the Pinecone index.
128
+
129
+ Args:
130
+ document (Document): The document to add.
131
+ namespace (Optional[str]): The namespace to add the document to. Defaults to "".
132
+ """
133
+ try:
134
+ vector = self._prepare_vector(document)
135
+ self.client.upsert(vectors=[vector], namespace=namespace)
136
+ except Exception as e:
137
+ raise RuntimeError(f"Failed to add document {document.id}: {str(e)}")
138
+
139
+ def add_documents(
140
+ self,
141
+ documents: List[Document],
142
+ namespace: Optional[str] = "",
143
+ batch_size: int = 200,
144
+ ) -> None:
145
+ """
146
+ Add multiple documents to the Pinecone index in batches.
147
+
148
+ Args:
149
+ documents (List[Document]): The list of documents to add.
150
+ namespace (Optional[str]): The namespace to add the documents to. Defaults to "".
151
+ batch_size (int): The number of documents to add in each batch. Defaults to 200.
152
+
153
+ """
154
+ if batch_size <= 0 or batch_size > 1000:
155
+ raise ValueError("Batch size must be between 1 and 1000.")
156
+
157
+ vectors = [self._prepare_vector(doc) for doc in documents]
158
+ for i in range(0, len(vectors), batch_size):
159
+ batch_vectors = vectors[i : i + batch_size]
160
+ try:
161
+ self.client.upsert(vectors=batch_vectors, namespace=namespace)
162
+ except Exception as e:
163
+ raise RuntimeError(
164
+ f"Error during batch upsert. Consider lowering batch size: {str(e)}"
165
+ )
166
+
167
+ def get_document(
168
+ self, id: str, namespace: Optional[str] = ""
169
+ ) -> Union[Document, None]:
170
+ """
171
+ Retrieve a single document from the Pinecone index by its ID.
172
+
173
+ Args:
174
+ id (str): The ID of the document to retrieve.
175
+ namespace (Optional[str]): The namespace to search in. Defaults to "".
176
+
177
+ Returns:
178
+ Union[Document, None]: The retrieved document, or None if not found.
179
+
180
+ """
181
+ try:
182
+ result = self.client.fetch(ids=[id], namespace=namespace)
183
+ if id in result["vectors"]:
184
+ vector = result["vectors"][id]
185
+ return Document(
186
+ id=id,
187
+ content=vector["metadata"].get("content", ""),
188
+ metadata=vector["metadata"] or {},
189
+ )
190
+ return None
191
+ except Exception as e:
192
+ raise RuntimeError(f"Failed to get document {id}: {str(e)}")
193
+
194
+ def _get_ids_from_query(self, input_vector):
195
+ """
196
+ Get document IDs from a query vector.
197
+
198
+ Args:
199
+ input_vector: The input vector to query.
200
+
201
+ """
202
+ results = self.client.query(
203
+ vector=input_vector, top_k=10000, include_values=False
204
+ )
205
+ return {result["id"] for result in results["matches"]}
206
+
207
+ def _get_all_ids_from_client(self, namespace: Optional[str] = ""):
208
+ """
209
+ Get all document IDs from the Pinecone index.
210
+
211
+ Args:
212
+ namespace (Optional[str]): The namespace to search in. Defaults to "".
213
+
214
+ Returns:
215
+ set: A set of all document IDs in the index.
216
+ """
217
+ num_vectors = self.client.describe_index_stats()["namespaces"][namespace][
218
+ "vector_count"
219
+ ]
220
+ all_ids = set()
221
+ while len(all_ids) < num_vectors:
222
+ input_vector = np.random.rand(self.vector_size).tolist()
223
+ ids = self._get_ids_from_query(input_vector)
224
+ all_ids.update(ids)
225
+ return all_ids
226
+
227
+ def get_all_documents(self, namespace: Optional[str] = "") -> List[Document]:
228
+ """
229
+ Retrieve all documents from the Pinecone index.
230
+
231
+ Args:
232
+ namespace (Optional[str]): The namespace to search in. Defaults to "".
233
+
234
+ Returns:
235
+ List[Document]: A list of all documents in the index.
236
+
237
+ """
238
+ try:
239
+ documents = []
240
+ id_list = list(self._get_all_ids_from_client(namespace))
241
+ batch_size = min(len(id_list), 1000)
242
+ for i in range(0, len(id_list), batch_size):
243
+ batch_ids = id_list[i : i + batch_size]
244
+ result = self.client.fetch(ids=batch_ids, namespace=namespace)
245
+ for id, vector in result["vectors"].items():
246
+ documents.append(
247
+ Document(
248
+ id=id,
249
+ content=vector["metadata"].get("content", ""),
250
+ metadata=vector["metadata"] or {},
251
+ )
252
+ )
253
+ return documents
254
+ except Exception as e:
255
+ raise RuntimeError(f"Failed to get all documents: {str(e)}")
256
+
257
+ def delete_document(self, id: str, namespace: Optional[str] = "") -> None:
258
+ """
259
+ Delete a single document from the Pinecone index.
260
+
261
+ Args:
262
+ id (str): The ID of the document to delete.
263
+ namespace (Optional[str]): The namespace to delete from. Defaults to "".
264
+
265
+ """
266
+ try:
267
+ self.client.delete(ids=[id], namespace=namespace)
268
+ except Exception as e:
269
+ raise RuntimeError(f"Failed to delete document {id}: {str(e)}")
270
+
271
+ def clear_documents(self, namespace: Optional[str] = "") -> None:
272
+ """
273
+ Delete all documents from the Pinecone index in a given namespace.
274
+
275
+ Args:
276
+ namespace (Optional[str]): The namespace to clear. Defaults to "".
277
+
278
+ """
279
+ try:
280
+ self.client.delete(delete_all=True, namespace=namespace)
281
+ except Exception as e:
282
+ raise RuntimeError(
283
+ f"Failed to clear documents in namespace {namespace}: {str(e)}"
284
+ )
285
+
286
+ def update_document(
287
+ self, id: str, document: Document, namespace: Optional[str] = ""
288
+ ) -> None:
289
+ """
290
+ Update a document in the Pinecone index.
291
+
292
+ Args:
293
+ id (str): The ID of the document to update.
294
+ document (Document): The updated document.
295
+ namespace (Optional[str]): The namespace of the document. Defaults to "".
296
+
297
+ """
298
+ try:
299
+ embedding = (
300
+ self._embedder.transform([document.content])[0].to_numpy().tolist()
301
+ )
302
+ document.metadata["content"] = document.content
303
+ self.client.update(
304
+ id=id,
305
+ values=embedding,
306
+ set_metadata=document.metadata,
307
+ namespace=namespace,
308
+ )
309
+ except Exception as e:
310
+ raise RuntimeError(f"Failed to update document {id}: {str(e)}")
311
+
312
+ def document_count(self, namespace: Optional[str] = "") -> int:
313
+ """
314
+ Get the number of documents in the Pinecone index.
315
+
316
+ Args:
317
+ namespace (Optional[str]): The namespace to count documents in. Defaults to "".
318
+
319
+ Returns:
320
+ int: The number of documents in the index.
321
+
322
+ """
323
+ try:
324
+ return self.client.describe_index_stats()["namespaces"][namespace][
325
+ "vector_count"
326
+ ]
327
+ except Exception as e:
328
+ raise RuntimeError(
329
+ f"Failed to get document count for namespace {namespace}: {str(e)}"
330
+ )
331
+
332
+ def retrieve(
333
+ self, query: str, top_k: int = 5, namespace: Optional[str] = ""
334
+ ) -> List[Document]:
335
+ """
336
+ Retrieve documents from the Pinecone index based on a query string.
337
+
338
+ Args:
339
+ query (str): The query string to search for.
340
+ top_k (int): The number of results to return. Defaults to 5.
341
+ namespace (Optional[str]): The namespace to search in. Defaults to "".
342
+
343
+ Returns:
344
+ List[Document]: A list of retrieved documents.
345
+
346
+ """
347
+ try:
348
+ query_embedding = self._embedder.infer_vector(query).value
349
+ results = self.client.query(
350
+ vector=query_embedding,
351
+ top_k=top_k,
352
+ namespace=namespace,
353
+ include_metadata=True,
354
+ )
355
+ return [
356
+ Document(
357
+ id=match["id"],
358
+ content=match["metadata"].get("content", ""),
359
+ metadata=match["metadata"] or {},
360
+ )
361
+ for match in results["matches"]
362
+ ]
363
+ except Exception as e:
364
+ raise RuntimeError(f"Failed to retrieve documents: {str(e)}")
365
+
366
+ # Override the model_dump_json method
367
+ def model_dump_json(self, *args, **kwargs) -> str:
368
+ # Call the disconnect method before serialization
369
+ self.disconnect()
370
+
371
+ # Now proceed with the usual JSON serialization
372
+ return super().model_dump_json(*args, **kwargs)
@@ -0,0 +1,13 @@
1
+ from .PineconeVectorStore import PineconeVectorStore
2
+
3
from importlib.metadata import PackageNotFoundError, version

try:
    # Read the version from the installed distribution so it cannot drift from
    # the packaged metadata the way a hand-maintained literal does.
    __version__ = version("swarmauri_vectorstore_pinecone")
except PackageNotFoundError:
    # Not installed as a distribution (e.g. running from a source checkout):
    # fall back to the version this source tree was released as.
    __version__ = "0.6.0.dev154"

# Long description of the plugin (Markdown).
__long_desc__ = """

# Swarmauri Pinecone VectorStore Plugin


Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk

"""