ai-parrot 0.8.3__cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic. Click here for more details.
- ai_parrot-0.8.3.dist-info/LICENSE +21 -0
- ai_parrot-0.8.3.dist-info/METADATA +306 -0
- ai_parrot-0.8.3.dist-info/RECORD +128 -0
- ai_parrot-0.8.3.dist-info/WHEEL +6 -0
- ai_parrot-0.8.3.dist-info/top_level.txt +2 -0
- parrot/__init__.py +30 -0
- parrot/bots/__init__.py +5 -0
- parrot/bots/abstract.py +1115 -0
- parrot/bots/agent.py +492 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/bose.py +17 -0
- parrot/bots/chatbot.py +271 -0
- parrot/bots/cody.py +17 -0
- parrot/bots/copilot.py +117 -0
- parrot/bots/data.py +730 -0
- parrot/bots/dataframe.py +103 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/interfaces/__init__.py +1 -0
- parrot/bots/interfaces/retrievers.py +12 -0
- parrot/bots/notebook.py +619 -0
- parrot/bots/odoo.py +17 -0
- parrot/bots/prompts/__init__.py +41 -0
- parrot/bots/prompts/agents.py +91 -0
- parrot/bots/prompts/data.py +214 -0
- parrot/bots/retrievals/__init__.py +1 -0
- parrot/bots/retrievals/constitutional.py +19 -0
- parrot/bots/retrievals/multi.py +122 -0
- parrot/bots/retrievals/retrieval.py +610 -0
- parrot/bots/tools/__init__.py +7 -0
- parrot/bots/tools/eda.py +325 -0
- parrot/bots/tools/pdf.py +50 -0
- parrot/bots/tools/plot.py +48 -0
- parrot/bots/troc.py +16 -0
- parrot/conf.py +170 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-312-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agents.py +292 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +192 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/http.py +805 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +18 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/exif.py +709 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/llms/__init__.py +1 -0
- parrot/llms/abstract.py +69 -0
- parrot/llms/anthropic.py +58 -0
- parrot/llms/gemma.py +15 -0
- parrot/llms/google.py +44 -0
- parrot/llms/groq.py +67 -0
- parrot/llms/hf.py +45 -0
- parrot/llms/openai.py +61 -0
- parrot/llms/pipes.py +114 -0
- parrot/llms/vertex.py +89 -0
- parrot/loaders/__init__.py +9 -0
- parrot/loaders/abstract.py +628 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/txt.py +26 -0
- parrot/manager.py +333 -0
- parrot/models.py +504 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +11 -0
- parrot/stores/abstract.py +248 -0
- parrot/stores/chroma.py +188 -0
- parrot/stores/duck.py +162 -0
- parrot/stores/embeddings/__init__.py +10 -0
- parrot/stores/embeddings/abstract.py +46 -0
- parrot/stores/embeddings/base.py +52 -0
- parrot/stores/embeddings/bge.py +20 -0
- parrot/stores/embeddings/fastembed.py +17 -0
- parrot/stores/embeddings/google.py +18 -0
- parrot/stores/embeddings/huggingface.py +20 -0
- parrot/stores/embeddings/ollama.py +14 -0
- parrot/stores/embeddings/openai.py +26 -0
- parrot/stores/embeddings/transformers.py +21 -0
- parrot/stores/embeddings/vertexai.py +17 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss.py +160 -0
- parrot/stores/milvus.py +397 -0
- parrot/stores/postgres.py +653 -0
- parrot/stores/qdrant.py +170 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +68 -0
- parrot/tools/asknews.py +33 -0
- parrot/tools/basic.py +51 -0
- parrot/tools/bby.py +359 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/docx.py +343 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/execute.py +56 -0
- parrot/tools/gamma.py +28 -0
- parrot/tools/google.py +170 -0
- parrot/tools/gvoice.py +301 -0
- parrot/tools/results.py +278 -0
- parrot/tools/stack.py +27 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +58 -0
- parrot/tools/zipcode.py +198 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-312-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-312-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Union
|
|
3
|
+
import importlib
|
|
4
|
+
from collections.abc import Callable
|
|
5
|
+
from langchain.docstore.document import Document
|
|
6
|
+
from langchain_core.vectorstores import VectorStoreRetriever
|
|
7
|
+
from navconfig.logging import logging
|
|
8
|
+
from ..conf import (
|
|
9
|
+
EMBEDDING_DEFAULT_MODEL
|
|
10
|
+
)
|
|
11
|
+
from ..exceptions import ConfigError # pylint: disable=E0611
|
|
12
|
+
from .embeddings import supported_embeddings
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AbstractStore(ABC):
    """AbstractStore class.

    Base class for all Database Vector Stores.

    Args:
        embedding_model (Union[dict, str]): Embedding model name, or a
            configuration dict with ``model_name`` / ``model_type`` keys.
            A bare string is treated as a ``huggingface`` model name.
        embedding (Union[dict, str, Callable]): Alternative to
            ``embedding_model``. A string or dict is handled like
            ``embedding_model`` (and takes precedence over it); any other
            object is used directly as the embedding instance.
        **kwargs: Optional settings read here: ``use_database``,
            ``collection_name``, ``dimension``, ``metric_type``,
            ``index_type``, ``database`` and ``index_name``.

    Raises:
        ValueError: If ``embedding_model`` is neither a string nor a dict.

    Supported Vector Stores:
        - Qdrant
        - Milvus
        - Faiss
        - Chroma
        - PgVector
    """

    def __init__(
        self,
        embedding_model: Union[dict, str] = None,
        embedding: Union[dict, Callable] = None,
        **kwargs
    ):
        self.client: Callable = None
        self.vector: Callable = None
        self._embed_: Callable = None
        self._connected: bool = False
        # Start from the default configuration so ``self.embedding_model``
        # always exists, even when neither argument is supplied.
        self.embedding_model: dict = self._default_embedding_model()
        if embedding_model is not None:
            self.embedding_model = self._normalize_embedding_model(
                embedding_model
            )
        # Use or not connection to a vector database:
        self._use_database: bool = kwargs.get('use_database', True)
        # Database Information:
        self.collection_name: str = kwargs.get('collection_name', 'my_collection')
        self.dimension: int = kwargs.get("dimension", 768)
        self._metric_type: str = kwargs.get("metric_type", 'COSINE')
        self._index_type: str = kwargs.get("index_type", 'IVF_FLAT')
        self.database: str = kwargs.get('database', '')
        self.index_name = kwargs.get("index_name", "my_index")
        if embedding is not None:
            if isinstance(embedding, (str, dict)):
                self.embedding_model = self._normalize_embedding_model(
                    embedding
                )
            else:
                # Caller supplied a ready-made embedding object: keep it and
                # fall back to the default configuration for bookkeeping.
                # NOTE(review): subclasses read ``self._embed_.embedding``,
                # so the object presumably has to expose that attribute.
                self.embedding_model = self._default_embedding_model()
                self._embed_ = embedding
        self.logger = logging.getLogger(
            f"Store.{__name__}"
        )
        # Client Connection (if required):
        self._connection = None
        # Create the Embedding Model, unless the caller already provided one
        # (building it unconditionally would discard the caller's object).
        if self._embed_ is None:
            self._embed_ = self.create_embedding(
                embedding_model=self.embedding_model
            )

    @staticmethod
    def _default_embedding_model() -> dict:
        """Return a fresh copy of the default embedding configuration."""
        return {
            'model_name': EMBEDDING_DEFAULT_MODEL,
            'model_type': 'huggingface'
        }

    @staticmethod
    def _normalize_embedding_model(value: Union[dict, str]) -> dict:
        """Turn a model name or configuration dict into a configuration dict."""
        if isinstance(value, str):
            return {
                'model_name': value,
                'model_type': 'huggingface'
            }
        if isinstance(value, dict):
            return value
        raise ValueError(
            "Embedding Model must be a string or a dictionary."
        )

    @property
    def connected(self) -> bool:
        """Whether the store is currently flagged as connected."""
        return self._connected

    def is_connected(self):
        """Method form of :attr:`connected`."""
        return self._connected

    @abstractmethod
    async def connection(self) -> tuple:
        """Open the connection to the underlying vector database."""

    @abstractmethod
    async def disconnect(self) -> None:
        """Close the connection to the underlying vector database."""

    # Async Context Manager
    async def __aenter__(self):
        """Connect on entry when a database is in use and none is open."""
        if self._use_database and not self._connection:
            await self.connection()
        return self

    async def _free_resources(self):
        """Release the embedding model, if one is loaded."""
        if self._embed_ is not None:
            release = getattr(self._embed_, 'free', None)
            if callable(release):
                release()
        self._embed_ = None

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Disconnect on exit; a failing disconnect is logged, not raised."""
        try:
            await self.disconnect()
        except RuntimeError as err:
            # Best effort: closing must not mask an exception raised inside
            # the ``async with`` body, but leave a trace of the failure.
            self.logger.warning("Error closing the store connection: %s", err)

    @abstractmethod
    def get_vector(self, metric_type: str = None, **kwargs):
        """Return the vector-store object backing this store."""

    def get_vectorstore(self):
        """Alias of :meth:`get_vector` called without arguments."""
        return self.get_vector()

    @abstractmethod
    async def similarity_search(
        self,
        query: str,
        collection: Union[str, None] = None,
        limit: int = 2
    ) -> list:  # noqa
        """Return up to ``limit`` entries similar to ``query``."""

    @abstractmethod
    async def from_documents(
        self,
        documents: List[Document],
        collection: Union[str, None] = None,
        **kwargs
    ) -> Callable:
        """
        Create Vector Store from Documents.

        Args:
            documents (List[Document]): List of Documents.
            collection (str): Collection Name.
            kwargs: Additional Arguments.

        Returns:
            Callable VectorStore.
        """

    @abstractmethod
    async def add_documents(
        self,
        documents: List[Document],
        collection: Union[str, None] = None,
        **kwargs
    ) -> None:
        """
        Add Documents to Vector Store.

        Args:
            documents (List[Document]): List of Documents.
            collection (str): Collection Name.
            kwargs: Additional Arguments.

        Returns:
            None.
        """

    def create_embedding(
        self,
        embedding_model: dict,
        **kwargs
    ):
        """
        Create Embedding Model.

        Args:
            embedding_model (dict): Embedding Model Configuration.
            kwargs: Additional Arguments passed to the embedding class.

        Returns:
            Callable: Embedding Model.

        Raises:
            ConfigError: If the model type is not registered in
                ``supported_embeddings`` or its module cannot be imported.
        """
        model_type = embedding_model.get('model_type', 'huggingface')
        model_name = embedding_model.get('model_name', EMBEDDING_DEFAULT_MODEL)
        if model_type not in supported_embeddings:
            raise ConfigError(
                f"Embedding Model Type: {model_type} not supported."
            )
        embed_cls = supported_embeddings[model_type]
        cls_path = f".embeddings.{model_type}"  # Relative module path
        try:
            embed_module = importlib.import_module(
                cls_path,
                package=__package__
            )
            embed_obj = getattr(embed_module, embed_cls)
            return embed_obj(
                model_name=model_name,
                **kwargs
            )
        except ImportError as e:
            raise ConfigError(
                f"Error Importing Embedding Model: {model_type}"
            ) from e

    def get_default_embedding(self):
        """Create the embedding model for the default configuration."""
        return self.create_embedding(
            embedding_model=self._default_embedding_model()
        )

    def generate_embedding(self, documents: List[Document]):
        """Embed ``documents`` with the current (or default) embedding model."""
        if not self._embed_:
            self._embed_ = self.get_default_embedding()

        # Using the Embed Model to Generate Embeddings:
        return self._embed_.embed_documents(documents)

    def as_retriever(
        self,
        metric_type: str = 'COSINE',
        index_type: str = 'IVF_FLAT',
        search_type: str = 'similarity',
        chain_type: str = 'stuff',
        search_kwargs: dict = None
    ) -> Callable:
        """Wrap the vector store in a ``VectorStoreRetriever``.

        Args:
            metric_type (str): Forwarded to :meth:`get_vector`.
            index_type (str): Forwarded to :meth:`get_vector`.
            search_type (str): Forwarded to the retriever.
            chain_type (str): Forwarded to the retriever.
            search_kwargs (dict): Forwarded to the retriever; ``None`` is
                replaced by an empty dict.

        Returns:
            Callable: The retriever.

        Raises:
            ConfigError: If :meth:`get_vector` returns a falsy value.
        """
        vector = self.get_vector(metric_type=metric_type, index_type=index_type)
        if not vector:
            raise ConfigError(
                "Vector Store is not connected. Check your connection."
            )
        return VectorStoreRetriever(
            vectorstore=vector,
            search_type=search_type,
            chain_type=chain_type,
            # The retriever field is a dict; ``None`` would not validate.
            search_kwargs=search_kwargs or {}
        )
|
parrot/stores/chroma.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import Optional, Union
|
|
3
|
+
from uuid import uuid4
|
|
4
|
+
import logging
|
|
5
|
+
from langchain.docstore.document import Document
|
|
6
|
+
from langchain.memory import VectorStoreRetrieverMemory
|
|
7
|
+
import chromadb
|
|
8
|
+
from langchain_chroma import Chroma
|
|
9
|
+
from .abstract import AbstractStore
|
|
10
|
+
from ..conf import CHROMADB_HOST, CHROMADB_PORT
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
logging.getLogger('chromadb').setLevel(logging.INFO)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ChromaStore(AbstractStore):
    """Chroma DB Store Class.

    Using Chroma as Document Vector Store.

    Extra keyword arguments read here: ``database_path``, ``ephemeral``,
    ``local``, ``host`` and ``port``.
    """

    def __init__(
        self,
        embedding_model: Union[dict, str] = None,
        embedding: Union[dict, Callable] = None,
        **kwargs
    ):
        super().__init__(
            embedding_model=embedding_model,
            embedding=embedding,
            **kwargs
        )
        self.database_path: str = kwargs.pop('database_path', 'chroma.db')
        self._ephemeral: bool = kwargs.pop('ephemeral', False)
        self._local: bool = kwargs.pop('local', False)
        self.host = kwargs.pop("host", CHROMADB_HOST)
        self.port = kwargs.pop("port", CHROMADB_PORT)
        self._collection = None

    async def connection(self):
        """Connection to ChromaDB.

        Chooses an in-memory client (``ephemeral``), an on-disk client
        (``local``) or an HTTP client (default), then gets or creates the
        store's default collection.

        Returns:
            Callable: ChromaDB connection.
        """
        if self._ephemeral:
            self._connection = chromadb.Client()
        elif self._local:
            self._connection = chromadb.PersistentClient(
                path=self.database_path,
                database=self.database,
            )
        else:
            # Client-Server Connection:
            self._connection = chromadb.HttpClient(
                host=self.host,
                port=self.port,
                database=self.database,
            )
        self._collection = self._connection.get_or_create_collection(
            self.collection_name
        )
        self._connected = True
        return self._connection

    async def disconnect(self) -> None:
        """
        Closing the Connection on ChromaDB
        """
        self._connection = None
        self._connected = False

    def get_vector(
        self,
        collection: Union[str, None] = None,
        embedding: Optional[Callable] = None,
        **kwargs
    ) -> Chroma:
        """Build a LangChain ``Chroma`` store over the current connection.

        Args:
            collection (str): Collection name; defaults to
                ``self.collection_name``.
            embedding (Callable): Embedding wrapper to use instead of the
                store's own; its ``embedding`` attribute is handed to Chroma.
            **kwargs: Accepted and ignored, so that the ``metric_type`` /
                ``index_type`` keywords sent by ``as_retriever`` do not fail.

        Returns:
            Chroma: The vector store.
        """
        if not collection:
            collection = self.collection_name
        if embedding is not None:
            _embed_ = embedding
        else:
            _embed_ = self._embed_ or self.create_embedding(
                embedding_model=self.embedding_model
            )
        return Chroma(
            # Honour the requested collection instead of always using the
            # store-wide default.
            collection_name=collection,
            embedding_function=_embed_.embedding,
            client=self._connection,
            create_collection_if_not_exists=True,
        )

    async def from_documents(self, documents: list[Document], collection: str = None, **kwargs):
        """
        Save Documents as Vectors in Chroma.

        Args:
            documents (list[Document]): Documents to store.
            collection (str): Collection name; defaults to
                ``self.collection_name``.
            kwargs: Accepted for interface compatibility; not forwarded.

        Returns:
            Chroma: The vector store created from the documents.
        """
        if not collection:
            collection = self.collection_name
        vectordb = await Chroma.afrom_documents(
            documents=documents,
            embedding=self._embed_.embedding,
            # Chroma takes the chromadb client as ``client``.
            client=self._connection,
            collection_name=collection,
        )
        return vectordb

    async def add_texts(self, objects: list, collection: str = None):
        """
        Add Texts to ChromaDB

        Args:
            objects (list): Raw texts to store.
            collection (str): Collection name; defaults to
                ``self.collection_name``.

        Returns:
            bool: Always ``True``.
        """
        if collection is None:
            collection = self.collection_name
        async with self:
            target = self._connection.get_or_create_collection(collection)
            # NOTE(review): ids are the positional index, so a second call
            # reuses "0", "1", ... within the same collection — confirm that
            # this is the intended behaviour.
            for i, doc in enumerate(objects):
                target.add(ids=[str(i)], documents=[doc])
            return True

    async def add_documents(
        self,
        documents: list,
        collection: str = None,
        embedding: Optional[Callable] = None,
        **kwargs
    ) -> bool:
        """Add Documents to ChromaDB

        Each document is stored under a freshly generated UUID.

        Args:
            documents (list): Documents to add.
            collection (str): Collection name; defaults to
                ``self.collection_name``.
            embedding (Callable): Optional embedding wrapper override.
            kwargs: Accepted for interface compatibility; not used.

        Returns:
            bool: Always ``True``.
        """
        if collection is None:
            collection = self.collection_name

        async with self:
            uuids = [str(uuid4()) for _ in documents]
            # get_vector creates the collection when it does not exist.
            vector_db = self.get_vector(collection=collection, embedding=embedding)
            await vector_db.aadd_documents(documents=documents, ids=uuids)

        return True

    async def update_documents(
        self,
        documents: list,
        collection: str = None,
        embedding: Optional[Callable] = None,
    ) -> bool:
        """
        Update Documents to ChromaDB

        Args:
            documents (list): Entries to update; each must contain an
                ``'id'`` key, which is popped and used as the document id.
            collection (str): Collection name; defaults to
                ``self.collection_name``.
            embedding (Callable): Optional embedding wrapper override.

        Returns:
            bool: ``True`` when every entry carried an id and the update was
            issued, ``False`` otherwise.
        """
        if collection is None:
            collection = self.collection_name
        async with self:
            # Pass the collection *name*; get_vector creates it if missing.
            vector_db = self.get_vector(collection=collection, embedding=embedding)
            # Split the documents into ids and documents
            # NOTE(review): the id handling treats entries as mappings —
            # confirm what shape callers actually pass.
            if all('id' in doc for doc in documents):
                ids = [doc.pop('id') for doc in documents]
                vector_db.update_documents(documents=documents, ids=ids)
                return True
            return False

    async def similarity_search(
        self,
        query: str,
        collection: Union[str, None] = None,
        embedding: Optional[Callable] = None,
        limit: int = 2,
        filter: Optional[dict] = None,
    ) -> list:
        """Return up to ``limit`` documents similar to ``query``.

        Args:
            query (str): Text to search for.
            collection (str): Collection name; defaults to
                ``self.collection_name``.
            embedding (Callable): Optional embedding wrapper override.
            limit (int): Maximum number of results.
            filter (dict): Optional filter forwarded to Chroma.

        Returns:
            list: Matching documents.
        """
        if collection is None:
            collection = self.collection_name
        async with self:
            vector_db = self.get_vector(collection=collection, embedding=embedding)
            return vector_db.similarity_search(query, k=limit, filter=filter)

    def memory_retriever(
        self,
        documents: Optional[list] = None,
        num_results: int = 5
    ) -> VectorStoreRetrieverMemory:
        """Build a retriever-backed memory over the default collection.

        Args:
            documents (list): Optional documents to seed the store with.
            num_results (int): Number of results the retriever returns.

        Returns:
            VectorStoreRetrieverMemory: Memory wrapping the retriever.
        """
        if not documents:
            documents = []
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=self._embed_.embedding,
            # Chroma takes the chromadb client as ``client``.
            client=self._connection,
            collection_name=self.collection_name,
        )
        retriever = vectordb.as_retriever(
            search_kwargs={'k': num_results}
        )
        return VectorStoreRetrieverMemory(retriever=retriever)
|
parrot/stores/duck.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import Optional, Union
|
|
3
|
+
import duckdb
|
|
4
|
+
from langchain.docstore.document import Document
|
|
5
|
+
from langchain.memory import VectorStoreRetrieverMemory
|
|
6
|
+
from langchain_community.vectorstores import DuckDB
|
|
7
|
+
from .abstract import AbstractStore
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DuckDBStore(AbstractStore):
    """DuckDB Store Class.

    Using DuckDB as Document Vector Store.

    The optional ``config`` keyword argument is merged over
    ``default_config`` and passed to ``duckdb.connect``.
    """
    # Defaults applied to every connection unless overridden via ``config``.
    default_config: dict = {
        "enable_external_access": "false",
        "autoinstall_known_extensions": "false",
        "autoload_known_extensions": "false"
    }

    def __init__(
        self,
        embedding_model: Union[dict, str] = None,
        embedding: Union[dict, Callable] = None,
        **kwargs
    ):
        super().__init__(
            embedding_model=embedding_model,
            embedding=embedding,
            **kwargs
        )
        self.credentials = {
            "database": self.database,
        }

        config: dict = kwargs.pop("config", {})
        self.config = {
            **self.default_config,
            **config
        }

    async def connection(self, alias: str = None):
        """Connection to DuckDB.

        Args:
            alias (str): Database alias (currently unused).

        Returns:
            Callable: DuckDB connection.
        """
        self._connection = duckdb.connect(**self.credentials, config=self.config)
        self._connected = True
        return self._connection

    async def disconnect(self) -> None:
        """
        Closing the Connection on DuckDB

        Raises:
            RuntimeError: If closing the connection fails. The connection
                state is reset either way.
        """
        try:
            if self._connection:
                self._connection.close()
        except Exception as err:
            # Built-in exceptions take no keyword arguments; the message
            # must be passed positionally.
            raise RuntimeError(
                f"{__name__!s}: Closing Error: {err!s}"
            ) from err
        finally:
            self._connection = None
            self._connected = False

    def get_vector(
        self,
        collection: Union[str, None] = None,
        embedding: Optional[Callable] = None,
        metadata_field: str = 'id',
        text_field: str = 'text',
        vector_key: str = 'vector',
        **kwargs
    ) -> DuckDB:
        """Build a LangChain ``DuckDB`` store over the current connection.

        Args:
            collection (str): Table name; defaults to
                ``self.collection_name``.
            embedding (Callable): Embedding to use instead of the store's
                own. An object exposing an ``embedding`` attribute is
                unwrapped; anything else is passed through unchanged.
            metadata_field (str): Passed to DuckDB as ``id_key``.
            text_field (str): Passed to DuckDB as ``text_key``.
            vector_key (str): Passed to DuckDB as ``vector_key``.
            **kwargs: Accepted and ignored, so that the ``metric_type`` /
                ``index_type`` keywords sent by ``as_retriever`` do not fail.

        Returns:
            DuckDB: The vector store.
        """
        if not collection:
            collection = self.collection_name
        if embedding is not None:
            _embed_ = embedding
        else:
            # Reuse the model created in __init__ rather than building a
            # new one on every call.
            _embed_ = self._embed_ or self.create_embedding(
                embedding_model=self.embedding_model
            )
        return DuckDB(
            connection=self._connection,
            table_name=collection,
            # The other methods of this class hand ``.embedding`` to DuckDB;
            # do the same here when the object is a wrapper.
            embedding=getattr(_embed_, 'embedding', _embed_),
            vector_key=vector_key,
            text_key=text_field,
            id_key=metadata_field
        )

    async def add_texts(self, objects: list, collection: str = None):
        """
        Add Texts to DuckDB

        Args:
            objects (list): Raw texts to store.
            collection (str): Table name; defaults to
                ``self.collection_name``.

        Returns:
            bool: Always ``True``.
        """
        async with self:
            store = self.get_vector(collection=collection)
            store.add_texts(objects)
        return True

    async def similarity_search(
        self,
        query: str,
        collection: Union[str, None] = None,
        embedding: Optional[Callable] = None,
        limit: int = 2,
    ) -> list:
        """Return up to ``limit`` documents similar to ``query``.

        Args:
            query (str): Text to search for.
            collection (str): Table name; defaults to
                ``self.collection_name``.
            embedding (Callable): Optional embedding override.
            limit (int): Maximum number of results.

        Returns:
            list: Matching documents.
        """
        if collection is None:
            collection = self.collection_name
        async with self:
            vector_db = self.get_vector(collection=collection, embedding=embedding)
            return await vector_db.asimilarity_search(query, k=limit)

    def memory_retriever(
        self,
        documents: Optional[list] = None,
        num_results: int = 5
    ) -> VectorStoreRetrieverMemory:
        """Build a retriever-backed memory over a DuckDB vector store.

        Args:
            documents (list): Optional documents to seed the store with.
            num_results (int): Number of results the retriever returns.

        Returns:
            VectorStoreRetrieverMemory: Memory wrapping the retriever.
        """
        if not documents:
            documents = []
        vectordb = DuckDB.from_documents(
            documents=documents,
            embedding=self._embed_.embedding,
            connection=self._connection,
        )
        retriever = vectordb.as_retriever(
            search_kwargs={'k': num_results}
        )
        return VectorStoreRetrieverMemory(retriever=retriever)

    async def from_documents(self, documents: list[Document], collection: str = None, **kwargs):
        """
        Save Documents as Vectors in DuckDB.

        Args:
            documents (list[Document]): Documents to store.
            collection (str): Table name; defaults to
                ``self.collection_name``.
            kwargs: Accepted for interface compatibility; not forwarded.

        Returns:
            DuckDB: The vector store created from the documents.
        """
        if not collection:
            collection = self.collection_name
        vectordb = await DuckDB.afrom_documents(
            documents,
            embedding=self._embed_.embedding,
            connection=self._connection,
            # Store into the requested table; the resolved name used to be
            # computed and then dropped.
            table_name=collection,
        )
        return vectordb

    async def add_documents(self, documents: list[Document], collection: str = None, **kwargs):
        """
        Add Documents as Vectors in DuckDB.

        Args:
            documents (list[Document]): Documents to add.
            collection (str): Table name; defaults to
                ``self.collection_name``.
            kwargs: Accepted for interface compatibility; not used.

        Returns:
            The value returned by the vector store's ``aadd_documents``.
        """
        if not collection:
            collection = self.collection_name
        vectordb = self.get_vector(collection=collection)
        return await vectordb.aadd_documents(
            documents=documents
        )
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Registry of embedding back-ends: maps the ``model_type`` key to the name
# of the wrapper class to load for it.
# NOTE(review): 'HugginfaceEmbed' looks misspelled, but it must match the
# class name in the huggingface module — confirm before renaming.
supported_embeddings = dict(
    openai='OpenAIEmbed',
    google='GoogleEmbed',
    vertexai='VertexAIEmbed',
    huggingface='HugginfaceEmbed',
    fastembed='FastembedEmbed',
    bge='BgeEmbed',
    ollama='OllamaEmbed',
    transformers='TransformersEmbed',
)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from ...conf import (
|
|
3
|
+
MAX_BATCH_SIZE,
|
|
4
|
+
EMBEDDING_DEFAULT_MODEL,
|
|
5
|
+
EMBEDDING_DEVICE
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
class AbstractEmbed(ABC):
    """A wrapper class for Create embeddings.

    Subclasses implement :meth:`_create_embedding`; the object it returns
    is stored at construction time and exposed through :attr:`embedding`.
    """
    # Default model name for the wrapper.
    model_name: str = EMBEDDING_DEFAULT_MODEL
    # Class-level encoding options (a dict, shared by all instances).
    encode_kwargs: dict = {
        'normalize_embeddings': True,
        "batch_size": MAX_BATCH_SIZE
    }
    # Class-level model options (a dict, shared by all instances).
    model_kwargs: dict = {
        'device': EMBEDDING_DEVICE,
        'trust_remote_code': True
    }

    def __init__(self, model_name: str = None, **kwargs):
        """Create the underlying embedding model.

        Args:
            model_name (str): The name of the model to use; passed through
                unchanged (including ``None``) to ``_create_embedding``.
            **kwargs: Extra arguments passed to ``_create_embedding``.
        """
        self._embedding = self._create_embedding(model_name, **kwargs)

    @property
    def embedding(self):
        """The object returned by ``_create_embedding``."""
        return self._embedding

    def free(self):
        """
        Free the resources.

        No-op here; subclasses may override.
        """

    def _get_device(self):
        """Return the configured embedding device."""
        return EMBEDDING_DEVICE

    @abstractmethod
    def _create_embedding(self, model_name: str = None, **kwargs):
        """
        Create Embedding Model.

        Args:
            model_name (str): The name of the model to use.

        Returns:
            Callable: Embedding Model.
        """
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
import gc
|
|
3
|
+
import torch
|
|
4
|
+
from .abstract import AbstractEmbed
|
|
5
|
+
from ...conf import CUDA_DEFAULT_DEVICE, EMBEDDING_DEVICE
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BaseEmbed(AbstractEmbed):
    """A wrapper class for Base embeddings.

    Use this class to Embedding Models that requires Torch/Transformers.
    """
    model_kwargs = {
        'device': EMBEDDING_DEVICE,
        'trust_remote_code': True
    }

    def _get_device(self, device_type: str = None, cuda_number: Optional[int] = None):
        """Get Default device for Torch and transformers.

        Resolution order: an explicit ``device_type``; CUDA when available
        (unless ``CUDA_DEFAULT_DEVICE`` is ``'cpu'``); the ``mps`` backend
        when available; otherwise the configured ``EMBEDDING_DEVICE``.

        Args:
            device_type (str): Device string to use as-is.
            cuda_number (int): Index of the CUDA device to use.

        Returns:
            torch.device: The selected device.
        """
        # torch.backends.cudnn.deterministic = True
        if device_type is not None:
            return torch.device(device_type)
        if torch.cuda.is_available():
            if CUDA_DEFAULT_DEVICE == 'cpu':
                # Use CPU even if CUDA is available
                return torch.device('cpu')
            if cuda_number is not None:
                # Use specified CUDA GPU
                return torch.device(f'cuda:{cuda_number}')
            # No index requested: use the first CUDA GPU
            return torch.device('cuda:0')
        if torch.backends.mps.is_available():
            # Use the torch "mps" backend when it is available
            return torch.device("mps")
        if EMBEDDING_DEVICE == 'cuda':
            if cuda_number is None:
                if CUDA_DEFAULT_DEVICE == 'cpu':
                    # No index to fall back on; 'cuda:None' is not a valid
                    # device string, so use the CPU instead.
                    return torch.device('cpu')
                cuda_number = CUDA_DEFAULT_DEVICE
            return torch.device(f'cuda:{cuda_number}')
        return torch.device(EMBEDDING_DEVICE)

    def free(self):
        """
        Free the resources.

        Runs the garbage collector and empties the CUDA cache; any error
        is printed rather than raised.
        """
        try:
            gc.collect()  # Run Python garbage collector to free unreferenced objects
            torch.cuda.empty_cache()  # Release cached memory blocks back to the GPU
        except Exception as e:
            print(f"Error: {e}")
|