ai-parrot 0.8.3__cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ai-parrot might be problematic.
- ai_parrot-0.8.3.dist-info/LICENSE +21 -0
- ai_parrot-0.8.3.dist-info/METADATA +306 -0
- ai_parrot-0.8.3.dist-info/RECORD +128 -0
- ai_parrot-0.8.3.dist-info/WHEEL +6 -0
- ai_parrot-0.8.3.dist-info/top_level.txt +2 -0
- parrot/__init__.py +30 -0
- parrot/bots/__init__.py +5 -0
- parrot/bots/abstract.py +1115 -0
- parrot/bots/agent.py +492 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/bose.py +17 -0
- parrot/bots/chatbot.py +271 -0
- parrot/bots/cody.py +17 -0
- parrot/bots/copilot.py +117 -0
- parrot/bots/data.py +730 -0
- parrot/bots/dataframe.py +103 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/interfaces/__init__.py +1 -0
- parrot/bots/interfaces/retrievers.py +12 -0
- parrot/bots/notebook.py +619 -0
- parrot/bots/odoo.py +17 -0
- parrot/bots/prompts/__init__.py +41 -0
- parrot/bots/prompts/agents.py +91 -0
- parrot/bots/prompts/data.py +214 -0
- parrot/bots/retrievals/__init__.py +1 -0
- parrot/bots/retrievals/constitutional.py +19 -0
- parrot/bots/retrievals/multi.py +122 -0
- parrot/bots/retrievals/retrieval.py +610 -0
- parrot/bots/tools/__init__.py +7 -0
- parrot/bots/tools/eda.py +325 -0
- parrot/bots/tools/pdf.py +50 -0
- parrot/bots/tools/plot.py +48 -0
- parrot/bots/troc.py +16 -0
- parrot/conf.py +170 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-312-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agents.py +292 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +192 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/http.py +805 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +18 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/exif.py +709 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/llms/__init__.py +1 -0
- parrot/llms/abstract.py +69 -0
- parrot/llms/anthropic.py +58 -0
- parrot/llms/gemma.py +15 -0
- parrot/llms/google.py +44 -0
- parrot/llms/groq.py +67 -0
- parrot/llms/hf.py +45 -0
- parrot/llms/openai.py +61 -0
- parrot/llms/pipes.py +114 -0
- parrot/llms/vertex.py +89 -0
- parrot/loaders/__init__.py +9 -0
- parrot/loaders/abstract.py +628 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/txt.py +26 -0
- parrot/manager.py +333 -0
- parrot/models.py +504 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +11 -0
- parrot/stores/abstract.py +248 -0
- parrot/stores/chroma.py +188 -0
- parrot/stores/duck.py +162 -0
- parrot/stores/embeddings/__init__.py +10 -0
- parrot/stores/embeddings/abstract.py +46 -0
- parrot/stores/embeddings/base.py +52 -0
- parrot/stores/embeddings/bge.py +20 -0
- parrot/stores/embeddings/fastembed.py +17 -0
- parrot/stores/embeddings/google.py +18 -0
- parrot/stores/embeddings/huggingface.py +20 -0
- parrot/stores/embeddings/ollama.py +14 -0
- parrot/stores/embeddings/openai.py +26 -0
- parrot/stores/embeddings/transformers.py +21 -0
- parrot/stores/embeddings/vertexai.py +17 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss.py +160 -0
- parrot/stores/milvus.py +397 -0
- parrot/stores/postgres.py +653 -0
- parrot/stores/qdrant.py +170 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +68 -0
- parrot/tools/asknews.py +33 -0
- parrot/tools/basic.py +51 -0
- parrot/tools/bby.py +359 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/docx.py +343 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/execute.py +56 -0
- parrot/tools/gamma.py +28 -0
- parrot/tools/google.py +170 -0
- parrot/tools/gvoice.py +301 -0
- parrot/tools/results.py +278 -0
- parrot/tools/stack.py +27 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +58 -0
- parrot/tools/zipcode.py +198 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-312-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-312-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
parrot/stores/postgres.py

@@ -0,0 +1,653 @@
"""
Powerful PostgreSQL Vector Database Store with Custom Table Support.
"""
from typing import (
    Any,
    Dict,
    List,
    Tuple,
    Union,
    Optional,
    Sequence
)
from collections.abc import Callable
import asyncio
import uuid
# SQLAlchemy
import sqlalchemy
from sqlalchemy import inspect, text, Column, String, ARRAY, Float, func
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import JSON, JSONB, JSONPATH, UUID, insert
from sqlalchemy.ext.asyncio import create_async_engine, AsyncEngine, AsyncSession
from sqlalchemy.future import select
# PgVector
from pgvector.sqlalchemy import Vector  # type: ignore
# Langchain
from langchain_core.embeddings import Embeddings
from langchain.docstore.document import Document
from langchain.memory import VectorStoreRetrieverMemory
from langchain_community.vectorstores.pgembedding import PGEmbedding
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_postgres.vectorstores import (
    PGVector,
    _get_embedding_collection_store,
    _results_to_docs
)
from datamodel.parsers.json import json_encoder  # pylint: disable=E0611
from .abstract import AbstractStore

Base = declarative_base()


# Define the async classmethods to be attached to our ORM model.
async def aget_by_name(cls, session: AsyncSession, name: str) -> Optional["CustomEmbeddingStore"]:
    # NOTE: the real lookup is disabled; a stub record with empty metadata
    # is returned instead of querying by name.
    # result = await session.execute(select(cls).where(cls.name == name))
    # return result.scalars().first()
    return cls(cmetadata={})

class PgVector(PGVector):
    """
    PgVector extends PGVector so that it uses an existing table from a specified schema.

    When instantiating, you provide:
    - connection: an AsyncEngine (or synchronous engine) to your PostgreSQL database.
    - schema: the database schema where your table lives.
    - table_name: the name of the table that stores the embeddings.
    - embedding_length: the dimension of the embedding vectors.
    - embeddings: your embedding function/model (which must provide embed_query).

    This implementation overrides the _get_embedding_collection_store method to return a tuple of
    ORM model classes that both refer to your table. The table is expected to contain the
    required columns: 'id', 'embedding', 'document', and 'cmetadata'.

    The returned ORM models can then be used by PGVector's built-in similarity search and retriever.
    """
    def __init__(
        self,
        embeddings: Embeddings,
        *,
        table_name: Optional[str] = None,
        schema: str = 'public',
        collection_name: Optional[str] = None,
        id_column: str = 'id',
        **kwargs
    ) -> None:
        self.table_name = table_name
        self.schema = schema
        self._id_column: str = id_column
        self._schema_based: bool = False
        if self.table_name:
            self._schema_based = True
        elif collection_name and '.' in collection_name:
            # A schema-qualified collection name ('schema.table') selects an existing table.
            self.schema, self.table_name = collection_name.split('.', 1)
            self._schema_based = True
        super().__init__(
            embeddings=embeddings,
            collection_name=collection_name,
            **kwargs
        )

    async def _get_embedding_collection_store(
        self,
        table: str,
        schema: str,
        dimension: int = 768,
        **kwargs
    ) -> Tuple[type, type]:
        """
        Return custom ORM model classes (EmbeddingStore, CollectionStore)
        that both reference the same table.

        In this custom implementation, both the "collection" and "embedding" stores
        are represented by a single table.
        The table is expected to have the following columns:
        - id: unique identifier (String)
        - embedding: the vector column (Vector(dimension))
        - document: text column containing the document
        - cmetadata: JSONB column for metadata

        Note: the table layout is not validated here; a mismatched table
        will fail at query time.
        """
        # Dynamically create the model class.
        attrs = {
            '__tablename__': table,
            '__table_args__': {"schema": schema},
            self._id_column: sqlalchemy.Column(
                sqlalchemy.String,
                primary_key=True,
                index=True,
                unique=True,
                default=lambda: str(uuid.uuid4())
            ),
            'embedding': sqlalchemy.Column(Vector(dimension)),
            'document': sqlalchemy.Column(sqlalchemy.String, nullable=True),
            'cmetadata': sqlalchemy.Column(JSONB, nullable=True),
            # Attach the async classmethods.
            'aget_by_name': classmethod(aget_by_name),
            # 'aget_or_create': classmethod(aget_or_create)
        }
        EmbeddingStore = type("CustomEmbeddingStore", (Base,), attrs)
        EmbeddingStore.__name__ = "EmbeddingStore"
        return (EmbeddingStore, EmbeddingStore)

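    # --- Illustrative only (not part of the package): a table layout matching what
    # --- the method above expects. Table name and schema are stand-ins; the column
    # --- names follow the defaults ('id', 'embedding', 'document', 'cmetadata') and
    # --- 768 is this method's default dimension.
    #
    #   CREATE TABLE public.my_embeddings (
    #       id        VARCHAR PRIMARY KEY,
    #       embedding VECTOR(768),
    #       document  TEXT,
    #       cmetadata JSONB
    #   );
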
    async def __apost_init__(
        self,
    ) -> None:
        """Async initialize the store (use lazy approach)."""
        if self._async_init:  # Warning: possible race condition
            return
        self._async_init = True
        if self._schema_based:
            ebstore, cstore = await self._get_embedding_collection_store(
                table=self.table_name,
                schema=self.schema,
                dimension=self._embedding_length
            )
        else:
            ebstore, cstore = _get_embedding_collection_store(
                self._embedding_length
            )
        self.CollectionStore = cstore
        self.EmbeddingStore = ebstore

        if not self._schema_based:
            await self.acreate_tables_if_not_exists()
            await self.acreate_collection()

    async def asimilarity_search(
        self,
        query: str,
        k: int = 4,
        score_threshold: Optional[float] = None,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Run similarity search with PGVector with distance.

        Args:
            query (str): Query text to search for.
            k (int): Number of results to return. Defaults to 4.
            score_threshold (Optional[float]): Maximum distance allowed. Defaults to None.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

        Returns:
            List of Documents most similar to the query.
        """
        await self.__apost_init__()  # Lazy async init
        embedding = await self.embeddings.aembed_query(query)
        return await self.asimilarity_search_by_vector(
            embedding=embedding,
            k=k,
            score_threshold=score_threshold,
            filter=filter,
        )

    async def asimilarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        score_threshold: Optional[float] = None,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

        Returns:
            List of Documents most similar to the query vector.
        """
        assert self._async_engine, "This method must be called with async_mode"
        await self.__apost_init__()  # Lazy async init
        docs_and_scores = await self.asimilarity_search_with_score_by_vector(
            embedding=embedding, k=k, score_threshold=score_threshold, filter=filter
        )
        return _results_to_docs(docs_and_scores)

    async def asimilarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        score_threshold: Optional[float] = None,
        filter: Optional[dict] = None,
    ) -> List[Tuple[Document, float]]:
        await self.__apost_init__()  # Lazy async init
        async with self._make_async_session() as session:  # type: ignore[arg-type]
            results = await self._aquery_collection(
                session=session, embedding=embedding, k=k, score_threshold=score_threshold, filter=filter
            )
            return self._results_to_docs_and_scores(results)

    async def _aquery_collection(
        self,
        session: AsyncSession,
        embedding: List[float],
        k: int = 4,
        score_threshold: Optional[float] = None,
        filter: Optional[Dict[str, str]] = None,
    ) -> List[Tuple[Document, float]]:
        """Search for similar documents in the collection.

        If score_threshold is provided, returns all documents whose computed
        distance is below that threshold; otherwise returns at most k documents.
        """
        # Use the session passed by the caller rather than opening a second one.
        filter_by = []
        if filter:
            if self.use_jsonb:
                filter_clause = self._create_filter_clause(filter)
                if filter_clause is not None:
                    filter_by.append(filter_clause)
            else:
                # For non-JSONB cases, fall back to the deprecated JSON filter:
                filter_clauses = self._create_filter_clause_json_deprecated(filter)
                filter_by.extend(filter_clauses)

        # Compute the distance expression once and reuse it below.
        distance_expr = self.distance_strategy(embedding).label("distance")
        stmt = (
            sqlalchemy.select(
                self.EmbeddingStore,
                distance_expr
            )
            .filter(*filter_by)
        )
        # If a score threshold is provided, add a filter on the distance;
        # otherwise, limit the number of results.
        if score_threshold is not None:
            stmt = stmt.filter(distance_expr < score_threshold)
        else:
            stmt = stmt.limit(k)
        stmt = stmt.order_by(sqlalchemy.asc(distance_expr))
        # Execute the query and return the results.
        results: Sequence[Any] = (await session.execute(stmt)).all()
        return results

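    # --- Illustrative only: roughly the SQL the statement above produces, assuming
    # --- L2 distance ('<->') and a JSONB metadata filter; table and parameter names
    # --- are stand-ins:
    #
    #   SELECT my_table.*, my_table.embedding <-> :query_vector AS distance
    #   FROM my_schema.my_table
    #   WHERE my_table.cmetadata @> :filter
    #   ORDER BY distance ASC
    #   LIMIT :k;
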
    def _results_to_docs_and_scores(self, results: Any) -> List[Tuple[Document, float]]:
        """Return docs and scores from results."""
        id_col = getattr(self, "_id_column", "id")
        docs = [
            (
                Document(
                    id=str(getattr(result.EmbeddingStore, id_col)),
                    page_content=result.EmbeddingStore.document,
                    metadata=result.EmbeddingStore.cmetadata,
                ),
                result.distance if self.embeddings is not None else None,
            )
            for result in results
        ]
        return docs


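# --- Illustrative usage (not part of the package): pointing PgVector at an
# --- existing table. The engine, table name, and embedding object are stand-ins;
# --- any langchain_core Embeddings implementation with a matching table layout
# --- should work.
#
# engine = create_async_engine("postgresql+psycopg://user:pass@localhost/db")
# store = PgVector(
#     embeddings=my_embeddings,            # any Embeddings instance
#     table_name="my_embeddings",
#     schema="public",
#     collection_name="public.my_embeddings",
#     embedding_length=768,
#     connection=engine,
#     use_jsonb=True,
#     create_extension=False,
# )
# docs = await store.asimilarity_search("what is pgvector?", k=4)

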
class PgvectorStore(AbstractStore):
    """Pgvector Store Class.

    Uses PostgreSQL + pgvector to store vectors in the database.
    """
    def __init__(
        self,
        embedding_model: Optional[Union[dict, str]] = None,
        embedding: Optional[Union[dict, Callable]] = None,
        **kwargs
    ):
        super().__init__(
            embedding_model=embedding_model,
            embedding=embedding,
            **kwargs
        )
        self.table: str = kwargs.get('table', None)
        self.schema: str = kwargs.get('schema', 'public')
        self._id_column = kwargs.get('id_column', 'id')
        if self.table and not self.collection_name:
            self.collection_name = f"{self.schema}.{self.table}"
        self.dsn = kwargs.get('dsn', self.database)
        self._drop: bool = kwargs.pop('drop', False)
        self._connection: Optional[AsyncEngine] = None

    async def collection_exists(self, collection: Optional[str] = None) -> bool:
        """Check if a collection (table) exists in the database."""
        if not collection:
            collection = self.collection_name
        # A collection name may be schema-qualified ('schema.table').
        schema, _, table = collection.rpartition('.')
        schema = schema or self.schema
        async with self._connection.connect() as conn:
            # ✅ Check if the collection (table) exists, using bound parameters
            # instead of interpolating values into the SQL string.
            check_query = sqlalchemy.text(
                """
                SELECT EXISTS (
                    SELECT FROM information_schema.tables
                    WHERE table_schema = :schema AND table_name = :table
                );
                """
            )
            result = await conn.execute(check_query, {"schema": schema, "table": table})
            return bool(result.scalar())

    async def connection(self, alias: Optional[str] = None):
        """Connect to PostgreSQL.

        Args:
            alias (str): Database alias.

        Returns:
            AsyncEngine: the async engine for this PostgreSQL database.
        """
        self._connection = create_async_engine(self.dsn, future=True, echo=False)
        async with self._connection.begin() as conn:
            if getattr(self, "_drop", False):
                vectorstore = PgVector(
                    embeddings=self._embed_.embedding,
                    table_name=self.table,
                    schema=self.schema,
                    id_column=self._id_column,
                    collection_name=self.collection_name,
                    embedding_length=self.dimension,
                    connection=self._connection,
                    use_jsonb=True,
                    create_extension=False
                )
                await vectorstore.adrop_tables()
        if not await self.collection_exists(self.collection_name):
            print(f"⚠️ Collection `{self.collection_name}` not found. Creating a new one...")
            await self.create_collection(self.collection_name)
        self._connected = True
        return self._connection

    def engine(self):
        return self._connection

    async def disconnect(self) -> None:
        """
        Close the connection to PostgreSQL.
        """
        try:
            if self._connection:
                await self._connection.dispose()
        except Exception as err:
            raise RuntimeError(
                f"{__name__!s}: Closing Error: {err!s}"
            ) from err
        finally:
            self._connection = None
            self._connected = False

    async def create_collection(self, collection: str) -> None:
        """Create a new collection in the database."""
        if not collection:
            collection = self.collection_name
        # ✅ Create the collection in PgVector
        _embed_ = self._embed_ or self.create_embedding(
            embedding_model=self.embedding_model
        )
        self._client = PgVector(
            embeddings=_embed_.embedding,
            embedding_length=self.dimension,
            collection_name=collection,
            connection=self._connection,
            use_jsonb=True,
            create_extension=False
        )
        print(
            f"✅ Collection `{collection}` created successfully."
        )

    def get_vector(
        self,
        table: Optional[str] = None,
        schema: Optional[str] = None,
        collection: Union[str, None] = None,
        embedding: Optional[Callable] = None,
        **kwargs
    ) -> PGVector:
        """
        Build a PgVector store bound to the given table and collection.

        If no table, schema, or collection is specified, the store defaults are used.
        If no embedding is provided, a new embedding is created from the configured
        embedding model.

        Parameters:
        - table (Optional[str]): The table that stores the embeddings.
        - schema (Optional[str]): The schema where the table lives.
        - collection (Union[str, None]): The name of the collection to use.
        - embedding (Optional[Callable]): The embedding function to use.
        - kwargs: Additional keyword arguments to pass to the PGVector constructor.

        Returns:
        - PGVector: The vector store bound to the specified collection.
        """
        if not table:
            table = self.table
        if not schema:
            schema = self.schema
        if not collection:
            collection = self.collection_name
        if embedding is not None:
            _embed_ = embedding
        else:
            _embed_ = self.create_embedding(
                embedding_model=self.embedding_model
            )
        return PgVector(
            connection=self._connection,
            table_name=table,
            schema=schema,
            id_column=self._id_column,
            collection_name=collection,
            embedding_length=self.dimension,
            embeddings=_embed_.embedding,
            logger=self.logger,
            async_mode=True,
            use_jsonb=True,
            create_extension=False,
            **kwargs
        )

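    # --- Illustrative only: obtaining a LangChain-compatible store for an existing
    # --- table via get_vector() above (names are stand-ins):
    #
    #   vector_db = store.get_vector(table="my_embeddings", schema="public")
    #   retriever = vector_db.as_retriever(search_kwargs={"k": 4})
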
    def memory_retriever(
        self,
        documents: Optional[List[Document]] = None,
        num_results: int = 5
    ) -> VectorStoreRetrieverMemory:
        _embed_ = self._embed_ or self.create_embedding(
            embedding_model=self.embedding_model
        )
        vectordb = PgVector.from_documents(
            documents or [],
            embedding=_embed_.embedding,
            connection=self._connection,
            collection_name=self.collection_name,
            embedding_length=self.dimension,
            use_jsonb=True,
            async_mode=True,
            create_extension=False,
        )
        retriever = vectordb.as_retriever(
            search_kwargs=dict(k=num_results)
        )
        return VectorStoreRetrieverMemory(retriever=retriever)

    async def from_documents(
        self,
        documents: List[Document],
        table: Optional[str] = None,
        schema: Optional[str] = 'public',
        collection: Union[str, None] = None,
        **kwargs
    ) -> PGVector:
        """Save Documents as Vectors in VectorStore."""
        _embed_ = self._embed_ or self.create_embedding(
            embedding_model=self.embedding_model
        )
        if not collection:
            collection = self.collection_name
        vectordb = await PgVector.afrom_documents(
            documents,
            connection=self._connection,
            table_name=table,
            schema=schema,
            id_column=self._id_column,
            collection_name=collection,
            embedding=_embed_.embedding,
            embedding_length=self.dimension,
            use_jsonb=True,
            async_mode=True,
        )
        return vectordb

    async def add_documents(
        self,
        documents: List[Document],
        collection: Union[str, None] = None,
        **kwargs
    ) -> None:
        """Save Documents as Vectors in VectorStore."""
        if not collection:
            collection = self.collection_name
        vectordb = self.get_vector(collection=collection, **kwargs)
        # Asynchronously add documents to PGVector
        await vectordb.aadd_documents(documents)

    async def similarity_search(
        self,
        query: str,
        table: Optional[str] = None,
        schema: Optional[str] = None,
        collection: Union[str, None] = None,
        limit: int = 2,
        score_threshold: Optional[float] = None,
        filter: Optional[dict] = None,
        **kwargs
    ) -> List[Document]:
        """Search for similar documents in VectorStore."""
        if not table:
            table = self.table
        if not schema:
            schema = self.schema
        if collection is None:
            collection = self.collection_name
        async with self:
            vector_db = self.get_vector(table=table, schema=schema, collection=collection, **kwargs)
            return await vector_db.asimilarity_search(
                query,
                k=limit,
                score_threshold=score_threshold,
                filter=filter
            )

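    # --- Illustrative end-to-end flow (not part of the package). The DSN, model
    # --- name, and table are stand-ins, and AbstractStore may require further
    # --- configuration:
    #
    #   store = PgvectorStore(
    #       embedding_model="sentence-transformers/all-mpnet-base-v2",
    #       dsn="postgresql+asyncpg://user:pass@localhost/db",
    #       table="my_embeddings",
    #       schema="public",
    #   )
    #   await store.connection()
    #   await store.add_documents(documents)
    #   hits = await store.similarity_search("quarterly revenue", limit=5)
    #   await store.disconnect()
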
    async def create_embedding_table(
        self,
        table: str,
        columns: List[str],
        schema: str = 'public',
        embedding_column: str = 'embedding',
        document_column: str = 'document',
        metadata_column: str = 'metadata',
        id_column: str = 'id',
        dimension: int = 768,
        use_jsonb: bool = False,
        drop_columns: bool = False,
        **kwargs
    ):
        """
        Create an embedding column and vectorize Table information.
        """
        tablename = f'{schema}.{table}'
        cols = ', '.join(columns)
        _qry = f'SELECT {cols} FROM {tablename};'
        # Generate a sample embedding to determine its dimension
        sample_vector = self._embed_.embedding.embed_query("sample text")
        vector_dim = len(sample_vector)
        # Compare it with the expected dimension
        if vector_dim != dimension:
            raise ValueError(
                f"Expected embedding dimension {dimension}, but got {vector_dim}"
            )
        async with self._connection.begin() as conn:
            result = await conn.execute(
                sqlalchemy.text(_qry)
            )
            rows = result.fetchall()
            # If drop_columns is set, first remove any existing generated columns:
            if drop_columns:
                for column in (document_column, embedding_column, metadata_column):
                    await conn.execute(
                        sqlalchemy.text(
                            f'ALTER TABLE {tablename} DROP COLUMN IF EXISTS {column};'
                        )
                    )
            # Create a new column for embeddings
            if use_jsonb:
                await conn.execute(
                    sqlalchemy.text(
                        f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {embedding_column} JSONB;'  # pylint: disable=C0301
                    )
                )
            else:
                # Use the pgvector vector type:
                await conn.execute(
                    sqlalchemy.text(
                        f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {embedding_column} vector({dimension});'  # pylint: disable=C0301
                    )
                )
                # Create an HNSW index for the vector column:
                # TODO: define index algorithm and options.
                await conn.execute(
                    sqlalchemy.text(
                        f"CREATE INDEX IF NOT EXISTS idx_{schema}_{table}_embeddings ON {tablename} USING hnsw ({embedding_column} vector_l2_ops);"  # pylint: disable=C0301
                    )
                )
                # And also an IVFFlat index:
                await conn.execute(
                    sqlalchemy.text(
                        f"CREATE INDEX IF NOT EXISTS idx_{schema}_{table}_ivflat ON {tablename} USING ivfflat ({embedding_column} vector_cosine_ops);"  # pylint: disable=C0301
                    )
                )
            # Then, create the document, metadata, and id columns (if required):
            # Text info column (content)
            await conn.execute(
                sqlalchemy.text(
                    f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {document_column} TEXT;'
                )
            )
            # Metadata column (JSONB):
            await conn.execute(
                sqlalchemy.text(
                    f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {metadata_column} jsonb;'
                )
            )
            # ID column (if required):
            await conn.execute(
                sqlalchemy.text(
                    f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {id_column} varchar;'
                )
            )
            for row in rows:
                _id = getattr(row, id_column)
                metadata = {col: getattr(row, col) for col in columns}
                # Concatenate column names and values to form the input string:
                # 'store_name: BestBuy location_code: 123456 ...'
                data = " ".join([f"{col}: {metadata[col]}" for col in columns])
                # Get the vector information from data:
                vector = self._embed_.embedding.embed_query(data)
                vector_str = "[" + ",".join(str(v) for v in vector) + "]"
                await conn.execute(
                    sqlalchemy.text(f"""
                        UPDATE {tablename}
                        SET {embedding_column} = :vector, {document_column} = :info, {metadata_column} = :metadata
                        WHERE {id_column} = :id
                    """),
                    {"vector": vector_str, "id": _id, "info": data, "metadata": json_encoder(metadata)}
                )
        print("✅ Updated Table embeddings.")
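
    # --- Illustrative call (not part of the package): vectorizing an existing table.
    # --- Table and column names are stand-ins, and the store must already be connected:
    #
    #   await store.create_embedding_table(
    #       table="stores",
    #       columns=["store_name", "location_code", "city"],
    #       schema="public",
    #       dimension=768,
    #   )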