ai-parrot 0.8.3__cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ai-parrot might be problematic. Click here for more details.

Files changed (128) hide show
  1. ai_parrot-0.8.3.dist-info/LICENSE +21 -0
  2. ai_parrot-0.8.3.dist-info/METADATA +306 -0
  3. ai_parrot-0.8.3.dist-info/RECORD +128 -0
  4. ai_parrot-0.8.3.dist-info/WHEEL +6 -0
  5. ai_parrot-0.8.3.dist-info/top_level.txt +2 -0
  6. parrot/__init__.py +30 -0
  7. parrot/bots/__init__.py +5 -0
  8. parrot/bots/abstract.py +1115 -0
  9. parrot/bots/agent.py +492 -0
  10. parrot/bots/basic.py +9 -0
  11. parrot/bots/bose.py +17 -0
  12. parrot/bots/chatbot.py +271 -0
  13. parrot/bots/cody.py +17 -0
  14. parrot/bots/copilot.py +117 -0
  15. parrot/bots/data.py +730 -0
  16. parrot/bots/dataframe.py +103 -0
  17. parrot/bots/hrbot.py +15 -0
  18. parrot/bots/interfaces/__init__.py +1 -0
  19. parrot/bots/interfaces/retrievers.py +12 -0
  20. parrot/bots/notebook.py +619 -0
  21. parrot/bots/odoo.py +17 -0
  22. parrot/bots/prompts/__init__.py +41 -0
  23. parrot/bots/prompts/agents.py +91 -0
  24. parrot/bots/prompts/data.py +214 -0
  25. parrot/bots/retrievals/__init__.py +1 -0
  26. parrot/bots/retrievals/constitutional.py +19 -0
  27. parrot/bots/retrievals/multi.py +122 -0
  28. parrot/bots/retrievals/retrieval.py +610 -0
  29. parrot/bots/tools/__init__.py +7 -0
  30. parrot/bots/tools/eda.py +325 -0
  31. parrot/bots/tools/pdf.py +50 -0
  32. parrot/bots/tools/plot.py +48 -0
  33. parrot/bots/troc.py +16 -0
  34. parrot/conf.py +170 -0
  35. parrot/crew/__init__.py +3 -0
  36. parrot/crew/tools/__init__.py +22 -0
  37. parrot/crew/tools/bing.py +13 -0
  38. parrot/crew/tools/config.py +43 -0
  39. parrot/crew/tools/duckgo.py +62 -0
  40. parrot/crew/tools/file.py +24 -0
  41. parrot/crew/tools/google.py +168 -0
  42. parrot/crew/tools/gtrends.py +16 -0
  43. parrot/crew/tools/md2pdf.py +25 -0
  44. parrot/crew/tools/rag.py +42 -0
  45. parrot/crew/tools/search.py +32 -0
  46. parrot/crew/tools/url.py +21 -0
  47. parrot/exceptions.cpython-39-x86_64-linux-gnu.so +0 -0
  48. parrot/handlers/__init__.py +4 -0
  49. parrot/handlers/agents.py +292 -0
  50. parrot/handlers/bots.py +196 -0
  51. parrot/handlers/chat.py +192 -0
  52. parrot/interfaces/__init__.py +6 -0
  53. parrot/interfaces/database.py +27 -0
  54. parrot/interfaces/http.py +805 -0
  55. parrot/interfaces/images/__init__.py +0 -0
  56. parrot/interfaces/images/plugins/__init__.py +18 -0
  57. parrot/interfaces/images/plugins/abstract.py +58 -0
  58. parrot/interfaces/images/plugins/exif.py +709 -0
  59. parrot/interfaces/images/plugins/hash.py +52 -0
  60. parrot/interfaces/images/plugins/vision.py +104 -0
  61. parrot/interfaces/images/plugins/yolo.py +66 -0
  62. parrot/interfaces/images/plugins/zerodetect.py +197 -0
  63. parrot/llms/__init__.py +1 -0
  64. parrot/llms/abstract.py +69 -0
  65. parrot/llms/anthropic.py +58 -0
  66. parrot/llms/gemma.py +15 -0
  67. parrot/llms/google.py +44 -0
  68. parrot/llms/groq.py +67 -0
  69. parrot/llms/hf.py +45 -0
  70. parrot/llms/openai.py +61 -0
  71. parrot/llms/pipes.py +114 -0
  72. parrot/llms/vertex.py +89 -0
  73. parrot/loaders/__init__.py +9 -0
  74. parrot/loaders/abstract.py +628 -0
  75. parrot/loaders/files/__init__.py +0 -0
  76. parrot/loaders/files/abstract.py +39 -0
  77. parrot/loaders/files/text.py +63 -0
  78. parrot/loaders/txt.py +26 -0
  79. parrot/manager.py +333 -0
  80. parrot/models.py +504 -0
  81. parrot/py.typed +0 -0
  82. parrot/stores/__init__.py +11 -0
  83. parrot/stores/abstract.py +248 -0
  84. parrot/stores/chroma.py +188 -0
  85. parrot/stores/duck.py +162 -0
  86. parrot/stores/embeddings/__init__.py +10 -0
  87. parrot/stores/embeddings/abstract.py +46 -0
  88. parrot/stores/embeddings/base.py +52 -0
  89. parrot/stores/embeddings/bge.py +20 -0
  90. parrot/stores/embeddings/fastembed.py +17 -0
  91. parrot/stores/embeddings/google.py +18 -0
  92. parrot/stores/embeddings/huggingface.py +20 -0
  93. parrot/stores/embeddings/ollama.py +14 -0
  94. parrot/stores/embeddings/openai.py +26 -0
  95. parrot/stores/embeddings/transformers.py +21 -0
  96. parrot/stores/embeddings/vertexai.py +17 -0
  97. parrot/stores/empty.py +10 -0
  98. parrot/stores/faiss.py +160 -0
  99. parrot/stores/milvus.py +397 -0
  100. parrot/stores/postgres.py +653 -0
  101. parrot/stores/qdrant.py +170 -0
  102. parrot/tools/__init__.py +23 -0
  103. parrot/tools/abstract.py +68 -0
  104. parrot/tools/asknews.py +33 -0
  105. parrot/tools/basic.py +51 -0
  106. parrot/tools/bby.py +359 -0
  107. parrot/tools/bing.py +13 -0
  108. parrot/tools/docx.py +343 -0
  109. parrot/tools/duck.py +62 -0
  110. parrot/tools/execute.py +56 -0
  111. parrot/tools/gamma.py +28 -0
  112. parrot/tools/google.py +170 -0
  113. parrot/tools/gvoice.py +301 -0
  114. parrot/tools/results.py +278 -0
  115. parrot/tools/stack.py +27 -0
  116. parrot/tools/weather.py +70 -0
  117. parrot/tools/wikipedia.py +58 -0
  118. parrot/tools/zipcode.py +198 -0
  119. parrot/utils/__init__.py +2 -0
  120. parrot/utils/parsers/__init__.py +5 -0
  121. parrot/utils/parsers/toml.cpython-39-x86_64-linux-gnu.so +0 -0
  122. parrot/utils/toml.py +11 -0
  123. parrot/utils/types.cpython-39-x86_64-linux-gnu.so +0 -0
  124. parrot/utils/uv.py +11 -0
  125. parrot/version.py +10 -0
  126. resources/users/__init__.py +5 -0
  127. resources/users/handlers.py +13 -0
  128. resources/users/models.py +205 -0
@@ -0,0 +1,653 @@
1
+ """
2
+ Powerful PostgreSQL Vector Database Store with Custom Table Support.
3
+ """
4
+ from typing import (
5
+ Any,
6
+ Dict,
7
+ List,
8
+ Tuple,
9
+ Union,
10
+ Optional,
11
+ Sequence
12
+ )
13
+ from collections.abc import Callable
14
+ import asyncio
15
+ import uuid
16
+ # SQL Alchemy
17
+ import sqlalchemy
18
+ from sqlalchemy import inspect, text
19
+ from sqlalchemy.ext.declarative import declarative_base
20
+ from sqlalchemy import Column, String, ARRAY, Float, JSON, text, func
21
+ from sqlalchemy.dialects.postgresql import JSON, JSONB, JSONPATH, UUID, insert
22
+ from sqlalchemy.ext.asyncio import create_async_engine, AsyncEngine
23
+ from sqlalchemy.future import select
24
+ from sqlalchemy.ext.asyncio import AsyncSession
25
+ # PgVector
26
+ from pgvector.sqlalchemy import Vector # type: ignore
27
+ # Langchain
28
+ from langchain_core.embeddings import Embeddings
29
+ from langchain.docstore.document import Document
30
+ from langchain.memory import VectorStoreRetrieverMemory
31
+ from langchain_community.vectorstores.pgembedding import PGEmbedding
32
+ from langchain_community.vectorstores.utils import DistanceStrategy
33
+ from langchain_postgres.vectorstores import (
34
+ PGVector,
35
+ _get_embedding_collection_store,
36
+ _results_to_docs
37
+ )
38
+ from datamodel.parsers.json import json_encoder # pylint: disable=E0611
39
+ from .abstract import AbstractStore
40
+
41
+
42
+
43
+ Base = declarative_base()
44
+
45
+
46
+ # Define the async classmethods to be attached to our ORM model.
47
async def aget_by_name(cls, session: AsyncSession, name: str) -> Optional["CustomEmbeddingStore"]:
    """Return a placeholder "collection" object for the single-table layout.

    This coroutine is attached as a classmethod to the dynamically built ORM
    model. No query is issued: ``session`` and ``name`` are accepted only to
    satisfy the expected call signature and are otherwise ignored.

    Returns:
        A new, unsaved instance of ``cls`` created with an empty ``cmetadata``.
    """
    placeholder = cls(cmetadata={})
    return placeholder
51
+
52
+
53
+ class PgVector(PGVector):
54
+ """
55
+ PgVector extends PGVector so that it uses an existing table from a specified schema.
56
+
57
+ When instantiating, you provide:
58
+ - connection: an AsyncEngine (or synchronous engine) to your PostgreSQL database.
59
+ - schema: the database schema where your table lives.
60
+ - table_name: the name of the table that stores the embeddings.
61
+ - embedding_length: the dimension of the embedding vectors.
62
+ - embeddings: your embedding function/model (which must provide embed_query).
63
+
64
+ This implementation overrides the _get_embedding_collection_store method to return a tuple of
65
+ ORM model classes that both refer to your table. The table is expected to contain the columns
66
+ 'id' (or the configured id column), 'embedding', 'document', and 'cmetadata'; no validation of the table is performed here.
67
+
68
+ The returned ORM models can then be used by PGVector’s built-in similarity search and retriever.
69
+ """
70
def __init__(
    self,
    embeddings: Embeddings,
    *,
    table_name: str = None,
    schema: str = 'public',
    collection_name: Optional[str] = None,
    id_column: str = 'id',
    **kwargs
) -> None:
    """Configure the store and decide whether it is bound to an existing table.

    The store is "schema based" (backed by one existing table) when either
    ``table_name`` is given, or ``collection_name`` has the form
    ``"schema.table"``. Otherwise it behaves like a regular collection store.

    Args:
        embeddings: Embedding model forwarded to the parent constructor.
        table_name: Name of the existing table holding the embeddings.
        schema: Schema of that table; overridden by a dotted ``collection_name``
            when ``table_name`` is not given.
        collection_name: Collection name, optionally ``"schema.table"``.
        id_column: Name of the primary-key column of the table.
        **kwargs: Forwarded unchanged to the parent constructor.

    Raises:
        ValueError: If a dotted ``collection_name`` is not exactly
            ``"schema.table"``.
    """
    self.table_name = table_name
    self.schema = schema
    self._id_column: str = id_column
    self._schema_based: bool = bool(table_name)
    # ``collection_name`` may legitimately be None; guard before the ``in`` test
    # (the unguarded test raised TypeError for None).
    if not self._schema_based and collection_name and '.' in collection_name:
        parts = collection_name.split('.')
        if len(parts) != 2:
            raise ValueError(
                f"collection_name must be 'schema.table', got {collection_name!r}"
            )
        self.schema, self.table_name = parts
        self._schema_based = True
    super().__init__(
        embeddings=embeddings,
        collection_name=collection_name,
        **kwargs
    )
94
+
95
async def _get_embedding_collection_store(
    self,
    table: str,
    schema: str,
    dimension: int = 768,
    **kwargs
) -> Tuple[type, type]:
    """
    Return custom ORM model classes (EmbeddingStore, CollectionStore)
    that both reference the same table.

    In this custom implementation, both the "collection" and "embedding" stores
    are represented by a single table, so the same class is returned twice.
    The table is expected to have the following columns:
    - <id_column>: unique identifier (String), named by ``self._id_column``
    - embedding: the vector column (Vector(dimension))
    - document: text column containing the document
    - cmetadata: JSONB column for metadata

    The table itself is not inspected or validated here.

    Models are cached per (schema, table, id column, dimension) so that
    several store instances pointing at the same table share one mapped class
    instead of re-declaring the table on the shared declarative ``Base``.
    """
    # Cache lives on the shared metadata so it spans every store instance.
    cache = Base.metadata.info.setdefault('_embedding_models', {})
    cache_key = (schema, table, self._id_column, dimension)
    model = cache.get(cache_key)
    if model is not None:
        return (model, model)

    # Dynamically create the model class.
    attrs = {
        '__tablename__': table,
        # extend_existing: tolerate the table being already registered on
        # Base.metadata (e.g. same table requested with another dimension).
        '__table_args__': {"schema": schema, "extend_existing": True},
        self._id_column: sqlalchemy.Column(
            sqlalchemy.String,
            primary_key=True,
            index=True,
            unique=True,
            default=lambda: str(uuid.uuid4())
        ),
        'embedding': sqlalchemy.Column(Vector(dimension)),
        'document': sqlalchemy.Column(sqlalchemy.String, nullable=True),
        'cmetadata': sqlalchemy.Column(JSONB, nullable=True),
        # Attach the async classmethods.
        'aget_by_name': classmethod(aget_by_name),
    }
    model = type("CustomEmbeddingStore", (Base,), attrs)
    # Result rows are read through the ``EmbeddingStore`` attribute, which
    # follows the entity class name.
    model.__name__ = "EmbeddingStore"
    cache[cache_key] = model
    return (model, model)
137
+
138
async def __apost_init__(
    self,
) -> None:
    """Lazily perform the asynchronous part of the store initialisation.

    Runs at most once per instance (guarded by ``self._async_init``).
    For a schema-based store the ORM models are built from the configured
    table; otherwise the default models are used and the tables and the
    collection are created if missing.
    """
    # NOTE: the flag is checked and set without a lock, so concurrent first
    # calls may race.
    if self._async_init:
        return
    self._async_init = True

    if not self._schema_based:
        embedding_model, collection_model = _get_embedding_collection_store(
            self._embedding_length
        )
        self.CollectionStore = collection_model
        self.EmbeddingStore = embedding_model
        await self.acreate_tables_if_not_exists()
        await self.acreate_collection()
        return

    embedding_model, collection_model = await self._get_embedding_collection_store(
        table=self.table_name,
        schema=self.schema,
        dimension=self._embedding_length
    )
    self.CollectionStore = collection_model
    self.EmbeddingStore = embedding_model
161
+
162
async def asimilarity_search(
    self,
    query: str,
    k: int = 4,
    score_threshold: Optional[float] = None,
    filter: Optional[dict] = None,
    **kwargs: Any,
) -> List[Document]:
    """Embed ``query`` and return the most similar documents.

    Args:
        query (str): Query text to search for.
        k (int): Number of results to return. Defaults to 4.
        score_threshold (Optional[float]): Forwarded to the vector search.
        filter (Optional[dict]): Filter by metadata. Defaults to None.
        **kwargs: Accepted but not forwarded.

    Returns:
        List of Documents most similar to the query.
    """
    await self.__apost_init__()  # Lazy async init
    query_vector = await self.embeddings.aembed_query(query)
    search_args = {
        "embedding": query_vector,
        "k": k,
        "score_threshold": score_threshold,
        "filter": filter,
    }
    return await self.asimilarity_search_by_vector(**search_args)
188
+
189
async def asimilarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    score_threshold: Optional[float] = None,
    filter: Optional[dict] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return the documents closest to an embedding vector.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        score_threshold: Forwarded to the scored search.
        filter (Optional[dict]): Filter by metadata. Defaults to None.
        **kwargs: Accepted but not forwarded.

    Returns:
        List of Documents most similar to the query vector (scores dropped).
    """
    assert self._async_engine, "This method must be called with async_mode"
    await self.__apost_init__()  # Lazy async init
    scored_docs = await self.asimilarity_search_with_score_by_vector(
        embedding=embedding,
        k=k,
        score_threshold=score_threshold,
        filter=filter,
    )
    documents = _results_to_docs(scored_docs)
    return documents
213
+
214
async def asimilarity_search_with_score_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    score_threshold: Optional[float] = None,
    filter: Optional[dict] = None,
) -> List[Tuple[Document, float]]:
    """Return ``(document, distance)`` pairs for the rows closest to ``embedding``.

    Opens an async session, runs the collection query and converts the
    resulting rows into documents with their distance.
    """
    await self.__apost_init__()  # Lazy async init
    query_args = {
        "embedding": embedding,
        "k": k,
        "score_threshold": score_threshold,
        "filter": filter,
    }
    async with self._make_async_session() as session:  # type: ignore[arg-type]
        rows = await self._aquery_collection(session=session, **query_args)
        return self._results_to_docs_and_scores(rows)
227
+
228
async def _aquery_collection(
    self,
    session: AsyncSession,
    embedding: List[float],
    k: int = 4,
    score_threshold: Optional[float] = None,
    filter: Optional[Dict[str, str]] = None,
) -> Sequence[Any]:
    """Search for similar documents in the collection.

    If score_threshold is provided, returns all rows whose computed distance is
    below that threshold (``k`` is ignored). Otherwise returns at most k rows.
    Rows are always ordered by ascending distance.

    Args:
        session: Open async session used to execute the query.
        embedding: Query vector.
        k: Maximum number of rows when no threshold is given.
        score_threshold: Optional upper bound (exclusive) on the distance.
        filter: Optional metadata filter.

    Returns:
        Result rows exposing the ORM entity and a ``distance`` column.
    """
    filter_by = []
    if filter:
        if self.use_jsonb:
            filter_clause = self._create_filter_clause(filter)
            if filter_clause is not None:
                filter_by.append(filter_clause)
        else:
            # Non-JSONB metadata falls back to the deprecated clause builder.
            filter_by.extend(self._create_filter_clause_json_deprecated(filter))

    # Build the distance expression once and reuse it for SELECT/WHERE/ORDER BY.
    distance_expr = self.distance_strategy(embedding).label("distance")
    stmt = sqlalchemy.select(self.EmbeddingStore, distance_expr).filter(*filter_by)
    if score_threshold is not None:
        stmt = stmt.filter(distance_expr < score_threshold)
    stmt = stmt.order_by(sqlalchemy.asc(distance_expr))
    if score_threshold is None:
        stmt = stmt.limit(k)

    # Use the caller's session instead of opening a second, nested one.
    results: Sequence[Any] = (await session.execute(stmt)).all()
    return results
273
+
274
def _results_to_docs_and_scores(self, results: Any) -> List[Tuple[Document, float]]:
    """Convert query rows into ``(Document, distance)`` pairs.

    Each row must expose ``EmbeddingStore`` (the ORM entity) and ``distance``.
    The document id is read from the configured id column (``"id"`` when the
    instance has no ``_id_column``). The score is ``None`` when the store has
    no embeddings object.
    """
    id_attr = getattr(self, "_id_column", "id")
    pairs = []
    for row in results:
        document = Document(
            id=str(getattr(row.EmbeddingStore, id_attr)),
            page_content=row.EmbeddingStore.document,
            metadata=row.EmbeddingStore.cmetadata,
        )
        if self.embeddings is not None:
            score = row.distance
        else:
            score = None
        pairs.append((document, score))
    return pairs
289
+
290
+
291
+ class PgvectorStore(AbstractStore):
292
+ """Pgvector Store Class.
293
+
294
+ Uses PostgreSQL + pgvector to save vectors in the database.
295
+ """
296
def __init__(
    self,
    embedding_model: Union[dict, str] = None,
    embedding: Union[dict, Callable] = None,
    **kwargs
):
    """Initialise the store from keyword options.

    Recognised options (read from ``kwargs``): ``table``, ``schema``
    (default ``'public'``), ``id_column`` (default ``'id'``), ``dsn``
    (default ``self.database``) and ``drop`` (default ``False``).
    When a table is given and no collection name is set, the collection name
    becomes ``"<schema>.<table>"``.
    """
    super().__init__(
        embedding_model=embedding_model,
        embedding=embedding,
        **kwargs
    )
    options = kwargs
    self.table: str = options.get('table')
    self.schema: str = options.get('schema', 'public')
    self._id_column = options.get('id_column', 'id')
    needs_default_name = bool(self.table) and not self.collection_name
    if needs_default_name:
        self.collection_name = f"{self.schema}.{self.table}"
    self.dsn = options.get('dsn', self.database)
    self._drop: bool = options.pop('drop', False)
    # Engine is created later by ``connection()``.
    self._connection: AsyncEngine = None
315
+
316
async def collection_exists(self, collection: str = None) -> bool:
    """Check whether a table backing the collection exists in the database.

    Args:
        collection: Collection name, either ``"table"`` or ``"schema.table"``.
            Defaults to ``self.collection_name``.

    Returns:
        True if ``information_schema.tables`` lists a matching table. A plain
        name matches in any schema; a dotted name must match schema and table.
    """
    if not collection:
        collection = self.collection_name
    # Split an optional schema prefix so "schema.table" is matched correctly.
    schema_name, _, table_name = str(collection).rpartition('.')
    conditions = ["table_name = :table_name"]
    params = {"table_name": table_name}
    if schema_name:
        conditions.append("table_schema = :table_schema")
        params["table_schema"] = schema_name
    # Bound parameters instead of string interpolation (avoids SQL injection).
    check_query = sqlalchemy.text(
        "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE "
        + " AND ".join(conditions)
        + ");"
    )
    async with self._connection.connect() as conn:
        result = await conn.execute(check_query, params)
        return bool(result.scalar())
330
+
331
async def connection(self, alias: str = None):
    """Create the async SQLAlchemy engine and prepare the collection.

    When the store was configured with ``drop=True`` the vector tables are
    dropped first. If the collection is not found afterwards,
    ``create_collection`` is called for it.

    Args:
        alias (str): Database alias (currently unused).

    Returns:
        AsyncEngine: the engine created from ``self.dsn``.
    """
    self._connection = create_async_engine(self.dsn, future=True, echo=False)
    if getattr(self, "_drop", False):
        # Same embedding fallback as the other methods of this class.
        _embed_ = self._embed_ or self.create_embedding(
            embedding_model=self.embedding_model
        )
        vectorstore = PgVector(
            embeddings=_embed_.embedding,
            table_name=self.table,
            schema=self.schema,
            id_column=self._id_column,
            collection_name=self.collection_name,
            embedding_length=self.dimension,
            connection=self._connection,
            use_jsonb=True,
            create_extension=False
        )
        await vectorstore.adrop_tables()
    # The helpers below open their own connections, so no outer transaction
    # is held open here.
    if not await self.collection_exists(self.collection_name):
        print(f"⚠️ Collection `{self.collection_name}` not found. Creating a new one...")
        await self.create_collection(self.collection_name)
    self._connected = True
    return self._connection
361
+
362
def engine(self):
    """Return the engine currently stored on the instance (may be ``None``)."""
    current_engine = self._connection
    return current_engine
364
+
365
async def disconnect(self) -> None:
    """Dispose of the async engine and mark the store as disconnected.

    The connection state is reset even when disposing fails.

    Raises:
        RuntimeError: If disposing the engine raises; the original exception
            is chained as the cause.
    """
    try:
        if self._connection:
            await self._connection.dispose()
    except Exception as err:
        # RuntimeError accepts no keyword arguments; pass the text positionally.
        raise RuntimeError(
            f"{__name__!s}: Closing Error: {err!s}"
        ) from err
    finally:
        self._connection = None
        self._connected = False
379
+
380
async def create_collection(self, collection: str) -> None:
    """Build the PgVector client for a collection and keep it in ``self._client``.

    Args:
        collection: Name of the collection; falls back to
            ``self.collection_name`` when empty.

    NOTE(review): this only instantiates the client; any table/collection
    creation is presumably deferred to the client's lazy async init. Confirm
    against the PGVector implementation in use.
    """
    target = collection or self.collection_name
    _embed_ = self._embed_ or self.create_embedding(
        embedding_model=self.embedding_model
    )
    self._client = PgVector(
        embeddings=_embed_.embedding,
        embedding_length=self.dimension,
        collection_name=target,
        connection=self._connection,
        use_jsonb=True,
        create_extension=False
    )
    print(
        f"✅ Collection `{target}` created successfully."
    )
398
+
399
def get_vector(
    self,
    table: Optional[str] = None,
    schema: Optional[str] = None,
    collection: Union[str, None] = None,
    embedding: Optional[Callable] = None,
    **kwargs
) -> PGVector:
    """
    Build a PgVector store bound to the given table/collection.

    Missing arguments fall back to the values configured on this instance.

    Parameters:
    - table (Optional[str]): Table name; defaults to ``self.table``.
    - schema (Optional[str]): Schema name; defaults to ``self.schema``.
    - collection (Union[str, None]): Collection name; defaults to
      ``self.collection_name``.
    - embedding (Optional[Callable]): Embedding wrapper exposing an
      ``.embedding`` attribute. When omitted, the instance's cached embedding
      is reused, or a new one is created from ``self.embedding_model``.
    - kwargs: Additional keyword arguments passed to the PgVector constructor.

    Returns:
    - PGVector: A new store instance in async mode (no query is executed here).
    """
    if not table:
        table = self.table
    if not schema:
        schema = self.schema
    if not collection:
        collection = self.collection_name
    if embedding is not None:
        _embed_ = embedding
    else:
        # Reuse the cached embedding like the sibling methods do, instead of
        # rebuilding the model on every call.
        _embed_ = self._embed_ or self.create_embedding(
            embedding_model=self.embedding_model
        )
    return PgVector(
        connection=self._connection,
        table_name=table,
        schema=schema,
        id_column=self._id_column,
        collection_name=collection,
        embedding_length=self.dimension,
        embeddings=_embed_.embedding,
        logger=self.logger,
        async_mode=True,
        use_jsonb=True,
        create_extension=False,
        **kwargs
    )
446
+
447
def memory_retriever(
    self,
    documents: Optional[List[Document]] = None,
    num_results: int = 5
) -> VectorStoreRetrieverMemory:
    """Build a retriever-backed memory on top of a PgVector store.

    Args:
        documents: Documents used to seed the store (an empty list when omitted).
        num_results: Number of results the retriever returns per query (``k``).

    Returns:
        A ``VectorStoreRetrieverMemory`` wrapping the store's retriever.
    """
    embedder = self._embed_ or self.create_embedding(
        embedding_model=self.embedding_model
    )
    seed_documents = documents or []
    store = PgVector.from_documents(
        seed_documents,
        embedding=embedder.embedding,
        connection=self._connection,
        collection_name=self.collection_name,
        embedding_length=self.dimension,
        use_jsonb=True,
        async_mode=True,
        create_extension=False,
    )
    search_options = {"k": num_results}
    retriever = PgVector.as_retriever(store, search_kwargs=search_options)
    return VectorStoreRetrieverMemory(retriever=retriever)
470
+
471
async def from_documents(
    self,
    documents: List[Document],
    table: Optional[str] = None,
    schema: Optional[str] = 'public',
    collection: Union[str, None] = None,
    **kwargs
) -> None:
    """Save documents as vectors through ``PgVector.afrom_documents``.

    Args:
        documents: Documents to embed and store.
        table: Table name passed to the store (no fallback is applied).
        schema: Schema name passed to the store.
        collection: Collection name; defaults to ``self.collection_name``.
        **kwargs: Accepted but not forwarded.

    Returns:
        The store created by ``PgVector.afrom_documents`` (the ``None``
        annotation notwithstanding).
    """
    embedder = self._embed_ or self.create_embedding(
        embedding_model=self.embedding_model
    )
    target_collection = collection or self.collection_name
    store_options = {
        "connection": self._connection,
        "table_name": table,
        "schema": schema,
        "id_column": self._id_column,
        "collection_name": target_collection,
        "embedding": embedder.embedding,
        "embedding_length": self.dimension,
        "use_jsonb": True,
        "async_mode": True,
    }
    return await PgVector.afrom_documents(documents, **store_options)
498
+
499
async def add_documents(
    self,
    documents: List[Document],
    collection: Union[str, None] = None,
    **kwargs
) -> None:
    """Add documents to the vector store of a collection.

    Args:
        documents: Documents to embed and store.
        collection: Collection name; defaults to ``self.collection_name``.
        **kwargs: Forwarded to ``get_vector``.
    """
    target_collection = collection if collection else self.collection_name
    store = self.get_vector(collection=target_collection, **kwargs)
    # aadd_documents is the async insert entry point of the store.
    await store.aadd_documents(documents)
511
+
512
async def similarity_search(
    self,
    query: str,
    table: Optional[str] = None,
    schema: Optional[str] = None,
    collection: Union[str, None] = None,
    limit: int = 2,
    score_threshold: Optional[float] = None,
    filter: Optional[dict] = None,
    **kwargs
) -> List[Document]:
    """Search the vector store for documents similar to ``query``.

    Args:
        query: Text to search for.
        table: Table name; defaults to ``self.table``.
        schema: Schema name; defaults to ``self.schema``.
        collection: Collection name; defaults to ``self.collection_name``
            when ``None``.
        limit: Maximum number of documents (passed as ``k``).
        score_threshold: Optional distance threshold passed to the search.
        filter: Optional metadata filter.
        **kwargs: Forwarded to ``get_vector``.

    Returns:
        The documents returned by the store's ``asimilarity_search``.
    """
    table = table or self.table
    schema = schema or self.schema
    collection = self.collection_name if collection is None else collection
    # The store is used as an async context manager around the search.
    async with self:
        store = self.get_vector(
            table=table, schema=schema, collection=collection, **kwargs
        )
        matches = await store.asimilarity_search(
            query,
            k=limit,
            score_threshold=score_threshold,
            filter=filter
        )
        return matches
538
+
539
async def create_embedding_table(
    self,
    table: str,
    columns: List[str],
    schema: str = 'public',
    embedding_column: str = 'embedding',
    document_column: str = 'document',
    metadata_column: str = 'metadata',
    id_column: str = 'id',
    dimension: int = 768,
    use_jsonb: bool = False,
    drop_columns: bool = False,
    **kwargs
):
    """
    Create an embedding column and vectorize Table information.

    For every row of ``schema.table`` the values of ``columns`` are joined as
    ``"col: value"`` pairs, embedded, and written back together with the text
    and a JSON copy of the values. Everything runs in a single transaction.

    Args:
        table: Table to vectorize.
        columns: Columns whose values form the embedded text and the metadata.
        schema: Schema of the table.
        embedding_column: Column receiving the vector.
        document_column: Column receiving the embedded text.
        metadata_column: Column receiving the JSON metadata.
        id_column: Column used to match rows in the UPDATE; it is added as
            ``varchar`` when missing.
        dimension: Expected embedding dimension.
        use_jsonb: Store the embedding in a JSONB column instead of a pgvector
            ``vector`` column (no vector index is created in that case).
        drop_columns: Drop the document/embedding/metadata columns first.
        **kwargs: Unused.

    Raises:
        ValueError: If the embedding model does not produce ``dimension`` values.
    """
    # NOTE(review): table/column identifiers are interpolated into the SQL
    # text (identifiers cannot be bound); they must come from trusted input.
    tablename = f'{schema}.{table}'
    embedder = self._embed_.embedding

    # Generate a sample embedding to determine its dimension
    vector_dim = len(embedder.embed_query("sample text"))
    if vector_dim != dimension:
        # Report the value actually compared against.
        raise ValueError(
            f"Expected embedding dimension {dimension}, but got {vector_dim}"
        )

    # The id column is needed to address each row in the UPDATE below.
    select_columns = list(columns)
    if id_column not in select_columns:
        select_columns.append(id_column)
    cols = ', '.join(select_columns)
    _qry = f'SELECT {cols} FROM {tablename};'

    ddl = []
    if drop_columns:
        ddl.extend(
            f'ALTER TABLE {tablename} DROP COLUMN IF EXISTS {column};'
            for column in (document_column, embedding_column, metadata_column)
        )
    if use_jsonb:
        ddl.append(
            f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {embedding_column} JSONB;'
        )
    else:
        # pgvector column plus its indexes (vector operator classes only
        # apply to the vector type).
        # TODO: define index algorithm and options.
        ddl.append(
            f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {embedding_column} vector({dimension});'  # pylint: disable=C0301
        )
        ddl.append(
            f"CREATE INDEX IF NOT EXISTS idx_{schema}_{table}_embeddings ON {tablename} USING hnsw ({embedding_column} vector_l2_ops);"  # pylint: disable=C0301
        )
        ddl.append(
            f"CREATE INDEX IF NOT EXISTS idx_{schema}_{table}_ivflat ON {tablename} USING ivfflat ({embedding_column} vector_cosine_ops);"  # pylint: disable=C0301
        )
    # Text (content) and metadata columns.
    ddl.append(
        f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {document_column} TEXT;'
    )
    ddl.append(
        f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {metadata_column} jsonb;'
    )

    update_stmt = sqlalchemy.text(f"""
        UPDATE {tablename}
        SET {embedding_column} = :vector, {document_column} = :info, {metadata_column} = :metadata
        WHERE {id_column} = :id
    """)

    async with self._connection.begin() as conn:
        # Make sure the id column exists before it is selected.
        await conn.execute(
            sqlalchemy.text(
                f'ALTER TABLE {tablename} ADD COLUMN IF NOT EXISTS {id_column} varchar;'
            )
        )
        # Read the source rows before any column is dropped.
        result = await conn.execute(sqlalchemy.text(_qry))
        rows = result.fetchall()
        for statement in ddl:
            await conn.execute(sqlalchemy.text(statement))
        for row in rows:
            metadata = {col: getattr(row, col) for col in columns}
            # 'store_name: BestBuy location_code: 123456 ...'
            data = " ".join([f"{col}: {metadata[col]}" for col in columns])
            vector = embedder.embed_query(data)
            vector_str = "[" + ",".join(str(v) for v in vector) + "]"
            await conn.execute(
                update_stmt,
                {
                    "vector": vector_str,
                    "id": getattr(row, id_column),
                    "info": data,
                    "metadata": json_encoder(metadata),
                }
            )
    print("✅ Updated Table embeddings.")