MindsDB 25.1.2.1__py3-none-any.whl → 25.1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of MindsDB as possibly problematic.
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/METADATA +246 -255
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/RECORD +94 -83
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +5 -3
- mindsdb/api/executor/__init__.py +0 -1
- mindsdb/api/executor/command_executor.py +2 -1
- mindsdb/api/executor/data_types/answer.py +1 -1
- mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +8 -3
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +9 -26
- mindsdb/api/executor/sql_query/__init__.py +1 -0
- mindsdb/api/executor/sql_query/result_set.py +36 -21
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
- mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
- mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
- mindsdb/api/executor/utilities/sql.py +2 -10
- mindsdb/api/http/namespaces/agents.py +3 -1
- mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
- mindsdb/api/http/namespaces/sql.py +3 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
- mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
- mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
- mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
- mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
- mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +50 -16
- mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
- mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
- mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
- mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +2 -2
- mindsdb/integrations/utilities/files/__init__.py +0 -0
- mindsdb/integrations/utilities/files/file_reader.py +258 -0
- mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
- mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
- mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
- mindsdb/integrations/utilities/rag/pipelines/rag.py +74 -21
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +108 -78
- mindsdb/integrations/utilities/rag/settings.py +37 -16
- mindsdb/integrations/utilities/sql_utils.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +18 -8
- mindsdb/interfaces/agents/constants.py +1 -0
- mindsdb/interfaces/agents/langchain_agent.py +124 -157
- mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -37
- mindsdb/interfaces/agents/mindsdb_database_agent.py +21 -13
- mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
- mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
- mindsdb/interfaces/chatbot/memory.py +58 -13
- mindsdb/interfaces/database/integrations.py +5 -1
- mindsdb/interfaces/database/projects.py +55 -16
- mindsdb/interfaces/database/views.py +12 -25
- mindsdb/interfaces/knowledge_base/controller.py +39 -15
- mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
- mindsdb/interfaces/model/functions.py +15 -4
- mindsdb/interfaces/model/model_controller.py +4 -7
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +51 -40
- mindsdb/interfaces/skills/retrieval_tool.py +10 -3
- mindsdb/interfaces/skills/skill_tool.py +97 -54
- mindsdb/interfaces/skills/skills_controller.py +7 -3
- mindsdb/interfaces/skills/sql_agent.py +127 -41
- mindsdb/interfaces/storage/db.py +1 -1
- mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
- mindsdb/utilities/cache.py +7 -4
- mindsdb/utilities/context.py +11 -1
- mindsdb/utilities/langfuse.py +279 -0
- mindsdb/utilities/log.py +20 -2
- mindsdb/utilities/otel/__init__.py +206 -0
- mindsdb/utilities/otel/logger.py +25 -0
- mindsdb/utilities/otel/meter.py +19 -0
- mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
- mindsdb/utilities/otel/tracer.py +16 -0
- mindsdb/utilities/partitioning.py +52 -0
- mindsdb/utilities/render/sqlalchemy_render.py +7 -1
- mindsdb/utilities/utils.py +34 -0
- mindsdb/utilities/otel.py +0 -72
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/WHEEL +0 -0
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py
@@ -0,0 +1,82 @@
+from typing import Any, List
+from langchain_core.embeddings import Embeddings
+import requests
+
+
+class FastAPIEmbeddings(Embeddings):
+    """An embedding extension that interfaces with FAST API. Useful for custom serving solutions."""
+
+    def __init__(
+        self,
+        api_base: str,
+        model: str,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        """Initialize the embeddings class.
+
+        Args:
+            api_base: Base URL for the VLLM server
+            model: Model name/path to use for embeddings
+            batch_size: Batch size for generating embeddings
+        """
+        super().__init__()
+        self.api_base = api_base
+        self.model = model
+        self.batch_size = batch_size
+
+        # initialize requests here with the api_base
+
+    def _get_embeddings(self, texts: List[str]) -> List[str]:
+        """Get embeddings for a batch of text chunks.
+
+        Returns:
+            List of embeddings as strings. For sparse vectors, returns strings in format
+            "{key:value,...}/size" where size is the dimension of the vector space.
+        """
+
+        headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+        data = {
+            "input": texts,
+            "model": self.model
+        }
+
+        response = requests.post(self.api_base, headers=headers, json=data)
+
+        response.raise_for_status()
+
+        embeddings = []
+        for response_dict in response.json()["data"]:
+            embedding = response_dict["embedding"]
+            embeddings.append(embedding)
+
+        return embeddings
+
+    def embed_documents(self, texts: List[str]) -> List[str]:
+        """Embed a list of documents using vLLM.
+
+        Args:
+            texts: List of documents to embed
+
+        Returns:
+            List of embeddings as strings, one for each document.
+            For sparse embeddings, returns strings in format "{key:value,...}/size"
+            For dense embeddings, returns JSON strings of float lists
+        """
+
+        return self._get_embeddings(texts)
+
+    def embed_query(self, text: str) -> str:
+        """Embed a single query text using vLLM.
+
+        Args:
+            text: Query text to embed
+
+        Returns:
+            Query embedding as a string.
+            For sparse embeddings, returns string in format "{key:value,...}/size"
+            For dense embeddings, returns JSON string of float list
+        """
+
+        return self._get_embeddings([text])[0]
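For orientation, a minimal usage sketch of the new class; the endpoint URL and model name below are illustrative, and any server that accepts the {"input": ..., "model": ...} payload shown above and returns {"data": [{"embedding": ...}, ...]} fits:

from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings

# Point at a hypothetical OpenAI-style embeddings endpoint; the URL is POSTed to as-is.
embedder = FastAPIEmbeddings(api_base="http://localhost:8000/v1/embeddings", model="my-embedding-model")
doc_vectors = embedder.embed_documents(["first chunk", "second chunk"])
query_vector = embedder.embed_query("a question")

Note that batch_size is accepted and stored but not yet applied: _get_embeddings sends all texts in a single request.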
mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py
@@ -10,6 +10,7 @@ from mindsdb.integrations.libs.base import BaseMLEngine
 from mindsdb.utilities import log
 from langchain_core.embeddings import Embeddings
 from mindsdb.integrations.handlers.langchain_embedding_handler.vllm_embeddings import VLLMEmbeddings
+from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings
 
 logger = log.getLogger(__name__)
 
@@ -20,7 +21,10 @@ logger = log.getLogger(__name__)
 # This is used for the user to select the embedding model
 EMBEDDING_MODELS = {
     'VLLM': 'VLLMEmbeddings',
-    'vllm': 'VLLMEmbeddings'
+    'vllm': 'VLLMEmbeddings',
+    'FastAPI': 'FastAPIEmbeddings',
+    'fastapi': 'FastAPIEmbeddings'
+
 }
 
 try:
@@ -55,6 +59,9 @@ def get_langchain_class(class_name: str) -> Embeddings:
     if class_name == "VLLMEmbeddings":
         return VLLMEmbeddings
 
+    if class_name == "FastAPIEmbeddings":
+        return FastAPIEmbeddings
+
     # Then try langchain_community.embeddings
     try:
         module = importlib.import_module("langchain_community.embeddings")
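Taken together, the two changes wire the new class into the existing lookup path; a short sketch of the resolution flow, assuming both names are imported from langchain_embedding_handler:

class_name = EMBEDDING_MODELS["fastapi"]             # -> "FastAPIEmbeddings"
embeddings_class = get_langchain_class(class_name)   # returns FastAPIEmbeddings before falling back to langchain_community
embedder = embeddings_class(api_base="http://localhost:8000/v1/embeddings", model="my-embedding-model")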
mindsdb/integrations/handlers/langchain_handler/requirements.txt
@@ -3,6 +3,6 @@ wikipedia==1.4.0
 tiktoken
 anthropic>=0.26.1
 litellm==1.44.8
-chromadb # Knowledge bases.
+chromadb~=0.6.3 # Knowledge bases.
 -r mindsdb/integrations/handlers/openai_handler/requirements.txt
 -r mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt
mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py
@@ -28,7 +28,7 @@ class MSOneDriveHandler(APIHandler):
     """
 
     name = 'one_drive'
-    supported_file_formats = ['csv', 'tsv', 'json', 'parquet']
+    supported_file_formats = ['csv', 'tsv', 'json', 'parquet', 'pdf', 'txt']
 
     def __init__(self, name: Text, connection_data: Dict, **kwargs: Any) -> None:
         """
mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py
@@ -9,6 +9,8 @@ from mindsdb.integrations.utilities.sql_utils import (
     SortColumn
 )
 
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 class ListFilesTable(APIResource):
     """
@@ -97,4 +99,10 @@ class FileTable(APIResource):
         elif file_extension == "parquet":
             df = pd.read_parquet(BytesIO(file_content))
 
+        elif file_extension == "pdf":
+            df = FileReader().read_pdf(BytesIO(file_content))
+
+        elif file_extension == "txt":
+            df = FileReader().read_txt(BytesIO(file_content))
+
         return df
mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
@@ -37,6 +37,11 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         super().__init__(name=name, **kwargs)
         self._is_shared_db = False
         self._is_vector_registered = False
+        # we get these from the connection args on PostgresHandler parent
+        self._is_sparse = self.connection_args.get('is_sparse', False)
+        self._vector_size = self.connection_args.get('vector_size', None)
+        if self._is_sparse and not self._vector_size:
+            raise ValueError("vector_size is required when is_sparse=True")
        self.connect()
 
     def _make_connection_args(self):
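A sketch of connection args that would exercise the new branch; the first five keys are the handler's usual Postgres parameters and the values are placeholders:

connection_args = {
    "host": "localhost",
    "port": 5432,
    "database": "mindsdb",
    "user": "postgres",
    "password": "...",
    "is_sparse": True,     # store embeddings as sparsevec and search by inner product
    "vector_size": 30522,  # mandatory for sparse vectors; __init__ raises ValueError if omitted
}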
@@ -190,13 +195,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         if filter_conditions:
 
             if embedding_search:
-                # if search vector, return similar rows, apply other filters after if any
                 search_vector = filter_conditions["embeddings"]["value"][0]
                 filter_conditions.pop("embeddings")
-
+
+                if self._is_sparse:
+                    # Convert dict to sparse vector if needed
+                    if isinstance(search_vector, dict):
+                        from pgvector.utils import SparseVector
+                        embedding = SparseVector(search_vector, self._vector_size)
+                        search_vector = embedding.to_text()
+                    # Use inner product for sparse vectors
+                    distance_op = "<#>"
+                else:
+                    # Convert list to vector string if needed
+                    if isinstance(search_vector, list):
+                        search_vector = f"[{','.join(str(x) for x in search_vector)}]"
+                    # Use cosine similarity for dense vectors
+                    distance_op = "<=>"
+
+                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {after_from_clause}"
+
             else:
-                # if filter conditions, return
+                # if filter conditions, return rows that satisfy the conditions
                 return f"SELECT {targets} FROM {table_name} {after_from_clause}"
+
         else:
             # if no filter conditions, return all rows
             return f"SELECT {targets} FROM {table_name} {after_from_clause}"
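The operator split follows pgvector's conventions: <#> is negative inner product and <=> is cosine distance, so ordering ASC returns the closest rows under either metric. A small sketch of the dict-to-text conversion used above; the printed form is an assumption based on pgvector's sparsevec text format, which is 1-indexed:

from pgvector.utils import SparseVector

sv = SparseVector({0: 1.5, 3: 0.25}, 8)  # index -> weight mapping over an 8-dimensional space
print(sv.to_text())                      # something like "{1:1.5,4:0.25}/8"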
@@ -283,7 +305,7 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         # See https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
         #
         # We can break down the below query as follows:
-        #
+        #
         # Start with a CTE (Common Table Expression) called semantic_search (https://www.postgresql.org/docs/current/queries-with.html).
         # This expression calculates rank by the defined distance function, which measures the distance between the
         # embeddings column and the given embeddings vector. Results are ordered by this rank.
@@ -339,17 +361,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         full_search_query = f'{semantic_search_cte}{full_text_search_cte}{hybrid_select}'
         return self.raw_query(full_search_query)
 
-    def create_table(self, table_name: str
-        """
-
-
-
-
-
-
-
-
+    def create_table(self, table_name: str):
+        """Create a table with a vector column."""
+        with self.connection.cursor() as cur:
+            # For sparse vectors, use sparsevec type
+            vector_column_type = 'sparsevec' if self._is_sparse else 'vector'
+
+            # Vector size is required for sparse vectors, optional for dense
+            if self._is_sparse and not self._vector_size:
+                raise ValueError("vector_size is required for sparse vectors")
+
+            # Add vector size specification only if provided
+            size_spec = f"({self._vector_size})" if self._vector_size is not None else "()"
+            if vector_column_type == 'vector':
+                size_spec = ''
+
+            cur.execute(f"""
+                CREATE TABLE IF NOT EXISTS {table_name} (
+                    id TEXT PRIMARY KEY,
+                    embeddings {vector_column_type}{size_spec},
+                    content TEXT,
+                    metadata JSONB
+                )
+            """)
+            self.connection.commit()
 
     def insert(
         self, table_name: str, data: pd.DataFrame
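A runnable condensation of the type-selection logic above (table sizes hypothetical):

for is_sparse, vector_size in ((True, 30522), (False, None)):
    vector_column_type = 'sparsevec' if is_sparse else 'vector'
    size_spec = f"({vector_size})" if vector_size is not None else "()"
    if vector_column_type == 'vector':
        size_spec = ''
    print(f"embeddings {vector_column_type}{size_spec}")
# -> embeddings sparsevec(30522)
# -> embeddings vector

Dense tables are therefore created with a dimension-less vector column; only the sparse path carries an explicit dimension.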
@@ -447,4 +482,3 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         """
         table_name = self._check_table(table_name)
         self.raw_query(f"DROP TABLE IF EXISTS {table_name}")
-
mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py
@@ -1,8 +1,10 @@
+import ast
 from typing import List, Optional
 
-import
+import numpy as np
+from pinecone import Pinecone, ServerlessSpec
+from pinecone.core.openapi.shared.exceptions import NotFoundException, PineconeApiException
 import pandas as pd
-import ast
 
 from mindsdb.integrations.libs.response import RESPONSE_TYPE
 from mindsdb.integrations.libs.response import HandlerResponse
@@ -18,32 +20,30 @@ from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
 
+DEFAULT_CREATE_TABLE_PARAMS = {
+    "dimension": 8,
+    "metric": "cosine",
+    "spec": {
+        "cloud": "aws",
+        "region": "us-east-1"
+    }
+}
+MAX_FETCH_LIMIT = 10000
+UPSERT_BATCH_SIZE = 99  # API recommendation
+
 
 class PineconeHandler(VectorStoreHandler):
     """This handler handles connection and execution of the Pinecone statements."""
 
     name = "pinecone"
 
-    def __init__(self, name: str, **kwargs):
+    def __init__(self, name: str, connection_data: dict, **kwargs):
         super().__init__(name)
-        self.
-        self.
-
-
-            "environment": self._connection_data.get("environment")
-        }
-        self._table_create_params = {
-            "dimension": 8,
-            "metric": "cosine",
-            "pods": 1,
-            "replicas": 1,
-            "pod_type": 'p1',
-        }
-        for key in self._table_create_params:
-            if key in self._connection_data:
-                self._table_create_params[key] = self._connection_data[key]
+        self.connection_data = connection_data
+        self.kwargs = kwargs
+
+        self.connection = None
         self.is_connected = False
-        self.connect()
 
     def __del__(self):
         if self.is_connected is True:
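A sketch of connection data under the reworked handler; the key names are the ones read in connect() and create_table() below, and the values are placeholders:

connection_data = {
    "api_key": "pc-...",   # the only required key; connect() raises ValueError without it
    "dimension": 1536,      # optional: overrides DEFAULT_CREATE_TABLE_PARAMS for new indexes
    "metric": "cosine",
    "spec": {"cloud": "aws", "region": "us-east-1"},  # expanded into ServerlessSpec
}
handler = PineconeHandler("my_pinecone", connection_data=connection_data)

Note the constructor no longer calls self.connect(); the client is created lazily on first use.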
@@ -51,7 +51,8 @@ class PineconeHandler(VectorStoreHandler):
 
     def _get_index_handle(self, index_name):
         """Returns handler to index specified by `index_name`"""
-
+        connection = self.connect()
+        index = connection.Index(index_name)
         try:
             index.describe_index_stats()
         except Exception:
@@ -135,10 +136,15 @@ class PineconeHandler(VectorStoreHandler):
 
     def connect(self):
         """Connect to a pinecone database."""
+        if self.is_connected is True:
+            return self.connection
+
+        if 'api_key' not in self.connection_data:
+            raise ValueError('Required parameter (api_key) must be provided.')
+
         try:
-
-
-            self.is_connected = True
+            self.connection = Pinecone(api_key=self.connection_data['api_key'])
+            return self.connection
         except Exception as e:
             logger.error(f"Error connecting to Pinecone client, {e}!")
             self.is_connected = False
@@ -147,55 +153,99 @@ class PineconeHandler(VectorStoreHandler):
         """Close the pinecone connection."""
         if self.is_connected is False:
             return
-
+        self.connection = None
         self.is_connected = False
 
     def check_connection(self):
         """Check the connection to pinecone."""
-
+        response = StatusResponse(False)
+        need_to_close = self.is_connected is False
+
         try:
-
-
+            connection = self.connect()
+            connection.list_indexes()
+            response.success = True
         except Exception as e:
             logger.error(f"Error connecting to pinecone , {e}!")
-
-
+            response.error_message = str(e)
+
+        if response.success is True and need_to_close:
+            self.disconnect()
+        if response.success is False and self.is_connected is True:
+            self.is_connected = False
+
+        return response
 
     def get_tables(self) -> HandlerResponse:
         """Get the list of indexes in the pinecone database."""
-
-
-
-
+        connection = self.connect()
+        indexes = connection.list_indexes()
+        df = pd.DataFrame(
+            columns=["table_name"],
+            data=[index['name'] for index in indexes],
         )
-        return Response(resp_type=RESPONSE_TYPE.TABLE, data_frame=
+        return Response(resp_type=RESPONSE_TYPE.TABLE, data_frame=df)
 
     def create_table(self, table_name: str, if_not_exists=True):
         """Create an index with the given name in the Pinecone database."""
-
+        connection = self.connect()
+
+        # TODO: Should other parameters be supported? Pod indexes?
+        # TODO: Should there be a better way to provide these parameters rather than when establishing the connection?
+        create_table_params = {}
+        for key, val in DEFAULT_CREATE_TABLE_PARAMS.items():
+            if key in self.connection_data:
+                create_table_params[key] = self.connection_data[key]
+            else:
+                create_table_params[key] = val
+
+        create_table_params["spec"] = ServerlessSpec(**create_table_params["spec"])
+
+        try:
+            connection.create_index(name=table_name, **create_table_params)
+        except PineconeApiException as pinecone_error:
+            if pinecone_error.status == 409 and if_not_exists:
+                return
+            raise Exception(f"Error creating index '{table_name}': {pinecone_error}")
 
-    def insert(self, table_name: str, data: pd.DataFrame
+    def insert(self, table_name: str, data: pd.DataFrame):
         """Insert data into pinecone index passed in through `table_name` parameter."""
-        upsert_batch_size = 99 # API recommendation
         index = self._get_index_handle(table_name)
         if index is None:
             raise Exception(f"Error getting index '{table_name}', are you sure the name is correct?")
 
         data.rename(columns={
             TableField.ID.value: "id",
-            TableField.EMBEDDINGS.value: "values",
-            TableField.METADATA.value: "metadata"},
+            TableField.EMBEDDINGS.value: "values"},
             inplace=True)
-        data = data[["id", "values", "metadata"]]
 
-
+        columns = ["id", "values"]
+
+        if TableField.METADATA.value in data.columns:
+            data.rename(columns={TableField.METADATA.value: "metadata"}, inplace=True)
+            # fill None and NaN values with empty dict
+            if data['metadata'].isnull().any():
+                data['metadata'] = data['metadata'].apply(lambda x: {} if x is None or (isinstance(x, float) and np.isnan(x)) else x)
+            columns.append("metadata")
+
+        data = data[columns]
+
+        # convert the embeddings to lists if they are strings
+        data["values"] = data["values"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
+
+        for chunk in (data[pos:pos + UPSERT_BATCH_SIZE] for pos in range(0, len(data), UPSERT_BATCH_SIZE)):
             chunk = chunk.to_dict(orient="records")
             index.upsert(vectors=chunk)
 
     def drop_table(self, table_name: str, if_exists=True):
         """Delete an index passed in through `table_name` from the pinecone ."""
-
-
+        connection = self.connect()
+        try:
+            connection.delete_index(table_name)
+        except NotFoundException:
+            if if_exists:
+                return
+            raise Exception(f"Error deleting index '{table_name}', are you sure the name is correct?")
 
     def delete(self, table_name: str, conditions: List[FilterCondition] = None):
         """Delete records in pinecone index `table_name` based on ids or based on metadata conditions."""
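The NaN guard in insert() exists because pandas represents missing object values as float nan, which would not serialize as valid Pinecone metadata; a minimal illustration of the fill applied above:

import numpy as np
import pandas as pd

df = pd.DataFrame({"metadata": [{"a": 1}, None, np.nan]})
df["metadata"] = df["metadata"].apply(
    lambda x: {} if x is None or (isinstance(x, float) and np.isnan(x)) else x
)
print(df["metadata"].tolist())  # [{'a': 1}, {}, {}]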
@@ -225,6 +275,7 @@ class PineconeHandler(VectorStoreHandler):
         limit: int = None,
     ):
         """Run query on pinecone index named `table_name` and get results."""
+        # TODO: Add support for namespaces.
         index = self._get_index_handle(table_name)
         if index is None:
             raise Exception(f"Error getting index '{table_name}', are you sure the name is correct?")
@@ -233,23 +284,28 @@ class PineconeHandler(VectorStoreHandler):
             "include_values": True,
             "include_metadata": True
         }
+
         # check for metadata filter
         metadata_filters = self._translate_metadata_condition(conditions)
-
-
-
-
-
-
-
-
-
-
-
-
+        if metadata_filters is not None:
+            query["filter"] = metadata_filters
+
+        # check for vector and id filters
+        vector_filters = []
+        id_filters = []
+
+        if conditions:
+            for condition in conditions:
+                if condition.column == TableField.SEARCH_VECTOR.value:
+                    vector_filters.append(condition.value)
+                elif condition.column == TableField.ID.value:
+                    id_filters.append(condition.value)
+
+        if vector_filters:
+            if len(vector_filters) > 1:
                raise Exception("You cannot have multiple search_vectors in query")
 
-        query["vector"] =
+            query["vector"] = vector_filters[0]
         # For subqueries, the vector filter is a list of list of strings
         if isinstance(query["vector"], list) and isinstance(query["vector"][0], str):
             if len(query["vector"]) > 1:
@@ -260,26 +316,21 @@ class PineconeHandler(VectorStoreHandler):
         except Exception as e:
             raise Exception(f"Cannot parse the search vector '{query['vector']}'into a list: {e}")
 
-        # check for limit
-        if limit is not None:
-            query["top_k"] = limit
-        else:
-            query["top_k"] = self.MAX_FETCH_LIMIT
-        if metadata_filters is not None:
-            query["filter"] = metadata_filters
-        # check for id filter
-        id_filters = None
-        if conditions is not None:
-            id_filters = [
-                condition.value
-                for condition in conditions
-                if condition.column == TableField.ID.value
-            ] or None
         if id_filters:
             if len(id_filters) > 1:
                 raise Exception("You cannot have multiple IDs in query")
 
             query["id"] = id_filters[0]
+
+        if not vector_filters and not id_filters:
+            raise Exception("You must provide either a search_vector or an ID in the query")
+
+        # check for limit
+        if limit is not None:
+            query["top_k"] = limit
+        else:
+            query["top_k"] = MAX_FETCH_LIMIT
+
         # exec query
         try:
             result = index.query(**query)
mindsdb/integrations/handlers/pinecone_handler/requirements.txt
@@ -1 +1 @@
-pinecone-client
+pinecone-client==5.0.1
mindsdb/integrations/handlers/postgres_handler/postgres_handler.py
@@ -1,5 +1,6 @@
 import time
 import json
+from typing import Optional
 
 import pandas as pd
 import psycopg
@@ -161,7 +162,7 @@ class PostgresHandler(DatabaseHandler):
             'float8': 'float64'
         }
         columns = df.columns
-        df =
+        df.columns = list(range(len(columns)))
         for column_index, column_name in enumerate(df.columns):
             col = df[column_name]
             if str(col.dtype) == 'object':
@@ -172,7 +173,7 @@ class PostgresHandler(DatabaseHandler):
                 df[column_name] = col.astype(types_map[pg_type.name])
             except ValueError as e:
                 logger.error(f'Error casting column {col.name} to {types_map[pg_type.name]}: {e}')
-
+        df.columns = columns
 
     @profiler.profile()
     def native_query(self, query: str, params=None) -> Response:
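_cast_dtypes now renames columns to positional integers before casting and restores the original labels afterwards; this keeps df[column_name] selecting a single Series even when a result set contains duplicate column names. A minimal illustration, assuming a query like SELECT 1 AS x, 2 AS x:

import pandas as pd

df = pd.DataFrame([[1, 2]], columns=["x", "x"])  # duplicate labels: df["x"] is a DataFrame
columns = df.columns
df.columns = list(range(len(columns)))           # unique positional labels
df[0] = df[0].astype("int64")                    # per-column casts are unambiguous again
df.columns = columns                             # restore the original names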
@@ -202,7 +203,7 @@ class PostgresHandler(DatabaseHandler):
                     result,
                     columns=[x.name for x in cur.description]
                 )
-
+                self._cast_dtypes(df, cur.description)
                 response = Response(
                     RESPONSE_TYPE.TABLE,
                     df
@@ -281,21 +282,27 @@ class PostgresHandler(DatabaseHandler):
         """
         return self.native_query(query)
 
-    def get_columns(self, table_name: str) -> Response:
+    def get_columns(self, table_name: str, schema_name: Optional[str] = None) -> Response:
         """
         Retrieves column details for a specified table in the PostgreSQL database.
 
         Args:
             table_name (str): The name of the table for which to retrieve column information.
+            schema_name (str): The name of the schema in which the table is located.
 
         Returns:
             Response: A response object containing the column details, formatted as per the `Response` class.
+
         Raises:
             ValueError: If the 'table_name' is not a valid string.
         """
 
         if not table_name or not isinstance(table_name, str):
             raise ValueError("Invalid table name provided.")
+        if isinstance(schema_name, str):
+            schema_name = f"'{schema_name}'"
+        else:
+            schema_name = 'current_schema()'
         query = f"""
             SELECT
                 column_name as "Field",
@@ -305,12 +312,11 @@ class PostgresHandler(DatabaseHandler):
             WHERE
                 table_name = '{table_name}'
             AND
-                table_schema =
+                table_schema = {schema_name}
         """
         return self.native_query(query)
 
     def subscribe(self, stop_event, callback, table_name, columns=None, **kwargs):
-
         config = self._make_connection_args()
         config['autocommit'] = True
 
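With the new parameter, the schema predicate is interpolated rather than hard-coded; a sketch of the two call shapes (handler, table, and schema names hypothetical):

handler.get_columns("my_table")            # WHERE ... table_schema = current_schema()
handler.get_columns("my_table", "public")  # WHERE ... table_schema = 'public'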
mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py
@@ -12,7 +12,7 @@ class RayServeHandler(BaseMLEngine):
     - A Ray Serve server should be running
 
     Example:
-
+
     """  # noqa
     name = 'ray_serve'
 
@@ -42,9 +42,11 @@ class RayServeHandler(BaseMLEngine):
             raise Exception("Error: Training failed: " + resp['status'])
 
     def predict(self, df, args=None):
-        args = self.model_storage.json_get('args')  #
+        args = {**(self.model_storage.json_get('args')), **args}  # merge incoming args
+        pred_args = args.get('predict_params', {})
+        args = {**args, **pred_args}  # merge pred_args
         resp = requests.post(args['predict_url'],
-                             json={'df': df.to_json(orient='records')},
+                             json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
                              headers={'content-type': 'application/json; format=pandas-records'})
         response = resp.json()
 
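A sketch of how prediction-time parameters flow through the merged args; the stored args come from model_storage, and the names and URL below are illustrative:

stored_args = {"predict_url": "http://ray-serve:8000/models/my_model"}
incoming_args = {"predict_params": {"temperature": 0.2}}

args = {**stored_args, **incoming_args}      # merge incoming args over stored ones
pred_args = args.get("predict_params", {})
args = {**args, **pred_args}                 # flatten predict_params into args
# predict() then POSTs {'df': <records JSON>, 'pred_args': {'temperature': 0.2}} to args['predict_url']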