MindsDB 25.1.2.1__py3-none-any.whl → 25.1.5.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (95)
  1. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/METADATA +246 -255
  2. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/RECORD +94 -83
  3. mindsdb/__about__.py +1 -1
  4. mindsdb/__main__.py +5 -3
  5. mindsdb/api/executor/__init__.py +0 -1
  6. mindsdb/api/executor/command_executor.py +2 -1
  7. mindsdb/api/executor/data_types/answer.py +1 -1
  8. mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  10. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +8 -3
  11. mindsdb/api/executor/datahub/datanodes/project_datanode.py +9 -26
  12. mindsdb/api/executor/sql_query/__init__.py +1 -0
  13. mindsdb/api/executor/sql_query/result_set.py +36 -21
  14. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
  15. mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
  16. mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
  17. mindsdb/api/executor/utilities/sql.py +2 -10
  18. mindsdb/api/http/namespaces/agents.py +3 -1
  19. mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
  20. mindsdb/api/http/namespaces/sql.py +3 -1
  21. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
  22. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
  23. mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
  24. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
  25. mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
  26. mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
  27. mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
  28. mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
  29. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
  30. mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
  31. mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
  32. mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
  33. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
  34. mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
  35. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
  36. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
  37. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +50 -16
  38. mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
  39. mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
  40. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
  41. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
  42. mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
  43. mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
  44. mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
  45. mindsdb/integrations/libs/ml_handler_process/learn_process.py +2 -2
  46. mindsdb/integrations/utilities/files/__init__.py +0 -0
  47. mindsdb/integrations/utilities/files/file_reader.py +258 -0
  48. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
  49. mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
  50. mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
  51. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
  52. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
  53. mindsdb/integrations/utilities/rag/pipelines/rag.py +74 -21
  54. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
  55. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +108 -78
  56. mindsdb/integrations/utilities/rag/settings.py +37 -16
  57. mindsdb/integrations/utilities/sql_utils.py +1 -1
  58. mindsdb/interfaces/agents/agents_controller.py +18 -8
  59. mindsdb/interfaces/agents/constants.py +1 -0
  60. mindsdb/interfaces/agents/langchain_agent.py +124 -157
  61. mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -37
  62. mindsdb/interfaces/agents/mindsdb_database_agent.py +21 -13
  63. mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
  64. mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
  65. mindsdb/interfaces/chatbot/memory.py +58 -13
  66. mindsdb/interfaces/database/integrations.py +5 -1
  67. mindsdb/interfaces/database/projects.py +55 -16
  68. mindsdb/interfaces/database/views.py +12 -25
  69. mindsdb/interfaces/knowledge_base/controller.py +39 -15
  70. mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
  71. mindsdb/interfaces/model/functions.py +15 -4
  72. mindsdb/interfaces/model/model_controller.py +4 -7
  73. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +51 -40
  74. mindsdb/interfaces/skills/retrieval_tool.py +10 -3
  75. mindsdb/interfaces/skills/skill_tool.py +97 -54
  76. mindsdb/interfaces/skills/skills_controller.py +7 -3
  77. mindsdb/interfaces/skills/sql_agent.py +127 -41
  78. mindsdb/interfaces/storage/db.py +1 -1
  79. mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
  80. mindsdb/utilities/cache.py +7 -4
  81. mindsdb/utilities/context.py +11 -1
  82. mindsdb/utilities/langfuse.py +279 -0
  83. mindsdb/utilities/log.py +20 -2
  84. mindsdb/utilities/otel/__init__.py +206 -0
  85. mindsdb/utilities/otel/logger.py +25 -0
  86. mindsdb/utilities/otel/meter.py +19 -0
  87. mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
  88. mindsdb/utilities/otel/tracer.py +16 -0
  89. mindsdb/utilities/partitioning.py +52 -0
  90. mindsdb/utilities/render/sqlalchemy_render.py +7 -1
  91. mindsdb/utilities/utils.py +34 -0
  92. mindsdb/utilities/otel.py +0 -72
  93. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/LICENSE +0 -0
  94. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/WHEEL +0 -0
  95. {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/top_level.txt +0 -0

mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py (new file)
@@ -0,0 +1,82 @@
+from typing import Any, List
+from langchain_core.embeddings import Embeddings
+import requests
+
+
+class FastAPIEmbeddings(Embeddings):
+    """An embedding extension that interfaces with FAST API. Useful for custom serving solutions."""
+
+    def __init__(
+        self,
+        api_base: str,
+        model: str,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        """Initialize the embeddings class.
+
+        Args:
+            api_base: Base URL for the VLLM server
+            model: Model name/path to use for embeddings
+            batch_size: Batch size for generating embeddings
+        """
+        super().__init__()
+        self.api_base = api_base
+        self.model = model
+        self.batch_size = batch_size
+
+        # initialize requests here with the api_base
+
+    def _get_embeddings(self, texts: List[str]) -> List[str]:
+        """Get embeddings for a batch of text chunks.
+
+        Returns:
+            List of embeddings as strings. For sparse vectors, returns strings in format
+            "{key:value,...}/size" where size is the dimension of the vector space.
+        """
+
+        headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+        data = {
+            "input": texts,
+            "model": self.model
+        }
+
+        response = requests.post(self.api_base, headers=headers, json=data)
+
+        response.raise_for_status()
+
+        embeddings = []
+        for response_dict in response.json()["data"]:
+            embedding = response_dict["embedding"]
+            embeddings.append(embedding)
+
+        return embeddings
+
+    def embed_documents(self, texts: List[str]) -> List[str]:
+        """Embed a list of documents using vLLM.
+
+        Args:
+            texts: List of documents to embed
+
+        Returns:
+            List of embeddings as strings, one for each document.
+            For sparse embeddings, returns strings in format "{key:value,...}/size"
+            For dense embeddings, returns JSON strings of float lists
+        """
+
+        return self._get_embeddings(texts)
+
+    def embed_query(self, text: str) -> str:
+        """Embed a single query text using vLLM.
+
+        Args:
+            text: Query text to embed
+
+        Returns:
+            Query embedding as a string.
+            For sparse embeddings, returns string in format "{key:value,...}/size"
+            For dense embeddings, returns JSON string of float list
+        """
+
+        return self._get_embeddings([text])[0]
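
The new class can be exercised on its own. A minimal sketch, assuming an OpenAI-compatible embeddings endpoint; the URL and model name below are placeholders, not values from this release:

from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings

# Placeholder endpoint and model: any server that accepts {"input": [...], "model": "..."}
# and returns {"data": [{"embedding": ...}, ...]} matches what _get_embeddings expects.
embedder = FastAPIEmbeddings(
    api_base="http://localhost:8000/v1/embeddings",
    model="BAAI/bge-m3",
)
doc_vectors = embedder.embed_documents(["hello", "world"])
query_vector = embedder.embed_query("hello")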

mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py
@@ -10,6 +10,7 @@ from mindsdb.integrations.libs.base import BaseMLEngine
 from mindsdb.utilities import log
 from langchain_core.embeddings import Embeddings
 from mindsdb.integrations.handlers.langchain_embedding_handler.vllm_embeddings import VLLMEmbeddings
+from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings
 
 logger = log.getLogger(__name__)
 
@@ -20,7 +21,10 @@ logger = log.getLogger(__name__)
 # This is used for the user to select the embedding model
 EMBEDDING_MODELS = {
     'VLLM': 'VLLMEmbeddings',
-    'vllm': 'VLLMEmbeddings'
+    'vllm': 'VLLMEmbeddings',
+    'FastAPI': 'FastAPIEmbeddings',
+    'fastapi': 'FastAPIEmbeddings'
+
 }
 
 try:
@@ -55,6 +59,9 @@ def get_langchain_class(class_name: str) -> Embeddings:
     if class_name == "VLLMEmbeddings":
         return VLLMEmbeddings
 
+    if class_name == "FastAPIEmbeddings":
+        return FastAPIEmbeddings
+
     # Then try langchain_community.embeddings
     try:
         module = importlib.import_module("langchain_community.embeddings")
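
Registration is a two-step lookup: EMBEDDING_MODELS maps the user-facing name to a class-name string, and get_langchain_class resolves that string to a class. A minimal sketch of the resolution path added above:

from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
    EMBEDDING_MODELS,
    get_langchain_class,
)

class_name = EMBEDDING_MODELS["fastapi"]            # -> "FastAPIEmbeddings"
embeddings_class = get_langchain_class(class_name)  # short-circuits before the langchain_community lookup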

mindsdb/integrations/handlers/langchain_handler/requirements.txt
@@ -3,6 +3,6 @@ wikipedia==1.4.0
 tiktoken
 anthropic>=0.26.1
 litellm==1.44.8
-chromadb # Knowledge bases.
+chromadb~=0.6.3 # Knowledge bases.
 -r mindsdb/integrations/handlers/openai_handler/requirements.txt
 -r mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt

mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py
@@ -28,7 +28,7 @@ class MSOneDriveHandler(APIHandler):
     """
 
     name = 'one_drive'
-    supported_file_formats = ['csv', 'tsv', 'json', 'parquet']
+    supported_file_formats = ['csv', 'tsv', 'json', 'parquet', 'pdf', 'txt']
 
     def __init__(self, name: Text, connection_data: Dict, **kwargs: Any) -> None:
         """

mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py
@@ -9,6 +9,8 @@ from mindsdb.integrations.utilities.sql_utils import (
     SortColumn
 )
 
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 class ListFilesTable(APIResource):
     """
@@ -97,4 +99,10 @@ class FileTable(APIResource):
         elif file_extension == "parquet":
             df = pd.read_parquet(BytesIO(file_content))
 
+        elif file_extension == "pdf":
+            df = FileReader().read_pdf(BytesIO(file_content))
+
+        elif file_extension == "txt":
+            df = FileReader().read_txt(BytesIO(file_content))
+
         return df
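
FileReader's full API lives in the new mindsdb/integrations/utilities/files/file_reader.py (+258 lines, not shown in this diff); only the call pattern is visible here. A minimal sketch of that pattern, assuming read_txt accepts any binary buffer and returns a DataFrame, as the code above implies:

from io import BytesIO
from mindsdb.integrations.utilities.files.file_reader import FileReader

with open("notes.txt", "rb") as f:  # placeholder local file
    df = FileReader().read_txt(BytesIO(f.read()))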

mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
@@ -37,6 +37,11 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         super().__init__(name=name, **kwargs)
         self._is_shared_db = False
         self._is_vector_registered = False
+        # we get these from the connection args on PostgresHandler parent
+        self._is_sparse = self.connection_args.get('is_sparse', False)
+        self._vector_size = self.connection_args.get('vector_size', None)
+        if self._is_sparse and not self._vector_size:
+            raise ValueError("vector_size is required when is_sparse=True")
         self.connect()
 
     def _make_connection_args(self):
@@ -190,13 +195,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         if filter_conditions:
 
             if embedding_search:
-                # if search vector, return similar rows, apply other filters after if any
                 search_vector = filter_conditions["embeddings"]["value"][0]
                 filter_conditions.pop("embeddings")
-                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings <=> '{search_vector}' {after_from_clause}"
+
+                if self._is_sparse:
+                    # Convert dict to sparse vector if needed
+                    if isinstance(search_vector, dict):
+                        from pgvector.utils import SparseVector
+                        embedding = SparseVector(search_vector, self._vector_size)
+                        search_vector = embedding.to_text()
+                    # Use inner product for sparse vectors
+                    distance_op = "<#>"
+                else:
+                    # Convert list to vector string if needed
+                    if isinstance(search_vector, list):
+                        search_vector = f"[{','.join(str(x) for x in search_vector)}]"
+                    # Use cosine similarity for dense vectors
+                    distance_op = "<=>"
+
+                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {after_from_clause}"
+
             else:
-                # if filter conditions, return filtered rows
+                # if filter conditions, return rows that satisfy the conditions
                 return f"SELECT {targets} FROM {table_name} {after_from_clause}"
+
         else:
             # if no filter conditions, return all rows
             return f"SELECT {targets} FROM {table_name} {after_from_clause}"
@@ -283,7 +305,7 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         # See https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
         #
         # We can break down the below query as follows:
-        #
+        #
         # Start with a CTE (Common Table Expression) called semantic_search (https://www.postgresql.org/docs/current/queries-with.html).
         # This expression calculates rank by the defined distance function, which measures the distance between the
         # embeddings column and the given embeddings vector. Results are ordered by this rank.
@@ -339,17 +361,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         full_search_query = f'{semantic_search_cte}{full_text_search_cte}{hybrid_select}'
         return self.raw_query(full_search_query)
 
-    def create_table(self, table_name: str, sparse=False, if_not_exists=True):
-        """
-        Run a create table query on the pgvector database.
-        """
-        table_name = self._check_table(table_name)
-
-        query = f"CREATE TABLE IF NOT EXISTS {table_name} (id text PRIMARY KEY, content text, embeddings vector, metadata jsonb)"
-        if sparse:
-            query = f"CREATE TABLE IF NOT EXISTS {table_name} (id text PRIMARY KEY, content text, embeddings sparsevec, metadata jsonb)"
-
-        self.raw_query(query)
+    def create_table(self, table_name: str):
+        """Create a table with a vector column."""
+        with self.connection.cursor() as cur:
+            # For sparse vectors, use sparsevec type
+            vector_column_type = 'sparsevec' if self._is_sparse else 'vector'
+
+            # Vector size is required for sparse vectors, optional for dense
+            if self._is_sparse and not self._vector_size:
+                raise ValueError("vector_size is required for sparse vectors")
+
+            # Add vector size specification only if provided
+            size_spec = f"({self._vector_size})" if self._vector_size is not None else "()"
+            if vector_column_type == 'vector':
+                size_spec = ''
+
+            cur.execute(f"""
+                CREATE TABLE IF NOT EXISTS {table_name} (
+                    id TEXT PRIMARY KEY,
+                    embeddings {vector_column_type}{size_spec},
+                    content TEXT,
+                    metadata JSONB
+                )
+            """)
+            self.connection.commit()
 
     def insert(
         self, table_name: str, data: pd.DataFrame
@@ -447,4 +482,3 @@
         """
         table_name = self._check_table(table_name)
         self.raw_query(f"DROP TABLE IF EXISTS {table_name}")
-
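
The sparse path above relies on pgvector's text representation for sparsevec values. A minimal sketch of that conversion, using the same SparseVector import as the query-builder hunk; the index/value pairs and dimension are made up:

from pgvector.utils import SparseVector

# {position: weight} pairs plus the dimensionality of the vector space
sv = SparseVector({0: 1.5, 7: 0.25}, 1536)
text = sv.to_text()  # something like "{1:1.5,8:0.25}/1536"; this is what lands in the ORDER BY clause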

mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py
@@ -1,8 +1,10 @@
+import ast
 from typing import List, Optional
 
-import pinecone
+import numpy as np
+from pinecone import Pinecone, ServerlessSpec
+from pinecone.core.openapi.shared.exceptions import NotFoundException, PineconeApiException
 import pandas as pd
-import ast
 
 from mindsdb.integrations.libs.response import RESPONSE_TYPE
 from mindsdb.integrations.libs.response import HandlerResponse
@@ -18,32 +20,30 @@ from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
 
+DEFAULT_CREATE_TABLE_PARAMS = {
+    "dimension": 8,
+    "metric": "cosine",
+    "spec": {
+        "cloud": "aws",
+        "region": "us-east-1"
+    }
+}
+MAX_FETCH_LIMIT = 10000
+UPSERT_BATCH_SIZE = 99  # API reccomendation
+
 
 class PineconeHandler(VectorStoreHandler):
     """This handler handles connection and execution of the Pinecone statements."""
 
     name = "pinecone"
 
-    def __init__(self, name: str, **kwargs):
+    def __init__(self, name: str, connection_data: dict, **kwargs):
         super().__init__(name)
-        self.MAX_FETCH_LIMIT = 10000
-        self._connection_data = kwargs.get("connection_data")
-        self._client_config = {
-            "api_key": self._connection_data.get("api_key"),
-            "environment": self._connection_data.get("environment")
-        }
-        self._table_create_params = {
-            "dimension": 8,
-            "metric": "cosine",
-            "pods": 1,
-            "replicas": 1,
-            "pod_type": 'p1',
-        }
-        for key in self._table_create_params:
-            if key in self._connection_data:
-                self._table_create_params[key] = self._connection_data[key]
+        self.connection_data = connection_data
+        self.kwargs = kwargs
+
+        self.connection = None
         self.is_connected = False
-        self.connect()
 
     def __del__(self):
         if self.is_connected is True:
@@ -51,7 +51,8 @@ class PineconeHandler(VectorStoreHandler):
 
     def _get_index_handle(self, index_name):
         """Returns handler to index specified by `index_name`"""
-        index = pinecone.Index(index_name)
+        connection = self.connect()
+        index = connection.Index(index_name)
         try:
             index.describe_index_stats()
         except Exception:
@@ -135,10 +136,15 @@ class PineconeHandler(VectorStoreHandler):
 
     def connect(self):
         """Connect to a pinecone database."""
+        if self.is_connected is True:
+            return self.connection
+
+        if 'api_key' not in self.connection_data:
+            raise ValueError('Required parameter (api_key) must be provided.')
+
         try:
-            pinecone.init(api_key=self._client_config["api_key"], environment=self._client_config["environment"])
-            pinecone.list_indexes()
-            self.is_connected = True
+            self.connection = Pinecone(api_key=self.connection_data['api_key'])
+            return self.connection
         except Exception as e:
             logger.error(f"Error connecting to Pinecone client, {e}!")
             self.is_connected = False
@@ -147,55 +153,99 @@ class PineconeHandler(VectorStoreHandler):
         """Close the pinecone connection."""
         if self.is_connected is False:
             return
-        pinecone.init(api_key="", environment="")
+        self.connection = None
         self.is_connected = False
 
     def check_connection(self):
         """Check the connection to pinecone."""
-        response_code = StatusResponse(False)
+        response = StatusResponse(False)
+        need_to_close = self.is_connected is False
+
         try:
-            pinecone.list_indexes()
-            response_code.success = True
+            connection = self.connect()
+            connection.list_indexes()
+            response.success = True
         except Exception as e:
             logger.error(f"Error connecting to pinecone , {e}!")
-            response_code.error_message = str(e)
-        return response_code
+            response.error_message = str(e)
+
+        if response.success is True and need_to_close:
+            self.disconnect()
+        if response.success is False and self.is_connected is True:
+            self.is_connected = False
+
+        return response
 
     def get_tables(self) -> HandlerResponse:
         """Get the list of indexes in the pinecone database."""
-        indexes = pinecone.list_indexes()
-        indexes_names = pd.DataFrame(
-            columns=["index_name"],
-            data=[index for index in indexes],
+        connection = self.connect()
+        indexes = connection.list_indexes()
+        df = pd.DataFrame(
+            columns=["table_name"],
+            data=[index['name'] for index in indexes],
         )
-        return Response(resp_type=RESPONSE_TYPE.TABLE, data_frame=indexes_names)
+        return Response(resp_type=RESPONSE_TYPE.TABLE, data_frame=df)
 
     def create_table(self, table_name: str, if_not_exists=True):
         """Create an index with the given name in the Pinecone database."""
-        pinecone.create_index(name=table_name, **self._table_create_params)
+        connection = self.connect()
+
+        # TODO: Should other parameters be supported? Pod indexes?
+        # TODO: Should there be a better way to provide these parameters rather than when establishing the connection?
+        create_table_params = {}
+        for key, val in DEFAULT_CREATE_TABLE_PARAMS.items():
+            if key in self.connection_data:
+                create_table_params[key] = self.connection_data[key]
+            else:
+                create_table_params[key] = val
+
+        create_table_params["spec"] = ServerlessSpec(**create_table_params["spec"])
+
+        try:
+            connection.create_index(name=table_name, **create_table_params)
+        except PineconeApiException as pinecone_error:
+            if pinecone_error.status == 409 and if_not_exists:
+                return
+            raise Exception(f"Error creating index '{table_name}': {pinecone_error}")
 
-    def insert(self, table_name: str, data: pd.DataFrame, columns: List[str] = None):
+    def insert(self, table_name: str, data: pd.DataFrame):
         """Insert data into pinecone index passed in through `table_name` parameter."""
-        upsert_batch_size = 99  # API reccomendation
         index = self._get_index_handle(table_name)
         if index is None:
             raise Exception(f"Error getting index '{table_name}', are you sure the name is correct?")
 
         data.rename(columns={
             TableField.ID.value: "id",
-            TableField.EMBEDDINGS.value: "values",
-            TableField.METADATA.value: "metadata"},
+            TableField.EMBEDDINGS.value: "values"},
             inplace=True)
-        data = data[["id", "values", "metadata"]]
 
-        for chunk in (data[pos:pos + upsert_batch_size] for pos in range(0, len(data), upsert_batch_size)):
+        columns = ["id", "values"]
+
+        if TableField.METADATA.value in data.columns:
+            data.rename(columns={TableField.METADATA.value: "metadata"}, inplace=True)
+            # fill None and NaN values with empty dict
+            if data['metadata'].isnull().any():
+                data['metadata'] = data['metadata'].apply(lambda x: {} if x is None or (isinstance(x, float) and np.isnan(x)) else x)
+            columns.append("metadata")
+
+        data = data[columns]
+
+        # convert the embeddings to lists if they are strings
+        data["values"] = data["values"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
+
+        for chunk in (data[pos:pos + UPSERT_BATCH_SIZE] for pos in range(0, len(data), UPSERT_BATCH_SIZE)):
             chunk = chunk.to_dict(orient="records")
             index.upsert(vectors=chunk)
 
     def drop_table(self, table_name: str, if_exists=True):
         """Delete an index passed in through `table_name` from the pinecone ."""
-
-        pinecone.delete_index(table_name)
+        connection = self.connect()
+        try:
+            connection.delete_index(table_name)
+        except NotFoundException:
+            if if_exists:
+                return
+            raise Exception(f"Error deleting index '{table_name}', are you sure the name is correct?")
 
     def delete(self, table_name: str, conditions: List[FilterCondition] = None):
         """Delete records in pinecone index `table_name` based on ids or based on metadata conditions."""
@@ -225,6 +275,7 @@ class PineconeHandler(VectorStoreHandler):
         limit: int = None,
     ):
         """Run query on pinecone index named `table_name` and get results."""
+        # TODO: Add support for namespaces.
         index = self._get_index_handle(table_name)
         if index is None:
             raise Exception(f"Error getting index '{table_name}', are you sure the name is correct?")
@@ -233,23 +284,28 @@ class PineconeHandler(VectorStoreHandler):
             "include_values": True,
             "include_metadata": True
         }
+
         # check for metadata filter
         metadata_filters = self._translate_metadata_condition(conditions)
-        # check for vector filter
-        vector_filter = (
-            None
-            if conditions is None
-            else [
-                condition.value
-                for condition in conditions
-                if condition.column == TableField.SEARCH_VECTOR.value
-            ]
-        )
-        if vector_filter:
-            if len(vector_filter) > 1:
+        if metadata_filters is not None:
+            query["filter"] = metadata_filters
+
+        # check for vector and id filters
+        vector_filters = []
+        id_filters = []
+
+        if conditions:
+            for condition in conditions:
+                if condition.column == TableField.SEARCH_VECTOR.value:
+                    vector_filters.append(condition.value)
+                elif condition.column == TableField.ID.value:
+                    id_filters.append(condition.value)
+
+        if vector_filters:
+            if len(vector_filters) > 1:
                 raise Exception("You cannot have multiple search_vectors in query")
 
-            query["vector"] = vector_filter[0]
+            query["vector"] = vector_filters[0]
             # For subqueries, the vector filter is a list of list of strings
             if isinstance(query["vector"], list) and isinstance(query["vector"][0], str):
                 if len(query["vector"]) > 1:
@@ -260,26 +316,21 @@ class PineconeHandler(VectorStoreHandler):
             except Exception as e:
                 raise Exception(f"Cannot parse the search vector '{query['vector']}'into a list: {e}")
 
-        # check for limit
-        if limit is not None:
-            query["top_k"] = limit
-        else:
-            query["top_k"] = self.MAX_FETCH_LIMIT
-        if metadata_filters is not None:
-            query["filter"] = metadata_filters
-        # check for id filter
-        id_filters = None
-        if conditions is not None:
-            id_filters = [
-                condition.value
-                for condition in conditions
-                if condition.column == TableField.ID.value
-            ] or None
         if id_filters:
             if len(id_filters) > 1:
                 raise Exception("You cannot have multiple IDs in query")
 
             query["id"] = id_filters[0]
+
+        if not vector_filters and not id_filters:
+            raise Exception("You must provide either a search_vector or an ID in the query")
+
+        # check for limit
+        if limit is not None:
+            query["top_k"] = limit
+        else:
+            query["top_k"] = MAX_FETCH_LIMIT
+
         # exec query
         try:
             result = index.query(**query)

mindsdb/integrations/handlers/pinecone_handler/requirements.txt
@@ -1 +1 @@
-pinecone-client
+pinecone-client==5.0.1
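
The handler now targets the v5 client, which replaces the module-level pinecone.init() flow with a Pinecone object. A minimal sketch of the client calls the rewritten handler makes; the API key, index name, and vectors are placeholders:

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="YOUR_API_KEY")
pc.create_index(
    name="my-index",
    dimension=8,  # mirrors DEFAULT_CREATE_TABLE_PARAMS above
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
index = pc.Index("my-index")
index.upsert(vectors=[{"id": "a", "values": [0.1] * 8, "metadata": {}}])
matches = index.query(vector=[0.1] * 8, top_k=1, include_metadata=True)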

mindsdb/integrations/handlers/postgres_handler/postgres_handler.py
@@ -1,5 +1,6 @@
 import time
 import json
+from typing import Optional
 
 import pandas as pd
 import psycopg
@@ -161,7 +162,7 @@ class PostgresHandler(DatabaseHandler):
             'float8': 'float64'
         }
         columns = df.columns
-        df = df.set_axis(range(len(columns)), axis=1)
+        df.columns = list(range(len(columns)))
         for column_index, column_name in enumerate(df.columns):
             col = df[column_name]
             if str(col.dtype) == 'object':
@@ -172,7 +173,7 @@ class PostgresHandler(DatabaseHandler):
                     df[column_name] = col.astype(types_map[pg_type.name])
                 except ValueError as e:
                     logger.error(f'Error casting column {col.name} to {types_map[pg_type.name]}: {e}')
-        return df.set_axis(columns, axis=1)
+        df.columns = columns
 
     @profiler.profile()
     def native_query(self, query: str, params=None) -> Response:
@@ -202,7 +203,7 @@ class PostgresHandler(DatabaseHandler):
                         result,
                         columns=[x.name for x in cur.description]
                     )
-                    df = self._cast_dtypes(df, cur.description)
+                    self._cast_dtypes(df, cur.description)
                    response = Response(
                         RESPONSE_TYPE.TABLE,
                         df
@@ -281,21 +282,27 @@ class PostgresHandler(DatabaseHandler):
         """
         return self.native_query(query)
 
-    def get_columns(self, table_name: str) -> Response:
+    def get_columns(self, table_name: str, schema_name: Optional[str] = None) -> Response:
         """
         Retrieves column details for a specified table in the PostgreSQL database.
 
         Args:
             table_name (str): The name of the table for which to retrieve column information.
+            schema_name (str): The name of the schema in which the table is located.
 
         Returns:
             Response: A response object containing the column details, formatted as per the `Response` class.
+
         Raises:
             ValueError: If the 'table_name' is not a valid string.
         """
 
         if not table_name or not isinstance(table_name, str):
             raise ValueError("Invalid table name provided.")
+        if isinstance(schema_name, str):
+            schema_name = f"'{schema_name}'"
+        else:
+            schema_name = 'current_schema()'
         query = f"""
             SELECT
                 column_name as "Field",
@@ -305,12 +312,11 @@ class PostgresHandler(DatabaseHandler):
             WHERE
                 table_name = '{table_name}'
             AND
-                table_schema = current_schema()
+                table_schema = {schema_name}
         """
         return self.native_query(query)
 
     def subscribe(self, stop_event, callback, table_name, columns=None, **kwargs):
-
         config = self._make_connection_args()
         config['autocommit'] = True
 
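
The set_axis changes above are a copy-versus-mutation fix: set_axis returns a relabeled copy, so _cast_dtypes now mutates df in place and native_query drops the df = reassignment. A small standalone illustration, not from the diff:

import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2]})
relabeled = df.set_axis(range(2), axis=1)  # new object; df keeps its old columns
df.columns = list(range(2))                # same object, relabeled in place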

mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py
@@ -12,7 +12,7 @@ class RayServeHandler(BaseMLEngine):
     - A Ray Serve server should be running
 
     Example:
-
+
     """  # noqa
     name = 'ray_serve'
 
@@ -42,9 +42,11 @@ class RayServeHandler(BaseMLEngine):
             raise Exception("Error: Training failed: " + resp['status'])
 
     def predict(self, df, args=None):
-        args = self.model_storage.json_get('args')  # override any incoming args for now
+        args = {**(self.model_storage.json_get('args')), **args}  # merge incoming args
+        pred_args = args.get('predict_params', {})
+        args = {**args, **pred_args}  # merge pred_args
         resp = requests.post(args['predict_url'],
-                             json={'df': df.to_json(orient='records')},
+                             json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
                              headers={'content-type': 'application/json; format=pandas-records'})
         response = resp.json()
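
The predict payload now carries per-request overrides: the handler merges predict_params into args and forwards them as pred_args. A sketch of a server-side endpoint shape that would accept the new payload; the framework choice (FastAPI) and route name are assumptions, and only the JSON fields come from the hunk:

from io import StringIO

import pandas as pd
from fastapi import FastAPI, Request

app = FastAPI()

@app.post("/predict")  # hypothetical route; the handler POSTs to args['predict_url']
async def predict(request: Request):
    payload = await request.json()
    df = pd.read_json(StringIO(payload["df"]), orient="records")
    pred_args = payload.get("pred_args", {})  # per-request overrides sent by the handler
    # ... run the model on df using pred_args ...
    return {"prediction": [0.0] * len(df)}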