MindsDB 25.1.2.1__py3-none-any.whl → 25.1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of MindsDB as possibly problematic.
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/METADATA +246 -255
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/RECORD +94 -83
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +5 -3
- mindsdb/api/executor/__init__.py +0 -1
- mindsdb/api/executor/command_executor.py +2 -1
- mindsdb/api/executor/data_types/answer.py +1 -1
- mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +8 -3
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +9 -26
- mindsdb/api/executor/sql_query/__init__.py +1 -0
- mindsdb/api/executor/sql_query/result_set.py +36 -21
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
- mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
- mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
- mindsdb/api/executor/utilities/sql.py +2 -10
- mindsdb/api/http/namespaces/agents.py +3 -1
- mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
- mindsdb/api/http/namespaces/sql.py +3 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
- mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
- mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
- mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
- mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
- mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +50 -16
- mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
- mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
- mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
- mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +2 -2
- mindsdb/integrations/utilities/files/__init__.py +0 -0
- mindsdb/integrations/utilities/files/file_reader.py +258 -0
- mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
- mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
- mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
- mindsdb/integrations/utilities/rag/pipelines/rag.py +74 -21
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +108 -78
- mindsdb/integrations/utilities/rag/settings.py +37 -16
- mindsdb/integrations/utilities/sql_utils.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +18 -8
- mindsdb/interfaces/agents/constants.py +1 -0
- mindsdb/interfaces/agents/langchain_agent.py +124 -157
- mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -37
- mindsdb/interfaces/agents/mindsdb_database_agent.py +21 -13
- mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
- mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
- mindsdb/interfaces/chatbot/memory.py +58 -13
- mindsdb/interfaces/database/integrations.py +5 -1
- mindsdb/interfaces/database/projects.py +55 -16
- mindsdb/interfaces/database/views.py +12 -25
- mindsdb/interfaces/knowledge_base/controller.py +39 -15
- mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
- mindsdb/interfaces/model/functions.py +15 -4
- mindsdb/interfaces/model/model_controller.py +4 -7
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +51 -40
- mindsdb/interfaces/skills/retrieval_tool.py +10 -3
- mindsdb/interfaces/skills/skill_tool.py +97 -54
- mindsdb/interfaces/skills/skills_controller.py +7 -3
- mindsdb/interfaces/skills/sql_agent.py +127 -41
- mindsdb/interfaces/storage/db.py +1 -1
- mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
- mindsdb/utilities/cache.py +7 -4
- mindsdb/utilities/context.py +11 -1
- mindsdb/utilities/langfuse.py +279 -0
- mindsdb/utilities/log.py +20 -2
- mindsdb/utilities/otel/__init__.py +206 -0
- mindsdb/utilities/otel/logger.py +25 -0
- mindsdb/utilities/otel/meter.py +19 -0
- mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
- mindsdb/utilities/otel/tracer.py +16 -0
- mindsdb/utilities/partitioning.py +52 -0
- mindsdb/utilities/render/sqlalchemy_render.py +7 -1
- mindsdb/utilities/utils.py +34 -0
- mindsdb/utilities/otel.py +0 -72
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/WHEEL +0 -0
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.5.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py
@@ -0,0 +1,82 @@
+from typing import Any, List
+from langchain_core.embeddings import Embeddings
+import requests
+
+
+class FastAPIEmbeddings(Embeddings):
+    """An embedding extension that interfaces with FAST API. Useful for custom serving solutions."""
+
+    def __init__(
+        self,
+        api_base: str,
+        model: str,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        """Initialize the embeddings class.
+
+        Args:
+            api_base: Base URL for the VLLM server
+            model: Model name/path to use for embeddings
+            batch_size: Batch size for generating embeddings
+        """
+        super().__init__()
+        self.api_base = api_base
+        self.model = model
+        self.batch_size = batch_size
+
+        # initialize requests here with the api_base
+
+    def _get_embeddings(self, texts: List[str]) -> List[str]:
+        """Get embeddings for a batch of text chunks.
+
+        Returns:
+            List of embeddings as strings. For sparse vectors, returns strings in format
+            "{key:value,...}/size" where size is the dimension of the vector space.
+        """
+
+        headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+        data = {
+            "input": texts,
+            "model": self.model
+        }
+
+        response = requests.post(self.api_base, headers=headers, json=data)
+
+        response.raise_for_status()
+
+        embeddings = []
+        for response_dict in response.json()["data"]:
+            embedding = response_dict["embedding"]
+            embeddings.append(embedding)
+
+        return embeddings
+
+    def embed_documents(self, texts: List[str]) -> List[str]:
+        """Embed a list of documents using vLLM.
+
+        Args:
+            texts: List of documents to embed
+
+        Returns:
+            List of embeddings as strings, one for each document.
+            For sparse embeddings, returns strings in format "{key:value,...}/size"
+            For dense embeddings, returns JSON strings of float lists
+        """
+
+        return self._get_embeddings(texts)
+
+    def embed_query(self, text: str) -> str:
+        """Embed a single query text using vLLM.
+
+        Args:
+            text: Query text to embed
+
+        Returns:
+            Query embedding as a string.
+            For sparse embeddings, returns string in format "{key:value,...}/size"
+            For dense embeddings, returns JSON string of float list
+        """
+
+        return self._get_embeddings([text])[0]
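For orientation, a minimal usage sketch of the new class; the endpoint URL and model name below are illustrative, and any server that accepts the {"input": ..., "model": ...} payload shown above and returns {"data": [{"embedding": ...}, ...]} fits:

from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings

# Point at a hypothetical OpenAI-style embeddings endpoint; the URL is POSTed to as-is.
embedder = FastAPIEmbeddings(api_base="http://localhost:8000/v1/embeddings", model="my-embedding-model")
doc_vectors = embedder.embed_documents(["first chunk", "second chunk"])
query_vector = embedder.embed_query("a question")

Note that batch_size is accepted and stored but not yet applied: _get_embeddings sends all texts in a single request.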
mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py
@@ -10,6 +10,7 @@ from mindsdb.integrations.libs.base import BaseMLEngine
 from mindsdb.utilities import log
 from langchain_core.embeddings import Embeddings
 from mindsdb.integrations.handlers.langchain_embedding_handler.vllm_embeddings import VLLMEmbeddings
+from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings
 
 logger = log.getLogger(__name__)
 
@@ -20,7 +21,10 @@ logger = log.getLogger(__name__)
 # This is used for the user to select the embedding model
 EMBEDDING_MODELS = {
     'VLLM': 'VLLMEmbeddings',
-    'vllm': 'VLLMEmbeddings'
+    'vllm': 'VLLMEmbeddings',
+    'FastAPI': 'FastAPIEmbeddings',
+    'fastapi': 'FastAPIEmbeddings'
+
 }
 
 try:
@@ -55,6 +59,9 @@ def get_langchain_class(class_name: str) -> Embeddings:
     if class_name == "VLLMEmbeddings":
         return VLLMEmbeddings
 
+    if class_name == "FastAPIEmbeddings":
+        return FastAPIEmbeddings
+
     # Then try langchain_community.embeddings
     try:
         module = importlib.import_module("langchain_community.embeddings")
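Taken together, the two changes wire the new class into the existing lookup path; a short sketch of the resolution flow, assuming both names are imported from langchain_embedding_handler:

class_name = EMBEDDING_MODELS["fastapi"]             # -> "FastAPIEmbeddings"
embeddings_class = get_langchain_class(class_name)   # returns FastAPIEmbeddings before falling back to langchain_community
embedder = embeddings_class(api_base="http://localhost:8000/v1/embeddings", model="my-embedding-model")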
mindsdb/integrations/handlers/langchain_handler/requirements.txt
@@ -3,6 +3,6 @@ wikipedia==1.4.0
 tiktoken
 anthropic>=0.26.1
 litellm==1.44.8
-chromadb # Knowledge bases.
+chromadb~=0.6.3 # Knowledge bases.
 -r mindsdb/integrations/handlers/openai_handler/requirements.txt
 -r mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt
mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py
@@ -28,7 +28,7 @@ class MSOneDriveHandler(APIHandler):
     """
 
     name = 'one_drive'
-    supported_file_formats = ['csv', 'tsv', 'json', 'parquet']
+    supported_file_formats = ['csv', 'tsv', 'json', 'parquet', 'pdf', 'txt']
 
     def __init__(self, name: Text, connection_data: Dict, **kwargs: Any) -> None:
         """
mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py
@@ -9,6 +9,8 @@ from mindsdb.integrations.utilities.sql_utils import (
     SortColumn
 )
 
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 class ListFilesTable(APIResource):
     """
@@ -97,4 +99,10 @@ class FileTable(APIResource):
         elif file_extension == "parquet":
             df = pd.read_parquet(BytesIO(file_content))
 
+        elif file_extension == "pdf":
+            df = FileReader().read_pdf(BytesIO(file_content))
+
+        elif file_extension == "txt":
+            df = FileReader().read_txt(BytesIO(file_content))
+
         return df
mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
@@ -37,6 +37,11 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         super().__init__(name=name, **kwargs)
         self._is_shared_db = False
         self._is_vector_registered = False
+        # we get these from the connection args on PostgresHandler parent
+        self._is_sparse = self.connection_args.get('is_sparse', False)
+        self._vector_size = self.connection_args.get('vector_size', None)
+        if self._is_sparse and not self._vector_size:
+            raise ValueError("vector_size is required when is_sparse=True")
        self.connect()
 
     def _make_connection_args(self):
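A sketch of connection args that would exercise the new branch; the first five keys are the handler's usual Postgres parameters and the values are placeholders:

connection_args = {
    "host": "localhost",
    "port": 5432,
    "database": "mindsdb",
    "user": "postgres",
    "password": "...",
    "is_sparse": True,     # store embeddings as sparsevec and search by inner product
    "vector_size": 30522,  # mandatory for sparse vectors; __init__ raises ValueError if omitted
}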
@@ -190,13 +195,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         if filter_conditions:
 
             if embedding_search:
-                # if search vector, return similar rows, apply other filters after if any
                 search_vector = filter_conditions["embeddings"]["value"][0]
                 filter_conditions.pop("embeddings")
-
+
+                if self._is_sparse:
+                    # Convert dict to sparse vector if needed
+                    if isinstance(search_vector, dict):
+                        from pgvector.utils import SparseVector
+                        embedding = SparseVector(search_vector, self._vector_size)
+                        search_vector = embedding.to_text()
+                    # Use inner product for sparse vectors
+                    distance_op = "<#>"
+                else:
+                    # Convert list to vector string if needed
+                    if isinstance(search_vector, list):
+                        search_vector = f"[{','.join(str(x) for x in search_vector)}]"
+                    # Use cosine similarity for dense vectors
+                    distance_op = "<=>"
+
+                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {after_from_clause}"
+
             else:
-                # if filter conditions, return
+                # if filter conditions, return rows that satisfy the conditions
                 return f"SELECT {targets} FROM {table_name} {after_from_clause}"
+
         else:
             # if no filter conditions, return all rows
             return f"SELECT {targets} FROM {table_name} {after_from_clause}"
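The operator split follows pgvector's conventions: <#> is negative inner product and <=> is cosine distance, so ordering ASC returns the closest rows under either metric. A small sketch of the dict-to-text conversion used above; the printed form is an assumption based on pgvector's sparsevec text format, which is 1-indexed:

from pgvector.utils import SparseVector

sv = SparseVector({0: 1.5, 3: 0.25}, 8)  # index -> weight mapping over an 8-dimensional space
print(sv.to_text())                      # something like "{1:1.5,4:0.25}/8"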
@@ -283,7 +305,7 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         # See https://docs.pgvecto.rs/use-case/hybrid-search.html#advanced-search-merge-the-results-of-full-text-search-and-vector-search.
         #
         # We can break down the below query as follows:
-        #
+        #
         # Start with a CTE (Common Table Expression) called semantic_search (https://www.postgresql.org/docs/current/queries-with.html).
         # This expression calculates rank by the defined distance function, which measures the distance between the
         # embeddings column and the given embeddings vector. Results are ordered by this rank.
@@ -339,17 +361,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         full_search_query = f'{semantic_search_cte}{full_text_search_cte}{hybrid_select}'
         return self.raw_query(full_search_query)
 
-    def create_table(self, table_name: str
-        """
-
-
-
-
-
-
-
-
+    def create_table(self, table_name: str):
+        """Create a table with a vector column."""
+        with self.connection.cursor() as cur:
+            # For sparse vectors, use sparsevec type
+            vector_column_type = 'sparsevec' if self._is_sparse else 'vector'
+
+            # Vector size is required for sparse vectors, optional for dense
+            if self._is_sparse and not self._vector_size:
+                raise ValueError("vector_size is required for sparse vectors")
+
+            # Add vector size specification only if provided
+            size_spec = f"({self._vector_size})" if self._vector_size is not None else "()"
+            if vector_column_type == 'vector':
+                size_spec = ''
+
+            cur.execute(f"""
+                CREATE TABLE IF NOT EXISTS {table_name} (
+                    id TEXT PRIMARY KEY,
+                    embeddings {vector_column_type}{size_spec},
+                    content TEXT,
+                    metadata JSONB
+                )
+            """)
+            self.connection.commit()
 
     def insert(
         self, table_name: str, data: pd.DataFrame
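A runnable condensation of the type-selection logic above (table sizes hypothetical):

for is_sparse, vector_size in ((True, 30522), (False, None)):
    vector_column_type = 'sparsevec' if is_sparse else 'vector'
    size_spec = f"({vector_size})" if vector_size is not None else "()"
    if vector_column_type == 'vector':
        size_spec = ''
    print(f"embeddings {vector_column_type}{size_spec}")
# -> embeddings sparsevec(30522)
# -> embeddings vector

Dense tables are therefore created with a dimension-less vector column; only the sparse path carries an explicit dimension.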
@@ -447,4 +482,3 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         """
         table_name = self._check_table(table_name)
         self.raw_query(f"DROP TABLE IF EXISTS {table_name}")
-
mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py
@@ -1,8 +1,10 @@
+import ast
 from typing import List, Optional
 
-import
+import numpy as np
+from pinecone import Pinecone, ServerlessSpec
+from pinecone.core.openapi.shared.exceptions import NotFoundException, PineconeApiException
 import pandas as pd
-import ast
 
 from mindsdb.integrations.libs.response import RESPONSE_TYPE
 from mindsdb.integrations.libs.response import HandlerResponse
@@ -18,32 +20,30 @@ from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
 
+DEFAULT_CREATE_TABLE_PARAMS = {
+    "dimension": 8,
+    "metric": "cosine",
+    "spec": {
+        "cloud": "aws",
+        "region": "us-east-1"
+    }
+}
+MAX_FETCH_LIMIT = 10000
+UPSERT_BATCH_SIZE = 99  # API recommendation
+
 
 class PineconeHandler(VectorStoreHandler):
     """This handler handles connection and execution of the Pinecone statements."""
 
     name = "pinecone"
 
-    def __init__(self, name: str, **kwargs):
+    def __init__(self, name: str, connection_data: dict, **kwargs):
         super().__init__(name)
-        self.
-        self.
-
-
-            "environment": self._connection_data.get("environment")
-        }
-        self._table_create_params = {
-            "dimension": 8,
-            "metric": "cosine",
-            "pods": 1,
-            "replicas": 1,
-            "pod_type": 'p1',
-        }
-        for key in self._table_create_params:
-            if key in self._connection_data:
-                self._table_create_params[key] = self._connection_data[key]
+        self.connection_data = connection_data
+        self.kwargs = kwargs
+
+        self.connection = None
         self.is_connected = False
-        self.connect()
 
     def __del__(self):
         if self.is_connected is True:
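A sketch of connection data under the reworked handler; the key names are the ones read in connect() and create_table() below, and the values are placeholders:

connection_data = {
    "api_key": "pc-...",   # the only required key; connect() raises ValueError without it
    "dimension": 1536,      # optional: overrides DEFAULT_CREATE_TABLE_PARAMS for new indexes
    "metric": "cosine",
    "spec": {"cloud": "aws", "region": "us-east-1"},  # expanded into ServerlessSpec
}
handler = PineconeHandler("my_pinecone", connection_data=connection_data)

Note the constructor no longer calls self.connect(); the client is created lazily on first use.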
@@ -51,7 +51,8 @@ class PineconeHandler(VectorStoreHandler):
 
     def _get_index_handle(self, index_name):
         """Returns handler to index specified by `index_name`"""
-
+        connection = self.connect()
+        index = connection.Index(index_name)
         try:
             index.describe_index_stats()
         except Exception:
@@ -135,10 +136,15 @@ class PineconeHandler(VectorStoreHandler):
 
     def connect(self):
         """Connect to a pinecone database."""
+        if self.is_connected is True:
+            return self.connection
+
+        if 'api_key' not in self.connection_data:
+            raise ValueError('Required parameter (api_key) must be provided.')
+
         try:
-
-
-            self.is_connected = True
+            self.connection = Pinecone(api_key=self.connection_data['api_key'])
+            return self.connection
         except Exception as e:
             logger.error(f"Error connecting to Pinecone client, {e}!")
             self.is_connected = False
@@ -147,55 +153,99 @@ class PineconeHandler(VectorStoreHandler):
         """Close the pinecone connection."""
         if self.is_connected is False:
             return
-
+        self.connection = None
         self.is_connected = False
 
     def check_connection(self):
         """Check the connection to pinecone."""
-
+        response = StatusResponse(False)
+        need_to_close = self.is_connected is False
+
         try:
-
-
+            connection = self.connect()
+            connection.list_indexes()
+            response.success = True
         except Exception as e:
             logger.error(f"Error connecting to pinecone , {e}!")
-
-
+            response.error_message = str(e)
+
+        if response.success is True and need_to_close:
+            self.disconnect()
+        if response.success is False and self.is_connected is True:
+            self.is_connected = False
+
+        return response
 
     def get_tables(self) -> HandlerResponse:
         """Get the list of indexes in the pinecone database."""
-
-
-
-
+        connection = self.connect()
+        indexes = connection.list_indexes()
+        df = pd.DataFrame(
+            columns=["table_name"],
+            data=[index['name'] for index in indexes],
         )
-        return Response(resp_type=RESPONSE_TYPE.TABLE, data_frame=
+        return Response(resp_type=RESPONSE_TYPE.TABLE, data_frame=df)
 
     def create_table(self, table_name: str, if_not_exists=True):
         """Create an index with the given name in the Pinecone database."""
-
+        connection = self.connect()
+
+        # TODO: Should other parameters be supported? Pod indexes?
+        # TODO: Should there be a better way to provide these parameters rather than when establishing the connection?
+        create_table_params = {}
+        for key, val in DEFAULT_CREATE_TABLE_PARAMS.items():
+            if key in self.connection_data:
+                create_table_params[key] = self.connection_data[key]
+            else:
+                create_table_params[key] = val
+
+        create_table_params["spec"] = ServerlessSpec(**create_table_params["spec"])
+
+        try:
+            connection.create_index(name=table_name, **create_table_params)
+        except PineconeApiException as pinecone_error:
+            if pinecone_error.status == 409 and if_not_exists:
+                return
+            raise Exception(f"Error creating index '{table_name}': {pinecone_error}")
 
-    def insert(self, table_name: str, data: pd.DataFrame
+    def insert(self, table_name: str, data: pd.DataFrame):
         """Insert data into pinecone index passed in through `table_name` parameter."""
-        upsert_batch_size = 99 # API recommendation
         index = self._get_index_handle(table_name)
         if index is None:
             raise Exception(f"Error getting index '{table_name}', are you sure the name is correct?")
 
         data.rename(columns={
             TableField.ID.value: "id",
-            TableField.EMBEDDINGS.value: "values",
-            TableField.METADATA.value: "metadata"},
+            TableField.EMBEDDINGS.value: "values"},
             inplace=True)
-        data = data[["id", "values", "metadata"]]
 
-
+        columns = ["id", "values"]
+
+        if TableField.METADATA.value in data.columns:
+            data.rename(columns={TableField.METADATA.value: "metadata"}, inplace=True)
+            # fill None and NaN values with empty dict
+            if data['metadata'].isnull().any():
+                data['metadata'] = data['metadata'].apply(lambda x: {} if x is None or (isinstance(x, float) and np.isnan(x)) else x)
+            columns.append("metadata")
+
+        data = data[columns]
+
+        # convert the embeddings to lists if they are strings
+        data["values"] = data["values"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
+
+        for chunk in (data[pos:pos + UPSERT_BATCH_SIZE] for pos in range(0, len(data), UPSERT_BATCH_SIZE)):
             chunk = chunk.to_dict(orient="records")
             index.upsert(vectors=chunk)
 
     def drop_table(self, table_name: str, if_exists=True):
         """Delete an index passed in through `table_name` from the pinecone ."""
-
-
+        connection = self.connect()
+        try:
+            connection.delete_index(table_name)
+        except NotFoundException:
+            if if_exists:
+                return
+            raise Exception(f"Error deleting index '{table_name}', are you sure the name is correct?")
 
     def delete(self, table_name: str, conditions: List[FilterCondition] = None):
         """Delete records in pinecone index `table_name` based on ids or based on metadata conditions."""
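The NaN guard in insert() exists because pandas represents missing object values as float nan, which would not serialize as valid Pinecone metadata; a minimal illustration of the fill applied above:

import numpy as np
import pandas as pd

df = pd.DataFrame({"metadata": [{"a": 1}, None, np.nan]})
df["metadata"] = df["metadata"].apply(
    lambda x: {} if x is None or (isinstance(x, float) and np.isnan(x)) else x
)
print(df["metadata"].tolist())  # [{'a': 1}, {}, {}]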
@@ -225,6 +275,7 @@ class PineconeHandler(VectorStoreHandler):
         limit: int = None,
     ):
         """Run query on pinecone index named `table_name` and get results."""
+        # TODO: Add support for namespaces.
         index = self._get_index_handle(table_name)
         if index is None:
             raise Exception(f"Error getting index '{table_name}', are you sure the name is correct?")
@@ -233,23 +284,28 @@ class PineconeHandler(VectorStoreHandler):
             "include_values": True,
             "include_metadata": True
         }
+
         # check for metadata filter
         metadata_filters = self._translate_metadata_condition(conditions)
-
-
-
-
-
-
-
-
-
-
-
-
+        if metadata_filters is not None:
+            query["filter"] = metadata_filters
+
+        # check for vector and id filters
+        vector_filters = []
+        id_filters = []
+
+        if conditions:
+            for condition in conditions:
+                if condition.column == TableField.SEARCH_VECTOR.value:
+                    vector_filters.append(condition.value)
+                elif condition.column == TableField.ID.value:
+                    id_filters.append(condition.value)
+
+        if vector_filters:
+            if len(vector_filters) > 1:
                raise Exception("You cannot have multiple search_vectors in query")
 
-        query["vector"] =
+            query["vector"] = vector_filters[0]
         # For subqueries, the vector filter is a list of list of strings
         if isinstance(query["vector"], list) and isinstance(query["vector"][0], str):
             if len(query["vector"]) > 1:
@@ -260,26 +316,21 @@ class PineconeHandler(VectorStoreHandler):
         except Exception as e:
             raise Exception(f"Cannot parse the search vector '{query['vector']}'into a list: {e}")
 
-        # check for limit
-        if limit is not None:
-            query["top_k"] = limit
-        else:
-            query["top_k"] = self.MAX_FETCH_LIMIT
-        if metadata_filters is not None:
-            query["filter"] = metadata_filters
-        # check for id filter
-        id_filters = None
-        if conditions is not None:
-            id_filters = [
-                condition.value
-                for condition in conditions
-                if condition.column == TableField.ID.value
-            ] or None
         if id_filters:
             if len(id_filters) > 1:
                 raise Exception("You cannot have multiple IDs in query")
 
             query["id"] = id_filters[0]
+
+        if not vector_filters and not id_filters:
+            raise Exception("You must provide either a search_vector or an ID in the query")
+
+        # check for limit
+        if limit is not None:
+            query["top_k"] = limit
+        else:
+            query["top_k"] = MAX_FETCH_LIMIT
+
         # exec query
         try:
             result = index.query(**query)
mindsdb/integrations/handlers/pinecone_handler/requirements.txt
@@ -1 +1 @@
-pinecone-client
+pinecone-client==5.0.1
mindsdb/integrations/handlers/postgres_handler/postgres_handler.py
@@ -1,5 +1,6 @@
 import time
 import json
+from typing import Optional
 
 import pandas as pd
 import psycopg
@@ -161,7 +162,7 @@ class PostgresHandler(DatabaseHandler):
             'float8': 'float64'
         }
         columns = df.columns
-        df =
+        df.columns = list(range(len(columns)))
         for column_index, column_name in enumerate(df.columns):
             col = df[column_name]
             if str(col.dtype) == 'object':
@@ -172,7 +173,7 @@ class PostgresHandler(DatabaseHandler):
                 df[column_name] = col.astype(types_map[pg_type.name])
             except ValueError as e:
                 logger.error(f'Error casting column {col.name} to {types_map[pg_type.name]}: {e}')
-
+        df.columns = columns
 
     @profiler.profile()
     def native_query(self, query: str, params=None) -> Response:
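_cast_dtypes now renames columns to positional integers before casting and restores the original labels afterwards; this keeps df[column_name] selecting a single Series even when a result set contains duplicate column names. A minimal illustration, assuming a query like SELECT 1 AS x, 2 AS x:

import pandas as pd

df = pd.DataFrame([[1, 2]], columns=["x", "x"])  # duplicate labels: df["x"] is a DataFrame
columns = df.columns
df.columns = list(range(len(columns)))           # unique positional labels
df[0] = df[0].astype("int64")                    # per-column casts are unambiguous again
df.columns = columns                             # restore the original names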
@@ -202,7 +203,7 @@ class PostgresHandler(DatabaseHandler):
                     result,
                     columns=[x.name for x in cur.description]
                 )
-
+                self._cast_dtypes(df, cur.description)
                 response = Response(
                     RESPONSE_TYPE.TABLE,
                     df
@@ -281,21 +282,27 @@ class PostgresHandler(DatabaseHandler):
         """
         return self.native_query(query)
 
-    def get_columns(self, table_name: str) -> Response:
+    def get_columns(self, table_name: str, schema_name: Optional[str] = None) -> Response:
         """
         Retrieves column details for a specified table in the PostgreSQL database.
 
         Args:
             table_name (str): The name of the table for which to retrieve column information.
+            schema_name (str): The name of the schema in which the table is located.
 
         Returns:
             Response: A response object containing the column details, formatted as per the `Response` class.
+
         Raises:
             ValueError: If the 'table_name' is not a valid string.
         """
 
         if not table_name or not isinstance(table_name, str):
             raise ValueError("Invalid table name provided.")
+        if isinstance(schema_name, str):
+            schema_name = f"'{schema_name}'"
+        else:
+            schema_name = 'current_schema()'
         query = f"""
             SELECT
                 column_name as "Field",
@@ -305,12 +312,11 @@ class PostgresHandler(DatabaseHandler):
             WHERE
                 table_name = '{table_name}'
             AND
-                table_schema =
+                table_schema = {schema_name}
         """
         return self.native_query(query)
 
     def subscribe(self, stop_event, callback, table_name, columns=None, **kwargs):
-
         config = self._make_connection_args()
         config['autocommit'] = True
 
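With the new parameter, the schema predicate is interpolated rather than hard-coded; a sketch of the two call shapes (handler, table, and schema names hypothetical):

handler.get_columns("my_table")            # WHERE ... table_schema = current_schema()
handler.get_columns("my_table", "public")  # WHERE ... table_schema = 'public'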
mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py
@@ -12,7 +12,7 @@ class RayServeHandler(BaseMLEngine):
     - A Ray Serve server should be running
 
     Example:
-
+
     """  # noqa
     name = 'ray_serve'
 
@@ -42,9 +42,11 @@ class RayServeHandler(BaseMLEngine):
             raise Exception("Error: Training failed: " + resp['status'])
 
     def predict(self, df, args=None):
-        args = self.model_storage.json_get('args')  #
+        args = {**(self.model_storage.json_get('args')), **args}  # merge incoming args
+        pred_args = args.get('predict_params', {})
+        args = {**args, **pred_args}  # merge pred_args
         resp = requests.post(args['predict_url'],
-                             json={'df': df.to_json(orient='records')},
+                             json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
                              headers={'content-type': 'application/json; format=pandas-records'})
         response = resp.json()
 
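A sketch of how prediction-time parameters flow through the merged args; the stored args come from model_storage, and the names and URL below are illustrative:

stored_args = {"predict_url": "http://ray-serve:8000/models/my_model"}
incoming_args = {"predict_params": {"temperature": 0.2}}

args = {**stored_args, **incoming_args}      # merge incoming args over stored ones
pred_args = args.get("predict_params", {})
args = {**args, **pred_args}                 # flatten predict_params into args
# predict() then POSTs {'df': <records JSON>, 'pred_args': {'temperature': 0.2}} to args['predict_url']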