MindsDB 25.4.2.0__py3-none-any.whl → 25.4.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +30 -7
- mindsdb/api/executor/command_executor.py +29 -0
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +3 -2
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/query_plan.py +1 -0
- mindsdb/api/executor/planner/query_planner.py +86 -14
- mindsdb/api/executor/planner/steps.py +9 -1
- mindsdb/api/executor/sql_query/sql_query.py +37 -6
- mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +231 -0
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -1
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +7 -11
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +28 -4
- mindsdb/integrations/libs/llm/config.py +11 -1
- mindsdb/integrations/libs/llm/utils.py +12 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +9 -1
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +1 -1
- mindsdb/interfaces/agents/constants.py +12 -1
- mindsdb/interfaces/agents/langchain_agent.py +6 -0
- mindsdb/interfaces/database/projects.py +7 -1
- mindsdb/interfaces/knowledge_base/controller.py +166 -74
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +43 -62
- mindsdb/interfaces/knowledge_base/utils.py +28 -0
- mindsdb/interfaces/query_context/context_controller.py +221 -0
- mindsdb/interfaces/storage/db.py +23 -0
- mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
- mindsdb/utilities/auth.py +5 -1
- mindsdb/utilities/cache.py +4 -1
- mindsdb/utilities/context_executor.py +1 -1
- mindsdb/utilities/partitioning.py +35 -20
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/METADATA +221 -219
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/RECORD +39 -36
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/top_level.txt +0 -0
mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py (new file)
@@ -0,0 +1,231 @@
+import pandas as pd
+from typing import List
+
+from mindsdb_sql_parser import ASTNode
+from mindsdb.api.executor.planner.steps import FetchDataframeStepPartition
+from mindsdb.integrations.utilities.query_traversal import query_traversal
+
+from mindsdb.interfaces.query_context.context_controller import RunningQuery
+from mindsdb.api.executor.sql_query.result_set import ResultSet
+from mindsdb.utilities import log
+from mindsdb.utilities.config import Config
+from mindsdb.utilities.partitioning import get_max_thread_count, split_data_frame
+from mindsdb.api.executor.sql_query.steps.fetch_dataframe import get_table_alias, get_fill_param_fnc
+from mindsdb.utilities.context_executor import ContextThreadPoolExecutor
+
+
+from .base import BaseStepCall
+
+
+logger = log.getLogger(__name__)
+
+
+class FetchDataframePartitionCall(BaseStepCall):
+    """
+    Alternative to FetchDataframeCall, but fetches data in batches by wrapping the user's query as:
+
+        select * from ({user query})
+        where {track_column} > {previous value}
+        order by track_column
+        limit {batch_size}
+    """
+
+    bind = FetchDataframeStepPartition
+
+    def call(self, step: FetchDataframeStepPartition) -> ResultSet:
+        """
+        Parameters:
+        - batch_size - count of rows to fetch from the database per iteration, optional, default 1000
+        - threads - run partitioning in threads, bool or int, optional; if set:
+          - int value: use this as the count of threads
+          - true: enable threads, autodetect the count of threads
+          - false: disable threads even if the ml task queue is enabled
+        - track_column - column used for creating partitions;
+          the query will be sorted by this column and the select will be limited by batch_size
+        - error (default 'raise'):
+          when `error='skip'`, errors in a partition are skipped and execution continues
+        """
+
+        self.dn = self.session.datahub.get(step.integration)
+        query = step.query
+
+        # fill params
+        fill_params = get_fill_param_fnc(self.steps_data)
+        query_traversal(query, fill_params)
+
+        # get query record
+        run_query = self.sql_query.run_query
+        if run_query is None:
+            raise RuntimeError('Error with partitioning of the query')
+        run_query.set_params(step.params)
+
+        self.table_alias = get_table_alias(step.query.from_table, self.context.get('database'))
+        self.current_step_num = step.step_num
+        self.substeps = step.steps
+
+        config = Config()
+
+        # ml task queue enabled?
+        use_threads, thread_count = False, None
+        if config['ml_task_queue']['type'] == 'redis':
+            use_threads = True
+
+        # use threads?
+        if 'threads' in step.params:
+            threads = step.params['threads']
+            if isinstance(threads, int):
+                thread_count = threads
+                use_threads = True
+            if threads is True:
+                use_threads = True
+            if threads is False:
+                # disable even with ml task queue
+                use_threads = False
+
+        on_error = step.params.get('error', 'raise')
+        if use_threads:
+            return self.fetch_threads(run_query, query, thread_count=thread_count, on_error=on_error)
+        else:
+            return self.fetch_iterate(run_query, query, on_error=on_error)
+
+    def fetch_iterate(self, run_query: RunningQuery, query: ASTNode, on_error: str = None) -> ResultSet:
+        """
+        Process batches one by one in a loop
+        """
+
+        results = []
+        while True:
+
+            # fetch batch
+            query2 = run_query.get_partition_query(self.current_step_num, query)
+            response = self.dn.query(
+                query=query2,
+                session=self.session
+            )
+            df = response.data_frame
+
+            if df is None or len(df) == 0:
+                break
+
+            # executing the substeps can modify dataframe columns, so memorize the max tracking value first
+            max_track_value = run_query.get_max_track_value(df)
+            try:
+                sub_data = self.exec_sub_steps(df)
+                results.append(sub_data)
+            except Exception as e:
+                if on_error == 'skip':
+                    logger.error(e)
+                else:
+                    raise e
+
+            run_query.set_progress(df, max_track_value)
+
+        return self.concat_results(results)
+
+    def concat_results(self, results: List[ResultSet]) -> ResultSet:
+        """
+        Concatenate a list of result sets into a single result set
+        """
+        df_list = []
+        for res in results:
+            df, col_names = res.to_df_cols()
+            if len(df) > 0:
+                df_list.append(df)
+
+        data = ResultSet()
+        if len(df_list) > 0:
+            data.from_df_cols(pd.concat(df_list), col_names)
+
+        return data
+
+    def exec_sub_steps(self, df: pd.DataFrame) -> ResultSet:
+        """
+        FetchDataframeStepPartition has substeps defined.
+        Every batch of data has to be used to execute these substeps:
+        - the batch of data is put in as the result of FetchDataframeStepPartition
+        - substeps are executed using the result of the previous step (as if all fetched data were available)
+        - the final result is returned and used outside to concatenate with the results of the other batches
+        """
+
+        input_data = ResultSet()
+
+        input_data.from_df(
+            df,
+            table_name=self.table_alias[1],
+            table_alias=self.table_alias[2],
+            database=self.table_alias[0]
+        )
+
+        # execute with modified previous results
+        steps_data2 = self.steps_data.copy()
+        steps_data2[self.current_step_num] = input_data
+
+        sub_data = None
+        for substep in self.substeps:
+            sub_data = self.sql_query.execute_step(substep, steps_data=steps_data2)
+            steps_data2[substep.step_num] = sub_data
+        return sub_data
+
+    def fetch_threads(self, run_query: RunningQuery, query: ASTNode,
+                      thread_count: int = None, on_error: str = None) -> ResultSet:
+        """
+        Process batches in threads:
+        - spawn the required count of threads
+        - create in/out queues to communicate with threads
+        - send tasks to threads and receive results
+        """
+
+        # create communication queues
+
+        if thread_count is None:
+            thread_count = get_max_thread_count()
+
+        # 3 tasks per worker during 1 batch
+        partition_size = int(run_query.batch_size / thread_count / 3)
+        # min partition size
+        if partition_size < 10:
+            partition_size = 10
+
+        results = []
+
+        with ContextThreadPoolExecutor(max_workers=thread_count) as executor:
+
+            while True:
+                # fetch batch
+                query2 = run_query.get_partition_query(self.current_step_num, query)
+                response = self.dn.query(
+                    query=query2,
+                    session=self.session
+                )
+                df = response.data_frame
+
+                if df is None or len(df) == 0:
+                    # TODO detect cycles: data handler ignores the condition and output is repeated
+
+                    # exit & stop workers
+                    break
+
+                max_track_value = run_query.get_max_track_value(df)
+
+                # split into chunks and send to workers
+                futures = []
+                for df2 in split_data_frame(df, partition_size):
+                    futures.append(executor.submit(self.exec_sub_steps, df2))
+
+                for future in futures:
+                    try:
+                        results.append(future.result())
+                    except Exception as e:
+                        if on_error == 'skip':
+                            logger.error(e)
+                        else:
+                            executor.shutdown()
+                            raise e
+
+                # TODO
+                # 1. get next batch without updating track_value:
+                #    it allows keeping queue_in filled with data between fetching batches
+                run_query.set_progress(df, max_track_value)
+
+        return self.concat_results(results)

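For context, here is a minimal standalone sketch of the batching pattern this step describes; the function name and arguments are illustrative placeholders, not MindsDB APIs. Each iteration wraps the user's query, filters the tracking column past the last seen value, orders by it, and limits to the batch size:

    def build_partition_query(user_query: str, track_column: str, last_value, batch_size: int) -> str:
        # Wrap the user's query and page through it by track_column
        where = "" if last_value is None else f"WHERE {track_column} > {last_value} "
        return (
            f"SELECT * FROM ({user_query}) AS t "
            f"{where}ORDER BY {track_column} LIMIT {batch_size}"
        )

    print(build_partition_query("SELECT * FROM sales", "id", 1000, 1000))
    # SELECT * FROM (SELECT * FROM sales) AS t WHERE id > 1000 ORDER BY id LIMIT 1000
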
mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py
@@ -244,6 +244,7 @@ class ChromaDBHandler(VectorStoreHandler):
         offset: int = None,
         limit: int = None,
     ) -> pd.DataFrame:
+
         collection = self._client.get_collection(table_name)
         filters = self._translate_metadata_condition(conditions)
 
@@ -313,7 +314,7 @@ class ChromaDBHandler(VectorStoreHandler):
             TableField.ID.value: ids,
             TableField.CONTENT.value: documents,
             TableField.METADATA.value: metadatas,
-            TableField.EMBEDDINGS.value: embeddings,
+            TableField.EMBEDDINGS.value: list(embeddings),
         }
 
         if columns is not None:

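The list(embeddings) change matters when the client returns embeddings as a 2-D NumPy array: pandas refuses a 2-D array as a single column, while a list of row vectors works. A small sketch, assuming a NumPy return type:

    import numpy as np
    import pandas as pd

    embeddings = np.zeros((3, 4))  # 3 vectors of dimension 4
    # pd.DataFrame({"embeddings": embeddings}) raises ValueError: per-column arrays must be 1-dimensional
    df = pd.DataFrame({"embeddings": list(embeddings)})  # one 1-D array per row
    print(df["embeddings"].iloc[0].shape)  # (4,)
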
mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py
@@ -104,6 +104,22 @@ def construct_model_from_args(args: Dict) -> Embeddings:
     return model
 
 
+def row_to_document(row: pd.Series) -> str:
+    """
+    Convert a row in the input dataframe into a document
+
+    Default implementation is to concatenate all the columns
+    in the form of
+    field1: value1\nfield2: value2\n...
+    """
+    fields = row.index.tolist()
+    values = row.values.tolist()
+    document = "\n".join(
+        [f"{field}: {value}" for field, value in zip(fields, values)]
+    )
+    return document
+
+
 class LangchainEmbeddingHandler(BaseMLEngine):
     """
     Bridge class to connect langchain.embeddings module to mindsDB
@@ -180,7 +196,7 @@ class LangchainEmbeddingHandler(BaseMLEngine):
         )
 
         # convert each row into a document
-        df_texts = df[input_columns].apply(self.row_to_document, axis=1)
+        df_texts = df[input_columns].apply(row_to_document, axis=1)
         embeddings = model.embed_documents(df_texts.tolist())
 
         # create a new dataframe with the embeddings
@@ -188,21 +204,6 @@ class LangchainEmbeddingHandler(BaseMLEngine):
 
         return df_embeddings
 
-    def row_to_document(self, row: pd.Series) -> str:
-        """
-        Convert a row in the input dataframe into a document
-
-        Default implementation is to concatenate all the columns
-        in the form of
-        field1: value1\nfield2: value2\n...
-        """
-        fields = row.index.tolist()
-        values = row.values.tolist()
-        document = "\n".join(
-            [f"{field}: {value}" for field, value in zip(fields, values)]
-        )
-        return document
-
     def finetune(
         self, df: Union[DataFrame, None] = None, args: Union[Dict, None] = None
     ) -> None:

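In effect, row_to_document moved from an instance method to a module-level function so it can be applied directly. A self-contained usage sketch (the helper is repeated here so the snippet runs standalone):

    import pandas as pd

    def row_to_document(row: pd.Series) -> str:
        # concatenate columns as "field: value" lines, as in the handler above
        return "\n".join(f"{field}: {value}" for field, value in zip(row.index, row.values))

    df = pd.DataFrame({"title": ["MindsDB"], "body": ["SQL for AI"]})
    print(df[["title", "body"]].apply(row_to_document, axis=1).tolist())
    # ['title: MindsDB\nbody: SQL for AI']
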
mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
@@ -46,7 +46,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
 
     def _make_connection_args(self):
         cloud_pgvector_url = os.environ.get('KB_PGVECTOR_URL')
-        if
+        # if no connection args and shared pg vector defined - use it
+        if len(self.connection_args) == 0 and cloud_pgvector_url is not None:
             result = urlparse(cloud_pgvector_url)
             self.connection_args = {
                 'host': result.hostname,
@@ -157,7 +158,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
             where_clauses.append(f'{key} {value["op"]} {value["value"]}')
 
         if len(where_clauses) > 1:
-            return f"WHERE{' AND '.join(where_clauses)}"
+            return f"WHERE {' AND '.join(where_clauses)}"
         elif len(where_clauses) == 1:
             return f"WHERE {where_clauses[0]}"
         else:
@@ -195,11 +196,6 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
         # given filter conditions, construct where clause
         where_clause = self._construct_where_clause(filter_conditions)
 
-        # construct full after from clause, where clause + offset clause + limit clause
-        after_from_clause = self._construct_full_after_from_clause(
-            where_clause, offset_clause, limit_clause
-        )
-
         # Handle distance column specially since it's calculated, not stored
         modified_columns = []
         has_distance = False
@@ -219,7 +215,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
         if filter_conditions:
 
             if embedding_search:
-                search_vector = filter_conditions["embeddings"]["value"]
+                search_vector = filter_conditions["embeddings"]["value"]
                 filter_conditions.pop("embeddings")
 
                 if self._is_sparse:
@@ -241,15 +237,15 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
                 if has_distance:
                     targets = f"{targets}, (embeddings {distance_op} '{search_vector}') as distance"
 
-                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {
+                return f"SELECT {targets} FROM {table_name} {where_clause} ORDER BY embeddings {distance_op} '{search_vector}' ASC {limit_clause} {offset_clause} "
 
             else:
                 # if filter conditions, return rows that satisfy the conditions
-                return f"SELECT {targets} FROM {table_name} {
+                return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
 
         else:
             # if no filter conditions, return all rows
-            return f"SELECT {targets} FROM {table_name} {
+            return f"SELECT {targets} FROM {table_name} {limit_clause} {offset_clause}"
 
     def _check_table(self, table_name: str):
         # Apply namespace for a user

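The one-character WHERE fix above is easy to miss; without the space the generated SQL is invalid:

    where_clauses = ["a = 1", "b = 2"]
    print(f"WHERE{' AND '.join(where_clauses)}")   # WHEREa = 1 AND b = 2  (invalid SQL)
    print(f"WHERE {' AND '.join(where_clauses)}")  # WHERE a = 1 AND b = 2
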
mindsdb/integrations/handlers/postgres_handler/postgres_handler.py
@@ -1,6 +1,7 @@
 import time
 import json
 from typing import Optional
+import threading
 
 import pandas as pd
 import psycopg
@@ -77,6 +78,8 @@ class PostgresHandler(DatabaseHandler):
         self.is_connected = False
         self.thread_safe = True
 
+        self._insert_lock = threading.Lock()
+
     def __del__(self):
         if self.is_connected:
             self.disconnect()
@@ -261,14 +264,35 @@ class PostgresHandler(DatabaseHandler):
 
         connection = self.connect()
 
-        columns =
+        columns = df.columns
+
+        # postgres 'copy' is not thread safe. use lock to prevent concurrent execution
+        with self._insert_lock:
+            resp = self.get_columns(table_name)
+
+        # copy requires precise cases of names: get current column names from table and adapt input dataframe columns
+        if resp.data_frame is not None and not resp.data_frame.empty:
+            db_columns = {
+                c.lower(): c
+                for c in resp.data_frame['Field']
+            }
+
+            # try to get case of existing column
+            columns = [
+                db_columns.get(c.lower(), c)
+                for c in columns
+            ]
+
+        columns = [f'"{c}"' for c in columns]
         rowcount = None
+
        with connection.cursor() as cur:
             try:
-                with
-
+                with self._insert_lock:
+                    with cur.copy(f'copy "{table_name}" ({",".join(columns)}) from STDIN WITH CSV') as copy:
+                        df.to_csv(copy, index=False, header=False)
 
-
+                connection.commit()
             except Exception as e:
                 logger.error(f'Error running insert to {table_name} on {self.database}, {e}!')
                 connection.rollback()

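The case-adaptation logic above can be sketched standalone: incoming column names are matched to the exact casing stored in the table, falling back to the input name when the column is unknown (table names here are illustrative):

    db_columns = {c.lower(): c for c in ["Id", "UserName", "created_at"]}  # as returned by get_columns
    incoming = ["id", "username", "Extra"]
    columns = [db_columns.get(c.lower(), c) for c in incoming]
    print([f'"{c}"' for c in columns])  # ['"Id"', '"UserName"', '"Extra"']
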
mindsdb/integrations/libs/llm/config.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 
 
 class BaseLLMConfig(BaseModel):
@@ -104,3 +104,13 @@ class NvidiaNIMConfig(BaseLLMConfig):
 class MindsdbConfig(BaseLLMConfig):
     model_name: str
     project_name: str
+
+
+# See https://python.langchain.com/api_reference/google_genai/chat_models/langchain_google_genai.chat_models.ChatGoogleGenerativeAI.html
+class GoogleConfig(BaseLLMConfig):
+    model: str = Field(description="Gemini model name to use (e.g., 'gemini-1.5-pro')")
+    temperature: Optional[float] = Field(default=None, description="Controls randomness in responses")
+    top_p: Optional[float] = Field(default=None, description="Nucleus sampling parameter")
+    top_k: Optional[int] = Field(default=None, description="Number of highest probability tokens to consider")
+    max_output_tokens: Optional[int] = Field(default=None, description="Maximum number of tokens to generate")
+    google_api_key: Optional[str] = Field(default=None, description="API key for Google Generative AI")

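A minimal sketch of how such a pydantic config behaves; this is a stripped-down stand-in for GoogleConfig, since BaseLLMConfig is not shown in this diff:

    from typing import Optional
    from pydantic import BaseModel, Field

    class GoogleConfigSketch(BaseModel):
        model: str = Field(description="Gemini model name")
        temperature: Optional[float] = None
        max_output_tokens: Optional[int] = None

    cfg = GoogleConfigSketch(model="gemini-1.5-pro", temperature=0.2)
    print(cfg.model_dump(exclude_none=True))
    # {'model': 'gemini-1.5-pro', 'temperature': 0.2}
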
mindsdb/integrations/libs/llm/utils.py
@@ -10,6 +10,7 @@ from mindsdb.integrations.libs.llm.config import (
     AnthropicConfig,
     AnyscaleConfig,
     BaseLLMConfig,
+    GoogleConfig,
     LiteLLMConfig,
     OllamaConfig,
     OpenAIConfig,
@@ -31,6 +32,8 @@ DEFAULT_ANTHROPIC_MODEL = "claude-3-haiku-20240307"
 DEFAULT_ANYSCALE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1"
 
+DEFAULT_GOOGLE_MODEL = "gemini-2.5-pro-preview-03-25"
+
 DEFAULT_LITELLM_MODEL = "gpt-3.5-turbo"
 DEFAULT_LITELLM_PROVIDER = "openai"
 DEFAULT_LITELLM_BASE_URL = "https://ai.dev.mindsdb.com"
@@ -225,6 +228,15 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
             openai_organization=args.get("api_organization", None),
             request_timeout=args.get("request_timeout", None),
         )
+    if provider == "google":
+        return GoogleConfig(
+            model=args.get("model_name", DEFAULT_GOOGLE_MODEL),
+            temperature=temperature,
+            top_p=args.get("top_p", None),
+            top_k=args.get("top_k", None),
+            max_output_tokens=args.get("max_tokens", None),
+            google_api_key=args["api_keys"].get("google", None),
+        )
 
     raise ValueError(f"Provider {provider} is not supported.")
 

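Note that the new branch indexes args["api_keys"] directly (no .get), so callers must supply that mapping. A sketch of the args shape it expects, with illustrative values:

    args = {
        "model_name": "gemini-1.5-pro",      # falls back to DEFAULT_GOOGLE_MODEL if omitted
        "top_p": 0.9,
        "api_keys": {"google": "YOUR_KEY"},  # required: the branch does args["api_keys"]
    }
    # config = get_llm_config("google", args)
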
mindsdb/integrations/libs/vectordatabase_handler.py
@@ -278,8 +278,16 @@ class VectorStoreHandler(BaseHandler):
         return self.do_upsert(table_name, df)
 
     def do_upsert(self, table_name, df):
-
+        """Upsert data into table, handling document updates and deletions.
 
+        Args:
+            table_name (str): Name of the table
+            df (pd.DataFrame): DataFrame containing the data to upsert
+
+        The function handles three cases:
+        1. New documents: Insert them
+        2. Updated documents: Delete old chunks and insert new ones
+        """
         id_col = TableField.ID.value
         content_col = TableField.CONTENT.value

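An illustration of the documented insert-vs-update split, in plain pandas (not the handler's actual implementation):

    import pandas as pd

    existing = pd.DataFrame({"id": ["a"], "content": ["old"]})
    incoming = pd.DataFrame({"id": ["a", "b"], "content": ["new", "fresh"]})

    is_update = incoming["id"].isin(existing["id"])
    updated = incoming[is_update]    # delete old chunks, then insert the new ones
    new_docs = incoming[~is_update]  # plain insert
    print(updated["id"].tolist(), new_docs["id"].tolist())  # ['a'] ['b']
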
mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py
@@ -18,7 +18,7 @@ log = logging.getLogger(__name__)
 
 
 class LLMReranker(BaseDocumentCompressor):
-    filtering_threshold: float = 0.
+    filtering_threshold: float = 0.0  # Default threshold for filtering
     model: str = DEFAULT_RERANKING_MODEL  # Model to use for reranking
     temperature: float = 0.0  # Temperature for the model
     openai_api_key: Optional[str] = None

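A filtering threshold in a reranker is typically applied like this; an assumption based only on the field name, since the actual logic lives elsewhere in the class:

    scores = {"doc1": 0.92, "doc2": 0.15}  # relevance scores from the reranking model
    filtering_threshold = 0.5
    kept = [doc for doc, score in scores.items() if score >= filtering_threshold]
    print(kept)  # ['doc1']
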
mindsdb/interfaces/agents/constants.py
@@ -15,7 +15,8 @@ SUPPORTED_PROVIDERS = {
     "litellm",
     "ollama",
     "nvidia_nim",
-    "vllm"
+    "vllm",
+    "google"
 }
 # Chat models
 ANTHROPIC_CHAT_MODELS = (
@@ -153,6 +154,15 @@ NVIDIA_NIM_CHAT_MODELS = (
     "ibm/granite-34b-code-instruct",
 )
 
+GOOGLE_GEMINI_CHAT_MODELS = (
+    "gemini-2.5-pro-preview-03-25",
+    "gemini-2.0-flash",
+    "gemini-2.0-flash-lite",
+    "gemini-1.5-flash",
+    "gemini-1.5-flash-8b",
+    "gemini-1.5-pro",
+)
+
 # Define a read-only dictionary mapping providers to their models
 PROVIDER_TO_MODELS = MappingProxyType(
     {
@@ -160,6 +170,7 @@ PROVIDER_TO_MODELS = MappingProxyType(
         "ollama": OLLAMA_CHAT_MODELS,
         "openai": OPEN_AI_CHAT_MODELS,
         "nvidia_nim": NVIDIA_NIM_CHAT_MODELS,
+        "google": GOOGLE_GEMINI_CHAT_MODELS,
     }
 )
 

mindsdb/interfaces/agents/langchain_agent.py
@@ -15,6 +15,7 @@ from langchain_community.chat_models import (
     ChatAnyscale,
     ChatLiteLLM,
     ChatOllama)
+from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_core.agents import AgentAction, AgentStep
 from langchain_core.callbacks.base import BaseCallbackHandler
 
@@ -50,6 +51,7 @@ from .constants import (
     DEFAULT_TIKTOKEN_MODEL_NAME,
     SUPPORTED_PROVIDERS,
     ANTHROPIC_CHAT_MODELS,
+    GOOGLE_GEMINI_CHAT_MODELS,
     OLLAMA_CHAT_MODELS,
     NVIDIA_NIM_CHAT_MODELS,
     USER_COLUMN,
@@ -85,6 +87,8 @@ def get_llm_provider(args: Dict) -> str:
         return "ollama"
     if args["model_name"] in NVIDIA_NIM_CHAT_MODELS:
         return "nvidia_nim"
+    if args["model_name"] in GOOGLE_GEMINI_CHAT_MODELS:
+        return "google"
 
     # For vLLM, require explicit provider specification
     raise ValueError("Invalid model name. Please define a supported llm provider")
@@ -162,6 +166,8 @@ def create_chat_model(args: Dict):
         return ChatOllama(**model_kwargs)
     if args["provider"] == "nvidia_nim":
         return ChatNVIDIA(**model_kwargs)
+    if args["provider"] == "google":
+        return ChatGoogleGenerativeAI(**model_kwargs)
     if args["provider"] == "mindsdb":
         return ChatMindsdb(**model_kwargs)
     raise ValueError(f'Unknown provider: {args["provider"]}')

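Taken together, these hunks let a Gemini model name be auto-detected and routed to langchain-google-genai. Roughly, with illustrative values (requires the langchain-google-genai package and a real API key):

    from langchain_google_genai import ChatGoogleGenerativeAI

    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key="YOUR_KEY")
    # llm.invoke("Hello") sends a chat request to the Gemini API
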
mindsdb/interfaces/database/projects.py
@@ -69,6 +69,12 @@ class Project:
         self.id = record.id
 
     def delete(self):
+        if self.record.metadata_ and self.record.metadata_.get('is_default', False):
+            raise Exception(
+                f"Project '{self.name}' can not be deleted, because it is default project."
+                "The default project can be changed in the config file or by setting the environment variable MINDSDB_DEFAULT_PROJECT."
+            )
+
         tables = self.get_tables()
         tables = [key for key, val in tables.items() if val['type'] != 'table']
         if len(tables) > 0:
@@ -466,7 +472,7 @@ class ProjectController:
 
         if new_metadata is not None:
             project.metadata = new_metadata
-            project.record.
+            project.record.metadata_ = new_metadata
             flag_modified(project.record, 'metadata_')
 
         db.session.commit()