MindsDB 25.4.2.0__py3-none-any.whl → 25.4.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.
Files changed (30)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +29 -0
  3. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +3 -2
  4. mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
  5. mindsdb/api/executor/planner/plan_join.py +1 -1
  6. mindsdb/api/executor/planner/query_plan.py +1 -0
  7. mindsdb/api/executor/planner/query_planner.py +86 -14
  8. mindsdb/api/executor/planner/steps.py +9 -1
  9. mindsdb/api/executor/sql_query/sql_query.py +37 -6
  10. mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
  11. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +288 -0
  12. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
  13. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
  14. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +7 -11
  15. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +28 -4
  16. mindsdb/integrations/libs/llm/config.py +11 -1
  17. mindsdb/integrations/libs/llm/utils.py +12 -0
  18. mindsdb/interfaces/agents/constants.py +12 -1
  19. mindsdb/interfaces/agents/langchain_agent.py +6 -0
  20. mindsdb/interfaces/knowledge_base/controller.py +128 -43
  21. mindsdb/interfaces/query_context/context_controller.py +221 -0
  22. mindsdb/interfaces/storage/db.py +23 -0
  23. mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
  24. mindsdb/utilities/context_executor.py +1 -1
  25. mindsdb/utilities/partitioning.py +35 -20
  26. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.2.1.dist-info}/METADATA +224 -222
  27. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.2.1.dist-info}/RECORD +30 -28
  28. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.2.1.dist-info}/WHEEL +0 -0
  29. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.2.1.dist-info}/licenses/LICENSE +0 -0
  30. {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,288 @@
+import pandas as pd
+import threading
+import queue
+from typing import List
+
+from mindsdb_sql_parser import ASTNode
+from mindsdb.api.executor.planner.steps import FetchDataframeStepPartition
+from mindsdb.integrations.utilities.query_traversal import query_traversal
+
+from mindsdb.interfaces.query_context.context_controller import RunningQuery
+from mindsdb.api.executor.sql_query.result_set import ResultSet
+from mindsdb.utilities import log
+from mindsdb.utilities.config import Config
+from mindsdb.utilities.context import Context, context as ctx
+from mindsdb.utilities.partitioning import get_max_thread_count, split_data_frame
+from mindsdb.api.executor.sql_query.steps.fetch_dataframe import get_table_alias, get_fill_param_fnc
+
+from .base import BaseStepCall
+
+
+logger = log.getLogger(__name__)
+
+
+class FetchDataframePartitionCall(BaseStepCall):
+    """
+    Alternative to FetchDataframeCall that fetches data in batches by wrapping the user's query as:
+
+        select * from ({user query})
+        where {track_column} > {previous value}
+        order by {track_column}
+        limit {batch_size}
+
+    """
+
+    bind = FetchDataframeStepPartition
+
+    def call(self, step: FetchDataframeStepPartition) -> ResultSet:
+        """
+        Parameters:
+        - batch_size - count of rows to fetch from the database per iteration, optional, default 1000
+        - threads - run partitioning in threads, bool or int, optional; if set:
+          - int value: use it as the count of threads
+          - true: enable threads, autodetect the count of threads
+          - false: disable threads even if the ml task queue is enabled
+        - track_column - column used for creating partitions;
+          the query is sorted by this column and the select is limited by batch_size
+        - error - behavior on a partition error, default 'raise';
+          when error='skip', errors in a partition are skipped and execution continues
+        """
+
+        self.dn = self.session.datahub.get(step.integration)
+        query = step.query
+
+        # fill params
+        fill_params = get_fill_param_fnc(self.steps_data)
+        query_traversal(query, fill_params)
+
+        # get query record
+        run_query = self.sql_query.run_query
+        if run_query is None:
+            raise RuntimeError('Error with partitioning of the query')
+        run_query.set_params(step.params)
+
+        self.table_alias = get_table_alias(step.query.from_table, self.context.get('database'))
+        self.current_step_num = step.step_num
+        self.substeps = step.steps
+
+        config = Config()
+
+        # is the ml task queue enabled?
+        use_threads, thread_count = False, None
+        if config['ml_task_queue']['type'] == 'redis':
+            use_threads = True
+
+        # use threads?
+        if 'threads' in step.params:
+            threads = step.params['threads']
+            if isinstance(threads, int):
+                thread_count = threads
+                use_threads = True
+            if threads is True:
+                use_threads = True
+            if threads is False:
+                # disable even with ml task queue
+                use_threads = False
+
+        on_error = step.params.get('error', 'raise')
+        if use_threads:
+            return self.fetch_threads(run_query, query, thread_count=thread_count, on_error=on_error)
+        else:
+            return self.fetch_iterate(run_query, query, on_error=on_error)
+
+    def fetch_iterate(self, run_query: RunningQuery, query: ASTNode, on_error: str = None) -> ResultSet:
+        """
+        Process batches one by one in a loop
+        """
+
+        results = []
+        while True:
+
+            # fetch batch
+            query2 = run_query.get_partition_query(self.current_step_num, query)
+            response = self.dn.query(
+                query=query2,
+                session=self.session
+            )
+            df = response.data_frame
+
+            if df is None or len(df) == 0:
+                break
+
+            # executing sub-steps can modify dataframe columns; memorize the max tracking value first
+            max_track_value = run_query.get_max_track_value(df)
+            try:
+                sub_data = self.exec_sub_steps(df)
+                results.append(sub_data)
+            except Exception as e:
+                if on_error == 'skip':
+                    logger.error(e)
+                else:
+                    raise e
+
+            run_query.set_progress(df, max_track_value)
+
+        return self.concat_results(results)
+
+    def concat_results(self, results: List[ResultSet]) -> ResultSet:
+        """
+        Concatenate a list of result sets into a single result set
+        """
+        df_list = []
+        for res in results:
+            df, col_names = res.to_df_cols()
+            if len(df) > 0:
+                df_list.append(df)
+
+        data = ResultSet()
+        if len(df_list) > 0:
+            data.from_df_cols(pd.concat(df_list), col_names)
+
+        return data
+
+    def exec_sub_steps(self, df: pd.DataFrame) -> ResultSet:
+        """
+        FetchDataframeStepPartition has substeps defined.
+        Every batch of data has to be used to execute these substeps:
+        - the batch of data is set as the result of FetchDataframeStepPartition
+        - substeps are executed using the result of the previous step (as if all fetched data were available)
+        - the final result is returned and used outside to concatenate with the results of the other batches
+        """
+
+        input_data = ResultSet()
+
+        input_data.from_df(
+            df,
+            table_name=self.table_alias[1],
+            table_alias=self.table_alias[2],
+            database=self.table_alias[0]
+        )
+
+        # execute with modified previous results
+        steps_data2 = self.steps_data.copy()
+        steps_data2[self.current_step_num] = input_data
+
+        sub_data = None
+        for substep in self.substeps:
+            sub_data = self.sql_query.execute_step(substep, steps_data=steps_data2)
+            steps_data2[substep.step_num] = sub_data
+        return sub_data
+
+    def fetch_threads(self, run_query: RunningQuery, query: ASTNode,
+                      thread_count: int = None, on_error: str = None) -> ResultSet:
+        """
+        Process batches in threads:
+        - spawn the required count of threads
+        - create in/out queues to communicate with the threads
+        - send tasks to the threads and receive results
+        """
+
+        # create communication queues
+        queue_in = queue.Queue()
+        queue_out = queue.Queue()
+        self.stop_event = threading.Event()
+
+        if thread_count is None:
+            thread_count = get_max_thread_count()
+
+        # 3 tasks per worker per batch
+        partition_size = int(run_query.batch_size / thread_count / 3)
+        # min partition size
+        if partition_size < 10:
+            partition_size = 10
+
+        # create a pool of N workers
+        workers = []
+        results = []
+
+        try:
+            for i in range(thread_count):
+                worker = threading.Thread(target=self._worker, daemon=True, args=(ctx.dump(), queue_in,
+                                                                                  queue_out, self.stop_event))
+                worker.start()
+                workers.append(worker)
+
+            while True:
+                # fetch batch
+                query2 = run_query.get_partition_query(self.current_step_num, query)
+                response = self.dn.query(
+                    query=query2,
+                    session=self.session
+                )
+                df = response.data_frame
+
+                if df is None or len(df) == 0:
+                    # TODO detect cycles: data handler ignores condition and output is repeated
+
+                    # exit & stop workers
+                    break
+
+                max_track_value = run_query.get_max_track_value(df)
+
+                # split into chunks and send to workers
+                sent_chunks = 0
+                for df2 in split_data_frame(df, partition_size):
+                    queue_in.put([sent_chunks, df2])
+                    sent_chunks += 1
+
+                batch_results = []
+                for i in range(sent_chunks):
+                    res = queue_out.get()
+                    if 'error' in res:
+                        if on_error == 'skip':
+                            logger.error(res['error'])
+                        else:
+                            raise RuntimeError(res['error'])
+
+                    # use .get(): a skipped error response carries no 'data' key
+                    if res.get('data') is not None:
+                        batch_results.append(res)
+
+                # sort results
+                batch_results.sort(key=lambda x: x['num'])
+
+                results.append(self.concat_results(
+                    [item['data'] for item in batch_results]
+                ))
+
+                # TODO
+                # 1. get the next batch without updating track_value:
+                #    it allows keeping queue_in filled with data between fetching batches
+                run_query.set_progress(df, max_track_value)
+        finally:
+            self.close_workers(workers)
+
+        return self.concat_results(results)
+
+    def close_workers(self, workers: List[threading.Thread]):
+        """
+        Send a signal to the workers to stop
+        """
+
+        self.stop_event.set()
+        for worker in workers:
+            if worker.is_alive():
+                worker.join()
+
+    def _worker(self, context: Context, queue_in: queue.Queue, queue_out: queue.Queue, stop_event: threading.Event):
+        """
+        Worker function. Executes incoming tasks until stop_event is set
+        """
+        ctx.load(context)
+        while True:
+            if stop_event.is_set():
+                break
+
+            try:
+                chunk_num, df = queue_in.get(timeout=1)
+                if df is None:
+                    continue

+                sub_data = self.exec_sub_steps(df)
+
+                queue_out.put({'data': sub_data, 'num': chunk_num})
+            except queue.Empty:
+                continue
+
+            except Exception as e:
+                queue_out.put({'error': str(e)})
+                stop_event.set()
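
The class docstring above describes keyset pagination: each batch re-runs the user's query filtered past the last seen value of track_column. A minimal, self-contained sketch of the same pattern over an in-memory dataframe (fetch_in_batches is illustrative only, not part of MindsDB):

    import pandas as pd

    def fetch_in_batches(df: pd.DataFrame, track_column: str, batch_size: int = 1000):
        # Same keyset pattern as get_partition_query: keep rows past the last
        # seen track value, order by that column, cap at batch_size.
        last = None
        while True:
            batch = df if last is None else df[df[track_column] > last]
            batch = batch.sort_values(track_column).head(batch_size)
            if batch.empty:
                break
            yield batch
            last = batch[track_column].max()

    # Concatenating the batches reproduces the full result, as concat_results does.
    data = pd.DataFrame({'id': range(10)})
    parts = list(fetch_in_batches(data, 'id', batch_size=4))
    assert pd.concat(parts)['id'].tolist() == list(range(10))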
@@ -104,6 +104,22 @@ def construct_model_from_args(args: Dict) -> Embeddings:
     return model
 
 
+def row_to_document(row: pd.Series) -> str:
+    """
+    Convert a row in the input dataframe into a document
+
+    Default implementation is to concatenate all the columns
+    in the form of
+    field1: value1\nfield2: value2\n...
+    """
+    fields = row.index.tolist()
+    values = row.values.tolist()
+    document = "\n".join(
+        [f"{field}: {value}" for field, value in zip(fields, values)]
+    )
+    return document
+
+
 class LangchainEmbeddingHandler(BaseMLEngine):
     """
     Bridge class to connect langchain.embeddings module to mindsDB
@@ -180,7 +196,7 @@ class LangchainEmbeddingHandler(BaseMLEngine):
         )
 
         # convert each row into a document
-        df_texts = df[input_columns].apply(self.row_to_document, axis=1)
+        df_texts = df[input_columns].apply(row_to_document, axis=1)
         embeddings = model.embed_documents(df_texts.tolist())
 
         # create a new dataframe with the embeddings
@@ -188,21 +204,6 @@ class LangchainEmbeddingHandler(BaseMLEngine):
 
         return df_embeddings
 
-    def row_to_document(self, row: pd.Series) -> str:
-        """
-        Convert a row in the input dataframe into a document
-
-        Default implementation is to concatenate all the columns
-        in the form of
-        field1: value1\nfield2: value2\n...
-        """
-        fields = row.index.tolist()
-        values = row.values.tolist()
-        document = "\n".join(
-            [f"{field}: {value}" for field, value in zip(fields, values)]
-        )
-        return document
-
     def finetune(
         self, df: Union[DataFrame, None] = None, args: Union[Dict, None] = None
     ) -> None:
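
To illustrate the relocated helper: row_to_document renders a row as newline-separated "field: value" pairs. A quick usage sketch (the function body is condensed from the diff above):

    import pandas as pd

    def row_to_document(row: pd.Series) -> str:
        # concatenate columns as "field: value" lines
        return "\n".join([f"{field}: {value}" for field, value in zip(row.index.tolist(), row.values.tolist())])

    row = pd.Series({'title': 'MindsDB', 'category': 'database'})
    assert row_to_document(row) == 'title: MindsDB\ncategory: database'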
@@ -50,6 +50,7 @@ class LangChainHandler(BaseMLEngine):
     - OpenAI
     - Anthropic
     - Anyscale
+    - Google
     - LiteLLM
     - Ollama
 
@@ -46,7 +46,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
 
     def _make_connection_args(self):
        cloud_pgvector_url = os.environ.get('KB_PGVECTOR_URL')
-        if cloud_pgvector_url is not None:
+        # if no connection args and a shared pgvector is defined, use it
+        if len(self.connection_args) == 0 and cloud_pgvector_url is not None:
            result = urlparse(cloud_pgvector_url)
            self.connection_args = {
                'host': result.hostname,
@@ -157,7 +158,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
             where_clauses.append(f'{key} {value["op"]} {value["value"]}')
 
         if len(where_clauses) > 1:
-            return f"WHERE{' AND '.join(where_clauses)}"
+            return f"WHERE {' AND '.join(where_clauses)}"
         elif len(where_clauses) == 1:
             return f"WHERE {where_clauses[0]}"
         else:
@@ -195,11 +196,6 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
         # given filter conditions, construct where clause
         where_clause = self._construct_where_clause(filter_conditions)
 
-        # construct full after from clause, where clause + offset clause + limit clause
-        after_from_clause = self._construct_full_after_from_clause(
-            where_clause, offset_clause, limit_clause
-        )
-
         # Handle distance column specially since it's calculated, not stored
         modified_columns = []
         has_distance = False
@@ -219,7 +215,7 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
         if filter_conditions:
 
             if embedding_search:
-                search_vector = filter_conditions["embeddings"]["value"][0]
+                search_vector = filter_conditions["embeddings"]["value"]
                 filter_conditions.pop("embeddings")
 
                 if self._is_sparse:
@@ -241,15 +237,15 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
                 if has_distance:
                     targets = f"{targets}, (embeddings {distance_op} '{search_vector}') as distance"
 
-                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {after_from_clause}"
+                return f"SELECT {targets} FROM {table_name} {where_clause} ORDER BY embeddings {distance_op} '{search_vector}' ASC {limit_clause} {offset_clause} "
 
             else:
                 # if filter conditions, return rows that satisfy the conditions
-                return f"SELECT {targets} FROM {table_name} {after_from_clause}"
+                return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
 
         else:
             # if no filter conditions, return all rows
-            return f"SELECT {targets} FROM {table_name} {after_from_clause}"
+            return f"SELECT {targets} FROM {table_name} {limit_clause} {offset_clause}"
 
     def _check_table(self, table_name: str):
         # Apply namespace for a user
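
The rewritten queries fix two problems: the missing space after WHERE and clause placement, since WHERE must precede ORDER BY, LIMIT, and OFFSET. A hypothetical sketch of the ordering (build_select is invented here for illustration):

    def build_select(targets: str, table: str, where: str = '',
                     order_by: str = '', limit: str = '', offset: str = '') -> str:
        # Assemble clauses in the order PostgreSQL requires:
        # SELECT ... FROM ... [WHERE] [ORDER BY] [LIMIT] [OFFSET]
        parts = [f"SELECT {targets} FROM {table}", where, order_by, limit, offset]
        return ' '.join(p for p in parts if p)

    print(build_select('id, content', 'items',
                       where='WHERE id > 5',
                       order_by="ORDER BY embeddings <-> '[1,2,3]' ASC",
                       limit='LIMIT 10'))
    # SELECT id, content FROM items WHERE id > 5 ORDER BY embeddings <-> '[1,2,3]' ASC LIMIT 10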
@@ -1,6 +1,7 @@
 import time
 import json
 from typing import Optional
+import threading
 
 import pandas as pd
 import psycopg
@@ -77,6 +78,8 @@ class PostgresHandler(DatabaseHandler):
         self.is_connected = False
         self.thread_safe = True
 
+        self._insert_lock = threading.Lock()
+
     def __del__(self):
         if self.is_connected:
             self.disconnect()
@@ -261,14 +264,35 @@ class PostgresHandler(DatabaseHandler):
 
         connection = self.connect()
 
-        columns = [f'"{c}"' for c in df.columns]
+        columns = df.columns
+
+        # postgres 'copy' is not thread safe. use a lock to prevent concurrent execution
+        with self._insert_lock:
+            resp = self.get_columns(table_name)
+
+        # copy requires the precise casing of names: get the current column names from the table and adapt the input dataframe columns
+        if resp.data_frame is not None and not resp.data_frame.empty:
+            db_columns = {
+                c.lower(): c
+                for c in resp.data_frame['Field']
+            }
+
+            # try to get the casing of the existing column
+            columns = [
+                db_columns.get(c.lower(), c)
+                for c in columns
+            ]
+
+        columns = [f'"{c}"' for c in columns]
         rowcount = None
+
         with connection.cursor() as cur:
             try:
-                with cur.copy(f'copy "{table_name}" ({",".join(columns)}) from STDIN WITH CSV') as copy:
-                    df.to_csv(copy, index=False, header=False)
+                with self._insert_lock:
+                    with cur.copy(f'copy "{table_name}" ({",".join(columns)}) from STDIN WITH CSV') as copy:
+                        df.to_csv(copy, index=False, header=False)
 
-                connection.commit()
+                    connection.commit()
             except Exception as e:
                 logger.error(f'Error running insert to {table_name} on {self.database}, {e}!')
                 connection.rollback()
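
The casing lookup matters because COPY with quoted identifiers is case-sensitive in PostgreSQL. A small sketch of the mapping the handler now performs (the table columns here are hypothetical):

    # Columns as they exist in the table vs. incoming dataframe columns.
    table_columns = ['Id', 'CreatedAt', 'payload']
    db_columns = {c.lower(): c for c in table_columns}

    incoming = ['id', 'createdat', 'payload', 'extra']
    resolved = [db_columns.get(c.lower(), c) for c in incoming]
    assert resolved == ['Id', 'CreatedAt', 'payload', 'extra']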
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 
 
 class BaseLLMConfig(BaseModel):
@@ -104,3 +104,13 @@ class NvidiaNIMConfig(BaseLLMConfig):
 class MindsdbConfig(BaseLLMConfig):
     model_name: str
     project_name: str
+
+
+# See https://python.langchain.com/api_reference/google_genai/chat_models/langchain_google_genai.chat_models.ChatGoogleGenerativeAI.html
+class GoogleConfig(BaseLLMConfig):
+    model: str = Field(description="Gemini model name to use (e.g., 'gemini-1.5-pro')")
+    temperature: Optional[float] = Field(default=None, description="Controls randomness in responses")
+    top_p: Optional[float] = Field(default=None, description="Nucleus sampling parameter")
+    top_k: Optional[int] = Field(default=None, description="Number of highest probability tokens to consider")
+    max_output_tokens: Optional[int] = Field(default=None, description="Maximum number of tokens to generate")
+    google_api_key: Optional[str] = Field(default=None, description="API key for Google Generative AI")
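
The new model mirrors ChatGoogleGenerativeAI's parameters; constructing it directly might look like this (all values below are placeholders):

    cfg = GoogleConfig(
        model='gemini-1.5-pro',          # required
        temperature=0.2,                 # optional sampling controls
        max_output_tokens=1024,
        google_api_key='YOUR_API_KEY',   # placeholder
    )
    # unset optional fields stay None and can be dropped with
    # cfg.model_dump(exclude_none=True) before passing to the chat model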
@@ -10,6 +10,7 @@ from mindsdb.integrations.libs.llm.config import (
     AnthropicConfig,
     AnyscaleConfig,
     BaseLLMConfig,
+    GoogleConfig,
     LiteLLMConfig,
     OllamaConfig,
     OpenAIConfig,
@@ -31,6 +32,8 @@ DEFAULT_ANTHROPIC_MODEL = "claude-3-haiku-20240307"
 DEFAULT_ANYSCALE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1"
 
+DEFAULT_GOOGLE_MODEL = "gemini-2.5-pro-preview-03-25"
+
 DEFAULT_LITELLM_MODEL = "gpt-3.5-turbo"
 DEFAULT_LITELLM_PROVIDER = "openai"
 DEFAULT_LITELLM_BASE_URL = "https://ai.dev.mindsdb.com"
@@ -225,6 +228,15 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
             openai_organization=args.get("api_organization", None),
             request_timeout=args.get("request_timeout", None),
         )
+    if provider == "google":
+        return GoogleConfig(
+            model=args.get("model_name", DEFAULT_GOOGLE_MODEL),
+            temperature=temperature,
+            top_p=args.get("top_p", None),
+            top_k=args.get("top_k", None),
+            max_output_tokens=args.get("max_tokens", None),
+            google_api_key=args["api_keys"].get("google", None),
+        )
 
     raise ValueError(f"Provider {provider} is not supported.")
 
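Per the new branch, a google provider call reads the model name (falling back to DEFAULT_GOOGLE_MODEL) and expects the API key under args['api_keys']['google']; temperature is resolved earlier in the function from the args. A hedged usage sketch:

    args = {
        'model_name': 'gemini-1.5-pro',
        'api_keys': {'google': 'YOUR_API_KEY'},  # placeholder
        # 'top_p', 'top_k' and 'max_tokens' are optional and default to None
    }
    config = get_llm_config('google', args)  # returns a GoogleConfig instance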
@@ -15,7 +15,8 @@ SUPPORTED_PROVIDERS = {
     "litellm",
     "ollama",
     "nvidia_nim",
-    "vllm"
+    "vllm",
+    "google"
 }
 # Chat models
 ANTHROPIC_CHAT_MODELS = (
@@ -153,6 +154,15 @@ NVIDIA_NIM_CHAT_MODELS = (
     "ibm/granite-34b-code-instruct",
 )
 
+GOOGLE_GEMINI_CHAT_MODELS = (
+    "gemini-2.5-pro-preview-03-25",
+    "gemini-2.0-flash",
+    "gemini-2.0-flash-lite",
+    "gemini-1.5-flash",
+    "gemini-1.5-flash-8b",
+    "gemini-1.5-pro",
+)
+
 # Define a read-only dictionary mapping providers to their models
 PROVIDER_TO_MODELS = MappingProxyType(
     {
@@ -160,6 +170,7 @@ PROVIDER_TO_MODELS = MappingProxyType(
         "ollama": OLLAMA_CHAT_MODELS,
         "openai": OPEN_AI_CHAT_MODELS,
         "nvidia_nim": NVIDIA_NIM_CHAT_MODELS,
+        "google": GOOGLE_GEMINI_CHAT_MODELS,
     }
 )
 
@@ -15,6 +15,7 @@ from langchain_community.chat_models import (
     ChatAnyscale,
     ChatLiteLLM,
     ChatOllama)
+from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_core.agents import AgentAction, AgentStep
 from langchain_core.callbacks.base import BaseCallbackHandler
 
@@ -50,6 +51,7 @@ from .constants import (
     DEFAULT_TIKTOKEN_MODEL_NAME,
     SUPPORTED_PROVIDERS,
     ANTHROPIC_CHAT_MODELS,
+    GOOGLE_GEMINI_CHAT_MODELS,
     OLLAMA_CHAT_MODELS,
     NVIDIA_NIM_CHAT_MODELS,
     USER_COLUMN,
@@ -85,6 +87,8 @@ def get_llm_provider(args: Dict) -> str:
         return "ollama"
     if args["model_name"] in NVIDIA_NIM_CHAT_MODELS:
         return "nvidia_nim"
+    if args["model_name"] in GOOGLE_GEMINI_CHAT_MODELS:
+        return "google"
 
     # For vLLM, require explicit provider specification
     raise ValueError("Invalid model name. Please define a supported llm provider")
@@ -162,6 +166,8 @@ def create_chat_model(args: Dict):
         return ChatOllama(**model_kwargs)
     if args["provider"] == "nvidia_nim":
         return ChatNVIDIA(**model_kwargs)
+    if args["provider"] == "google":
+        return ChatGoogleGenerativeAI(**model_kwargs)
     if args["provider"] == "mindsdb":
         return ChatMindsdb(**model_kwargs)
     raise ValueError(f'Unknown provider: {args["provider"]}')
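
Taken together with the constants change, Gemini model names now resolve end to end; for instance:

    args = {'model_name': 'gemini-2.0-flash'}
    assert get_llm_provider(args) == 'google'   # matched via GOOGLE_GEMINI_CHAT_MODELS
    # create_chat_model then instantiates ChatGoogleGenerativeAI for provider 'google'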