PyPI - MindsDB - Versions diffs - 25.4.2.0__py3-none-any.whl → 25.4.2.1__py3-none-any.whl - Mend

MindsDB 25.4.2.0__py3-none-any.whl → 25.4.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic. Click here for more details.

Files changed (30) hide show

mindsdb/interfaces/knowledge_base/controller.py CHANGED Viewed

@@ -27,6 +27,8 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
 )
 from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
 from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
+from mindsdb.integrations.utilities.handler_utils import get_api_key
+from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args, row_to_document
 from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS
 from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
@@ -36,6 +38,7 @@ from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor impor
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
 from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
+from mindsdb.utilities.context import context as ctx
 from mindsdb.api.executor.command_executor import ExecuteCommands
 from mindsdb.utilities import log
@@ -50,6 +53,42 @@ KB_TO_VECTORDB_COLUMNS = {
 }
+def get_embedding_model_from_params(embedding_model_params: dict):
+    """
+    Create embedding model from parameters.
+
+    The input dict is deep-copied, so the caller's parameters are not modified.
+    Handled keys: 'provider' (required), 'api_key' and 'model_name' (optional).
+    The remaining keys are passed as they are to construct_model_from_args.
+
+    Raises ValueError if 'provider' is missing or is not a non-empty string.
+    """
+    params_copy = copy.deepcopy(embedding_model_params)
+    provider = params_copy.pop('provider', None)
+    if not isinstance(provider, str) or not provider:
+        # Fail with a clear message instead of AttributeError on `None.lower()`.
+        raise ValueError("'provider' is required in the embedding model parameters.")
+    provider = provider.lower()
+    api_key = get_api_key(provider, params_copy, strict=False) or params_copy.get('api_key')
+    # Underscores are replaced because the provider name ultimately gets mapped to a class name.
+    # This is mostly to support Azure OpenAI (azure_openai); the mapped class name is 'AzureOpenAIEmbeddings'.
+    params_copy['class'] = provider.replace('_', '')
+    if provider == 'azure_openai':
+        # Azure OpenAI expects the api_key to be passed as 'openai_api_key'.
+        params_copy['openai_api_key'] = api_key
+    else:
+        params_copy[f"{provider}_api_key"] = api_key
+    params_copy.pop('api_key', None)
+    # 'model_name' takes precedence; an explicitly passed 'model' is kept
+    # instead of being overwritten with None when 'model_name' is absent.
+    params_copy['model'] = params_copy.pop('model_name', params_copy.get('model'))
+    return construct_model_from_args(params_copy)
+def get_reranking_model_from_params(reranking_model_params: dict):
+    """
+    Build an LLMReranker from reranking model parameters.
+
+    Works on a deep copy of the input. 'provider' defaults to "openai" and any
+    other provider is rejected with ValueError. 'api_key' and 'model_name' are
+    remapped to '<provider>_api_key' and 'model'; the remaining keys are passed
+    to LLMReranker unchanged.
+    """
+    reranker_kwargs = copy.deepcopy(reranking_model_params)
+    provider = reranker_kwargs.pop('provider', "openai").lower()
+    if not provider == 'openai':
+        raise ValueError("Only OpenAI provider is supported for the reranking model.")
+    # Key resolved by get_api_key wins, the plain 'api_key' parameter is the fallback.
+    api_key = get_api_key(provider, reranker_kwargs, strict=False)
+    if not api_key:
+        api_key = reranker_kwargs.get('api_key')
+    reranker_kwargs[f"{provider}_api_key"] = api_key
+    reranker_kwargs.pop('api_key', None)
+    model = reranker_kwargs.pop('model_name', None)
+    reranker_kwargs['model'] = model
+    return LLMReranker(**reranker_kwargs)
 class KnowledgeBaseTable:
     """
     Knowledge base table interface
@@ -163,18 +202,17 @@ class KnowledgeBaseTable:
     def add_relevance(self, df, query_text, reranking_threshold=None):
         relevance_column = TableField.RELEVANCE.value
-        rerank_model = self._kb.params.get("rerank_model")
-        if rerank_model and query_text and len(df) > 0:
+        reranking_model_params = self._kb.params.get("reranking_model")
+        if reranking_model_params and query_text and len(df) > 0:
             # Use reranker for relevance score
             try:
-                logger.info(f"Using reranker model {rerank_model} for relevance calculation")
-                reranker_params = {"model": rerank_model}
+                logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
                 # Apply custom filtering threshold if provided
                 if reranking_threshold is not None:
-                    reranker_params["filtering_threshold"] = reranking_threshold
+                    reranking_model_params["filtering_threshold"] = reranking_threshold
                     logger.info(f"Using custom filtering threshold: {reranking_threshold}")
-                reranker = LLMReranker(**reranker_params)
+                reranker = get_reranking_model_from_params(reranking_model_params)
                 # Get documents to rerank
                 documents = df['chunk_content'].tolist()
                 # Use the get_scores method with disable_events=True
@@ -185,7 +223,7 @@ class KnowledgeBaseTable:
                 # Filter by threshold
                 scores_array = np.array(scores)
                 df = df[scores_array > reranker.filtering_threshold]
-                logger.debug(f"Applied reranking with model {rerank_model}, threshold: {reranker.filtering_threshold}")
+                logger.debug(f"Applied reranking with params: {reranking_model_params}")
             except Exception as e:
                 logger.error(f"Error during reranking: {str(e)}")
                 # Fallback to distance-based relevance
@@ -198,6 +236,8 @@ class KnowledgeBaseTable:
             # Calculate relevance from distance
             logger.info("Calculating relevance from vector distance")
             df[relevance_column] = 1 / (1 + df['distance'])
+            if reranking_threshold is not None:
+                df = df[df[relevance_column] > reranking_threshold]
         else:
             df[relevance_column] = None
@@ -373,6 +413,16 @@ class KnowledgeBaseTable:
         if df.empty:
             return
+        try:
+            run_query_id = ctx.run_query_id
+            # Link current KB to running query (where KB is used to insert data)
+            if run_query_id is not None:
+                self._kb.query_id = run_query_id
+                db.session.commit()
+        except AttributeError:
+            ...
         # First adapt column names to identify content and metadata columns
         adapted_df = self._adapt_column_names(df)
         content_columns = self._kb.params.get('content_columns', [TableField.CONTENT.value])
@@ -577,36 +627,48 @@ class KnowledgeBaseTable:
         if df.empty:
             return pd.DataFrame([], columns=[TableField.EMBEDDINGS.value])
+        # keep only content
+        df = df[[TableField.CONTENT.value]]
         model_id = self._kb.embedding_model_id
-        # get the input columns
-        model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()
+        if model_id:
+            # get the input columns
+            model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()
-        assert model_rec is not None, f"Model not found: {model_id}"
-        model_project = db.session.query(db.Project).filter_by(id=model_rec.project_id).first()
+            assert model_rec is not None, f"Model not found: {model_id}"
+            model_project = db.session.query(db.Project).filter_by(id=model_rec.project_id).first()
-        project_datanode = self.session.datahub.get(model_project.name)
+            project_datanode = self.session.datahub.get(model_project.name)
-        # keep only content
-        df = df[[TableField.CONTENT.value]]
+            model_using = model_rec.learn_args.get('using', {})
+            input_col = model_using.get('question_column')
+            if input_col is None:
+                input_col = model_using.get('input_column')
-        model_using = model_rec.learn_args.get('using', {})
-        input_col = model_using.get('question_column')
-        if input_col is None:
-            input_col = model_using.get('input_column')
+            if input_col is not None and input_col != TableField.CONTENT.value:
+                df = df.rename(columns={TableField.CONTENT.value: input_col})
-        if input_col is not None and input_col != TableField.CONTENT.value:
-            df = df.rename(columns={TableField.CONTENT.value: input_col})
+            df_out = project_datanode.predict(
+                model_name=model_rec.name,
+                df=df,
+                params=self.model_params
+            )
-        df_out = project_datanode.predict(
-            model_name=model_rec.name,
-            df=df,
-            params=self.model_params
-        )
+            target = model_rec.to_predict[0]
+            if target != TableField.EMBEDDINGS.value:
+                # adapt output for vectordb
+                df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
+        elif self._kb.params.get('embedding_model'):
+            embedding_model = get_embedding_model_from_params(self._kb.params.get('embedding_model'))
+            df_texts = df.apply(row_to_document, axis=1)
+            embeddings = embedding_model.embed_documents(df_texts.tolist())
+            df_out = df.copy().assign(**{TableField.EMBEDDINGS.value: embeddings})
+        else:
+            raise ValueError("No embedding model found for the knowledge base.")
-        target = model_rec.to_predict[0]
-        if target != TableField.EMBEDDINGS.value:
-            # adapt output for vectordb
-            df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
         df_out = df_out[[TableField.EMBEDDINGS.value]]
         return df_out
@@ -640,9 +702,11 @@ class KnowledgeBaseTable:
             # Extract embedding model args from knowledge base table
             embedding_args = self._kb.embedding_model.learn_args.get('using', {})
             # Construct the embedding model directly
-            from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args
             embeddings_model = construct_model_from_args(embedding_args)
             logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
+        elif self._kb.params.get('embedding_model'):
+            embeddings_model = get_embedding_model_from_params(self._kb.params['embedding_model'])
+            logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
         else:
             embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
             logger.debug("Using default embedding model as knowledge base has no embedding model")
@@ -788,26 +852,46 @@ class KnowledgeBaseController:
                 return kb
             raise EntityExistsError("Knowledge base already exists", name)
-        if embedding_model is None:
-            # create default embedding model
-            model_name = self._get_default_embedding_model(project.name, params=params)
-            params['default_embedding_model'] = model_name
-        else:
-            # get embedding model from input
+        embedding_model_params = params.get('embedding_model', None)
+        reranking_model_params = params.get('reranking_model', None)
+        if embedding_model:
             model_name = embedding_model.parts[-1]
+        elif embedding_model_params:
+            # Get embedding model from params.
+            # This is called here to check validity of the parameters.
+            get_embedding_model_from_params(
+                embedding_model_params
+            )
+        else:
+            model_name = self._get_default_embedding_model(
+                project.name,
+                params=params
+            )
+            params['default_embedding_model'] = model_name
+        model_project = None
         if embedding_model is not None and len(embedding_model.parts) > 1:
             # model project is set
             model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
-        else:
+        elif not embedding_model_params:
             model_project = project
-        model = self.session.model_controller.get_model(
-            name=model_name,
-            project_name=model_project.name
-        )
-        model_record = db.Predictor.query.get(model['id'])
-        embedding_model_id = model_record.id
+        embedding_model_id = None
+        if model_project:
+            model = self.session.model_controller.get_model(
+                name=model_name,
+                project_name=model_project.name
+            )
+            model_record = db.Predictor.query.get(model['id'])
+            embedding_model_id = model_record.id
+        if reranking_model_params:
+            # Get reranking model from params.
+            # This is called here to check validity of the parameters.
+            get_reranking_model_from_params(reranking_model_params)
         # search for the vector database table
         if storage is None:
@@ -1029,6 +1113,7 @@ class KnowledgeBaseController:
                 'embedding_model': embedding_model.name if embedding_model is not None else None,
                 'vector_database': None if vector_database is None else vector_database.name,
                 'vector_database_table': record.vector_database_table,
+                'query_id': record.query_id,
                 'params': record.params
             })

mindsdb/interfaces/query_context/context_controller.py CHANGED Viewed

@@ -1,11 +1,17 @@
 from typing import List
+import pickle
+import datetime as dt
+from sqlalchemy.orm.attributes import flag_modified
 import pandas as pd
+from mindsdb_sql_parser import Select, Star, OrderBy
 from mindsdb_sql_parser.ast import (
     Identifier, BinaryOperation, Last, Constant, ASTNode
 )
 from mindsdb.integrations.utilities.query_traversal import query_traversal
+from mindsdb.utilities.cache import get_cache
 from mindsdb.interfaces.storage import db
 from mindsdb.utilities.context import context as ctx
@@ -13,6 +19,147 @@ from mindsdb.utilities.context import context as ctx
 from .last_query import LastQuery
+class RunningQuery:
+    """
+      Query in progress
+
+      Wraps a db.Queries record and stores partitioning parameters, progress
+      and error state of the query in it (changes are committed via db.session).
+    """
+    # Batch size used when step parameters do not define 'batch_size'
+    DEFAULT_BATCH_SIZE = 1000
+    def __init__(self, record: db.Queries):
+        self.record = record
+        self.sql = record.sql
+        # Take batch size from already stored parameters, so that an instance created
+        # for an existing record has `batch_size` even before `set_params` is called
+        stored_params = record.parameters or {}
+        self.batch_size = stored_params.get('batch_size', self.DEFAULT_BATCH_SIZE)
+    def _cache_key(self) -> str:
+        """
+            Key of this query in the 'steps_data' cache.
+            `on_error` and `get_state` have to use exactly the same key
+        """
+        return str(self.record.id)
+    def get_partition_query(self, step_num: int, query: Select) -> Select:
+        """
+           Generate query for fetching the next partition
+           It wraps query to
+              select * from ({query})
+              where {track_column} > {previous_value}
+              order by track_column
+              limit size {batch_size}
+           And fill track_column, previous_value, batch_size
+
+           The `where` is added only if a track value is already stored in the context
+        """
+        track_column = self.record.parameters['track_column']
+        query = Select(
+            targets=[Star()],
+            from_table=query,
+            order_by=[OrderBy(Identifier(track_column))],
+            limit=Constant(self.batch_size)
+        )
+        track_value = self.record.context.get('track_value')
+        # is it different step?
+        # NOTE(review): 'step_num' is written to the context only here and in `on_error`,
+        # so the reset below does not happen while no step number is stored - confirm it is intended
+        cur_step_num = self.record.context.get('step_num')
+        if cur_step_num is not None and cur_step_num != step_num:
+            # reset track_value
+            track_value = None
+            self.record.context['track_value'] = None
+            self.record.context['step_num'] = step_num
+            # in-place changes of the dict have to be flagged explicitly
+            flag_modified(self.record, 'context')
+            db.session.commit()
+        if track_value is not None:
+            query.where = BinaryOperation(
+                op='>',
+                args=[Identifier(track_column), Constant(track_value)],
+            )
+        return query
+    def set_params(self, params: dict):
+        """
+            Store parameters of the step which is about to be split into partitions
+
+            'track_column' is required (ValueError otherwise).
+            'batch_size' is added to `params` with the default value if it is not defined
+        """
+        if 'track_column' not in params:
+            raise ValueError('Track column is not defined')
+        if 'batch_size' not in params:
+            params['batch_size'] = self.DEFAULT_BATCH_SIZE
+        self.record.parameters = params
+        self.batch_size = self.record.parameters['batch_size']
+        db.session.commit()
+    def get_max_track_value(self, df: pd.DataFrame):
+        """
+            return max value of the track column in `df`, to use in `set_progress`.
+            this function is called before execution substeps,
+             `set_progress` function - after
+        """
+        track_column = self.record.parameters['track_column']
+        return df[track_column].max()
+    def set_progress(self, df: pd.DataFrame, max_track_value: int):
+        """
+           Store progress of the query, it is called after processing of batch
+
+           Adds len(df) to processed rows; the stored track value is only moved forward
+        """
+        if len(df) == 0:
+            return
+        self.record.processed_rows = self.record.processed_rows + len(df)
+        cur_value = self.record.context.get('track_value')
+        new_value = max_track_value
+        if new_value is not None:
+            if cur_value is None or new_value > cur_value:
+                self.record.context['track_value'] = new_value
+                flag_modified(self.record, 'context')
+        db.session.commit()
+    def on_error(self, error: Exception, step_num: int, steps_data: dict):
+        """
+            Saves error of the query in database
+            Also saves step data and current step num to be able to resume query
+        """
+        self.record.error = str(error)
+        self.record.context['step_num'] = step_num
+        flag_modified(self.record, 'context')
+        # save steps_data
+        cache = get_cache('steps_data')
+        data = pickle.dumps(steps_data, protocol=5)
+        cache.set(self._cache_key(), data)
+        db.session.commit()
+    def clear_error(self):
+        """
+            Reset error of the query in database
+        """
+        if self.record.error is not None:
+            self.record.error = None
+            db.session.commit()
+    def get_state(self) -> dict:
+        """
+            Returns stored state for resuming the query
+            The cached steps data is removed from the cache after reading
+        """
+        cache = get_cache('steps_data')
+        # the same (string) key as used in `on_error`
+        key = self._cache_key()
+        data = cache.get(key)
+        cache.delete(key)
+        # NOTE(review): presumably `cache.get` returns None for a missing entry,
+        # which makes `pickle.loads` fail - confirm against the cache implementation.
+        # NOTE(review): pickle must only be used on trusted data; the entry is written
+        # by `on_error`, make sure the cache storage is not writable by untrusted parties
+        steps_data = pickle.loads(data)
+        return {
+            'step_num': self.record.context.get('step_num'),
+            'steps_data': steps_data,
+        }
+    def finish(self):
+        """
+            Mark query as finished
+        """
+        self.record.finished_at = dt.datetime.now()
+        db.session.commit()
 class QueryContextController:
     IGNORE_CONTEXT = '<IGNORE>'
@@ -287,5 +434,79 @@ class QueryContextController:
         rec.values = values
         db.session.commit()
+    def get_query(self, query_id: int) -> RunningQuery:
+        """
+           Find stored query of the current company by id and wrap it into RunningQuery
+           Raises RuntimeError if the record does not exist
+        """
+        conditions = (
+            db.Queries.id == query_id,
+            db.Queries.company_id == ctx.company_id,
+        )
+        record = db.Queries.query.filter(*conditions).first()
+        if record is not None:
+            return RunningQuery(record)
+        raise RuntimeError(f'Query not found: {query_id}')
+    def create_query(self, query: ASTNode) -> RunningQuery:
+        """
+           Register a new query record for the AST query and return it as RunningQuery
+           Before that, records of the current company which were finished
+           more than a day ago are deleted
+        """
+        # cleanup of outdated records
+        expiration_time = dt.datetime.now() - dt.timedelta(days=1)
+        outdated_records = db.session.query(db.Queries).filter(
+            db.Queries.company_id == ctx.company_id,
+            db.Queries.finished_at < expiration_time
+        ).all()
+        for outdated in outdated_records:
+            db.session.delete(outdated)
+        # deletions and the new record are committed together
+        new_record = db.Queries(company_id=ctx.company_id, sql=str(query))
+        db.session.add(new_record)
+        db.session.commit()
+        return RunningQuery(new_record)
+    def list_queries(self) -> List[dict]:
+        """
+           Return metadata of all query records of the current company,
+           one dict per record
+        """
+        # attributes of the record to expose, in output order
+        exposed_fields = (
+            'id',
+            'sql',
+            'started_at',
+            'finished_at',
+            'parameters',
+            'context',
+            'processed_rows',
+            'error',
+            'updated_at',
+        )
+        records = db.session.query(db.Queries).filter(
+            db.Queries.company_id == ctx.company_id
+        )
+        result = []
+        for record in records:
+            result.append({name: getattr(record, name) for name in exposed_fields})
+        return result
+    def cancel_query(self, query_id: int):
+        """
+           Cancel query by id: its record is deleted from the database
+           Raises RuntimeError if the record does not exist for the current company
+        """
+        conditions = (
+            db.Queries.id == query_id,
+            db.Queries.company_id == ctx.company_id,
+        )
+        record = db.Queries.query.filter(*conditions).first()
+        if record is None:
+            raise RuntimeError(f'Query not found: {query_id}')
+        # the query in progress will fail when it tries to update status
+        db.session.delete(record)
+        db.session.commit()
 query_context_controller = QueryContextController()

mindsdb/interfaces/storage/db.py CHANGED Viewed

@@ -523,6 +523,7 @@ class KnowledgeBase(Base):
     embedding_model = relationship(
         "Predictor", foreign_keys=[embedding_model_id], doc="embedding model"
     )
+    query_id = Column(Integer, nullable=True)
     created_at = Column(DateTime, default=datetime.datetime.now)
     updated_at = Column(
@@ -564,6 +565,28 @@ class QueryContext(Base):
     created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
+class Queries(Base):
+    """Stored state of a query: its text, parameters, progress and error."""
+    __tablename__ = "queries"
+    id: int = Column(Integer, primary_key=True)
+    company_id: int = Column(Integer, nullable=True)
+    # text of the query
+    sql: str = Column(String, nullable=False)
+    started_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
+    finished_at: datetime.datetime = Column(DateTime)
+    # `dict` callable is used as default: every row gets its own new dict
+    # instead of one mutable object shared by all inserts
+    parameters = Column(JSON, default=dict)
+    context = Column(JSON, default=dict)
+    processed_rows = Column(Integer, default=0)
+    error: str = Column(String, nullable=True)
+    updated_at: datetime.datetime = Column(
+        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
+    )
+    created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
 class LLMLog(Base):
     __tablename__ = "llm_log"
     id: int = Column(Integer, primary_key=True)

mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""queries
+Revision ID: fda503400e43
+Revises: 11347c213b36
+Create Date: 2025-03-21 18:50:20.795930
+"""
+from alembic import op
+import sqlalchemy as sa
+import mindsdb.interfaces.storage.db  # noqa
+# revision identifiers, used by Alembic.
+revision = 'fda503400e43'
+down_revision = '11347c213b36'
+branch_labels = None
+depends_on = None
+def upgrade():
+    """Create the `queries` table and add the `query_id` column to `knowledge_base`."""
+    queries_columns = (
+        sa.Column('id', sa.Integer(), nullable=False),
+        sa.Column('company_id', sa.Integer(), nullable=True),
+        sa.Column('sql', sa.String(), nullable=False),
+        sa.Column('started_at', sa.DateTime(), nullable=True),
+        sa.Column('finished_at', sa.DateTime(), nullable=True),
+        sa.Column('parameters', sa.JSON(), nullable=True),
+        sa.Column('context', sa.JSON(), nullable=True),
+        sa.Column('processed_rows', sa.Integer(), nullable=True),
+        sa.Column('error', sa.String(), nullable=True),
+        sa.Column('updated_at', sa.DateTime(), nullable=True),
+        sa.Column('created_at', sa.DateTime(), nullable=True),
+    )
+    op.create_table('queries', *queries_columns, sa.PrimaryKeyConstraint('id'))
+    query_id_column = sa.Column('query_id', sa.INTEGER(), nullable=True)
+    with op.batch_alter_table('knowledge_base', schema=None) as kb_table:
+        kb_table.add_column(query_id_column)
+def downgrade():
+    """Revert `upgrade`: drop `knowledge_base.query_id`, then the `queries` table."""
+    with op.batch_alter_table('knowledge_base', schema=None) as kb_table:
+        kb_table.drop_column('query_id')
+    op.drop_table('queries')

mindsdb/utilities/context_executor.py CHANGED Viewed

@@ -43,7 +43,7 @@ def execute_in_threads(func, tasks, thread_count=3, queue_size_k=1.5):
         for i in range(queue_size):
             try:
                 args = next(tasks)
-                futures.append(executor.submit(func, *args))
+                futures.append(executor.submit(func, args))
             except StopIteration:
                 break

mindsdb/utilities/partitioning.py CHANGED Viewed

@@ -6,6 +6,35 @@ from mindsdb.utilities.config import Config
 from mindsdb.utilities.context_executor import execute_in_threads
+def get_max_thread_count() -> int:
+    """
+        Calculate the maximum number of threads allowed for the system.
+
+        In cloud the value is read from MINDSDB_MAX_PARTITIONING_THREADS (10 by default),
+        otherwise it is the CPU count minus 3. The result is never less than 1.
+    """
+    # workers count
+    if Config().is_cloud:
+        max_threads = int(os.getenv('MINDSDB_MAX_PARTITIONING_THREADS', 10))
+    else:
+        # os.cpu_count() returns None if the number of CPUs can't be determined
+        max_threads = (os.cpu_count() or 1) - 3
+    return max(max_threads, 1)
+def split_data_frame(df: pd.DataFrame, partition_size: int) -> Iterable[pd.DataFrame]:
+    """
+    Split data frame into chunks with partition_size and yield them out
+
+    Chunks are consecutive row slices of `df`; the last one may be shorter.
+    Nothing is yielded for an empty data frame.
+    Raises ValueError (on first iteration) if partition_size is less than 1.
+    """
+    if partition_size < 1:
+        # a non-positive size would never advance through the data frame
+        raise ValueError(f'partition_size must be a positive integer, got {partition_size}')
+    for start in range(0, len(df), partition_size):
+        # create results with partition
+        yield df.iloc[start: start + partition_size]
 def process_dataframe_in_partitions(df: pd.DataFrame, callback: Callable, partition_size: int) -> Iterable:
     """
     Splits dataframe into partitions and apply callback on each partition
@@ -17,35 +46,21 @@ def process_dataframe_in_partitions(df: pd.DataFrame, callback: Callable, partit
     """
     # tasks
-    def split_data_f(df):
-        chunk = 0
-        while chunk * partition_size < len(df):
-            # create results with partition
-            df1 = df.iloc[chunk * partition_size: (chunk + 1) * partition_size]
-            chunk += 1
-            yield [df1]
-    tasks = split_data_f(df)
+    tasks = split_data_frame(df, partition_size)
-    # workers count
-    is_cloud = Config().is_cloud
-    if is_cloud:
-        max_threads = int(os.getenv('MINDSDB_MAX_PARTITIONING_THREADS', 10))
-    else:
-        max_threads = os.cpu_count() - 2
+    max_threads = get_max_thread_count()
-    # don't exceed chunk_count
     chunk_count = int(len(df) / partition_size)
-    max_threads = min(max_threads, chunk_count)
-    if max_threads < 1:
-        max_threads = 1
+    # don't exceed chunk_count
+    if chunk_count > 0:
+        max_threads = min(max_threads, chunk_count)
     if max_threads == 1:
         # don't spawn threads
         for task in tasks:
-            yield callback(*task)
+            yield callback(task)
     else:
         for result in execute_in_threads(callback, tasks, thread_count=max_threads):

MindsDB 25.4.2.0__py3-none-any.whl → 25.4.2.1__py3-none-any.whl

Potentially problematic release.

MindsDB 25.4.2.0__py3-none-any.whl → 25.4.2.1__py3-none-any.whl