MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (61)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +53 -94
  3. mindsdb/api/a2a/agent.py +30 -206
  4. mindsdb/api/a2a/common/server/server.py +26 -27
  5. mindsdb/api/a2a/task_manager.py +93 -227
  6. mindsdb/api/a2a/utils.py +21 -0
  7. mindsdb/api/executor/command_executor.py +8 -6
  8. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
  10. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  11. mindsdb/api/executor/planner/query_prepare.py +68 -87
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  13. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  14. mindsdb/api/executor/utilities/sql.py +97 -21
  15. mindsdb/api/http/namespaces/agents.py +126 -201
  16. mindsdb/api/http/namespaces/config.py +12 -1
  17. mindsdb/api/http/namespaces/file.py +49 -24
  18. mindsdb/api/mcp/start.py +45 -31
  19. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  20. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  21. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  22. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  23. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  24. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  25. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  27. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
  28. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  29. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
  31. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  32. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  33. mindsdb/integrations/libs/keyword_search_base.py +41 -0
  34. mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
  35. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  36. mindsdb/integrations/utilities/sql_utils.py +11 -0
  37. mindsdb/interfaces/agents/agents_controller.py +29 -9
  38. mindsdb/interfaces/agents/langchain_agent.py +7 -5
  39. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  40. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  41. mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
  42. mindsdb/interfaces/database/projects.py +1 -3
  43. mindsdb/interfaces/functions/controller.py +54 -64
  44. mindsdb/interfaces/functions/to_markdown.py +47 -14
  45. mindsdb/interfaces/knowledge_base/controller.py +228 -110
  46. mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
  47. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  48. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  49. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  50. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  51. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
  52. mindsdb/interfaces/skills/sql_agent.py +181 -130
  53. mindsdb/interfaces/storage/db.py +9 -7
  54. mindsdb/utilities/config.py +58 -40
  55. mindsdb/utilities/exception.py +58 -7
  56. mindsdb/utilities/security.py +54 -11
  57. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
  58. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
  59. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
  60. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
  61. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/knowledge_base/controller.py

@@ -1,16 +1,19 @@
 import os
 import copy
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Any, Text
 import json
 import decimal

 import pandas as pd
 import numpy as np
+from pydantic import BaseModel, ValidationError
+from sqlalchemy.orm.attributes import flag_modified

 from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
 from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
 from mindsdb_sql_parser import parse_sql

+from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
 from mindsdb.integrations.utilities.query_traversal import query_traversal

 import mindsdb.interfaces.storage.db as db
@@ -33,9 +36,10 @@ from mindsdb.interfaces.variables.variables_controller import variables_controller
 from mindsdb.interfaces.knowledge_base.preprocessing.models import PreprocessingConfig, Document
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import PreprocessorFactory
 from mindsdb.interfaces.knowledge_base.evaluate import EvaluateBase
+from mindsdb.interfaces.knowledge_base.executor import KnowledgeBaseQueryExecutor
 from mindsdb.interfaces.model.functions import PredictorRecordNotFound
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
-from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator, KeywordSearchArgs
 from mindsdb.utilities.config import config
 from mindsdb.utilities.context import context as ctx

@@ -46,7 +50,19 @@ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMReranker

 logger = log.getLogger(__name__)

-KB_TO_VECTORDB_COLUMNS = {"id": "original_doc_id", "chunk_id": "id", "chunk_content": "content"}
+
+class KnowledgeBaseInputParams(BaseModel):
+    metadata_columns: List[str] | None = None
+    content_columns: List[str] | None = None
+    id_column: str | None = None
+    kb_no_upsert: bool = False
+    embedding_model: Dict[Text, Any] | None = None
+    is_sparse: bool = False
+    vector_size: int | None = None
+    reranking_model: Dict[Text, Any] | None = None
+
+    class Config:
+        extra = "forbid"


 def get_model_params(model_params: dict, default_config_key: str):
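The new KnowledgeBaseInputParams model means knowledge base parameters are now validated up front: any key outside the declared set is rejected instead of being silently ignored. A minimal standalone sketch of the same pydantic pattern (class and field names here are illustrative, not the MindsDB API):

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict, ValidationError


class KBParams(BaseModel):
    # any key not declared below is rejected, mirroring extra = "forbid"
    model_config = ConfigDict(extra="forbid")

    metadata_columns: Optional[List[str]] = None
    content_columns: Optional[List[str]] = None
    id_column: Optional[str] = None
    embedding_model: Optional[Dict[str, Any]] = None


try:
    KBParams.model_validate({"id_colum": "order_id"})  # typo in the key
except ValidationError as e:
    print(e.errors()[0]["type"])  # -> extra_forbidden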
@@ -101,7 +117,10 @@ def get_reranking_model_from_params(reranking_model_params: dict):

     if "api_key" not in params_copy:
         params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
-    params_copy["model"] = params_copy.pop("model_name", None)
+
+    if "model_name" not in params_copy:
+        raise ValueError("'model_name' must be provided for reranking model")
+    params_copy["model"] = params_copy.pop("model_name")

     return BaseLLMReranker(**params_copy)

@@ -140,23 +159,29 @@ class KnowledgeBaseTable:
         self.document_loader = None
         self.model_params = None

+        self.kb_to_vector_columns = {"id": "_original_doc_id", "chunk_id": "id", "chunk_content": "content"}
+        if self._kb.params.get("version", 0) < 2:
+            self.kb_to_vector_columns["id"] = "original_doc_id"
+
     def configure_preprocessing(self, config: Optional[dict] = None):
         """Configure preprocessing for the knowledge base table"""
         logger.debug(f"Configuring preprocessing with config: {config}")
         self.document_preprocessor = None  # Reset existing preprocessor
-        if config is not None:
-            # Ensure content_column is set for JSON chunking if not already specified
-            if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
-                if "content_column" not in config["json_chunking_config"]:
-                    config["json_chunking_config"]["content_column"] = "content"
-
-            preprocessing_config = PreprocessingConfig(**config)
-            self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
-            logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")
-        else:
-            # Always create a default preprocessor if none specified
-            self.document_preprocessor = PreprocessorFactory.create_preprocessor()
-            logger.debug("Created default preprocessor")
+        if config is None:
+            config = {}
+
+        # Ensure content_column is set for JSON chunking if not already specified
+        if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
+            if "content_column" not in config["json_chunking_config"]:
+                config["json_chunking_config"]["content_column"] = "content"
+
+        preprocessing_config = PreprocessingConfig(**config)
+        self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
+
+        # set doc_id column name
+        self.document_preprocessor.config.doc_id_column_name = self.kb_to_vector_columns["id"]
+
+        logger.debug(f"Created preprocessor of type: {type(self.document_preprocessor)}")

     def select_query(self, query: Select) -> pd.DataFrame:
         """
@@ -165,6 +190,33 @@ class KnowledgeBaseTable:
         :param query: query to KB table
         :return: dataframe with the result table
         """
+
+        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
+        query_copy = copy.deepcopy(query)
+
+        executor = KnowledgeBaseQueryExecutor(self)
+        df = executor.run(query)
+
+        if (
+            query_copy.group_by is not None
+            or query_copy.order_by is not None
+            or query_copy.having is not None
+            or query_copy.distinct is True
+            or len(query_copy.targets) != 1
+            or not isinstance(query_copy.targets[0], Star)
+        ):
+            query_copy.where = None
+            if "metadata" in df.columns:
+                df["metadata"] = df["metadata"].apply(to_json)
+
+            if query_copy.from_table is None:
+                query_copy.from_table = Identifier(parts=[self._kb.name])
+
+            df = query_df(df, query_copy, session=self.session)
+
+        return df
+
+    def select(self, query, disable_reranking=False):
         logger.debug(f"Processing select query: {query}")

         # Extract the content query text for potential reranking
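select_query now hands retrieval to KnowledgeBaseQueryExecutor and only re-runs the result through query_df (the DuckDB path) when the query carries clauses the vector store cannot evaluate itself. A sketch of that gate, assuming a query object with the same AST attributes as mindsdb_sql_parser's Select:

from mindsdb_sql_parser.ast import Star


def needs_duckdb_pass(query) -> bool:
    # plain `SELECT * ... WHERE ... LIMIT` is served straight from the
    # vector store; anything else gets a second pass over the dataframe
    return (
        query.group_by is not None
        or query.order_by is not None
        or query.having is not None
        or query.distinct is True
        or len(query.targets) != 1
        or not isinstance(query.targets[0], Star)
    )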
@@ -176,9 +228,6 @@ class KnowledgeBaseTable:
         query.from_table = Identifier(parts=[self._kb.vector_database_table])
         logger.debug(f"Set table name to: {self._kb.vector_database_table}")

-        # Copy query for complex execution via DuckDB: DISTINCT, GROUP BY etc.
-        query_copy = copy.deepcopy(query)
-
         query.targets = [
             Identifier(TableField.ID.value),
             Identifier(TableField.CONTENT.value),
@@ -191,9 +240,12 @@ class KnowledgeBaseTable:

         # extract values from conditions and prepare for vectordb
         conditions = []
+        keyword_search_conditions = []
+        keyword_search_cols_and_values = []
         query_text = None
         relevance_threshold = None
         reranking_enabled_flag = True
+        hybrid_search_enabled_flag = False
         query_conditions = db_handler.extract_conditions(query.where)
         if query_conditions is not None:
             for item in query_conditions:
@@ -213,6 +265,13 @@ class KnowledgeBaseTable:
                     # cast to boolean
                     if isinstance(reranking_enabled_flag, str):
                         reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                elif item.column == "hybrid_search":
+                    hybrid_search_enabled_flag = item.value
+                    # cast to boolean
+                    if isinstance(hybrid_search_enabled_flag, str):
+                        hybrid_search_enabled_flag = hybrid_search_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
@@ -228,8 +287,16 @@ class KnowledgeBaseTable:
                             op=FilterOperator.EQUAL,
                         )
                     )
+                    keyword_search_cols_and_values.append((TableField.CONTENT.value, item.value))
                 else:
                     conditions.append(item)
+                    keyword_search_conditions.append(item)  # keyword search conditions do not use embeddings
+
+            if len(keyword_search_cols_and_values) > 1:
+                raise ValueError(
+                    "Multiple content columns found in query conditions. "
+                    "Only one content column is allowed for keyword search."
+                )

         logger.debug(f"Extracted query text: {query_text}")

@@ -244,66 +311,92 @@ class KnowledgeBaseTable:
            limit = 100
            query.limit = Constant(limit)

-        df = db_handler.dispatch_select(query, conditions)
+        allowed_metadata_columns = self._get_allowed_metadata_columns()
+        df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
         df = self.addapt_result_columns(df)
-
         logger.debug(f"Query returned {len(df)} rows")
         logger.debug(f"Columns in response: {df.columns.tolist()}")
-        # Check if we have a rerank_model configured in KB params
-        df = self.add_relevance(df, query_text, relevance_threshold, reranking_enabled_flag)

-        if (
-            query.group_by is not None
-            or query.order_by is not None
-            or query.having is not None
-            or query.distinct is True
-            or len(query.targets) != 1
-            or not isinstance(query.targets[0], Star)
-        ):
-            query_copy.where = None
-            if "metadata" in df.columns:
-                df["metadata"] = df["metadata"].apply(to_json)
+        if hybrid_search_enabled_flag and not isinstance(db_handler, KeywordSearchBase):
+            raise ValueError(f"Hybrid search is enabled but the db_handler {type(db_handler)} does not support it. ")
+        # check if db_handler inherits from KeywordSearchBase
+        if hybrid_search_enabled_flag and isinstance(db_handler, KeywordSearchBase):
+            # If query_text is present, use it for keyword search
+            logger.debug(f"Performing keyword search with query text: {query_text}")
+            keyword_search_args = KeywordSearchArgs(query=query_text, column=TableField.CONTENT.value)
+            keyword_query_obj = copy.deepcopy(query)
+
+            keyword_query_obj.targets = [
+                Identifier(TableField.ID.value),
+                Identifier(TableField.CONTENT.value),
+                Identifier(TableField.METADATA.value),
+            ]

-            df = query_df(df, query_copy, session=self.session)
+            df_keyword_select = db_handler.dispatch_select(
+                keyword_query_obj, keyword_search_conditions, keyword_search_args=keyword_search_args
+            )
+            df_keyword_select = self.addapt_result_columns(df_keyword_select)
+            logger.debug(f"Keyword search returned {len(df_keyword_select)} rows")
+            logger.debug(f"Columns in keyword search response: {df_keyword_select.columns.tolist()}")
+            # ensure df and df_keyword_select have exactly the same columns
+            if not df_keyword_select.empty:
+                if set(df.columns) != set(df_keyword_select.columns):
+                    raise ValueError(
+                        f"Keyword search returned different columns: {df_keyword_select.columns} "
+                        f"than expected: {df.columns}"
+                    )
+                df = pd.concat([df, df_keyword_select], ignore_index=True)
+                # if chunk_id column exists remove duplicates based on chunk_id
+                if "chunk_id" in df.columns:
+                    df = df.drop_duplicates(subset=["chunk_id"])
+
+        # Check if we have a rerank_model configured in KB params
+        df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)

         return df

+    def _get_allowed_metadata_columns(self) -> List[str] | None:
+        # Return list of KB columns to restrict querying, if None: no restrictions
+
+        if self._kb.params.get("version", 0) < 2:
+            # disable for old version KBs
+            return None
+
+        user_columns = self._kb.params.get("metadata_columns", [])
+        dynamic_columns = self._kb.params.get("inserted_metadata", [])
+
+        columns = set(user_columns) | set(dynamic_columns)
+        return [col.lower() for col in columns]
+
     def score_documents(self, query_text, documents, reranking_model_params):
         reranker = get_reranking_model_from_params(reranking_model_params)
         return reranker.get_scores(query_text, documents)

-    def add_relevance(self, df, query_text, relevance_threshold=None, reranking_enabled_flag=True):
+    def add_relevance(self, df, query_text, relevance_threshold=None, disable_reranking=False):
         relevance_column = TableField.RELEVANCE.value

         reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_reranking_model")
-        if reranking_model_params and query_text and len(df) > 0 and reranking_enabled_flag:
+        if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
-            try:
-                logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
-                # Apply custom filtering threshold if provided
-                if relevance_threshold is not None:
-                    reranking_model_params["filtering_threshold"] = relevance_threshold
-                    logger.info(f"Using custom filtering threshold: {relevance_threshold}")

-                reranker = get_reranking_model_from_params(reranking_model_params)
-                # Get documents to rerank
-                documents = df["chunk_content"].tolist()
-                # Use the get_scores method with disable_events=True
-                scores = reranker.get_scores(query_text, documents)
-                # Add scores as the relevance column
-                df[relevance_column] = scores
-
-                # Filter by threshold
-                scores_array = np.array(scores)
-                df = df[scores_array > reranker.filtering_threshold]
-                logger.debug(f"Applied reranking with params: {reranking_model_params}")
-            except Exception as e:
-                logger.error(f"Error during reranking: {str(e)}")
-                # Fallback to distance-based relevance
-                if "distance" in df.columns:
-                    df[relevance_column] = 1 / (1 + df["distance"])
-                else:
-                    logger.info("No distance or reranker available")
+            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
+            # Apply custom filtering threshold if provided
+            if relevance_threshold is not None:
+                reranking_model_params["filtering_threshold"] = relevance_threshold
+                logger.info(f"Using custom filtering threshold: {relevance_threshold}")
+
+            reranker = get_reranking_model_from_params(reranking_model_params)
+            # Get documents to rerank
+            documents = df["chunk_content"].tolist()
+            # Use the get_scores method with disable_events=True
+            scores = reranker.get_scores(query_text, documents)
+            # Add scores as the relevance column
+            df[relevance_column] = scores
+
+            # Filter by threshold
+            scores_array = np.array(scores)
+            df = df[scores_array > reranker.filtering_threshold]
+            logger.debug(f"Applied reranking with params: {reranking_model_params}")

         elif "distance" in df.columns:
             # Calculate relevance from distance
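With hybrid_search enabled the same query effectively runs twice: once as a vector similarity search and once as a keyword search against the full-text index, and the two frames are unioned with duplicates dropped on chunk_id so a chunk found by both paths is reranked only once. A condensed, self-contained sketch of that merge (toy data):

import pandas as pd

vector_hits = pd.DataFrame(
    {"chunk_id": ["d1:0", "d1:1"], "chunk_content": ["alpha", "beta"], "distance": [0.12, 0.34]}
)
keyword_hits = pd.DataFrame(
    {"chunk_id": ["d1:1", "d2:0"], "chunk_content": ["beta", "gamma"], "distance": [None, None]}
)

# both frames must expose identical columns before the union
assert set(vector_hits.columns) == set(keyword_hits.columns)
merged = pd.concat([vector_hits, keyword_hits], ignore_index=True)
# a chunk returned by both searches is kept once (first occurrence wins)
merged = merged.drop_duplicates(subset=["chunk_id"])
print(merged["chunk_id"].tolist())  # -> ['d1:0', 'd1:1', 'd2:0']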
@@ -323,12 +416,12 @@
         if conditions is None:
             return
         for condition in conditions:
-            if condition.column in KB_TO_VECTORDB_COLUMNS:
-                condition.column = KB_TO_VECTORDB_COLUMNS[condition.column]
+            if condition.column in self.kb_to_vector_columns:
+                condition.column = self.kb_to_vector_columns[condition.column]

     def addapt_result_columns(self, df):
         col_update = {}
-        for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items():
+        for kb_col, vec_col in self.kb_to_vector_columns.items():
            if vec_col in df.columns:
                col_update[vec_col] = kb_col

@@ -337,7 +430,7 @@
         columns = list(df.columns)
         # update id, get from metadata
         df[TableField.ID.value] = df[TableField.METADATA.value].apply(
-            lambda m: None if m is None else m.get("original_doc_id")
+            lambda m: None if m is None else m.get(self.kb_to_vector_columns["id"])
         )

         # id on first place
@@ -524,8 +617,8 @@

         metadata = {
             **base_metadata,
-            "original_row_index": str(idx),  # provide link to original row index
-            "content_column": col,
+            "_original_row_index": str(idx),  # provide link to original row index
+            "_content_column": col,
         }

         raw_documents.append(Document(content=content_str, id=doc_id, metadata=metadata))
@@ -620,16 +713,22 @@
         metadata_columns = [column_map.get(col.lower(), col) for col in metadata_columns]
         logger.debug(f"Mapped metadata columns: {metadata_columns}")

-        if content_columns is not None:
-            content_columns = list(set(content_columns).intersection(columns))
-            if len(content_columns) == 0:
-                raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
+        content_columns = list(set(content_columns).intersection(columns))
+        if len(content_columns) == 0:
+            raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")

-        if metadata_columns is not None:
-            metadata_columns = list(set(metadata_columns).intersection(columns))
-        else:
-            # all the rest columns
-            metadata_columns = list(set(columns).difference(content_columns))
+        if metadata_columns is not None:
+            metadata_columns = list(set(metadata_columns).intersection(columns))
+        else:
+            # all the rest columns
+            metadata_columns = list(set(columns).difference(content_columns))
+
+        # update list of used columns
+        inserted_metadata = set(self._kb.params.get("inserted_metadata", []))
+        inserted_metadata.update(metadata_columns)
+        self._kb.params["inserted_metadata"] = list(inserted_metadata)
+        flag_modified(self._kb, "params")
+        db.session.commit()

         # Add content columns directly (don't combine them)
         for col in content_columns:
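The inserted_metadata bookkeeping mutates the params dict in place, and SQLAlchemy does not track in-place changes to a JSON column, hence the flag_modified call before the commit. A runnable sketch of the pattern against an in-memory SQLite model (generic schema, not MindsDB's):

from sqlalchemy import JSON, Column, Integer, create_engine
from sqlalchemy.orm import Session, declarative_base
from sqlalchemy.orm.attributes import flag_modified

Base = declarative_base()


class KB(Base):
    __tablename__ = "kb"
    id = Column(Integer, primary_key=True)
    params = Column(JSON, default=dict)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    kb = KB(params={"inserted_metadata": []})
    session.add(kb)
    session.commit()

    # in-place mutation alone would not be flushed ...
    kb.params["inserted_metadata"] = ["author", "year"]
    flag_modified(kb, "params")  # ... so mark the column dirty explicitly
    session.commit()

    session.refresh(kb)
    print(kb.params["inserted_metadata"])  # -> ['author', 'year']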
@@ -655,7 +754,7 @@
             elif isinstance(value, dict):
                 metadata.update(value)
                 continue
-            else:
+            elif value is not None:
                 value = str(value)
             metadata[col] = value
         return metadata
@@ -712,8 +811,7 @@
         if model_id is None:
             # call litellm handler
             messages = list(df[TableField.CONTENT.value])
-            embedding_params = copy.deepcopy(config.get("default_embedding_model", {}))
-            embedding_params.update(self._kb.params["embedding_model"])
+            embedding_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
            results = self.call_litellm_embedding(self.session, embedding_params, messages)
            results = [[val] for val in results]
            return pd.DataFrame(results, columns=[TableField.EMBEDDINGS.value])
@@ -759,18 +857,16 @@
     def call_litellm_embedding(session, model_params, messages):
         args = copy.deepcopy(model_params)

+        if "model_name" not in args:
+            raise ValueError("'model_name' must be provided for embedding model")
+
         llm_model = args.pop("model_name")
         engine = args.pop("provider")

-        llm_model = f"{engine}/{llm_model}"
-
-        if "base_url" in args:
-            args["api_base"] = args.pop("base_url")
-
         module = session.integration_controller.get_handler_module("litellm")
         if module is None or module.Handler is None:
            raise ValueError(f'Unable to use "{engine}" provider. Litellm handler is not installed')
-        return module.Handler.embeddings(llm_model, messages, args)
+        return module.Handler.embeddings(engine, llm_model, messages, args)

     def build_rag_pipeline(self, retrieval_config: dict):
         """
@@ -892,6 +988,8 @@
     manages knowledge bases
     """

+    KB_VERSION = 2
+
     def __init__(self, session) -> None:
         self.session = session

@@ -903,6 +1001,7 @@
         params: dict,
         preprocessing_config: Optional[dict] = None,
         if_not_exists: bool = False,
+        keyword_search_enabled: bool = False,
         # embedding_model: Identifier = None,  # Legacy: Allow MindsDB models to be passed as embedding_model.
     ) -> db.KnowledgeBase:
         """
@@ -914,6 +1013,24 @@
         # fill variables
         params = variables_controller.fill_parameters(params)

+        try:
+            KnowledgeBaseInputParams.model_validate(params)
+        except ValidationError as e:
+            problems = []
+            for error in e.errors():
+                parameter = ".".join([str(i) for i in error["loc"]])
+                param_type = error["type"]
+                if param_type == "extra_forbidden":
+                    msg = f"Parameter '{parameter}' is not allowed"
+                else:
+                    msg = f"Error in '{parameter}' (type: {param_type}): {error['msg']}. Input: {repr(error['input'])}"
+                problems.append(msg)
+
+            msg = "\n".join(problems)
+            if len(problems) > 1:
+                msg = "\n" + msg
+            raise ValueError(f"Problem with knowledge base parameters: {msg}")
+
         # Validate preprocessing config first if provided
         if preprocessing_config is not None:
             PreprocessingConfig(**preprocessing_config)  # Validate before storing
@@ -939,24 +1056,6 @@
             return kb
         raise EntityExistsError("Knowledge base already exists", name)

-        embedding_params = copy.deepcopy(config.get("default_embedding_model", {}))
-
-        # Legacy
-        # model_name = None
-        # model_project = project
-        # if embedding_model:
-        #     model_name = embedding_model.parts[-1]
-        #     if len(embedding_model.parts) > 1:
-        #         model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
-
-        # elif "embedding_model" in params:
-        #     if isinstance(params["embedding_model"], str):
-        #         # it is model name
-        #         model_name = params["embedding_model"]
-        #     else:
-        #         # it is params for model
-        #         embedding_params.update(params["embedding_model"])
-
         embedding_params = get_model_params(params.get("embedding_model", {}), "default_embedding_model")

         # if model_name is None:  # Legacy
@@ -987,7 +1086,11 @@
         if reranking_model_params:
             # Get reranking model from params.
             # This is called here to check validaity of the parameters.
-            get_reranking_model_from_params(reranking_model_params)
+            try:
+                reranker = get_reranking_model_from_params(reranking_model_params)
+                reranker.get_scores("test", ["test"])
+            except (ValueError, RuntimeError) as e:
+                raise RuntimeError(f"Problem with reranker config: {e}")

         # search for the vector database table
         if storage is None:
@@ -1016,7 +1119,10 @@
         vector_db_name, vector_table_name = storage.parts

         # create table in vectordb before creating KB
-        self.session.datahub.get(vector_db_name).integration_handler.create_table(vector_table_name)
+        vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
+        vector_store_handler.create_table(vector_table_name)
+        if keyword_search_enabled:
+            vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
         vector_database_id = self.session.integration_controller.get(vector_db_name)["id"]

         # Store sparse vector settings in params if specified
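Keyword search support is wired in at creation time: when keyword_search_enabled is set, the vector store gets a full-text index on the content column, which is what later lets hybrid_search = true queries combine both retrieval paths. Illustrative SQL for the user-facing side, kept as plain Python strings (option values follow the parameters validated earlier in this diff; the keyword_search_enabled flag itself is a controller argument here, and exact SQL syntax may differ by deployment):

CREATE_KB = """
CREATE KNOWLEDGE BASE my_kb
USING
    embedding_model = {"provider": "openai", "model_name": "text-embedding-3-small"},
    reranking_model = {"provider": "openai", "model_name": "gpt-4o-mini"};
"""

HYBRID_QUERY = """
SELECT id, chunk_content, relevance
FROM my_kb
WHERE content = 'shipping delays' AND hybrid_search = true;
"""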
@@ -1026,6 +1132,7 @@
         if vector_size is not None:
             params["vector_config"]["vector_size"] = vector_size

+        params["version"] = self.KB_VERSION
         kb = db.KnowledgeBase(
             name=name,
             project_id=project_id,
@@ -1076,15 +1183,26 @@
         except PredictorRecordNotFound:
             pass

-        if params.get("provider", None) not in ("openai", "azure_openai"):
+        if "provider" not in params:
+            raise ValueError("'provider' parameter is required for embedding model")
+
+        if params["provider"] not in ("openai", "azure_openai"):
             # try use litellm
-            KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
+            try:
+                KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
+            except Exception as e:
+                raise RuntimeError(f"Problem with embedding model config: {e}")
             return

         if "provider" in params:
             engine = params.pop("provider").lower()

-            api_key = get_api_key(engine, params, strict=False) or params.pop("api_key")
+            api_key = get_api_key(engine, params, strict=False)
+            if api_key is None:
+                if "api_key" in params:
+                    params.pop("api_key")
+                else:
+                    raise ValueError("'api_key' parameter is required for embedding model")

             if engine == "azure_openai":
                 engine = "openai"
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -7,7 +7,7 @@ import pandas as pd
 import datetime as dt

 from mindsdb.api.executor.sql_query.result_set import ResultSet
-from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql
+from mindsdb_sql_parser import Identifier, Select, Constant, Star, parse_sql, BinaryOperation
 from mindsdb.utilities import log

 from mindsdb.interfaces.knowledge_base.llm_client import LLMClient
@@ -90,7 +90,7 @@ class EvaluateBase:
             df = response.data_frame

             if "content" not in df.columns:
-                raise ValueError("`content` column isn't found in source data")
+                raise ValueError(f"`content` column isn't found in provided sql: {gen_params['from_sql']}")

             df.rename(columns={"content": "chunk_content"}, inplace=True)
         else:

@@ -130,6 +130,8 @@
         integration_name = table_name.parts[0]
         table_name = Identifier(parts=table_name.parts[1:])
         dn = self.session.datahub.get(integration_name)
+        if dn is None:
+            raise ValueError(f"Can't find database: {integration_name}")
         return dn, table_name

     def save_to_table(self, table_name: Identifier, df: pd.DataFrame, is_replace=False):
@@ -184,7 +186,7 @@
         to_table = params["save_to"]
         if isinstance(to_table, str):
             to_table = Identifier(to_table)
-        self.save_to_table(to_table, scores)
+        self.save_to_table(to_table, scores.copy())

         return scores

@@ -256,7 +258,13 @@ class EvaluateRerank(EvaluateBase):

         start_time = time.time()
         logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
-        df_answers = self.kb.select_query(Select(targets=[Identifier("chunk_content")], limit=Constant(self.TOP_K)))
+        df_answers = self.kb.select_query(
+            Select(
+                targets=[Identifier("chunk_content")],
+                where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                limit=Constant(self.TOP_K),
+            )
+        )
         query_time = time.time() - start_time

         proposed_responses = list(df_answers["chunk_content"])
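The evaluation harness previously pulled TOP_K arbitrary rows; it now binds each test question into the WHERE clause, so the KB performs a real similarity search per question. Building that query as an AST, mirroring the evaluator's construction (runnable wherever mindsdb_sql_parser is installed):

from mindsdb_sql_parser import BinaryOperation, Constant, Identifier, Select

question = "What is the refund policy?"
query = Select(
    targets=[Identifier("chunk_content")],
    where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
    limit=Constant(10),
)
# kb.select_query(query) then retrieves the chunks most similar to `question`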
@@ -410,7 +418,7 @@ class EvaluateDocID(EvaluateBase):
     Checks if ID in response from KB is matched with doc ID in test dataset
     """

-    TOP_K = 100
+    TOP_K = 20

     def generate(self, sampled_df: pd.DataFrame) -> pd.DataFrame:
         if "id" not in sampled_df.columns:
@@ -462,7 +470,11 @@
         start_time = time.time()
         logger.debug(f"Querying [{i + 1}/{len(questions)}]: {question}")
         df_answers = self.kb.select_query(
-            Select(targets=[Identifier("chunk_content"), Identifier("id")], limit=Constant(self.TOP_K))
+            Select(
+                targets=[Identifier("chunk_content"), Identifier("id")],
+                where=BinaryOperation(op="=", args=[Identifier("content"), Constant(question)]),
+                limit=Constant(self.TOP_K),
+            )
         )
         query_time = time.time() - start_time