mindsdb-25.7.2.0-py3-none-any.whl → mindsdb-25.7.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +1 -1
- mindsdb/api/a2a/common/server/server.py +16 -6
- mindsdb/api/executor/command_executor.py +213 -137
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +5 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
- mindsdb/api/executor/planner/plan_join.py +3 -0
- mindsdb/api/executor/planner/plan_join_ts.py +117 -100
- mindsdb/api/executor/planner/query_planner.py +1 -0
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
- mindsdb/api/http/initialize.py +16 -43
- mindsdb/api/http/namespaces/agents.py +24 -21
- mindsdb/api/http/namespaces/chatbots.py +83 -120
- mindsdb/api/http/namespaces/file.py +1 -1
- mindsdb/api/http/namespaces/jobs.py +38 -60
- mindsdb/api/http/namespaces/tree.py +69 -61
- mindsdb/api/mcp/start.py +2 -0
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
- mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
- mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
- mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -76
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +16 -3
- mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
- mindsdb/integrations/handlers/s3_handler/s3_handler.py +72 -70
- mindsdb/integrations/handlers/salesforce_handler/constants.py +208 -0
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +142 -81
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +12 -4
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +141 -161
- mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
- mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
- mindsdb/integrations/handlers/youtube_handler/youtube_tables.py +183 -55
- mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
- mindsdb/integrations/utilities/handler_utils.py +32 -12
- mindsdb/interfaces/agents/agents_controller.py +169 -110
- mindsdb/interfaces/agents/langchain_agent.py +10 -3
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +22 -8
- mindsdb/interfaces/database/database.py +38 -13
- mindsdb/interfaces/database/integrations.py +20 -5
- mindsdb/interfaces/database/projects.py +63 -16
- mindsdb/interfaces/database/views.py +86 -60
- mindsdb/interfaces/jobs/jobs_controller.py +103 -110
- mindsdb/interfaces/knowledge_base/controller.py +33 -5
- mindsdb/interfaces/knowledge_base/evaluate.py +53 -9
- mindsdb/interfaces/knowledge_base/executor.py +24 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +3 -3
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +21 -13
- mindsdb/interfaces/query_context/context_controller.py +100 -133
- mindsdb/interfaces/skills/skills_controller.py +18 -6
- mindsdb/interfaces/storage/db.py +40 -6
- mindsdb/interfaces/variables/variables_controller.py +8 -15
- mindsdb/utilities/config.py +3 -3
- mindsdb/utilities/functions.py +72 -60
- mindsdb/utilities/log.py +38 -6
- mindsdb/utilities/ps.py +7 -7
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/METADATA +262 -263
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/RECORD +69 -68
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/jobs/jobs_controller.py
@@ -21,29 +21,29 @@ from mindsdb.utilities import log
 
 logger = log.getLogger(__name__)
 
-default_project = config.get('default_project')
+default_project = config.get("default_project")
 
 
 def split_sql(sql):
     # split sql by ';' ignoring delimiter in quotes
-    pattern = re.compile(r'''((?:[^;"']|"[^"]*"|'[^']*')+)''')
+    pattern = re.compile(r"""((?:[^;"']|"[^"]*"|'[^']*')+)""")
     return pattern.split(sql)[1::2]
 
 
 def calc_next_date(schedule_str, base_date: dt.datetime):
     schedule_str = schedule_str.lower().strip()
 
-    repeat_prefix = 'every '
+    repeat_prefix = "every "
     if schedule_str.startswith(repeat_prefix):
-        repeat_str = schedule_str[len(repeat_prefix):]
+        repeat_str = schedule_str[len(repeat_prefix) :]
     else:
         # TODO cron format
-        raise NotImplementedError(f'Schedule: {schedule_str}')
+        raise NotImplementedError(f"Schedule: {schedule_str}")
 
     items = repeat_str.split()
 
     if len(items) == 1:
-        value = '1'
+        value = "1"
         period = items[0]
     elif len(items) == 2:
         value, period = items
@@ -53,15 +53,15 @@ def calc_next_date(schedule_str, base_date: dt.datetime):
     if not value.isdigit():
         raise Exception(f"Number expected: {value}")
     value = int(value)
-    if period in ('minute', 'minutes', 'min'):
+    if period in ("minute", "minutes", "min"):
         delta = dt.timedelta(minutes=value)
-    elif period in ('hour', 'hours'):
+    elif period in ("hour", "hours"):
         delta = dt.timedelta(hours=value)
-    elif period in ('day', 'days'):
+    elif period in ("day", "days"):
         delta = dt.timedelta(days=value)
-    elif period in ('week', 'weeks'):
+    elif period in ("week", "weeks"):
         delta = dt.timedelta(days=value * 7)  # 1 week = 7 days
-    elif period in ('month', 'months'):
+    elif period in ("month", "months"):
         delta = relativedelta(months=value)
     else:
         raise Exception(f"Unknown period: {period}")
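For orientation, the reformatted calc_next_date above implements a small "every <number> <period>" grammar. A condensed, runnable sketch of that grammar and its deltas (the helper name is illustrative; the real method also validates the count and anchors the next run to the job's schedule):

    import datetime as dt
    from dateutil.relativedelta import relativedelta

    def next_run(schedule_str: str, base_date: dt.datetime) -> dt.datetime:
        # "every 2 hours" -> value="2", period="hours"; "every hour" -> value="1"
        items = schedule_str.lower().strip().removeprefix("every ").split()
        value, period = ("1", items[0]) if len(items) == 1 else items
        deltas = {
            ("minute", "minutes", "min"): lambda v: dt.timedelta(minutes=v),
            ("hour", "hours"): lambda v: dt.timedelta(hours=v),
            ("day", "days"): lambda v: dt.timedelta(days=v),
            ("week", "weeks"): lambda v: dt.timedelta(days=v * 7),
            ("month", "months"): lambda v: relativedelta(months=v),
        }
        for periods, make_delta in deltas.items():
            if period in periods:
                return base_date + make_delta(int(value))
        raise ValueError(f"Unknown period: {period}")

    print(next_run("every 2 hours", dt.datetime(2025, 7, 1)))  # 2025-07-01 02:00:00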
@@ -85,10 +85,10 @@ def parse_job_date(date_str: str) -> dt.datetime:
     :return:
     """
 
-    if date_str.upper() == 'NOW':
+    if date_str.upper() == "NOW":
         return dt.datetime.now()
 
-    date_formats = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d']
+    date_formats = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]
     date = None
     for date_format in date_formats:
         try:
@@ -128,39 +128,41 @@ class JobsController:
         at the moment supports: 'every <number> <dimension>' or 'every <dimension>'
         :return: name of created job
         """
+        if not name.islower():
+            raise ValueError(f"The name must be in lower case: {name}")
 
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
         # check if exists
         if self.get(name, project_name) is not None:
-            raise EntityExistsError('Job already exists', name)
+            raise EntityExistsError("Job already exists", name)
 
         if start_at is None:
             start_at = dt.datetime.now()
 
         if end_at is not None and end_at < start_at:
-            raise Exception(f'Wrong end date {start_at} > {end_at}')
+            raise Exception(f"Wrong end date {start_at} > {end_at}")
 
         # check sql = try to parse it
         for sql in split_sql(query):
             try:
                 # replace template variables with null
-                sql = re.sub(r'\{\{[\w\d]+}}', '', sql)
+                sql = re.sub(r"\{\{[\w\d]+}}", "", sql)
 
                 parse_sql(sql)
             except ParsingException as e:
-                raise ParsingException(f'Unable to parse: {sql}: {e}')
+                raise ParsingException(f"Unable to parse: {sql}: {e}")
 
         if if_query is not None:
             for sql in split_sql(if_query):
                 try:
                     # replace template variables with null
-                    sql = re.sub(r'\{\{[\w\d]+}}', '', sql)
+                    sql = re.sub(r"\{\{[\w\d]+}}", "", sql)
 
                     parse_sql(sql)
                 except ParsingException as e:
-                    raise ParsingException(f'Unable to parse: {sql}: {e}')
+                    raise ParsingException(f"Unable to parse: {sql}: {e}")
 
         # plan next run
         next_run_at = start_at
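Note the validation trick in add() above: template placeholders would break the parser, so each statement is checked with every {{VAR}} token blanked out first. A small repro of that substitution (the sample query is illustrative):

    import re

    sql = "SELECT * FROM t WHERE created_at > '{{PREVIOUS_START_DATETIME}}'"
    # Same pattern the hunk applies before parse_sql():
    print(re.sub(r"\{\{[\w\d]+}}", "", sql))
    # SELECT * FROM t WHERE created_at > ''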
@@ -185,7 +187,7 @@ class JobsController:
             start_at=start_at,
             end_at=end_at,
             next_run_at=next_run_at,
-            schedule_str=schedule_str
+            schedule_str=schedule_str,
         )
         db.session.add(record)
         db.session.commit()
@@ -219,10 +221,11 @@ class JobsController:
 
         schedule_str = None
         if query.repeat_str is not None:
-            schedule_str = 'every ' + query.repeat_str
+            schedule_str = "every " + query.repeat_str
 
         return self.add(
-            name, project_name,
+            name,
+            project_name,
             query=query_str,
             start_at=start_at,
             end_at=end_at,
@@ -231,36 +234,30 @@ class JobsController:
         )
 
     def delete(self, name, project_name):
-
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
         # check if exists
-        record = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            name=name,
-            project_id=project.id,
-            deleted_at=sa.null()
-        ).first()
+        record = (
+            db.session.query(db.Jobs)
+            .filter_by(company_id=ctx.company_id, name=name, project_id=project.id, deleted_at=sa.null())
+            .first()
+        )
         if record is None:
-            raise EntityNotExistsError('Job does not exist', name)
+            raise EntityNotExistsError("Job does not exist", name)
 
         self._delete_record(record)
         db.session.commit()
 
         # delete context
-        query_context_controller.drop_query_context('job', record.id)
-        query_context_controller.drop_query_context('job-if', record.id)
+        query_context_controller.drop_query_context("job", record.id)
+        query_context_controller.drop_query_context("job-if", record.id)
 
     def _delete_record(self, record):
        record.deleted_at = dt.datetime.now()
 
     def get_list(self, project_name=None):
-
-        query = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            deleted_at=sa.null()
-        )
+        query = db.session.query(db.Jobs).filter_by(company_id=ctx.company_id, deleted_at=sa.null())
@@ -268,23 +265,22 @@ class JobsController:
             query = query.filter_by(project_id=project.id)
 
         data = []
-        project_names = {
-            i.id: i.name
-            for i in project_controller.get_list()
-        }
+        project_names = {i.id: i.name for i in project_controller.get_list()}
         for record in query:
-            data.append({
-                'id': record.id,
-                'name': record.name,
-                'project': project_names[record.project_id],
-                'start_at': record.start_at,
-                'end_at': record.end_at,
-                'next_run_at': record.next_run_at,
-                'schedule_str': record.schedule_str,
-                'query': record.query_str,
-                'if_query': record.if_query_str,
-                'variables': query_context_controller.get_context_vars('job', record.id),
-            })
+            data.append(
+                {
+                    "id": record.id,
+                    "name": record.name,
+                    "project": project_names[record.project_id],
+                    "start_at": record.start_at,
+                    "end_at": record.end_at,
+                    "next_run_at": record.next_run_at,
+                    "schedule_str": record.schedule_str,
+                    "query": record.query_str,
+                    "if_query": record.if_query_str,
+                    "variables": query_context_controller.get_context_vars("job", record.id),
+                }
+            )
         return data
 
     def get(self, name: str, project_name: str) -> dict:
@@ -298,25 +294,24 @@ class JobsController:
         project_controller = ProjectController()
         project = project_controller.get(name=project_name)
 
-        record = db.session.query(db.Jobs).filter_by(
-            company_id=ctx.company_id,
-            name=name,
-            project_id=project.id,
-            deleted_at=sa.null()
-        ).first()
+        record = (
+            db.session.query(db.Jobs)
+            .filter_by(company_id=ctx.company_id, name=name, project_id=project.id, deleted_at=sa.null())
+            .first()
+        )
 
         if record is not None:
             return {
-                'id': record.id,
-                'name': record.name,
-                'project': project_name,
-                'start_at': record.start_at,
-                'end_at': record.end_at,
-                'next_run_at': record.next_run_at,
-                'schedule_str': record.schedule_str,
-                'query': record.query_str,
-                'if_query': record.if_query_str,
-                'variables': query_context_controller.get_context_vars('job', record.id),
+                "id": record.id,
+                "name": record.name,
+                "project": project_name,
+                "start_at": record.start_at,
+                "end_at": record.end_at,
+                "next_run_at": record.next_run_at,
+                "schedule_str": record.schedule_str,
+                "query": record.query_str,
+                "if_query": record.if_query_str,
+                "variables": query_context_controller.get_context_vars("job", record.id),
             }
 
     def get_history(self, name: str, project_name: str) -> List[dict]:
@@ -331,27 +326,33 @@ class JobsController:
 
         query = Select(
             targets=[Star()],
-            from_table=Identifier('jobs_history'),
-            where=BinaryOperation(op='and', args=[
-                BinaryOperation(op='=', args=[Identifier('name'), Constant(name)]),
-                BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)]),
-            ])
+            from_table=Identifier("jobs_history"),
+            where=BinaryOperation(
+                op="and",
+                args=[
+                    BinaryOperation(op="=", args=[Identifier("name"), Constant(name)]),
+                    BinaryOperation(op="=", args=[Identifier("project"), Constant(project_name)]),
+                ],
+            ),
         )
         response = logs_db_controller.query(query)
 
-        names = [i['name'] for i in response.columns]
-        return response.data_frame[names].to_dict(orient='records')
+        names = [i["name"] for i in response.columns]
+        return response.data_frame[names].to_dict(orient="records")
 
 
 class JobsExecutor:
-
     def get_next_tasks(self):
         # filter next_run < now
-        query = db.session.query(db.Jobs).filter(
-            db.Jobs.next_run_at < dt.datetime.now(),
-            db.Jobs.deleted_at == sa.null(),
-            db.Jobs.active == True  # noqa
-        ).order_by(db.Jobs.next_run_at)
+        query = (
+            db.session.query(db.Jobs)
+            .filter(
+                db.Jobs.next_run_at < dt.datetime.now(),
+                db.Jobs.deleted_at == sa.null(),
+                db.Jobs.active == True,  # noqa
+            )
+            .order_by(db.Jobs.next_run_at)
+        )
 
         return query.all()
 
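get_next_tasks() above combines two conventions used throughout this file: soft deletion (rows keep a deleted_at timestamp instead of being removed) and a due-date scan ordered by next_run_at. A self-contained sketch of the same query shape against a toy schema (model and engine here are illustrative, not MindsDB's actual tables):

    import datetime as dt
    import sqlalchemy as sa
    from sqlalchemy.orm import DeclarativeBase, Session

    class Base(DeclarativeBase):
        pass

    class Job(Base):
        __tablename__ = "jobs"
        id = sa.Column(sa.Integer, primary_key=True)
        next_run_at = sa.Column(sa.DateTime)
        deleted_at = sa.Column(sa.DateTime)  # soft-delete marker; NULL means live
        active = sa.Column(sa.Boolean, default=True)

    engine = sa.create_engine("sqlite://")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(Job(next_run_at=dt.datetime(2000, 1, 1)))
        due = (
            session.query(Job)
            .filter(Job.next_run_at < dt.datetime.now(), Job.deleted_at == sa.null(), Job.active == True)  # noqa
            .order_by(Job.next_run_at)
            .all()
        )
        print(len(due))  # 1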
@@ -389,12 +390,7 @@ class JobsExecutor:
         record = db.Jobs.query.get(record_id)
 
         try:
-
-            history_record = db.JobsHistory(
-                job_id=record.id,
-                start_at=record.next_run_at,
-                company_id=record.company_id
-            )
+            history_record = db.JobsHistory(job_id=record.id, start_at=record.next_run_at, company_id=record.company_id)
 
             db.session.add(history_record)
             db.session.commit()
@@ -408,9 +404,7 @@ class JobsExecutor:
 
         # check if it is an old lock
         history_record = db.JobsHistory.query.filter_by(
-            job_id=record.id,
-            start_at=record.next_run_at,
-            company_id=record.company_id
+            job_id=record.id, start_at=record.next_run_at, company_id=record.company_id
         ).first()
         if history_record.updated_at < dt.datetime.now() - dt.timedelta(seconds=30):
             db.session.delete(history_record)
@@ -419,13 +413,14 @@ class JobsExecutor:
             return None
 
     def __fill_variables(self, sql, record, history_record):
-        if '{{PREVIOUS_START_DATETIME}}' in sql:
+        if "{{PREVIOUS_START_DATETIME}}" in sql:
             # get previous run date
-            history_prev = db.session.query(db.JobsHistory.start_at)\
-                .filter(db.JobsHistory.job_id == record.id,
-                        db.JobsHistory.id != history_record.id)\
-                .order_by(db.JobsHistory.id.desc())\
+            history_prev = (
+                db.session.query(db.JobsHistory.start_at)
+                .filter(db.JobsHistory.job_id == record.id, db.JobsHistory.id != history_record.id)
+                .order_by(db.JobsHistory.id.desc())
                 .first()
+            )
             if history_prev is None:
                 # start date of the job
                 value = record.created_at
@@ -433,18 +428,17 @@ class JobsExecutor:
                 # fix for twitter: created_at filter must be minimum of 10 seconds prior to the current time
                 value = history_prev.start_at - dt.timedelta(seconds=60)
             value = value.strftime("%Y-%m-%d %H:%M:%S")
-            sql = sql.replace('{{PREVIOUS_START_DATETIME}}', value)
+            sql = sql.replace("{{PREVIOUS_START_DATETIME}}", value)
 
-        if '{{START_DATE}}' in sql:
+        if "{{START_DATE}}" in sql:
             value = history_record.start_at.strftime("%Y-%m-%d")
-            sql = sql.replace('{{START_DATE}}', value)
-        if '{{START_DATETIME}}' in sql:
+            sql = sql.replace("{{START_DATE}}", value)
+        if "{{START_DATETIME}}" in sql:
             value = history_record.start_at.strftime("%Y-%m-%d %H:%M:%S")
-            sql = sql.replace('{{START_DATETIME}}', value)
+            sql = sql.replace("{{START_DATETIME}}", value)
         return sql
 
     def execute_task_local(self, record_id, history_id=None):
-
         record = db.Jobs.query.get(record_id)
 
         # set up environment
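__fill_variables() is the execution-time counterpart of the validation pass in add(): the same placeholders are now resolved instead of blanked, with {{PREVIOUS_START_DATETIME}} pushed 60 seconds back as the comment above explains. A worked substitution (the query and dates are illustrative):

    import datetime as dt

    sql = "SELECT * FROM demo.events WHERE ts > '{{PREVIOUS_START_DATETIME}}'"
    prev_start = dt.datetime(2025, 7, 1, 12, 0, 0)
    value = (prev_start - dt.timedelta(seconds=60)).strftime("%Y-%m-%d %H:%M:%S")
    print(sql.replace("{{PREVIOUS_START_DATETIME}}", value))
    # SELECT * FROM demo.events WHERE ts > '2025-07-01 11:59:00'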
@@ -470,7 +464,7 @@ class JobsExecutor:
 
         project_controller = ProjectController()
         project = project_controller.get(record.project_id)
-        executed_sql = ''
+        executed_sql = ""
 
         from mindsdb.api.executor.controllers.session_controller import SessionController
         from mindsdb.api.executor.command_executor import ExecuteCommands
@@ -480,8 +474,8 @@ class JobsExecutor:
         command_executor = ExecuteCommands(sql_session)
 
         # job with condition?
-        query_context_controller.set_context('job-if', record.id)
-        error = ''
+        query_context_controller.set_context("job-if", record.id)
+        error = ""
         to_execute_query = True
         if record.if_query_str is not None:
             data = None
@@ -491,7 +485,7 @@ class JobsExecutor:
                 sql = self.__fill_variables(sql, record, history_record)
 
                 query = parse_sql(sql)
-                executed_sql += sql + '; '
+                executed_sql += sql + "; "
 
                 ret = command_executor.execute_command(query)
                 if ret.error_code is not None:
@@ -508,17 +502,16 @@ class JobsExecutor:
         if error or data is None or len(data) == 0:
             to_execute_query = False
 
-        query_context_controller.release_context('job-if', record.id)
+        query_context_controller.release_context("job-if", record.id)
         if to_execute_query:
-
-            query_context_controller.set_context('job', record.id)
+            query_context_controller.set_context("job", record.id)
             for sql in split_sql(record.query_str):
                 try:
                     # fill template variables
                     sql = self.__fill_variables(sql, record, history_record)
 
                     query = parse_sql(sql)
-                    executed_sql += sql + '; '
+                    executed_sql += sql + "; "
 
                     ret = command_executor.execute_command(query)
                     if ret.error_code is not None:
mindsdb/interfaces/knowledge_base/controller.py
@@ -60,6 +60,7 @@ class KnowledgeBaseInputParams(BaseModel):
     is_sparse: bool = False
     vector_size: int | None = None
     reranking_model: Dict[Text, Any] | None = None
+    preprocessing: Dict[Text, Any] | None = None
 
     class Config:
         extra = "forbid"
@@ -244,9 +245,9 @@ class KnowledgeBaseTable:
         keyword_search_cols_and_values = []
         query_text = None
         relevance_threshold = None
-        reranking_enabled_flag = True
         hybrid_search_enabled_flag = False
         query_conditions = db_handler.extract_conditions(query.where)
+        hybrid_search_alpha = None  # Default to None, meaning no alpha weighted blending
         if query_conditions is not None:
             for item in query_conditions:
                 if item.column == "relevance" and item.op.value == FilterOperator.GREATER_THAN_OR_EQUAL.value:
@@ -261,10 +262,8 @@ class KnowledgeBaseTable:
                     logger.error(error_msg)
                     raise ValueError(error_msg)
                 elif item.column == "reranking":
-                    reranking_enabled_flag = item.value
-                    # cast to boolean
-                    if isinstance(reranking_enabled_flag, str):
-                        reranking_enabled_flag = reranking_enabled_flag.lower() not in ("false")
+                    if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
+                        disable_reranking = True
                 elif item.column == "hybrid_search":
                     hybrid_search_enabled_flag = item.value
                     # cast to boolean
@@ -272,6 +271,14 @@ class KnowledgeBaseTable:
                         hybrid_search_enabled_flag = hybrid_search_enabled_flag.lower() not in ("false")
                     if item.value is False or (isinstance(item.value, str) and item.value.lower() == "false"):
                         disable_reranking = True
+                elif item.column == "hybrid_search_alpha":
+                    # validate item.value is a float
+                    if not isinstance(item.value, (float, int)):
+                        raise ValueError(f"Invalid hybrid_search_alpha value: {item.value}. Must be a float or int.")
+                    # validate hybrid search alpha is between 0 and 1
+                    if not (0 <= item.value <= 1):
+                        raise ValueError(f"Invalid hybrid_search_alpha value: {item.value}. Must be between 0 and 1.")
+                    hybrid_search_alpha = item.value
                 elif item.column == "relevance" and item.op.value != FilterOperator.GREATER_THAN_OR_EQUAL.value:
                     raise ValueError(
                         f"Invalid operator for relevance: {item.op.value}. Only GREATER_THAN_OR_EQUAL is allowed."
@@ -345,7 +352,15 @@ class KnowledgeBaseTable:
                     f"Keyword search returned different columns: {df_keyword_select.columns} "
                     f"than expected: {df.columns}"
                 )
+            if hybrid_search_alpha:
+                df_keyword_select[TableField.DISTANCE.value] = (
+                    hybrid_search_alpha * df_keyword_select[TableField.DISTANCE.value]
+                )
+                df[TableField.DISTANCE.value] = (1 - hybrid_search_alpha) * df[TableField.DISTANCE.value]
             df = pd.concat([df, df_keyword_select], ignore_index=True)
+            # sort by distance if distance column exists
+            if TableField.DISTANCE.value in df.columns:
+                df = df.sort_values(by=TableField.DISTANCE.value, ascending=True)
             # if chunk_id column exists remove duplicates based on chunk_id
             if "chunk_id" in df.columns:
                 df = df.drop_duplicates(subset=["chunk_id"])
@@ -519,6 +534,9 @@ class KnowledgeBaseTable:
 
         query.update_columns[emb_col] = Constant(self._content_to_embeddings(content))
 
+        if "metadata" not in query.update_columns:
+            query.update_columns["metadata"] = Constant({})
+
         # TODO search content in where clause?
 
         # set table name
@@ -1010,6 +1028,9 @@ class KnowledgeBaseController:
         :param is_sparse: Whether to use sparse vectors for embeddings
         :param vector_size: Optional size specification for vectors, required when is_sparse=True
         """
+        if not name.islower():
+            raise ValueError(f"The name must be in lower case: {name}")
+
         # fill variables
         params = variables_controller.fill_parameters(params)
 
@@ -1186,6 +1207,13 @@ class KnowledgeBaseController:
         if "provider" not in params:
             raise ValueError("'provider' parameter is required for embedding model")
 
+        # check available providers
+        avail_providers = ("openai", "azure_openai", "bedrock", "gemini", "google")
+        if params["provider"] not in avail_providers:
+            raise ValueError(
+                f"Wrong embedding provider: {params['provider']}. Available providers: {', '.join(avail_providers)}"
+            )
+
         if params["provider"] not in ("openai", "azure_openai"):
             # try use litellm
             try:
mindsdb/interfaces/knowledge_base/evaluate.py
@@ -1,5 +1,6 @@
 import json
 import math
+import re
 import time
 from typing import List
 
@@ -16,15 +17,15 @@ logger = log.getLogger(__name__)
 
 
 GENERATE_QA_SYSTEM_PROMPT = """
-    Your task is to generate question and answer pairs for a search engine. 
+    Your task is to generate question and answer pairs for a search engine.
     The search engine will take your query and return a list of documents.
     You will be given a text and you need to generate a question that can be answered using the information in the text.
     Your questions will be used to evaluate the search engine.
-    Question should always have enough clues to identify the specific text that this question is generated from. 
+    Question should always have enough clues to identify the specific text that this question is generated from.
     Never ask questions like "What license number is associated with Amend 6" because Amend 6 could be found in many documents and the question is not specific enough.
-    Example output 1: {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"} 
+    Example output 1: {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"}
     Example output 2: {\"query\": \"What is the name of the river in Paris?\", \"reference_answer\": \"Seine\"}
-    Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to. 
+    Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to.
     The question should be answerable without the text, but the answer should be present in the text.
     Return ONLY a json response. No other text.
 """
@@ -43,6 +44,39 @@ def calc_entropy(values: List[float]) -> float:
     return -sum([pk * math.log(pk) for pk in values])
 
 
+def sanitize_json_response(response: str) -> str:
+    """Remove markdown code block formatting from JSON response and extract valid JSON."""
+    if not response or not response.strip():
+        raise ValueError("Empty response provided.")
+
+    # Remove leading/trailing whitespace
+    response = response.strip()
+
+    # Remove markdown code block markers if present
+    response = re.sub(r"^```(?:json|JSON)?\s*", "", response, flags=re.MULTILINE)
+    response = re.sub(r"\s*```$", "", response, flags=re.MULTILINE)
+    response = response.strip()
+
+    # Find the first opening brace
+    start_idx = response.find("{")
+    if start_idx == -1:
+        raise ValueError("No JSON object found in the response.")
+
+    # Try to parse JSON starting from first { with increasing end positions
+    # This handles nested objects and strings with braces correctly
+    for end_idx in range(len(response), start_idx, -1):  # Start from end and work backwards
+        candidate = response[start_idx:end_idx]
+        try:
+            parsed = json.loads(candidate)
+            # Ensure it's a dictionary (object) not just any valid JSON
+            if isinstance(parsed, dict):
+                return candidate
+        except json.JSONDecodeError:
+            continue
+
+    raise ValueError("No valid JSON object found in the response.")
+
+
 class EvaluateBase:
     DEFAULT_QUESTION_COUNT = 20
     DEFAULT_SAMPLE_SIZE = 10000
@@ -84,7 +118,8 @@ class EvaluateBase:
 
         dn, table_name = self._get_dn_table(query.from_table)
         query.from_table = table_name
-        query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
+        if query.limit is None:
+            query.limit = Constant(self.DEFAULT_SAMPLE_SIZE)
 
         response = dn.query(query=query, session=self.session)
         df = response.data_frame
@@ -178,6 +213,7 @@ class EvaluateBase:
         test_data = self.read_from_table(test_table)
 
         scores = self.evaluate(test_data)
+        scores["id"] = math.floor(time.time())  # unique ID for the evaluation run
         scores["name"] = self.name
         scores["created_at"] = dt.datetime.now()
 
@@ -237,9 +273,13 @@ class EvaluateRerank(EvaluateBase):
             {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
             {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
         ]
-        answer = self.llm_client.completion(messages)
+        answer = self.llm_client.completion(messages, json_output=True)
+
+        # Sanitize the response by removing markdown code block formatting like ```json
+        sanitized_answer = sanitize_json_response(answer)
+
         try:
-            output = json.loads(answer)
+            output = json.loads(sanitized_answer)
         except json.JSONDecodeError:
             raise ValueError(f"Could not parse response from LLM: {answer}")
 
@@ -448,9 +488,13 @@ class EvaluateDocID(EvaluateBase):
             {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
             {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
         ]
-        answer = self.llm_client.completion(messages)
+        answer = self.llm_client.completion(messages, json_output=True)
+
+        # Sanitize the response by removing markdown code block formatting like ```json
+        sanitized_answer = sanitize_json_response(answer)
+
         try:
-            output = json.loads(answer)
+            output = json.loads(sanitized_answer)
         except json.JSONDecodeError:
             raise ValueError(f"Could not parse response from LLM: {answer}")
 