PyPI - MindsDB - Versions diffs - 25.7.1.0__py3-none-any.whl → 25.7.3.0__py3-none-any.whl - Mend

MindsDB 25.7.1.0__py3-none-any.whl → 25.7.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic. Click here for more details.

Files changed (38) hide show

mindsdb/__about__.py +1 -1
mindsdb/__main__.py +54 -95
mindsdb/api/a2a/agent.py +30 -206
mindsdb/api/a2a/common/server/server.py +26 -27
mindsdb/api/a2a/task_manager.py +93 -227
mindsdb/api/a2a/utils.py +21 -0
mindsdb/api/executor/command_executor.py +7 -2
mindsdb/api/executor/datahub/datanodes/integration_datanode.py +5 -1
mindsdb/api/executor/utilities/sql.py +97 -21
mindsdb/api/http/namespaces/agents.py +127 -202
mindsdb/api/http/namespaces/config.py +12 -1
mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +11 -1
mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +94 -1
mindsdb/integrations/handlers/s3_handler/s3_handler.py +72 -70
mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +4 -3
mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +12 -3
mindsdb/integrations/handlers/slack_handler/slack_tables.py +141 -161
mindsdb/integrations/handlers/youtube_handler/youtube_tables.py +183 -55
mindsdb/integrations/libs/keyword_search_base.py +41 -0
mindsdb/integrations/libs/vectordatabase_handler.py +35 -14
mindsdb/integrations/utilities/sql_utils.py +11 -0
mindsdb/interfaces/agents/agents_controller.py +2 -2
mindsdb/interfaces/data_catalog/data_catalog_loader.py +18 -4
mindsdb/interfaces/database/projects.py +1 -3
mindsdb/interfaces/functions/controller.py +54 -64
mindsdb/interfaces/functions/to_markdown.py +47 -14
mindsdb/interfaces/knowledge_base/controller.py +134 -35
mindsdb/interfaces/knowledge_base/evaluate.py +53 -10
mindsdb/interfaces/knowledge_base/llm_client.py +3 -3
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +21 -13
mindsdb/utilities/config.py +46 -39
mindsdb/utilities/exception.py +11 -0
{mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/METADATA +236 -236
{mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/RECORD +38 -36
{mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/WHEEL +0 -0
{mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/licenses/LICENSE +0 -0
{mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/top_level.txt +0 -0

mindsdb/interfaces/knowledge_base/evaluate.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import json
 import math
+import re
 import time
 from typing import List
@@ -16,15 +17,15 @@ logger = log.getLogger(__name__)
 GENERATE_QA_SYSTEM_PROMPT = """
-Your task is to generate question and answer pairs for a search engine.
+Your task is to generate question and answer pairs for a search engine.
 The search engine will take your query and return a list of documents.
 You will be given a text and you need to generate a question that can be answered using the information in the text.
 Your questions will be used to evaluate the search engine.
-Question should always have enough clues to identify the specific text that this question is generated from.
+Question should always have enough clues to identify the specific text that this question is generated from.
 Never ask questions like "What license number is associated with Amend 6" because Amend 6 could be found in many documents and the question is not specific enough.
-Example output 1:  {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"}
+Example output 1:  {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"}
 Example output 2: {\"query\": \"What is the name of the river in Paris?\", \"reference_answer\": \"Seine\"}
-Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to.
+Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to.
 The question should be answerable without the text, but the answer should be present in the text.
 Return ONLY a json response. No other text.
 """
@@ -43,6 +44,39 @@ def calc_entropy(values: List[float]) -> float:
     return -sum([pk * math.log(pk) for pk in values])
+def sanitize_json_response(response: str) -> str:
+    """Remove markdown code block formatting from JSON response and extract valid JSON."""
+    if not response or not response.strip():
+        raise ValueError("Empty response provided.")
+    # Remove leading/trailing whitespace
+    response = response.strip()
+    # Remove markdown code block markers if present
+    response = re.sub(r"^```(?:json|JSON)?\s*", "", response, flags=re.MULTILINE)
+    response = re.sub(r"\s*```$", "", response, flags=re.MULTILINE)
+    response = response.strip()
+    # Find the first opening brace
+    start_idx = response.find("{")
+    if start_idx == -1:
+        raise ValueError("No JSON object found in the response.")
+    # Try to parse JSON starting from first { with increasing end positions
+    # This handles nested objects and strings with braces correctly
+    for end_idx in range(len(response), start_idx, -1):  # Start from end and work backwards
+        candidate = response[start_idx:end_idx]
+        try:
+            parsed = json.loads(candidate)
+            # Ensure it's a dictionary (object) not just any valid JSON
+            if isinstance(parsed, dict):
+                return candidate
+        except json.JSONDecodeError:
+            continue
+    raise ValueError("No valid JSON object found in the response.")
 class EvaluateBase:
     DEFAULT_QUESTION_COUNT = 20
     DEFAULT_SAMPLE_SIZE = 10000
@@ -90,7 +124,7 @@ class EvaluateBase:
             df = response.data_frame
             if "content" not in df.columns:
-                raise ValueError("`content` column isn't found in source data")
+                raise ValueError(f"`content` column isn't found in provided sql: {gen_params['from_sql']}")
             df.rename(columns={"content": "chunk_content"}, inplace=True)
         else:
@@ -178,6 +212,7 @@ class EvaluateBase:
         test_data = self.read_from_table(test_table)
         scores = self.evaluate(test_data)
+        scores["id"] = math.floor(time.time())  # unique ID for the evaluation run
         scores["name"] = self.name
         scores["created_at"] = dt.datetime.now()
@@ -186,7 +221,7 @@ class EvaluateBase:
             to_table = params["save_to"]
             if isinstance(to_table, str):
                 to_table = Identifier(to_table)
-            self.save_to_table(to_table, scores)
+            self.save_to_table(to_table, scores.copy())
         return scores
@@ -237,9 +272,13 @@ class EvaluateRerank(EvaluateBase):
             {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
             {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
         ]
-        answer = self.llm_client.completion(messages)
+        answer = self.llm_client.completion(messages, json_output=True)
+        # Sanitize the response by removing markdown code block formatting like ```json
+        sanitized_answer = sanitize_json_response(answer)
         try:
-            output = json.loads(answer)
+            output = json.loads(sanitized_answer)
         except json.JSONDecodeError:
             raise ValueError(f"Could not parse response from LLM: {answer}")
@@ -448,9 +487,13 @@ class EvaluateDocID(EvaluateBase):
             {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
             {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
         ]
-        answer = self.llm_client.completion(messages)
+        answer = self.llm_client.completion(messages, json_output=True)
+        # Sanitize the response by removing markdown code block formatting like ```json
+        sanitized_answer = sanitize_json_response(answer)
         try:
-            output = json.loads(answer)
+            output = json.loads(sanitized_answer)
         except json.JSONDecodeError:
             raise ValueError(f"Could not parse response from LLM: {answer}")

mindsdb/interfaces/knowledge_base/llm_client.py CHANGED Viewed

@@ -54,12 +54,12 @@ class LLMClient:
             self.client = module.Handler
-    def completion(self, messages: List[dict]) -> str:
+    def completion(self, messages: List[dict], json_output: bool = False) -> str:
         """
         Call LLM completion and get response
         """
         params = self.params
+        params["json_output"] = json_output
         if self.provider in ("azure_openai", "openai"):
             response = self.client.chat.completions.create(
                 model=params["model_name"],
@@ -69,6 +69,6 @@ class LLMClient:
         else:
             kwargs = params.copy()
             model = kwargs.pop("model_name")
+            kwargs.pop("provider", None)
             response = self.client.completion(self.provider, model=model, messages=messages, args=kwargs)
             return response.choices[0].message.content

mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py CHANGED Viewed

@@ -1,16 +1,17 @@
+import re
+import html
+import asyncio
 from typing import List, Dict, Optional, Any
 import pandas as pd
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-import asyncio
+from langchain_core.documents import Document as LangchainDocument
 from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
     FileSplitter,
     FileSplitterConfig,
 )
 from mindsdb.interfaces.agents.langchain_agent import create_chat_model
 from mindsdb.interfaces.knowledge_base.preprocessing.models import (
     PreprocessingConfig,
     ProcessedChunk,
@@ -21,7 +22,6 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import (
 )
 from mindsdb.utilities import log
-from langchain_core.documents import Document as LangchainDocument
 logger = log.getLogger(__name__)
@@ -123,11 +123,11 @@ class ContextualPreprocessor(DocumentPreprocessor):
     DEFAULT_CONTEXT_TEMPLATE = """
 <document>
-{{WHOLE_DOCUMENT}}
+{WHOLE_DOCUMENT}
 </document>
 Here is the chunk we want to situate within the whole document
 <chunk>
-{{CHUNK_CONTENT}}
+{CHUNK_CONTENT}
 </chunk>
 Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
@@ -149,12 +149,20 @@ Please give a short succinct context to situate this chunk within the overall do
         self.summarize = self.config.summarize
     def _prepare_prompts(self, chunk_contents: list[str], full_documents: list[str]) -> list[str]:
-        prompts = [
-            self.context_template.replace("{{WHOLE_DOCUMENT}}", full_document) for full_document in full_documents
-        ]
-        prompts = [
-            prompt.replace("{{CHUNK_CONTENT}}", chunk_content) for prompt, chunk_content in zip(prompts, chunk_contents)
-        ]
+        def tag_replacer(match):
+            tag = match.group(0)
+            if tag.lower() not in ["<document>", "</document>", "<chunk>", "</chunk>"]:
+                return tag
+            return html.escape(tag)
+        tag_pattern = r"</?document>|</?chunk>"
+        prompts = []
+        for chunk_content, full_document in zip(chunk_contents, full_documents):
+            chunk_content = re.sub(tag_pattern, tag_replacer, chunk_content, flags=re.IGNORECASE)
+            full_document = re.sub(tag_pattern, tag_replacer, full_document, flags=re.IGNORECASE)
+            prompts.append(
+                self.DEFAULT_CONTEXT_TEMPLATE.format(WHOLE_DOCUMENT=full_document, CHUNK_CONTENT=chunk_content)
+            )
         return prompts

mindsdb/utilities/config.py CHANGED Viewed

@@ -28,6 +28,13 @@ def _merge_configs(original_config: dict, override_config: dict) -> dict:
     return original_config
+def _overwrite_configs(original_config: dict, override_config: dict) -> dict:
+    """Overwrite original config with override config."""
+    for key in list(override_config.keys()):
+        original_config[key] = override_config[key]
+    return original_config
 def create_data_dir(path: Path) -> None:
     """Create a directory and checks that it is writable.
@@ -196,6 +203,15 @@ class Config:
                     "host": "0.0.0.0",  # API server binds to all interfaces by default
                     "port": "8000",
                 },
+                "a2a": {
+                    "host": api_host,
+                    "port": 47338,
+                    "mindsdb_host": "localhost",
+                    "mindsdb_port": 47334,
+                    "agent_name": "my_agent",
+                    "project_name": "mindsdb",
+                    "enabled": False,
+                },
             },
             "cache": {"type": "local"},
             "ml_task_queue": {"type": "local"},
@@ -209,15 +225,6 @@ class Config:
             "default_llm": {},
             "default_embedding_model": {},
             "default_reranking_model": {},
-            "a2a": {
-                "host": "localhost",
-                "port": 47338,
-                "mindsdb_host": "localhost",
-                "mindsdb_port": 47334,
-                "agent_name": "my_agent",
-                "project_name": "mindsdb",
-                "enabled": False,
-            },
             "data_catalog": {
                 "enabled": False,
             },
@@ -243,12 +250,11 @@ class Config:
         """Collect config values from env vars to self._env_config"""
         self._env_config = {
             "logging": {"handlers": {"console": {}, "file": {}}},
-            "api": {"http": {"server": {}}},
+            "api": {"http": {"server": {}}, "a2a": {}},
             "auth": {},
             "paths": {},
             "permanent_storage": {},
             "ml_task_queue": {},
-            "a2a": {},
         }
         # region storage root path
@@ -390,7 +396,7 @@ class Config:
             )
         if a2a_config:
-            self._env_config["a2a"] = a2a_config
+            self._env_config["api"]["a2a"] = a2a_config
         # endregion
     def fetch_auto_config(self) -> bool:
@@ -457,47 +463,36 @@ class Config:
         _merge_configs(new_config, self._env_config)
         # Apply command-line arguments for A2A
-        cmd_args_config = {}
+        a2a_config = {}
         # Check for A2A command-line arguments
         if hasattr(self.cmd_args, "a2a_host") and self.cmd_args.a2a_host is not None:
-            if "a2a" not in cmd_args_config:
-                cmd_args_config["a2a"] = {}
-            cmd_args_config["a2a"]["host"] = self.cmd_args.a2a_host
+            a2a_config["host"] = self.cmd_args.a2a_host
         if hasattr(self.cmd_args, "a2a_port") and self.cmd_args.a2a_port is not None:
-            if "a2a" not in cmd_args_config:
-                cmd_args_config["a2a"] = {}
-            cmd_args_config["a2a"]["port"] = self.cmd_args.a2a_port
+            a2a_config["port"] = self.cmd_args.a2a_port
         if hasattr(self.cmd_args, "mindsdb_host") and self.cmd_args.mindsdb_host is not None:
-            if "a2a" not in cmd_args_config:
-                cmd_args_config["a2a"] = {}
-            cmd_args_config["a2a"]["mindsdb_host"] = self.cmd_args.mindsdb_host
+            a2a_config["mindsdb_host"] = self.cmd_args.mindsdb_host
         if hasattr(self.cmd_args, "mindsdb_port") and self.cmd_args.mindsdb_port is not None:
-            if "a2a" not in cmd_args_config:
-                cmd_args_config["a2a"] = {}
-            cmd_args_config["a2a"]["mindsdb_port"] = self.cmd_args.mindsdb_port
+            a2a_config["mindsdb_port"] = self.cmd_args.mindsdb_port
         if hasattr(self.cmd_args, "agent_name") and self.cmd_args.agent_name is not None:
-            if "a2a" not in cmd_args_config:
-                cmd_args_config["a2a"] = {}
-            cmd_args_config["a2a"]["agent_name"] = self.cmd_args.agent_name
+            a2a_config["agent_name"] = self.cmd_args.agent_name
         if hasattr(self.cmd_args, "project_name") and self.cmd_args.project_name is not None:
-            if "a2a" not in cmd_args_config:
-                cmd_args_config["a2a"] = {}
-            cmd_args_config["a2a"]["project_name"] = self.cmd_args.project_name
+            a2a_config["project_name"] = self.cmd_args.project_name
         # Merge command-line args config with highest priority
-        if cmd_args_config:
-            _merge_configs(new_config, cmd_args_config)
+        if a2a_config:
+            _merge_configs(new_config, {"api": {"a2a": a2a_config}})
         # Ensure A2A port is never 0, which would prevent the A2A API from starting
-        if "a2a" in new_config and isinstance(new_config["a2a"], dict):
-            if "port" in new_config["a2a"] and (new_config["a2a"]["port"] == 0 or new_config["a2a"]["port"] is None):
-                new_config["a2a"]["port"] = 47338  # Use the default port value
+        a2a_config = new_config["api"].get("a2a")
+        if a2a_config is not None and isinstance(a2a_config, dict):
+            if "port" in a2a_config and (a2a_config["port"] == 0 or a2a_config["port"] is None):
+                a2a_config["port"] = 47338  # Use the default port value
         # region create dirs
         for key, value in new_config["paths"].items():
@@ -522,11 +517,23 @@ class Config:
         self.ensure_auto_config_is_relevant()
         return self._config
-    def update(self, data: dict) -> None:
-        """Update calues in `auto` config"""
+    def update(self, data: dict, overwrite: bool = False) -> None:
+        """
+        Update values in `auto` config.
+        Args:
+            data (dict): data to update in `auto` config.
+            overwrite (bool): if True, overwrite existing keys, otherwise merge them.
+                - False (default): Merge recursively. Existing nested dictionaries are preserved
+                and only the specified keys in `data` are updated.
+                - True: Overwrite completely. Existing keys are replaced entirely with values
+                from `data`, discarding any nested structure not present in `data`.
+        """
         self.ensure_auto_config_is_relevant()
-        _merge_configs(self._auto_config, data)
+        if overwrite:
+            _overwrite_configs(self._auto_config, data)
+        else:
+            _merge_configs(self._auto_config, data)
         self.auto_config_path.write_text(json.dumps(self._auto_config, indent=4))

mindsdb/utilities/exception.py CHANGED Viewed

@@ -40,6 +40,7 @@ def format_db_error_message(
     db_type: str | None = None,
     db_error_msg: str | None = None,
     failed_query: str | None = None,
+    is_external: bool = True,
 ) -> str:
     """Format the error message for the database query.
@@ -48,11 +49,21 @@ def format_db_error_message(
         db_type (str | None): The type of the database.
         db_error_msg (str | None): The error message.
         failed_query (str | None): The failed query.
+        is_external (bool): True if error appeared in external database, False if in internal duckdb
     Returns:
         str: The formatted error message.
     """
     error_message = "Failed to execute external database query during query processing."
+    if is_external:
+        error_message = (
+            "An error occurred while executing a derived query on the external "
+            "database during processing of your original SQL query."
+        )
+    else:
+        error_message = (
+            "An error occurred while processing an internally generated query derived from your original SQL statement."
+        )
     if db_name is not None or db_type is not None:
         error_message += "\n\nDatabase Details:"
         if db_name is not None:

MindsDB 25.7.1.0__py3-none-any.whl → 25.7.3.0__py3-none-any.whl

Potentially problematic release.

MindsDB 25.7.1.0__py3-none-any.whl → 25.7.3.0__py3-none-any.whl