MindsDB 25.6.4.0__py3-none-any.whl → 25.7.2.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +53 -94
- mindsdb/api/a2a/agent.py +30 -206
- mindsdb/api/a2a/common/server/server.py +26 -27
- mindsdb/api/a2a/task_manager.py +93 -227
- mindsdb/api/a2a/utils.py +21 -0
- mindsdb/api/executor/command_executor.py +8 -6
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/planner/query_prepare.py +68 -87
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
- mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
- mindsdb/api/executor/utilities/sql.py +97 -21
- mindsdb/api/http/namespaces/agents.py +126 -201
- mindsdb/api/http/namespaces/config.py +12 -1
- mindsdb/api/http/namespaces/file.py +49 -24
- mindsdb/api/mcp/start.py +45 -31
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/libs/keyword_search_base.py +41 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
- mindsdb/integrations/utilities/sql_utils.py +11 -0
- mindsdb/interfaces/agents/agents_controller.py +29 -9
- mindsdb/interfaces/agents/langchain_agent.py +7 -5
- mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
- mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
- mindsdb/interfaces/database/projects.py +1 -3
- mindsdb/interfaces/functions/controller.py +54 -64
- mindsdb/interfaces/functions/to_markdown.py +47 -14
- mindsdb/interfaces/knowledge_base/controller.py +228 -110
- mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
- mindsdb/interfaces/knowledge_base/executor.py +346 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
- mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
- mindsdb/interfaces/skills/sql_agent.py +181 -130
- mindsdb/interfaces/storage/db.py +9 -7
- mindsdb/utilities/config.py +58 -40
- mindsdb/utilities/exception.py +58 -7
- mindsdb/utilities/security.py +54 -11
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0

mindsdb/interfaces/knowledge_base/executor.py (new file)

@@ -0,0 +1,346 @@
+from dataclasses import dataclass
+import copy
+from typing import List, Optional, Union
+
+from mindsdb_sql_parser.ast import (
+    BinaryOperation,
+    Identifier,
+    Constant,
+    UnaryOperation,
+    Select,
+    Star,
+    Tuple,
+    ASTNode,
+    BetweenOperation,
+    NullConstant,
+)
+import pandas as pd
+
+from mindsdb.integrations.utilities.query_traversal import query_traversal
+
+
+@dataclass
+class ConditionBlock:
+    op: str
+    items: list
+
+
+class KnowledgeBaseQueryExecutor:
+    def __init__(self, kb, content_column="content", id_column="chunk_id"):
+        self.kb = kb
+        self.content_column = content_column.lower()
+        self.id_column = id_column
+        self.limit = None
+        self._negative_set_size = 100
+        self._negative_set_threshold = 0.5
+
+    def is_content_condition(self, node: ASTNode) -> bool:
+        """
+        Checks if the node is a condition to Content column
+
+        :param node: condition to check
+        """
+        if isinstance(node, BinaryOperation):
+            if isinstance(node.args[0], Identifier):
+                parts = node.args[0].parts
+                if len(parts) == 1 and parts[0].lower() == self.content_column:
+                    return True
+        return False
+
+    @staticmethod
+    def invert_content_op(node: BinaryOperation) -> BinaryOperation:
+        # Change operator of binary operation to opposite one
+        op_map = {"=": "!=", "!=": "=", "LIKE": "!=", "NOT LIKE": "=", "IN": "NOT IN", "NOT IN": "IN"}
+        if node.op.upper() not in op_map:
+            raise NotImplementedError(f"Can't handle condition: '{str(node)}'")
+        node.op = op_map[node.op.upper()]
+        return node
+
+    def convert_unary_ops(self, node: ASTNode, callstack: List[ASTNode], **kwargs) -> ASTNode:
+        """
+        Tries to remove unary operator and apply it to Binary operation.
+        Supported cases:
+        - "NOT content <op> value" => "content <!op> value"
+        - "content <op> NOT value" => "content <!op> value"
+
+        Where <!op> is inverted operator of <op>
+        """
+
+        if isinstance(node, UnaryOperation):
+            if node.op.upper() == "NOT":
+                # two options:
+                # 1. NOT content <op> value
+                if self.is_content_condition(node.args[0]):
+                    self.invert_content_op(node.args[0])
+                    return node.args[0]
+
+                # 2. content <op> NOT value
+                if self.is_content_condition(callstack[0]):
+                    self.invert_content_op(callstack[0])
+                    return node.args[0]
+
+    def union(self, results: List[pd.DataFrame]) -> pd.DataFrame:
+        # combine dataframes from input list to single one
+
+        if len(results) == 1:
+            return results[0]
+
+        res = pd.concat(results)
+        df = res.drop_duplicates(subset=[self.id_column]).reset_index()
+        return df
+
+    def intersect(self, results: List[pd.DataFrame]) -> pd.DataFrame:
+        # intersect dataframes from input list: return dataframe with rows that exist in all input dataframes
+
+        if len(results) == 1:
+            return results[0]
+
+        item = results[0]
+        for item2 in results[1:]:
+            item = item[item[self.id_column].isin(item2[self.id_column])]
+
+        df = item
+        return df
+
+    @classmethod
+    def flatten_conditions(cls, node: ASTNode) -> Union[ASTNode, ConditionBlock]:
+        """
+        Recursively inspect conditions tree and move conditions related to 'OR' or 'AND' operators of the same level
+        to same ConditionBlock
+        Example: or (a=1, or (b=2, c=3))
+        is converted to: ConditionBlock(or, [a=1, b=2, c=3])
+        """
+
+        if isinstance(node, BinaryOperation):
+            op = node.op.upper()
+            if op in ("AND", "OR"):
+                block = ConditionBlock(op, [])
+                for arg in node.args:
+                    item = cls.flatten_conditions(arg)
+                    if isinstance(item, ConditionBlock):
+                        if item.op == block.op:
+                            block.items.extend(item.items)
+                        else:
+                            # new type of block
+                            block.items.append(item)
+                    else:
+                        block.items.append(item)
+                return block
+            else:
+                node.op = node.op.upper()
+                return node
+
+        elif isinstance(node, BetweenOperation):
+            block = ConditionBlock(
+                "AND",
+                [
+                    BinaryOperation(">=", args=[node.args[0], node.args[1]]),
+                    BinaryOperation("<=", args=[node.args[0], node.args[2]]),
+                ],
+            )
+            return block
+
+        raise NotImplementedError(f"Unknown node '{node}'")
+
+    def call_kb(
+        self, conditions: List[BinaryOperation], disable_reranking: bool = False, limit: int = None
+    ) -> pd.DataFrame:
+        """
+        Call KB with list of prepared conditions
+
+        :param conditions: input conditions
+        :param disable_reranking: flag to disable reranking
+        :param limit: use custom limit
+        :return: result of querying KB
+        """
+
+        where = None
+        for condition in conditions:
+            if where is None:
+                where = condition
+            else:
+                where = BinaryOperation("AND", args=[where, condition])
+
+        query = Select(targets=[Star()], where=where)
+
+        if limit is not None:
+            query.limit = Constant(limit)
+        elif self.limit is not None:
+            query.limit = Constant(self.limit)
+
+        return self.kb.select(query, disable_reranking=disable_reranking)
+
+    def execute_content_condition(
+        self,
+        content_condition: BinaryOperation,
+        other_conditions: List[BinaryOperation] = None,
+        disable_reranking: bool = False,
+        limit: int = None,
+    ) -> pd.DataFrame:
+        """
+        Call KB using content condition. Only positive conditions for content can be here.
+        Negative conditions can be applied only as a filter of ID
+        :param content_condition: condition for Content column
+        :param other_conditions: conditions for other columns
+        :param disable_reranking: turn off reranking
+        :param limit: override default limit
+        :return: result of the query
+        """
+
+        if other_conditions is None:
+            other_conditions = []
+
+        if content_condition.op == "IN":
+            # (select where content = 'a') UNION (select where content = 'b')
+            results = []
+            for el in content_condition.args[1].items:
+                el_cond = BinaryOperation(op="=", args=[Identifier(self.content_column), el])
+                results.append(
+                    self.call_kb([el_cond] + other_conditions, disable_reranking=disable_reranking, limit=limit)
+                )
+            return self.union(results)
+
+        elif content_condition.op in ("=", "LIKE"):
+            # just '='
+            content_condition2 = copy.deepcopy(content_condition)
+            content_condition2.op = "="
+            return self.call_kb([content_condition2] + other_conditions)
+
+        elif content_condition.op == "IS" and isinstance(content_condition.args[1], NullConstant):
+            # return empty dataset, call to get column names
+            return self.call_kb([], limit=1)[:0]
+        elif content_condition.op == "IS NOT" and isinstance(content_condition.args[1], NullConstant):
+            # execute without conditions
+            return self.call_kb([])
+        else:
+            raise NotImplementedError(
+                f'Operator "{content_condition.op}" is not supported for condition: {content_condition}'
+            )
+
+    def to_excluded_ids(
+        self, content_condition: BinaryOperation, other_conditions: List[BinaryOperation]
+    ) -> Optional[List[str]]:
+        """
+        Handles negative conditions for content. If it is a negative condition: extract and return list of IDs
+        that have to be excluded by parent query
+
+        :param content_condition: condition for Content column
+        :param other_conditions: conditions for other columns
+        :return: list of IDs to exclude or None
+        """
+
+        if content_condition.op in ("!=", "<>", "NOT LIKE"):
+            # id NOT IN (
+            #     SELECT id FROM kb WHERE content = '...' limit X
+            # )
+            el_cond = BinaryOperation(op="=", args=content_condition.args)
+            threshold = BinaryOperation(op=">=", args=[Identifier("relevance"), Constant(self._negative_set_threshold)])
+            res = self.call_kb(
+                [el_cond, threshold] + other_conditions, disable_reranking=True, limit=self._negative_set_size
+            )
+
+            return list(res[self.id_column])
+
+        elif content_condition.op == "NOT IN":
+            # id NOT IN (
+            #     select id where content in ('a', 'b')
+            # )
+            content_condition2 = copy.deepcopy(content_condition)
+            content_condition2.op = "IN"
+
+            threshold = BinaryOperation(op=">=", args=[Identifier("relevance"), Constant(self._negative_set_threshold)])
+            res = self.execute_content_condition(
+                content_condition2,
+                other_conditions + [threshold],
+                disable_reranking=True,
+                limit=self._negative_set_size,
+            )
+
+            return list(res[self.id_column])
+        else:
+            return None
+
+    def execute_blocks(self, block: ConditionBlock) -> pd.DataFrame:
+        """
+        Split block to set of calls with conditions and execute them. Nested blocks are supported
+
+        :param block:
+        :return: dataframe with result of block execution
+        """
+
+        if not isinstance(block, ConditionBlock):
+            # single condition
+            if self.is_content_condition(block):
+                return self.execute_content_condition(block)
+            else:
+                return self.call_kb([block])
+
+        if block.op == "AND":
+            results = []
+
+            content_filters, other_filters = [], []
+            for item in block.items:
+                if isinstance(item, ConditionBlock):
+                    results.append(self.execute_blocks(item))
+                else:
+                    if self.is_content_condition(item):
+                        content_filters.append(item)
+                    else:
+                        other_filters.append(item)
+            if len(content_filters) > 0:
+                content_filters2 = []
+                exclude_ids = set()
+                # exclude content conditions
+                for condition in content_filters:
+                    ids = self.to_excluded_ids(condition, other_filters)
+                    if ids is not None:
+                        exclude_ids.update(ids)
+                    else:
+                        # keep origin content filter
+                        content_filters2.append(condition)
+
+                if exclude_ids:
+                    # add to filter
+                    values = [Constant(i) for i in exclude_ids]
+                    condition = BinaryOperation(op="NOT IN", args=[Identifier(self.id_column), Tuple(values)])
+                    other_filters.append(condition)
+                # execute content filters
+                for condition in content_filters2:
+                    result = self.execute_content_condition(condition, other_filters)
+                    results.append(result)
+            elif len(other_filters) > 0:
+                results.append(self.call_kb(other_filters))
+
+            return self.intersect(results)
+
+        elif block.op == "OR":
+            results = []
+            for item in block.items:
+                results.append(self.execute_blocks(item))
+
+            return self.union(results)
+
+    def run(self, query: Select) -> pd.DataFrame:
+        """
+        Plan and execute query to KB. If query has complex conditions:
+        - convert them to several queries with simple conditions, execute them and combine results
+
+        Stages:
+        - Remove unary NOT from condition: try to apply it to related operator
+        - Flatten conditions tree: convert into condition blocks,
+          keeping same operators of the same level in the same block
+        - Recursively execute blocks:
+          - get data from OR blocks and union them
+          - get data from AND blocks and intersect them
+
+        :param query: select query
+        :return: results
+        """
+        if query.where is not None:
+            query_traversal(query.where, self.convert_unary_ops)
+            blocks_tree = self.flatten_conditions(query.where)
+            if query.limit is not None:
+                self.limit = query.limit.value
+            return self.execute_blocks(blocks_tree)
+        else:
+            return self.kb.select(query)
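The run() docstring above lays out the executor's plan: strip unary NOTs, flatten the condition tree into blocks, then execute blocks and combine results (union for OR, intersect for AND). A minimal, self-contained sketch of the flattening stage, using a hypothetical Cond class in place of mindsdb_sql_parser's BinaryOperation so it runs without MindsDB installed, shows how nested same-operator conditions collapse into one block:

    # Hypothetical stand-ins: Cond mirrors BinaryOperation, flatten() mirrors
    # KnowledgeBaseQueryExecutor.flatten_conditions; not MindsDB's actual API.
    from dataclasses import dataclass, field

    @dataclass
    class Cond:
        op: str                      # "AND", "OR", or a comparison such as "="
        args: list = field(default_factory=list)

    @dataclass
    class ConditionBlock:            # same shape as the dataclass added above
        op: str
        items: list

    def flatten(node):
        """Collapse nested AND/OR nodes with the same operator into one block."""
        if node.op.upper() in ("AND", "OR"):
            block = ConditionBlock(node.op.upper(), [])
            for arg in node.args:
                item = flatten(arg)
                if isinstance(item, ConditionBlock) and item.op == block.op:
                    block.items.extend(item.items)  # merge same-operator child
                else:
                    block.items.append(item)
            return block
        return node                  # leaf comparison: returned unchanged

    # or(a=1, or(b=2, c=3))  ->  ConditionBlock("OR", [a=1, b=2, c=3])
    tree = Cond("OR", [Cond("=", ["a", 1]), Cond("OR", [Cond("=", ["b", 2]), Cond("=", ["c", 3])])])
    print(flatten(tree))             # one OR block holding three leaf conditions

Executing that single OR block then takes one union over three simple KB calls instead of a nested evaluation.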
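to_excluded_ids handles the awkward case of negative content filters: a similarity index cannot answer content != 'x' directly, so the executor runs the positive query with a relevance floor (_negative_set_threshold, 0.5) and a capped result set (_negative_set_size, 100), and the parent query then excludes the returned IDs. A toy sketch of that inversion, with a hypothetical search() standing in for call_kb and pre-baked relevance scores so it runs standalone:

    # Toy model of the negative-filter trick; search() is an assumed stand-in
    # for call_kb and returns pre-baked rows rather than real vector matches.
    def search(value: str, limit: int, min_relevance: float) -> list:
        corpus = [
            {"chunk_id": "c1", "content": "apples and pears", "relevance": 0.9},
            {"chunk_id": "c2", "content": "mostly apples", "relevance": 0.7},
            {"chunk_id": "c3", "content": "bananas", "relevance": 0.2},
        ]
        return [row for row in corpus if row["relevance"] >= min_relevance][:limit]

    def excluded_ids_for(value: str, size: int = 100, threshold: float = 0.5) -> set:
        # "content != value" is rewritten as "chunk_id NOT IN (top matches for value)"
        return {row["chunk_id"] for row in search(value, size, threshold)}

    print(excluded_ids_for("apples"))  # {'c1', 'c2'}: close matches get excluded upstream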
mindsdb/interfaces/knowledge_base/llm_client.py

@@ -36,8 +36,11 @@ class LLMClient:
             )
         elif self.provider == "openai":
             openai_api_key = params.get("api_key") or os.getenv("OPENAI_API_KEY")
+            kwargs = {"api_key": openai_api_key, "max_retries": 2}
             base_url = params.get("base_url")
-
+            if base_url:
+                kwargs["base_url"] = base_url
+            self.client = OpenAI(**kwargs)
 
         else:
             # try to use litellm
@@ -67,9 +70,5 @@ class LLMClient:
         kwargs = params.copy()
         model = kwargs.pop("model_name")
 
-
-        if base_url is not None:
-            kwargs["api_base"] = base_url
-
-        response = self.client.completion(model=f"{self.provider}/{model}", messages=messages, args=kwargs)
+        response = self.client.completion(self.provider, model=model, messages=messages, args=kwargs)
         return response.choices[0].message.content
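The first hunk builds the OpenAI client from a kwargs dict so that base_url reaches the constructor only when it is actually configured, rather than being passed as None. A minimal sketch of that pattern, with a stand-in client class so the snippet runs without the openai package installed:

    import os

    class StubOpenAI:
        # stand-in for openai.OpenAI; accepts the same keyword arguments
        def __init__(self, api_key=None, base_url=None, max_retries=2):
            self.api_key, self.base_url, self.max_retries = api_key, base_url, max_retries

    def build_client(params: dict) -> StubOpenAI:
        kwargs = {"api_key": params.get("api_key") or os.getenv("OPENAI_API_KEY"), "max_retries": 2}
        base_url = params.get("base_url")
        if base_url:  # only forward base_url when explicitly set
            kwargs["base_url"] = base_url
        return StubOpenAI(**kwargs)

    client = build_client({"api_key": "sk-test", "base_url": "http://localhost:11434/v1"})
    assert client.base_url == "http://localhost:11434/v1"
    assert build_client({"api_key": "sk-test"}).base_url is None  # client default preserved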
mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py

@@ -31,17 +31,10 @@ _DEFAULT_CONTENT_COLUMN_NAME = "content"
 class DocumentPreprocessor:
     """Base class for document preprocessing"""
 
-    RESERVED_METADATA_FIELDS = {
-        "content",
-        "id",
-        "embeddings",
-        "original_doc_id",
-        "chunk_index",
-    }
-
     def __init__(self):
         """Initialize preprocessor"""
         self.splitter = None  # Will be set by child classes
+        self.config = None
 
     def process_documents(self, documents: List[Document]) -> List[ProcessedChunk]:
         """Base implementation - should be overridden by child classes
@@ -57,15 +50,10 @@ class DocumentPreprocessor:
             raise ValueError("Splitter not configured")
 
         # Convert to langchain Document for splitting
-        langchain_doc = LangchainDocument(
-            page_content=doc.content, metadata=doc.metadata or {}
-        )
+        langchain_doc = LangchainDocument(page_content=doc.content, metadata=doc.metadata or {})
         # Split and convert back to our Document type
         split_docs = self.splitter.split_documents([langchain_doc])
-        return [
-            Document(content=split_doc.page_content, metadata=split_doc.metadata)
-            for split_doc in split_docs
-        ]
+        return [Document(content=split_doc.page_content, metadata=split_doc.metadata) for split_doc in split_docs]
 
     def _get_source(self) -> str:
         """Get the source identifier for this preprocessor"""
@@ -118,14 +106,14 @@ class DocumentPreprocessor:
 
         # Always preserve original document ID
         if doc_id is not None:
-            metadata[
+            metadata[self.config.doc_id_column_name] = doc_id
 
         # Add chunk index only for multi-chunk cases
         if chunk_index is not None:
-            metadata["
+            metadata["_chunk_index"] = chunk_index
 
         # Always set source
-        metadata["
+        metadata["_source"] = self._get_source()
 
         return metadata
 
@@ -148,9 +136,7 @@ Please give a short succinct context to situate this chunk within the overall docu
         super().__init__()
         self.config = config
         self.splitter = FileSplitter(
-            FileSplitterConfig(
-                chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap
-            )
+            FileSplitterConfig(chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap)
         )
         self.llm = create_chat_model(
             {
@@ -162,28 +148,22 @@ Please give a short succinct context to situate this chunk within the overall docu
         self.context_template = config.context_template or self.DEFAULT_CONTEXT_TEMPLATE
         self.summarize = self.config.summarize
 
-    def _prepare_prompts(
-        self, chunk_contents: list[str], full_documents: list[str]
-    ) -> list[str]:
+    def _prepare_prompts(self, chunk_contents: list[str], full_documents: list[str]) -> list[str]:
         prompts = [
-            self.context_template.replace("{{WHOLE_DOCUMENT}}", full_document)
-            for full_document in full_documents
+            self.context_template.replace("{{WHOLE_DOCUMENT}}", full_document) for full_document in full_documents
         ]
         prompts = [
-            prompt.replace("{{CHUNK_CONTENT}}", chunk_content)
-            for prompt, chunk_content in zip(prompts, chunk_contents)
+            prompt.replace("{{CHUNK_CONTENT}}", chunk_content) for prompt, chunk_content in zip(prompts, chunk_contents)
         ]
 
         return prompts
 
-    def _generate_context(
-        self, chunk_contents: list[str], full_documents: list[str]
-    ) -> list[str]:
+    def _generate_context(self, chunk_contents: list[str], full_documents: list[str]) -> list[str]:
         """Generate contextual description for a chunk using LLM"""
         prompts = self._prepare_prompts(chunk_contents, full_documents)
 
         # Check if LLM supports async
-        if hasattr(self.llm,
+        if hasattr(self.llm, "abatch"):
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
             try:
@@ -211,7 +191,6 @@ Please give a short succinct context to situate this chunk within the overall docu
         processed_chunks = []
 
         for doc_index, doc in enumerate(documents):
-
             # Document ID must be provided by this point
             if doc.id is None:
                 raise ValueError("Document ID must be provided before preprocessing")
@@ -247,12 +226,8 @@ Please give a short succinct context to situate this chunk within the overall docu
         chunk_contents = [chunk_doc.content for chunk_doc in chunks_list]
         contexts = self._generate_context(chunk_contents, doc_contents)
 
-        for context, chunk_doc, chunk_index, doc_index in zip(
-            contexts, chunks_list, chunk_index_list, doc_index_list
-        ):
-            processed_content = (
-                context if self.summarize else f"{context}\n\n{chunk_doc.content}"
-            )
+        for context, chunk_doc, chunk_index, doc_index in zip(contexts, chunks_list, chunk_index_list, doc_index_list):
+            processed_content = context if self.summarize else f"{context}\n\n{chunk_doc.content}"
             doc = documents[doc_index]
 
             # Initialize metadata
@@ -261,7 +236,7 @@ Please give a short succinct context to situate this chunk within the overall docu
             metadata.update(doc.metadata)
 
             # Get content_column from metadata or use default
-            content_column = metadata.get(
+            content_column = metadata.get("_content_column")
             if content_column is None:
                 # If content_column is not in metadata, use the default column name
                 content_column = _DEFAULT_CONTENT_COLUMN_NAME
@@ -305,7 +280,6 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
         processed_chunks = []
 
         for doc in documents:
-
             # Document ID must be provided by this point
             if doc.id is None:
                 raise ValueError("Document ID must be provided before preprocessing")
@@ -334,13 +308,13 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
             metadata.update(doc.metadata)
 
             # Add position metadata
-            metadata["
-            metadata["
+            metadata["_start_char"] = start_char
+            metadata["_end_char"] = end_char
 
             # Get content_column from metadata or use default
             content_column = None
             if doc.metadata:
-                content_column = doc.metadata.get(
+                content_column = doc.metadata.get("_content_column")
 
             if content_column is None:
                 # If content_column is not in metadata, use the default column name
@@ -353,7 +327,7 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
                 start_char=start_char,
                 end_char=end_char,
                 provided_id=doc.id,
-                content_column=content_column
+                content_column=content_column,
             )
 
             processed_chunks.append(
@@ -392,6 +366,7 @@ class PreprocessorFactory:
         elif config.type == PreprocessorType.JSON_CHUNKING:
             # Import here to avoid circular imports
            from mindsdb.interfaces.knowledge_base.preprocessing.json_chunker import JSONChunkingPreprocessor
+
             return JSONChunkingPreprocessor(config.json_chunking_config)
         else:
             raise ValueError(f"Unknown preprocessor type: {config.type}")
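Across the preprocessor hunks the reserved-key handling changes shape: the RESERVED_METADATA_FIELDS set is removed, and internal bookkeeping keys are written with a leading underscore (_chunk_index, _source, _start_char, _end_char, _content_column), presumably so they cannot collide with user-supplied metadata. A short sketch of that convention; build_chunk_metadata is a hypothetical helper, not MindsDB's API:

    # Hypothetical helper illustrating the underscore-prefix convention above.
    def build_chunk_metadata(user_metadata: dict, chunk_index: int, source: str,
                             start_char: int, end_char: int) -> dict:
        metadata = dict(user_metadata)       # copy user keys defensively
        metadata["_chunk_index"] = chunk_index
        metadata["_source"] = source
        metadata["_start_char"] = start_char
        metadata["_end_char"] = end_char
        return metadata

    meta = build_chunk_metadata({"author": "alice", "source": "crm"}, 0, "TextChunkingPreprocessor", 0, 512)
    # The user's own "source" key survives next to the internal "_source" key.
    assert meta["source"] == "crm" and meta["_source"] == "TextChunkingPreprocessor"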