MindsDB 25.9.2.0a1__py3-none-any.whl → 25.9.3rc1__py3-none-any.whl
This diff shows the changes between two publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +39 -20
- mindsdb/api/a2a/agent.py +7 -9
- mindsdb/api/a2a/common/server/server.py +3 -3
- mindsdb/api/a2a/common/server/task_manager.py +4 -4
- mindsdb/api/a2a/task_manager.py +15 -17
- mindsdb/api/common/middleware.py +9 -11
- mindsdb/api/executor/command_executor.py +2 -4
- mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +100 -48
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/exceptions.py +29 -10
- mindsdb/api/executor/planner/plan_join.py +17 -3
- mindsdb/api/executor/sql_query/sql_query.py +74 -74
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
- mindsdb/api/executor/utilities/functions.py +6 -6
- mindsdb/api/executor/utilities/sql.py +32 -16
- mindsdb/api/http/gui.py +5 -11
- mindsdb/api/http/initialize.py +8 -10
- mindsdb/api/http/namespaces/agents.py +10 -12
- mindsdb/api/http/namespaces/analysis.py +13 -20
- mindsdb/api/http/namespaces/auth.py +1 -1
- mindsdb/api/http/namespaces/config.py +15 -11
- mindsdb/api/http/namespaces/databases.py +140 -201
- mindsdb/api/http/namespaces/file.py +15 -4
- mindsdb/api/http/namespaces/handlers.py +7 -2
- mindsdb/api/http/namespaces/knowledge_bases.py +8 -7
- mindsdb/api/http/namespaces/models.py +94 -126
- mindsdb/api/http/namespaces/projects.py +13 -22
- mindsdb/api/http/namespaces/sql.py +33 -25
- mindsdb/api/http/namespaces/tab.py +27 -37
- mindsdb/api/http/namespaces/views.py +1 -1
- mindsdb/api/http/start.py +14 -8
- mindsdb/api/mcp/__init__.py +2 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
- mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +6 -13
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +40 -28
- mindsdb/integrations/handlers/byom_handler/byom_handler.py +168 -185
- mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
- mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +13 -1
- mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +25 -12
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
- mindsdb/integrations/libs/api_handler.py +10 -10
- mindsdb/integrations/libs/base.py +4 -4
- mindsdb/integrations/libs/llm/utils.py +2 -2
- mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
- mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
- mindsdb/integrations/libs/process_cache.py +132 -140
- mindsdb/integrations/libs/response.py +18 -12
- mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
- mindsdb/integrations/utilities/files/file_reader.py +6 -7
- mindsdb/integrations/utilities/rag/config_loader.py +37 -26
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +59 -9
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
- mindsdb/integrations/utilities/rag/settings.py +58 -133
- mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
- mindsdb/interfaces/agents/agents_controller.py +2 -1
- mindsdb/interfaces/agents/constants.py +0 -2
- mindsdb/interfaces/agents/litellm_server.py +34 -58
- mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
- mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
- mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
- mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
- mindsdb/interfaces/chatbot/polling.py +30 -18
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +10 -10
- mindsdb/interfaces/database/integrations.py +19 -2
- mindsdb/interfaces/file/file_controller.py +6 -6
- mindsdb/interfaces/functions/controller.py +1 -1
- mindsdb/interfaces/functions/to_markdown.py +2 -2
- mindsdb/interfaces/jobs/jobs_controller.py +5 -5
- mindsdb/interfaces/jobs/scheduler.py +3 -8
- mindsdb/interfaces/knowledge_base/controller.py +50 -23
- mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
- mindsdb/interfaces/model/model_controller.py +170 -166
- mindsdb/interfaces/query_context/context_controller.py +14 -2
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +6 -4
- mindsdb/interfaces/skills/retrieval_tool.py +43 -50
- mindsdb/interfaces/skills/skill_tool.py +2 -2
- mindsdb/interfaces/skills/sql_agent.py +25 -19
- mindsdb/interfaces/storage/fs.py +114 -169
- mindsdb/interfaces/storage/json.py +19 -18
- mindsdb/interfaces/tabs/tabs_controller.py +49 -72
- mindsdb/interfaces/tasks/task_monitor.py +3 -9
- mindsdb/interfaces/tasks/task_thread.py +7 -9
- mindsdb/interfaces/triggers/trigger_task.py +7 -13
- mindsdb/interfaces/triggers/triggers_controller.py +47 -50
- mindsdb/migrations/migrate.py +16 -16
- mindsdb/utilities/api_status.py +58 -0
- mindsdb/utilities/config.py +49 -0
- mindsdb/utilities/exception.py +40 -1
- mindsdb/utilities/fs.py +0 -1
- mindsdb/utilities/hooks/profiling.py +17 -14
- mindsdb/utilities/langfuse.py +40 -45
- mindsdb/utilities/log.py +272 -0
- mindsdb/utilities/ml_task_queue/consumer.py +52 -58
- mindsdb/utilities/ml_task_queue/producer.py +26 -30
- mindsdb/utilities/render/sqlalchemy_render.py +7 -6
- mindsdb/utilities/utils.py +2 -2
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/METADATA +269 -264
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/RECORD +115 -115
- mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/WHEEL +0 -0
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/top_level.txt +0 -0
Lines marked `…` below were truncated in the diff source and could not be recovered.

mindsdb/integrations/utilities/rag/config_loader.py

```diff
@@ -1,17 +1,26 @@
 """Utility functions for RAG pipeline configuration"""
+
 from typing import Dict, Any, Optional
 
 from mindsdb.utilities.log import getLogger
 from mindsdb.integrations.utilities.rag.settings import (
-    RetrieverType,
-    …
-    …
+    RetrieverType,
+    MultiVectorRetrieverMode,
+    SearchType,
+    SearchKwargs,
+    SummarizationConfig,
+    VectorStoreConfig,
+    RerankerConfig,
+    RAGPipelineModel,
+    DEFAULT_COLLECTION_NAME,
 )
 
 logger = getLogger(__name__)
 
 
-def load_rag_config(base_config: Dict[str, Any], kb_params: Optional[Dict[str, Any]] = None, embedding_model: Any = None) -> RAGPipelineModel:
+def load_rag_config(
+    base_config: Dict[str, Any], kb_params: Optional[Dict[str, Any]] = None, embedding_model: Any = None
+) -> RAGPipelineModel:
     """
     Load and validate RAG configuration parameters. This function handles the conversion of configuration
     parameters into their appropriate types and ensures all required settings are properly configured.
@@ -37,41 +46,43 @@ def load_rag_config(base_config: Dict[str, Any], kb_params: Optional[Dict[str, A
 
     # Set embedding model if provided
     if embedding_model is not None:
-        rag_params[…
+        rag_params["embedding_model"] = embedding_model
 
     # Handle enums and type conversions
-    if …
-        rag_params[…
-    if …
-        rag_params[…
-    if …
-        rag_params[…
+    if "retriever_type" in rag_params:
+        rag_params["retriever_type"] = RetrieverType(rag_params["retriever_type"])
+    if "multi_retriever_mode" in rag_params:
+        rag_params["multi_retriever_mode"] = MultiVectorRetrieverMode(rag_params["multi_retriever_mode"])
+    if "search_type" in rag_params:
+        rag_params["search_type"] = SearchType(rag_params["search_type"])
 
     # Handle search kwargs if present
-    if …
-        rag_params[…
+    if "search_kwargs" in rag_params and isinstance(rag_params["search_kwargs"], dict):
+        rag_params["search_kwargs"] = SearchKwargs(**rag_params["search_kwargs"])
 
     # Handle summarization config if present
-    summarization_config = rag_params.get(…
+    summarization_config = rag_params.get("summarization_config")
     if summarization_config is not None and isinstance(summarization_config, dict):
-        rag_params[…
+        rag_params["summarization_config"] = SummarizationConfig(**summarization_config)
 
     # Handle vector store config
-    if …
-        if isinstance(rag_params[…
-            rag_params[…
+    if "vector_store_config" in rag_params:
+        if isinstance(rag_params["vector_store_config"], dict):
+            rag_params["vector_store_config"] = VectorStoreConfig(**rag_params["vector_store_config"])
     else:
-        rag_params[…
-        logger.warning(…
-            …
-            …
+        rag_params["vector_store_config"] = {}
+        logger.warning(
+            f"No collection_name specified for the retrieval tool, "
+            f"using default collection_name: '{DEFAULT_COLLECTION_NAME}'"
+            f"\nWarning: If this collection does not exist, no data will be retrieved"
+        )
 
-    if …
-        rag_params[…
+    if "reranker_config" in rag_params:
+        rag_params["reranker_config"] = RerankerConfig(**rag_params["reranker_config"])
 
     # Convert to RAGPipelineModel with validation
     try:
         return RAGPipelineModel(**rag_params)
     except Exception as e:
-        logger.…
-        raise ValueError(f"Configuration validation failed: {str(e)}")
+        logger.exception("Invalid RAG configuration:")
+        raise ValueError(f"Configuration validation failed: {str(e)}") from e
```
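The pattern worth noting in `load_rag_config` is that every user-supplied string or dict is promoted to a typed value (an enum, `SearchKwargs`, `VectorStoreConfig`, and so on) before `RAGPipelineModel` validates the whole bundle. A minimal, self-contained sketch of that coercion step, using a stand-in enum rather than the real definitions in `rag.settings`:

```python
from enum import Enum
from typing import Any, Dict


class RetrieverType(Enum):
    # Stand-in for the real enum imported from rag.settings.
    VECTOR_STORE = "vector_store"
    AUTO = "auto"


def coerce_params(rag_params: Dict[str, Any]) -> Dict[str, Any]:
    # Raw strings from user config become typed enum members; an unknown
    # value raises ValueError, which the loader catches and re-raises with context.
    if "retriever_type" in rag_params:
        rag_params["retriever_type"] = RetrieverType(rag_params["retriever_type"])
    return rag_params


print(coerce_params({"retriever_type": "auto"}))  # {'retriever_type': <RetrieverType.AUTO: 'auto'>}
```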
mindsdb/integrations/utilities/rag/rerankers/base_reranker.py

```diff
@@ -13,7 +13,15 @@ from typing import Any, List, Optional, Tuple
 from openai import AsyncOpenAI, AsyncAzureOpenAI
 from pydantic import BaseModel
 
-from mindsdb.integrations.utilities.rag.settings import …
+from mindsdb.integrations.utilities.rag.settings import (
+    DEFAULT_RERANKING_MODEL,
+    DEFAULT_LLM_ENDPOINT,
+    DEFAULT_RERANKER_N,
+    DEFAULT_RERANKER_LOGPROBS,
+    DEFAULT_RERANKER_TOP_LOGPROBS,
+    DEFAULT_RERANKER_MAX_TOKENS,
+    DEFAULT_VALID_CLASS_TOKENS,
+)
 from mindsdb.integrations.libs.base import BaseMLEngine
 
 log = logging.getLogger(__name__)
@@ -38,6 +46,11 @@ class BaseLLMReranker(BaseModel, ABC):
     request_timeout: float = 20.0  # Timeout for API requests
     early_stop: bool = True  # Whether to enable early stopping
     early_stop_threshold: float = 0.8  # Confidence threshold for early stopping
+    n: int = DEFAULT_RERANKER_N  # Number of completions to generate
+    logprobs: bool = DEFAULT_RERANKER_LOGPROBS  # Whether to include log probabilities
+    top_logprobs: int = DEFAULT_RERANKER_TOP_LOGPROBS  # Number of top log probabilities to include
+    max_tokens: int = DEFAULT_RERANKER_MAX_TOKENS  # Maximum tokens to generate
+    valid_class_tokens: List[str] = DEFAULT_VALID_CLASS_TOKENS
 
     class Config:
         arbitrary_types_allowed = True
@@ -142,7 +155,7 @@ class BaseLLMReranker(BaseModel, ABC):
             return ranked_results
         except Exception as e:
             # Don't let early stopping errors stop the whole process
-            log.warning(f"Error in early stopping check: {…
+            log.warning(f"Error in early stopping check: {e}")
 
         return ranked_results
 
@@ -234,6 +247,28 @@ class BaseLLMReranker(BaseModel, ABC):
         return rerank_data
 
     async def search_relevancy_score(self, query: str, document: str) -> Any:
+        """
+        This method is used to score the relevance of a document to a query.
+
+        Args:
+            query: The query to score the relevance of.
+            document: The document to score the relevance of.
+
+        Returns:
+            A dictionary with the document and the relevance score.
+        """
+
+        log.debug("Start search_relevancy_score")
+        log.debug(f"Reranker query: {query[:5]}")
+        log.debug(f"Reranker document: {document[:50]}")
+        log.debug(f"Reranker model: {self.model}")
+        log.debug(f"Reranker temperature: {self.temperature}")
+        log.debug(f"Reranker n: {self.n}")
+        log.debug(f"Reranker logprobs: {self.logprobs}")
+        log.debug(f"Reranker top_logprobs: {self.top_logprobs}")
+        log.debug(f"Reranker max_tokens: {self.max_tokens}")
+        log.debug(f"Reranker valid_class_tokens: {self.valid_class_tokens}")
+
         response = await self.client.chat.completions.create(
             model=self.model,
             messages=[
@@ -306,17 +341,30 @@ class BaseLLMReranker(BaseModel, ABC):
                 },
             ],
             temperature=self.temperature,
-            n=…
-            logprobs=…
-            top_logprobs=…
-            max_tokens=…
+            n=self.n,
+            logprobs=self.logprobs,
+            top_logprobs=self.top_logprobs,
+            max_tokens=self.max_tokens,
         )
 
         # Extract response and logprobs
         token_logprobs = response.choices[0].logprobs.content
-        …
-        …
-        …
+
+        # Find the token that contains the class number
+        # Instead of just taking the last token, search for the actual class number token
+        class_token_logprob = None
+        for token_logprob in reversed(token_logprobs):
+            if token_logprob.token in self.valid_class_tokens:
+                class_token_logprob = token_logprob
+                break
+
+        # If we couldn't find a class token, fall back to the last non-empty token
+        if class_token_logprob is None:
+            log.warning("No class token logprob found, using the last token as fallback")
+            class_token_logprob = token_logprobs[-1]
+
+        top_logprobs = class_token_logprob.top_logprobs
+
         # Create a map of 'class_1' -> probability, using token combinations
         class_probs = {}
         for top_token in top_logprobs:
```
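The replacement logic no longer assumes the class label is the last token of the completion: it walks `response.choices[0].logprobs.content` backwards until it hits a token from `valid_class_tokens`, and only falls back to the last token when none is found. A self-contained sketch of that scan, with a plain dataclass standing in for the OpenAI logprob objects:

```python
import math
from dataclasses import dataclass
from typing import List


@dataclass
class TokenLogprob:
    # Stand-in for the token/logprob entries on response.choices[0].logprobs.content.
    token: str
    logprob: float


def find_class_token(token_logprobs: List[TokenLogprob], valid_class_tokens: List[str]) -> TokenLogprob:
    # Walk the completion backwards so the final class label wins even when
    # the model emitted extra prose around it; fall back to the last token.
    for tl in reversed(token_logprobs):
        if tl.token in valid_class_tokens:
            return tl
    return token_logprobs[-1]


tokens = [TokenLogprob("class_", -0.20), TokenLogprob("1", -0.05)]
chosen = find_class_token(tokens, valid_class_tokens=["0", "1", "2", "3"])
print(chosen.token, round(math.exp(chosen.logprob), 3))  # 1 0.951
```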
mindsdb/integrations/utilities/rag/rerankers/base_reranker.py (continued)

```diff
@@ -337,6 +385,8 @@ class BaseLLMReranker(BaseModel, ABC):
             score = 0.0
 
         rerank_data = {"document": document, "relevance_score": score}
+        log.debug(f"Reranker score: {score}")
+        log.debug("End search_relevancy_score")
         return rerank_data
 
     def get_scores(self, query: str, documents: list[str]):
```
mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py

```diff
@@ -36,7 +36,7 @@ class LLMReranker(BaseDocumentCompressor, BaseLLMReranker):
             return []
 
         # Stream reranking update.
-        dispatch_custom_event(…
+        dispatch_custom_event("rerank_begin", {"num_documents": len(documents)})
 
         try:
             # Prepare query-document pairs
@@ -73,10 +73,10 @@ class LLMReranker(BaseDocumentCompressor, BaseLLMReranker):
             return filtered_docs
 
         except Exception as e:
-            error_msg = …
-            log.…
+            error_msg = "Error during async document compression:"
+            log.exception(error_msg)
             if callbacks:
-                await callbacks.on_retriever_error(error_msg)
+                await callbacks.on_retriever_error(f"{error_msg} {e}")
             return documents  # Return original documents on error
 
     def compress_documents(
```
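Both fixed branches follow the same degrade-gracefully rule: `log.exception` records the full traceback, the callback gets the error message with the exception appended, and the caller still receives the original documents. A minimal sketch of that pattern, with a hypothetical `rerank` callable standing in for the real compressor:

```python
import logging
from typing import Callable, List

log = logging.getLogger(__name__)


def compress_documents_safely(rerank: Callable[[List[str]], List[str]], documents: List[str]) -> List[str]:
    # On any reranking failure, log the traceback but hand back the input
    # documents unchanged so retrieval still returns results.
    try:
        return rerank(documents)
    except Exception:
        log.exception("Error during document compression:")
        return documents


print(compress_documents_safely(sorted, ["b", "a"]))           # ['a', 'b']
print(compress_documents_safely(lambda d: 1 / 0, ["b", "a"]))  # ['b', 'a']
```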
mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py

```diff
@@ -1,10 +1,10 @@
 import re
-
-from pydantic import BaseModel, Field
-from typing import List, Any, Optional, Dict, Tuple, Union, Callable
-import collections
 import math
+import logging
+import collections
+from typing import List, Any, Optional, Dict, Tuple, Union, Callable
 
+from pydantic import BaseModel, Field
 from langchain.chains.llm import LLMChain
 from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
 from langchain_core.documents.base import Document
@@ -39,9 +39,7 @@ class MetadataFilter(BaseModel):
     """Represents an LLM generated metadata filter to apply to a PostgreSQL query."""
 
     attribute: str = Field(description="Database column to apply filter to")
-    comparator: str = Field(
-        description="PostgreSQL comparator to use to filter database column"
-    )
+    comparator: str = Field(description="PostgreSQL comparator to use to filter database column")
     value: Any = Field(description="Value to use to filter database column")
 
 
@@ -56,9 +54,7 @@ class AblativeMetadataFilter(MetadataFilter):
 class MetadataFilters(BaseModel):
     """List of LLM generated metadata filters to apply to a PostgreSQL query."""
 
-    filters: List[MetadataFilter] = Field(
-        description="List of PostgreSQL metadata filters to apply for user query"
-    )
+    filters: List[MetadataFilter] = Field(description="List of PostgreSQL metadata filters to apply for user query")
 
 
 class SQLRetriever(BaseRetriever):
@@ -142,25 +138,17 @@ class SQLRetriever(BaseRetriever):
         elif isinstance(schema, ColumnSchema):
             collection_key = "values"
         else:
-            raise Exception(
-                "schema must be either a DatabaseSchema, TableSchema, or ColumnSchema."
-            )
+            raise Exception("schema must be either a DatabaseSchema, TableSchema, or ColumnSchema.")
 
         if update is not None:
-            ordered = collections.OrderedDict(
-                sorted(update.items(), key=key, reverse=True)
-            )
+            ordered = collections.OrderedDict(sorted(update.items(), key=key, reverse=True))
         else:
-            ordered = collections.OrderedDict(
-                sorted(getattr(schema, collection_key).items(), key=key, reverse=True)
-            )
+            ordered = collections.OrderedDict(sorted(getattr(schema, collection_key).items(), key=key, reverse=True))
         schema = schema.model_copy(update={collection_key: ordered})
 
         return schema
 
-    def _sort_database_schema_by_key(
-        self, database_schema: DatabaseSchema, key: Callable
-    ) -> DatabaseSchema:
+    def _sort_database_schema_by_key(self, database_schema: DatabaseSchema, key: Callable) -> DatabaseSchema:
         """Re-build schema with OrderedDicts"""
         tables = {}
         # build new tables dict
@@ -169,17 +157,11 @@ class SQLRetriever(BaseRetriever):
             # build new column dict
             for column_key, column_schema in table_schema.columns.items():
                 # sort values directly and update column schema
-                columns[column_key] = self._sort_schema_by_key(
-                    schema=column_schema, key=key
-                )
+                columns[column_key] = self._sort_schema_by_key(schema=column_schema, key=key)
             # update table schema and sort
-            tables[table_key] = self._sort_schema_by_key(
-                schema=table_schema, key=key, update=columns
-            )
+            tables[table_key] = self._sort_schema_by_key(schema=table_schema, key=key, update=columns)
         # update table schema and sort
-        database_schema = self._sort_schema_by_key(
-            schema=database_schema, key=key, update=tables
-        )
+        database_schema = self._sort_schema_by_key(schema=database_schema, key=key, update=tables)
 
         return database_schema
 
@@ -191,15 +173,12 @@
         boolean_system_prompt: bool = True,
         format_instructions: Optional[str] = None,
     ) -> ChatPromptTemplate:
-
         if boolean_system_prompt is True:
             system_prompt = self.boolean_system_prompt
         else:
             system_prompt = self.generative_system_prompt
 
-        prepared_column_prompt = self._prepare_column_prompt(
-            column_schema=column_schema, table_schema=table_schema
-        )
+        prepared_column_prompt = self._prepare_column_prompt(column_schema=column_schema, table_schema=table_schema)
         column_schema_str = (
             prepared_column_prompt.messages[1]
             .format(
@@ -290,7 +269,6 @@ Below is a list of comparison operators for constructing filters for this value
         table_schema: TableSchema,
         boolean_system_prompt: bool = True,
     ) -> ChatPromptTemplate:
-
         if boolean_system_prompt is True:
             system_prompt = self.boolean_system_prompt
         else:
@@ -312,9 +290,7 @@
             [("system", system_prompt), ("user", self.column_prompt_template)]
         )
 
-        header_str = (
-            f"This schema describes a column in the {table_schema.table} table."
-        )
+        header_str = f"This schema describes a column in the {table_schema.table} table."
 
         value_str = """
 ## **Content**
@@ -388,26 +364,18 @@ Below is a description of the contents in this column in list format:
         )
 
     def _rank_schema(self, prompt: ChatPromptTemplate, query: str) -> float:
-        rank_chain = LLMChain(
-            llm=self.llm.bind(logprobs=True), prompt=prompt, return_final_only=False
-        )
+        rank_chain = LLMChain(llm=self.llm.bind(logprobs=True), prompt=prompt, return_final_only=False)
         output = rank_chain({"query": query})  # returns metadata
 
         # parse through metadata tokens until encountering either yes, or no.
         score = None  # a None score indicates the model output could not be parsed.
-        for content in output["full_generation"][0].message.response_metadata[
-            "logprobs"
-        ]["content"]:
+        for content in output["full_generation"][0].message.response_metadata["logprobs"]["content"]:
             # Convert answer to score using the model's confidence
             if content["token"].lower().strip() == "yes":
-                score = (
-                    1 + math.exp(content["logprob"])
-                ) / 2  # If yes, use the model's confidence
+                score = (1 + math.exp(content["logprob"])) / 2  # If yes, use the model's confidence
                 break
             elif content["token"].lower().strip() == "no":
-                score = (
-                    1 - math.exp(content["logprob"])
-                ) / 2  # If no, invert the confidence
+                score = (1 - math.exp(content["logprob"])) / 2  # If no, invert the confidence
                 break
 
         if score is None:
```
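The scoring arithmetic in `_rank_schema` maps the answer token's probability `p = exp(logprob)` into [0, 1]: a confident "yes" lands near 1.0, a confident "no" near 0.0, and a hesitant answer of either kind stays near 0.5. The same arithmetic as a standalone sketch:

```python
import math


def yes_no_score(token: str, logprob: float) -> float:
    # p is the model's probability for the token it actually emitted.
    p = math.exp(logprob)
    answer = token.lower().strip()
    if answer == "yes":
        return (1 + p) / 2  # upper half of [0, 1]
    if answer == "no":
        return (1 - p) / 2  # lower half of [0, 1]
    raise ValueError("unparseable answer token")


print(round(yes_no_score("yes", -0.01), 3))  # 0.995
print(round(yes_no_score("no", -0.01), 3))   # 0.005
print(round(yes_no_score("yes", -2.0), 3))   # 0.568 -- a low-confidence yes stays near 0.5
```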
mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py (continued)

```diff
@@ -465,9 +433,7 @@ Below is a description of the contents in this column in list format:
                     table_schema=table_schema,
                     boolean_system_prompt=True,
                 )
-                column_schema.relevance = self._rank_schema(
-                    prompt=prompt, query=query
-                )
+                column_schema.relevance = self._rank_schema(prompt=prompt, query=query)
 
                 columns[column_key] = column_schema
 
@@ -512,9 +478,7 @@ Below is a description of the contents in this column in list format:
                     table_schema=table_schema,
                     boolean_system_prompt=True,
                 )
-                value_schema.relevance = self._rank_schema(
-                    prompt=prompt, query=query
-                )
+                value_schema.relevance = self._rank_schema(prompt=prompt, query=query)
 
                 values[value_key] = value_schema
 
@@ -592,19 +556,13 @@ Below is a description of the contents in this column in list format:
         for table_key, table_schema in ordered_database_schema.tables.items():
             for column_key, column_schema in table_schema.columns.items():
                 for value_key, value_schema in column_schema.values.items():
-                    ablation_value_dict[(table_key, column_key, value_key)] = (
-                        value_schema.relevance
-                    )
+                    ablation_value_dict[(table_key, column_key, value_key)] = value_schema.relevance
 
-        ablation_value_dict = collections.OrderedDict(
-            sorted(ablation_value_dict.items(), key=lambda x: x[1])
-        )
+        ablation_value_dict = collections.OrderedDict(sorted(ablation_value_dict.items(), key=lambda x: x[1]))
 
         relevance_scores = list(ablation_value_dict.values())
         if len(relevance_scores) > 0:
-            ablation_quantiles = np.quantile(
-                relevance_scores, np.linspace(0, 1, self.num_retries + 2)[1:-1]
-            )
+            ablation_quantiles = np.quantile(relevance_scores, np.linspace(0, 1, self.num_retries + 2)[1:-1])
         else:
             ablation_quantiles = None
 
```
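The quantile line takes only the interior points of `np.linspace(0, 1, self.num_retries + 2)`, which yields exactly one relevance threshold per retry; how each threshold is then used to ablate filters is defined by the surrounding code. A worked example with assumed scores:

```python
import numpy as np

num_retries = 3
relevance_scores = [0.1, 0.2, 0.4, 0.6, 0.9]

# linspace(0, 1, 5) == [0.0, 0.25, 0.5, 0.75, 1.0]; slicing off the endpoints
# leaves the interior quantile levels [0.25, 0.5, 0.75] -- one per retry.
ablation_quantiles = np.quantile(relevance_scores, np.linspace(0, 1, num_retries + 2)[1:-1])
print(ablation_quantiles)  # [0.2 0.4 0.6]
```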
mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py (continued)

```diff
@@ -628,11 +586,7 @@ Below is a description of the contents in this column in list format:
         ablated_filters = []
         for filter in metadata_filters:
             for key in ablated_dict.keys():
-                if (
-                    filter.schema_table in key
-                    and filter.schema_column in key
-                    and filter.schema_value in key
-                ):
+                if filter.schema_table in key and filter.schema_column in key and filter.schema_value in key:
                     ablated_filters.append(filter)
 
         return ablated_filters
@@ -646,9 +600,7 @@ Below is a description of the contents in this column in list format:
         pass
 
     def _prepare_retrieval_query(self, query: str) -> str:
-        rewrite_prompt = PromptTemplate(
-            input_variables=["input"], template=self.rewrite_prompt_template
-        )
+        rewrite_prompt = PromptTemplate(input_variables=["input"], template=self.rewrite_prompt_template)
        rewrite_chain = LLMChain(llm=self.llm, prompt=rewrite_prompt)
         return rewrite_chain.predict(input=query)
 
@@ -668,9 +620,7 @@ Below is a description of the contents in this column in list format:
         # Add Table JOIN statements
         join_clauses = set()
         for metadata_filter in metadata_filters:
-            join_clause = ranked_database_schema.tables[
-                metadata_filter.schema_table
-            ].join
+            join_clause = ranked_database_schema.tables[metadata_filter.schema_table].join
             if join_clause in join_clauses:
                 continue
             else:
@@ -688,12 +638,12 @@ Below is a description of the contents in this column in list format:
             if i < len(metadata_filters) - 1:
                 base_query += " AND "
 
-        base_query += …
+        base_query += (
+            f" ORDER BY e.embeddings {self.distance_function.value[0]} '{{embeddings}}' LIMIT {self.search_kwargs.k};"
+        )
         return base_query
 
-    def _generate_filter(
-        self, prompt: ChatPromptTemplate, query: str
-    ) -> MetadataFilter:
+    def _generate_filter(self, prompt: ChatPromptTemplate, query: str) -> MetadataFilter:
         gen_filter_chain = LLMChain(llm=self.llm, prompt=prompt)
         output = gen_filter_chain({"query": query})
         return output
```
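The rebuilt `base_query` embeds a literal `{embeddings}` placeholder (the doubled braces escape it inside the f-string) so the vector itself can be substituted later via `checked_sql_query.format(embeddings=embeddings_str)`, as the query-execution helper further down does. A sketch with assumed values for `self.distance_function.value[0]` and `self.search_kwargs.k`, and a made-up table and filter:

```python
# Assumed stand-ins: "<=>" is pgvector's cosine-distance operator, k is the result limit.
distance_operator = "<=>"
k = 5

base_query = "SELECT e.id, e.content FROM embeddings e WHERE e.metadata->>'year' = '2024'"
base_query += f" ORDER BY e.embeddings {distance_operator} '{{embeddings}}' LIMIT {k};"

# The f-string already resolved {distance_operator} and {k}; '{embeddings}'
# survives as a plain placeholder and is filled in at execution time.
print(base_query.format(embeddings="[0.1, 0.2, 0.3]"))
```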
mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py (continued)

````diff
@@ -714,28 +664,22 @@ Below is a description of the contents in this column in list format:
         # must use generation if field is a dictionary of tuples or a list
         if type(value_schema.value) in [list, dict]:
             try:
-                metadata_prompt: ChatPromptTemplate = (
-                    …
-                    …
-                    …
-                    …
-                    …
-                    boolean_system_prompt=False,
-                )
+                metadata_prompt: ChatPromptTemplate = self._prepare_value_prompt(
+                    format_instructions=parser.get_format_instructions(),
+                    value_schema=value_schema,
+                    column_schema=column_schema,
+                    table_schema=table_schema,
+                    boolean_system_prompt=False,
                 )
 
-                metadata_filters_chain = LLMChain(
-                    llm=self.llm, prompt=metadata_prompt
-                )
+                metadata_filters_chain = LLMChain(llm=self.llm, prompt=metadata_prompt)
                 metadata_filter_output = metadata_filters_chain.predict(
                     query=query,
                 )
 
                 # If the LLM outputs raw JSON, use it as-is.
                 # If the LLM outputs anything including a json markdown section, use the last one.
-                json_markdown_output = re.findall(
-                    r"```json.*```", metadata_filter_output, re.DOTALL
-                )
+                json_markdown_output = re.findall(r"```json.*```", metadata_filter_output, re.DOTALL)
                 if json_markdown_output:
                     metadata_filter_output = json_markdown_output[-1]
                     # Clean the json tags.
@@ -754,11 +698,10 @@ Below is a description of the contents in this column in list format:
                 metadata_filter = AblativeMetadataFilter(**model_dump)
             except OutputParserException as e:
                 logger.warning(
-                    f"LLM failed to generate structured metadata filters: {…
-                    …
-                return HandlerResponse(
-                    RESPONSE_TYPE.ERROR, error_message=str(e)
+                    f"LLM failed to generate structured metadata filters: {e}",
+                    exc_info=logger.isEnabledFor(logging.DEBUG),
                 )
+                return HandlerResponse(RESPONSE_TYPE.ERROR, error_message=str(e))
             else:
                 metadata_filter = AblativeMetadataFilter(
                     attribute=column_schema.column,
@@ -779,24 +722,17 @@ Below is a description of the contents in this column in list format:
         embeddings_str: str,
     ) -> HandlerResponse:
         try:
-            checked_sql_query = self._prepare_pgvector_query(
-                ranked_database_schema, metadata_filters
-            )
-            checked_sql_query_with_embeddings = checked_sql_query.format(
-                embeddings=embeddings_str
-            )
-            return self.vector_store_handler.native_query(
-                checked_sql_query_with_embeddings
-            )
+            checked_sql_query = self._prepare_pgvector_query(ranked_database_schema, metadata_filters)
+            checked_sql_query_with_embeddings = checked_sql_query.format(embeddings=embeddings_str)
+            return self.vector_store_handler.native_query(checked_sql_query_with_embeddings)
         except Exception as e:
             logger.warning(
-                f"Failed to prepare and execute SQL query from structured metadata: {…
+                f"Failed to prepare and execute SQL query from structured metadata: {e}",
+                exc_info=logger.isEnabledFor(logging.DEBUG),
             )
             return HandlerResponse(RESPONSE_TYPE.ERROR, error_message=str(e))
 
-    def _get_relevant_documents(
-        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-    ) -> List[Document]:
+    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
         # Rewrite query to be suitable for retrieval.
         retrieval_query = self._prepare_retrieval_query(query)
 
@@ -804,14 +740,10 @@ Below is a description of the contents in this column in list format:
         embedded_query = self.embeddings_model.embed_query(retrieval_query)
 
         # Search for relevant filters
-        ranked_database_schema, ablation_value_dict, ablation_quantiles = (
-            self._breadth_first_search(query=query)
-        )
+        ranked_database_schema, ablation_value_dict, ablation_quantiles = self._breadth_first_search(query=query)
 
         # Generate metadata filters
-        metadata_filters = self._generate_metadata_filters(
-            query=query, ranked_database_schema=ranked_database_schema
-        )
+        metadata_filters = self._generate_metadata_filters(query=query, ranked_database_schema=ranked_database_schema)
 
         if type(metadata_filters) is list:
             # Initial Execution of the similarity search with metadata filters.
@@ -830,9 +762,7 @@ Below is a description of the contents in this column in list format:
                     break
                 elif document_response.resp_type == RESPONSE_TYPE.ERROR:
                     # LLMs won't always generate structured metadata so we should have a fallback after retrying.
-                    logger.info(
-                        f"SQL Retriever query failed with error {document_response.error_message}"
-                    )
+                    logger.info(f"SQL Retriever query failed with error {document_response.error_message}")
                 else:
                     logger.info(
                         f"SQL Retriever did not retrieve {self.min_k} documents: {len(document_response.data_frame)} documents retrieved."
@@ -867,17 +797,9 @@ Below is a description of the contents in this column in list format:
                 return retrieved_documents
 
             # If the SQL query constructed did not return any documents, fallback.
-            logger.info(
-                "No documents returned from SQL retriever, using fallback retriever."
-            )
-            return self.fallback_retriever._get_relevant_documents(
-                retrieval_query, run_manager=run_manager
-            )
+            logger.info("No documents returned from SQL retriever, using fallback retriever.")
+            return self.fallback_retriever._get_relevant_documents(retrieval_query, run_manager=run_manager)
         else:
             # If no metadata fields could be generated fallback.
-            logger.info(
-                "No metadata fields were successfully generated, using fallback retriever."
-            )
-            return self.fallback_retriever._get_relevant_documents(
-                retrieval_query, run_manager=run_manager
-            )
+            logger.info("No metadata fields were successfully generated, using fallback retriever.")
+            return self.fallback_retriever._get_relevant_documents(retrieval_query, run_manager=run_manager)
````
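Both tails of `_get_relevant_documents` now funnel into the same two-line fallback: when the structured SQL path produces nothing, or no metadata filters could be generated at all, the rewritten query goes to `fallback_retriever` instead. The control flow reduced to a sketch with hypothetical retriever callables:

```python
from typing import Callable, List


def retrieve_with_fallback(
    primary: Callable[[str], List[str]],
    fallback: Callable[[str], List[str]],
    query: str,
) -> List[str]:
    # Prefer the structured SQL path; hand the query to the plain vector
    # search only when the primary path comes back empty.
    documents = primary(query)
    if documents:
        return documents
    return fallback(query)


print(retrieve_with_fallback(lambda q: [], lambda q: ["doc from fallback"], "test"))  # ['doc from fallback']
```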