MindsDB 25.3.4.2__py3-none-any.whl → 25.4.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +21 -4
- mindsdb/api/executor/command_executor.py +62 -61
- mindsdb/api/executor/data_types/answer.py +9 -12
- mindsdb/api/executor/datahub/classes/response.py +11 -0
- mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +7 -9
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/steps.py +2 -1
- mindsdb/api/executor/sql_query/result_set.py +10 -7
- mindsdb/api/executor/sql_query/sql_query.py +36 -82
- mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
- mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
- mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
- mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
- mindsdb/api/http/namespaces/sql.py +4 -1
- mindsdb/api/mcp/__init__.py +0 -0
- mindsdb/api/mcp/start.py +152 -0
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
- mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +19 -5
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +9 -4
- mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
- mindsdb/integrations/libs/response.py +9 -4
- mindsdb/integrations/libs/vectordatabase_handler.py +37 -25
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +35 -15
- mindsdb/interfaces/database/log.py +8 -9
- mindsdb/interfaces/database/projects.py +16 -5
- mindsdb/interfaces/functions/controller.py +59 -17
- mindsdb/interfaces/functions/to_markdown.py +194 -0
- mindsdb/interfaces/jobs/jobs_controller.py +3 -3
- mindsdb/interfaces/knowledge_base/controller.py +143 -26
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
- mindsdb/interfaces/query_context/context_controller.py +3 -1
- mindsdb/utilities/config.py +8 -0
- mindsdb/utilities/starters.py +7 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/METADATA +233 -231
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/RECORD +53 -49
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.2.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py (+35 -15)

@@ -54,7 +54,7 @@ class LLMReranker(BaseDocumentCompressor):
             max_retries=2  # Client-level retries
         )
 
-    async def search_relevancy(self, query: str, document: str) -> Any:
+    async def search_relevancy(self, query: str, document: str, custom_event: bool = True) -> Any:
         await self._init_client()
 
         async with self._semaphore:
@@ -82,7 +82,8 @@ class LLMReranker(BaseDocumentCompressor):
                     }
 
                     # Stream reranking update.
-                    dispatch_custom_event("rerank", rerank_data)
+                    if custom_event:
+                        dispatch_custom_event("rerank", rerank_data)
                     return rerank_data
 
             except Exception as e:
@@ -93,7 +94,7 @@ class LLMReranker(BaseDocumentCompressor):
                 retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
                 await asyncio.sleep(retry_delay)
 
-    async def _rank(self, query_document_pairs: List[Tuple[str, str]]) -> List[Tuple[str, float]]:
+    async def _rank(self, query_document_pairs: List[Tuple[str, str]], custom_event: bool = True) -> List[Tuple[str, float]]:
        ranked_results = []
 
        # Process in larger batches for better throughput
@@ -102,7 +103,7 @@ class LLMReranker(BaseDocumentCompressor):
             batch = query_document_pairs[i:i + batch_size]
             try:
                 results = await asyncio.gather(
-                    *[self.search_relevancy(query=query, document=document) for (query, document) in batch],
+                    *[self.search_relevancy(query=query, document=document, custom_event=custom_event) for (query, document) in batch],
                     return_exceptions=True
                 )
 
@@ -127,17 +128,21 @@ class LLMReranker(BaseDocumentCompressor):
                         ranked_results.append((batch[idx][1], score))
 
                         # Check if we should stop early
-                        high_scoring_docs = [r for r in ranked_results if r[1] >= self.filtering_threshold]
-                        can_stop_early = (
-                            self.early_stop  # Early stopping is enabled
-                            and self.num_docs_to_keep  # We have a target number of docs
-                            and len(high_scoring_docs) >= self.num_docs_to_keep  # Found enough good docs
-                            and score >= self.early_stop_threshold  # Current doc is good enough
-                        )
-
-                        if can_stop_early:
-                            log.info(f"Early stopping after finding {self.num_docs_to_keep} documents with high confidence")
-                            return ranked_results
+                        try:
+                            high_scoring_docs = [r for r in ranked_results if r[1] >= self.filtering_threshold]
+                            can_stop_early = (
+                                self.early_stop  # Early stopping is enabled
+                                and self.num_docs_to_keep  # We have a target number of docs
+                                and len(high_scoring_docs) >= self.num_docs_to_keep  # Found enough good docs
+                                and score >= self.early_stop_threshold  # Current doc is good enough
+                            )
+
+                            if can_stop_early:
+                                log.info(f"Early stopping after finding {self.num_docs_to_keep} documents with high confidence")
+                                return ranked_results
+                        except Exception as e:
+                            # Don't let early stopping errors stop the whole process
+                            log.warning(f"Error in early stopping check: {str(e)}")
 
             except Exception as e:
                 log.error(f"Batch processing error: {str(e)}")
@@ -222,3 +227,18 @@ class LLMReranker(BaseDocumentCompressor):
             "temperature": self.temperature,
             "remove_irrelevant": self.remove_irrelevant,
         }
+
+    def get_scores(self, query: str, documents: list[str], custom_event: bool = False):
+        query_document_pairs = [(query, doc) for doc in documents]
+        # Create event loop and run async code
+        import asyncio
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            # If no running loop exists, create a new one
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        documents_and_scores = loop.run_until_complete(self._rank(query_document_pairs, custom_event=custom_event))
+        scores = [score for _, score in documents_and_scores]
+        return scores
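
Note: the new get_scores method wraps the async _rank pipeline for synchronous callers. A minimal usage sketch, assuming an LLMReranker constructed with default settings (in a real deployment the constructor arguments come from your model/provider configuration):

    from mindsdb.integrations.utilities.rag.rerankers.reranker_compressor import LLMReranker

    # Assumed: default model/provider settings are resolvable in this environment.
    reranker = LLMReranker()

    # One relevance score per document, in input order.
    scores = reranker.get_scores(
        query="What is MindsDB?",
        documents=["MindsDB is an AI data platform.", "A recipe for pancakes."],
    )

Because get_scores drives the loop itself with run_until_complete, it is intended for plain synchronous call sites; calling it from inside an already-running asyncio event loop would raise a RuntimeError.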
mindsdb/interfaces/database/log.py (+8 -9)

@@ -1,21 +1,21 @@
+from typing import List
 from copy import deepcopy
 from abc import ABC, abstractmethod
-from typing import List, Union, Tuple
 from collections import OrderedDict
 
 import pandas as pd
-
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import Select, Identifier, Star, BinaryOperation, Constant, Join, Function
 from mindsdb_sql_parser.utils import JoinType
+
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.utilities.query_traversal import query_traversal
-
 from mindsdb.utilities.functions import resolve_table_identifier
 from mindsdb.api.executor.utilities.sql import get_query_tables
 from mindsdb.utilities.exception import EntityNotExistsError
 import mindsdb.interfaces.storage.db as db
 from mindsdb.utilities.context import context as ctx
+from mindsdb.api.executor.datahub.classes.response import DataHubResponse
 from mindsdb.api.executor.datahub.classes.tables_row import (
     TABLES_ROW_TYPE,
     TablesRow,
@@ -223,8 +223,7 @@ class LogDBController:
             for table_name in self._tables.keys()
         ]
 
-    def query(self, query: Select = None, native_query: str = None,
-              session=None, return_as: str = 'split') -> Union[pd.DataFrame, Tuple[pd.DataFrame, list]]:
+    def query(self, query: Select = None, native_query: str = None, session=None) -> DataHubResponse:
         if native_query is not None:
             if query is not None:
                 raise Exception("'query' and 'native_query' arguments can not be used together")
@@ -286,12 +285,12 @@ class LogDBController:
             df[df_column_name] = df[df_column_name].astype(column_type)
         # endregion
 
-        if return_as != 'split':
-            return df
-
         columns_info = [{
             'name': k,
             'type': v
         } for k, v in df.dtypes.items()]
 
-        return df, columns_info
+        return DataHubResponse(
+            data_frame=df,
+            columns=columns_info
+        )
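
Note: LogDBController.query previously returned either a DataFrame or a (DataFrame, columns) tuple depending on return_as; it now always returns a DataHubResponse (see the new mindsdb/api/executor/datahub/classes/response.py in the file list). A sketch of consuming the new shape, with illustrative data:

    import pandas as pd
    from mindsdb.api.executor.datahub.classes.response import DataHubResponse

    # Illustrative values; mirrors what LogDBController.query() now returns.
    response = DataHubResponse(
        data_frame=pd.DataFrame({'name': ['job1']}),
        columns=[{'name': 'name', 'type': 'object'}],
    )

    names = [col['name'] for col in response.columns]
    records = response.data_frame[names].to_dict(orient='records')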
mindsdb/interfaces/database/projects.py (+16 -5)

@@ -137,14 +137,10 @@ class Project:
                 view_meta['query_ast'],
                 session=session
             )
-
-
+            df = sqlquery.fetched_data.to_df()
         finally:
             query_context_controller.release_context('view', view_meta['id'])
 
-        if result['success'] is False:
-            raise Exception(f"Cant execute view query: {view_meta['query_ast']}")
-        df = result['result']
         # remove duplicated columns
         df = df.loc[:, ~df.columns.duplicated()]
 
@@ -296,6 +292,19 @@ class Project:
         ]
         return data
 
+    def get_knowledge_bases(self):
+        from mindsdb.api.executor.controllers.session_controller import SessionController
+        session = SessionController()
+
+        return {
+            kb['name']: {
+                'type': 'knowledge_base',
+                'id': kb['id'],
+                'deletable': True
+            }
+            for kb in session.kb_controller.list(self.name)
+        }
+
     def get_views(self):
         records = (
             db.session.query(db.View).filter_by(
@@ -353,6 +362,8 @@ class Project:
         for agent in agents:
             data[agent['name']] = agent['metadata']
 
+        data.update(self.get_knowledge_bases())
+
         return data
 
     def get_columns(self, table_name: str):
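
Note: with the data.update(self.get_knowledge_bases()) call above, knowledge bases now appear in the project's object listing alongside models, views and agents. An illustrative entry (the id value is hypothetical):

    # Shape of one entry returned by Project.get_knowledge_bases():
    {'my_kb': {'type': 'knowledge_base', 'id': 42, 'deletable': True}}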
mindsdb/interfaces/functions/controller.py (+59 -17)

@@ -1,6 +1,7 @@
 import os
 
 from duckdb.typing import BIGINT, DOUBLE, VARCHAR, BLOB, BOOLEAN
+from mindsdb.interfaces.functions.to_markdown import ToMarkdown
 from mindsdb.interfaces.storage.model_fs import HandlerStorage
 
 
@@ -121,32 +122,20 @@ class FunctionController(BYOMFunctionsController):
         if meta is not None:
             return meta
 
-        # builtin
+        # builtin functions
         if node.op.lower() == 'llm':
             return self.llm_call_function(node)
 
+        elif node.op.lower() == 'to_markdown':
+            return self.to_markdown_call_function(node)
+
     def llm_call_function(self, node):
         name = node.op.lower()
 
         if name in self.callbacks:
             return self.callbacks[name]
 
-        param_prefix = 'LLM_FUNCTION_'
-        chat_model_params = {}
-        for k, v in os.environ.items():
-            if k.startswith(param_prefix):
-                param_name = k[len(param_prefix):]
-                if param_name == 'MODEL':
-                    chat_model_params['model_name'] = v
-                else:
-                    chat_model_params[param_name.lower()] = v
-
-        if 'provider' not in chat_model_params:
-            chat_model_params['provider'] = 'openai'
-
-        if 'api_key' in chat_model_params:
-            # move to api_keys dict
-            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+        chat_model_params = self._parse_chat_model_params()
 
         try:
             from langchain_core.messages import HumanMessage
@@ -168,6 +157,59 @@ class FunctionController(BYOMFunctionsController):
         self.callbacks[name] = meta
         return meta
 
+    def to_markdown_call_function(self, node):
+        name = node.op.lower()
+
+        if name in self.callbacks:
+            return self.callbacks[name]
+
+        def callback(file_path_or_url, use_llm):
+            chat_model_params = self._parse_chat_model_params()
+
+            llm_client = None
+            llm_model = None
+            try:
+                from mindsdb.interfaces.agents.langchain_agent import create_chat_model
+                llm = create_chat_model(chat_model_params)
+                llm_client = llm.root_client
+                llm_model = llm.model_name
+            except Exception:
+                pass
+
+            to_markdown = ToMarkdown(use_llm, llm_client, llm_model)
+            return to_markdown.call(file_path_or_url)
+
+        meta = {
+            'name': name,
+            'callback': callback,
+            'input_types': ['str', 'bool'],
+            'output_type': 'str'
+        }
+        self.callbacks[name] = meta
+        return meta
+
+    def _parse_chat_model_params(self, param_prefix: str = 'LLM_FUNCTION_'):
+        """
+        Parses the environment variables for chat model parameters.
+        """
+        chat_model_params = {}
+        for k, v in os.environ.items():
+            if k.startswith(param_prefix):
+                param_name = k[len(param_prefix):]
+                if param_name == 'MODEL':
+                    chat_model_params['model_name'] = v
+                else:
+                    chat_model_params[param_name.lower()] = v
+
+        if 'provider' not in chat_model_params:
+            chat_model_params['provider'] = 'openai'
+
+        if 'api_key' in chat_model_params:
+            # move to api_keys dict
+            chat_model_params["api_keys"] = {chat_model_params['provider']: chat_model_params['api_key']}
+
+        return chat_model_params
+
 
 class DuckDBFunctions:
     def __init__(self, controller):
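
Note: _parse_chat_model_params factors out the LLM_FUNCTION_* environment-variable convention so it is shared by the llm() builtin and the new to_markdown() builtin. A sketch of the mapping (the values shown are placeholders):

    import os

    # Any LLM_FUNCTION_<NAME> variable becomes a lower-cased parameter.
    os.environ['LLM_FUNCTION_MODEL'] = 'gpt-4o'      # stored as model_name
    os.environ['LLM_FUNCTION_PROVIDER'] = 'openai'   # also the default when omitted
    os.environ['LLM_FUNCTION_API_KEY'] = '...'       # additionally copied into api_keys[provider]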
mindsdb/interfaces/functions/to_markdown.py (+194 -0, new file)

@@ -0,0 +1,194 @@
+import base64
+from io import BytesIO
+import os
+from typing import Union
+from urllib.parse import urlparse
+
+import fitz  # PyMuPDF
+from markitdown import MarkItDown
+import mimetypes
+from openai import OpenAI
+import requests
+
+
+class ToMarkdown:
+    """
+    Extracts the content of documents of various formats in markdown format.
+    """
+    def __init__(self, use_llm: bool, llm_client: OpenAI = None, llm_model: str = None):
+        """
+        Initializes the ToMarkdown class.
+        """
+        # If use_llm is True, llm_client and llm_model must be provided.
+        if use_llm and (llm_client is None or llm_model is None):
+            raise ValueError('LLM client and model must be provided when use_llm is True.')
+
+        # If use_llm is False, set llm_client and llm_model to None even if they are provided.
+        if not use_llm:
+            llm_client = None
+            llm_model = None
+
+        # Only OpenAI is supported for now.
+        # TODO: Add support for other LLMs.
+        if llm_client is not None and not isinstance(llm_client, OpenAI):
+            raise ValueError('Only OpenAI models are supported at the moment.')
+
+        self.use_llm = use_llm
+        self.llm_client = llm_client
+        self.llm_model = llm_model
+
+    def call(self, file_path_or_url: str) -> str:
+        """
+        Converts a file to markdown.
+        """
+        file_extension = self._get_file_extension(file_path_or_url)
+        file = self._get_file_content(file_path_or_url)
+
+        if file_extension == '.pdf':
+            return self._pdf_to_markdown(file)
+        elif file_extension in ['.jpg', '.jpeg', '.png', '.gif']:
+            return self._image_to_markdown(file)
+        else:
+            return self._other_to_markdown(file)
+
+    def _get_file_content(self, file_path_or_url: str) -> str:
+        """
+        Retrieves the content of a file.
+        """
+        parsed_url = urlparse(file_path_or_url)
+        if parsed_url.scheme in ('http', 'https'):
+            response = requests.get(file_path_or_url)
+            if response.status_code == 200:
+                return response
+            else:
+                raise RuntimeError(f'Unable to retrieve file from URL: {file_path_or_url}')
+        else:
+            with open(file_path_or_url, 'rb') as file:
+                return BytesIO(file.read())
+
+    def _get_file_extension(self, file_path_or_url: str) -> str:
+        """
+        Retrieves the file extension from a file path or URL.
+        """
+        parsed_url = urlparse(file_path_or_url)
+        if parsed_url.scheme in ('http', 'https'):
+            try:
+                # Make a HEAD request to get headers without downloading the file.
+                response = requests.head(file_path_or_url, allow_redirects=True)
+                content_type = response.headers.get('Content-Type', '')
+                if content_type:
+                    ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
+                    if ext:
+                        return ext
+
+                # Fallback to extracting extension from the URL path
+                ext = os.path.splitext(parsed_url.path)[1]
+                if ext:
+                    return ext
+            except requests.RequestException:
+                raise RuntimeError(f'Unable to retrieve file extension from URL: {file_path_or_url}')
+        else:
+            return os.path.splitext(file_path_or_url)[1]
+
+    def _pdf_to_markdown(self, file_content: Union[requests.Response, bytes]) -> str:
+        """
+        Converts a PDF file to markdown.
+        """
+        if self.llm_client is None:
+            return self._pdf_to_markdown_no_llm(file_content)
+        else:
+            return self._pdf_to_markdown_llm(file_content)
+
+    def _pdf_to_markdown_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts a PDF file to markdown using LLM.
+        The LLM is used mainly for the purpose of generating descriptions of any images in the PDF.
+        """
+        if isinstance(file_content, requests.Response):
+            file_content = BytesIO(file_content.content)
+
+        document = fitz.open(stream=file_content, filetype="pdf")
+
+        markdown_content = []
+        for page_num in range(len(document)):
+            page = document.load_page(page_num)
+
+            # Get text blocks with coordinates.
+            page_content = []
+            blocks = page.get_text("blocks")
+            for block in blocks:
+                x0, y0, x1, y1, text, _, _ = block
+                if text.strip():  # Skip empty or whitespace blocks.
+                    page_content.append((y0, text.strip()))
+
+            # Extract images from the page.
+            image_list = page.get_images(full=True)
+            for img_index, img in enumerate(image_list):
+                xref = img[0]
+                base_image = document.extract_image(xref)
+                image_bytes = base_image["image"]
+
+                # Use actual image y-coordinate if available.
+                y0 = float(base_image.get("y", 0))
+                image_description = self._generate_image_description(image_bytes)
+                page_content.append((y0, f""))
+
+            # Sort the content by y0 coordinate
+            page_content.sort(key=lambda x: x[0])
+
+            # Add sorted content to the markdown
+            for _, text in page_content:
+                markdown_content.append(text)
+            markdown_content.append("\n")
+
+        document.close()
+
+        return "\n".join(markdown_content)
+
+    def _generate_image_description(self, image_bytes: bytes) -> str:
+        """
+        Generates a description of the image using LLM.
+        """
+        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+        response = self.llm_client.chat.completions.create(
+            model=self.llm_model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Describe this image"},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    ],
+                }
+            ],
+        )
+        description = response.choices[0].message.content
+        return description
+
+    def _pdf_to_markdown_no_llm(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts a PDF file to markdown without using LLM.
+        """
+        md = MarkItDown(enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
+
+    def _image_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts images to markdown.
+        """
+        if not self.use_llm or self.llm_client is None:
+            raise ValueError('LLM client must be enabled to convert images to markdown.')
+
+        md = MarkItDown(llm_client=self.llm_client, llm_model=self.llm_model, enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
+
+    def _other_to_markdown(self, file_content: Union[requests.Response, BytesIO]) -> str:
+        """
+        Converts other file formats to markdown.
+        """
+        md = MarkItDown(enable_plugins=True)
+        result = md.convert(file_content)
+        return result.markdown
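
Note: a minimal usage sketch of the new ToMarkdown helper (the file path is hypothetical). With use_llm=False the PDF path delegates to MarkItDown and no image descriptions are generated:

    from mindsdb.interfaces.functions.to_markdown import ToMarkdown

    converter = ToMarkdown(use_llm=False)
    markdown = converter.call('/tmp/report.pdf')  # local path or http(s) URL

The same code is exposed through the to_markdown() SQL function registered in controller.py above, whose callback takes (file_path_or_url, use_llm).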
mindsdb/interfaces/jobs/jobs_controller.py (+3 -3)

@@ -337,10 +337,10 @@ class JobsController:
                 BinaryOperation(op='=', args=[Identifier('project'), Constant(project_name)])
             ])
         )
-        df, columns = logs_db_controller.query(query)
+        response = logs_db_controller.query(query)
 
-        names = [i['name'] for i in columns]
-        return df[names].to_dict(orient='records')
+        names = [i['name'] for i in response.columns]
+        return response.data_frame[names].to_dict(orient='records')
 
 
 class JobsExecutor:
|