MindsDB 25.5.4.2__py3-none-any.whl → 25.6.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/agent.py +50 -26
- mindsdb/api/a2a/common/server/server.py +32 -26
- mindsdb/api/a2a/task_manager.py +68 -6
- mindsdb/api/executor/command_executor.py +69 -14
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +91 -84
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
- mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
- mindsdb/api/executor/planner/plan_join.py +67 -77
- mindsdb/api/executor/planner/query_planner.py +176 -155
- mindsdb/api/executor/planner/steps.py +37 -12
- mindsdb/api/executor/sql_query/result_set.py +45 -64
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
- mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
- mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
- mindsdb/api/executor/utilities/sql.py +42 -48
- mindsdb/api/http/namespaces/config.py +1 -1
- mindsdb/api/http/namespaces/file.py +14 -23
- mindsdb/api/http/namespaces/knowledge_bases.py +132 -154
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +219 -28
- mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +29 -33
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +277 -356
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +145 -40
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +352 -83
- mindsdb/integrations/libs/api_handler.py +279 -57
- mindsdb/integrations/libs/base.py +185 -30
- mindsdb/integrations/utilities/files/file_reader.py +99 -73
- mindsdb/integrations/utilities/handler_utils.py +23 -8
- mindsdb/integrations/utilities/sql_utils.py +35 -40
- mindsdb/interfaces/agents/agents_controller.py +226 -196
- mindsdb/interfaces/agents/constants.py +8 -1
- mindsdb/interfaces/agents/langchain_agent.py +42 -11
- mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
- mindsdb/interfaces/agents/mindsdb_database_agent.py +23 -18
- mindsdb/interfaces/data_catalog/__init__.py +0 -0
- mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +375 -0
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +38 -0
- mindsdb/interfaces/database/database.py +81 -57
- mindsdb/interfaces/database/integrations.py +222 -234
- mindsdb/interfaces/database/log.py +72 -104
- mindsdb/interfaces/database/projects.py +156 -193
- mindsdb/interfaces/file/file_controller.py +21 -65
- mindsdb/interfaces/knowledge_base/controller.py +66 -25
- mindsdb/interfaces/knowledge_base/evaluate.py +516 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
- mindsdb/interfaces/skills/skills_controller.py +31 -36
- mindsdb/interfaces/skills/sql_agent.py +113 -86
- mindsdb/interfaces/storage/db.py +242 -82
- mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
- mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
- mindsdb/utilities/config.py +13 -2
- mindsdb/utilities/log.py +35 -26
- mindsdb/utilities/ml_task_queue/task.py +19 -22
- mindsdb/utilities/render/sqlalchemy_render.py +129 -181
- mindsdb/utilities/starters.py +40 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/METADATA +257 -257
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/RECORD +76 -68
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/libs/base.py

@@ -1,4 +1,5 @@
 import ast
+import concurrent.futures
 import inspect
 import textwrap
 from _ast import AnnAssign, AugAssign
@@ -8,20 +9,20 @@ import pandas as pd
 from mindsdb_sql_parser.ast.base import ASTNode
 from mindsdb.utilities import log
 
-from mindsdb.integrations.libs.response import HandlerResponse, HandlerStatusResponse
+from mindsdb.integrations.libs.response import HandlerResponse, HandlerStatusResponse, RESPONSE_TYPE
 
 logger = log.getLogger(__name__)
 
 
 class BaseHandler:
-    """
+    """Base class for database handlers
 
     Base class for handlers that associate a source of information with the
     broader MindsDB ecosystem via SQL commands.
     """
 
     def __init__(self, name: str):
-        """
+        """constructor
         Args:
             name (str): the handler name
         """
@@ -29,7 +30,7 @@ class BaseHandler:
         self.name = name
 
     def connect(self):
-        """
+        """Set up any connections required by the handler
 
         Should return connection
 
@@ -37,7 +38,7 @@
         raise NotImplementedError()
 
     def disconnect(self):
-        """
+        """Close any existing connections
 
         Should switch self.is_connected.
         """
@@ -45,7 +46,7 @@
         return
 
     def check_connection(self) -> HandlerStatusResponse:
-        """
+        """Check connection to the handler
 
         Returns:
             HandlerStatusResponse
@@ -77,7 +78,7 @@
         raise NotImplementedError()
 
     def get_tables(self) -> HandlerResponse:
-        """
+        """Return list of entities
 
         Return list of entities that will be accesible as tables.
 
@@ -89,7 +90,7 @@
         raise NotImplementedError()
 
     def get_columns(self, table_name: str) -> HandlerResponse:
-        """
+        """Returns a list of entity columns
 
         Args:
             table_name (str): name of one of tables returned by self.get_tables()
@@ -113,6 +114,174 @@ class DatabaseHandler(BaseHandler):
         super().__init__(name)
 
 
+class MetaDatabaseHandler(DatabaseHandler):
+    """
+    Base class for handlers associated to data storage systems (e.g. databases, data warehouses, streaming services, etc.)
+
+    This class is used when the handler is also needed to store information in the data catalog.
+    This information is typically avaiable in the information schema or system tables of the database.
+    """
+
+    def __init__(self, name: str):
+        super().__init__(name)
+
+    def meta_get_tables(self, table_names: Optional[List[str]]) -> HandlerResponse:
+        """
+        Returns metadata information about the tables to be stored in the data catalog.
+
+        Returns:
+            HandlerResponse: The response should consist of the following columns:
+                - TABLE_NAME (str): Name of the table.
+                - TABLE_TYPE (str): Type of the table, e.g. 'BASE TABLE', 'VIEW', etc. (optional).
+                - TABLE_SCHEMA (str): Schema of the table (optional).
+                - TABLE_DESCRIPTION (str): Description of the table (optional).
+                - ROW_COUNT (int): Estimated number of rows in the table (optional).
+        """
+        raise NotImplementedError()
+
+    def meta_get_columns(self, table_names: Optional[List[str]]) -> HandlerResponse:
+        """
+        Returns metadata information about the columns in the tables to be stored in the data catalog.
+
+        Returns:
+            HandlerResponse: The response should consist of the following columns:
+                - TABLE_NAME (str): Name of the table.
+                - COLUMN_NAME (str): Name of the column.
+                - DATA_TYPE (str): Data type of the column, e.g. 'VARCHAR', 'INT', etc.
+                - COLUMN_DESCRIPTION (str): Description of the column (optional).
+                - IS_NULLABLE (bool): Whether the column can contain NULL values (optional).
+                - COLUMN_DEFAULT (str): Default value of the column (optional).
+        """
+        raise NotImplementedError()
+
+    def meta_get_column_statistics(self, table_names: Optional[List[str]]) -> HandlerResponse:
+        """
+        Returns metadata statisical information about the columns in the tables to be stored in the data catalog.
+        Either this method should be overridden in the handler or `meta_get_column_statistics_for_table` should be implemented.
+
+        Returns:
+            HandlerResponse: The response should consist of the following columns:
+                - TABLE_NAME (str): Name of the table.
+                - COLUMN_NAME (str): Name of the column.
+                - MOST_COMMON_VALUES (List[str]): Most common values in the column (optional).
+                - MOST_COMMON_FREQUENCIES (List[str]): Frequencies of the most common values in the column (optional).
+                - NULL_PERCENTAGE: Percentage of NULL values in the column (optional).
+                - MINIMUM_VALUE (str): Minimum value in the column (optional).
+                - MAXIMUM_VALUE (str): Maximum value in the column (optional).
+                - DISTINCT_VALUES_COUNT (int): Count of distinct values in the column (optional).
+        """
+        method = getattr(self, "meta_get_column_statistics_for_table")
+        if method.__func__ is not MetaDatabaseHandler.meta_get_column_statistics_for_table:
+            meta_columns = self.meta_get_columns(table_names)
+            grouped_columns = (
+                meta_columns.data_frame.groupby("table_name")
+                .agg(
+                    {
+                        "column_name": list,
+                    }
+                )
+                .reset_index()
+            )
+
+            executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
+            futures = []
+
+            results = []
+            with executor:
+                for _, row in grouped_columns.iterrows():
+                    table_name = row["table_name"]
+                    columns = row["column_name"]
+                    futures.append(executor.submit(self.meta_get_column_statistics_for_table, table_name, columns))
+
+                for future in concurrent.futures.as_completed(futures):
+                    try:
+                        result = future.result(timeout=120)
+                        if result.resp_type == RESPONSE_TYPE.TABLE:
+                            results.append(result.data_frame)
+                        else:
+                            logger.error(
+                                f"Error retrieving column statistics for table {table_name}: {result.error_message}"
+                            )
+                    except Exception as e:
+                        logger.error(f"Exception occurred while retrieving column statistics for table {table_name}: {e}")
+
+            if not results:
+                logger.warning("No column statistics could be retrieved for the specified tables.")
+                return HandlerResponse(RESPONSE_TYPE.ERROR, error_message="No column statistics could be retrieved.")
+            return HandlerResponse(
+                RESPONSE_TYPE.TABLE, pd.concat(results, ignore_index=True) if results else pd.DataFrame()
+            )
+
+        else:
+            raise NotImplementedError()
+
+    def meta_get_column_statistics_for_table(
+        self, table_name: str, column_names: Optional[List[str]] = None
+    ) -> HandlerResponse:
+        """
+        Returns metadata statistical information about the columns in a specific table to be stored in the data catalog.
+        Either this method should be implemented in the handler or `meta_get_column_statistics` should be overridden.
+
+        Args:
+            table_name (str): Name of the table.
+            column_names (Optional[List[str]]): List of column names to retrieve statistics for. If None, statistics for all columns will be returned.
+
+        Returns:
+            HandlerResponse: The response should consist of the following columns:
+                - TABLE_NAME (str): Name of the table.
+                - COLUMN_NAME (str): Name of the column.
+                - MOST_COMMON_VALUES (List[str]): Most common values in the column (optional).
+                - MOST_COMMON_FREQUENCIES (List[str]): Frequencies of the most common values in the column (optional).
+                - NULL_PERCENTAGE: Percentage of NULL values in the column (optional).
+                - MINIMUM_VALUE (str): Minimum value in the column (optional).
+                - MAXIMUM_VALUE (str): Maximum value in the column (optional).
+                - DISTINCT_VALUES_COUNT (int): Count of distinct values in the column (optional).
+        """
+        pass
+
+    def meta_get_primary_keys(self, table_names: Optional[List[str]]) -> HandlerResponse:
+        """
+        Returns metadata information about the primary keys in the tables to be stored in the data catalog.
+
+        Returns:
+            HandlerResponse: The response should consist of the following columns:
+                - TABLE_NAME (str): Name of the table.
+                - COLUMN_NAME (str): Name of the column that is part of the primary key.
+                - ORDINAL_POSITION (int): Position of the column in the primary key (optional).
+                - CONSTRAINT_NAME (str): Name of the primary key constraint (optional).
+        """
+        raise NotImplementedError()
+
+    def meta_get_foreign_keys(self, table_names: Optional[List[str]]) -> HandlerResponse:
+        """
+        Returns metadata information about the foreign keys in the tables to be stored in the data catalog.
+
+        Returns:
+            HandlerResponse: The response should consist of the following columns:
+                - PARENT_TABLE_NAME (str): Name of the parent table.
+                - PARENT_COLUMN_NAME (str): Name of the parent column that is part of the foreign key.
+                - CHILD_TABLE_NAME (str): Name of the child table.
+                - CHILD_COLUMN_NAME (str): Name of the child column that is part of the foreign key.
+                - CONSTRAINT_NAME (str): Name of the foreign key constraint (optional).
+        """
+        raise NotImplementedError()
+
+    def meta_get_handler_info(self, **kwargs) -> str:
+        """
+        Retrieves information about the design and implementation of the database handler.
+        This should include, but not be limited to, the following:
+            - The type of SQL queries and operations that the handler supports.
+            - etc.
+
+        Args:
+            kwargs: Additional keyword arguments that may be used in generating the handler information.
+
+        Returns:
+            str: A string containing information about the database handler's design and implementation.
+        """
+        pass
+
+
 class ArgProbeMixin:
     """
     A mixin class that provides probing of arguments that
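The new MetaDatabaseHandler contract is easiest to see from a concrete subclass. Below is a minimal sketch, assuming only the column contract documented in the docstrings above; the handler name and its data are hypothetical, not part of this release. Note that the inherited meta_get_column_statistics() groups the frame returned by meta_get_columns() on lowercase table_name/column_name labels before fanning out to meta_get_column_statistics_for_table() on a five-worker thread pool.

    import pandas as pd

    from mindsdb.integrations.libs.base import MetaDatabaseHandler
    from mindsdb.integrations.libs.response import HandlerResponse, RESPONSE_TYPE


    class ExampleMetaHandler(MetaDatabaseHandler):
        """Hypothetical handler publishing one static table to the data catalog."""

        def meta_get_tables(self, table_names=None) -> HandlerResponse:
            df = pd.DataFrame([{"TABLE_NAME": "orders", "TABLE_TYPE": "BASE TABLE", "ROW_COUNT": 1000}])
            return HandlerResponse(RESPONSE_TYPE.TABLE, df)

        def meta_get_columns(self, table_names=None) -> HandlerResponse:
            # Lowercase table_name/column_name here: the inherited
            # meta_get_column_statistics() groups on these exact labels.
            df = pd.DataFrame([
                {"table_name": "orders", "column_name": "id", "data_type": "INT"},
                {"table_name": "orders", "column_name": "status", "data_type": "VARCHAR"},
            ])
            return HandlerResponse(RESPONSE_TYPE.TABLE, df)

        def meta_get_column_statistics_for_table(self, table_name, column_names=None) -> HandlerResponse:
            # One call per table, submitted to the thread pool by the base class.
            rows = [{"TABLE_NAME": table_name, "COLUMN_NAME": col, "NULL_PERCENTAGE": 0.0}
                    for col in (column_names or [])]
            return HandlerResponse(RESPONSE_TYPE.TABLE, pd.DataFrame(rows))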
@@ -154,26 +323,16 @@ class ArgProbeMixin:
         self.visit(node.value)
 
     def visit_Subscript(self, node):
-        if (
-            isinstance(node.value, ast.Name)
-            and node.value.id in self.var_names_to_track
-        ):
-            if isinstance(node.slice, ast.Index) and isinstance(
-                node.slice.value, ast.Str
-            ):
+        if isinstance(node.value, ast.Name) and node.value.id in self.var_names_to_track:
+            if isinstance(node.slice, ast.Index) and isinstance(node.slice.value, ast.Str):
                 self.arg_keys.append({"name": node.slice.value.s, "required": True})
         self.generic_visit(node)
 
     def visit_Call(self, node):
         if isinstance(node.func, ast.Attribute) and node.func.attr == "get":
-            if (
-                isinstance(node.func.value, ast.Name)
-                and node.func.value.id in self.var_names_to_track
-            ):
+            if isinstance(node.func.value, ast.Name) and node.func.value.id in self.var_names_to_track:
                 if isinstance(node.args[0], ast.Str):
-                    self.arg_keys.append(
-                        {"name": node.args[0].s, "required": False}
-                    )
+                    self.arg_keys.append({"name": node.args[0].s, "required": False})
         self.generic_visit(node)
 
     @classmethod
@@ -197,9 +356,7 @@ class ArgProbeMixin:
         try:
             source_code = self.get_source_code(method_name)
         except Exception as e:
-            logger.error(
-                f"Failed to get source code of method {method_name} in {self.__class__.__name__}. Reason: {e}"
-            )
+            logger.error(f"Failed to get source code of method {method_name} in {self.__class__.__name__}. Reason: {e}")
             return []
 
         # parse the source code
@@ -238,9 +395,7 @@ class ArgProbeMixin:
         """
         method = getattr(self, method_name)
         if method is None:
-            raise Exception(
-                f"Method {method_name} does not exist in {self.__class__.__name__}"
-            )
+            raise Exception(f"Method {method_name} does not exist in {self.__class__.__name__}")
         source_code = inspect.getsource(method)
         return source_code
 
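The ArgProbeMixin hunks above are formatting-only, but the probing they touch is worth one illustration: the mixin walks a method's AST and records args["key"] subscripts as required parameters and args.get("key") calls as optional ones, for variables named in var_names_to_track. A hedged sketch of what would be recorded for a hypothetical engine method, assuming args is among the tracked names:

    from mindsdb.integrations.libs.base import BaseMLEngine


    class MyEngine(BaseMLEngine):
        def create(self, target, args=None, **kwargs):
            model = args["model_name"]      # probed as {"name": "model_name", "required": True}
            timeout = args.get("timeout")   # probed as {"name": "timeout", "required": False}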
@@ -288,8 +443,8 @@ class BaseMLEngine(ArgProbeMixin):
         self.engine_storage = engine_storage
         self.generative = False  # if True, the target column name does not have to be specified at creation time
 
-        if kwargs.get('base_model_storage'):
-            self.base_model_storage = kwargs['base_model_storage']  # available when updating a model
+        if kwargs.get("base_model_storage"):
+            self.base_model_storage = kwargs["base_model_storage"]  # available when updating a model
         else:
             self.base_model_storage = None
 
mindsdb/integrations/utilities/files/file_reader.py

@@ -1,10 +1,11 @@
+from dataclasses import dataclass, astuple
 import traceback
 import json
 import csv
 from io import BytesIO, StringIO, IOBase
 from pathlib import Path
 import codecs
-from typing import List
+from typing import List, Generator
 
 import filetype
 import pandas as pd
@@ -18,8 +19,27 @@ DEFAULT_CHUNK_SIZE = 500
 DEFAULT_CHUNK_OVERLAP = 250
 
 
-class FileDetectError(Exception):
-    ...
+class FileProcessingError(Exception): ...
+
+
+@dataclass(frozen=True, slots=True)
+class _SINGLE_PAGE_FORMAT:
+    CSV: str = "csv"
+    JSON: str = "json"
+    TXT: str = "txt"
+    PDF: str = "pdf"
+    PARQUET: str = "parquet"
+
+
+SINGLE_PAGE_FORMAT = _SINGLE_PAGE_FORMAT()
+
+
+@dataclass(frozen=True, slots=True)
+class _MULTI_PAGE_FORMAT:
+    XLSX: str = "xlsx"
+
+
+MULTI_PAGE_FORMAT = _MULTI_PAGE_FORMAT()
 
 
 def decode(file_obj: IOBase) -> StringIO:
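SINGLE_PAGE_FORMAT and MULTI_PAGE_FORMAT above are frozen, slotted dataclass instances acting as typo-safe enums; astuple() flattens them into plain tuples of format strings, which is how FormatDetector assembles supported_formats in the next hunk. A stdlib-only illustration with an abbreviated field set (the _FORMATS name is made up for the sketch):

    from dataclasses import astuple, dataclass


    @dataclass(frozen=True, slots=True)  # slots=True requires Python 3.10+
    class _FORMATS:
        CSV: str = "csv"
        PDF: str = "pdf"


    FORMATS = _FORMATS()
    assert astuple(FORMATS) == ("csv", "pdf")  # tuple in field-declaration order
    assert FORMATS.PDF in astuple(FORMATS)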
@@ -56,21 +76,20 @@ def decode(file_obj: IOBase) -> StringIO:
         data_str = StringIO(byte_str.decode(encoding, errors))
     except Exception as e:
         logger.error(traceback.format_exc())
-        raise
+        raise FileProcessingError("Could not load into string") from e
 
     return data_str
 
 
 class FormatDetector:
-
-
-    multipage_formats = ['xlsx']
+    supported_formats = astuple(SINGLE_PAGE_FORMAT) + astuple(MULTI_PAGE_FORMAT)
+    multipage_formats = astuple(MULTI_PAGE_FORMAT)
 
     def __init__(
         self,
-        path: str = None,
-        name: str = None,
-        file: IOBase = None
+        path: str | None = None,
+        name: str | None = None,
+        file: IOBase | None = None,
     ):
         """
         File format detector
@@ -81,16 +100,16 @@ class FormatDetector:
         :param file: file descriptor (via open(...), of BytesIO(...))
         """
         if path is not None:
-            file = open(path, 'rb')
+            file = open(path, "rb")
 
         elif file is not None:
             if name is None:
-                if hasattr(file, 'name'):
+                if hasattr(file, "name"):
                     path = file.name
                 else:
-                    path = 'file'
+                    path = "file"
         else:
-            raise
+            raise FileProcessingError("Wrong arguments: path or file is required")
 
         if name is None:
             name = Path(path).name
@@ -108,14 +127,14 @@ class FormatDetector:
         format = self.get_format_by_name()
         if format is not None:
             if format not in self.supported_formats:
-                raise
+                raise FileProcessingError(f"Not supported format: {format}")
 
         if format is None and self.file_obj is not None:
             format = self.get_format_by_content()
             self.file_obj.seek(0)
 
         if format is None:
-            raise
+            raise FileProcessingError(f"Unable to detect format: {self.name}")
 
         self.format = format
         return format
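With these hunks, detection fails loudly with FileProcessingError instead of bare raise statements: the file extension is consulted first, content sniffing is the fallback. A usage sketch, assuming the module path shown in the file list above:

    from io import BytesIO

    from mindsdb.integrations.utilities.files.file_reader import FileProcessingError, FormatDetector

    # Extension is checked first ...
    detector = FormatDetector(file=BytesIO(b"a,b\n1,2\n"), name="data.csv")
    assert detector.get_format() == "csv"

    # ... and an unsupported extension now raises instead of failing silently.
    try:
        FormatDetector(file=BytesIO(b"\x00\x01"), name="blob.bin").get_format()
    except FileProcessingError as exc:
        print(exc)  # Not supported format: bin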
@@ -124,33 +143,32 @@ class FormatDetector:
         extension = Path(self.name).suffix.strip(".").lower()
         if extension == "tsv":
             extension = "csv"
-            self.parameters['delimiter'] = '\t'
+            self.parameters["delimiter"] = "\t"
 
         return extension or None
 
     def get_format_by_content(self):
         if self.is_parquet(self.file_obj):
-            return 'parquet'
+            return SINGLE_PAGE_FORMAT.PARQUET
 
         file_type = filetype.guess(self.file_obj)
         if file_type is not None:
-
             if file_type.mime in {
                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                 "application/vnd.ms-excel",
             }:
-                return 'xlsx'
+                return MULTI_PAGE_FORMAT.XLSX
 
-            if file_type.mime == 'application/pdf':
-                return 'pdf'
+            if file_type.mime == "application/pdf":
+                return SINGLE_PAGE_FORMAT.PDF
 
         file_obj = decode(self.file_obj)
 
         if self.is_json(file_obj):
-            return 'json'
+            return SINGLE_PAGE_FORMAT.JSON
 
         if self.is_csv(file_obj):
-            return 'csv'
+            return SINGLE_PAGE_FORMAT.CSV
 
     @staticmethod
     def is_json(data_obj: StringIO) -> bool:
@@ -198,35 +216,53 @@ class FormatDetector:
         return False
 
 
-class FileReader(FormatDetector):
+def format_column_names(df: pd.DataFrame):
+    df.columns = [column.strip(" \t") for column in df.columns]
+    if len(df.columns) != len(set(df.columns)) or any(len(column_name) == 0 for column_name in df.columns):
+        raise FileProcessingError("Each column should have a unique and non-empty name.")
+
 
+class FileReader(FormatDetector):
     def _get_fnc(self):
         format = self.get_format()
-        func = getattr(self, f'read_{format}', None)
+        func = getattr(self, f"read_{format}", None)
         if func is None:
-            raise
-
+            raise FileProcessingError(f"Unsupported format: {format}")
+
+        if format in astuple(MULTI_PAGE_FORMAT):
+
+            def format_multipage(*args, **kwargs):
+                for page_number, df in func(*args, **kwargs):
+                    format_column_names(df)
+                    yield page_number, df
+
+            return format_multipage
+
+        def format_singlepage(*args, **kwargs) -> pd.DataFrame:
+            """Check that the columns have unique not-empty names"""
+            df = func(*args, **kwargs)
+            format_column_names(df)
+            return df
+
+        return format_singlepage
 
     def get_pages(self, **kwargs) -> List[str]:
         """
-
+        Get list of tables in file
         """
         format = self.get_format()
         if format not in self.multipage_formats:
             # only one table
-            return ['main']
+            return ["main"]
 
         func = self._get_fnc()
         self.file_obj.seek(0)
 
-        return [
-            name for name, _ in
-            func(self.file_obj, only_names=True, **kwargs)
-        ]
+        return [name for name, _ in func(self.file_obj, only_names=True, **kwargs)]
 
-    def get_contents(self, **kwargs):
+    def get_contents(self, **kwargs) -> dict[str, pd.DataFrame]:
         """
-
+        Get all info(pages with content) from file as dict: {tablename, content}
         """
         func = self._get_fnc()
         self.file_obj.seek(0)
@@ -234,17 +270,13 @@ class FileReader(FormatDetector):
         format = self.get_format()
         if format not in self.multipage_formats:
             # only one table
-            return {'main': func(self.file_obj, name=self.name, **kwargs)}
+            return {"main": func(self.file_obj, name=self.name, **kwargs)}
 
-        return {
-            name: df
-            for name, df in
-            func(self.file_obj, **kwargs)
-        }
+        return {name: df for name, df in func(self.file_obj, **kwargs)}
 
-    def get_page_content(self, page_name: str = None, **kwargs) -> pd.DataFrame:
+    def get_page_content(self, page_name: str | None = None, **kwargs) -> pd.DataFrame:
         """
-
+        Get content of a single table
         """
         func = self._get_fnc()
         self.file_obj.seek(0)
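FileReader's page API follows directly from the single-page/multi-page split above: single-page formats expose one pseudo-table named "main", while an xlsx file exposes one page per sheet. A usage sketch under the same module-path assumption:

    from io import BytesIO

    from mindsdb.integrations.utilities.files.file_reader import FileReader

    reader = FileReader(file=BytesIO(b"a,b\n1,2\n"), name="data.csv")
    assert reader.get_pages() == ["main"]
    tables = reader.get_contents()      # {"main": <DataFrame with columns a, b>}
    df = reader.get_page_content()      # the same single table as a DataFrame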
@@ -258,7 +290,7 @@ class FileReader(FormatDetector):
         return df
 
     @staticmethod
-    def _get_csv_dialect(buffer, delimiter=None) -> csv.Dialect:
+    def _get_csv_dialect(buffer, delimiter: str | None = None) -> csv.Dialect | None:
         sample = buffer.readline()  # trying to get dialect from header
         buffer.seek(0)
         try:
@@ -270,42 +302,35 @@ class FileReader(FormatDetector):
             else:
                 accepted_csv_delimiters = [",", "\t", ";"]
                 try:
-                    dialect = csv.Sniffer().sniff(
-                        sample, delimiters=accepted_csv_delimiters
-                    )
-                    dialect.doublequote = (
-                        True  # assume that all csvs have " as string escape
-                    )
+                    dialect = csv.Sniffer().sniff(sample, delimiters=accepted_csv_delimiters)
+                    dialect.doublequote = True  # assume that all csvs have " as string escape
                 except Exception:
                     dialect = csv.reader(sample).dialect
                     if dialect.delimiter not in accepted_csv_delimiters:
-                        raise FileDetectError(
-                            f"CSV delimeter '{dialect.delimiter}' is not supported"
-                        )
+                        raise FileProcessingError(f"CSV delimeter '{dialect.delimiter}' is not supported")
 
         except csv.Error:
             dialect = None
         return dialect
 
     @classmethod
-    def read_csv(cls, file_obj: BytesIO, delimiter=None, **kwargs):
+    def read_csv(cls, file_obj: BytesIO, delimiter: str | None = None, **kwargs) -> pd.DataFrame:
         file_obj = decode(file_obj)
         dialect = cls._get_csv_dialect(file_obj, delimiter=delimiter)
-
         return pd.read_csv(file_obj, sep=dialect.delimiter, index_col=False)
 
     @staticmethod
-    def read_txt(file_obj: BytesIO, name=None, **kwargs):
+    def read_txt(file_obj: BytesIO, name: str | None = None, **kwargs) -> pd.DataFrame:
         # the lib is heavy, so import it only when needed
         from langchain_text_splitters import RecursiveCharacterTextSplitter
+
         file_obj = decode(file_obj)
 
         try:
             from langchain_core.documents import Document
         except ImportError:
-            raise ImportError(
-                "To import TXT document please install 'langchain-community':\n"
-                " pip install langchain-community"
+            raise FileProcessingError(
+                "To import TXT document please install 'langchain-community':\n pip install langchain-community"
             )
         text = file_obj.read()
 
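The dialect logic above boils down to: let csv.Sniffer choose among the accepted delimiters, force doublequote handling, and fall back to csv.reader's inferred dialect on failure. A stdlib-only illustration of the happy path:

    import csv

    sample = "id;name;amount\n1;widget;9.99\n"
    dialect = csv.Sniffer().sniff(sample, delimiters=[",", "\t", ";"])
    dialect.doublequote = True  # treat "" as an escaped quote, as read_csv expects
    assert dialect.delimiter == ";"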
@@ -317,15 +342,10 @@ class FileReader(FormatDetector):
         )
 
         docs = text_splitter.split_documents(documents)
-        return pd.DataFrame(
-            [
-                {"content": doc.page_content, "metadata": doc.metadata}
-                for doc in docs
-            ]
-        )
+        return pd.DataFrame([{"content": doc.page_content, "metadata": doc.metadata} for doc in docs])
 
     @staticmethod
-    def read_pdf(file_obj: BytesIO, name=None, **kwargs):
+    def read_pdf(file_obj: BytesIO, name: str | None = None, **kwargs) -> pd.DataFrame:
         # the libs are heavy, so import it only when needed
         import fitz  # pymupdf
         from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -340,30 +360,36 @@ class FileReader(FormatDetector):
         split_text = text_splitter.split_text(text)
 
         return pd.DataFrame(
-            {"content": split_text, "metadata": [{"file_format": "pdf", "source_file": name}] * len(split_text)}
+            {
+                "content": split_text,
+                "metadata": [{"file_format": "pdf", "source_file": name}] * len(split_text),
+            }
         )
 
     @staticmethod
-    def read_json(file_obj: BytesIO, **kwargs):
+    def read_json(file_obj: BytesIO, **kwargs) -> pd.DataFrame:
         file_obj = decode(file_obj)
         file_obj.seek(0)
         json_doc = json.loads(file_obj.read())
         return pd.json_normalize(json_doc, max_level=0)
 
     @staticmethod
-    def read_parquet(file_obj: BytesIO, **kwargs):
+    def read_parquet(file_obj: BytesIO, **kwargs) -> pd.DataFrame:
         return pd.read_parquet(file_obj)
 
     @staticmethod
-    def read_xlsx(file_obj: BytesIO, page_name=None, only_names=False, **kwargs):
+    def read_xlsx(
+        file_obj: BytesIO,
+        page_name: str | None = None,
+        only_names: bool = False,
+        **kwargs,
+    ) -> Generator[tuple[str, pd.DataFrame | None], None, None]:
         with pd.ExcelFile(file_obj) as xls:
-
             if page_name is not None:
                 # return specific page
                 yield page_name, pd.read_excel(xls, sheet_name=page_name)
 
             for page_name in xls.sheet_names:
-
                 if only_names:
                     # extract only pages names
                     df = None