MindsDB 25.5.4.2__py3-none-any.whl → 25.6.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of MindsDB has been flagged as potentially problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/agent.py +50 -26
- mindsdb/api/a2a/common/server/server.py +32 -26
- mindsdb/api/a2a/task_manager.py +68 -6
- mindsdb/api/executor/command_executor.py +69 -14
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +91 -84
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
- mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
- mindsdb/api/executor/planner/plan_join.py +67 -77
- mindsdb/api/executor/planner/query_planner.py +176 -155
- mindsdb/api/executor/planner/steps.py +37 -12
- mindsdb/api/executor/sql_query/result_set.py +45 -64
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
- mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
- mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
- mindsdb/api/executor/utilities/sql.py +42 -48
- mindsdb/api/http/namespaces/config.py +1 -1
- mindsdb/api/http/namespaces/file.py +14 -23
- mindsdb/api/http/namespaces/knowledge_bases.py +132 -154
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +219 -28
- mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +29 -33
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +277 -356
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +145 -40
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +352 -83
- mindsdb/integrations/libs/api_handler.py +279 -57
- mindsdb/integrations/libs/base.py +185 -30
- mindsdb/integrations/utilities/files/file_reader.py +99 -73
- mindsdb/integrations/utilities/handler_utils.py +23 -8
- mindsdb/integrations/utilities/sql_utils.py +35 -40
- mindsdb/interfaces/agents/agents_controller.py +226 -196
- mindsdb/interfaces/agents/constants.py +8 -1
- mindsdb/interfaces/agents/langchain_agent.py +42 -11
- mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
- mindsdb/interfaces/agents/mindsdb_database_agent.py +23 -18
- mindsdb/interfaces/data_catalog/__init__.py +0 -0
- mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +375 -0
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +38 -0
- mindsdb/interfaces/database/database.py +81 -57
- mindsdb/interfaces/database/integrations.py +222 -234
- mindsdb/interfaces/database/log.py +72 -104
- mindsdb/interfaces/database/projects.py +156 -193
- mindsdb/interfaces/file/file_controller.py +21 -65
- mindsdb/interfaces/knowledge_base/controller.py +66 -25
- mindsdb/interfaces/knowledge_base/evaluate.py +516 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
- mindsdb/interfaces/skills/skills_controller.py +31 -36
- mindsdb/interfaces/skills/sql_agent.py +113 -86
- mindsdb/interfaces/storage/db.py +242 -82
- mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
- mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
- mindsdb/utilities/config.py +13 -2
- mindsdb/utilities/log.py +35 -26
- mindsdb/utilities/ml_task_queue/task.py +19 -22
- mindsdb/utilities/render/sqlalchemy_render.py +129 -181
- mindsdb/utilities/starters.py +40 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/METADATA +257 -257
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/RECORD +76 -68
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/top_level.txt +0 -0
mindsdb/api/mysql/mysql_proxy/utilities/dump.py

@@ -1,6 +1,7 @@
-import …
+import struct
 import datetime
 from typing import Any
+from array import array

 import numpy as np
 from numpy import dtype as np_dtype
@@ -9,11 +10,19 @@ from pandas.api import types as pd_types

 from mindsdb.api.executor.sql_query.result_set import ResultSet, get_mysql_data_type_from_series, Column
 from mindsdb.api.mysql.mysql_proxy.utilities.lightwood_dtype import dtype as lightwood_dtype
-from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import …
+from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import (
+    MYSQL_DATA_TYPE,
+    DATA_C_TYPE_MAP,
+    CTypeProperties,
+    CHARSET_NUMBERS,
+)
 from mindsdb.utilities import log
+from mindsdb.utilities.json_encoder import CustomJSONEncoder

 logger = log.getLogger(__name__)

+json_encoder = CustomJSONEncoder()
+
 def column_to_mysql_column_dict(column: Column, database_name: str | None = None) -> dict[str, str | int]:
     """Convert Column object to dict with column properties.
@@ -52,9 +61,13 @@ def column_to_mysql_column_dict(column: Column, database_name: str | None = None
     # endregion

     if isinstance(column.type, MYSQL_DATA_TYPE) is False:
-        logger.warning(f…
+        logger.warning(f"Unexpected column type: {column.type}. Use TEXT as fallback.")
         column.type = MYSQL_DATA_TYPE.TEXT

+    charset = CHARSET_NUMBERS["utf8_unicode_ci"]
+    if column.type in (MYSQL_DATA_TYPE.JSON, MYSQL_DATA_TYPE.VECTOR):
+        charset = CHARSET_NUMBERS["binary"]
+
     type_properties: CTypeProperties = DATA_C_TYPE_MAP[column.type]

     result = {
@@ -66,6 +79,7 @@ def column_to_mysql_column_dict(column: Column, database_name: str | None = None
         "size": type_properties.size,
         "flags": type_properties.flags,
         "type": type_properties.code,
+        "charset": charset,
     }
     return result

@@ -82,7 +96,7 @@ def _dump_bool(var: Any) -> int | None:
     """
     if pd.isna(var):
         return None
-    return …
+    return "1" if var else "0"


 def _dump_str(var: Any) -> str | None:
@@ -94,18 +108,19 @@ def _dump_str(var: Any) -> str | None:
     Returns:
         str | None: The string representation of the value or None if the value is None
     """
-    if pd.isna(var):
-        return None
     if isinstance(var, bytes):
         try:
-            return var.decode(…
+            return var.decode("utf-8")
         except Exception:
             return str(var)[2:-1]
-    if isinstance(var, dict):
+    if isinstance(var, (dict, list)):
         try:
-            return …
+            return json_encoder.encode(var)
         except Exception:
             return str(var)
+    if isinstance(var, list) is False and pd.isna(var):
+        # pd.isna returns array of bools for list, so we need to check if it is not a list
+        return None
     return str(var)


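For illustration, a minimal sketch of what the new dict/list branch of _dump_str does: structured values are serialized to a JSON string before being written to the wire. The standard json module stands in here for MindsDB's CustomJSONEncoder.

import json
from typing import Any

import pandas as pd


def dump_str_sketch(var: Any) -> str | None:
    """Approximate the new _dump_str behaviour for dict/list values."""
    if isinstance(var, (dict, list)):
        # Structured values go out as JSON text instead of str(repr).
        return json.dumps(var)
    if pd.isna(var):
        return None
    return str(var)


print(dump_str_sketch({"a": 1}))   # {"a": 1}
print(dump_str_sketch([1, 2, 3]))  # [1, 2, 3]
print(dump_str_sketch(None))       # None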
@@ -142,7 +157,7 @@ def _dump_date(var: datetime.date | str | None) -> str | None:
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f…
+    logger.warning(f"Unexpected value type for DATE: {type(var)}, {var}")
     return _dump_str(var)


@@ -157,18 +172,18 @@ def _dump_datetime(var: datetime.datetime | str | None) -> str | None:
         str | None: The string representation of the datetime value or None if the value is None
     """
     if isinstance(var, datetime.date): # it is also datetime.datetime
-        if hasattr(var, …
+        if hasattr(var, "tzinfo") and var.tzinfo is not None:
             return var.astimezone(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
         return var.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(var, pd.Timestamp):
         if var.tzinfo is not None:
-            return var.tz_convert(…
+            return var.tz_convert("UTC").strftime("%Y-%m-%d %H:%M:%S")
         return var.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(var, str):
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f…
+    logger.warning(f"Unexpected value type for DATETIME: {type(var)}, {var}")
     return _dump_str(var)


@@ -198,16 +213,34 @@ def _dump_time(var: datetime.time | str | None) -> str | None:
         return var.strftime("%H:%M:%S")
     elif isinstance(var, pd.Timestamp):
         if var.tzinfo is not None:
-            return var.tz_convert(…
+            return var.tz_convert("UTC").strftime("%H:%M:%S")
         return var.strftime("%H:%M:%S")
     elif isinstance(var, str):
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f…
+    logger.warning(f"Unexpected value type for TIME: {type(var)}, {var}")
     return _dump_str(var)


+def _dump_vector(value: Any) -> bytes | None:
+    """Convert array or list of floats to a bytes.
+
+    Args:
+        value (Any): The value to dump
+
+    Returns:
+        bytes | None: The bytes representation of the vector value or None if the value is None
+    """
+    if isinstance(value, (array, list, np.ndarray)):
+        return b"".join([struct.pack("<f", el) for el in value])
+    elif pd.isna(value):
+        return None
+    err_msg = f"Unexpected value type for VECTOR: {type(value)}, {value}"
+    logger.error(err_msg)
+    raise ValueError(err_msg)
+
+
 def _handle_series_as_date(series: pd.Series) -> pd.Series:
     """Convert values in a series to a string representation of a date.
     NOTE: MySQL require exactly %Y-%m-%d for DATE type.
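A quick sketch of the wire format the new _dump_vector helper produces: each float is packed as a 4-byte little-endian IEEE-754 value and the results are concatenated. The unpacking side below is only for illustration of the round trip.

import struct


def pack_vector(values):
    # 4 bytes per float, little-endian, concatenated in order.
    return b"".join(struct.pack("<f", v) for v in values)


def unpack_vector(blob):
    # Inverse operation, shown here only to verify the packing.
    return [v for (v,) in struct.iter_unpack("<f", blob)]


blob = pack_vector([1.0, 2.5, -3.0])
print(len(blob))            # 12 bytes: 3 floats * 4 bytes
print(unpack_vector(blob))  # [1.0, 2.5, -3.0]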
@@ -219,10 +252,10 @@ def _handle_series_as_date(series: pd.Series) -> pd.Series:
         pd.Series: The series with the date values as strings
     """
     if pd_types.is_datetime64_any_dtype(series.dtype):
-        return series.dt.strftime(…
+        return series.dt.strftime("%Y-%m-%d")
     elif pd_types.is_object_dtype(series.dtype):
         return series.apply(_dump_date)
-    logger.info(f…
+    logger.info(f"Unexpected dtype: {series.dtype} for column with type DATE")
     return series.apply(_dump_str)


@@ -237,10 +270,10 @@ def _handle_series_as_datetime(series: pd.Series) -> pd.Series:
         pd.Series: The series with the datetime values as strings
     """
     if pd_types.is_datetime64_any_dtype(series.dtype):
-        return series.dt.strftime(…
+        return series.dt.strftime("%Y-%m-%d %H:%M:%S")
     elif pd_types.is_object_dtype(series.dtype):
         return series.apply(_dump_datetime)
-    logger.info(f…
+    logger.info(f"Unexpected dtype: {series.dtype} for column with type DATETIME")
     return series.apply(_dump_str)


@@ -255,14 +288,14 @@ def _handle_series_as_time(series: pd.Series) -> pd.Series:
         pd.Series: The series with the time values as strings
     """
     if pd_types.is_timedelta64_ns_dtype(series.dtype):
-        base_time = pd.Timestamp(…
-        series = (…
+        base_time = pd.Timestamp("2000-01-01")
+        series = (base_time + series).dt.strftime("%H:%M:%S")
     elif pd_types.is_datetime64_dtype(series.dtype):
-        series = series.dt.strftime(…
+        series = series.dt.strftime("%H:%M:%S")
     elif pd_types.is_object_dtype(series.dtype):
         series = series.apply(_dump_time)
     else:
-        logger.info(f…
+        logger.info(f"Unexpected dtype: {series.dtype} for column with type TIME")
         series = series.apply(_dump_str)
     return series

@@ -278,14 +311,29 @@ def _handle_series_as_int(series: pd.Series) -> pd.Series:
         pd.Series: The series with the int values as strings
     """
     if pd_types.is_integer_dtype(series.dtype):
-        if series.dtype == …
+        if series.dtype == "Int64":
            # NOTE: 'apply' converts values to python floats
            return series.astype(object).apply(_dump_str)
         return series.apply(_dump_str)
     return series.apply(_dump_int_or_str)


-def …
+def _handle_series_as_vector(series: pd.Series) -> pd.Series:
+    """Convert values in a series to a bytes representation of a vector.
+    NOTE: MySQL's VECTOR type require exactly 4 bytes per float.
+
+    Args:
+        series (pd.Series): The series to handle
+
+    Returns:
+        pd.Series: The series with the vector values as bytes
+    """
+    return series.apply(_dump_vector)
+
+
+def dump_result_set_to_mysql(
+    result_set: ResultSet, infer_column_size: bool = False
+) -> tuple[pd.DataFrame, list[dict[str, str | int]]]:
     """
     Dumps the ResultSet to a format that can be used to send as MySQL response packet.
     NOTE: This method modifies the original DataFrame and columns.
@@ -319,10 +367,16 @@ def dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = Fa
             case MYSQL_DATA_TYPE.TIME:
                 series = _handle_series_as_time(series)
             case (
-                MYSQL_DATA_TYPE.INT…
-                | MYSQL_DATA_TYPE.…
+                MYSQL_DATA_TYPE.INT
+                | MYSQL_DATA_TYPE.TINYINT
+                | MYSQL_DATA_TYPE.SMALLINT
+                | MYSQL_DATA_TYPE.MEDIUMINT
+                | MYSQL_DATA_TYPE.BIGINT
+                | MYSQL_DATA_TYPE.YEAR
             ):
                 series = _handle_series_as_int(series)
+            case MYSQL_DATA_TYPE.VECTOR:
+                series = _handle_series_as_vector(series)
             case _:
                 series = series.apply(_dump_str)

@@ -330,22 +384,19 @@ def dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = Fa
         # we may split this operation for dt and other types for optimisation
         df[i] = series.replace([np.NaN, pd.NA, pd.NaT], None)

-    columns_dicts = [
-        column_to_mysql_column_dict(column)
-        for column in result_set.columns
-    ]
+    columns_dicts = [column_to_mysql_column_dict(column) for column in result_set.columns]

-    if infer_column_size and any(column_info.get(…
+    if infer_column_size and any(column_info.get("size") is None for column_info in columns_dicts):
         if len(df) == 0:
             for column_info in columns_dicts:
-                if column_info[…
-                    column_info[…
+                if column_info["size"] is None:
+                    column_info["size"] = 1
         else:
             sample = df.head(100)
             for i, column_info in enumerate(columns_dicts):
                 try:
-                    column_info[…
+                    column_info["size"] = sample[sample.columns[i]].astype(str).str.len().max()
                 except Exception:
-                    column_info[…
+                    column_info["size"] = 1

     return df, columns_dicts
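To illustrate the size-inference path above: when a column's display size is unknown, the new code samples up to the first 100 rows and takes the longest string representation, falling back to 1 on any error. A minimal pandas sketch with an illustrative DataFrame:

import pandas as pd

df = pd.DataFrame({"name": ["a", "abcd", "ab"], "score": [1, 123456, 12]})

sample = df.head(100)
sizes = []
for col in sample.columns:
    try:
        # Longest string representation seen in the sample.
        sizes.append(int(sample[col].astype(str).str.len().max()))
    except Exception:
        sizes.append(1)  # fallback used by the handler when inference fails

print(sizes)  # [4, 6]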
mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py

@@ -1,26 +1,28 @@
-from …
+from google.cloud.bigquery import Client, QueryJobConfig
 from google.api_core.exceptions import BadRequest
+import pandas as pd
 from sqlalchemy_bigquery.base import BigQueryDialect
-from …
+from typing import Any, Dict, Optional, Text

 from mindsdb.utilities import log
 from mindsdb_sql_parser.ast.base import ASTNode
-from mindsdb.integrations.libs.base import …
+from mindsdb.integrations.libs.base import MetaDatabaseHandler
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.utilities.handlers.auth_utilities.google import GoogleServiceAccountOAuth2Manager
 from mindsdb.integrations.libs.response import (
     HandlerStatusResponse as StatusResponse,
     HandlerResponse as Response,
-    RESPONSE_TYPE
+    RESPONSE_TYPE,
 )

 logger = log.getLogger(__name__)


-class BigQueryHandler(…
+class BigQueryHandler(MetaDatabaseHandler):
     """
     This handler handles connection and execution of Google BigQuery statements.
     """
+
     name = "bigquery"

     def __init__(self, name: Text, connection_data: Dict, **kwargs: Any):
@@ -49,19 +51,16 @@ class BigQueryHandler(DatabaseHandler):
             return self.connection

         # Mandatory connection parameters
-        if not all(key in self.connection_data for key in […
-            raise ValueError(…
+        if not all(key in self.connection_data for key in ["project_id", "dataset"]):
+            raise ValueError("Required parameters (project_id, dataset) must be provided.")

         google_sa_oauth2_manager = GoogleServiceAccountOAuth2Manager(
-            credentials_file=self.connection_data.get(…
-            credentials_json=self.connection_data.get(…
+            credentials_file=self.connection_data.get("service_account_keys"),
+            credentials_json=self.connection_data.get("service_account_json"),
         )
         credentials = google_sa_oauth2_manager.get_oauth2_credentials()

-        client = Client(
-            project=self.connection_data["project_id"],
-            credentials=credentials
-        )
+        client = Client(project=self.connection_data["project_id"], credentials=credentials)
         self.is_connected = True
         self.connection = client
         return self.connection
@@ -86,14 +85,14 @@ class BigQueryHandler(DatabaseHandler):

         try:
             connection = self.connect()
-            connection.query(…
+            connection.query("SELECT 1;")

             # Check if the dataset exists
-            connection.get_dataset(self.connection_data[…
+            connection.get_dataset(self.connection_data["dataset"])

             response.success = True
         except (BadRequest, ValueError) as e:
-            logger.error(f…
+            logger.error(f"Error connecting to BigQuery {self.connection_data['project_id']}, {e}!")
             response.error_message = e

         if response.success is False and self.is_connected is True:
@@ -113,22 +112,18 @@ class BigQueryHandler(DatabaseHandler):
         """
         connection = self.connect()
         try:
-            job_config = QueryJobConfig(…
+            job_config = QueryJobConfig(
+                default_dataset=f"{self.connection_data['project_id']}.{self.connection_data['dataset']}"
+            )
             query = connection.query(query, job_config=job_config)
             result = query.to_dataframe()
             if not result.empty:
-                response = Response(
-                    RESPONSE_TYPE.TABLE,
-                    result
-                )
+                response = Response(RESPONSE_TYPE.TABLE, result)
             else:
                 response = Response(RESPONSE_TYPE.OK)
         except Exception as e:
-            logger.error(f…
-            response = Response(
-                RESPONSE_TYPE.ERROR,
-                error_message=str(e)
-            )
+            logger.error(f"Error running query: {query} on {self.connection_data['project_id']}!")
+            response = Response(RESPONSE_TYPE.ERROR, error_message=str(e))
         return response

     def query(self, query: ASTNode) -> Response:
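The native_query change above pins every job to a default dataset so unqualified table names resolve correctly. A minimal usage sketch with the google-cloud-bigquery client; the project, dataset, and table names are placeholders, and in the handler these come from connection_data:

from google.cloud.bigquery import Client, QueryJobConfig

# Placeholder identifiers; credentials are resolved from the environment here.
client = Client(project="my-project")
job_config = QueryJobConfig(default_dataset="my-project.my_dataset")

# The unqualified table name "orders" resolves against the default dataset.
job = client.query("SELECT COUNT(*) AS n FROM orders", job_config=job_config)
print(job.to_dataframe())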
@@ -154,7 +149,7 @@ class BigQueryHandler(DatabaseHandler):
         """
         query = f"""
             SELECT table_name, table_schema, table_type
-            FROM `{self.connection_data[…
+            FROM `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLES`
             WHERE table_type IN ('BASE TABLE', 'VIEW')
         """
         result = self.native_query(query)
@@ -174,8 +169,204 @@ class BigQueryHandler(DatabaseHandler):
         """
         query = f"""
             SELECT column_name AS Field, data_type as Type
-            FROM `{self.connection_data[…
+            FROM `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.COLUMNS`
             WHERE table_name = '{table_name}'
         """
         result = self.native_query(query)
         return result
+
+    def meta_get_tables(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves table metadata for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve metadata information.
+
+        Returns:
+            Response: A response object containing the metadata information, formatted as per the `Response` class.
+        """
+        query = f"""
+            SELECT
+                t.table_name,
+                t.table_schema,
+                t.table_type,
+                st.row_count
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLES` AS t
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.__TABLES__` AS st
+            ON
+                t.table_name = st.table_id
+            WHERE
+                t.table_type IN ('BASE TABLE', 'VIEW')
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND t.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_columns(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves column metadata for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve column metadata.
+
+        Returns:
+            Response: A response object containing the column metadata.
+        """
+        query = f"""
+            SELECT
+                table_name,
+                column_name,
+                data_type,
+                column_default,
+                CASE is_nullable
+                    WHEN 'YES' THEN TRUE
+                    ELSE FALSE
+                END AS is_nullable
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.COLUMNS`
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" WHERE table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_column_statistics_for_table(self, table_name: str, columns: list) -> Response:
+        """
+        Retrieves statistics for the specified columns in a table.
+
+        Args:
+            table_name (str): The name of the table.
+            columns (list): A list of column names to retrieve statistics for.
+
+        Returns:
+            Response: A response object containing the column statistics.
+        """
+        # To avoid hitting BigQuery's query size limits, we will chunk the columns into batches.
+        # This is because the queries are combined using UNION ALL, which can lead to very large queries if there are many columns.
+        BATCH_SIZE = 20
+
+        def chunked(lst, n):
+            """
+            Yields successive n-sized chunks from lst.
+            """
+            for i in range(0, len(lst), n):
+                yield lst[i : i + n]
+
+        queries = []
+        for column_batch in chunked(columns, BATCH_SIZE):
+            batch_queries = []
+            for column in column_batch:
+                batch_queries.append(
+                    f"""
+                    SELECT
+                        '{table_name}' AS table_name,
+                        '{column}' AS column_name,
+                        SAFE_DIVIDE(COUNTIF({column} IS NULL), COUNT(*)) * 100 AS null_percentage,
+                        CAST(MIN(`{column}`) AS STRING) AS minimum_value,
+                        CAST(MAX(`{column}`) AS STRING) AS maximum_value,
+                        COUNT(DISTINCT {column}) AS distinct_values_count
+                    FROM
+                        `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.{table_name}`
+                    """
+                )
+
+            query = " UNION ALL ".join(batch_queries)
+            queries.append(query)
+
+        results = []
+        for query in queries:
+            try:
+                result = self.native_query(query)
+                if result.resp_type == RESPONSE_TYPE.TABLE:
+                    results.append(result.data_frame)
+                else:
+                    logger.error(f"Error retrieving column statistics for table {table_name}: {result.error_message}")
+            except Exception as e:
+                logger.error(f"Exception occurred while retrieving column statistics for table {table_name}: {e}")
+
+        if not results:
+            logger.warning(f"No column statistics could be retrieved for table {table_name}.")
+            return Response(
+                RESPONSE_TYPE.ERROR, error_message=f"No column statistics could be retrieved for table {table_name}."
+            )
+        return Response(RESPONSE_TYPE.TABLE, pd.concat(results, ignore_index=True) if results else pd.DataFrame())
+
+    def meta_get_primary_keys(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves primary key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve primary key information.
+
+        Returns:
+            Response: A response object containing the primary key information.
+        """
+        query = f"""
+            SELECT
+                tc.table_name,
+                kcu.column_name,
+                kcu.ordinal_position,
+                tc.constraint_name,
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS` AS tc
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE` AS kcu
+            ON
+                tc.constraint_name = kcu.constraint_name
+            WHERE
+                tc.constraint_type = 'PRIMARY KEY'
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND tc.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_foreign_keys(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves foreign key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve foreign key information.
+
+        Returns:
+            Response: A response object containing the foreign key information.
+        """
+        query = f"""
+            SELECT
+                ccu.table_name AS parent_table_name,
+                ccu.column_name AS parent_column_name,
+                kcu.table_name AS child_table_name,
+                kcu.column_name AS child_column_name,
+                tc.constraint_name
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS` AS tc
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE` AS kcu
+            ON
+                tc.constraint_name = kcu.constraint_name
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE` AS ccu
+            ON
+                tc.constraint_name = ccu.constraint_name
+            WHERE
+                tc.constraint_type = 'FOREIGN KEY'
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND tc.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
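The column-statistics method above builds one SELECT per column and joins them with UNION ALL, batching columns in groups of 20 so each query stays under BigQuery's size limits. A small standalone sketch of that batching step; the table and column names are illustrative:

def chunked(lst, n):
    # Yield successive n-sized chunks from lst.
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


columns = [f"col_{i}" for i in range(45)]
BATCH_SIZE = 20

queries = []
for batch in chunked(columns, BATCH_SIZE):
    parts = [
        f"SELECT '{c}' AS column_name, COUNT(DISTINCT {c}) AS n FROM `proj.ds.t`"
        for c in batch
    ]
    queries.append(" UNION ALL ".join(parts))

print(len(queries))  # 3 batches: 20 + 20 + 5 columns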
mindsdb/integrations/handlers/file_handler/file_handler.py

@@ -75,10 +75,7 @@ class FileHandler(DatabaseHandler):
     def query(self, query: ASTNode) -> Response:
         if type(query) is DropTables:
             for table_identifier in query.tables:
-                if (
-                    len(table_identifier.parts) == 2
-                    and table_identifier.parts[0] != self.name
-                ):
+                if len(table_identifier.parts) == 2 and table_identifier.parts[0] != self.name:
                     return Response(
                         RESPONSE_TYPE.ERROR,
                         error_message=f"Can't delete table from database '{table_identifier.parts[0]}'",
@@ -136,9 +133,20 @@ class FileHandler(DatabaseHandler):
             return Response(RESPONSE_TYPE.OK)

         elif isinstance(query, Select):
-…
+            if isinstance(query.from_table, Select):
+                # partitioning mode
+                sub_result = self.query(query.from_table)
+                if sub_result.error_message is not None:
+                    raise RuntimeError(sub_result.error_message)

-…
+                df = sub_result.data_frame
+                query.from_table = Identifier("t")
+            elif isinstance(query.from_table, Identifier):
+                table_name, page_name = self._get_table_page_names(query.from_table)
+
+                df = self.file_controller.get_file_data(table_name, page_name)
+            else:
+                raise RuntimeError(f"Not supported query target: {query}")

             # Process the SELECT query
             result_df = query_df(df, query)
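Conceptually, the new partitioning branch above evaluates the inner SELECT first and then applies the outer SELECT to the intermediate result, which the handler exposes under the alias "t". A plain-pandas sketch of that two-stage evaluation; the real handler delegates both stages to MindsDB's query_df, and the column names here are illustrative:

import pandas as pd

# Stage 1: the "inner query" result, analogous to self.query(query.from_table).
inner = pd.DataFrame({"city": ["a", "b", "a"], "amount": [10, 20, 30]})

# Stage 2: the outer SELECT runs against the intermediate frame rather than a file.
outer = inner.groupby("city", as_index=False)["amount"].sum()
print(outer)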
@@ -191,9 +199,7 @@ class FileHandler(DatabaseHandler):
                 data_frame=pd.DataFrame(
                     [
                         {
-                            "Field": x["name"].strip()
-                            if isinstance(x, dict)
-                            else x.strip(),
+                            "Field": x["name"].strip() if isinstance(x, dict) else x.strip(),
                             "Type": "str",
                         }
                         for x in file_meta["columns"]