PyPI - MindsDB - Versions diffs - 25.1.2.0__py3-none-any.whl → 25.1.5.0__py3-none-any.whl - Mend

MindsDB 25.1.2.0__py3-none-any.whl → 25.1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic. Click here for more details.

Files changed (99) hide show

mindsdb/interfaces/skills/sql_agent.py CHANGED Viewed

@@ -1,37 +1,84 @@
-from typing import Iterable, List, Optional
 import re
-from mindsdb_sql_parser.ast import Select, Show, Describe, Explain
+import inspect
+from typing import Iterable, List, Optional
 import pandas as pd
 from mindsdb_sql_parser import parse_sql
-from mindsdb_sql_parser.ast import Identifier
-from mindsdb.integrations.utilities.query_traversal import query_traversal
+from mindsdb_sql_parser.ast import Select, Show, Describe, Explain, Identifier
 from mindsdb.utilities import log
 from mindsdb.utilities.context import context as ctx
+from mindsdb.integrations.utilities.query_traversal import query_traversal
 logger = log.getLogger(__name__)
-class SQLAgent:
+def split_table_name(table_name: str) -> List[str]:
+    """Split table name from llm to parst
+    Args:
+        table_name (str): input table name
+    Returns:
+        List[str]: parts of table identifier like ['database', 'schema', 'table']
+    Example:
+        Input: 'aaa.bbb', Output: ['aaa', 'bbb']
+        Input: '`aaa.bbb`', Output: ['aaa', 'bbb']
+        Input: '`aaa.`bbb``', Output: ['aaa', 'bbb']
+        Input: 'aaa.bbb.ccc', Output: ['aaa', 'bbb', 'ccc']
+        Input: '`aaa.bbb.ccc`', Output: ['aaa', 'bbb', 'ccc']
+        Input: '`aaa.`bbb.ccc``', Output: ['aaa', 'bbb.ccc']
+        Input: 'aaa.`bbb.ccc`', Output: ['aaa', 'bbb.ccc']
+        Input: 'aaa.`bbb.ccc`', Output: ['aaa', 'bbb.ccc']
+        Input: '`` aaa.`bbb.ccc``  \n`', Output: ['aaa', 'bbb.ccc']
+    """
+    table_name = table_name.strip(' "\'\n\r')
+    while table_name.startswith('`') and table_name.endswith('`'):
+        table_name = table_name[1:-1]
+        table_name = table_name.strip(' "\'\n\r')
+    result = []
+    part = []
+    inside_quotes = False
+    for char in table_name:
+        if char == '`':
+            inside_quotes = not inside_quotes
+            continue
+        if char == '.' and not inside_quotes:
+            result.append(''.join(part))
+            part = []
+        else:
+            part.append(char)
+    if part:
+        result.append(''.join(part))
+    return [x for x in result if len(x) > 0]
+class SQLAgent:
     def __init__(
             self,
             command_executor,
-            database: str,
+            databases: List[str],
+            databases_struct: dict,
             include_tables: Optional[List[str]] = None,
             ignore_tables: Optional[List[str]] = None,
             sample_rows_in_table_info: int = 3,
             cache: Optional[dict] = None
     ):
         self._command_executor = command_executor
+        self._mindsdb_db_struct = databases_struct
         self._sample_rows_in_table_info = int(sample_rows_in_table_info)
         self._tables_to_include = include_tables
         self._tables_to_ignore = []
-        self._databases = database.split(',')
+        self._databases = databases
         if not self._tables_to_include:
             # ignore_tables and include_tables should not be used together.
             # include_tables takes priority if it's set.
@@ -40,7 +87,6 @@ class SQLAgent:
     def _call_engine(self, query: str, database=None):
         # switch database
         ast_query = parse_sql(query.strip('`'))
         self._check_permissions(ast_query)
@@ -55,7 +101,6 @@ class SQLAgent:
         return ret
     def _check_permissions(self, ast_query):
         # check type of query
         if not isinstance(ast_query, (Select, Show, Describe, Explain)):
             raise ValueError(f"Query is not allowed: {ast_query.to_string()}")
@@ -66,14 +111,21 @@ class SQLAgent:
                 if is_table and isinstance(node, Identifier):
                     name1 = node.to_string()
                     name2 = '.'.join(node.parts)
-                    name3 = node.parts[-1]
+                    if len(node.parts) == 3:
+                        name3 = '.'.join(node.parts[1:])
+                    else:
+                        name3 = node.parts[-1]
                     if not {name1, name2, name3}.intersection(self._tables_to_include):
                         raise ValueError(f"Table {name1} not found. Available tables: {', '.join(self._tables_to_include)}")
             query_traversal(ast_query, _check_f)
     def get_usable_table_names(self) -> Iterable[str]:
+        """Get a list of tables that the agent has access to.
+        Returns:
+            Iterable[str]: list with table names
+        """
         cache_key = f'{ctx.company_id}_{",".join(self._databases)}_tables'
         # first check cache and return if found
@@ -85,25 +137,52 @@ class SQLAgent:
         if self._tables_to_include:
             return self._tables_to_include
-        ret = self._call_engine('show databases;')
-        dbs = [lst[0] for lst in ret.data.to_lists() if lst[0] != 'information_schema']
-        usable_tables = []
-        for db in dbs:
-            if db != 'mindsdb' and db in self._databases:
-                try:
-                    ret = self._call_engine('show tables', database=db)
-                    tables = [lst[0] for lst in ret.data.to_lists() if lst[0] != 'information_schema']
-                    for table in tables:
-                        # By default, include all tables in a database unless expilcitly ignored.
-                        table_name = f'{db}.{table}'
-                        if table_name not in self._tables_to_ignore:
-                            usable_tables.append(table_name)
-                except Exception as e:
-                    logger.warning('Unable to get tables for %s: %s', db, str(e))
+        result_tables = []
+        for db_name in self._mindsdb_db_struct:
+            handler = self._command_executor.session.integration_controller.get_data_handler(db_name)
+            schemas_names = list(self._mindsdb_db_struct[db_name].keys())
+            if len(schemas_names) > 1 and None in schemas_names:
+                raise Exception('default schema and named schemas can not be used in same filter')
+            if None in schemas_names:
+                # get tables only from default schema
+                response = handler.get_tables()
+                tables_in_default_schema = list(response.data_frame.table_name)
+                schema_tables_restrictions = self._mindsdb_db_struct[db_name][None]     # None - is default schema
+                if schema_tables_restrictions is None:
+                    for table_name in tables_in_default_schema:
+                        result_tables.append([db_name, table_name])
+                else:
+                    for table_name in schema_tables_restrictions:
+                        if table_name in tables_in_default_schema:
+                            result_tables.append([db_name, table_name])
+            else:
+                if 'all' in inspect.signature(handler.get_tables).parameters:
+                    response = handler.get_tables(all=True)
+                else:
+                    response = handler.get_tables()
+                response_schema_names = list(response.data_frame.table_schema.unique())
+                schemas_intersection = set(schemas_names) & set(response_schema_names)
+                if len(schemas_intersection) == 0:
+                    raise Exception('There are no allowed schemas in ds')
+                for schema_name in schemas_intersection:
+                    schema_sub_df = response.data_frame[response.data_frame['table_schema'] == schema_name]
+                    if self._mindsdb_db_struct[db_name][schema_name] is None:
+                        # all tables from schema allowed
+                        for row in schema_sub_df:
+                            result_tables.append([db_name, schema_name, row['table_name']])
+                    else:
+                        for table_name in self._mindsdb_db_struct[db_name][schema_name]:
+                            if table_name in schema_sub_df['table_name'].values:
+                                result_tables.append([db_name, schema_name, table_name])
+        result_tables = ['.'.join(x) for x in result_tables]
         if self._cache:
-            self._cache.set(cache_key, set(usable_tables))
-        return usable_tables
+            self._cache.set(cache_key, set(result_tables))
+        return result_tables
     def _resolve_table_names(self, table_names: List[str], all_tables: List[Identifier]) -> List[Identifier]:
         """
@@ -115,7 +194,10 @@ class SQLAgent:
         tables_idx = {}
         for table in all_tables:
             # by name
-            tables_idx[(table.parts[-1],)] = table
+            if len(table.parts) == 3:
+                tables_idx[tuple(table.parts[1:])] = table
+            else:
+                tables_idx[(table.parts[-1],)] = table
             # by path
             tables_idx[tuple(table.parts)] = table
@@ -125,15 +207,14 @@ class SQLAgent:
                 continue
             # Some LLMs (e.g. gpt-4o) may include backticks or quotes when invoking tools.
-            table_name = table_name.strip(' `"\'\n\r')
-            table = Identifier(table_name)
+            table_parts = split_table_name(table_name)
             # resolved table
-            table2 = tables_idx.get(tuple(table.parts))
+            table_identifier = tables_idx.get(tuple(table_parts))
-            if table2 is None:
+            if table_identifier is None:
                 raise ValueError(f"Table {table} not found in database")
-            tables.append(table2)
+            tables.append(table_identifier)
         return tables
@@ -165,26 +246,31 @@ class SQLAgent:
     def _get_single_table_info(self, table: Identifier) -> str:
         if len(table.parts) < 2:
             raise ValueError(f"Database is required for table: {table}")
-        integration, table_name = table.parts[-2:]
+        if len(table.parts) == 3:
+            integration, schema_name, table_name = table.parts[-3:]
+        else:
+            schema_name = None
+            integration, table_name = table.parts[-2:]
         table_str = str(table)
         dn = self._command_executor.session.datahub.get(integration)
         fields, dtypes = [], []
-        for column in dn.get_table_columns(table_name):
+        for column in dn.get_table_columns(table_name, schema_name):
             fields.append(column['name'])
             dtypes.append(column.get('type', ''))
-        info = f'Table named `{table_name}`\n'
-        info += f"\n/* Sample with first {self._sample_rows_in_table_info} rows from table {table_str}:\n"
+        info = f'Table named `{table_str}`:\n'
+        info += f"\nSample with first {self._sample_rows_in_table_info} rows from table {table_str}:\n"
         info += "\t".join([field for field in fields])
-        info += self._get_sample_rows(table_str, fields) + "\n*/"
+        info += self._get_sample_rows(table_str, fields) + "\n"
         info += '\nColumn data types: ' + ",\t".join(
-            [f'`{field}` : `{dtype}`' for field, dtype in zip(fields, dtypes)]) + '\n'  # noqa
+            [f'\n`{field}` : `{dtype}`' for field, dtype in zip(fields, dtypes)]) + '\n'  # noqa
         return info
     def _get_sample_rows(self, table: str, fields: List[str]) -> str:
-        command = f"select {','.join(fields)} from {table} limit {self._sample_rows_in_table_info};"
+        command = f"select {', '.join(fields)} from {table} limit {self._sample_rows_in_table_info};"
         try:
             ret = self._call_engine(command)
             sample_rows = ret.data.to_lists()

mindsdb/interfaces/storage/db.py CHANGED Viewed

@@ -212,7 +212,7 @@ class Project(Base):
     )
     deleted_at = Column(DateTime)
     name = Column(String, nullable=False)
-    company_id = Column(Integer)
+    company_id = Column(Integer, default=0)
     __table_args__ = (
         UniqueConstraint("name", "company_id", name="unique_project_name_company_id"),
     )

mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""project-company
+Revision ID: c06c35f7e8e1
+Revises: f6dc924079fa
+Create Date: 2025-01-15 14:14:29.295834
+"""
+from collections import defaultdict
+from alembic import op
+import sqlalchemy as sa
+import mindsdb.interfaces.storage.db  # noqa
+from mindsdb.utilities import log
+# revision identifiers, used by Alembic.
+revision = 'c06c35f7e8e1'
+down_revision = 'f6dc924079fa'
+branch_labels = None
+depends_on = None
+logger = log.getLogger(__name__)
+def upgrade():
+    """
+    convert company_id from null to 0 to make constrain works
+    duplicated names are renamed
+    """
+    conn = op.get_bind()
+    table = sa.Table(
+        'project',
+        sa.MetaData(),
+        sa.Column('id', sa.Integer()),
+        sa.Column('name', sa.String()),
+        sa.Column('company_id', sa.Integer()),
+    )
+    data = conn.execute(
+        table
+        .select()
+        .where(table.c.company_id == sa.null())
+    ).fetchall()
+    names = defaultdict(list)
+    for id, name, _ in data:
+        names[name].append(id)
+    # get duplicated
+    for name, ids in names.items():
+        if len(ids) == 1:
+            continue
+        # rename all except first
+        for id in ids[1:]:
+            new_name = f'{name}__{id}'
+            op.execute(
+                table
+                .update()
+                .where(table.c.id == id)
+                .values({'name': new_name})
+            )
+            logger.warning(f'Found duplicated project name: {name}, renamed to: {new_name}')
+    op.execute(
+        table
+        .update()
+        .where(table.c.company_id == sa.null())
+        .values({'company_id': 0})
+    )
+def downgrade():
+    table = sa.Table(
+        'project',
+        sa.MetaData(),
+        sa.Column('company_id', sa.Integer())
+    )
+    op.execute(
+        table
+        .update()
+        .where(table.c.company_id == 0)
+        .values({'company_id': sa.null()})
+    )

mindsdb/utilities/cache.py CHANGED Viewed

@@ -71,10 +71,13 @@ _CACHE_MAX_SIZE = 500
 def dataframe_checksum(df: pd.DataFrame):
-    return str_checksum(str(
-        df.set_axis(range(len(df.columns)), axis=1).to_records(index=False)
-    ))
+    original_columns = df.columns
+    df.columns = list(range(len(df.columns)))
+    result = hashlib.sha256(
+        str(df.values).encode()
+    ).hexdigest()
+    df.columns = original_columns
+    return result
 def json_checksum(obj: t.Union[dict, list]):

mindsdb/utilities/context.py CHANGED Viewed

@@ -24,7 +24,8 @@ class Context:
                 'enabled': False,
                 'pointer': None,
                 'tree': None
-            }
+            },
+            'email_confirmed': 0,
         })
     def __getattr__(self, name: str) -> Any:
@@ -52,6 +53,15 @@ class Context:
     def load(self, storage: dict) -> None:
         self._storage.set(storage)
+    def metadata(self, **kwargs) -> dict:
+        return {
+            'user_id': self.user_id or "",
+            'company_id': self.company_id or "",
+            'session_id': self.session_id,
+            'user_class': self.user_class,
+            **kwargs
+        }
 _context_var = ContextVar('mindsdb.context')
 context = Context(_context_var)

mindsdb/utilities/langfuse.py ADDED Viewed

@@ -0,0 +1,279 @@
+import os
+import typing
+from mindsdb.utilities import log
+from langfuse import Langfuse
+from langfuse.client import StatefulSpanClient
+from langfuse.callback import CallbackHandler
+from langfuse.api.resources.commons.errors.not_found_error import NotFoundError as TraceNotFoundError
+logger = log.getLogger(__name__)
+# Define Langfuse public key.
+LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY", "langfuse_public_key")
+# Define Langfuse secret key.
+LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY", "langfuse_secret_key")
+# Define Langfuse host.
+LANGFUSE_HOST = os.getenv("LANGFUSE_HOST", "http://localhost:3000")
+# Define Langfuse environment.
+LANGFUSE_ENVIRONMENT = os.getenv("LANGFUSE_ENVIRONMENT", "local")
+# Define Langfuse release.
+LANGFUSE_RELEASE = os.getenv("LANGFUSE_RELEASE", "local")
+# Define Langfuse debug mode.
+LANGFUSE_DEBUG = os.getenv("LANGFUSE_DEBUG", "false").lower() == "true"
+# Define Langfuse timeout.
+LANGFUSE_TIMEOUT = int(os.getenv("LANGFUSE_TIMEOUT", 10))
+# Define Langfuse sample rate.
+LANGFUSE_SAMPLE_RATE = float(os.getenv("LANGFUSE_SAMPLE_RATE", 1.0))
+# Define if Langfuse is disabled.
+LANGFUSE_DISABLED = os.getenv("LANGFUSE_DISABLED", "false").lower() == "true" or LANGFUSE_ENVIRONMENT == "local"
+LANGFUSE_FORCE_RUN = os.getenv("LANGFUSE_FORCE_RUN", "false").lower() == "true"
+class LangfuseClientWrapper:
+    """
+    Langfuse client wrapper. Defines Langfuse client configuration and initializes Langfuse client.
+    """
+    def __init__(self,
+                 public_key: str = LANGFUSE_PUBLIC_KEY,
+                 secret_key: str = LANGFUSE_SECRET_KEY,
+                 host: str = LANGFUSE_HOST,
+                 environment: str = LANGFUSE_ENVIRONMENT,
+                 release: str = LANGFUSE_RELEASE,
+                 debug: bool = LANGFUSE_DEBUG,
+                 timeout: int = LANGFUSE_TIMEOUT,
+                 sample_rate: float = LANGFUSE_SAMPLE_RATE,
+                 disable: bool = LANGFUSE_DISABLED,
+                 force_run: bool = LANGFUSE_FORCE_RUN) -> None:
+        """
+        Initialize Langfuse client.
+        Args:
+            public_key (str): Langfuse public key.
+            secret_key (str): Langfuse secret key.
+            host (str): Langfuse host.
+            release (str): Langfuse release.
+            timeout (int): Langfuse timeout.
+            sample_rate (float): Langfuse sample rate.
+        """
+        self.metadata = None
+        self.public_key = public_key
+        self.secret_key = secret_key
+        self.host = host
+        self.environment = environment
+        self.release = release
+        self.debug = debug
+        self.timeout = timeout
+        self.sample_rate = sample_rate
+        self.disable = disable
+        self.force_run = force_run
+        self.client = None
+        self.trace = None
+        self.metadata = None
+        self.tags = None
+        # Check if Langfuse is disabled.
+        if LANGFUSE_DISABLED and not LANGFUSE_FORCE_RUN:
+            logger.info("Langfuse is disabled.")
+            return
+        logger.info("Langfuse enabled")
+        logger.debug(f"LANGFUSE_PUBLIC_KEY: {LANGFUSE_PUBLIC_KEY}")
+        logger.debug(f"LANGFUSE_SECRET_KEY: {'*' * len(LANGFUSE_SECRET_KEY)}")
+        logger.debug(f"LANGFUSE_HOST: {LANGFUSE_HOST}")
+        logger.debug(f"LANGFUSE_ENVIRONMENT: {LANGFUSE_ENVIRONMENT}")
+        logger.debug(f"LANGFUSE_RELEASE: {LANGFUSE_RELEASE}")
+        logger.debug(f"LANGFUSE_DEBUG: {LANGFUSE_DEBUG}")
+        logger.debug(f"LANGFUSE_TIMEOUT: {LANGFUSE_TIMEOUT}")
+        logger.debug(f"LANGFUSE_SAMPLE_RATE: {LANGFUSE_SAMPLE_RATE * 100}%")
+        self.client = Langfuse(
+            public_key=public_key,
+            secret_key=secret_key,
+            host=host,
+            release=release,
+            debug=debug,
+            timeout=timeout,
+            sample_rate=sample_rate
+        )
+    def setup_trace(self,
+                    name: str,
+                    input: typing.Optional[typing.Any] = None,
+                    tags: typing.Optional[typing.List] = None,
+                    metadata: typing.Optional[typing.Dict] = None,
+                    user_id: str = None,
+                    session_id: str = None) -> None:
+        """
+        Setup trace. If Langfuse is disabled, nothing will be done.
+        Args:
+            name (str): Trace name.
+            input (dict): Trace input.
+            tags (dict): Trace tags.
+            metadata (dict): Trace metadata.
+            user_id (str): User ID.
+            session_id (str): Session ID.
+        """
+        if self.client is None:
+            logger.debug("Langfuse is disabled.")
+            return
+        self.set_metadata(metadata)
+        self.set_tags(tags)
+        try:
+            self.trace = self.client.trace(
+                name=name,
+                input=input,
+                metadata=self.metadata,
+                tags=self.tags,
+                user_id=user_id,
+                session_id=session_id
+            )
+        except Exception as e:
+            logger.error(f'Something went wrong while processing Langfuse trace {self.trace.id}: {str(e)}')
+        logger.info(f"Langfuse trace configured with ID: {self.trace.id}")
+    def get_trace_id(self) -> typing.Optional[str]:
+        """
+        Get trace ID. If Langfuse is disabled, returns None.
+        """
+        if self.client is None:
+            logger.debug("Langfuse is disabled.")
+            return ""
+        if self.trace is None:
+            logger.debug("Langfuse trace is not setup.")
+            return ""
+        return self.trace.id
+    def start_span(self,
+                   name: str,
+                   input: typing.Optional[typing.Any] = None) -> typing.Optional[StatefulSpanClient]:
+        """
+        Create span. If Langfuse is disabled, nothing will be done.
+        Args:
+            name (str): Span name.
+            input (dict): Span input.
+        """
+        if self.client is None:
+            logger.debug("Langfuse is disabled.")
+            return None
+        return self.trace.span(name=name, input=input)
+    def end_span_stream(self,
+                        span: typing.Optional[StatefulSpanClient] = None) -> None:
+        """
+        End span. If Langfuse is disabled, nothing will happen.
+        Args:
+            span (Any): Span object.
+        """
+        if self.client is None:
+            logger.debug("Langfuse is disabled.")
+            return
+        span.end()
+        self.trace.update()
+    def end_span(self,
+                 span: typing.Optional[StatefulSpanClient] = None,
+                 output: typing.Optional[typing.Any] = None) -> None:
+        """
+        End trace. If Langfuse is disabled, nothing will be done.
+        Args:
+            span (Any): Span object.
+            output (Any): Span output.
+        """
+        if self.client is None:
+            logger.debug("Langfuse is disabled.")
+            return
+        if span is None:
+            logger.debug("Langfuse span is not created.")
+            return
+        span.end(output=output)
+        self.trace.update(output=output)
+        metadata = self.metadata or {}
+        try:
+            # Ensure all batched traces are sent before fetching.
+            self.client.flush()
+            metadata['tool_usage'] = self._get_tool_usage()
+            self.trace.update(metadata=metadata)
+        except Exception as e:
+            logger.error(f'Something went wrong while processing Langfuse trace {self.trace.id}: {str(e)}')
+    def get_langchain_handler(self) -> typing.Optional[CallbackHandler]:
+        """
+        Get Langchain handler. If Langfuse is disabled, returns None.
+        """
+        if self.client is None:
+            logger.debug("Langfuse is disabled.")
+            return None
+        return self.trace.get_langchain_handler()
+    def set_metadata(self, custom_metadata: dict = None) -> None:
+        """
+        Get default metadata.
+        """
+        self.metadata = custom_metadata or {}
+        self.metadata["environment"] = self.environment
+        self.metadata["release"] = self.release
+    def set_tags(self, custom_tags: typing.Optional[typing.List] = None) -> None:
+        """
+        Get default tags.
+        """
+        self.tags = custom_tags or []
+        self.tags.append(self.environment)
+        self.tags.append(self.release)
+    def _get_tool_usage(self) -> typing.Dict:
+        """ Retrieves tool usage information from a langfuse trace.
+        Note: assumes trace marks an action with string `AgentAction` """
+        tool_usage = {}
+        try:
+            fetched_trace = self.client.get_trace(self.trace.id)
+            steps = [s.name for s in fetched_trace.observations]
+            for step in steps:
+                if 'AgentAction' in step:
+                    tool_name = step.split('-')[1]
+                    if tool_name not in tool_usage:
+                        tool_usage[tool_name] = 0
+                    tool_usage[tool_name] += 1
+        except TraceNotFoundError:
+            logger.warning(f'Langfuse trace {self.trace.id} not found')
+        except Exception as e:
+            logger.error(f'Something went wrong while processing Langfuse trace {self.trace.id}: {str(e)}')
+        return tool_usage

MindsDB 25.1.2.0__py3-none-any.whl → 25.1.5.0__py3-none-any.whl

Potentially problematic release.

MindsDB 25.1.2.0__py3-none-any.whl → 25.1.5.0__py3-none-any.whl