MindsDB 25.2.2.2__py3-none-any.whl → 25.2.4.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of MindsDB might be problematic.
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/METADATA +209 -228
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/RECORD +52 -50
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +1 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +4 -1
- mindsdb/api/http/initialize.py +8 -5
- mindsdb/api/http/namespaces/agents.py +0 -7
- mindsdb/api/http/namespaces/config.py +0 -48
- mindsdb/api/http/namespaces/databases.py +69 -1
- mindsdb/api/http/namespaces/knowledge_bases.py +1 -1
- mindsdb/api/http/namespaces/util.py +0 -28
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/dspy_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +28 -46
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +8 -11
- mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +1 -1
- mindsdb/integrations/handlers/openai_handler/constants.py +3 -1
- mindsdb/integrations/handlers/openai_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/rag_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +33 -8
- mindsdb/integrations/handlers/timegpt_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +3 -2
- mindsdb/integrations/handlers/web_handler/web_handler.py +42 -33
- mindsdb/integrations/handlers/youtube_handler/__init__.py +2 -0
- mindsdb/integrations/handlers/youtube_handler/connection_args.py +32 -0
- mindsdb/integrations/libs/llm/utils.py +5 -0
- mindsdb/integrations/libs/process_cache.py +2 -2
- mindsdb/integrations/utilities/files/file_reader.py +66 -14
- mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py +227 -0
- mindsdb/interfaces/agents/agents_controller.py +3 -3
- mindsdb/interfaces/agents/callback_handlers.py +52 -5
- mindsdb/interfaces/agents/langchain_agent.py +5 -3
- mindsdb/interfaces/database/database.py +1 -1
- mindsdb/interfaces/database/integrations.py +1 -1
- mindsdb/interfaces/file/file_controller.py +140 -11
- mindsdb/interfaces/jobs/scheduler.py +1 -1
- mindsdb/interfaces/knowledge_base/preprocessing/constants.py +2 -2
- mindsdb/interfaces/skills/skills_controller.py +2 -2
- mindsdb/interfaces/skills/sql_agent.py +6 -1
- mindsdb/interfaces/storage/db.py +1 -12
- mindsdb/migrations/versions/2025-02-09_4943359e354a_file_metadata.py +31 -0
- mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py +33 -0
- mindsdb/utilities/config.py +1 -0
- mindsdb/utilities/log.py +17 -2
- mindsdb/utilities/ml_task_queue/consumer.py +4 -2
- mindsdb/utilities/render/sqlalchemy_render.py +15 -5
- mindsdb/utilities/log_controller.py +0 -39
- mindsdb/utilities/telemetry.py +0 -44
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/WHEEL +0 -0
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/file/file_controller.py
CHANGED

@@ -3,12 +3,17 @@ import os
 import shutil
 from pathlib import Path
 
-
+import pandas as pd
+
 from mindsdb.interfaces.storage import db
 from mindsdb.interfaces.storage.fs import FsStore
 from mindsdb.utilities import log
 from mindsdb.utilities.config import Config
 from mindsdb.utilities.context import context as ctx
+from sqlalchemy.orm.attributes import flag_modified
+
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 logger = log.getLogger(__name__)
 
@@ -82,31 +87,38 @@ class FileController:
 
         file_dir = None
         try:
-
-
-            ds_meta = {"row_count": len(df), "column_names": list(df.columns)}
+            pages_files, pages_index = self.get_file_pages(file_path)
 
+            metadata = {
+                'is_feather': True,
+                'pages': pages_index
+            }
+            df = pages_files[0]
             file_record = db.File(
                 name=name,
                 company_id=ctx.company_id,
                 source_file_path=file_name,
                 file_path="",
-                row_count=ds_meta["row_count"],
-                columns=ds_meta["column_names"],
+                row_count=len(df),
+                columns=list(df.columns),
+                metadata_=metadata
             )
             db.session.add(file_record)
-            db.session.commit()
+            db.session.flush()
+
             store_file_path = f"file_{ctx.company_id}_{file_record.id}"
             file_record.file_path = store_file_path
-            db.session.commit()
 
             file_dir = Path(self.dir).joinpath(store_file_path)
             file_dir.mkdir(parents=True, exist_ok=True)
-
-
-
+
+            self.store_pages_as_feather(file_dir, pages_files)
+            # store original file
+            shutil.move(file_path, str(file_dir.joinpath(file_name)))
 
             self.fs_store.put(store_file_path, base_dir=self.dir)
+            db.session.commit()
+
         except Exception as e:
             logger.error(e)
             if file_dir is not None:
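The reworked save path replaces the old commit-twice sequence with a single commit: flush() issues the INSERT early so the autoincrement id is available for building the storage path, while the transaction stays open until the feather pages, the original file, and the fs_store upload have all succeeded. A minimal standalone sketch of that pattern in plain SQLAlchemy (the model here is illustrative, not MindsDB's):

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import declarative_base, Session

Base = declarative_base()

class FileRow(Base):
    __tablename__ = 'file'
    id = Column(Integer, primary_key=True)
    file_path = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    rec = FileRow(file_path='')
    session.add(rec)
    session.flush()                    # INSERT is sent, id is assigned
    rec.file_path = f'file_{rec.id}'   # id is usable before commit
    session.commit()                   # one commit once all work is done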
@@ -115,6 +127,39 @@ class FileController:
 
         return file_record.id
 
+    def get_file_pages(self, source_path: str):
+        """
+        Reads file and extract pages from it
+        Returned structures:
+         - page_files: dict with content, {page_num: dataframe}
+         - pages_index: dict, link between page name and num: {page_name: page_num}
+        """
+        file_reader = FileReader(path=source_path)
+        tables = file_reader.get_contents()
+
+        pages_files = {}
+        pages_index = {}
+        if len(tables) == 1:
+            df = list(tables.values())[0]
+            pages_files[0] = df
+        else:
+            # file has several pages, create a new one with info
+            df = pd.DataFrame(tables.keys(), columns=["Tables"])
+            pages_files[0] = df
+            for i, page_name in enumerate(tables.keys(), 1):
+                pages_files[i] = tables[page_name]
+                pages_index[page_name] = i
+        return pages_files, pages_index
+
+    def store_pages_as_feather(self, dest_dir: Path, pages_files: dict):
+        """
+        Stores pages in file storage dir in feather format
+        """
+
+        for num, df in pages_files.items():
+            dest = dest_dir.joinpath(f'{num}.feather')
+            df.to_feather(str(dest))
+
     def delete_file(self, name):
         file_record = (
             db.session.query(db.File)
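For context, the page model these two methods implement: page 0 is either the lone table or a generated index of table names, and every page lands as <num>.feather in the file's storage directory. A self-contained sketch of the same logic (assuming pandas with pyarrow installed; split_into_pages is an illustrative name, not MindsDB API):

from pathlib import Path
import pandas as pd

def split_into_pages(tables: dict) -> tuple:
    # mirrors get_file_pages: page 0 is the single table, or an index
    # DataFrame listing page names when the file has several tables
    pages_files, pages_index = {}, {}
    if len(tables) == 1:
        pages_files[0] = list(tables.values())[0]
    else:
        pages_files[0] = pd.DataFrame(list(tables), columns=["Tables"])
        for i, name in enumerate(tables, 1):
            pages_files[i] = tables[name]
            pages_index[name] = i
    return pages_files, pages_index

tables = {
    "sales": pd.DataFrame({"amount": [1, 2]}),
    "costs": pd.DataFrame({"amount": [3, 4]}),
}
pages_files, pages_index = split_into_pages(tables)

dest_dir = Path("file_storage")
dest_dir.mkdir(exist_ok=True)
for num, df in pages_files.items():
    # feather requires a default index; reset to be safe
    df.reset_index(drop=True).to_feather(dest_dir / f"{num}.feather")
# pages_index == {'sales': 1, 'costs': 2}; 0.feather lists the page names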
@@ -144,3 +189,87 @@ class FileController:
             .joinpath(file_dir)
             .joinpath(Path(file_record.source_file_path).name)
         )
+
+    def get_file_data(self, name: str, page_name: str = None) -> pd.DataFrame:
+        """
+        Returns file content as dataframe
+
+        :param name: name of file
+        :param page_name: page name, optional
+        :return: Page or file content
+        """
+        file_record = (
+            db.session.query(db.File)
+            .filter_by(company_id=ctx.company_id, name=name)
+            .first()
+        )
+        if file_record is None:
+            raise Exception(f"File '{name}' does not exists")
+
+        file_dir = f"file_{ctx.company_id}_{file_record.id}"
+        self.fs_store.get(file_dir, base_dir=self.dir)
+
+        metadata = file_record.metadata_ or {}
+        if metadata.get('is_feather') is not True:
+            # migrate file
+
+            file_path = (
+                Path(self.dir)
+                .joinpath(file_dir)
+                .joinpath(Path(file_record.source_file_path).name)
+            )
+
+            pages_files, pages_index = self.get_file_pages(str(file_path))
+
+            self.store_pages_as_feather(file_path.parent, pages_files)
+            metadata['is_feather'] = True
+            metadata['pages'] = pages_index
+
+            file_record.metadata_ = metadata
+            flag_modified(file_record, 'metadata_')
+            db.session.commit()
+
+        if page_name is None:
+            num = 0
+        else:
+            num = metadata.get('pages', {}).get(page_name)
+            if num is None:
+                raise KeyError(f'Page not found: {page_name}')
+
+        path = (
+            Path(self.dir)
+            .joinpath(file_dir)
+            .joinpath(f'{num}.feather')
+        )
+        return pd.read_feather(path)
+
+    def set_file_data(self, name: str, df: pd.DataFrame, page_name: str = None):
+        """
+        Save file content
+        :param name: name of file
+        :param df: content to store
+        :param page_name: name of page, optional
+        """
+
+        file_record = (
+            db.session.query(db.File)
+            .filter_by(company_id=ctx.company_id, name=name)
+            .first()
+        )
+        if file_record is None:
+            raise Exception(f"File '{name}' does not exists")
+
+        file_dir = f"file_{ctx.company_id}_{file_record.id}"
+        self.fs_store.get(file_dir, base_dir=self.dir)
+
+        num = 0
+        if page_name is not None and file_record.metadata_ is not None:
+            num = file_record.metadata_.get('pages', {}).get(page_name, 0)
+
+        path = (
+            Path(self.dir)
+            .joinpath(file_dir)
+            .joinpath(f'{num}.feather')
+        )
+        df.to_feather(path)
+        self.fs_store.put(file_dir, base_dir=self.dir)
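Taken together, a sketch of how the new accessors might be used. This is hypothetical usage: it assumes a FileController wired to storage and a previously uploaded multi-sheet file named 'my_workbook' with a 'sales' page.

fc = FileController()
# reads file_<company>_<id>/1.feather; legacy files are migrated to
# feather on first access and the page index is written to metadata
df = fc.get_file_data('my_workbook', page_name='sales')
df['amount'] = df['amount'] * 2
fc.set_file_data('my_workbook', df, page_name='sales')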
mindsdb/interfaces/jobs/scheduler.py
CHANGED

@@ -44,7 +44,7 @@ class Scheduler
         self.q_in = queue.Queue()
         self.q_out = queue.Queue()
         self.work_thread = threading.Thread(
-            target=execute_async, args=(self.q_in, self.q_out)
+            target=execute_async, args=(self.q_in, self.q_out), name='Scheduler.execute_async'
         )
         self.work_thread.start()
 
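The only change here is an explicit thread name, which is what appears in thread dumps and in logging's %(threadName)s, making stuck workers easier to identify. A trimmed sketch (the no-argument execute_async is a stand-in):

import threading
import time

def execute_async():
    time.sleep(0.1)

t = threading.Thread(target=execute_async, name='Scheduler.execute_async')
t.start()
print([th.name for th in threading.enumerate()])  # the name shows up in diagnostics
t.join()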
mindsdb/interfaces/skills/skills_controller.py
CHANGED

@@ -1,7 +1,7 @@
 import datetime
 from typing import Dict, List, Optional
 
-from sqlalchemy import null
+from sqlalchemy import null, func
 from sqlalchemy.orm.attributes import flag_modified
 
 from mindsdb.interfaces.storage import db
@@ -33,7 +33,7 @@ class SkillsController:
 
         project = self.project_controller.get(name=project_name)
         return db.Skills.query.filter(
-            db.Skills.name == skill_name,
+            func.lower(db.Skills.name) == func.lower(skill_name),
            db.Skills.project_id == project.id,
            db.Skills.deleted_at == null()
        ).first()
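Wrapping both sides in func.lower makes the skill lookup case-insensitive at the SQL level (it renders as LOWER(...) = LOWER(...)). A standalone sketch with an illustrative model:

from sqlalchemy import create_engine, Column, Integer, String, func, select
from sqlalchemy.orm import declarative_base, Session

Base = declarative_base()

class Skill(Base):
    __tablename__ = 'skills'
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Skill(name='MySkill'))
    session.commit()
    # matches regardless of the caller's casing
    skill = session.execute(
        select(Skill).where(func.lower(Skill.name) == func.lower('MYSKILL'))
    ).scalar_one()
    print(skill.name)  # MySkill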
mindsdb/interfaces/skills/sql_agent.py
CHANGED

@@ -287,6 +287,7 @@ class SQLAgent:
         return info
 
     def _get_sample_rows(self, table: str, fields: List[str]) -> str:
+        logger.info(f'_get_sample_rows: table={table} fields={fields}')
         command = f"select {', '.join(fields)} from {table} limit {self._sample_rows_in_table_info};"
         try:
             ret = self._call_engine(command)
@@ -300,7 +301,7 @@ class SQLAgent:
                 map(lambda row: [truncate_value(value) for value in row], sample_rows))
             sample_rows_str = "\n" + list_to_csv_str([fields] + sample_rows)
         except Exception as e:
-            logger.
+            logger.info(f'_get_sample_rows error: {e}')
             sample_rows_str = "\n" + "\t [error] Couldn't retrieve sample rows!"
 
         return sample_rows_str
@@ -347,14 +348,18 @@ class SQLAgent:
 
     def get_table_info_safe(self, table_names: Optional[List[str]] = None) -> str:
         try:
+            logger.info(f'get_table_info_safe: {table_names}')
             return self.get_table_info(table_names)
         except Exception as e:
+            logger.info(f'get_table_info_safe error: {e}')
             return f"Error: {e}"
 
     def query_safe(self, command: str, fetch: str = "all") -> str:
         try:
+            logger.info(f'query_safe (fetch={fetch}): {command}')
             return self.query(command, fetch)
         except Exception as e:
+            logger.info(f'query_safe error: {e}')
             msg = f"Error: {e}"
             if 'does not exist' in msg and ' relation ' in msg:
                 msg += '\nAvailable tables: ' + ', '.join(self.get_usable_table_names())
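These sql_agent changes only add logging around the "safe" wrappers, so failed agent tool calls leave a trace. The pattern, reduced to a runnable sketch (run_query is a stand-in executor, not MindsDB API):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def run_query(command: str) -> str:
    # stand-in executor that always fails, to exercise the error path
    raise RuntimeError('relation "t" does not exist')

def query_safe(command: str) -> str:
    logger.info(f'query_safe: {command}')
    try:
        return run_query(command)
    except Exception as e:
        # log and return the error as text instead of raising,
        # so the calling agent can react to it
        logger.info(f'query_safe error: {e}')
        return f"Error: {e}"

print(query_safe('select 1 from t'))  # Error: relation "t" does not exist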
mindsdb/interfaces/storage/db.py
CHANGED

@@ -218,18 +218,6 @@ class Project(Base):
     )
 
 
-class Log(Base):
-    __tablename__ = "log"
-
-    id = Column(Integer, primary_key=True)
-    created_at = Column(DateTime, default=datetime.datetime.now)
-    log_type = Column(String)  # log, info, warning, traceback etc
-    source = Column(String)  # file + line
-    company_id = Column(Integer)
-    payload = Column(String)
-    created_at_index = Index("some_index", "created_at_index")
-
-
 class Integration(Base):
     __tablename__ = "integration"
     id = Column(Integer, primary_key=True)
@@ -258,6 +246,7 @@ class File(Base):
     row_count = Column(Integer, nullable=False)
     columns = Column(Json, nullable=False)
     created_at = Column(DateTime, default=datetime.datetime.now)
+    metadata_: dict = Column("metadata", JSON, nullable=True)
     updated_at = Column(
         DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
     )
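The new metadata_ column stores a JSON document, and the file_controller code above mutates it in place; SQLAlchemy does not track in-place changes to plain JSON columns, which is why flag_modified appears alongside it. A standalone sketch (illustrative model, not MindsDB's):

from sqlalchemy import create_engine, Column, Integer, JSON
from sqlalchemy.orm import declarative_base, Session
from sqlalchemy.orm.attributes import flag_modified

Base = declarative_base()

class FileRow(Base):
    __tablename__ = 'file'
    id = Column(Integer, primary_key=True)
    metadata_ = Column("metadata", JSON, nullable=True)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    rec = FileRow(metadata_={'is_feather': False})
    session.add(rec)
    session.commit()

    rec.metadata_['is_feather'] = True   # in-place change: not auto-detected
    flag_modified(rec, 'metadata_')      # mark the attribute dirty explicitly
    session.commit()                     # UPDATE is now emitted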
mindsdb/migrations/versions/2025-02-09_4943359e354a_file_metadata.py
ADDED

@@ -0,0 +1,31 @@
+"""file_metadata
+
+Revision ID: 4943359e354a
+Revises: c06c35f7e8e1
+Create Date: 2025-02-09 10:10:55.577407
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import mindsdb.interfaces.storage.db  # noqa
+
+
+# revision identifiers, used by Alembic.
+revision = '4943359e354a'
+down_revision = 'c06c35f7e8e1'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('metadata', sa.JSON(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_column('metadata')
+    # ### end Alembic commands ###
mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py
ADDED

@@ -0,0 +1,33 @@
+"""del_log_table
+
+Revision ID: 6ab9903fc59a
+Revises: 4943359e354a
+Create Date: 2025-02-10 16:50:27.186697
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import mindsdb.interfaces.storage.db  # noqa
+
+# revision identifiers, used by Alembic.
+revision = '6ab9903fc59a'
+down_revision = '4943359e354a'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.drop_table('log')
+
+
+def downgrade():
+    op.create_table(
+        'log',
+        sa.Column('id', sa.INTEGER(), nullable=False),
+        sa.Column('created_at', sa.DATETIME(), nullable=True),
+        sa.Column('log_type', sa.VARCHAR(), nullable=True),
+        sa.Column('source', sa.VARCHAR(), nullable=True),
+        sa.Column('company_id', sa.INTEGER(), nullable=True),
+        sa.Column('payload', sa.VARCHAR(), nullable=True),
+        sa.PrimaryKeyConstraint('id')
+    )
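The two migrations chain c06c35f7e8e1 → 4943359e354a → 6ab9903fc59a, so one upgrade applies the new file.metadata column and then the log table drop in order. MindsDB normally runs migrations itself on startup; applying them by hand would look roughly like this sketch (assumes an alembic.ini configured for the MindsDB database):

from alembic import command
from alembic.config import Config

cfg = Config('alembic.ini')           # path to the configured Alembic ini
command.upgrade(cfg, 'head')          # applies 4943359e354a, then 6ab9903fc59a
# command.downgrade(cfg, '-1')        # would recreate the dropped log table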
mindsdb/utilities/config.py
CHANGED
mindsdb/utilities/log.py
CHANGED

@@ -1,3 +1,4 @@
+import json
 import logging
 from logging.config import dictConfig
 
@@ -7,6 +8,19 @@ from mindsdb.utilities.config import config as app_config
 logging_initialized = False
 
 
+class JsonFormatter(logging.Formatter):
+    def format(self, record):
+        record_message = super().format(record)
+        log_record = {
+            'process_name': record.processName,
+            'name': record.name,
+            'message': record_message,
+            'level': record.levelname,
+            'time': record.created
+        }
+        return json.dumps(log_record)
+
+
 class ColorFormatter(logging.Formatter):
     green = "\x1b[32;20m"
     default = "\x1b[39;20m"
@@ -53,7 +67,7 @@ def configure_logging():
     if console_handler_config['enabled'] is True:
         handlers_config['console'] = {
             "class": "logging.StreamHandler",
-            "formatter":
+            "formatter": console_handler_config.get('formatter', 'default'),
             "level": console_handler_config_level
         }
 
@@ -74,7 +88,8 @@ def configure_logging():
     logging_config = dict(
         version=1,
         formatters={
-            "
+            "default": {"()": ColorFormatter},
+            "json": {"()": JsonFormatter},
             "file": {
                 "format": "%(asctime)s %(processName)15s %(levelname)-8s %(name)s: %(message)s"
             }
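The new formatter emits one JSON object per record and is selected when the console handler's config sets 'formatter' to 'json' (the default stays on the color formatter). A trimmed, runnable sketch of the same idea:

import json
import logging

class JsonFormatter(logging.Formatter):
    def format(self, record):
        return json.dumps({
            'process_name': record.processName,
            'name': record.name,
            'message': super().format(record),
            'level': record.levelname,
            'time': record.created,
        })

handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logger = logging.getLogger('demo')
logger.addHandler(handler)
logger.warning('hello')
# -> {"process_name": "MainProcess", "name": "demo", "message": "hello", "level": "WARNING", "time": ...}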
mindsdb/utilities/ml_task_queue/consumer.py
CHANGED

@@ -74,7 +74,9 @@ class MLTaskConsumer(BaseRedisQueue):
 
         # region collect cpu usage statistic
         self.cpu_stat = [0] * 10
-        self._collect_cpu_stat_thread = threading.Thread(
+        self._collect_cpu_stat_thread = threading.Thread(
+            target=self._collect_cpu_stat, name='MLTaskConsumer._collect_cpu_stat'
+        )
         self._collect_cpu_stat_thread.start()
         # endregion
 
@@ -221,7 +223,7 @@ class MLTaskConsumer(BaseRedisQueue):
             if self._ready_event.is_set() is False:
                 continue
             self._ready_event.clear()
-            threading.Thread(target=self._listen).start()
+            threading.Thread(target=self._listen, name='MLTaskConsumer._listen').start()
         self.stop()
 
     def stop(self) -> None:
mindsdb/utilities/render/sqlalchemy_render.py
CHANGED

@@ -63,6 +63,10 @@ class AttributedStr(str):
         obj.is_quoted = is_quoted
         return obj
 
+    def replace(self, *args):
+        obj = super().replace(*args)
+        return AttributedStr(obj, self.is_quoted)
+
 
 def get_is_quoted(identifier: ast.Identifier):
     quoted = getattr(identifier, 'is_quoted', [])
@@ -93,9 +97,6 @@ class SqlalchemyRender:
         if hasattr(dialect, 'preparer'):
             class Preparer(dialect.preparer):
 
-                def __init__(self, *args, **kwargs):
-                    super().__init__(*args, **kwargs)
-
                 def _requires_quotes(self, value: str) -> bool:
                     # check force-quote flag
                     if isinstance(value, AttributedStr):
@@ -242,6 +243,8 @@ class SqlalchemyRender:
 
         op = t.op.lower()
         if op in ('in', 'not in'):
+            if t.args[1].parentheses:
+                arg1 = [arg1]
             if isinstance(arg1, sa.sql.selectable.ColumnClause):
                 raise NotImplementedError(f'Required list argument for: {op}')
 
@@ -536,12 +539,19 @@ class SqlalchemyRender:
             query = query.select_from(table)
 
         # other tables
+        has_explicit_join = False
         for item in join_list[1:]:
             table = self.to_table(item['table'])
             if item['is_implicit']:
                 # add to from clause
-                query = query.select_from(table)
+                if has_explicit_join:
+                    # sqlalchemy doesn't support implicit join after explicit
+                    # convert it to explicit
+                    query = query.join(table, sa.text('1=1'))
+                else:
+                    query = query.select_from(table)
             else:
+                has_explicit_join = True
                 if item['condition'] is None:
                     # otherwise, sqlalchemy raises "Don't know how to join to ..."
                     condition = sa.text('1=1')
@@ -564,7 +574,7 @@ class SqlalchemyRender:
                 condition,
                 full=is_full
             )
-        elif isinstance(from_table, ast.Union):
+        elif isinstance(from_table, (ast.Union, ast.Intersect, ast.Except)):
             alias = None
             if from_table.alias:
                 alias = self.get_alias(from_table.alias)
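The replace override fixes a subtle str-subclass bug: str.replace returns a plain str, silently dropping the is_quoted flag that the renderer's preparer checks. Reduced to a sketch:

class AttributedStr(str):
    def __new__(cls, value, is_quoted=False):
        obj = super().__new__(cls, value)
        obj.is_quoted = is_quoted
        return obj

    def replace(self, *args):
        # str.replace would return a plain str and lose is_quoted
        return AttributedStr(super().replace(*args), self.is_quoted)

s = AttributedStr('order', is_quoted=True)
t = s.replace('o', '0')
print(type(t).__name__, t.is_quoted)  # AttributedStr True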
mindsdb/utilities/log_controller.py
DELETED

@@ -1,39 +0,0 @@
-from mindsdb.interfaces.storage import db
-from mindsdb.utilities.context import context as ctx
-
-
-def fmt_log_record(log_record):
-    return {
-        'log_from': 'mindsdb',
-        'level': log_record.log_type,
-        'context': 'unknown',
-        'text': log_record.payload,
-        'created_at': str(log_record.created_at).split('.')[0]
-    }
-
-
-def get_logs(min_timestamp, max_timestamp, context, level, log_from, limit):
-    logs = db.session.query(db.Log).filter(
-        db.Log.company_id == ctx.company_id,
-        db.Log.created_at > min_timestamp
-    )
-
-    if max_timestamp is not None:
-        logs = logs.filter(db.Log.created_at < max_timestamp)
-
-    if context is not None:
-        # e.g. datasource/predictor and assoicated id
-        pass
-
-    if level is not None:
-        logs = logs.filter(db.Log.log_type == level)
-
-    if log_from is not None:
-        # mindsdb/native/lightwood/all
-        pass
-
-    if limit is not None:
-        logs = logs.limit(limit)
-
-    logs = [fmt_log_record(x) for x in logs]
-    return logs
mindsdb/utilities/telemetry.py
DELETED

@@ -1,44 +0,0 @@
-import os
-from pathlib import Path
-
-TELEMETRY_FILE = 'telemetry.lock'
-
-
-def enable_telemetry(storage_dir):
-    os.environ['CHECK_FOR_UPDATES'] = '1'
-    path = os.path.join(storage_dir, TELEMETRY_FILE)
-    if os.path.exists(path):
-        os.remove(path)
-
-
-def disable_telemetry(storage_dir):
-    os.environ['CHECK_FOR_UPDATES'] = '0'
-    path = os.path.join(storage_dir, TELEMETRY_FILE)
-    with open(path, 'w') as _:
-        pass
-
-
-def telemetry_file_exists(storage_dir):
-    path = os.path.join(storage_dir, TELEMETRY_FILE)
-    return os.path.exists(path)
-
-
-def inject_telemetry_to_static(static_folder):
-    TEXT = '<script>localStorage.isTestUser = true;</script>'
-    index = Path(static_folder).joinpath('index.html')
-    disable_telemetry = os.getenv('CHECK_FOR_UPDATES', '1').lower() in ['0', 'false', 'False']
-    if index.is_file():
-        with open(str(index), 'rt') as f:
-            content = f.read()
-        script_index = content.find('<script>')
-        need_update = True
-        if TEXT not in content and disable_telemetry:
-            content = content[:script_index] + TEXT + content[script_index:]
-        elif not disable_telemetry and TEXT in content:
-            content = content.replace(TEXT, '')
-        else:
-            need_update = False
-
-        if need_update:
-            with open(str(index), 'wt') as f:
-                f.write(content)