MindsDB-25.2.1.2-py3-none-any.whl → MindsDB-25.2.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (33)
  1. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/METADATA +234 -230
  2. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/RECORD +33 -33
  3. mindsdb/__about__.py +1 -1
  4. mindsdb/api/executor/command_executor.py +1 -57
  5. mindsdb/api/executor/datahub/datanodes/system_tables.py +34 -33
  6. mindsdb/api/executor/planner/query_planner.py +7 -2
  7. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +19 -11
  8. mindsdb/api/executor/sql_query/steps/subselect_step.py +44 -2
  9. mindsdb/integrations/handlers/byom_handler/byom_handler.py +1 -1
  10. mindsdb/integrations/handlers/byom_handler/requirements.txt +1 -1
  11. mindsdb/integrations/handlers/file_handler/file_handler.py +13 -320
  12. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +60 -156
  13. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +1 -1
  14. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +1 -1
  15. mindsdb/integrations/handlers/lancedb_handler/requirements.txt +1 -1
  16. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +3 -3
  17. mindsdb/integrations/handlers/ms_one_drive_handler/ms_graph_api_one_drive_client.py +3 -3
  18. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +2 -20
  19. mindsdb/integrations/handlers/salesforce_handler/connection_args.py +9 -1
  20. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +2 -1
  21. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +1 -1
  22. mindsdb/integrations/handlers/writer_handler/requirements.txt +1 -1
  23. mindsdb/integrations/utilities/files/file_reader.py +120 -61
  24. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +1 -8
  25. mindsdb/integrations/utilities/query_traversal.py +42 -37
  26. mindsdb/interfaces/agents/langfuse_callback_handler.py +205 -27
  27. mindsdb/interfaces/file/file_controller.py +1 -1
  28. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +12 -2
  29. mindsdb/utilities/config.py +2 -2
  30. mindsdb/utilities/render/sqlalchemy_render.py +52 -19
  31. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/LICENSE +0 -0
  32. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/WHEEL +0 -0
  33. {MindsDB-25.2.1.2.dist-info → MindsDB-25.2.2.1.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/file_handler/file_handler.py

@@ -1,18 +1,9 @@
-import codecs
-import csv
-import json
 import os
 import shutil
 import tempfile
-import traceback
-from io import BytesIO, StringIO
 from pathlib import Path
-from urllib.parse import urlparse
 
-import filetype
 import pandas as pd
-import requests
-from charset_normalizer import from_bytes
 from mindsdb_sql_parser import parse_sql
 from mindsdb_sql_parser.ast import CreateTable, DropTables, Insert, Select
 from mindsdb_sql_parser.ast.base import ASTNode
@@ -23,7 +14,9 @@ from mindsdb.integrations.libs.response import RESPONSE_TYPE
 from mindsdb.integrations.libs.response import HandlerResponse as Response
 from mindsdb.integrations.libs.response import HandlerStatusResponse as StatusResponse
 from mindsdb.utilities import log
-from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 logger = log.getLogger(__name__)
 
@@ -144,14 +137,9 @@ class FileHandler(DatabaseHandler):
         else:
             sheet_name = None
         file_path = self.file_controller.get_file_path(table_name)
-        df, _columns = self._handle_source(
-            file_path,
-            self.clean_rows,
-            self.custom_parser,
-            self.chunk_size,
-            self.chunk_overlap,
-            sheet_name=sheet_name
-        )
+
+        df = self.handle_source(file_path, sheet_name=sheet_name)
+
         # Process the SELECT query
         result_df = query_df(df, query)
         return Response(RESPONSE_TYPE.TABLE, data_frame=result_df)
@@ -160,14 +148,9 @@
         table_name = query.table.parts[-1]
         file_path = self.file_controller.get_file_path(table_name)
 
-        # Load the existing data from the file
-        df, _ = self._handle_source(
-            file_path,
-            self.clean_rows,
-            self.custom_parser,
-            self.chunk_size,
-            self.chunk_overlap,
-        )
+        file_reader = FileReader(path=file_path)
+
+        df = file_reader.to_df()
 
         # Create a new dataframe with the values from the query
         new_df = pd.DataFrame(query.values, columns=[col.name for col in query.columns])
@@ -193,306 +176,16 @@
         return self.query(ast)
 
     @staticmethod
-    def _handle_source(
-        file_path,
-        clean_rows=True,
-        custom_parser=None,
-        chunk_size=DEFAULT_CHUNK_SIZE,
-        chunk_overlap=DEFAULT_CHUNK_OVERLAP,
-        sheet_name=None  # for "xlsx", "xls" files
-    ):
-        """
-        This function takes a file path and returns a pandas dataframe
-        """
-        # get file data io, format and dialect
-        data, fmt, dialect = FileHandler._get_data_io(file_path)
-        data.seek(0)  # make sure we are at 0 in file pointer
-
-        if custom_parser:
-            header, file_data = custom_parser(data, fmt)
-            df = pd.DataFrame(file_data, columns=header)
-
-        elif fmt == "parquet":
-            df = pd.read_parquet(data)
-
-        elif fmt == "csv":
-            df = pd.read_csv(data, sep=dialect.delimiter, index_col=False)
-
-        elif fmt in ["xlsx", "xls"]:
-            data.seek(0)
-            with pd.ExcelFile(data) as xls:
-                if sheet_name is None:
-                    # No sheet specified: Return list of sheets
-                    sheet_list = xls.sheet_names
-                    df = pd.DataFrame(sheet_list, columns=["Sheet_Name"])
-                else:
-                    # Specific sheet requested: Load that sheet
-                    df = pd.read_excel(xls, sheet_name=sheet_name)
-
-        elif fmt == "json":
-            data.seek(0)
-            json_doc = json.loads(data.read())
-            df = pd.json_normalize(json_doc, max_level=0)
-
-        elif fmt == "txt" or fmt == "pdf":
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=chunk_size, chunk_overlap=chunk_overlap
-            )
-
-            if fmt == "txt":
-                try:
-                    from langchain_community.document_loaders import TextLoader
-                except ImportError:
-                    raise ImportError(
-                        "To import TXT document please install 'langchain-community':\n"
-                        "    pip install langchain-community"
-                    )
-                loader = TextLoader(file_path, encoding="utf8")
-                docs = text_splitter.split_documents(loader.load())
-                df = pd.DataFrame(
-                    [
-                        {"content": doc.page_content, "metadata": doc.metadata}
-                        for doc in docs
-                    ]
-                )
+    def handle_source(file_path, **kwargs):
+        file_reader = FileReader(path=file_path)
 
-            elif fmt == "pdf":
-
-                import fitz  # pymupdf
-
-                with fitz.open(file_path) as pdf:  # open pdf
-                    text = chr(12).join([page.get_text() for page in pdf])
-
-                split_text = text_splitter.split_text(text)
-
-                df = pd.DataFrame(
-                    {"content": split_text, "metadata": [{}] * len(split_text)}
-                )
-
-        else:
-            raise ValueError(
-                "Could not load file into any format, supported formats are csv, json, xls, xlsx, pdf, txt"
-            )
+        df = file_reader.to_df(**kwargs)
 
         header = df.columns.values.tolist()
 
         df.columns = [key.strip() for key in header]
         df = df.applymap(clean_cell)
-
-        header = [x.strip() for x in header]
-        col_map = dict((col, col) for col in header)
-        return df, col_map
-
-    @staticmethod
-    def is_it_parquet(data: BytesIO) -> bool:
-        # Check first and last 4 bytes equal to PAR1.
-        # Refer: https://parquet.apache.org/docs/file-format/
-        parquet_sig = b"PAR1"
-        data.seek(0, 0)
-        start_meta = data.read(4)
-        data.seek(-4, 2)
-        end_meta = data.read()
-        data.seek(0)
-        if start_meta == parquet_sig and end_meta == parquet_sig:
-            return True
-        return False
-
-    @staticmethod
-    def is_it_xlsx(file_path: str) -> bool:
-        file_type = filetype.guess(file_path)
-        if file_type and file_type.mime in {
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-            "application/vnd.ms-excel",
-        }:
-            return True
-        return False
-
-    @staticmethod
-    def is_it_json(data_str: StringIO) -> bool:
-        # see if its JSON
-        text = data_str.read(100).strip()
-        data_str.seek(0)
-        if len(text) > 0:
-            # it it looks like a json, then try to parse it
-            if text.startswith("{") or text.startswith("["):
-                try:
-                    json.loads(data_str.read())
-                    return True
-                except Exception:
-                    return False
-                finally:
-                    data_str.seek(0)
-        return False
-
-    @staticmethod
-    def is_it_csv(data_str: StringIO) -> bool:
-        sample = data_str.readline()  # trying to get dialect from header
-        data_str.seek(0)
-        try:
-            csv.Sniffer().sniff(sample)
-            # Avoid a false-positive for json files
-            try:
-                json.loads(data_str.read())
-                data_str.seek(0)
-                return False
-            except json.decoder.JSONDecodeError:
-                data_str.seek(0)
-                return True
-        except Exception:
-            return False
-
-    @staticmethod
-    def _get_data_io(file_path):
-        """
-        @TODO: Use python-magic to simplify the function and detect the file types as the xlsx example
-        This gets a file either url or local file and defines what the format is as well as dialect
-        :param file: file path or url
-        :return: data_io, format, dialect
-        """
-
-        data = BytesIO()
-        data_str = None
-        dialect = None
-
-        try:
-            with open(file_path, "rb") as fp:
-                data = BytesIO(fp.read())
-        except Exception as e:
-            error = "Could not load file, possible exception : {exception}".format(
-                exception=e
-            )
-            logger.error(error)
-            raise ValueError(error)
-
-        suffix = Path(file_path).suffix.strip(".").lower()
-        if suffix not in ("csv", "json", "xlsx", "parquet"):
-            if FileHandler.is_it_parquet(data):
-                suffix = "parquet"
-            elif FileHandler.is_it_xlsx(file_path):
-                suffix = "xlsx"
-
-        if suffix == "parquet":
-            return data, "parquet", dialect
-
-        if suffix == "xlsx":
-            return data, "xlsx", dialect
-
-        if suffix == "txt":
-            return data, "txt", dialect
-
-        if suffix == "pdf":
-            return data, "pdf", dialect
-
-        byte_str = data.read()
-        # Move it to StringIO
-        try:
-            # Handle Microsoft's BOM "special" UTF-8 encoding
-            if byte_str.startswith(codecs.BOM_UTF8):
-                data_str = StringIO(byte_str.decode("utf-8-sig"))
-            else:
-                file_encoding_meta = from_bytes(
-                    byte_str[: 32 * 1024],
-                    steps=32,  # Number of steps/block to extract from my_byte_str
-                    chunk_size=1024,  # Set block size of each extraction)
-                    explain=False,
-                )
-                best_meta = file_encoding_meta.best()
-                errors = "strict"
-                if best_meta is not None:
-                    encoding = file_encoding_meta.best().encoding
-
-                    try:
-                        data_str = StringIO(byte_str.decode(encoding, errors))
-                    except UnicodeDecodeError:
-                        encoding = "utf-8"
-                        errors = "replace"
-
-                        data_str = StringIO(byte_str.decode(encoding, errors))
-                else:
-                    encoding = "utf-8"
-                    errors = "replace"
-
-                    data_str = StringIO(byte_str.decode(encoding, errors))
-        except Exception:
-            logger.error(traceback.format_exc())
-            logger.error("Could not load into string")
-
-        if suffix not in ("csv", "json"):
-            if FileHandler.is_it_json(data_str):
-                suffix = "json"
-            elif FileHandler.is_it_csv(data_str):
-                suffix = "csv"
-
-        if suffix == "json":
-            return data_str, suffix, dialect
-
-        if suffix == "csv":
-            try:
-                dialect = FileHandler._get_csv_dialect(data_str)
-                if dialect:
-                    return data_str, "csv", dialect
-            except Exception:
-                logger.error("Could not detect format for this file")
-                logger.error(traceback.format_exc())
-
-        data_str.seek(0)
-        data.seek(0)
-
-        # No file type identified
-        return data, None, dialect
-
-    @staticmethod
-    def _get_file_path(path) -> str:
-        try:
-            is_url = urlparse(path).scheme in ("http", "https")
-        except Exception:
-            is_url = False
-        if is_url:
-            path = FileHandler._fetch_url(path)
-        return path
-
-    @staticmethod
-    def _get_csv_dialect(buffer) -> csv.Dialect:
-        sample = buffer.readline()  # trying to get dialect from header
-        buffer.seek(0)
-        try:
-            if isinstance(sample, bytes):
-                sample = sample.decode()
-            accepted_csv_delimiters = [",", "\t", ";"]
-            try:
-                dialect = csv.Sniffer().sniff(
-                    sample, delimiters=accepted_csv_delimiters
-                )
-                dialect.doublequote = (
-                    True  # assume that all csvs have " as string escape
-                )
-            except Exception:
-                dialect = csv.reader(sample).dialect
-                if dialect.delimiter not in accepted_csv_delimiters:
-                    raise Exception(
-                        f"CSV delimeter '{dialect.delimiter}' is not supported"
-                    )
-
-        except csv.Error:
-            dialect = None
-        return dialect
-
-    @staticmethod
-    def _fetch_url(url: str) -> str:
-        temp_dir = tempfile.mkdtemp(prefix="mindsdb_file_url_")
-        try:
-            r = requests.get(url, stream=True)
-            if r.status_code == 200:
-                with open(os.path.join(temp_dir, "file"), "wb") as f:
-                    for chunk in r:
-                        f.write(chunk)
-            else:
-                raise Exception(f"Response status code is {r.status_code}")
-        except Exception as e:
-            logger.error(f"Error during getting {url}")
-            logger.error(e)
-            raise
-        return os.path.join(temp_dir, "file")
+        return df
 
     def get_tables(self) -> Response:
         """
mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py

@@ -1,20 +1,20 @@
-import json
 import os
 import shutil
 import tempfile
 from io import BytesIO, StringIO
-from unittest.mock import patch
+from pathlib import Path
 
 import pandas
 import pytest
-import responses
 from mindsdb_sql_parser.exceptions import ParsingException
-from mindsdb_sql_parser.ast import CreateTable, DropTables, Identifier, Insert, Select, Star, TableColumn, Update
+from mindsdb_sql_parser.ast import CreateTable, DropTables, Identifier, Insert, TableColumn, Update
 from pytest_lazyfixture import lazy_fixture
 
 from mindsdb.integrations.handlers.file_handler.file_handler import FileHandler
 from mindsdb.integrations.libs.response import RESPONSE_TYPE
-from mindsdb.interfaces.file.file_controller import FileController
+
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 # Define a table to use as content for all of the file types
 # This data needs to match that saved in the files in the ./data/ dir (except pdf and txt files)
@@ -110,21 +110,21 @@ class TestIsItX:
     )
     def test_is_it_csv(self, file_path, result):
         with open(file_path, "r") as fh:
-            assert FileHandler.is_it_csv(StringIO(fh.read())) is result
+            assert FileReader.is_csv(StringIO(fh.read())) is result
 
     @pytest.mark.parametrize(
         "file_path,result",
         [
-            (lazy_fixture("csv_file"), False),
-            (lazy_fixture("xlsx_file"), True),
-            (lazy_fixture("json_file"), False),
-            (lazy_fixture("parquet_file"), False),
-            (lazy_fixture("txt_file"), False),
-            (lazy_fixture("pdf_file"), False),
+            (lazy_fixture("csv_file"), 'csv'),
+            (lazy_fixture("xlsx_file"), 'xlsx'),
+            (lazy_fixture("json_file"), 'json'),
+            (lazy_fixture("parquet_file"), 'parquet'),
+            (lazy_fixture("txt_file"), 'txt'),
+            (lazy_fixture("pdf_file"), 'pdf'),
         ],
     )
-    def test_is_it_xlsx(self, file_path, result):
-        assert FileHandler.is_it_xlsx(file_path) is result
+    def test_format(self, file_path, result):
+        assert FileReader(path=file_path).get_format() == result
 
     # We can't test xlsx or parquet here because they're binary files
     @pytest.mark.parametrize(
@@ -137,7 +137,7 @@ class TestIsItX:
     )
     def test_is_it_json(self, file_path, result):
         with open(file_path, "r") as fh:
-            assert FileHandler.is_it_json(StringIO(fh.read())) is result
+            assert FileReader.is_json(StringIO(fh.read())) is result
 
     @pytest.mark.parametrize(
         "file_path,result",
@@ -152,7 +152,7 @@ class TestIsItX:
     )
     def test_is_it_parquet(self, file_path, result):
         with open(file_path, "rb") as fh:
-            assert FileHandler.is_it_parquet(BytesIO(fh.read())) is result
+            assert FileReader.is_parquet(BytesIO(fh.read())) is result
 
 
 class TestQuery:
@@ -182,64 +182,6 @@ class TestQuery:
 
         assert response.type == RESPONSE_TYPE.ERROR
 
-    def test_query_select(self, csv_file):
-        """Test a valid select query"""
-        expected_df = pandas.read_csv(csv_file)
-
-        # This is temporary because the file controller currently absconds with our file when we save it:
-        # https://github.com/mindsdb/mindsdb/issues/8141
-        csv_tmp = os.path.join(tempfile.gettempdir(), "test.csv")
-        if os.path.exists(csv_tmp):
-            os.remove(csv_tmp)
-        shutil.copy(csv_file, csv_tmp)
-
-        # Configure mindsdb and set up the file controller
-        # Ideally this would be a lot simpler..
-        db_file = tempfile.mkstemp(prefix="mindsdb_db_")[1]
-        config = {"storage_db": "sqlite:///" + db_file}
-        fdi, cfg_file = tempfile.mkstemp(prefix="mindsdb_conf_")
-        with os.fdopen(fdi, "w") as fd:
-            json.dump(config, fd)
-        os.environ["MINDSDB_CONFIG_PATH"] = cfg_file
-
-        from mindsdb.utilities.config import Config
-
-        Config()
-        from mindsdb.interfaces.storage import db
-
-        db.init()
-        db.session.rollback()
-        db.Base.metadata.drop_all(db.engine)
-
-        # create
-        db.Base.metadata.create_all(db.engine)
-
-        # fill with data
-        r = db.Integration(name="files", data={}, engine="files")
-        db.session.add(r)
-        db.session.flush()
-        # Config #
-
-        file_controller = FileController()
-        file_controller.save_file(
-            os.path.splitext(os.path.basename(csv_file))[0], csv_tmp
-        )
-
-        file_handler = FileHandler(file_controller=file_controller)
-        response = file_handler.query(
-            Select(
-                targets=[Star()],
-                from_table=Identifier(
-                    parts=[os.path.splitext(os.path.basename(csv_file))[0]]
-                ),
-            )
-        )
-
-        assert response.type == RESPONSE_TYPE.TABLE
-        assert response.error_code == 0
-        assert response.error_message is None
-        assert expected_df.equals(response.data_frame)
-
     def test_query_insert(self, csv_file, monkeypatch):
         """Test an invalid insert query"""
         # Create a temporary file to save the csv file to.
@@ -322,26 +264,6 @@
         file_handler.native_query("INVALID QUERY")
 
 
-def test_get_file_path_with_file_path():
-    """Test an valid native table query"""
-    file_path = "example.txt"
-    result = FileHandler._get_file_path(file_path)
-    assert result == file_path
-
-
-@patch("mindsdb.integrations.handlers.file_handler.file_handler.FileHandler._fetch_url")
-def test_get_file_path_with_url(mock_fetch_url):
-    url = "http://example.com/file.txt"
-    expected_result = "some_file_path"
-    # we test _fetch_url separately below. Mock it for this test
-    mock_fetch_url.return_value = expected_result
-
-    result = FileHandler._get_file_path(url)
-
-    assert result == expected_result
-    mock_fetch_url.assert_called_with(url)
-
-
 @pytest.mark.parametrize(
     "file_path,expected_columns",
     [
@@ -354,48 +276,44 @@ def test_get_file_path_with_url(mock_fetch_url):
     ],
 )
 def test_handle_source(file_path, expected_columns):
-    sheet_name = None
-    # Excel files return a list of sheets when queried without a sheet name
-    if file_path.endswith(".xlsx"):
-        df, _ = FileHandler._handle_source(file_path)
+
+    def get_reader(file_path):
+        # using path
+        reader = FileReader(path=file_path)
+        yield reader
+
+        # using file descriptor
+        with open(file_path, 'rb') as fd:
+            reader = FileReader(file=fd)
+            yield reader
+            fd.seek(0)
+            content = fd.read()
+
+        # using bytesio
+        fd = BytesIO(content)
+        reader = FileReader(file=fd, name=Path(file_path).name)
+        yield reader
+
+    # using different methods to create reader
+    for reader in get_reader(file_path):
+        df = reader.to_df()
        assert isinstance(df, pandas.DataFrame)
 
-        assert df.columns.tolist() == test_excel_sheet_content[0]
-        assert len(df) == len(test_excel_sheet_content) - 1
-        assert df.values.tolist() == test_excel_sheet_content[1:]
-        sheet_name = test_excel_sheet_content[1][0]
+        if reader.get_format() == 'xlsx':
 
-        df, _ = FileHandler._handle_source(file_path, sheet_name=sheet_name)
-        assert isinstance(df, pandas.DataFrame)
-        assert df.columns.tolist() == expected_columns
+            assert df.columns.tolist() == test_excel_sheet_content[0]
+            assert len(df) == len(test_excel_sheet_content) - 1
+            assert df.values.tolist() == test_excel_sheet_content[1:]
+            sheet_name = test_excel_sheet_content[1][0]
 
-    # The pdf and txt files have some different content
-    if not file_path.endswith(".pdf") and not file_path.endswith(".txt"):
-        assert len(df) == len(test_file_content) - 1
-        assert df.values.tolist() == test_file_content[1:]
+            df = reader.to_df(sheet_name=sheet_name)
 
+            assert df.columns.tolist() == expected_columns
 
-@pytest.mark.parametrize(
-    "file_path,expected_file_type,expected_delimiter,expected_data_type",
-    [
-        (lazy_fixture("csv_file"), "csv", ",", StringIO),
-        (lazy_fixture("xlsx_file"), "xlsx", None, BytesIO),
-        (lazy_fixture("json_file"), "json", None, StringIO),
-        (lazy_fixture("parquet_file"), "parquet", None, BytesIO),
-        (lazy_fixture("pdf_file"), "pdf", None, BytesIO),
-        (lazy_fixture("txt_file"), "txt", None, BytesIO),
-    ],
-)
-def test_get_data_io(
-    file_path, expected_file_type, expected_delimiter, expected_data_type
-):
-    data_io, file_type, file_dialect = FileHandler._get_data_io(file_path)
-    assert file_type == expected_file_type
-    assert type(data_io) == expected_data_type
-    if expected_delimiter is None:
-        assert file_dialect is None
-    else:
-        assert file_dialect.delimiter == expected_delimiter
+        # The pdf and txt files have some different content
+        if reader.get_format() not in ("pdf", "txt"):
+            assert len(df) == len(test_file_content) - 1
+            assert df.values.tolist() == test_file_content[1:]
 
 
 @pytest.mark.parametrize(
@@ -407,10 +325,21 @@ def test_get_data_io(
     ],
 )
 def test_check_valid_dialects(csv_string, delimiter):
-    dialect = FileHandler._get_csv_dialect(csv_string)
+    dialect = FileReader._get_csv_dialect(csv_string)
     assert dialect.delimiter == delimiter
 
 
+def test_tsv():
+    file = BytesIO(b"example;csv;file\tname")
+
+    reader = FileReader(file=file, name='test.tsv')
+    assert reader.get_format() == 'csv'
+    assert reader.parameters['delimiter'] == '\t'
+
+    df = reader.to_df()
+    assert len(df.columns) == 2
+
+
 def test_check_invalid_dialects():
     with pytest.raises(Exception):
         FileHandler._get_csv_dialect("example csv file")
@@ -420,31 +349,6 @@ def test_check_invalid_dialects():
         FileHandler._get_csv_dialect("example|csv|file")
 
 
-@responses.activate
-def test_fetch_url():
-    file_content = "Fake File Content 1234567890"
-    file_url = "https://test.fake/robots.txt"
-    responses.add(
-        responses.GET, file_url, body=file_content, status=200
-    )  # mock the response
-
-    file_path = FileHandler._fetch_url(file_url)
-    with open(file_path, "r") as fh:
-        saved_file_content = fh.read()
-
-    assert saved_file_content == file_content
-
-
-@responses.activate
-def test_fetch_url_raises():
-    responses.add(responses.GET, "https://google.com", status=404)
-
-    with pytest.raises(Exception):
-        FileHandler._fetch_url("obvious_broken_url")
-    with pytest.raises(Exception):
-        FileHandler._fetch_url("https://google.com")  # will get 404 response
-
-
 def test_get_tables():
     file_handler = FileHandler(file_controller=MockFileController())
     response = file_handler.get_tables()
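
The tests above also pin down FileReader's class-level sniffers and name-based detection for callers that only hold bytes. A hedged sketch (buffer contents invented; the ordering of the checks mirrors the old _get_data_io fallback rather than any documented contract):

from io import BytesIO, StringIO

from mindsdb.integrations.utilities.files.file_reader import FileReader

payload = b"a,b\n1,2\n"  # invented sample

# Class-level sniffers, as exercised in TestIsItX above.
assert not FileReader.is_parquet(BytesIO(payload))
assert not FileReader.is_json(StringIO(payload.decode()))
assert FileReader.is_csv(StringIO(payload.decode()))

# Name-based detection on an in-memory buffer; per test_tsv, a .tsv name maps
# to format 'csv' and the sniffed delimiter lands in reader.parameters.
reader = FileReader(file=BytesIO(payload), name="data.csv")
assert reader.get_format() == "csv"
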
mindsdb/integrations/handlers/huggingface_handler/requirements.txt

@@ -1,5 +1,5 @@
 datasets==2.16.1
 evaluate
 torch
-nltk
+nltk>=3.9
 huggingface-hub

mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt

@@ -1,6 +1,6 @@
 datasets==2.16.1
 evaluate
-nltk
+nltk>=3.9
 huggingface-hub
 # Needs to be installed with `pip install --extra-index-url https://download.pytorch.org/whl/ .[huggingface_cpu]`
 torch==2.2.0+cpu

mindsdb/integrations/handlers/lancedb_handler/requirements.txt

@@ -1,3 +1,3 @@
 lancedb~=0.3.1
 lance
-pyarrow~=14.0.1
+pyarrow~=19.0.0