PyPI - MindsDB - Versions diffs - 25.2.1.2__py3-none-any.whl → 25.2.2.1__py3-none-any.whl - Mend

MindsDB 25.2.1.2__py3-none-any.whl → 25.2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic. Click here for more details.

Files changed (33) hide show

mindsdb/integrations/handlers/lightwood_handler/requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-lightwood~=24.12.3.0
-lightwood[extra]~=24.12.3.0
-lightwood[xai]~=24.12.3.0
+lightwood>=25.2.2.0
+lightwood[extra]>=25.2.2.0
+lightwood[xai]>=25.2.2.0
 type_infer==0.0.20

mindsdb/integrations/handlers/ms_one_drive_handler/ms_graph_api_one_drive_client.py CHANGED Viewed

@@ -76,15 +76,15 @@ class MSGraphAPIOneDriveClient(MSGraphAPIBaseClient):
         child_items = []
         for items in self.fetch_paginated_data(f"me/drive/items/{item_id}/children"):
             for item in items:
-                path = f"{path}/{item['name']}"
+                child_path = f"{path}/{item['name']}"
                 # If the item is a folder, get its child items.
                 if "folder" in item:
                     # Recursively get the child items of the folder.
-                    child_items.extend(self.get_child_items(item["id"], path))
+                    child_items.extend(self.get_child_items(item["id"], child_path))
                 else:
                     # Add the path to the item.
-                    item["path"] = path
+                    item["path"] = child_path
                     child_items.append(item)
         return child_items

mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py CHANGED Viewed

@@ -84,25 +84,7 @@ class FileTable(APIResource):
         client = self.handler.connect()
         file_content = client.get_item_content(table_name)
-        file_extension = table_name.split(".")[-1]
-        # Read the file content based and return a DataFrame based on the file extension.
-        if file_extension == "csv":
-            df = pd.read_csv(BytesIO(file_content))
+        reader = FileReader(file=BytesIO(file_content), name=table_name)
-        elif file_extension == "tsv":
-            df = pd.read_csv(BytesIO(file_content), sep="\t")
-        elif file_extension == "json":
-            df = pd.DataFrame(file_content)
-        elif file_extension == "parquet":
-            df = pd.read_parquet(BytesIO(file_content))
-        elif file_extension == "pdf":
-            df = FileReader().read_pdf(BytesIO(file_content))
-        elif file_extension == "txt":
-            df = FileReader().read_txt(BytesIO(file_content))
-        return df
+        return reader.to_df()

mindsdb/integrations/handlers/salesforce_handler/connection_args.py CHANGED Viewed

@@ -28,6 +28,13 @@ connection_args = OrderedDict(
         'description': 'The client secret (consumer secret) from a connected app in Salesforce.',
         'required': True,
         'label': 'Client Secret (Consumer Secret)'
+    },
+    is_sandbox={
+        'type': ARG_TYPE.BOOL,
+        'description': 'Set this to True if you need to connect to a sandbox, False for production environments. '
+                       'If not provided defaults to False.',
+        'required': False,
+        'label': 'Is Sandbox'
     }
 )
@@ -35,5 +42,6 @@ connection_args_example = OrderedDict(
     username='demo@example.com',
     password='demo_password',
     client_id='3MVG9lKcPoNINVBIPJjdw1J9LLM82HnZz9Yh7ZJnY',
-    client_secret='5A52C1A1E21DF9012IODC9ISNXXAADDA9'
+    client_secret='5A52C1A1E21DF9012IODC9ISNXXAADDA9',
+    is_sandbox=True
 )

mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py CHANGED Viewed

@@ -88,7 +88,8 @@ class SalesforceHandler(APIHandler):
                 username=self.connection_data['username'],
                 password=self.connection_data['password'],
                 client_id=self.connection_data['client_id'],
-                client_secret=self.connection_data['client_secret']
+                client_secret=self.connection_data['client_secret'],
+                is_sandbox=self.connection_data.get('is_sandbox', False)
             )
             self.is_connected = True
             return self.connection

mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py CHANGED Viewed

@@ -17,7 +17,7 @@ from mindsdb.integrations.libs.response import (
 try:
     import pyarrow as pa
-    memory_pool = pa.memory_pool()
+    memory_pool = pa.default_memory_pool()
 except Exception:
     memory_pool = None

mindsdb/integrations/handlers/writer_handler/requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
 -r mindsdb/integrations/handlers/rag_handler/requirements.txt
-nltk>=3.8.1
+nltk>=3.9
 rouge-score>=0.1.2
 scipy

mindsdb/integrations/utilities/files/file_reader.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import traceback
 import json
 import csv
-from io import BytesIO, StringIO
+from io import BytesIO, StringIO, IOBase
 from pathlib import Path
 import codecs
@@ -9,6 +9,7 @@ import filetype
 import pandas as pd
 from charset_normalizer import from_bytes
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+import fitz  # pymupdf
 from mindsdb.utilities import log
@@ -22,7 +23,8 @@ class FileDetectError(Exception):
     ...
-def decode(file_obj: BytesIO) -> StringIO:
+def decode(file_obj: IOBase) -> StringIO:
+    file_obj.seek(0)
     byte_str = file_obj.read()
     # Move it to StringIO
     try:
@@ -62,39 +64,87 @@ def decode(file_obj: BytesIO) -> StringIO:
 class FormatDetector:
-    def get(self, name, file_obj: BytesIO = None):
-        format = self.get_format_by_name(name)
-        if format is None and file_obj is not None:
-            format = self.get_format_by_content(file_obj)
+    supported_formats = ['parquet', 'csv', 'xlsx', 'pdf', 'json', 'txt']
+    def __init__(
+        self,
+        path: str = None,
+        name: str = None,
+        file: IOBase = None
+    ):
+        """
+        File format detector
+        One of these arguments has to be passed: `path` or `file`
+        :param path: path to the file
+        :param name: name of the file
+        :param file: file descriptor (via open(...), of BytesIO(...))
+        """
+        if path is not None:
+            file = open(path, 'rb')
+        elif file is not None:
+            if name is None:
+                if hasattr(file, 'name'):
+                    path = file.name
+                else:
+                    path = 'file'
+        else:
+            raise FileDetectError('Wrong arguments: path or file is required')
+        if name is None:
+            name = Path(path).name
+        self.name = name
+        self.file_obj = file
+        self.format = None
+        self.parameters = {}
+    def get_format(self) -> str:
+        if self.format is not None:
+            return self.format
+        format = self.get_format_by_name()
         if format is not None:
-            return format
-        raise FileDetectError(f'Unable to detect format: {name}')
+            if format not in self.supported_formats:
+                raise FileDetectError(f'Not supported format: {format}')
+        if format is None and self.file_obj is not None:
+            format = self.get_format_by_content()
+            self.file_obj.seek(0)
-    def get_format_by_name(self, filename):
-        extension = Path(filename).suffix.strip(".").lower()
+        if format is None:
+            raise FileDetectError(f'Unable to detect format: {self.name}')
+        self.format = format
+        return format
+    def get_format_by_name(self):
+        extension = Path(self.name).suffix.strip(".").lower()
         if extension == "tsv":
             extension = "csv"
+            self.parameters['delimiter'] = '\t'
         return extension or None
-    def get_format_by_content(self, file_obj):
-        if self.is_parquet(file_obj):
+    def get_format_by_content(self):
+        if self.is_parquet(self.file_obj):
             return "parquet"
-        file_type = filetype.guess(file_obj)
-        if file_type is None:
-            return
+        file_type = filetype.guess(self.file_obj)
+        if file_type is not None:
-        if file_type.mime in {
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-            "application/vnd.ms-excel",
-        }:
-            return 'xlsx'
+            if file_type.mime in {
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                "application/vnd.ms-excel",
+            }:
+                return 'xlsx'
-        if file_type.mime == 'application/pdf':
-            return "pdf"
+            if file_type.mime == 'application/pdf':
+                return "pdf"
-        file_obj = decode(file_obj)
+        file_obj = decode(self.file_obj)
         if self.is_json(file_obj):
             return "json"
@@ -102,8 +152,10 @@ class FormatDetector:
         if self.is_csv(file_obj):
             return "csv"
-    def is_json(self, data_obj: StringIO) -> bool:
+    @staticmethod
+    def is_json(data_obj: StringIO) -> bool:
         # see if its JSON
+        data_obj.seek(0)
         text = data_obj.read(100).strip()
         data_obj.seek(0)
         if len(text) > 0:
@@ -114,20 +166,25 @@ class FormatDetector:
                     return True
                 except Exception:
                     return False
-                finally:
-                    data_obj.seek(0)
         return False
-    def is_csv(self, data_obj: StringIO) -> bool:
-        sample = data_obj.readline()  # trying to get dialect from header
+    @classmethod
+    def is_csv(cls, data_obj: StringIO) -> bool:
         data_obj.seek(0)
+        sample = data_obj.readline()  # trying to get dialect from header
         try:
+            data_obj.seek(0)
             csv.Sniffer().sniff(sample)
+            # Avoid a false-positive for json files
+            if cls.is_json(data_obj):
+                return False
+            return True
         except Exception:
             return False
-    def is_parquet(self, data: BytesIO) -> bool:
+    @staticmethod
+    def is_parquet(data: IOBase) -> bool:
         # Check first and last 4 bytes equal to PAR1.
         # Refer: https://parquet.apache.org/docs/file-format/
         parquet_sig = b"PAR1"
@@ -141,15 +198,31 @@ class FormatDetector:
         return False
-class FileReader:
+class FileReader(FormatDetector):
+    def to_df(self, **kwargs) -> pd.DataFrame:
+        format = self.get_format()
+        func = getattr(self, f'read_{format}', None)
+        if func is None:
+            raise FileDetectError(f'Unsupported format: {format}')
+        self.file_obj.seek(0)
+        kwargs.update(self.parameters)
+        return func(self.file_obj, name=self.name, **kwargs)
-    def _get_csv_dialect(self, buffer) -> csv.Dialect:
+    @staticmethod
+    def _get_csv_dialect(buffer, delimiter=None) -> csv.Dialect:
         sample = buffer.readline()  # trying to get dialect from header
         buffer.seek(0)
         try:
             if isinstance(sample, bytes):
                 sample = sample.decode()
-            accepted_csv_delimiters = [",", "\t", ";"]
+            if delimiter is not None:
+                accepted_csv_delimiters = [delimiter]
+            else:
+                accepted_csv_delimiters = [",", "\t", ";"]
             try:
                 dialect = csv.Sniffer().sniff(
                     sample, delimiters=accepted_csv_delimiters
@@ -168,29 +241,15 @@ class FileReader:
             dialect = None
         return dialect
-    def read(self, format, file_obj: BytesIO, **kwargs) -> pd.DataFrame:
-        func = {
-            'parquet': self.read_parquet,
-            'csv': self.read_csv,
-            'xlsx': self.read_excel,
-            'pdf': self.read_pdf,
-            'json': self.read_json,
-            'txt': self.read_txt,
-        }
-        if format not in func:
-            raise FileDetectError(f'Unsupported format: {format}')
-        func = func[format]
-        return func(file_obj, **kwargs)
-    def read_csv(self, file_obj: BytesIO, **kwargs):
+    @classmethod
+    def read_csv(cls, file_obj: BytesIO, delimiter=None, **kwargs):
         file_obj = decode(file_obj)
-        dialect = self._get_csv_dialect(file_obj)
+        dialect = cls._get_csv_dialect(file_obj, delimiter=delimiter)
         return pd.read_csv(file_obj, sep=dialect.delimiter, index_col=False)
-    def read_txt(self, file_obj: BytesIO, **kwargs):
+    @staticmethod
+    def read_txt(file_obj: BytesIO, name=None, **kwargs):
         file_obj = decode(file_obj)
         try:
@@ -202,10 +261,7 @@ class FileReader:
             )
         text = file_obj.read()
-        file_name = None
-        if hasattr(file_obj, "name"):
-            file_name = file_obj.name
-        metadata = {"source": file_name}
+        metadata = {"source": name}
         documents = [Document(page_content=text, metadata=metadata)]
         text_splitter = RecursiveCharacterTextSplitter(
@@ -220,10 +276,10 @@ class FileReader:
             ]
         )
-    def read_pdf(self, file_obj: BytesIO, **kwargs):
-        import fitz  # pymupdf
+    @staticmethod
+    def read_pdf(file_obj: BytesIO, **kwargs):
-        with fitz.open(stream=file_obj) as pdf:  # open pdf
+        with fitz.open(stream=file_obj.read()) as pdf:  # open pdf
             text = chr(12).join([page.get_text() for page in pdf])
         text_splitter = RecursiveCharacterTextSplitter(
@@ -236,16 +292,19 @@ class FileReader:
             {"content": split_text, "metadata": [{}] * len(split_text)}
         )
-    def read_json(self, file_obj: BytesIO, **kwargs):
+    @staticmethod
+    def read_json(file_obj: BytesIO, **kwargs):
         file_obj = decode(file_obj)
         file_obj.seek(0)
         json_doc = json.loads(file_obj.read())
         return pd.json_normalize(json_doc, max_level=0)
-    def read_parquet(self, file_obj: BytesIO, **kwargs):
+    @staticmethod
+    def read_parquet(file_obj: BytesIO, **kwargs):
         return pd.read_parquet(file_obj)
-    def read_excel(self, file_obj: BytesIO, sheet_name=None, **kwargs) -> pd.DataFrame:
+    @staticmethod
+    def read_xlsx(file_obj: BytesIO, sheet_name=None, **kwargs) -> pd.DataFrame:
         file_obj.seek(0)
         with pd.ExcelFile(file_obj) as xls:

mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py CHANGED Viewed

@@ -129,11 +129,4 @@ class MSGraphAPIBaseClient:
         api_url = self._get_api_url(endpoint)
         response = self._make_request(api_url, params)
-        # If the response content is a binary file or a TSV file, return the raw content.
-        if response.headers["Content-Type"] in ("application/octet-stream", "text/plain",
-                                                "text/tab-separated-values", "application/pdf"):
-            return response.content
-        # Otherwise, return the JSON content.
-        else:
-            return response.json()
+        return response.content

MindsDB 25.2.1.2__py3-none-any.whl → 25.2.2.1__py3-none-any.whl

Potentially problematic release.

MindsDB 25.2.1.2__py3-none-any.whl → 25.2.2.1__py3-none-any.whl