dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-2.2.0.dist-info/RECORD +21 -0
- ml_tools/ETL_engineering.py +543 -0
- ml_tools/MICE_imputation.py +27 -28
- ml_tools/PSO_optimization.py +15 -15
- ml_tools/VIF_factor.py +20 -17
- ml_tools/data_exploration.py +58 -32
- ml_tools/ensemble_learning.py +40 -42
- ml_tools/handle_excel.py +98 -78
- ml_tools/logger.py +13 -11
- ml_tools/utilities.py +165 -60
- dragon_ml_toolbox-2.0.0.dist-info/RECORD +0 -20
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/top_level.txt +0 -0
ml_tools/logger.py
CHANGED

@@ -1,11 +1,11 @@
-import …
+from pathlib import Path
 from datetime import datetime
 from typing import Union, List, Dict, Any
 import pandas as pd
 from openpyxl.styles import Font, PatternFill
 import traceback
 import json
-from .utilities import sanitize_filename, _script_info
+from .utilities import sanitize_filename, _script_info, make_fullpath


 __all__ = [
@@ -21,7 +21,7 @@ def custom_logger(
         str,
         BaseException
     ],
-    save_directory: str,
+    save_directory: Union[str, Path],
     log_name: str,
 ) -> None:
     """
@@ -54,10 +54,12 @@ def custom_logger(
         ValueError: If the data type is unsupported.
     """
     try:
-…
+        save_path = make_fullpath(save_directory, make=True)
+
         timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
         log_name = sanitize_filename(log_name)
-…
+
+        base_path = save_path / f"{log_name}_{timestamp}"

         if isinstance(data, list):
             _log_list_to_txt(data, base_path + ".txt")
@@ -86,7 +88,7 @@ def custom_logger(
         print(f"Error in custom_logger: {e}")


-def _log_list_to_txt(data: List[Any], path: str) -> None:
+def _log_list_to_txt(data: List[Any], path: Path) -> None:
     log_lines = []
     for item in data:
         try:
@@ -98,7 +100,7 @@ def _log_list_to_txt(data: List[Any], path: str) -> None:
         f.write('\n'.join(log_lines))


-def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
+def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
     sanitized_dict = {}
     max_length = max(len(v) for v in data.values()) if data else 0

@@ -113,7 +115,7 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
     df.to_csv(path, index=False)


-def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
+def _log_dataframe_to_xlsx(data: pd.DataFrame, path: Path) -> None:
     writer = pd.ExcelWriter(path, engine='openpyxl')
     data.to_excel(writer, index=True, sheet_name='Data')

@@ -134,18 +136,18 @@ def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
     writer.close()


-def _log_string_to_log(data: str, path: str) -> None:
+def _log_string_to_log(data: str, path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         f.write(data.strip() + '\n')


-def _log_exception_to_log(exc: BaseException, path: str) -> None:
+def _log_exception_to_log(exc: BaseException, path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         f.write("Exception occurred:\n")
         traceback.print_exception(type(exc), exc, exc.__traceback__, file=f)


-def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
+def _log_dict_to_json(data: Dict[Any, Any], path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         json.dump(data, f, indent=4, ensure_ascii=False)
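In practice, custom_logger now accepts either a str or a pathlib.Path for save_directory and creates the directory on demand via make_fullpath(..., make=True). A minimal usage sketch (data values and paths are illustrative, not taken from the package):

from pathlib import Path
from ml_tools.logger import custom_logger

# A dict of lists is written out as CSV (see _log_dict_to_csv above).
history = {"epoch": [1, 2, 3], "loss": [0.91, 0.74, 0.62]}

custom_logger(
    data=history,
    save_directory=Path("logs") / "run_01",  # plain strings still work
    log_name="training_history",             # sanitized, then timestamped
)

One caveat visible in the hunks above: base_path is now a Path, but the unchanged call _log_list_to_txt(data, base_path + ".txt") still concatenates it with a str, which raises TypeError; judging by this diff alone, list input would land in the function's catch-all handler that prints "Error in custom_logger".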
ml_tools/utilities.py
CHANGED

@@ -2,7 +2,6 @@ import math
 import numpy as np
 import pandas as pd
 import polars as pl
-import os
 from pathlib import Path
 import re
 from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
@@ -12,6 +11,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError

 # Keep track of available tools
 __all__ = [
+    "make_fullpath",
     "list_csv_paths",
     "list_files_by_extension",
     "load_dataframe",
@@ -28,27 +28,83 @@ __all__ = [
 ]


-def list_csv_paths(directory: str) -> dict[str, str]:
+def make_fullpath(
+    input_path: Union[str, Path],
+    make: bool = False,
+    verbose: bool = False
+) -> Path:
     """
-…
+    Resolves a string or Path into an absolute Path.
+
+    - If the path exists, it is returned.
+    - If the path does not exist and `make=True`, it will:
+        - Create the file if the path has a suffix (i.e., is treated as a file)
+        - Create the directory if it has no suffix
+    - If `make=False` and the path does not exist, an error is raised.
+    - Optionally prints whether the resolved path is a file or directory.

     Parameters:
-…
+        input_path (str | Path): Path to resolve.
+        make (bool): If True, attempt to create file or directory.
+        verbose (bool): Print classification after resolution.

     Returns:
-…
+        Path: Resolved absolute path.
+
+    Raises:
+        ValueError: If the path doesn't exist and can't be created.
+    """
+    path = Path(input_path).expanduser()
+
+    is_file = path.suffix != ""
+
+    try:
+        resolved = path.resolve(strict=True)
+    except FileNotFoundError:
+        if not make:
+            raise ValueError(f"❌ Path does not exist: '{path}'")
+
+        try:
+            if is_file:
+                # Create parent directories first
+                path.parent.mkdir(parents=True, exist_ok=True)
+                path.touch(exist_ok=False)
+            else:
+                path.mkdir(parents=True, exist_ok=True)
+            resolved = path.resolve(strict=True)
+        except Exception as e:
+            raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+    if verbose:
+        if resolved.is_file():
+            print("📄 Path is a File")
+        elif resolved.is_dir():
+            print("📁 Path is a Directory")
+        else:
+            print("❓ Path exists but is neither file nor directory")
+
+    return resolved
+
+
+
+def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
     """
-…
+    Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.

-…
-…
+    Parameters:
+        directory (str | Path): Path to the directory containing `.csv` files.
+
+    Returns:
+        (dict[str, Path]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = make_fullpath(directory)

     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-        raise IOError(f"No CSV files found in directory: {dir_path}")
+        raise IOError(f"No CSV files found in directory: {dir_path.name}")

     # make a dictionary of paths and names
-    name_path_dict = {p.stem: …
+    name_path_dict = {p.stem: p for p in csv_paths}

     print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
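The behavior of the new helper, restated as a short sketch based on the implementation above (all paths illustrative):

from ml_tools.utilities import make_fullpath

# An existing directory is resolved to an absolute Path and returned.
data_dir = make_fullpath("~/datasets", verbose=True)  # expanduser() handles the '~'

# A missing path with make=True: 'metrics.csv' has a suffix, so it is treated
# as a file; parent directories are created first, then the file is touched.
log_file = make_fullpath("outputs/run_01/metrics.csv", make=True)

# A missing path with make=False (the default) raises ValueError.
try:
    make_fullpath("does/not/exist")
except ValueError as err:
    print(err)

Note the design choice: file-versus-directory classification rests entirely on path.suffix, so a directory named "v2.0" would be created as a file, and an extensionless file name such as "Makefile" as a directory.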
@@ -57,22 +113,19 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     return name_path_dict


-def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
     """
     Lists all files with the specified extension in the given directory and returns a mapping:
     filenames (without extensions) to their absolute paths.

     Parameters:
-        directory (str): Path to the directory to search in.
+        directory (str | Path): Path to the directory to search in.
         extension (str): File extension to search for (e.g., 'json', 'txt').

     Returns:
-        (dict[str, str]): Dictionary mapping {filename: filepath}.
+        (dict[str, Path]): Dictionary mapping {filename: filepath}.
     """
-    dir_path = …
-…
-    if not dir_path.is_dir():
-        raise FileNotFoundError(f"Directory not found: {dir_path}")
+    dir_path = make_fullpath(directory)

     # Normalize the extension (remove leading dot if present)
     normalized_ext = extension.lstrip(".").lower()
@@ -82,7 +135,7 @@ def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
     if not matched_paths:
         raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")

-    name_path_dict = {p.stem: …
+    name_path_dict = {p.stem: p for p in matched_paths}

     print(f"\n📂 '{normalized_ext.upper()}' files found:")
     for name in name_path_dict:
@@ -91,32 +144,70 @@ def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
     return name_path_dict


-def load_dataframe(…
+def load_dataframe(
+    df_path: Union[str, Path],
+    kind: Literal["pandas", "polars"] = "pandas",
+    all_strings: bool = False
+) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
     """
-    Load a CSV file into a …
+    Load a CSV file into a DataFrame and extract its base name.
+
+    Can load data as either a pandas or a polars DataFrame. Allows for loading all
+    columns as string types to prevent type inference errors.

     Args:
-        df_path (str):
+        df_path (Union[str, Path]):
+            The path to the CSV file.
+        kind (Literal["pandas", "polars"], optional):
+            The type of DataFrame to load. Defaults to "pandas".
+        all_strings (bool, optional):
+            If True, loads all columns as string data types. This is useful for
+            ETL tasks and to avoid type-inference errors. Defaults to False.

     Returns:
-        Tuple…
-…
+        (Tuple[DataFrameType, str]):
+            A tuple containing the loaded DataFrame (either pandas or polars)
+            and the base name of the file (without extension).
+
+    Raises:
+        FileNotFoundError: If the file does not exist at the given path.
+        ValueError: If the DataFrame is empty or an invalid 'kind' is provided.
     """
-    path = …
-…
+    path = make_fullpath(df_path)
+
     df_name = path.stem
-…
-…
-…
+
+    if kind == "pandas":
+        if all_strings:
+            df = pd.read_csv(path, encoding='utf-8', dtype=str)
+        else:
+            df = pd.read_csv(path, encoding='utf-8')
+
+    elif kind == "polars":
+        if all_strings:
+            df = pl.read_csv(path, infer_schema=False)
+        else:
+            # Default behavior: infer the schema.
+            df = pl.read_csv(path, infer_schema_length=1000)
+
+    else:
+        raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+
+    # This check works for both pandas and polars DataFrames
+    if df.shape[0] == 0:
+        raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
+
+    print(f"\n💿 Loaded {kind} dataset: '{df_name}' with shape: {df.shape}")
+
     return df, df_name


-def yield_dataframes_from_dir(datasets_dir: str):
+def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
     """
     Iterates over all CSV files in a given directory, loading each into a pandas DataFrame.

     Parameters:
-        datasets_dir (str):
+        datasets_dir (str | Path):
             The path to the directory containing `.csv` dataset files.

     Yields:
@@ -129,7 +220,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
     - CSV files are read using UTF-8 encoding.
     - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
-…
+    datasets_path = make_fullpath(datasets_dir)
+    for df_name, df_path in list_csv_paths(datasets_path).items():
         df, _ = load_dataframe(df_path)
         yield df, df_name
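A sketch of the reworked loader (the CSV path is illustrative):

from ml_tools.utilities import load_dataframe

# Default: pandas with inferred dtypes; the file's stem comes back as its name.
df, name = load_dataframe("data/measurements.csv")

# Polars with schema inference disabled: every column arrives as a string,
# which avoids type-inference errors during ETL.
raw, _ = load_dataframe("data/measurements.csv", kind="polars", all_strings=True)

Both branches now raise ValueError on a zero-row result, so callers no longer need their own emptiness check.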
@@ -193,29 +285,42 @@ def merge_dataframes(
     return merged_df


-def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
     """
-…
+    Saves a pandas or polars DataFrame to a CSV file.

-…
-    df …
-…
-…
+    Args:
+        df (Union[pd.DataFrame, pl.DataFrame]):
+            The DataFrame to save.
+        save_dir (Union[str, Path]):
+            The directory where the CSV file will be saved.
+        filename (str):
+            The CSV filename. The '.csv' extension will be added if missing.
     """
-…
+    # This check works for both pandas and polars
+    if df.shape[0] == 0:
         print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
         return

-…
+    # Create the directory if it doesn't exist
+    save_path = make_fullpath(save_dir, make=True)

+    # Clean the filename
     filename = sanitize_filename(filename)
-…
     if not filename.endswith('.csv'):
         filename += '.csv'

-    output_path = …
+    output_path = save_path / filename

-…
+    # --- Type-specific saving logic ---
+    if isinstance(df, pd.DataFrame):
+        df.to_csv(output_path, index=False, encoding='utf-8')
+    elif isinstance(df, pl.DataFrame):
+        df.write_csv(output_path) # Polars defaults to utf8 and no index
+    else:
+        # This error handles cases where an unsupported type is passed
+        raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
+
     print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
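With the type dispatch above, saving works the same way for both backends; a sketch with illustrative names:

import polars as pl
from ml_tools.utilities import save_dataframe

df = pl.DataFrame({"feature": [0.1, 0.2], "target": [1, 0]})

# The directory is created if needed, the name goes through sanitize_filename,
# and '.csv' is appended when missing.
save_dataframe(df, save_dir="outputs/processed", filename="training_set")

Anything other than a pandas or polars DataFrame now fails fast with an explicit TypeError.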
@@ -392,24 +497,24 @@ def threshold_binary_values_batch(
     return np.hstack([cont_part, bin_part])


-def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[Path]:
     """
     Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.

     Parameters:
         obj (Any) : The Python object to serialize.
-        save_dir (str) : Directory path where the serialized object will be saved.
+        save_dir (str | Path) : Directory path where the serialized object will be saved.
         filename (str) : Name for the output file, extension will be appended if needed.

     Returns:
-        (str | None) : The full file path where the object was saved if successful; otherwise, None.
+        (Path | None) : The full file path where the object was saved if successful; otherwise, None.
     """
     try:
-…
+        save_path = make_fullpath(save_dir, make=True)
         sanitized_name = sanitize_filename(filename)
         if not sanitized_name.endswith('.joblib'):
             sanitized_name = sanitized_name + ".joblib"
-        full_path = …
+        full_path = save_path / sanitized_name
         joblib.dump(obj, full_path)
     except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
         message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
@@ -424,23 +529,22 @@ def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True,
     return full_path


-def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.

     Parameters:
-        filepath (str): Full path to the serialized .joblib file.
+        filepath (str | Path): Full path to the serialized .joblib file.

     Returns:
         (Any | None): The deserialized Python object, or None if loading fails.
     """
-…
-…
-        return None
+    true_filepath = make_fullpath(filepath)
+
     try:
-        obj = joblib.load(filepath)
+        obj = joblib.load(true_filepath)
     except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-        message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+        message = f"❌ Failed to deserialize object from '{true_filepath}': {e}"
         if raise_on_error:
             raise Exception(message)
         else:
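The two helpers now compose into a Path-based round trip; a sketch with an illustrative object:

from ml_tools.utilities import serialize_object, deserialize_object

params = {"n_estimators": 300, "max_depth": 8}

# Returns the full Path on success; with raise_on_error=False (the default),
# a failure is reported and None is returned instead.
saved_path = serialize_object(params, save_dir="artifacts", filename="params")

if saved_path is not None:
    restored = deserialize_object(saved_path)  # Path input is accepted directly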
@@ -453,7 +557,7 @@ def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:


 def distribute_datasets_by_target(
-    df_or_path: Union[pd.DataFrame, str],
+    df_or_path: Union[pd.DataFrame, str, Path],
     target_columns: list[str],
     verbose: bool = False
 ) -> Iterator[Tuple[str, pd.DataFrame]]:
@@ -463,7 +567,7 @@ def distribute_datasets_by_target(

     Parameters
     ----------
-    df_or_path : [pd.DataFrame | str]
+    df_or_path : [pd.DataFrame | str | Path]
         Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
     target_columns : List[str]
         List of target column names to generate per-target DataFrames.
@@ -476,9 +580,10 @@ def distribute_datasets_by_target(
         * Target name.
         * Pandas DataFrame.
     """
-    # Validate path
-    if isinstance(df_or_path, str):
-…
+    # Validate path or dataframe
+    if isinstance(df_or_path, str) or isinstance(df_or_path, Path):
+        df_path = make_fullpath(df_or_path)
+        df, _ = load_dataframe(df_path)
     else:
         df = df_or_path

@@ -486,7 +591,7 @@ def distribute_datasets_by_target(
     feature_columns = [col for col in df.columns if col not in valid_targets]

     for target in valid_targets:
-        subset = df[feature_columns + [target]].dropna(subset=[target])
+        subset = df[feature_columns + [target]].dropna(subset=[target]) # type: ignore
         if verbose:
             print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
         yield target, subset
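Since the function is a generator, the per-target frames are built lazily; a usage sketch with illustrative column names:

from ml_tools.utilities import distribute_datasets_by_target

# A DataFrame, a str, or (new in this release) a Path is accepted.
for target, frame in distribute_datasets_by_target(
    "data/full_dataset.csv",
    target_columns=["yield", "purity"],
    verbose=True,
):
    print(target, frame.shape)  # all features plus one target; rows with a NaN target are dropped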
dragon_ml_toolbox-2.0.0.dist-info/RECORD
DELETED

@@ -1,20 +0,0 @@
-dragon_ml_toolbox-2.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-2.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
-ml_tools/MICE_imputation.py,sha256=wIfl8I3SyHUett-0vizaCiv0y_q43-zij8VczsbEIOI,11088
-ml_tools/PSO_optimization.py,sha256=bNiuKqyVoShGM4VBx4exJ8jjVVxQjlunkVpzaMb7fwY,20850
-ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
-ml_tools/data_exploration.py,sha256=NfPuN57wL5CXBnRyvIayxaYMe_ZKieHT3ZIcmtO_XIQ,20115
-ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
-ml_tools/ensemble_learning.py,sha256=v_btCkVthuEl3Pu1WipASvU5lGAVbXxxKEMq3boF-HI,37305
-ml_tools/handle_excel.py,sha256=NrCOWSENgb1HdqId_QOdPTjBUIJPePI9a2pnmmBd3lw,12613
-ml_tools/logger.py,sha256=WI7wiGmmALCQPl0AIauw_mPzFNTbaQf0v9J8pojvHUg,4708
-ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=_7RDgk9uBxPuHJRVOOFYFUOZyJ1o9QILnxYsKdGCfLQ,16772
-ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-2.0.0.dist-info/METADATA,sha256=7MHJGUXvWThm8-Rv9NZyogTQKBBMH4x0EXLsHel9Dns,2974
-dragon_ml_toolbox-2.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-2.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-2.0.0.dist-info/RECORD,,
{dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/WHEEL
File without changes

{dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE
File without changes

{dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md
File without changes

{dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/top_level.txt
File without changes