dragon-ml-toolbox 11.1.1__py3-none-any.whl → 12.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of dragon-ml-toolbox as potentially problematic.
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.1.dist-info}/METADATA +22 -36
- dragon_ml_toolbox-12.0.1.dist-info/RECORD +40 -0
- ml_tools/ETL_cleaning.py +1 -0
- ml_tools/ETL_engineering.py +17 -5
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +5 -2
- ml_tools/ML_callbacks.py +3 -3
- ml_tools/ML_datasetmaster.py +1 -0
- ml_tools/ML_evaluation.py +2 -1
- ml_tools/ML_evaluation_multi.py +1 -0
- ml_tools/ML_inference.py +1 -0
- ml_tools/ML_models.py +3 -1
- ml_tools/ML_optimization.py +2 -1
- ml_tools/ML_scaler.py +3 -0
- ml_tools/ML_utilities.py +219 -0
- ml_tools/PSO_optimization.py +5 -6
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +1 -0
- ml_tools/VIF_factor.py +2 -1
- ml_tools/_logger.py +0 -2
- ml_tools/custom_logger.py +1 -0
- ml_tools/data_exploration.py +16 -10
- ml_tools/ensemble_inference.py +5 -6
- ml_tools/ensemble_learning.py +3 -2
- ml_tools/handle_excel.py +1 -0
- ml_tools/math_utilities.py +235 -0
- ml_tools/path_manager.py +2 -1
- ml_tools/serde.py +103 -0
- ml_tools/utilities.py +19 -453
- dragon_ml_toolbox-11.1.1.dist-info/RECORD +0 -37
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.1.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.1.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.1.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.1.dist-info}/top_level.txt +0 -0
ml_tools/VIF_factor.py
CHANGED
@@ -1,4 +1,3 @@
-
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
@@ -7,11 +6,13 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
 from statsmodels.tools.tools import add_constant
 import warnings
 from pathlib import Path
+
 from .utilities import yield_dataframes_from_dir, save_dataframe
 from .path_manager import sanitize_filename, make_fullpath
 from ._logger import _LOGGER
 from ._script_info import _script_info
 
+
 __all__ = [
     "compute_vif",
     "drop_vif_based",
ml_tools/_logger.py
CHANGED
ml_tools/custom_logger.py
CHANGED
ml_tools/data_exploration.py
CHANGED
@@ -417,7 +417,7 @@ def encode_categorical_features(
 
     # Handle the dataset splitting logic
     if split_resulting_dataset:
-        df_categorical = df_encoded[valid_columns].to_frame()
+        df_categorical = df_encoded[valid_columns].to_frame() # type: ignore
         df_non_categorical = df.drop(columns=valid_columns)
         return mappings, df_non_categorical, df_categorical
     else:
@@ -493,9 +493,9 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
     return df_cont, df_bin # type: ignore
 
 
-def plot_correlation_heatmap(df: pd.DataFrame,
+def plot_correlation_heatmap(df: pd.DataFrame,
+                             plot_title: str,
                              save_dir: Union[str, Path, None] = None,
-                             plot_title: str="Correlation Heatmap",
                              method: Literal["pearson", "kendall", "spearman"]="pearson"):
     """
     Plots a heatmap of pairwise correlations between numeric features in a DataFrame.
@@ -503,7 +503,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
     Args:
         df (pd.DataFrame): The input dataset.
         save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as a svg file.
-        plot_title:
+        plot_title: The suffix "`method` Correlation Heatmap" will be automatically appended.
         method (str): Correlation method to use. Must be one of:
             - 'pearson' (default): measures linear correlation (assumes normally distributed data),
             - 'kendall': rank correlation (non-parametric),
@@ -518,6 +518,9 @@ def plot_correlation_heatmap(df: pd.DataFrame,
     if numeric_df.empty:
         _LOGGER.warning("No numeric columns found. Heatmap not generated.")
         return
+    if method not in ["pearson", "kendall", "spearman"]:
+        _LOGGER.error(f"'method' must be pearson, kendall, or spearman.")
+        raise ValueError()
 
     corr = numeric_df.corr(method=method)
 
@@ -538,7 +541,10 @@ def plot_correlation_heatmap(df: pd.DataFrame,
         cbar_kws={"shrink": 0.8}
     )
 
-
+    # add suffix to title
+    full_plot_title = f"{plot_title} - {method.title()} Correlation Heatmap"
+
+    plt.title(full_plot_title)
     plt.xticks(rotation=45, ha='right')
     plt.yticks(rotation=0)
 
@@ -547,13 +553,13 @@ def plot_correlation_heatmap(df: pd.DataFrame,
     if save_dir:
         save_path = make_fullpath(save_dir, make=True)
         # sanitize the plot title to save the file
-
-
+        sanitized_plot_title = sanitize_filename(plot_title)
+        plot_filename = sanitized_plot_title + ".svg"
 
-        full_path = save_path /
+        full_path = save_path / plot_filename
 
         plt.savefig(full_path, bbox_inches="tight", format='svg')
-        _LOGGER.info(f"Saved correlation heatmap: '{
+        _LOGGER.info(f"Saved correlation heatmap: '{plot_filename}'")
 
     plt.show()
     plt.close()
@@ -968,7 +974,7 @@ def reconstruct_one_hot(
     # Handle rows where all OHE columns were 0 (e.g., original value was NaN).
     # In these cases, idxmax returns the first column name, but the sum of values is 0.
     all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0
-    new_column_values.loc[all_zero_mask] = np.nan
+    new_column_values.loc[all_zero_mask] = np.nan # type: ignore
 
     # Assign the new reconstructed column to the DataFrame
     new_df[base_name] = new_column_values
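For reference, a minimal usage sketch of the updated plot_correlation_heatmap signature. The DataFrame, title, and save directory are made up, and the ml_tools import path is assumed from the wheel layout; only the new required plot_title parameter, its position before save_dir, and the automatic "<Method> Correlation Heatmap" title suffix come from the diff above.

import pandas as pd
from ml_tools.data_exploration import plot_correlation_heatmap  # assumed import path

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [1, 0, 1, 0]})

# plot_title is now required and precedes save_dir; the method name is appended to the
# displayed title, and the saved file is named after the sanitized plot_title (e.g. "demo_features.svg").
plot_correlation_heatmap(df, plot_title="demo features", save_dir="plots", method="spearman")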
ml_tools/ensemble_inference.py
CHANGED
@@ -1,18 +1,17 @@
-from ._script_info import _script_info
-from ._logger import _LOGGER
-from .path_manager import make_fullpath, list_files_by_extension
-from .keys import EnsembleKeys
-
 from typing import Union, Literal, Dict, Any, Optional, List
 from pathlib import Path
 import json
-
 import joblib
 import numpy as np
 # Inference models
 import xgboost
 import lightgbm
 
+from ._script_info import _script_info
+from ._logger import _LOGGER
+from .path_manager import make_fullpath, list_files_by_extension
+from .keys import EnsembleKeys
+
 
 __all__ = [
     "InferenceHandler",
ml_tools/ensemble_learning.py
CHANGED
@@ -13,7 +13,8 @@ import lightgbm as lgb
 from sklearn.model_selection import train_test_split
 from sklearn.base import clone
 
-from .utilities import yield_dataframes_from_dir,
+from .utilities import yield_dataframes_from_dir, train_dataset_yielder
+from .serde import serialize_object
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from .keys import EnsembleKeys
@@ -481,7 +482,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
 
 ###### 4. Execution ######
 def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Path], target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
-                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=
+                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=True,
                           test_size: float=0.2, debug:bool=False, generate_learning_curves: bool = False):
     #Check models
     if isinstance(model_object, RegressionTreeModels):
ml_tools/handle_excel.py
CHANGED
@@ -2,6 +2,7 @@ from pathlib import Path
 from openpyxl import load_workbook, Workbook
 import pandas as pd
 from typing import List, Optional, Union
+
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
ml_tools/math_utilities.py
ADDED
@@ -0,0 +1,235 @@
+import pandas as pd
+import numpy as np
+import math
+from typing import Union, Sequence, Optional
+
+from ._script_info import _script_info
+from ._logger import _LOGGER
+
+
+__all__ = [
+    "normalize_mixed_list",
+    "threshold_binary_values",
+    "threshold_binary_values_batch",
+    "discretize_categorical_values",
+]
+
+
+def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
+    """
+    Normalize a mixed list of numeric values and strings cast to floats so that the sum of the values equals 1.0,
+    applying heuristic adjustments to correct for potential data entry scale mismatches.
+
+    Parameters:
+        data (list):
+            A list of values that may include strings, floats, integers, or None.
+            None values are treated as 0.0.
+
+        threshold (int, optional):
+            The number of log10 orders of magnitude below the median scale
+            at which a value is considered suspect and is scaled upward accordingly.
+            Default is 2.
+
+    Returns:
+        List[float]: A list of normalized float values summing to 1.0.
+
+    Notes:
+        - Zeros and None values remain zero.
+        - Input strings are automatically cast to floats if possible.
+
+    Example:
+        >>> normalize_mixed_list([1, "0.01", 4, None])
+        [0.2, 0.2, 0.6, 0.0]
+    """
+    # Step 1: Convert all values to float, treat None as 0.0
+    float_list = [float(x) if x is not None else 0.0 for x in data]
+
+    # Raise for negative values
+    if any(x < 0 for x in float_list):
+        _LOGGER.error("Negative values are not allowed in the input list.")
+        raise ValueError()
+
+    # Step 2: Compute log10 of non-zero values
+    nonzero = [x for x in float_list if x > 0]
+    if not nonzero:
+        return [0.0 for _ in float_list]
+
+    log_scales = [math.log10(x) for x in nonzero]
+    log_median = np.median(log_scales)
+
+    # Step 3: Adjust values that are much smaller than median
+    adjusted = []
+    for x in float_list:
+        if x == 0.0:
+            adjusted.append(0.0)
+        else:
+            log_x = math.log10(x)
+            if log_median - log_x > threshold:
+                scale_diff = round(log_median - log_x)
+                adjusted.append(x * (10 ** scale_diff))
+            else:
+                adjusted.append(x)
+
+    # Step 4: Normalize to sum to 1.0
+    total = sum(adjusted)
+    if total == 0:
+        return [0.0 for _ in adjusted]
+
+    return [x / total for x in adjusted]
+
+
+def threshold_binary_values(
+    input_array: Union[Sequence[float], np.ndarray, pd.Series],
+    binary_values: Optional[int] = None
+) -> Union[np.ndarray, pd.Series, list[float], tuple[float]]:
+    """
+    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
+
+    Binary elements are converted to 0 or 1 using a 0.5 threshold.
+
+    Parameters:
+        input_array: 1D sequence, NumPy array, or pandas Series.
+        binary_values (Optional[int]) :
+            - If `None`, all values are treated as binary.
+            - If `int`, only this many last `binary_values` are thresholded.
+
+    Returns:
+        Any:
+            Same type as input
+    """
+    original_type = type(input_array)
+
+    if isinstance(input_array, (pd.Series, np.ndarray)):
+        array = np.asarray(input_array)
+    elif isinstance(input_array, (list, tuple)):
+        array = np.array(input_array)
+    else:
+        _LOGGER.error("Unsupported input type")
+        raise TypeError()
+
+    array = array.flatten()
+    total = array.shape[0]
+
+    bin_count = total if binary_values is None else binary_values
+    if not (0 <= bin_count <= total):
+        _LOGGER.error("'binary_values' must be between 0 and the total number of elements")
+        raise ValueError()
+
+    if bin_count == 0:
+        result = array
+    else:
+        cont_part = array[:-bin_count] if bin_count < total else np.array([])
+        bin_part = (array[-bin_count:] > 0.5).astype(int)
+        result = np.concatenate([cont_part, bin_part])
+
+    if original_type is pd.Series:
+        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
+    elif original_type is list:
+        return result.tolist()
+    elif original_type is tuple:
+        return tuple(result)
+    else:
+        return result
+
+
+def threshold_binary_values_batch(
+    input_array: np.ndarray,
+    binary_values: int
+) -> np.ndarray:
+    """
+    Threshold the last `binary_values` columns of a 2D NumPy array to binary {0,1} using 0.5 cutoff.
+
+    Parameters
+    ----------
+    input_array : np.ndarray
+        2D array with shape (batch_size, n_features).
+    binary_values : int
+        Number of binary features located at the END of each row.
+
+    Returns
+    -------
+    np.ndarray
+        Thresholded array, same shape as input.
+    """
+    if input_array.ndim != 2:
+        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
+        raise AssertionError()
+
+    batch_size, total_features = input_array.shape
+
+    if not (0 <= binary_values <= total_features):
+        _LOGGER.error("'binary_values' out of valid range.")
+        raise AssertionError()
+
+    if binary_values == 0:
+        return input_array.copy()
+
+    cont_part = input_array[:, :-binary_values] if binary_values < total_features else np.empty((batch_size, 0))
+    bin_part = input_array[:, -binary_values:] > 0.5
+    bin_part = bin_part.astype(np.int32)
+
+    return np.hstack([cont_part, bin_part])
+
+
+def discretize_categorical_values(
+    input_array: np.ndarray,
+    categorical_info: dict[int, int],
+    start_at_zero: bool = False
+) -> np.ndarray:
+    """
+    Rounds specified columns of a 2D NumPy array to the nearest integer and
+    clamps the result to a valid categorical range.
+
+    Parameters
+    ----------
+    input_array : np.ndarray
+        2D array with shape (batch_size, n_features) containing continuous values.
+    categorical_info : dict[int, int]
+        A dictionary mapping column indices to their cardinality (number of categories).
+        Example: {3: 4} means column 3 will be clamped to its 4 valid categories.
+    start_at_zero : bool
+        If True, categories range from 0 to k-1.
+        If False, categories range from 1 to k.
+
+    Returns
+    -------
+    np.ndarray
+        A new array with the specified columns converted to integer categories.
+    """
+    # --- Input Validation ---
+    if input_array.ndim != 2:
+        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
+        raise ValueError()
+
+    if not isinstance(categorical_info, dict) or not categorical_info:
+        _LOGGER.error(f"'categorical_info' is not a dictionary, or is empty.")
+        raise ValueError()
+
+    _, total_features = input_array.shape
+    for col_idx, cardinality in categorical_info.items():
+        if not (0 <= col_idx < total_features):
+            _LOGGER.error(f"Column index {col_idx} is out of bounds for an array with {total_features} features.")
+            raise ValueError()
+        if not isinstance(cardinality, int) or cardinality < 2:
+            _LOGGER.error(f"Cardinality for column {col_idx} must be an integer >= 2, but got {cardinality}.")
+            raise ValueError()
+
+    # --- Core Logic ---
+    output_array = input_array.copy()
+
+    for col_idx, cardinality in categorical_info.items():
+        # 1. Round the column values using "round half up"
+        rounded_col = np.floor(output_array[:, col_idx] + 0.5)
+
+        # 2. Determine clamping bounds
+        min_bound = 0 if start_at_zero else 1
+        max_bound = cardinality - 1 if start_at_zero else cardinality
+
+        # 3. Clamp the values and update the output array
+        output_array[:, col_idx] = np.clip(rounded_col, min_bound, max_bound)
+
+    return output_array.astype(np.int32)
+
+
+def info():
+    _script_info(__all__)
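A short usage sketch of the new math_utilities helpers. The input values are made up and the ml_tools import path is assumed from the wheel layout; the behavior follows the source added above.

import numpy as np
from ml_tools.math_utilities import threshold_binary_values, discretize_categorical_values  # assumed import path

# The last two features of the 1D vector are treated as binary and snapped to 0/1 with a 0.5 cutoff.
vector = [0.7, 12.3, 0.2, 0.9]
print(threshold_binary_values(vector, binary_values=2))  # -> [0.7, 12.3, 0.0, 1.0]

# Column 2 holds a 3-category feature: values are rounded ("half up") and clamped to 1..3;
# note that the returned array is cast to int32 as a whole.
batch = np.array([[0.1, 5.0, 2.7],
                  [0.4, 3.2, 0.2]])
print(discretize_categorical_values(batch, categorical_info={2: 3}))  # -> [[0 5 3] [0 3 1]]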
ml_tools/path_manager.py
CHANGED
ml_tools/serde.py
ADDED
@@ -0,0 +1,103 @@
+import joblib
+from joblib.externals.loky.process_executor import TerminatedWorkerError
+from typing import Any, Union, TypeVar, get_origin, Type, Optional
+from pathlib import Path
+
+from .path_manager import make_fullpath, sanitize_filename
+from ._script_info import _script_info
+from ._logger import _LOGGER
+
+
+__all__ = [
+    "serialize_object",
+    "deserialize_object",
+]
+
+
+def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
+    """
+    Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
+
+    Parameters:
+        obj (Any) : The Python object to serialize.
+        save_dir (str | Path) : Directory path where the serialized object will be saved.
+        filename (str) : Name for the output file, extension will be appended if needed.
+    """
+    try:
+        save_path = make_fullpath(save_dir, make=True)
+        sanitized_name = sanitize_filename(filename)
+        if not sanitized_name.endswith('.joblib'):
+            sanitized_name = sanitized_name + ".joblib"
+        full_path = save_path / sanitized_name
+        joblib.dump(obj, full_path)
+    except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
+        _LOGGER.error(f"Failed to serialize object of type '{type(obj)}'.")
+        if raise_on_error:
+            raise e
+        return None
+    else:
+        if verbose:
+            _LOGGER.info(f"Object of type '{type(obj)}' saved to '{full_path}'")
+        return None
+
+# Define a TypeVar to link the expected type to the return type of deserialization
+T = TypeVar('T')
+
+def deserialize_object(
+    filepath: Union[str, Path],
+    expected_type: Optional[Type[T]] = None,
+    verbose: bool = True,
+    raise_on_error: bool = True
+) -> Optional[T]:
+    """
+    Loads a serialized object from a .joblib file.
+
+    Parameters:
+        filepath (str | Path): Full path to the serialized .joblib file.
+        expected_type (Type[T] | None): The expected type of the object.
+            If provided, the function raises a TypeError if the loaded object
+            is not an instance of this type. It correctly handles generics
+            like `list[str]` by checking the base type (e.g., `list`).
+            Defaults to None, which skips the type check.
+        verbose (bool): If True, logs success messages.
+        raise_on_error (bool): If True, raises exceptions on errors. If False, returns None instead.
+
+    Returns:
+        (Any | None): The deserialized Python object, which will match the
+            `expected_type` if provided. Returns None if an error
+            occurs and `raise_on_error` is False.
+    """
+    true_filepath = make_fullpath(filepath)
+
+    try:
+        obj = joblib.load(true_filepath)
+    except (IOError, OSError, EOFError, TypeError, ValueError) as e:
+        _LOGGER.error(f"Failed to deserialize object from '{true_filepath}'.")
+        if raise_on_error:
+            raise e
+        return None
+    else:
+        # --- Type Validation Step ---
+        if expected_type:
+            # get_origin handles generics (e.g., list[str] -> list)
+            # If it's not a generic, get_origin returns None, so we use the type itself.
+            type_to_check = get_origin(expected_type) or expected_type
+
+            # Can't do an isinstance check on 'Any', skip it.
+            if type_to_check is not Any and not isinstance(obj, type_to_check):
+                error_msg = (
+                    f"Type mismatch: Expected an instance of '{expected_type}', "
+                    f"but found '{type(obj)}' in '{true_filepath}'."
+                )
+                _LOGGER.error(error_msg)
+                if raise_on_error:
+                    raise TypeError()
+                return None
+
+        if verbose:
+            _LOGGER.info(f"Loaded object of type '{type(obj)}' from '{true_filepath}'.")
+
+        return obj
+
+def info():
+    _script_info(__all__)
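A brief usage sketch of the new serde helpers. The directory, filename, and demo dict are illustrative, and the ml_tools import path is assumed from the wheel layout; the function signatures and behavior come from the source added above.

from ml_tools.serde import serialize_object, deserialize_object  # assumed import path

payload = {"weights": [0.1, 0.9], "label": "demo"}

# Writes artifacts/model_payload.joblib (the .joblib extension is appended automatically).
serialize_object(payload, save_dir="artifacts", filename="model_payload")

# Optional type check: raises TypeError if the loaded object is not a dict.
restored = deserialize_object("artifacts/model_payload.joblib", expected_type=dict)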