dragon-ml-toolbox 1.4.3__tar.gz → 1.4.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-1.4.3/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.5}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/MICE_imputation.py +24 -8
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/VIF_factor.py +9 -6
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/data_exploration.py +7 -37
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/ensemble_learning.py +8 -11
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/logger.py +2 -2
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/particle_swarm_optimization.py +8 -6
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/utilities.py +148 -24
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/pyproject.toml +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/README.md +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/pytorch_models.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/trainer.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/vision_helpers.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/setup.cfg +0 -0
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/MICE_imputation.py

@@ -3,8 +3,9 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
 from plotnine import ggplot, labs, theme, element_blank  # type: ignore
+from typing import Optional


 __all__ = [
@@ -17,7 +18,7 @@ __all__ = [
 ]


-def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str]]=None, resulting_datasets: int=1, iterations: int=20, random_state: int=101):

     # Initialize kernel with number of imputed datasets to generate
     kernel = mf.ImputationKernel(
@@ -35,10 +36,20 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     if imputed_datasets is None or len(imputed_datasets) == 0:
         raise ValueError("No imputed datasets were generated. Check the MICE process.")

+    # threshold binary columns
+    if binary_columns is not None:
+        invalid_binary_columns = set(binary_columns) - set(df.columns)
+        if invalid_binary_columns:
+            print(f"⚠️ These 'binary columns' are not in the dataset: {invalid_binary_columns}")
+        valid_binary_columns = [col for col in binary_columns if col not in invalid_binary_columns]
+        for imputed_df in imputed_datasets:
+            for binary_column_name in valid_binary_columns:
+                imputed_df[binary_column_name] = threshold_binary_values(imputed_df[binary_column_name])  # type: ignore
+
     if resulting_datasets == 1:
-        imputed_dataset_names = [f"{df_name}
+        imputed_dataset_names = [f"{df_name}_MICE"]
     else:
-        imputed_dataset_names = [f"{df_name}
+        imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(resulting_datasets)]

     # Ensure indexes match
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
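Usage sketch for the new binary_columns parameter of apply_mice: a minimal, illustrative toy example (the column names and data are placeholders, not from the package docs, and the tiny dataframe is not meant to be statistically meaningful).

import pandas as pd
from ml_tools.MICE_imputation import apply_mice

# "smoker" is a 0/1 flag that MICE may impute as a fraction between 0 and 1.
df = pd.DataFrame({"age": [34.0, None, 51.0, 46.0], "smoker": [1.0, None, 0.0, 1.0]})

# Passing binary_columns makes apply_mice snap those columns back to 0/1
# (threshold 0.5) in every imputed dataset after imputation finishes.
kernel, imputed_datasets, names = apply_mice(df, df_name="demo", binary_columns=["smoker"])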
@@ -106,7 +117,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     # Adjust plot display for the X axis
     _ticks = np.arange(iterations_cap)
     _labels = np.arange(1, iterations_cap + 1)
-    plt.xticks(ticks=_ticks, labels=_labels)
+    plt.xticks(ticks=_ticks, labels=_labels)  # type: ignore
     plt.grid(True)

     feature_save_name = sanitize_filename(feature_name)
@@ -202,7 +213,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     print(f"{local_dir_name} completed.")


-def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
+                      save_datasets_dir: str, save_metrics_dir: str,
+                      binary_columns: Optional[list[str]]=None,
+                      resulting_datasets: int=1,
+                      iterations: int=20,
+                      random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe
@@ -211,7 +227,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas
     4. Save convergence metrics
     5. Save distribution metrics

-    Target columns must be skipped from the imputation.
+    Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
     """
     # Check paths
     os.makedirs(save_datasets_dir, exist_ok=True)
@@ -229,7 +245,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas

     df, df_targets = _skip_targets(df, target_columns)

-    kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
+    kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)

     save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
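A minimal call sketch for the updated run_mice_pipeline signature; all paths and column names below are placeholders, not values from the package.

from ml_tools.MICE_imputation import run_mice_pipeline

# Imputes every CSV found in the input directory, skips the target columns,
# thresholds the listed binary columns, and saves datasets plus metrics.
run_mice_pipeline(
    df_path_or_dir="data/raw",
    target_columns=["target"],
    save_datasets_dir="data/imputed",
    save_metrics_dir="reports/mice",
    binary_columns=["smoker"],
    resulting_datasets=1,
    iterations=20,
    random_state=101,
)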
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/VIF_factor.py

@@ -26,6 +26,7 @@ def compute_vif(
     filename: Optional[str] = None,
     fontsize: int = 14,
     show_plot: bool = True,
+    verbose: bool = True
 ) -> pd.DataFrame:
     """
     Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
@@ -52,19 +53,20 @@ def compute_vif(
     if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
-        if missing_features:
+        if missing_features and verbose:
             print(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
-
+                if verbose:
+                    print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)

     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
-        if missing_ignore:
+        if missing_ignore and verbose:
             print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]

@@ -182,7 +184,7 @@ def compute_vif_multi(input_directory: str,
     max_features_to_plot: int = 20,
     fontsize: int = 14):
     """
-    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
+    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots or warnings will be displayed inline.
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

     Args:
@@ -210,10 +212,11 @@ def compute_vif_multi(input_directory: str,
             fontsize=fontsize,
             save_dir=output_plot_directory,
             filename=df_name,
-            show_plot=False
+            show_plot=False,
+            verbose=False)

         if output_dataset_directory is not None:
-            new_filename =
+            new_filename = df_name + '_VIF'
             result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)

             if len(dropped_cols) > 0:
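A small sketch of the new verbose flag, assuming the DataFrame is the first positional argument of compute_vif (the dataframe and column names are illustrative).

import pandas as pd
from ml_tools.VIF_factor import compute_vif

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0], "x2": [2.1, 3.9, 6.2, 8.1], "label": list("abcd")})

# verbose=False silences the "not numeric / not in DataFrame" warnings,
# which is what compute_vif_multi now passes when batching over a directory.
vif_df = compute_vif(df, show_plot=False, verbose=False)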
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/data_exploration.py

@@ -22,8 +22,7 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
-    "distribute_datasets_by_target"
+    "clip_outliers_multi"
 ]
@@ -90,18 +89,18 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):

     Returns:
         tuple: A tuple containing:
-            - pd.DataFrame: Targets dataframe.
             - pd.DataFrame: Features dataframe.
+            - pd.DataFrame: Targets dataframe.

     Prints:
         - Shape of the original dataframe.
-        - Shape of the targets dataframe.
         - Shape of the features dataframe.
+        - Shape of the targets dataframe.
     """
     df_targets = df[targets]
     df_features = df.drop(columns=targets)
-    print(f"Original shape: {df.shape}\
-    return
+    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
+    return df_features, df_targets


 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
@@ -153,7 +152,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho

         result_df = df.drop(columns=cols_to_drop)
         if show_nulls_after:
-            show_null_columns(df=result_df)
+            print(show_null_columns(df=result_df))

         return result_df
     else:
@@ -259,7 +258,7 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
-        print(f"Saved correlation heatmap
+        print(f"Saved correlation heatmap: '{plot_title}.svg'")

     plt.show()
     plt.close()
@@ -519,38 +518,9 @@ def clip_outliers_multi(
     return new_df


-def distribute_datasets_by_target(
-    df: pd.DataFrame,
-    target_columns: list[str]
-) -> Iterator[Tuple[str, pd.DataFrame]]:
-    """
-    Yields cleaned DataFrames for each target column, where rows with missing
-    target values are removed. The target column is placed at the end.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Preprocessed dataframe with all feature and target columns ready to train.
-    target_columns : List[str]
-        List of target column names to generate per-target DataFrames.
-
-    Yields
-    ------
-    Tuple[str, pd.DataFrame]
-        * First element is the target column name.
-        * Second element is the corresponding cleaned DataFrame.
-    """
-    feature_columns = [col for col in df.columns if col not in target_columns]
-
-    for target in target_columns:
-        subset = df[feature_columns + [target]].dropna(subset=[target])
-        yield target, subset
-
-
 def _is_notebook():
     return get_ipython() is not None


 def info():
     _script_info(__all__)
-
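The docstring fix above clarifies the return order of split_features_targets: features first, then targets. A tiny sketch with illustrative column names, matching the corrected return statement. Note that distribute_datasets_by_target is removed here and now lives in ml_tools.utilities (see the utilities diff below).

import pandas as pd
from ml_tools.data_exploration import split_features_targets

df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "y": [0, 1]})

# Features come back first, targets second.
df_features, df_targets = split_features_targets(df, targets=["y"])
# df_features -> columns ["x1", "x2"]; df_targets -> column ["y"]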
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/ensemble_learning.py

@@ -20,7 +20,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap

-from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object

 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -157,9 +157,7 @@ class RegressionTreeModels:
         self.gamma = gamma

         # LightGBM specific
-
-        num_leaves = (2**max_depth) - 1
-        print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        num_leaves = min(num_leaves, 2 ** (max_depth - 1))
         self.num_leaves = num_leaves
         self.min_data_in_leaf = min_data_in_leaf
@@ -202,7 +200,7 @@ class RegressionTreeModels:
             verbose=-1,
             reg_alpha=self.L1,
             reg_lambda=self.L2,
-            boosting_type='
+            boosting_type='gbdt',
             num_leaves=self.num_leaves,
             min_data_in_leaf=self.min_data_in_leaf
         )
@@ -321,9 +319,7 @@ class ClassificationTreeModels:
         self.gamma = gamma

         # LightGBM specific
-
-        num_leaves = (2**max_depth) - 1
-        print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        num_leaves = min(num_leaves, 2 ** (max_depth - 1))
         self.num_leaves = num_leaves
         self.min_data_in_leaf = min_data_in_leaf
@@ -370,7 +366,7 @@ class ClassificationTreeModels:
             verbose=-1,
             reg_alpha=self.L1,
             reg_lambda=self.L2,
-            boosting_type='
+            boosting_type='gbdt' if self.use_model_balance else 'goss',
             num_leaves=self.num_leaves,
             min_data_in_leaf=self.min_data_in_leaf,
             class_weight='balanced' if self.use_model_balance else None
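Instead of overriding the user's num_leaves with (2**max_depth) - 1 and printing a warning, both model classes now cap the supplied value at 2**(max_depth - 1), keeping it well below LightGBM's theoretical maximum for that depth. The arithmetic, for illustration:

# Illustration of the new cap: a user-supplied num_leaves is clamped to 2**(max_depth - 1).
max_depth = 6
requested_num_leaves = 100
effective_num_leaves = min(requested_num_leaves, 2 ** (max_depth - 1))
print(effective_num_leaves)  # 32, since 2**(6 - 1) = 32 < 100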
@@ -489,8 +485,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
 def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
-
-
+    filename = f"{model_name}_{sanitized_target_name}"
+    to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
+    serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)

 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(
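The saved artifact is now a plain dict serialized with joblib, which ObjectiveFunction later reloads via deserialize_object. A loading sketch, where the file path is a hypothetical example of the "{model_name}_{target}.joblib" pattern produced by _save_model:

from ml_tools.utilities import deserialize_object

artifact = deserialize_object("models/LightGBM_target.joblib", verbose=False, raise_on_error=True)
model = artifact["model"]                  # the fitted estimator
feature_names = artifact["feature_names"]  # list[str] used at training time
target_name = artifact["target_name"]      # name of the predicted target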
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/logger.py

@@ -55,7 +55,7 @@ def custom_logger(
     """
     try:
         os.makedirs(save_directory, exist_ok=True)
-        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M")
+        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
         log_name = sanitize_filename(log_name)
         base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")
@@ -80,7 +80,7 @@ def custom_logger(
         else:
             raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")

-        print(f"Log saved to: {base_path}")
+        print(f"Log saved to: '{base_path}'")

     except Exception as e:
         print(f"Error in custom_logger: {e}")
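The log timestamp now carries seconds, so two logs written within the same minute no longer collide on the same filename. A quick illustration of the two format strings:

from datetime import datetime

# 1.4.3 format: "%Y%m%d_%H%M"   -> e.g. 20240131_0945
# 1.4.5 format: "%Y%m%d_%H%M%S" -> e.g. 20240131_094512
print(datetime(2024, 1, 31, 9, 45, 12).strftime(r"%Y%m%d_%H%M%S"))  # 20240131_094512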
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/particle_swarm_optimization.py

@@ -8,7 +8,7 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from .utilities import sanitize_filename, _script_info, threshold_binary_values
+from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object


 __all__ = [
@@ -38,7 +38,7 @@ class ObjectiveFunction():
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
-        self._artifact =
+        self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
         self.model = self._get_from_artifact('model')
         self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
         self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
@@ -49,7 +49,7 @@ class ObjectiveFunction():
         if self.use_noise:
             features_array = self.add_noise(features_array)
         if self.is_hybrid:
-            features_array = threshold_binary_values(input_array=features_array,
+            features_array = threshold_binary_values(input_array=features_array, binary_values=self.binary_features) # type: ignore

         if features_array.ndim == 1:
             features_array = features_array.reshape(1, -1)
@@ -83,6 +83,8 @@ class ObjectiveFunction():
             raise ValueError("Loaded model is None")

     def _get_from_artifact(self, key: str):
+        if self._artifact is None:
+            raise TypeError("Load model error")
         val = self._artifact.get(key)
         if key == "feature_names":
             result = val if isinstance(val, list) and val else None
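For hybrid search spaces, ObjectiveFunction now passes binary_values so that only the trailing binary features of a candidate vector are snapped to 0/1. A minimal sketch of that behavior with made-up numbers, based on the threshold_binary_values signature introduced in utilities.py below:

import numpy as np
from ml_tools.utilities import threshold_binary_values

# A hybrid PSO candidate: two continuous features followed by two binary flags.
candidate = np.array([0.73, 12.4, 0.2, 0.9])

# With binary_values=2 only the last two entries are thresholded at 0.5;
# the continuous part is passed through unchanged -> [0.73, 12.4, 0.0, 1.0]
print(threshold_binary_values(input_array=candidate, binary_values=2))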
@@ -129,10 +131,10 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=1500,
             inequality_constrain_function=None,
-            post_hoc_analysis: Optional[int]=
-            workers: int=
+            post_hoc_analysis: Optional[int]=5,
+            workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/utilities.py

@@ -1,10 +1,13 @@
 import math
 import numpy as np
 import pandas as pd
+import polars as pl
 import os
 from pathlib import Path
 import re
-from typing import Literal, Union, Sequence
+from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
+import joblib
+from joblib.externals.loky.process_executor import TerminatedWorkerError


 # Keep track of available tools
@@ -16,7 +19,10 @@ __all__ = [
     "save_dataframe",
     "normalize_mixed_list",
     "sanitize_filename",
-    "threshold_binary_values"
+    "threshold_binary_values",
+    "serialize_object",
+    "deserialize_object",
+    "distribute_datasets_by_target"
 ]
@@ -194,12 +200,9 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:

     Returns:
         List[float]: A list of normalized float values summing to 1.0.
-            Values significantly smaller than the median scale are scaled up
-            before normalization to correct likely input errors.

     Notes:
         - Zeros and None values remain zero.
-        - If all input values are zero or None, the function returns a list of zeros.
         - Input strings are automatically cast to floats if possible.

     Example:
@@ -268,35 +271,156 @@ def sanitize_filename(filename: str) -> str:


 def threshold_binary_values(
-    input_array: Union[Sequence[float], np.ndarray],
-
-) -> np.ndarray:
+    input_array: Union[Sequence[float], np.ndarray, pd.Series, pl.Series],
+    binary_values: Optional[int] = None
+) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
     """
-    Thresholds binary features in a 1D
+    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.

+    Parameters:
+        input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
+        binary_values (Optional[int]) :
+            - If `None`, all values are treated as binary.
+            - If `int`, only this many last `binary_values` are thresholded.
+
+    Returns:
+        Same type as input, with binary elements binarized to 0 or 1 using a 0.5 threshold.
+    """
+    original_type = type(input_array)
+
+    if isinstance(input_array, pl.Series):
+        array = input_array.to_numpy()
+    elif isinstance(input_array, (pd.Series, np.ndarray)):
+        array = np.asarray(input_array)
+    elif isinstance(input_array, (list, tuple)):
+        array = np.array(input_array)
+    else:
+        raise TypeError("Unsupported input type")
+
+    array = array.flatten()
+    total = array.shape[0]
+
+    bin_count = total if binary_values is None else binary_values
+    if not (0 <= bin_count <= total):
+        raise ValueError("binary_values must be between 0 and the total number of elements")
+
+    if bin_count == 0:
+        result = array
+    else:
+        cont_part = array[:-bin_count] if bin_count < total else np.array([])
+        bin_part = (array[-bin_count:] > 0.5).astype(int)
+        result = np.concatenate([cont_part, bin_part])
+
+    if original_type is pd.Series:
+        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
+    elif original_type is pl.Series:
+        return pl.Series(input_array.name if hasattr(input_array, 'name') else "binary", result) # type: ignore
+    elif original_type is list:
+        return result.tolist()
+    elif original_type is tuple:
+        return tuple(result)
+    else:
+        return result
+
+
+def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+    """
+    Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.

     Parameters:
-
+        obj (Any) : The Python object to serialize.
+        save_dir (str) : Directory path where the serialized object will be saved.
+        filename (str) : Name for the output file, extension will be appended if needed.
+
+    Returns:
+        (str | None) : The full file path where the object was saved if successful; otherwise, None.
+    """
+    try:
+        os.makedirs(save_dir, exist_ok=True)
+        sanitized_name = sanitize_filename(filename)
+        if not sanitized_name.endswith('.joblib'):
+            sanitized_name = sanitized_name + ".joblib"
+        full_path = os.path.join(save_dir, sanitized_name)
+        joblib.dump(obj, full_path)
+    except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
+        message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
+        if raise_on_error:
+            raise Exception(message)
+        else:
+            print(message)
+        return None
+    else:
+        if verbose:
+            print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+        return full_path


-
+def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+    """
+    Loads a serialized object from a .joblib file.
+
+    Parameters:
+        filepath (str): Full path to the serialized .joblib file.

     Returns:
-
+        (Any | None): The deserialized Python object, or None if loading fails.
     """
-
-
-
-
-
-
-
+    if not os.path.exists(filepath):
+        print(f"❌ File does not exist: {filepath}")
+        return None
+    try:
+        obj = joblib.load(filepath)
+    except (IOError, OSError, EOFError, TypeError, ValueError) as e:
+        message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+        if raise_on_error:
+            raise Exception(message)
+        else:
+            print(message)
+        return None
+    else:
+        if verbose:
+            print(f"✅ Loaded object of type '{type(obj)}'")
+        return obj

-
-
+
+def distribute_datasets_by_target(
+    df_or_path: Union[pd.DataFrame, str],
+    target_columns: list[str],
+    verbose: bool = False
+) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df_or_path : [pd.DataFrame | str]
+        Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+    verbose: bool
+        Whether to print info for each yielded dataset.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * First element is the target column name.
+        * Second element is the corresponding cleaned DataFrame.
+    """
+    # Validate path
+    if isinstance(df_or_path, str):
+        df, _ = load_dataframe(df_or_path)
+    else:
+        df = df_or_path

-
+    valid_targets = [col for col in df.columns if col in target_columns]
+    feature_columns = [col for col in df.columns if col not in valid_targets]
+
+    for target in valid_targets:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        if verbose:
+            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
+        yield target, subset


 def _script_info(all_data: list[str]):
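A short end-to-end sketch of the new utilities added above; the dataframe, directory, and filenames are placeholders for illustration only.

import pandas as pd
from ml_tools.utilities import serialize_object, deserialize_object, distribute_datasets_by_target

df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y1": [0, None, 1], "y2": [5.0, 6.0, None]})

# One cleaned dataframe per target: rows with a missing target are dropped
# and the target column is moved to the end.
for target, subset in distribute_datasets_by_target(df, target_columns=["y1", "y2"], verbose=True):
    # Persist each subset as a .joblib file and load it back.
    path = serialize_object(subset, save_dir="artifacts", filename=f"data_{target}")
    if path is not None:
        restored = deserialize_object(path, verbose=False)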
All remaining files are renamed from the dragon_ml_toolbox-1.4.3 prefix to dragon_ml_toolbox-1.4.5 without content changes (the +0 -0 entries in the file list above).