dragon-ml-toolbox 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ from matplotlib.colors import Colormap
6
6
  from matplotlib import rcdefaults
7
7
 
8
8
  import os
9
- from typing import Literal, Union, Optional
9
+ from typing import Literal, Union, Optional, Iterator, Tuple
10
10
  import joblib
11
11
 
12
12
  from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -17,11 +17,10 @@ import xgboost as xgb
17
17
  import lightgbm as lgb
18
18
 
19
19
  from sklearn.model_selection import train_test_split
20
- from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
21
20
  from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
22
21
  import shap
23
22
 
24
- from .utilities import yield_dataframes_from_dir, sanitize_filename
23
+ from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
25
24
 
26
25
  import warnings # Ignore warnings
27
26
  warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -29,113 +28,377 @@ warnings.filterwarnings('ignore', category=FutureWarning)
29
28
  warnings.filterwarnings('ignore', category=UserWarning)
30
29
 
31
30
 
31
+ __all__ = [
32
+ "dataset_yielder",
33
+ "RegressionTreeModels",
34
+ "ClassificationTreeModels",
35
+ "dataset_pipeline",
36
+ "evaluate_model_classification",
37
+ "plot_roc_curve",
38
+ "evaluate_model_regression",
39
+ "get_shap_values",
40
+ "train_test_pipeline",
41
+ "run_ensemble_pipeline"
42
+ ]
43
+
44
+ ## Type aliases
45
+ HandleImbalanceStrategy = Literal[
46
+ "ADASYN", "SMOTE", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", "by_model", None
47
+ ]
48
+
49
+ TaskType = Literal[
50
+ "classification", "regression"
51
+ ]
52
+
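
As a quick illustration of the new type aliases (within this module's context), the hypothetical helper `plan_run` below is not part of the package; it only shows how a static type checker constrains arguments to the enumerated literal values:

def plan_run(task: TaskType, strategy: HandleImbalanceStrategy = None) -> str:
    # Both parameters only accept the literal values enumerated in the aliases above.
    return f"task={task}, imbalance_strategy={strategy}"

plan_run("regression")                    # OK
plan_run("classification", "by_model")    # OK
# plan_run("clustering")                  # rejected by a static type checker
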
32
53
  ###### 1. Dataset Loader ######
33
- #Split a dataset into features and targets datasets
34
- def dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
35
- '''
36
- Yields one Tuple at a time: `(df_features, df_target, feature_names, target_name)`
37
- '''
38
- df_features = df.drop(columns=target_cols)
54
+ def dataset_yielder(
55
+ df: pd.DataFrame,
56
+ target_cols: list[str]
57
+ ) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
58
+ """
59
+ Yields one tuple at a time:
60
+ (features_dataframe, target_series, feature_names, target_name)
61
+
62
+ Skips any target columns not found in the DataFrame.
63
+ """
64
+ # Determine which target columns actually exist in the DataFrame
65
+ valid_targets = [col for col in target_cols if col in df.columns]
66
+
67
+ # Features = all columns excluding valid target columns
68
+ df_features = df.drop(columns=valid_targets)
39
69
  feature_names = df_features.columns.to_list()
40
-
41
- for target_col in target_cols:
70
+
71
+ for target_col in valid_targets:
42
72
  df_target = df[target_col]
43
73
  yield (df_features, df_target, feature_names, target_col)
44
74
 
75
+
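
An illustrative run of the reworked `dataset_yielder` on a toy DataFrame, assuming the function is imported from this module; note that the missing target column "w" is now skipped instead of raising:

import pandas as pd

df_demo = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "y": [0, 1, 0], "z": [7, 8, 9]})

for X, y, feature_names, target in dataset_yielder(df_demo, target_cols=["y", "z", "w"]):
    # "y" and "z" are yielded one at a time; features are always ["a", "b"]; "w" is skipped.
    print(target, feature_names, X.shape, y.shape)
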
45
76
  ###### 2. Initialize Models ######
46
- def get_models(task: Literal["classification", "regression"], random_state: int=101, is_balanced: bool = True,
47
- L1_regularization: float = 1.0, L2_regularization: float = 1.0, learning_rate: float=0.005) -> dict:
48
- '''
49
- Returns a dictionary `{Model_Name: Model}` with new instances of models.
50
- Valid tasks: "classification" or "regression".
77
+ class RegressionTreeModels:
78
+ """
79
+ A factory class for creating and configuring multiple gradient boosting regression models
80
+ with unified hyperparameters. This includes XGBoost, LightGBM, and HistGradientBoostingRegressor.
51
81
 
52
- Classification Models:
53
- - "XGBoost" - XGBClassifier
54
- - "LightGBM" - LGBMClassifier
55
- - "HistGB" - HistGradientBoostingClassifier
56
- Regression Models:
57
- - "XGBoost" - XGBRegressor
58
- - "LightGBM" - LGBMRegressor
59
- - "HistGB" - HistGradientBoostingRegressor
60
-
61
- For classification only: Set `is_balanced=False` for imbalanced datasets.
82
+ Call the instance directly (the `__call__()` method) to get a fresh dictionary of configured models.
83
+
84
+ Parameters
85
+ ----------
86
+ random_state : int
87
+ Seed used by the random number generator.
88
+
89
+ learning_rate : float [0.001 - 0.300]
90
+ Boosting learning rate (shrinkage).
62
91
 
63
- Increase L1 and L2 if model is overfitting
64
- '''
92
+ L1_regularization : float [0.0 - 10.0]
93
+ L1 regularization term (alpha); may drive some weights to sparsity.
94
+
95
+ L2_regularization : float [0.0 - 10.0]
96
+ L2 regularization term (lambda).
97
+
98
+ n_estimators : int [100 - 3000]
99
+ Number of boosting iterations for XGBoost and LightGBM.
100
+
101
+ max_depth : int [3 - 15]
102
+ Maximum depth of individual trees. Controls model complexity; high values may overfit.
103
+
104
+ subsample : float [0.5 - 1.0]
105
+ Fraction of rows per tree; used to prevent overfitting.
106
+
107
+ colsample_bytree : float [0.3 - 1.0]
108
+ Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
109
+
110
+ min_samples_leaf : int [10 - 100]
111
+ Minimum samples per leaf; higher = less overfitting (used in HistGB).
112
+
113
+ max_iter : int [100 - 2000]
114
+ Maximum number of iterations (used in HistGB).
115
+
116
+ min_child_weight : float [0.1 - 10.0]
117
+ Minimum sum of instance weight (hessian) needed in a child; larger values make the algorithm more conservative (used in XGBoost).
118
+
119
+ gamma : float [0.0 - 5.0]
120
+ Minimum loss reduction required to make a further partition on a leaf node; higher = more regularization (used in XGBoost).
121
+
122
+ num_leaves : int [20 - 200]
123
+ Maximum number of leaves in one tree; should be less than 2^(max_depth); larger = more complex (used in LightGBM).
124
+
125
+ min_data_in_leaf : int [10 - 100]
126
+ Minimum number of data points in a leaf; increasing may prevent overfitting (used in LightGBM).
127
+ """
128
+ def __init__(self,
129
+ random_state: int = 101,
130
+ learning_rate: float = 0.005,
131
+ L1_regularization: float = 1.0,
132
+ L2_regularization: float = 1.0,
133
+ n_estimators: int = 1000,
134
+ max_depth: int = 8,
135
+ subsample: float = 0.8,
136
+ colsample_bytree: float = 0.8,
137
+ min_samples_leaf: int = 50,
138
+ max_iter: int = 1000,
139
+ min_child_weight: float = 3.0,
140
+ gamma: float = 1.0,
141
+ num_leaves: int = 31,
142
+ min_data_in_leaf: int = 40):
143
+ # General config
144
+ self.random_state = random_state
145
+ self.lr = learning_rate
146
+ self.L1 = L1_regularization
147
+ self.L2 = L2_regularization
148
+
149
+ # Shared tree structure
150
+ self.n_estimators = n_estimators
151
+ self.max_depth = max_depth
152
+ self.subsample = subsample
153
+ self.colsample_bytree = colsample_bytree
154
+
155
+ # XGBoost specific
156
+ self.min_child_weight = min_child_weight
157
+ self.gamma = gamma
158
+
159
+ # LightGBM specific
160
+ if num_leaves >= (2**max_depth):
161
+ num_leaves = (2**max_depth) - 1
162
+ print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
163
+ self.num_leaves = num_leaves
164
+ self.min_data_in_leaf = min_data_in_leaf
165
+
166
+ # HistGB specific
167
+ self.max_iter = max_iter
168
+ self.min_samples_leaf = min_samples_leaf
169
+
170
+ def __call__(self) -> dict[str, object]:
171
+ """
172
+ Returns a dictionary with new instances of:
173
+ - "XGBoost": XGBRegressor
174
+ - "LightGBM": LGBMRegressor
175
+ - "HistGB": HistGradientBoostingRegressor
176
+ """
177
+ # XGBoost Regressor
178
+ xgb_model = xgb.XGBRegressor(
179
+ n_estimators=self.n_estimators,
180
+ max_depth=self.max_depth,
181
+ learning_rate=self.lr,
182
+ subsample=self.subsample,
183
+ colsample_bytree=self.colsample_bytree,
184
+ random_state=self.random_state,
185
+ reg_alpha=self.L1,
186
+ reg_lambda=self.L2,
187
+ eval_metric='rmse',
188
+ min_child_weight=self.min_child_weight,
189
+ gamma=self.gamma,
190
+ tree_method='hist',
191
+ grow_policy='lossguide'
192
+ )
193
+
194
+ # LightGBM Regressor
195
+ lgb_model = lgb.LGBMRegressor(
196
+ n_estimators=self.n_estimators,
197
+ learning_rate=self.lr,
198
+ max_depth=self.max_depth,
199
+ subsample=self.subsample,
200
+ colsample_bytree=self.colsample_bytree,
201
+ random_state=self.random_state,
202
+ verbose=-1,
203
+ reg_alpha=self.L1,
204
+ reg_lambda=self.L2,
205
+ boosting_type='dart',
206
+ num_leaves=self.num_leaves,
207
+ min_data_in_leaf=self.min_data_in_leaf
208
+ )
209
+
210
+ # HistGradientBoosting Regressor
211
+ hist_model = HistGradientBoostingRegressor(
212
+ max_iter=self.max_iter,
213
+ learning_rate=self.lr,
214
+ max_depth=self.max_depth,
215
+ min_samples_leaf=self.min_samples_leaf,
216
+ random_state=self.random_state,
217
+ l2_regularization=self.L2,
218
+ scoring='neg_mean_squared_error',
219
+ early_stopping=True,
220
+ validation_fraction=0.1
221
+ )
222
+
223
+ return {
224
+ "XGBoost": xgb_model,
225
+ "LightGBM": lgb_model,
226
+ "HistGB": hist_model
227
+ }
65
228
 
66
- # Model initialization logic
67
- if task not in ["classification", "regression"]:
68
- raise ValueError(f"Invalid task: {task}. Must be 'classification' or 'regression'.")
69
-
70
- models = {}
71
-
72
- # Common parameters
73
- xgb_params = {
74
- 'n_estimators': 200,
75
- 'max_depth': 5,
76
- 'learning_rate': learning_rate,
77
- 'subsample': 0.8,
78
- 'colsample_bytree': 0.8,
79
- 'random_state': random_state,
80
- 'reg_alpha': L1_regularization,
81
- 'reg_lambda': L2_regularization,
82
- }
83
-
84
- lgbm_params = {
85
- 'n_estimators': 200,
86
- 'learning_rate': learning_rate,
87
- 'max_depth': 5,
88
- 'subsample': 0.8,
89
- 'colsample_bytree': 0.8,
90
- 'random_state': random_state,
91
- 'verbose': -1,
92
- 'reg_alpha': L1_regularization,
93
- 'reg_lambda': L2_regularization,
94
- }
95
-
96
- hist_params = {
97
- 'max_iter': 200,
98
- 'learning_rate': learning_rate,
99
- 'max_depth': 5,
100
- 'min_samples_leaf': 30,
101
- 'random_state': random_state,
102
- 'l2_regularization': L2_regularization,
103
- }
104
-
105
- # XGB Model
106
- if task == "classification":
107
- xgb_params.update({
108
- 'scale_pos_weight': 1 if is_balanced else 8,
109
- 'eval_metric': 'aucpr'
110
- })
111
- models["XGBoost"] = xgb.XGBClassifier(**xgb_params)
112
- else:
113
- xgb_params.update({'eval_metric': 'rmse'})
114
- models["XGBoost"] = xgb.XGBRegressor(**xgb_params)
229
+ def __str__(self):
230
+ return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"
115
231
 
116
- # LGBM Model
117
- if task == "classification":
118
- lgbm_params.update({
119
- 'class_weight': None if is_balanced else 'balanced',
120
- 'boosting_type': 'goss' if is_balanced else 'dart',
121
- })
122
- models["LightGBM"] = lgb.LGBMClassifier(**lgbm_params)
123
- else:
124
- lgbm_params['boosting_type'] = 'dart'
125
- models["LightGBM"] = lgb.LGBMRegressor(**lgbm_params)
126
232
 
127
- # HistGB Model
128
- if task == "classification":
129
- hist_params.update({
130
- 'class_weight': None if is_balanced else 'balanced',
131
- 'scoring': 'loss' if is_balanced else 'balanced_accuracy',
132
- })
133
- models["HistGB"] = HistGradientBoostingClassifier(**hist_params)
134
- else:
135
- hist_params['scoring'] = 'neg_mean_squared_error'
136
- models["HistGB"] = HistGradientBoostingRegressor(**hist_params)
233
+ class ClassificationTreeModels:
234
+ """
235
+ A factory class for creating and configuring multiple gradient boosting classification models
236
+ with unified hyperparameters. This includes: XGBoost, LightGBM, and HistGradientBoostingClassifier.
237
+
238
+ Call the instance directly (the `__call__()` method) to get a fresh dictionary of configured models; the optional `use_model_balance` argument overrides the balancing behavior.
239
+
240
+ Parameters
241
+ ----------
242
+ random_state : int
243
+ Seed used by the random number generator to ensure reproducibility.
244
+
245
+ learning_rate : float [0.001 - 0.300]
246
+ Boosting learning rate (shrinkage factor).
247
+
248
+ L1_regularization : float [0.0 - 10.0]
249
+ L1 regularization term (alpha); may drive some weights to sparsity.
250
+
251
+ L2_regularization : float [0.0 - 10.0]
252
+ L2 regularization term (lambda).
253
+
254
+ n_estimators : int [100 - 3000]
255
+ Number of boosting rounds for XGBoost and LightGBM.
256
+
257
+ max_depth : int [3 - 15]
258
+ Maximum depth of individual trees in the ensemble. Controls model complexity; high values may overfit.
259
+
260
+ subsample : float [0.5 - 1.0]
261
+ Fraction of samples to use when fitting base learners; used to prevent overfitting.
262
+
263
+ colsample_bytree : float [0.3 - 1.0]
264
+ Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
265
+
266
+ min_samples_leaf : int [10 - 100]
267
+ Minimum number of samples required to be at a leaf node; higher = less overfitting (used in HistGB).
268
+
269
+ max_iter : int [100 - 2000]
270
+ Maximum number of boosting iterations (used in HistGB).
271
+
272
+ min_child_weight : float [0.1 - 10.0]
273
+ Minimum sum of instance weight (Hessian) in a child node; larger values make the algorithm more conservative (used in XGBoost).
274
+
275
+ gamma : float [0.0 - 5.0]
276
+ Minimum loss reduction required to make a further partition; higher = more regularization (used in XGBoost).
277
+
278
+ num_leaves : int [20 - 200]
279
+ Maximum number of leaves in one tree. Should be less than 2^(max_depth); larger = more complex (used in LightGBM).
280
+
281
+ min_data_in_leaf : int [10 - 100]
282
+ Minimum number of samples required in a leaf; increasing may prevent overfitting (used in LightGBM).
283
+
284
+ Attributes
285
+ ----------
286
+ use_model_balance : bool
287
+ Indicates whether to apply class balancing strategies internally. Can be overridden at runtime via the `__call__` method.
288
+ """
289
+ def __init__(self,
290
+ random_state: int = 101,
291
+ learning_rate: float = 0.005,
292
+ L1_regularization: float = 1.0,
293
+ L2_regularization: float = 1.0,
294
+ n_estimators: int = 1000,
295
+ max_depth: int = 8,
296
+ subsample: float = 0.8,
297
+ colsample_bytree: float = 0.8,
298
+ min_samples_leaf: int = 50,
299
+ max_iter: int = 1000,
300
+ min_child_weight: float = 3.0,
301
+ gamma: float = 1.0,
302
+ num_leaves: int = 31,
303
+ min_data_in_leaf: int = 40):
304
+ # General config
305
+ self.random_state = random_state
306
+ self.lr = learning_rate
307
+ self.L1 = L1_regularization
308
+ self.L2 = L2_regularization
309
+
310
+ # To be set by the pipeline
311
+ self.use_model_balance: bool = True
312
+
313
+ # Shared tree structure
314
+ self.n_estimators = n_estimators
315
+ self.max_depth = max_depth
316
+ self.subsample = subsample
317
+ self.colsample_bytree = colsample_bytree
318
+
319
+ # XGBoost specific
320
+ self.min_child_weight = min_child_weight
321
+ self.gamma = gamma
322
+
323
+ # LightGBM specific
324
+ if num_leaves >= (2**max_depth):
325
+ num_leaves = (2**max_depth) - 1
326
+ print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
327
+ self.num_leaves = num_leaves
328
+ self.min_data_in_leaf = min_data_in_leaf
329
+
330
+ # HistGB specific
331
+ self.max_iter = max_iter
332
+ self.min_samples_leaf = min_samples_leaf
333
+
334
+ def __call__(self, use_model_balance: Optional[bool]=None) -> dict[str, object]:
335
+ """
336
+ Returns a dictionary with new instances of:
337
+ - "XGBoost": XGBClassifier
338
+ - "LightGBM": LGBMClassifier
339
+ - "HistGB": HistGradientBoostingClassifier
340
+ """
341
+ if use_model_balance is not None:
342
+ self.use_model_balance = use_model_balance
343
+
344
+ # XGBoost Classifier
345
+ xgb_model = xgb.XGBClassifier(
346
+ n_estimators=self.n_estimators,
347
+ max_depth=self.max_depth,
348
+ learning_rate=self.lr,
349
+ subsample=self.subsample,
350
+ colsample_bytree=self.colsample_bytree,
351
+ random_state=self.random_state,
352
+ reg_alpha=self.L1,
353
+ reg_lambda=self.L2,
354
+ eval_metric='aucpr',
355
+ min_child_weight=self.min_child_weight,
356
+ gamma=self.gamma,
357
+ tree_method='hist',
358
+ grow_policy='lossguide',
359
+ scale_pos_weight=8.0 if self.use_model_balance else 1.0
360
+ )
361
+
362
+ # LightGBM Classifier
363
+ lgb_model = lgb.LGBMClassifier(
364
+ n_estimators=self.n_estimators,
365
+ learning_rate=self.lr,
366
+ max_depth=self.max_depth,
367
+ subsample=self.subsample,
368
+ colsample_bytree=self.colsample_bytree,
369
+ random_state=self.random_state,
370
+ verbose=-1,
371
+ reg_alpha=self.L1,
372
+ reg_lambda=self.L2,
373
+ boosting_type='dart' if self.use_model_balance else 'goss',
374
+ num_leaves=self.num_leaves,
375
+ min_data_in_leaf=self.min_data_in_leaf,
376
+ class_weight='balanced' if self.use_model_balance else None
377
+ )
378
+
379
+ # HistGradientBoosting Classifier
380
+ hist_model = HistGradientBoostingClassifier(
381
+ max_iter=self.max_iter,
382
+ learning_rate=self.lr,
383
+ max_depth=self.max_depth,
384
+ min_samples_leaf=self.min_samples_leaf,
385
+ random_state=self.random_state,
386
+ l2_regularization=self.L2,
387
+ early_stopping=True,
388
+ validation_fraction=0.1,
389
+ class_weight='balanced' if self.use_model_balance else None,
390
+ scoring='balanced_accuracy' if self.use_model_balance else 'loss'
391
+ )
392
+
393
+ return {
394
+ "XGBoost": xgb_model,
395
+ "LightGBM": lgb_model,
396
+ "HistGB": hist_model
397
+ }
398
+
399
+ def __str__(self):
400
+ return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"
137
401
 
138
- return models
139
402
 
140
403
  ###### 3. Process Dataset ######
141
404
  # function to split data into train and test
@@ -144,23 +407,9 @@ def _split_data(features, target, test_size, random_state, task):
144
407
  stratify=target if task=="classification" else None)
145
408
  return X_train, X_test, y_train, y_test
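
For context, the split above is stratified on the target only for classification; a self-contained sketch of the same `train_test_split` call on synthetic data:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

features = pd.DataFrame(np.arange(40).reshape(20, 2), columns=["f1", "f2"])
target = pd.Series([0] * 15 + [1] * 5)
task = "classification"

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=101,
    stratify=target if task == "classification" else None)   # preserves the 3:1 class ratio in both splits
print(y_train.value_counts().to_dict(), y_test.value_counts().to_dict())
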
146
409
 
147
- # function to standardize the data
148
- def _standardize_data(train_features, test_features, scaler_code):
149
- if scaler_code == "standard":
150
- scaler = StandardScaler()
151
- elif scaler_code == "minmax":
152
- scaler = MinMaxScaler()
153
- elif scaler_code == "maxabs":
154
- scaler = MaxAbsScaler()
155
- else:
156
- raise ValueError(f"Unrecognized scaler {scaler_code}")
157
- train_scaled = scaler.fit_transform(train_features)
158
- test_scaled = scaler.transform(test_features)
159
- return train_scaled, test_scaled, scaler
160
-
161
410
  # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
162
- def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
163
- strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], random_state):
411
+ def _resample(X_train: np.ndarray, y_train: pd.Series,
412
+ strategy: HandleImbalanceStrategy, random_state):
164
413
  '''
165
414
  Oversample minority class or undersample majority class.
166
415
 
@@ -168,30 +417,29 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
168
417
  '''
169
418
  if strategy == 'SMOTE':
170
419
  resample_algorithm = SMOTE(random_state=random_state, k_neighbors=3)
171
- elif strategy == 'RANDOM':
420
+ elif strategy == 'RAND_OVERSAMPLE':
172
421
  resample_algorithm = RandomOverSampler(random_state=random_state)
173
- elif strategy == 'UNDERSAMPLE':
422
+ elif strategy == 'RAND_UNDERSAMPLE':
174
423
  resample_algorithm = RandomUnderSampler(random_state=random_state)
175
424
  elif strategy == 'ADASYN':
176
425
  resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)
177
426
  else:
178
427
  raise ValueError(f"Invalid resampling strategy: {strategy}")
179
428
 
180
- X_res, y_res, *_ = resample_algorithm.fit_resample(X_train_scaled, y_train)
429
+ X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
181
430
  return X_res, y_res
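
For illustration only (synthetic data, private helper), resampling a small imbalanced training set with one of the new strategy names:

import numpy as np
import pandas as pd

X_demo = np.random.default_rng(0).normal(size=(25, 3))    # 25 rows, 3 features
y_demo = pd.Series([0] * 20 + [1] * 5)                    # imbalanced 20:5

X_res, y_res = _resample(X_train=X_demo, y_train=y_demo, strategy="RAND_OVERSAMPLE", random_state=101)
print(X_res.shape, y_res.value_counts().to_dict())        # minority class oversampled to 20:20
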
182
431
 
183
432
  # DATASET PIPELINE
184
- def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Literal["classification", "regression"],
185
- resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None], scaler: Literal["standard", "minmax", "maxabs"],
433
+ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: TaskType,
434
+ resample_strategy: HandleImbalanceStrategy,
186
435
  test_size: float=0.2, debug: bool=False, random_state: int=101):
187
436
  '''
188
437
  1. Make Train/Test splits
189
- 2. Standardize Train and Test Features
190
- 3. Oversample imbalanced classes (classification)
438
+ 2. Oversample imbalanced classes (classification)
191
439
 
192
- Return a processed Tuple: (X_train, y_train, X_test, y_test, Scaler)
440
+ Return a processed Tuple: (X_train, y_train, X_test, y_test)
193
441
 
194
- `(nD-array, 1D-array, nD-array, Series, Scaler)`
442
+ `(nD-array, 1D-array, nD-array, Series)`
195
443
  '''
196
444
  #DEBUG
197
445
  if debug:
@@ -206,24 +454,18 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
206
454
  if debug:
207
455
  print(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
208
456
 
209
- # Standardize
210
- X_train_scaled, X_test_scaled, scaler_object = _standardize_data(train_features=X_train, test_features=X_test, scaler_code=scaler)
211
-
212
- #DEBUG
213
- if debug:
214
- print(f"Shapes after scaling features - X_train: {X_train_scaled.shape}, y_train: {y_train.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
215
457
 
216
- # Scale
217
- if resample_strategy is None or task == "regression":
218
- X_train_oversampled, y_train_oversampled = X_train_scaled, y_train
458
+ # Resample
459
+ if resample_strategy is None or resample_strategy == "by_model" or task == "regression":
460
+ X_train_oversampled, y_train_oversampled = X_train, y_train
219
461
  else:
220
- X_train_oversampled, y_train_oversampled = _resample(X_train_scaled=X_train_scaled, y_train=y_train, strategy=resample_strategy, random_state=random_state)
462
+ X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)
221
463
 
222
464
  #DEBUG
223
465
  if debug:
224
- print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
466
+ print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
225
467
 
226
- return X_train_oversampled, y_train_oversampled, X_test_scaled, y_test, scaler_object
468
+ return X_train_oversampled, y_train_oversampled, X_test, y_test
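
A usage sketch of the slimmed-down pipeline (feature scaling is no longer performed here) on a synthetic classification dataset; the function name comes from this diff, the data are made up:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df_X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f1", "f2", "f3"])
s_y = ((df_X["f1"] + rng.normal(scale=0.5, size=100)) > 0).astype(int).rename("target")

X_train, y_train, X_test, y_test = dataset_pipeline(
    df_features=df_X, df_target=s_y, task="classification",
    resample_strategy="RAND_OVERSAMPLE",    # or None / "by_model" to leave the training split untouched
    test_size=0.2, random_state=101)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
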
227
469
 
228
470
  ###### 4. Train and Evaluation ######
229
471
  # Trainer function
@@ -244,11 +486,11 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
244
486
  return model_dir
245
487
 
246
488
  # save model
247
- def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler]):
489
+ def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
248
490
  #Sanitize filenames to save
249
491
  sanitized_target_name = sanitize_filename(target_name)
250
492
  full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
251
- joblib.dump({'model': trained_model, 'scaler':scaler_object, 'feature_names': feature_names, 'target_name':target_name}, full_path)
493
+ joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)
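
Since the scaler is no longer bundled, a saved artifact now holds only the model, feature names, and target name. A hedged sketch of loading one back (the file name below is hypothetical but follows the naming pattern above):

import joblib

artifact = joblib.load("XGBoost_target.joblib")   # hypothetical path produced by _save_model
model = artifact["model"]
feature_names = artifact["feature_names"]
target_name = artifact["target_name"]
print(target_name, len(feature_names), type(model).__name__)
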
252
494
 
253
495
  # function to evaluate the model and save metrics (Classification)
254
496
  def evaluate_model_classification(
@@ -257,10 +499,9 @@ def evaluate_model_classification(
257
499
  save_dir: str,
258
500
  x_test_scaled: np.ndarray,
259
501
  single_y_test: np.ndarray,
260
- target_id: str,
502
+ target_name: str,
261
503
  figsize: tuple = (10, 8),
262
- title_fontsize: int = 24,
263
- label_fontsize: int = 24,
504
+ base_fontsize: int = 24,
264
505
  cmap: Colormap = plt.cm.Blues # type: ignore
265
506
  ) -> np.ndarray:
266
507
  """
@@ -271,8 +512,8 @@ def evaluate_model_classification(
271
512
  model_name: Identifier for the model
272
513
  save_dir: Directory where results are saved
273
514
  x_test_scaled: Feature matrix for test set
274
- single_y_test: True binary labels
275
- target_id: Suffix for naming output files
515
+ single_y_test: True targets
516
+ target_name: Name of the target column; used in report and figure filenames
276
517
  figsize: Size of the confusion matrix figure (width, height)
277
518
  fontsize: Font size used for title, axis labels and ticks
278
519
  cmap: Color map for the confusion matrix. Examples include:
@@ -300,10 +541,10 @@ def evaluate_model_classification(
300
541
  )
301
542
 
302
543
  # Save text report
303
- sanitized_target_id = sanitize_filename(target_id)
304
- report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_id}.txt")
544
+ sanitized_target_name = sanitize_filename(target_name)
545
+ report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
305
546
  with open(report_path, "w") as f:
306
- f.write(f"{model_name} - {target_id}\t\tAccuracy: {accuracy:.2f}\n")
547
+ f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
307
548
  f.write("Classification Report:\n")
308
549
  f.write(report) # type: ignore
309
550
 
@@ -318,20 +559,20 @@ def evaluate_model_classification(
318
559
  ax=ax
319
560
  )
320
561
 
321
- ax.set_title(f"{model_name} - {target_id}", fontsize=title_fontsize)
322
- ax.tick_params(axis='both', labelsize=label_fontsize)
323
- ax.set_xlabel("Predicted label", fontsize=label_fontsize)
324
- ax.set_ylabel("True label", fontsize=label_fontsize)
562
+ ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
563
+ ax.tick_params(axis='both', labelsize=base_fontsize)
564
+ ax.set_xlabel("Predicted label", fontsize=base_fontsize)
565
+ ax.set_ylabel("True label", fontsize=base_fontsize)
325
566
 
326
567
  # Turn off gridlines
327
568
  ax.grid(False)
328
569
 
329
570
  # Manually update font size of cell texts
330
571
  for text in ax.texts:
331
- text.set_fontsize(title_fontsize+4)
572
+ text.set_fontsize(base_fontsize+4)
332
573
 
333
574
  fig.tight_layout()
334
- fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_id}.svg")
575
+ fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
335
576
  fig.savefig(fig_path, format="svg", bbox_inches="tight")
336
577
  plt.close(fig)
337
578
 
@@ -356,7 +597,7 @@ def plot_roc_curve(
356
597
  Parameters:
357
598
  true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
358
599
  probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
359
- target_name: str, used for figure title and filename.
600
+ target_name: str, name of the target variable; used for the figure title and output filename.
360
601
  save_directory: str, path to directory where figure is saved.
361
602
  color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
362
603
  - Named colors: "darkorange", "blue", "red", "green", "black"
@@ -425,7 +666,7 @@ def plot_roc_curve(
425
666
  def evaluate_model_regression(model, model_name: str,
426
667
  save_dir: str,
427
668
  x_test_scaled: np.ndarray, single_y_test: np.ndarray,
428
- target_id: str,
669
+ target_name: str,
429
670
  figure_size: tuple = (12, 8),
430
671
  alpha_transparency: float = 0.5,
431
672
  base_fontsize: int = 24):
@@ -439,10 +680,10 @@ def evaluate_model_regression(model, model_name: str,
439
680
  r2 = r2_score(single_y_test, y_pred)
440
681
 
441
682
  # Create formatted report
442
- sanitized_target_id = sanitize_filename(target_id)
443
- report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_id}.txt")
683
+ sanitized_target_name = sanitize_filename(target_name)
684
+ report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
444
685
  with open(report_path, "w") as f:
445
- f.write(f"{model_name} - {target_id} Regression Performance\n")
686
+ f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
446
687
  f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
447
688
  f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
448
689
  f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
@@ -455,10 +696,10 @@ def evaluate_model_regression(model, model_name: str,
455
696
  plt.axhline(0, color='red', linestyle='--')
456
697
  plt.xlabel("Predicted Values", fontsize=base_fontsize)
457
698
  plt.ylabel("Residuals", fontsize=base_fontsize)
458
- plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
699
+ plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
459
700
  plt.grid(True)
460
701
  plt.tight_layout()
461
- plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_id}.svg"), bbox_inches='tight', format="svg")
702
+ plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
462
703
  plt.close()
463
704
 
464
705
  # Create true vs predicted values plot
@@ -469,9 +710,9 @@ def evaluate_model_regression(model, model_name: str,
469
710
  'k--', lw=2)
470
711
  plt.xlabel('True Values', fontsize=base_fontsize)
471
712
  plt.ylabel('Predictions', fontsize=base_fontsize)
472
- plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
713
+ plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
473
714
  plt.grid(True)
474
- plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_id}.svg")
715
+ plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
475
716
  plt.savefig(plot_path, bbox_inches='tight', format="svg")
476
717
  plt.close()
477
718
 
@@ -485,7 +726,7 @@ def get_shap_values(
485
726
  save_dir: str,
486
727
  features_to_explain: np.ndarray,
487
728
  feature_names: list[str],
488
- target_id: str,
729
+ target_name: str,
489
730
  task: Literal["classification", "regression"],
490
731
  max_display_features: int = 10,
491
732
  figsize: tuple = (16, 20),
@@ -504,7 +745,7 @@ def get_shap_values(
504
745
  features_to_explain: Should match the model's training data format, including scaling.
505
746
  save_dir: Directory to save visualizations
506
747
  """
507
- sanitized_target_id = sanitize_filename(target_id)
748
+ sanitized_target_name = sanitize_filename(target_name)
508
749
 
509
750
  def _apply_plot_style():
510
751
  styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -567,9 +808,9 @@ def get_shap_values(
567
808
  _create_shap_plot(
568
809
  shap_values=class_shap,
569
810
  features=features_to_explain,
570
- save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_Class{class_name}_{plot_type}.svg"),
811
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
571
812
  plot_type=plot_type,
572
- title=f"{model_name} - {target_id} (Class {class_name})"
813
+ title=f"{model_name} - {target_name} (Class {class_name})"
573
814
  )
574
815
  else:
575
816
  values = shap_values[1] if isinstance(shap_values, list) else shap_values
@@ -577,9 +818,9 @@ def get_shap_values(
577
818
  _create_shap_plot(
578
819
  shap_values=values,
579
820
  features=features_to_explain,
580
- save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
821
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
581
822
  plot_type=plot_type,
582
- title=f"{model_name} - {target_id}"
823
+ title=f"{model_name} - {target_name}"
583
824
  )
584
825
 
585
826
  def _plot_for_regression(shap_values):
@@ -587,9 +828,9 @@ def get_shap_values(
587
828
  _create_shap_plot(
588
829
  shap_values=shap_values,
589
830
  features=features_to_explain,
590
- save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
831
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
591
832
  plot_type=plot_type,
592
- title=f"{model_name} - {target_id}"
833
+ title=f"{model_name} - {target_name}"
593
834
  )
594
835
  #START_O
595
836
 
@@ -607,10 +848,10 @@ def get_shap_values(
607
848
 
608
849
 
609
850
  # TRAIN TEST PIPELINE
610
- def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
851
+ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
611
852
  train_features: np.ndarray, train_target: np.ndarray,
612
853
  test_features: np.ndarray, test_target: np.ndarray,
613
- feature_names: list[str], target_id: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler],
854
+ feature_names: list[str], target_name: str,
614
855
  save_dir: str,
615
856
  debug: bool=False, save_model: bool=False):
616
857
  '''
@@ -620,7 +861,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
620
861
 
621
862
  Returns: Tuple(Trained model, Test-set Predictions)
622
863
  '''
623
- print(f"\tModel: {model_name} for Target: {target_id}...")
864
+ print(f"\tTraining model: {model_name} for Target: {target_name}...")
624
865
  trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
625
866
  if debug:
626
867
  print(f"Trained model object: {type(trained_model)}")
@@ -628,52 +869,66 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
628
869
 
629
870
  if save_model:
630
871
  _save_model(trained_model=trained_model, model_name=model_name,
631
- target_name=target_id, feature_names=feature_names,
632
- save_directory=local_save_directory, scaler_object=scaler_object)
872
+ target_name=target_name, feature_names=feature_names,
873
+ save_directory=local_save_directory)
633
874
 
634
875
  if task == "classification":
635
876
  y_pred = evaluate_model_classification(model=trained_model, model_name=model_name, save_dir=local_save_directory,
636
- x_test_scaled=test_features, single_y_test=test_target, target_id=target_id)
877
+ x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
637
878
  plot_roc_curve(true_labels=test_target,
638
879
  probabilities_or_model=trained_model, model_name=model_name,
639
- target_name=target_id, save_directory=local_save_directory,
880
+ target_name=target_name, save_directory=local_save_directory,
640
881
  input_features=test_features)
641
882
  elif task == "regression":
642
883
  y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
643
- x_test_scaled=test_features, single_y_test=test_target, target_id=target_id)
884
+ x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
644
885
  else:
645
886
  raise ValueError(f"Unrecognized task '{task}' for model training,")
646
887
  if debug:
647
888
  print(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
648
889
 
649
890
  get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
650
- features_to_explain=train_features, feature_names=feature_names, target_id=target_id, task=task)
651
- print("\t...done.")
891
+ features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
892
+ # print("\t...done.")
652
893
  return trained_model, y_pred
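
An illustrative single-model call of `train_test_pipeline`, reusing the splits from the `dataset_pipeline` sketch above; the dataset identifier and output directory are placeholders:

clf_models = ClassificationTreeModels()(use_model_balance=False)

trained_model, y_pred = train_test_pipeline(
    model=clf_models["LightGBM"], model_name="LightGBM",
    dataset_id="demo_dataset",               # placeholder, normally the source dataframe's name
    task="classification",
    train_features=X_train, train_target=y_train,
    test_features=X_test, test_target=y_test,
    feature_names=["f1", "f2", "f3"],
    target_name="target",
    save_dir="results",                      # placeholder output directory
    save_model=False)
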
653
894
 
654
895
  ###### 5. Execution ######
655
- def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
656
- resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, scaler: Literal["standard", "minmax", "maxabs"]="minmax", save_model: bool=False,
657
- test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
896
+ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
897
+ handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
898
+ test_size: float=0.2, debug:bool=False):
899
+ #Check models
900
+ if isinstance(model_object, RegressionTreeModels):
901
+ task = "regression"
902
+ elif isinstance(model_object, ClassificationTreeModels):
903
+ task = "classification"
904
+ if handle_classification_imbalance is None:
905
+ print("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
906
+ elif handle_classification_imbalance == "by_model":
907
+ model_object.use_model_balance = True
908
+ else:
909
+ model_object.use_model_balance = False
910
+ else:
911
+ raise TypeError(f"Unrecognized model {type(model_object)}")
912
+
658
913
  #Check paths
659
914
  _check_paths(datasets_dir, save_dir)
915
+
660
916
  #Yield imputed dataset
661
917
  for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
662
918
  #Yield features dataframe and target dataframe
663
919
  for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
664
920
  #Dataset pipeline
665
- X_train, y_train, X_test, y_test, scaler_object = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
666
- resample_strategy=resample_strategy, scaler=scaler,
667
- test_size=test_size, debug=debug, random_state=random_state)
921
+ X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
922
+ resample_strategy=handle_classification_imbalance,
923
+ test_size=test_size, debug=debug, random_state=model_object.random_state)
668
924
  #Get models
669
- models_dict = get_models(task=task, is_balanced=False if resample_strategy is None else True,
670
- L1_regularization=L1_regularization, L2_regularization=L2_regularization, learning_rate=learning_rate)
925
+ models_dict = model_object()
671
926
  #Train models
672
927
  for model_name, model in models_dict.items():
673
928
  train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
674
929
  train_features=X_train, train_target=y_train, # type: ignore
675
930
  test_features=X_test, test_target=y_test,
676
- feature_names=feature_names,target_id=target_name, scaler_object=scaler_object,
931
+ feature_names=feature_names,target_name=target_name,
677
932
  debug=debug, save_dir=save_dir, save_model=save_model)
678
933
  print("\n✅ Training and evaluation complete.")
679
934
 
@@ -683,3 +938,7 @@ def _check_paths(datasets_dir: str, save_dir:str):
683
938
  os.makedirs(save_dir)
684
939
  if not os.path.isdir(datasets_dir):
685
940
  raise IOError(f"Datasets directory '{datasets_dir}' not found.")
941
+
942
+
943
+ def info():
944
+ _script_info(__all__)