dragon-ml-toolbox 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ from matplotlib.colors import Colormap
6
6
  from matplotlib import rcdefaults
7
7
 
8
8
  import os
9
- from typing import Literal, Union, Optional
9
+ from typing import Literal, Union, Optional, Iterator, Tuple
10
10
  import joblib
11
11
 
12
12
  from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -17,11 +17,10 @@ import xgboost as xgb
17
17
  import lightgbm as lgb
18
18
 
19
19
  from sklearn.model_selection import train_test_split
20
- from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
21
20
  from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
22
21
  import shap
23
22
 
24
- from .utilities import yield_dataframes_from_dir, sanitize_filename
23
+ from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
25
24
 
26
25
  import warnings # Ignore warnings
27
26
  warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -29,113 +28,377 @@ warnings.filterwarnings('ignore', category=FutureWarning)
29
28
  warnings.filterwarnings('ignore', category=UserWarning)
30
29
 
31
30
 
31
+ __all__ = [
32
+ "dataset_yielder",
33
+ "RegressionTreeModels",
34
+ "ClassificationTreeModels",
35
+ "dataset_pipeline",
36
+ "evaluate_model_classification",
37
+ "plot_roc_curve",
38
+ "evaluate_model_regression",
39
+ "get_shap_values",
40
+ "train_test_pipeline",
41
+ "run_ensemble_pipeline"
42
+ ]
43
+
44
+ ## Type aliases
45
+ HandleImbalanceStrategy = Literal[
46
+ "ADASYN", "SMOTE", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", "by_model", None
47
+ ]
48
+
49
+ TaskType = Literal[
50
+ "classification", "regression"
51
+ ]
52
+
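
As a quick illustration of the new type aliases (within this module's context), the hypothetical helper `plan_run` below is not part of the package; it only shows how a static type checker constrains arguments to the enumerated literal values:

def plan_run(task: TaskType, strategy: HandleImbalanceStrategy = None) -> str:
    # Both parameters only accept the literal values enumerated in the aliases above.
    return f"task={task}, imbalance_strategy={strategy}"

plan_run("regression")                    # OK
plan_run("classification", "by_model")    # OK
# plan_run("clustering")                  # rejected by a static type checker
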
32
53
  ###### 1. Dataset Loader ######
33
- #Split a dataset into features and targets datasets
34
- def dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
35
- '''
36
- Yields one Tuple at a time: `(df_features, df_target, feature_names, target_name)`
37
- '''
38
- df_features = df.drop(columns=target_cols)
54
+ def dataset_yielder(
55
+ df: pd.DataFrame,
56
+ target_cols: list[str]
57
+ ) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
58
+ """
59
+ Yields one tuple at a time:
60
+ (features_dataframe, target_series, feature_names, target_name)
61
+
62
+ Skips any target columns not found in the DataFrame.
63
+ """
64
+ # Determine which target columns actually exist in the DataFrame
65
+ valid_targets = [col for col in target_cols if col in df.columns]
66
+
67
+ # Features = all columns excluding valid target columns
68
+ df_features = df.drop(columns=valid_targets)
39
69
  feature_names = df_features.columns.to_list()
40
-
41
- for target_col in target_cols:
70
+
71
+ for target_col in valid_targets:
42
72
  df_target = df[target_col]
43
73
  yield (df_features, df_target, feature_names, target_col)
44
74
 
75
+
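
An illustrative run of the reworked `dataset_yielder` on a toy DataFrame, assuming the function is imported from this module; note that the missing target column "w" is now skipped instead of raising:

import pandas as pd

df_demo = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "y": [0, 1, 0], "z": [7, 8, 9]})

for X, y, feature_names, target in dataset_yielder(df_demo, target_cols=["y", "z", "w"]):
    # "y" and "z" are yielded one at a time; features are always ["a", "b"]; "w" is skipped.
    print(target, feature_names, X.shape, y.shape)
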
45
76
  ###### 2. Initialize Models ######
46
- def get_models(task: Literal["classification", "regression"], random_state: int=101, is_balanced: bool = True,
47
- L1_regularization: float = 1.0, L2_regularization: float = 1.0, learning_rate: float=0.005) -> dict:
48
- '''
49
- Returns a dictionary `{Model_Name: Model}` with new instances of models.
50
- Valid tasks: "classification" or "regression".
77
+ class RegressionTreeModels:
78
+ """
79
+ A factory class for creating and configuring multiple gradient boosting regression models
80
+ with unified hyperparameters. This includes XGBoost, LightGBM, and HistGradientBoostingRegressor.
51
81
 
52
- Classification Models:
53
- - "XGBoost" - XGBClassifier
54
- - "LightGBM" - LGBMClassifier
55
- - "HistGB" - HistGradientBoostingClassifier
56
- Regression Models:
57
- - "XGBoost" - XGBRegressor
58
- - "LightGBM" - LGBMRegressor
59
- - "HistGB" - HistGradientBoostingRegressor
60
-
61
- For classification only: Set `is_balanced=False` for imbalanced datasets.
82
+ Call the instance directly (the `__call__()` method) to get a fresh dictionary of configured models.
83
+
84
+ Parameters
85
+ ----------
86
+ random_state : int
87
+ Seed used by the random number generator.
88
+
89
+ learning_rate : float [0.001 - 0.300]
90
+ Boosting learning rate (shrinkage).
62
91
 
63
- Increase L1 and L2 if model is overfitting
64
- '''
92
+ L1_regularization : float [0.0 - 10.0]
93
+ L1 regularization term (alpha); may drive some weights to sparsity.
94
+
95
+ L2_regularization : float [0.0 - 10.0]
96
+ L2 regularization term (lambda).
97
+
98
+ n_estimators : int [100 - 3000]
99
+ Number of boosting iterations for XGBoost and LightGBM.
100
+
101
+ max_depth : int [3 - 15]
102
+ Maximum depth of individual trees. Controls model complexity; high values may overfit.
103
+
104
+ subsample : float [0.5 - 1.0]
105
+ Fraction of rows per tree; used to prevent overfitting.
106
+
107
+ colsample_bytree : float [0.3 - 1.0]
108
+ Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
109
+
110
+ min_samples_leaf : int [10 - 100]
111
+ Minimum samples per leaf; higher = less overfitting (used in HistGB).
112
+
113
+ max_iter : int [100 - 2000]
114
+ Maximum number of iterations (used in HistGB).
115
+
116
+ min_child_weight : float [0.1 - 10.0]
117
+ Minimum sum of instance weight (hessian) needed in a child; larger values make the algorithm more conservative (used in XGBoost).
118
+
119
+ gamma : float [0.0 - 5.0]
120
+ Minimum loss reduction required to make a further partition on a leaf node; higher = more regularization (used in XGBoost).
121
+
122
+ num_leaves : int [20 - 200]
123
+ Maximum number of leaves in one tree; should be less than 2^(max_depth); larger = more complex (used in LightGBM).
124
+
125
+ min_data_in_leaf : int [10 - 100]
126
+ Minimum number of data points in a leaf; increasing may prevent overfitting (used in LightGBM).
127
+ """
128
+ def __init__(self,
129
+ random_state: int = 101,
130
+ learning_rate: float = 0.005,
131
+ L1_regularization: float = 1.0,
132
+ L2_regularization: float = 1.0,
133
+ n_estimators: int = 1000,
134
+ max_depth: int = 8,
135
+ subsample: float = 0.8,
136
+ colsample_bytree: float = 0.8,
137
+ min_samples_leaf: int = 50,
138
+ max_iter: int = 1000,
139
+ min_child_weight: float = 3.0,
140
+ gamma: float = 1.0,
141
+ num_leaves: int = 31,
142
+ min_data_in_leaf: int = 40):
143
+ # General config
144
+ self.random_state = random_state
145
+ self.lr = learning_rate
146
+ self.L1 = L1_regularization
147
+ self.L2 = L2_regularization
148
+
149
+ # Shared tree structure
150
+ self.n_estimators = n_estimators
151
+ self.max_depth = max_depth
152
+ self.subsample = subsample
153
+ self.colsample_bytree = colsample_bytree
154
+
155
+ # XGBoost specific
156
+ self.min_child_weight = min_child_weight
157
+ self.gamma = gamma
158
+
159
+ # LightGBM specific
160
+ if num_leaves >= (2**max_depth):
161
+ num_leaves = (2**max_depth) - 1
162
+ print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
163
+ self.num_leaves = num_leaves
164
+ self.min_data_in_leaf = min_data_in_leaf
165
+
166
+ # HistGB specific
167
+ self.max_iter = max_iter
168
+ self.min_samples_leaf = min_samples_leaf
169
+
170
+ def __call__(self) -> dict[str, object]:
171
+ """
172
+ Returns a dictionary with new instances of:
173
+ - "XGBoost": XGBRegressor
174
+ - "LightGBM": LGBMRegressor
175
+ - "HistGB": HistGradientBoostingRegressor
176
+ """
177
+ # XGBoost Regressor
178
+ xgb_model = xgb.XGBRegressor(
179
+ n_estimators=self.n_estimators,
180
+ max_depth=self.max_depth,
181
+ learning_rate=self.lr,
182
+ subsample=self.subsample,
183
+ colsample_bytree=self.colsample_bytree,
184
+ random_state=self.random_state,
185
+ reg_alpha=self.L1,
186
+ reg_lambda=self.L2,
187
+ eval_metric='rmse',
188
+ min_child_weight=self.min_child_weight,
189
+ gamma=self.gamma,
190
+ tree_method='hist',
191
+ grow_policy='lossguide'
192
+ )
193
+
194
+ # LightGBM Regressor
195
+ lgb_model = lgb.LGBMRegressor(
196
+ n_estimators=self.n_estimators,
197
+ learning_rate=self.lr,
198
+ max_depth=self.max_depth,
199
+ subsample=self.subsample,
200
+ colsample_bytree=self.colsample_bytree,
201
+ random_state=self.random_state,
202
+ verbose=-1,
203
+ reg_alpha=self.L1,
204
+ reg_lambda=self.L2,
205
+ boosting_type='dart',
206
+ num_leaves=self.num_leaves,
207
+ min_data_in_leaf=self.min_data_in_leaf
208
+ )
209
+
210
+ # HistGradientBoosting Regressor
211
+ hist_model = HistGradientBoostingRegressor(
212
+ max_iter=self.max_iter,
213
+ learning_rate=self.lr,
214
+ max_depth=self.max_depth,
215
+ min_samples_leaf=self.min_samples_leaf,
216
+ random_state=self.random_state,
217
+ l2_regularization=self.L2,
218
+ scoring='neg_mean_squared_error',
219
+ early_stopping=True,
220
+ validation_fraction=0.1
221
+ )
222
+
223
+ return {
224
+ "XGBoost": xgb_model,
225
+ "LightGBM": lgb_model,
226
+ "HistGB": hist_model
227
+ }
65
228
 
66
- # Model initialization logic
67
- if task not in ["classification", "regression"]:
68
- raise ValueError(f"Invalid task: {task}. Must be 'classification' or 'regression'.")
69
-
70
- models = {}
71
-
72
- # Common parameters
73
- xgb_params = {
74
- 'n_estimators': 200,
75
- 'max_depth': 5,
76
- 'learning_rate': learning_rate,
77
- 'subsample': 0.8,
78
- 'colsample_bytree': 0.8,
79
- 'random_state': random_state,
80
- 'reg_alpha': L1_regularization,
81
- 'reg_lambda': L2_regularization,
82
- }
83
-
84
- lgbm_params = {
85
- 'n_estimators': 200,
86
- 'learning_rate': learning_rate,
87
- 'max_depth': 5,
88
- 'subsample': 0.8,
89
- 'colsample_bytree': 0.8,
90
- 'random_state': random_state,
91
- 'verbose': -1,
92
- 'reg_alpha': L1_regularization,
93
- 'reg_lambda': L2_regularization,
94
- }
95
-
96
- hist_params = {
97
- 'max_iter': 200,
98
- 'learning_rate': learning_rate,
99
- 'max_depth': 5,
100
- 'min_samples_leaf': 30,
101
- 'random_state': random_state,
102
- 'l2_regularization': L2_regularization,
103
- }
104
-
105
- # XGB Model
106
- if task == "classification":
107
- xgb_params.update({
108
- 'scale_pos_weight': 1 if is_balanced else 8,
109
- 'eval_metric': 'aucpr'
110
- })
111
- models["XGBoost"] = xgb.XGBClassifier(**xgb_params)
112
- else:
113
- xgb_params.update({'eval_metric': 'rmse'})
114
- models["XGBoost"] = xgb.XGBRegressor(**xgb_params)
229
+ def __str__(self):
230
+ return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"
115
231
 
116
- # LGBM Model
117
- if task == "classification":
118
- lgbm_params.update({
119
- 'class_weight': None if is_balanced else 'balanced',
120
- 'boosting_type': 'goss' if is_balanced else 'dart',
121
- })
122
- models["LightGBM"] = lgb.LGBMClassifier(**lgbm_params)
123
- else:
124
- lgbm_params['boosting_type'] = 'dart'
125
- models["LightGBM"] = lgb.LGBMRegressor(**lgbm_params)
126
232
 
127
- # HistGB Model
128
- if task == "classification":
129
- hist_params.update({
130
- 'class_weight': None if is_balanced else 'balanced',
131
- 'scoring': 'loss' if is_balanced else 'balanced_accuracy',
132
- })
133
- models["HistGB"] = HistGradientBoostingClassifier(**hist_params)
134
- else:
135
- hist_params['scoring'] = 'neg_mean_squared_error'
136
- models["HistGB"] = HistGradientBoostingRegressor(**hist_params)
233
+ class ClassificationTreeModels:
234
+ """
235
+ A factory class for creating and configuring multiple gradient boosting classification models
236
+ with unified hyperparameters. This includes: XGBoost, LightGBM, and HistGradientBoostingClassifier.
237
+
238
+ Call the instance directly (the `__call__()` method) to get a fresh dictionary of configured models; the optional `use_model_balance` argument overrides the balancing behavior.
239
+
240
+ Parameters
241
+ ----------
242
+ random_state : int
243
+ Seed used by the random number generator to ensure reproducibility.
244
+
245
+ learning_rate : float [0.001 - 0.300]
246
+ Boosting learning rate (shrinkage factor).
247
+
248
+ L1_regularization : float [0.0 - 10.0]
249
+ L1 regularization term (alpha); may drive some weights to sparsity.
250
+
251
+ L2_regularization : float [0.0 - 10.0]
252
+ L2 regularization term (lambda).
253
+
254
+ n_estimators : int [100 - 3000]
255
+ Number of boosting rounds for XGBoost and LightGBM.
256
+
257
+ max_depth : int [3 - 15]
258
+ Maximum depth of individual trees in the ensemble. Controls model complexity; high values may overfit.
259
+
260
+ subsample : float [0.5 - 1.0]
261
+ Fraction of samples to use when fitting base learners; used to prevent overfitting.
262
+
263
+ colsample_bytree : float [0.3 - 1.0]
264
+ Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
265
+
266
+ min_samples_leaf : int [10 - 100]
267
+ Minimum number of samples required to be at a leaf node; higher = less overfitting (used in HistGB).
268
+
269
+ max_iter : int [100 - 2000]
270
+ Maximum number of boosting iterations (used in HistGB).
271
+
272
+ min_child_weight : float [0.1 - 10.0]
273
+ Minimum sum of instance weight (Hessian) in a child node; larger values make the algorithm more conservative (used in XGBoost).
274
+
275
+ gamma : float [0.0 - 5.0]
276
+ Minimum loss reduction required to make a further partition; higher = more regularization (used in XGBoost).
277
+
278
+ num_leaves : int [20 - 200]
279
+ Maximum number of leaves in one tree. Should be less than 2^(max_depth); larger = more complex (used in LightGBM).
280
+
281
+ min_data_in_leaf : int [10 - 100]
282
+ Minimum number of samples required in a leaf; increasing may prevent overfitting (used in LightGBM).
283
+
284
+ Attributes
285
+ ----------
286
+ use_model_balance : bool
287
+ Indicates whether to apply class balancing strategies internally. Can be overridden at runtime via the `__call__` method.
288
+ """
289
+ def __init__(self,
290
+ random_state: int = 101,
291
+ learning_rate: float = 0.005,
292
+ L1_regularization: float = 1.0,
293
+ L2_regularization: float = 1.0,
294
+ n_estimators: int = 1000,
295
+ max_depth: int = 8,
296
+ subsample: float = 0.8,
297
+ colsample_bytree: float = 0.8,
298
+ min_samples_leaf: int = 50,
299
+ max_iter: int = 1000,
300
+ min_child_weight: float = 3.0,
301
+ gamma: float = 1.0,
302
+ num_leaves: int = 31,
303
+ min_data_in_leaf: int = 40):
304
+ # General config
305
+ self.random_state = random_state
306
+ self.lr = learning_rate
307
+ self.L1 = L1_regularization
308
+ self.L2 = L2_regularization
309
+
310
+ # To be set by the pipeline
311
+ self.use_model_balance: bool = True
312
+
313
+ # Shared tree structure
314
+ self.n_estimators = n_estimators
315
+ self.max_depth = max_depth
316
+ self.subsample = subsample
317
+ self.colsample_bytree = colsample_bytree
318
+
319
+ # XGBoost specific
320
+ self.min_child_weight = min_child_weight
321
+ self.gamma = gamma
322
+
323
+ # LightGBM specific
324
+ if num_leaves >= (2**max_depth):
325
+ num_leaves = (2**max_depth) - 1
326
+ print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
327
+ self.num_leaves = num_leaves
328
+ self.min_data_in_leaf = min_data_in_leaf
329
+
330
+ # HistGB specific
331
+ self.max_iter = max_iter
332
+ self.min_samples_leaf = min_samples_leaf
333
+
334
+ def __call__(self, use_model_balance: Optional[bool]=None) -> dict[str, object]:
335
+ """
336
+ Returns a dictionary with new instances of:
337
+ - "XGBoost": XGBClassifier
338
+ - "LightGBM": LGBMClassifier
339
+ - "HistGB": HistGradientBoostingClassifier
340
+ """
341
+ if use_model_balance is not None:
342
+ self.use_model_balance = use_model_balance
343
+
344
+ # XGBoost Classifier
345
+ xgb_model = xgb.XGBClassifier(
346
+ n_estimators=self.n_estimators,
347
+ max_depth=self.max_depth,
348
+ learning_rate=self.lr,
349
+ subsample=self.subsample,
350
+ colsample_bytree=self.colsample_bytree,
351
+ random_state=self.random_state,
352
+ reg_alpha=self.L1,
353
+ reg_lambda=self.L2,
354
+ eval_metric='aucpr',
355
+ min_child_weight=self.min_child_weight,
356
+ gamma=self.gamma,
357
+ tree_method='hist',
358
+ grow_policy='lossguide',
359
+ scale_pos_weight=8.0 if self.use_model_balance else 1.0
360
+ )
361
+
362
+ # LightGBM Classifier
363
+ lgb_model = lgb.LGBMClassifier(
364
+ n_estimators=self.n_estimators,
365
+ learning_rate=self.lr,
366
+ max_depth=self.max_depth,
367
+ subsample=self.subsample,
368
+ colsample_bytree=self.colsample_bytree,
369
+ random_state=self.random_state,
370
+ verbose=-1,
371
+ reg_alpha=self.L1,
372
+ reg_lambda=self.L2,
373
+ boosting_type='dart' if self.use_model_balance else 'goss',
374
+ num_leaves=self.num_leaves,
375
+ min_data_in_leaf=self.min_data_in_leaf,
376
+ class_weight='balanced' if self.use_model_balance else None
377
+ )
378
+
379
+ # HistGradientBoosting Classifier
380
+ hist_model = HistGradientBoostingClassifier(
381
+ max_iter=self.max_iter,
382
+ learning_rate=self.lr,
383
+ max_depth=self.max_depth,
384
+ min_samples_leaf=self.min_samples_leaf,
385
+ random_state=self.random_state,
386
+ l2_regularization=self.L2,
387
+ early_stopping=True,
388
+ validation_fraction=0.1,
389
+ class_weight='balanced' if self.use_model_balance else None,
390
+ scoring='balanced_accuracy' if self.use_model_balance else 'loss'
391
+ )
392
+
393
+ return {
394
+ "XGBoost": xgb_model,
395
+ "LightGBM": lgb_model,
396
+ "HistGB": hist_model
397
+ }
398
+
399
+ def __str__(self):
400
+ return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"
137
401
 
138
- return models
139
402
 
140
403
  ###### 3. Process Dataset ######
141
404
  # function to split data into train and test
@@ -144,23 +407,9 @@ def _split_data(features, target, test_size, random_state, task):
144
407
  stratify=target if task=="classification" else None)
145
408
  return X_train, X_test, y_train, y_test
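
For context, the split above is stratified on the target only for classification; a self-contained sketch of the same `train_test_split` call on synthetic data:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

features = pd.DataFrame(np.arange(40).reshape(20, 2), columns=["f1", "f2"])
target = pd.Series([0] * 15 + [1] * 5)
task = "classification"

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=101,
    stratify=target if task == "classification" else None)   # preserves the 3:1 class ratio in both splits
print(y_train.value_counts().to_dict(), y_test.value_counts().to_dict())
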
146
409
 
147
- # function to standardize the data
148
- def _standardize_data(train_features, test_features, scaler_code):
149
- if scaler_code == "standard":
150
- scaler = StandardScaler()
151
- elif scaler_code == "minmax":
152
- scaler = MinMaxScaler()
153
- elif scaler_code == "maxabs":
154
- scaler = MaxAbsScaler()
155
- else:
156
- raise ValueError(f"Unrecognized scaler {scaler_code}")
157
- train_scaled = scaler.fit_transform(train_features)
158
- test_scaled = scaler.transform(test_features)
159
- return train_scaled, test_scaled, scaler
160
-
161
410
  # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
162
- def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
163
- strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], random_state):
411
+ def _resample(X_train: np.ndarray, y_train: pd.Series,
412
+ strategy: HandleImbalanceStrategy, random_state):
164
413
  '''
165
414
  Oversample minority class or undersample majority class.
166
415
 
@@ -168,30 +417,29 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
168
417
  '''
169
418
  if strategy == 'SMOTE':
170
419
  resample_algorithm = SMOTE(random_state=random_state, k_neighbors=3)
171
- elif strategy == 'RANDOM':
420
+ elif strategy == 'RAND_OVERSAMPLE':
172
421
  resample_algorithm = RandomOverSampler(random_state=random_state)
173
- elif strategy == 'UNDERSAMPLE':
422
+ elif strategy == 'RAND_UNDERSAMPLE':
174
423
  resample_algorithm = RandomUnderSampler(random_state=random_state)
175
424
  elif strategy == 'ADASYN':
176
425
  resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)
177
426
  else:
178
427
  raise ValueError(f"Invalid resampling strategy: {strategy}")
179
428
 
180
- X_res, y_res, *_ = resample_algorithm.fit_resample(X_train_scaled, y_train)
429
+ X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
181
430
  return X_res, y_res
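
For illustration only (synthetic data, private helper), resampling a small imbalanced training set with one of the new strategy names:

import numpy as np
import pandas as pd

X_demo = np.random.default_rng(0).normal(size=(25, 3))    # 25 rows, 3 features
y_demo = pd.Series([0] * 20 + [1] * 5)                    # imbalanced 20:5

X_res, y_res = _resample(X_train=X_demo, y_train=y_demo, strategy="RAND_OVERSAMPLE", random_state=101)
print(X_res.shape, y_res.value_counts().to_dict())        # minority class oversampled to 20:20
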
182
431
 
183
432
  # DATASET PIPELINE
184
- def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Literal["classification", "regression"],
185
- resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None], scaler: Literal["standard", "minmax", "maxabs"],
433
+ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: TaskType,
434
+ resample_strategy: HandleImbalanceStrategy,
186
435
  test_size: float=0.2, debug: bool=False, random_state: int=101):
187
436
  '''
188
437
  1. Make Train/Test splits
189
- 2. Standardize Train and Test Features
190
- 3. Oversample imbalanced classes (classification)
438
+ 2. Oversample imbalanced classes (classification)
191
439
 
192
- Return a processed Tuple: (X_train, y_train, X_test, y_test, Scaler)
440
+ Return a processed Tuple: (X_train, y_train, X_test, y_test)
193
441
 
194
- `(nD-array, 1D-array, nD-array, Series, Scaler)`
442
+ `(nD-array, 1D-array, nD-array, Series)`
195
443
  '''
196
444
  #DEBUG
197
445
  if debug:
@@ -206,24 +454,18 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
206
454
  if debug:
207
455
  print(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
208
456
 
209
- # Standardize
210
- X_train_scaled, X_test_scaled, scaler_object = _standardize_data(train_features=X_train, test_features=X_test, scaler_code=scaler)
211
-
212
- #DEBUG
213
- if debug:
214
- print(f"Shapes after scaling features - X_train: {X_train_scaled.shape}, y_train: {y_train.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
215
457
 
216
- # Scale
217
- if resample_strategy is None or task == "regression":
218
- X_train_oversampled, y_train_oversampled = X_train_scaled, y_train
458
+ # Resample
459
+ if resample_strategy is None or resample_strategy == "by_model" or task == "regression":
460
+ X_train_oversampled, y_train_oversampled = X_train, y_train
219
461
  else:
220
- X_train_oversampled, y_train_oversampled = _resample(X_train_scaled=X_train_scaled, y_train=y_train, strategy=resample_strategy, random_state=random_state)
462
+ X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)
221
463
 
222
464
  #DEBUG
223
465
  if debug:
224
- print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
466
+ print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
225
467
 
226
- return X_train_oversampled, y_train_oversampled, X_test_scaled, y_test, scaler_object
468
+ return X_train_oversampled, y_train_oversampled, X_test, y_test
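
A usage sketch of the slimmed-down pipeline (feature scaling is no longer performed here) on a synthetic classification dataset; the function name comes from this diff, the data are made up:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df_X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f1", "f2", "f3"])
s_y = ((df_X["f1"] + rng.normal(scale=0.5, size=100)) > 0).astype(int).rename("target")

X_train, y_train, X_test, y_test = dataset_pipeline(
    df_features=df_X, df_target=s_y, task="classification",
    resample_strategy="RAND_OVERSAMPLE",    # or None / "by_model" to leave the training split untouched
    test_size=0.2, random_state=101)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
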
227
469
 
228
470
  ###### 4. Train and Evaluation ######
229
471
  # Trainer function
@@ -244,11 +486,11 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
244
486
  return model_dir
245
487
 
246
488
  # save model
247
- def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler]):
489
+ def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
248
490
  #Sanitize filenames to save
249
491
  sanitized_target_name = sanitize_filename(target_name)
250
492
  full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
251
- joblib.dump({'model': trained_model, 'scaler':scaler_object, 'feature_names': feature_names, 'target_name':target_name}, full_path)
493
+ joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)
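
Since the scaler is no longer bundled, a saved artifact now holds only the model, feature names, and target name. A hedged sketch of loading one back (the file name below is hypothetical but follows the naming pattern above):

import joblib

artifact = joblib.load("XGBoost_target.joblib")   # hypothetical path produced by _save_model
model = artifact["model"]
feature_names = artifact["feature_names"]
target_name = artifact["target_name"]
print(target_name, len(feature_names), type(model).__name__)
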
252
494
 
253
495
  # function to evaluate the model and save metrics (Classification)
254
496
  def evaluate_model_classification(
@@ -257,10 +499,9 @@ def evaluate_model_classification(
257
499
  save_dir: str,
258
500
  x_test_scaled: np.ndarray,
259
501
  single_y_test: np.ndarray,
260
- target_id: str,
502
+ target_name: str,
261
503
  figsize: tuple = (10, 8),
262
- title_fontsize: int = 24,
263
- label_fontsize: int = 24,
504
+ base_fontsize: int = 24,
264
505
  cmap: Colormap = plt.cm.Blues # type: ignore
265
506
  ) -> np.ndarray:
266
507
  """
@@ -271,8 +512,8 @@ def evaluate_model_classification(
271
512
  model_name: Identifier for the model
272
513
  save_dir: Directory where results are saved
273
514
  x_test_scaled: Feature matrix for test set
274
- single_y_test: True binary labels
275
- target_id: Suffix for naming output files
515
+ single_y_test: True targets
516
+ target_name: Name of the target column; used in report and figure filenames
276
517
  figsize: Size of the confusion matrix figure (width, height)
277
518
  fontsize: Font size used for title, axis labels and ticks
278
519
  cmap: Color map for the confusion matrix. Examples include:
@@ -300,10 +541,10 @@ def evaluate_model_classification(
300
541
  )
301
542
 
302
543
  # Save text report
303
- sanitized_target_id = sanitize_filename(target_id)
304
- report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_id}.txt")
544
+ sanitized_target_name = sanitize_filename(target_name)
545
+ report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
305
546
  with open(report_path, "w") as f:
306
- f.write(f"{model_name} - {target_id}\t\tAccuracy: {accuracy:.2f}\n")
547
+ f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
307
548
  f.write("Classification Report:\n")
308
549
  f.write(report) # type: ignore
309
550
 
@@ -318,20 +559,20 @@ def evaluate_model_classification(
318
559
  ax=ax
319
560
  )
320
561
 
321
- ax.set_title(f"{model_name} - {target_id}", fontsize=title_fontsize)
322
- ax.tick_params(axis='both', labelsize=label_fontsize)
323
- ax.set_xlabel("Predicted label", fontsize=label_fontsize)
324
- ax.set_ylabel("True label", fontsize=label_fontsize)
562
+ ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
563
+ ax.tick_params(axis='both', labelsize=base_fontsize)
564
+ ax.set_xlabel("Predicted label", fontsize=base_fontsize)
565
+ ax.set_ylabel("True label", fontsize=base_fontsize)
325
566
 
326
567
  # Turn off gridlines
327
568
  ax.grid(False)
328
569
 
329
570
  # Manually update font size of cell texts
330
571
  for text in ax.texts:
331
- text.set_fontsize(title_fontsize+4)
572
+ text.set_fontsize(base_fontsize+4)
332
573
 
333
574
  fig.tight_layout()
334
- fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_id}.svg")
575
+ fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
335
576
  fig.savefig(fig_path, format="svg", bbox_inches="tight")
336
577
  plt.close(fig)
337
578
 
@@ -356,7 +597,7 @@ def plot_roc_curve(
356
597
  Parameters:
357
598
  true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
358
599
  probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
359
- target_name: str, used for figure title and filename.
600
+ target_name: str, name of the target variable; used for the figure title and output filename.
360
601
  save_directory: str, path to directory where figure is saved.
361
602
  color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
362
603
  - Named colors: "darkorange", "blue", "red", "green", "black"
@@ -425,7 +666,7 @@ def plot_roc_curve(
425
666
  def evaluate_model_regression(model, model_name: str,
426
667
  save_dir: str,
427
668
  x_test_scaled: np.ndarray, single_y_test: np.ndarray,
428
- target_id: str,
669
+ target_name: str,
429
670
  figure_size: tuple = (12, 8),
430
671
  alpha_transparency: float = 0.5,
431
672
  base_fontsize: int = 24):
@@ -439,10 +680,10 @@ def evaluate_model_regression(model, model_name: str,
439
680
  r2 = r2_score(single_y_test, y_pred)
440
681
 
441
682
  # Create formatted report
442
- sanitized_target_id = sanitize_filename(target_id)
443
- report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_id}.txt")
683
+ sanitized_target_name = sanitize_filename(target_name)
684
+ report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
444
685
  with open(report_path, "w") as f:
445
- f.write(f"{model_name} - {target_id} Regression Performance\n")
686
+ f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
446
687
  f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
447
688
  f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
448
689
  f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
@@ -455,10 +696,10 @@ def evaluate_model_regression(model, model_name: str,
455
696
  plt.axhline(0, color='red', linestyle='--')
456
697
  plt.xlabel("Predicted Values", fontsize=base_fontsize)
457
698
  plt.ylabel("Residuals", fontsize=base_fontsize)
458
- plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
699
+ plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
459
700
  plt.grid(True)
460
701
  plt.tight_layout()
461
- plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_id}.svg"), bbox_inches='tight', format="svg")
702
+ plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
462
703
  plt.close()
463
704
 
464
705
  # Create true vs predicted values plot
@@ -469,9 +710,9 @@ def evaluate_model_regression(model, model_name: str,
469
710
  'k--', lw=2)
470
711
  plt.xlabel('True Values', fontsize=base_fontsize)
471
712
  plt.ylabel('Predictions', fontsize=base_fontsize)
472
- plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
713
+ plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
473
714
  plt.grid(True)
474
- plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_id}.svg")
715
+ plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
475
716
  plt.savefig(plot_path, bbox_inches='tight', format="svg")
476
717
  plt.close()
477
718
 
@@ -485,7 +726,7 @@ def get_shap_values(
485
726
  save_dir: str,
486
727
  features_to_explain: np.ndarray,
487
728
  feature_names: list[str],
488
- target_id: str,
729
+ target_name: str,
489
730
  task: Literal["classification", "regression"],
490
731
  max_display_features: int = 10,
491
732
  figsize: tuple = (16, 20),
@@ -504,7 +745,7 @@ def get_shap_values(
504
745
  features_to_explain: Should match the model's training data format, including scaling.
505
746
  save_dir: Directory to save visualizations
506
747
  """
507
- sanitized_target_id = sanitize_filename(target_id)
748
+ sanitized_target_name = sanitize_filename(target_name)
508
749
 
509
750
  def _apply_plot_style():
510
751
  styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -567,9 +808,9 @@ def get_shap_values(
567
808
  _create_shap_plot(
568
809
  shap_values=class_shap,
569
810
  features=features_to_explain,
570
- save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_Class{class_name}_{plot_type}.svg"),
811
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
571
812
  plot_type=plot_type,
572
- title=f"{model_name} - {target_id} (Class {class_name})"
813
+ title=f"{model_name} - {target_name} (Class {class_name})"
573
814
  )
574
815
  else:
575
816
  values = shap_values[1] if isinstance(shap_values, list) else shap_values
@@ -577,9 +818,9 @@ def get_shap_values(
577
818
  _create_shap_plot(
578
819
  shap_values=values,
579
820
  features=features_to_explain,
580
- save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
821
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
581
822
  plot_type=plot_type,
582
- title=f"{model_name} - {target_id}"
823
+ title=f"{model_name} - {target_name}"
583
824
  )
584
825
 
585
826
  def _plot_for_regression(shap_values):
@@ -587,9 +828,9 @@ def get_shap_values(
587
828
  _create_shap_plot(
588
829
  shap_values=shap_values,
589
830
  features=features_to_explain,
590
- save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
831
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
591
832
  plot_type=plot_type,
592
- title=f"{model_name} - {target_id}"
833
+ title=f"{model_name} - {target_name}"
593
834
  )
594
835
  #START_O
595
836
 
@@ -607,10 +848,10 @@ def get_shap_values(
607
848
 
608
849
 
609
850
  # TRAIN TEST PIPELINE
610
- def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
851
+ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
611
852
  train_features: np.ndarray, train_target: np.ndarray,
612
853
  test_features: np.ndarray, test_target: np.ndarray,
613
- feature_names: list[str], target_id: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler],
854
+ feature_names: list[str], target_name: str,
614
855
  save_dir: str,
615
856
  debug: bool=False, save_model: bool=False):
616
857
  '''
@@ -620,7 +861,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
620
861
 
621
862
  Returns: Tuple(Trained model, Test-set Predictions)
622
863
  '''
623
- print(f"\tModel: {model_name} for Target: {target_id}...")
864
+ print(f"\tTraining model: {model_name} for Target: {target_name}...")
624
865
  trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
625
866
  if debug:
626
867
  print(f"Trained model object: {type(trained_model)}")
@@ -628,52 +869,66 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
628
869
 
629
870
  if save_model:
630
871
  _save_model(trained_model=trained_model, model_name=model_name,
631
- target_name=target_id, feature_names=feature_names,
632
- save_directory=local_save_directory, scaler_object=scaler_object)
872
+ target_name=target_name, feature_names=feature_names,
873
+ save_directory=local_save_directory)
633
874
 
634
875
  if task == "classification":
635
876
  y_pred = evaluate_model_classification(model=trained_model, model_name=model_name, save_dir=local_save_directory,
636
- x_test_scaled=test_features, single_y_test=test_target, target_id=target_id)
877
+ x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
637
878
  plot_roc_curve(true_labels=test_target,
638
879
  probabilities_or_model=trained_model, model_name=model_name,
639
- target_name=target_id, save_directory=local_save_directory,
880
+ target_name=target_name, save_directory=local_save_directory,
640
881
  input_features=test_features)
641
882
  elif task == "regression":
642
883
  y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
643
- x_test_scaled=test_features, single_y_test=test_target, target_id=target_id)
884
+ x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
644
885
  else:
645
886
  raise ValueError(f"Unrecognized task '{task}' for model training,")
646
887
  if debug:
647
888
  print(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
648
889
 
649
890
  get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
650
- features_to_explain=train_features, feature_names=feature_names, target_id=target_id, task=task)
651
- print("\t...done.")
891
+ features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
892
+ # print("\t...done.")
652
893
  return trained_model, y_pred
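
An illustrative single-model call of `train_test_pipeline`, reusing the splits from the `dataset_pipeline` sketch above; the dataset identifier and output directory are placeholders:

clf_models = ClassificationTreeModels()(use_model_balance=False)

trained_model, y_pred = train_test_pipeline(
    model=clf_models["LightGBM"], model_name="LightGBM",
    dataset_id="demo_dataset",               # placeholder, normally the source dataframe's name
    task="classification",
    train_features=X_train, train_target=y_train,
    test_features=X_test, test_target=y_test,
    feature_names=["f1", "f2", "f3"],
    target_name="target",
    save_dir="results",                      # placeholder output directory
    save_model=False)
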
653
894
 
654
895
  ###### 5. Execution ######
655
- def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
656
- resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, scaler: Literal["standard", "minmax", "maxabs"]="minmax", save_model: bool=False,
657
- test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
896
+ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
897
+ handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
898
+ test_size: float=0.2, debug:bool=False):
899
+ #Check models
900
+ if isinstance(model_object, RegressionTreeModels):
901
+ task = "regression"
902
+ elif isinstance(model_object, ClassificationTreeModels):
903
+ task = "classification"
904
+ if handle_classification_imbalance is None:
905
+ print("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
906
+ elif handle_classification_imbalance == "by_model":
907
+ model_object.use_model_balance = True
908
+ else:
909
+ model_object.use_model_balance = False
910
+ else:
911
+ raise TypeError(f"Unrecognized model {type(model_object)}")
912
+
658
913
  #Check paths
659
914
  _check_paths(datasets_dir, save_dir)
915
+
660
916
  #Yield imputed dataset
661
917
  for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
662
918
  #Yield features dataframe and target dataframe
663
919
  for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
664
920
  #Dataset pipeline
665
- X_train, y_train, X_test, y_test, scaler_object = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
666
- resample_strategy=resample_strategy, scaler=scaler,
667
- test_size=test_size, debug=debug, random_state=random_state)
921
+ X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
922
+ resample_strategy=handle_classification_imbalance,
923
+ test_size=test_size, debug=debug, random_state=model_object.random_state)
668
924
  #Get models
669
- models_dict = get_models(task=task, is_balanced=False if resample_strategy is None else True,
670
- L1_regularization=L1_regularization, L2_regularization=L2_regularization, learning_rate=learning_rate)
925
+ models_dict = model_object()
671
926
  #Train models
672
927
  for model_name, model in models_dict.items():
673
928
  train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
674
929
  train_features=X_train, train_target=y_train, # type: ignore
675
930
  test_features=X_test, test_target=y_test,
676
- feature_names=feature_names,target_id=target_name, scaler_object=scaler_object,
931
+ feature_names=feature_names,target_name=target_name,
677
932
  debug=debug, save_dir=save_dir, save_model=save_model)
678
933
  print("\n✅ Training and evaluation complete.")
679
934
 
@@ -683,3 +938,7 @@ def _check_paths(datasets_dir: str, save_dir:str):
683
938
  os.makedirs(save_dir)
684
939
  if not os.path.isdir(datasets_dir):
685
940
  raise IOError(f"Datasets directory '{datasets_dir}' not found.")
941
+
942
+
943
+ def info():
944
+ _script_info(__all__)