dragon-ml-toolbox 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-1.4.3.dist-info/RECORD +19 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/licenses/LICENSE-THIRD-PARTY.md +6 -1
- ml_tools/MICE_imputation.py +22 -14
- ml_tools/data_exploration.py +41 -8
- ml_tools/ensemble_learning.py +446 -187
- ml_tools/particle_swarm_optimization.py +43 -52
- ml_tools/utilities.py +44 -8
- dragon_ml_toolbox-1.4.1.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/top_level.txt +0 -0
ml_tools/ensemble_learning.py
CHANGED
@@ -6,7 +6,7 @@ from matplotlib.colors import Colormap
 from matplotlib import rcdefaults

 import os
-from typing import Literal, Union, Optional
+from typing import Literal, Union, Optional, Iterator, Tuple
 import joblib

 from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

@@ -17,11 +17,10 @@ import xgboost as xgb
 import lightgbm as lgb

 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap

-from .utilities import yield_dataframes_from_dir, sanitize_filename
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info

 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)

@@ -29,113 +28,377 @@ warnings.filterwarnings('ignore', category=FutureWarning)
 warnings.filterwarnings('ignore', category=UserWarning)


+__all__ = [
+    "dataset_yielder",
+    "RegressionTreeModels",
+    "ClassificationTreeModels",
+    "dataset_pipeline",
+    "evaluate_model_classification",
+    "plot_roc_curve",
+    "evaluate_model_regression",
+    "get_shap_values",
+    "train_test_pipeline",
+    "run_ensemble_pipeline"
+]
+
+## Type aliases
+HandleImbalanceStrategy = Literal[
+    "ADASYN", "SMOTE", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", "by_model", None
+]
+
+TaskType = Literal[
+    "classification", "regression"
+]
+
 ###### 1. Dataset Loader ######
-
-
-
-
-
-
+def dataset_yielder(
+    df: pd.DataFrame,
+    target_cols: list[str]
+) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
+    """
+    Yields one tuple at a time:
+    (features_dataframe, target_series, feature_names, target_name)
+
+    Skips any target columns not found in the DataFrame.
+    """
+    # Determine which target columns actually exist in the DataFrame
+    valid_targets = [col for col in target_cols if col in df.columns]
+
+    # Features = all columns excluding valid target columns
+    df_features = df.drop(columns=valid_targets)
     feature_names = df_features.columns.to_list()
-
-    for target_col in
+
+    for target_col in valid_targets:
         df_target = df[target_col]
         yield (df_features, df_target, feature_names, target_col)

+
 ###### 2. Initialize Models ######
-
-
-
-
-    Valid tasks: "classification" or "regression".
+class RegressionTreeModels:
+    """
+    A factory class for creating and configuring multiple gradient boosting regression models
+    with unified hyperparameters. This includes XGBoost, LightGBM, and HistGradientBoostingRegressor.

-
-
-
-
-
-
-
-
-
-    For classification only: Set `is_balanced=False` for imbalanced datasets.
+    Use the `__call__`, `()` method.
+
+    Parameters
+    ----------
+    random_state : int
+        Seed used by the random number generator.
+
+    learning_rate : float [0.001 - 0.300]
+        Boosting learning rate (shrinkage).

-
-
+    L1_regularization : float [0.0 - 10.0]
+        L1 regularization term (alpha). Might drive to sparsity.
+
+    L2_regularization : float [0.0 - 10.0]
+        L2 regularization term (lambda).
+
+    n_estimators : int [100 - 3000]
+        Number of boosting iterations for XGBoost and LightGBM.
+
+    max_depth : int [3 - 15]
+        Maximum depth of individual trees. Controls model complexity; high values may overfit.
+
+    subsample : float [0.5 - 1.0]
+        Fraction of rows per tree; used to prevent overfitting.
+
+    colsample_bytree : float [0.3 - 1.0]
+        Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
+
+    min_samples_leaf : int [10 - 100]
+        Minimum samples per leaf; higher = less overfitting (used in HistGB).
+
+    max_iter : int [100 - 2000]
+        Maximum number of iterations (used in HistGB).
+
+    min_child_weight : float [0.1 - 10.0]
+        Minimum sum of instance weight (hessian) needed in a child; larger values make the algorithm more conservative (used in XGBoost).
+
+    gamma : float [0.0 - 5.0]
+        Minimum loss reduction required to make a further partition on a leaf node; higher = more regularization (used in XGBoost).
+
+    num_leaves : int [20 - 200]
+        Maximum number of leaves in one tree; should be less than 2^(max_depth); larger = more complex (used in LightGBM).
+
+    min_data_in_leaf : int [10 - 100]
+        Minimum number of data points in a leaf; increasing may prevent overfitting (used in LightGBM).
+    """
+    def __init__(self,
+                 random_state: int = 101,
+                 learning_rate: float = 0.005,
+                 L1_regularization: float = 1.0,
+                 L2_regularization: float = 1.0,
+                 n_estimators: int = 1000,
+                 max_depth: int = 8,
+                 subsample: float = 0.8,
+                 colsample_bytree: float = 0.8,
+                 min_samples_leaf: int = 50,
+                 max_iter: int = 1000,
+                 min_child_weight: float = 3.0,
+                 gamma: float = 1.0,
+                 num_leaves: int = 31,
+                 min_data_in_leaf: int = 40):
+        # General config
+        self.random_state = random_state
+        self.lr = learning_rate
+        self.L1 = L1_regularization
+        self.L2 = L2_regularization
+
+        # Shared tree structure
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        # XGBoost specific
+        self.min_child_weight = min_child_weight
+        self.gamma = gamma
+
+        # LightGBM specific
+        if num_leaves >= (2**max_depth):
+            num_leaves = (2**max_depth) - 1
+            print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        self.num_leaves = num_leaves
+        self.min_data_in_leaf = min_data_in_leaf
+
+        # HistGB specific
+        self.max_iter = max_iter
+        self.min_samples_leaf = min_samples_leaf
+
+    def __call__(self) -> dict[str, object]:
+        """
+        Returns a dictionary with new instances of:
+        - "XGBoost": XGBRegressor
+        - "LightGBM": LGBMRegressor
+        - "HistGB": HistGradientBoostingRegressor
+        """
+        # XGBoost Regressor
+        xgb_model = xgb.XGBRegressor(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.lr,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            eval_metric='rmse',
+            min_child_weight=self.min_child_weight,
+            gamma=self.gamma,
+            tree_method='hist',
+            grow_policy='lossguide'
+        )
+
+        # LightGBM Regressor
+        lgb_model = lgb.LGBMRegressor(
+            n_estimators=self.n_estimators,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            verbose=-1,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            boosting_type='dart',
+            num_leaves=self.num_leaves,
+            min_data_in_leaf=self.min_data_in_leaf
+        )
+
+        # HistGradientBoosting Regressor
+        hist_model = HistGradientBoostingRegressor(
+            max_iter=self.max_iter,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            min_samples_leaf=self.min_samples_leaf,
+            random_state=self.random_state,
+            l2_regularization=self.L2,
+            scoring='neg_mean_squared_error',
+            early_stopping=True,
+            validation_fraction=0.1
+        )
+
+        return {
+            "XGBoost": xgb_model,
+            "LightGBM": lgb_model,
+            "HistGB": hist_model
+        }

-
-
-        raise ValueError(f"Invalid task: {task}. Must be 'classification' or 'regression'.")
-
-    models = {}
-
-    # Common parameters
-    xgb_params = {
-        'n_estimators': 200,
-        'max_depth': 5,
-        'learning_rate': learning_rate,
-        'subsample': 0.8,
-        'colsample_bytree': 0.8,
-        'random_state': random_state,
-        'reg_alpha': L1_regularization,
-        'reg_lambda': L2_regularization,
-    }
-
-    lgbm_params = {
-        'n_estimators': 200,
-        'learning_rate': learning_rate,
-        'max_depth': 5,
-        'subsample': 0.8,
-        'colsample_bytree': 0.8,
-        'random_state': random_state,
-        'verbose': -1,
-        'reg_alpha': L1_regularization,
-        'reg_lambda': L2_regularization,
-    }
-
-    hist_params = {
-        'max_iter': 200,
-        'learning_rate': learning_rate,
-        'max_depth': 5,
-        'min_samples_leaf': 30,
-        'random_state': random_state,
-        'l2_regularization': L2_regularization,
-    }
-
-    # XGB Model
-    if task == "classification":
-        xgb_params.update({
-            'scale_pos_weight': 1 if is_balanced else 8,
-            'eval_metric': 'aucpr'
-        })
-        models["XGBoost"] = xgb.XGBClassifier(**xgb_params)
-    else:
-        xgb_params.update({'eval_metric': 'rmse'})
-        models["XGBoost"] = xgb.XGBRegressor(**xgb_params)
+    def __str__(self):
+        return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"

-    # LGBM Model
-    if task == "classification":
-        lgbm_params.update({
-            'class_weight': None if is_balanced else 'balanced',
-            'boosting_type': 'goss' if is_balanced else 'dart',
-        })
-        models["LightGBM"] = lgb.LGBMClassifier(**lgbm_params)
-    else:
-        lgbm_params['boosting_type'] = 'dart'
-        models["LightGBM"] = lgb.LGBMRegressor(**lgbm_params)

-
-
-
-
-
-
-
-
-
-
+class ClassificationTreeModels:
+    """
+    A factory class for creating and configuring multiple gradient boosting classification models
+    with unified hyperparameters. This includes: XGBoost, LightGBM, and HistGradientBoostingClassifier.
+
+    Use the `__call__`, `()` method.
+
+    Parameters
+    ----------
+    random_state : int
+        Seed used by the random number generator to ensure reproducibility.
+
+    learning_rate : float [0.001 - 0.300]
+        Boosting learning rate (shrinkage factor).
+
+    L1_regularization : float [0.0 - 10.0]
+        L1 regularization term (alpha), might drive to sparsity.
+
+    L2_regularization : float [0.0 - 10.0]
+        L2 regularization term (lambda).
+
+    n_estimators : int [100 - 3000]
+        Number of boosting rounds for XGBoost and LightGBM.
+
+    max_depth : int [3 - 15]
+        Maximum depth of individual trees in the ensemble. Controls model complexity; high values may overfit.
+
+    subsample : float [0.5 - 1.0]
+        Fraction of samples to use when fitting base learners; used to prevent overfitting.
+
+    colsample_bytree : float [0.3 - 1.0]
+        Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
+
+    min_samples_leaf : int [10 - 100]
+        Minimum number of samples required to be at a leaf node; higher = less overfitting (used in HistGB).
+
+    max_iter : int [100 - 2000]
+        Maximum number of boosting iteration (used in HistGB).
+
+    min_child_weight : float [0.1 - 10.0]
+        Minimum sum of instance weight (Hessian) in a child node; larger values make the algorithm more conservative (used in XGBoost).
+
+    gamma : float [0.0 - 5.0]
+        Minimum loss reduction required to make a further partition; higher = more regularization (used in XGBoost).
+
+    num_leaves : int [20 - 200]
+        Maximum number of leaves in one tree. Should be less than 2^(max_depth); larger = more complex (used in LightGBM).
+
+    min_data_in_leaf : int [10 -100]
+        Minimum number of samples required in a leaf; increasing may prevent overfitting (used in LightGBM).
+
+    Attributes
+    ----------
+    use_model_balance : bool
+        Indicates whether to apply class balancing strategies internally. Can be overridden at runtime via the `__call__` method.
+    """
+    def __init__(self,
+                 random_state: int = 101,
+                 learning_rate: float = 0.005,
+                 L1_regularization: float = 1.0,
+                 L2_regularization: float = 1.0,
+                 n_estimators: int = 1000,
+                 max_depth: int = 8,
+                 subsample: float = 0.8,
+                 colsample_bytree: float = 0.8,
+                 min_samples_leaf: int = 50,
+                 max_iter: int = 1000,
+                 min_child_weight: float = 3.0,
+                 gamma: float = 1.0,
+                 num_leaves: int = 31,
+                 min_data_in_leaf: int = 40):
+        # General config
+        self.random_state = random_state
+        self.lr = learning_rate
+        self.L1 = L1_regularization
+        self.L2 = L2_regularization
+
+        # To be set by the pipeline
+        self.use_model_balance: bool = True
+
+        # Shared tree structure
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        # XGBoost specific
+        self.min_child_weight = min_child_weight
+        self.gamma = gamma
+
+        # LightGBM specific
+        if num_leaves >= (2**max_depth):
+            num_leaves = (2**max_depth) - 1
+            print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        self.num_leaves = num_leaves
+        self.min_data_in_leaf = min_data_in_leaf
+
+        # HistGB specific
+        self.max_iter = max_iter
+        self.min_samples_leaf = min_samples_leaf
+
+    def __call__(self, use_model_balance: Optional[bool]=None) -> dict[str, object]:
+        """
+        Returns a dictionary with new instances of:
+        - "XGBoost": XGBClassifier
+        - "LightGBM": LGBMClassifier
+        - "HistGB": HistGradientBoostingClassifier
+        """
+        if use_model_balance is not None:
+            self.use_model_balance = use_model_balance
+
+        # XGBoost Classifier
+        xgb_model = xgb.XGBClassifier(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.lr,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            eval_metric='aucpr',
+            min_child_weight=self.min_child_weight,
+            gamma=self.gamma,
+            tree_method='hist',
+            grow_policy='lossguide',
+            scale_pos_weight=8.0 if self.use_model_balance else 1.0
+        )
+
+        # LightGBM Classifier
+        lgb_model = lgb.LGBMClassifier(
+            n_estimators=self.n_estimators,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            verbose=-1,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            boosting_type='dart' if self.use_model_balance else 'goss',
+            num_leaves=self.num_leaves,
+            min_data_in_leaf=self.min_data_in_leaf,
+            class_weight='balanced' if self.use_model_balance else None
+        )
+
+        # HistGradientBoosting Classifier
+        hist_model = HistGradientBoostingClassifier(
+            max_iter=self.max_iter,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            min_samples_leaf=self.min_samples_leaf,
+            random_state=self.random_state,
+            l2_regularization=self.L2,
+            early_stopping=True,
+            validation_fraction=0.1,
+            class_weight='balanced' if self.use_model_balance else None,
+            scoring='balanced_accuracy' if self.use_model_balance else 'loss'
+        )
+
+        return {
+            "XGBoost": xgb_model,
+            "LightGBM": lgb_model,
+            "HistGB": hist_model
+        }
+
+    def __str__(self):
+        return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"

-    return models

 ###### 3. Process Dataset ######
 # function to split data into train and test

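The hunk above replaces the old per-call model builder with two configurable factory classes plus a dataset generator. A minimal usage sketch of the new pieces follows; the toy DataFrame, column names, and the ml_tools.ensemble_learning import path are assumptions for illustration, not part of the diff:

    # Illustrative sketch only; the data and import path are assumptions.
    import pandas as pd
    from ml_tools.ensemble_learning import dataset_yielder, RegressionTreeModels

    df = pd.DataFrame({
        "feat_a": [0.1, 0.5, 0.9, 1.3],
        "feat_b": [10, 20, 30, 40],
        "target_1": [1.0, 2.0, 3.0, 4.0],
    })

    # One (features, target, feature_names, target_name) tuple per valid target;
    # target columns not present in the DataFrame are skipped.
    for X, y, feature_names, target_name in dataset_yielder(df, target_cols=["target_1", "not_a_column"]):
        print(target_name, feature_names, X.shape, y.shape)

    # Configure shared hyperparameters once, then call the factory for fresh instances.
    factory = RegressionTreeModels(learning_rate=0.01, max_depth=6)
    models = factory()   # {"XGBoost": ..., "LightGBM": ..., "HistGB": ...}
    for name, model in models.items():
        print(name, type(model).__name__)

ClassificationTreeModels works the same way, with the extra use_model_balance switch that run_ensemble_pipeline sets from the chosen imbalance strategy.
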
@@ -144,23 +407,9 @@ def _split_data(features, target, test_size, random_state, task):
                                        stratify=target if task=="classification" else None)
     return X_train, X_test, y_train, y_test

-# function to standardize the data
-def _standardize_data(train_features, test_features, scaler_code):
-    if scaler_code == "standard":
-        scaler = StandardScaler()
-    elif scaler_code == "minmax":
-        scaler = MinMaxScaler()
-    elif scaler_code == "maxabs":
-        scaler = MaxAbsScaler()
-    else:
-        raise ValueError(f"Unrecognized scaler {scaler_code}")
-    train_scaled = scaler.fit_transform(train_features)
-    test_scaled = scaler.transform(test_features)
-    return train_scaled, test_scaled, scaler
-
 # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
-def _resample(
-              strategy:
+def _resample(X_train: np.ndarray, y_train: pd.Series,
+              strategy: HandleImbalanceStrategy, random_state):
     '''
     Oversample minority class or undersample majority class.

@@ -168,30 +417,29 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
     '''
     if strategy == 'SMOTE':
         resample_algorithm = SMOTE(random_state=random_state, k_neighbors=3)
-    elif strategy == '
+    elif strategy == 'RAND_OVERSAMPLE':
         resample_algorithm = RandomOverSampler(random_state=random_state)
-    elif strategy == '
+    elif strategy == 'RAND_UNDERSAMPLE':
         resample_algorithm = RandomUnderSampler(random_state=random_state)
     elif strategy == 'ADASYN':
         resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)
     else:
         raise ValueError(f"Invalid resampling strategy: {strategy}")

-    X_res, y_res, *_ = resample_algorithm.fit_resample(
+    X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
     return X_res, y_res

 # DATASET PIPELINE
-def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task:
-                     resample_strategy:
+def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: TaskType,
+                     resample_strategy: HandleImbalanceStrategy,
                      test_size: float=0.2, debug: bool=False, random_state: int=101):
     '''
     1. Make Train/Test splits
-    2.
-    3. Oversample imbalanced classes (classification)
+    2. Oversample imbalanced classes (classification)

-    Return a processed Tuple: (X_train, y_train, X_test, y_test
+    Return a processed Tuple: (X_train, y_train, X_test, y_test)

-    `(nD-array, 1D-array, nD-array, Series
+    `(nD-array, 1D-array, nD-array, Series)`
     '''
     #DEBUG
     if debug:

@@ -206,24 +454,18 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
     if debug:
         print(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

-    # Standardize
-    X_train_scaled, X_test_scaled, scaler_object = _standardize_data(train_features=X_train, test_features=X_test, scaler_code=scaler)
-
-    #DEBUG
-    if debug:
-        print(f"Shapes after scaling features - X_train: {X_train_scaled.shape}, y_train: {y_train.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")

-    #
-    if resample_strategy is None or task == "regression":
-        X_train_oversampled, y_train_oversampled =
+    # Resample
+    if resample_strategy is None or resample_strategy == "by_model" or task == "regression":
+        X_train_oversampled, y_train_oversampled = X_train, y_train
     else:
-        X_train_oversampled, y_train_oversampled = _resample(
+        X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)

     #DEBUG
     if debug:
-        print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {
+        print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

-    return X_train_oversampled, y_train_oversampled,
+    return X_train_oversampled, y_train_oversampled, X_test, y_test

 ###### 4. Train and Evaluation ######
 # Trainer function

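With the standardization step removed, dataset_pipeline now only splits and (optionally) resamples, so the returned splits are unscaled. A hedged sketch of a direct call, on an invented imbalanced toy dataset; "RAND_OVERSAMPLE" is one of the HandleImbalanceStrategy literals defined earlier:

    # Illustrative sketch; data and values are invented.
    import pandas as pd
    from ml_tools.ensemble_learning import dataset_pipeline

    df = pd.DataFrame({
        "x1": range(40),
        "x2": [v * 0.5 for v in range(40)],
        "label": [0] * 30 + [1] * 10,   # imbalanced binary target
    })

    X_train, y_train, X_test, y_test = dataset_pipeline(
        df_features=df.drop(columns=["label"]),
        df_target=df["label"],
        task="classification",
        resample_strategy="RAND_OVERSAMPLE",   # or "SMOTE", "ADASYN", "RAND_UNDERSAMPLE", "by_model", None
        test_size=0.2,
        random_state=101,
    )
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
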
@@ -244,11 +486,11 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
     return model_dir

 # save model
-def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str
+def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
     full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
-    joblib.dump({'model': trained_model, '
+    joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)

 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(

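The _save_model change above bundles the feature names and target name into the joblib payload. A hedged sketch of reading such a bundle back; the file name follows the "{model_name}_{sanitized_target_name}.joblib" pattern shown in the hunk, but the path itself is hypothetical:

    import joblib

    # Hypothetical path; real files follow "{model_name}_{sanitized_target_name}.joblib".
    bundle = joblib.load("results/XGBoost_my_target.joblib")
    model = bundle["model"]                    # the trained estimator
    feature_names = bundle["feature_names"]    # list[str] saved alongside the model
    target_name = bundle["target_name"]        # target column the model was trained on
    print(type(model).__name__, target_name, len(feature_names))
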
@@ -257,10 +499,9 @@ def evaluate_model_classification(
     save_dir: str,
     x_test_scaled: np.ndarray,
     single_y_test: np.ndarray,
-
+    target_name: str,
     figsize: tuple = (10, 8),
-
-    label_fontsize: int = 24,
+    base_fontsize: int = 24,
     cmap: Colormap = plt.cm.Blues # type: ignore
 ) -> np.ndarray:
     """

@@ -271,8 +512,8 @@ def evaluate_model_classification(
         model_name: Identifier for the model
         save_dir: Directory where results are saved
         x_test_scaled: Feature matrix for test set
-        single_y_test: True
-
+        single_y_test: True targets
+        target_name: Target name
         figsize: Size of the confusion matrix figure (width, height)
         fontsize: Font size used for title, axis labels and ticks
         cmap: Color map for the confusion matrix. Examples include:

@@ -300,10 +541,10 @@
     )

     # Save text report
-
-    report_path = os.path.join(save_dir, f"Classification_Report_{
+    sanitized_target_name = sanitize_filename(target_name)
+    report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
     with open(report_path, "w") as f:
-        f.write(f"{model_name} - {
+        f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
         f.write("Classification Report:\n")
         f.write(report) # type: ignore

@@ -318,20 +559,20 @@ def evaluate_model_classification(
         ax=ax
     )

-    ax.set_title(f"{model_name} - {
-    ax.tick_params(axis='both', labelsize=
-    ax.set_xlabel("Predicted label", fontsize=
-    ax.set_ylabel("True label", fontsize=
+    ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
+    ax.tick_params(axis='both', labelsize=base_fontsize)
+    ax.set_xlabel("Predicted label", fontsize=base_fontsize)
+    ax.set_ylabel("True label", fontsize=base_fontsize)

     # Turn off gridlines
     ax.grid(False)

     # Manually update font size of cell texts
     for text in ax.texts:
-        text.set_fontsize(
+        text.set_fontsize(base_fontsize+4)

     fig.tight_layout()
-    fig_path = os.path.join(save_dir, f"Confusion_Matrix_{
+    fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
     fig.savefig(fig_path, format="svg", bbox_inches="tight")
     plt.close(fig)

@@ -356,7 +597,7 @@ def plot_roc_curve(
     Parameters:
         true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
         probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
-        target_name: str,
+        target_name: str, Target name.
         save_directory: str, path to directory where figure is saved.
         color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
             - Named colors: "darkorange", "blue", "red", "green", "black"

@@ -425,7 +666,7 @@ def plot_roc_curve(
 def evaluate_model_regression(model, model_name: str,
                               save_dir: str,
                               x_test_scaled: np.ndarray, single_y_test: np.ndarray,
-
+                              target_name: str,
                               figure_size: tuple = (12, 8),
                               alpha_transparency: float = 0.5,
                               base_fontsize: int = 24):

@@ -439,10 +680,10 @@ def evaluate_model_regression(model, model_name: str,
     r2 = r2_score(single_y_test, y_pred)

     # Create formatted report
-
-    report_path = os.path.join(save_dir, f"Regression_Report_{
+    sanitized_target_name = sanitize_filename(target_name)
+    report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
     with open(report_path, "w") as f:
-        f.write(f"{model_name} -
+        f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
         f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
         f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")

@@ -455,10 +696,10 @@ def evaluate_model_regression(model, model_name: str,
     plt.axhline(0, color='red', linestyle='--')
     plt.xlabel("Predicted Values", fontsize=base_fontsize)
     plt.ylabel("Residuals", fontsize=base_fontsize)
-    plt.title(f"{model_name} - Residual Plot for {
+    plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
     plt.tight_layout()
-    plt.savefig(os.path.join(save_dir, f"Residual_Plot_{
+    plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
     plt.close()

     # Create true vs predicted values plot

@@ -469,9 +710,9 @@ def evaluate_model_regression(model, model_name: str,
              'k--', lw=2)
     plt.xlabel('True Values', fontsize=base_fontsize)
     plt.ylabel('Predictions', fontsize=base_fontsize)
-    plt.title(f"{model_name} - True vs Predicted for {
+    plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
-    plot_path = os.path.join(save_dir, f"Regression_Plot_{
+    plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
     plt.savefig(plot_path, bbox_inches='tight', format="svg")
     plt.close()

@@ -485,7 +726,7 @@ def get_shap_values(
     save_dir: str,
     features_to_explain: np.ndarray,
     feature_names: list[str],
-
+    target_name: str,
     task: Literal["classification", "regression"],
     max_display_features: int = 10,
     figsize: tuple = (16, 20),

@@ -504,7 +745,7 @@ def get_shap_values(
         features_to_explain: Should match the model's training data format, including scaling.
         save_dir: Directory to save visualizations
     """
-
+    sanitized_target_name = sanitize_filename(target_name)

     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']

@@ -567,9 +808,9 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=class_shap,
                 features=features_to_explain,
-                save_path=os.path.join(save_dir, f"SHAP_{
+                save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
                 plot_type=plot_type,
-                title=f"{model_name} - {
+                title=f"{model_name} - {target_name} (Class {class_name})"
             )
         else:
             values = shap_values[1] if isinstance(shap_values, list) else shap_values

@@ -577,9 +818,9 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=values,
                 features=features_to_explain,
-                save_path=os.path.join(save_dir, f"SHAP_{
+                save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
                 plot_type=plot_type,
-                title=f"{model_name} - {
+                title=f"{model_name} - {target_name}"
             )

     def _plot_for_regression(shap_values):

@@ -587,9 +828,9 @@ def get_shap_values(
         _create_shap_plot(
             shap_values=shap_values,
             features=features_to_explain,
-            save_path=os.path.join(save_dir, f"SHAP_{
+            save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
             plot_type=plot_type,
-            title=f"{model_name} - {
+            title=f"{model_name} - {target_name}"
         )
     #START_O

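The three SHAP hunks above only thread target_name into the output file names and plot titles. For orientation, a rough, self-contained sketch of the kind of shap summary-plot call such a helper typically wraps (synthetic data; this is not the module's internal code):

    import numpy as np
    import shap
    import xgboost as xgb
    import matplotlib.pyplot as plt

    # Synthetic regression data, invented for illustration.
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    y = 2.0 * X[:, 0] + rng.normal(scale=0.1, size=200)

    model = xgb.XGBRegressor(n_estimators=50, max_depth=3).fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    shap.summary_plot(shap_values, X, feature_names=["f0", "f1", "f2"],
                      plot_type="bar", show=False)
    plt.savefig("SHAP_example_bar.svg", format="svg", bbox_inches="tight")
    plt.close()
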
@@ -607,10 +848,10 @@ def get_shap_values(


 # TRAIN TEST PIPELINE
-def train_test_pipeline(model, model_name: str, dataset_id: str, task:
+def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
                         train_features: np.ndarray, train_target: np.ndarray,
                         test_features: np.ndarray, test_target: np.ndarray,
-                        feature_names: list[str],
+                        feature_names: list[str], target_name: str,
                         save_dir: str,
                         debug: bool=False, save_model: bool=False):
     '''

@@ -620,7 +861,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

     Returns: Tuple(Trained model, Test-set Predictions)
     '''
-    print(f"\
+    print(f"\tTraining model: {model_name} for Target: {target_name}...")
     trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
     if debug:
         print(f"Trained model object: {type(trained_model)}")

@@ -628,52 +869,66 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

     if save_model:
         _save_model(trained_model=trained_model, model_name=model_name,
-                    target_name=
-                    save_directory=local_save_directory
+                    target_name=target_name, feature_names=feature_names,
+                    save_directory=local_save_directory)

     if task == "classification":
         y_pred = evaluate_model_classification(model=trained_model, model_name=model_name, save_dir=local_save_directory,
-                                               x_test_scaled=test_features, single_y_test=test_target,
+                                               x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
         plot_roc_curve(true_labels=test_target,
                        probabilities_or_model=trained_model, model_name=model_name,
-                       target_name=
+                       target_name=target_name, save_directory=local_save_directory,
                        input_features=test_features)
     elif task == "regression":
         y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
-                                           x_test_scaled=test_features, single_y_test=test_target,
+                                           x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
     else:
         raise ValueError(f"Unrecognized task '{task}' for model training,")
     if debug:
         print(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")

     get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
-                    features_to_explain=train_features, feature_names=feature_names,
-    print("\t...done.")
+                    features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
+    # print("\t...done.")
     return trained_model, y_pred

 ###### 5. Execution ######
-def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str],
-
-                          test_size: float=0.2, debug:bool=False
+def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
+                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
+                          test_size: float=0.2, debug:bool=False):
+    #Check models
+    if isinstance(model_object, RegressionTreeModels):
+        task = "regression"
+    elif isinstance(model_object, ClassificationTreeModels):
+        task = "classification"
+        if handle_classification_imbalance is None:
+            print("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
+        elif handle_classification_imbalance == "by_model":
+            model_object.use_model_balance = True
+        else:
+            model_object.use_model_balance = False
+    else:
+        raise TypeError(f"Unrecognized model {type(model_object)}")
+
     #Check paths
     _check_paths(datasets_dir, save_dir)
+
     #Yield imputed dataset
     for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
         #Yield features dataframe and target dataframe
         for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
             #Dataset pipeline
-            X_train, y_train, X_test, y_test
-            resample_strategy=
-            test_size=test_size, debug=debug, random_state=random_state)
+            X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
+                                                                resample_strategy=handle_classification_imbalance,
+                                                                test_size=test_size, debug=debug, random_state=model_object.random_state)
             #Get models
-            models_dict =
-            L1_regularization=L1_regularization, L2_regularization=L2_regularization, learning_rate=learning_rate)
+            models_dict = model_object()
             #Train models
             for model_name, model in models_dict.items():
                 train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
                                     train_features=X_train, train_target=y_train, # type: ignore
                                     test_features=X_test, test_target=y_test,
-                                    feature_names=feature_names,
+                                    feature_names=feature_names,target_name=target_name,
                                     debug=debug, save_dir=save_dir, save_model=save_model)
     print("\n✅ Training and evaluation complete.")

@@ -683,3 +938,7 @@ def _check_paths(datasets_dir: str, save_dir:str):
         os.makedirs(save_dir)
     if not os.path.isdir(datasets_dir):
         raise IOError(f"Datasets directory '{datasets_dir}' not found.")
+
+
+def info():
+    _script_info(__all__)