dragon-ml-toolbox 1.4.1__tar.gz → 1.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-1.4.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.2}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/ensemble_learning.py +74 -81
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/particle_swarm_optimization.py +42 -51
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/utilities.py +35 -2
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/pyproject.toml +1 -1
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/README.md +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/pytorch_models.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/trainer.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/ml_tools/vision_helpers.py +0 -0
- {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/setup.cfg +0 -0
|
@@ -17,11 +17,10 @@ import xgboost as xgb
|
|
|
17
17
|
import lightgbm as lgb
|
|
18
18
|
|
|
19
19
|
from sklearn.model_selection import train_test_split
|
|
20
|
-
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
|
|
21
20
|
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
|
|
22
21
|
import shap
|
|
23
22
|
|
|
24
|
-
from .utilities import yield_dataframes_from_dir, sanitize_filename
|
|
23
|
+
from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
|
|
25
24
|
|
|
26
25
|
import warnings # Ignore warnings
|
|
27
26
|
warnings.filterwarnings('ignore', category=DeprecationWarning)
|
|
@@ -29,9 +28,21 @@ warnings.filterwarnings('ignore', category=FutureWarning)
|
|
|
29
28
|
warnings.filterwarnings('ignore', category=UserWarning)
|
|
30
29
|
|
|
31
30
|
|
|
31
|
+
__all__ = [
|
|
32
|
+
"get_models",
|
|
33
|
+
"dataset_pipeline",
|
|
34
|
+
"evaluate_model_classification",
|
|
35
|
+
"plot_roc_curve",
|
|
36
|
+
"evaluate_model_regression",
|
|
37
|
+
"get_shap_values",
|
|
38
|
+
"train_test_pipeline",
|
|
39
|
+
"run_ensemble_pipeline"
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
|
|
32
43
|
###### 1. Dataset Loader ######
|
|
33
44
|
#Split a dataset into features and targets datasets
|
|
34
|
-
def
|
|
45
|
+
def _dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
|
|
35
46
|
'''
|
|
36
47
|
Yields one Tuple at a time: `(df_features, df_target, feature_names, target_name)`
|
|
37
48
|
'''
|
|
@@ -144,22 +155,8 @@ def _split_data(features, target, test_size, random_state, task):
|
|
|
144
155
|
stratify=target if task=="classification" else None)
|
|
145
156
|
return X_train, X_test, y_train, y_test
|
|
146
157
|
|
|
147
|
-
# function to standardize the data
|
|
148
|
-
def _standardize_data(train_features, test_features, scaler_code):
|
|
149
|
-
if scaler_code == "standard":
|
|
150
|
-
scaler = StandardScaler()
|
|
151
|
-
elif scaler_code == "minmax":
|
|
152
|
-
scaler = MinMaxScaler()
|
|
153
|
-
elif scaler_code == "maxabs":
|
|
154
|
-
scaler = MaxAbsScaler()
|
|
155
|
-
else:
|
|
156
|
-
raise ValueError(f"Unrecognized scaler {scaler_code}")
|
|
157
|
-
train_scaled = scaler.fit_transform(train_features)
|
|
158
|
-
test_scaled = scaler.transform(test_features)
|
|
159
|
-
return train_scaled, test_scaled, scaler
|
|
160
|
-
|
|
161
158
|
# Over-sample minority class (Positive cases) and return several single target datasets (Classification)
|
|
162
|
-
def _resample(
|
|
159
|
+
def _resample(X_train: np.ndarray, y_train: pd.Series,
|
|
163
160
|
strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], random_state):
|
|
164
161
|
'''
|
|
165
162
|
Oversample minority class or undersample majority class.
|
|
@@ -177,21 +174,20 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
|
|
|
177
174
|
else:
|
|
178
175
|
raise ValueError(f"Invalid resampling strategy: {strategy}")
|
|
179
176
|
|
|
180
|
-
X_res, y_res, *_ = resample_algorithm.fit_resample(
|
|
177
|
+
X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
|
|
181
178
|
return X_res, y_res
|
|
182
179
|
|
|
183
180
|
# DATASET PIPELINE
|
|
184
181
|
def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Literal["classification", "regression"],
|
|
185
|
-
resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None],
|
|
182
|
+
resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None],
|
|
186
183
|
test_size: float=0.2, debug: bool=False, random_state: int=101):
|
|
187
184
|
'''
|
|
188
185
|
1. Make Train/Test splits
|
|
189
|
-
2.
|
|
190
|
-
3. Oversample imbalanced classes (classification)
|
|
186
|
+
2. Oversample imbalanced classes (classification)
|
|
191
187
|
|
|
192
|
-
Return a processed Tuple: (X_train, y_train, X_test, y_test
|
|
188
|
+
Return a processed Tuple: (X_train, y_train, X_test, y_test)
|
|
193
189
|
|
|
194
|
-
`(nD-array, 1D-array, nD-array, Series
|
|
190
|
+
`(nD-array, 1D-array, nD-array, Series)`
|
|
195
191
|
'''
|
|
196
192
|
#DEBUG
|
|
197
193
|
if debug:
|
|
@@ -206,24 +202,18 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
|
|
|
206
202
|
if debug:
|
|
207
203
|
print(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
|
|
208
204
|
|
|
209
|
-
# Standardize
|
|
210
|
-
X_train_scaled, X_test_scaled, scaler_object = _standardize_data(train_features=X_train, test_features=X_test, scaler_code=scaler)
|
|
211
|
-
|
|
212
|
-
#DEBUG
|
|
213
|
-
if debug:
|
|
214
|
-
print(f"Shapes after scaling features - X_train: {X_train_scaled.shape}, y_train: {y_train.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
|
|
215
205
|
|
|
216
|
-
#
|
|
206
|
+
# Resample
|
|
217
207
|
if resample_strategy is None or task == "regression":
|
|
218
|
-
X_train_oversampled, y_train_oversampled =
|
|
208
|
+
X_train_oversampled, y_train_oversampled = X_train, y_train
|
|
219
209
|
else:
|
|
220
|
-
X_train_oversampled, y_train_oversampled = _resample(
|
|
210
|
+
X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)
|
|
221
211
|
|
|
222
212
|
#DEBUG
|
|
223
213
|
if debug:
|
|
224
|
-
print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {
|
|
214
|
+
print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
|
|
225
215
|
|
|
226
|
-
return X_train_oversampled, y_train_oversampled,
|
|
216
|
+
return X_train_oversampled, y_train_oversampled, X_test, y_test
|
|
227
217
|
|
|
228
218
|
###### 4. Train and Evaluation ######
|
|
229
219
|
# Trainer function
|
|
@@ -244,11 +234,11 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
|
|
|
244
234
|
return model_dir
|
|
245
235
|
|
|
246
236
|
# save model
|
|
247
|
-
def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str
|
|
237
|
+
def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
|
|
248
238
|
#Sanitize filenames to save
|
|
249
239
|
sanitized_target_name = sanitize_filename(target_name)
|
|
250
240
|
full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
|
|
251
|
-
joblib.dump({'model': trained_model, '
|
|
241
|
+
joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)
|
|
252
242
|
|
|
253
243
|
# function to evaluate the model and save metrics (Classification)
|
|
254
244
|
def evaluate_model_classification(
|
|
@@ -257,10 +247,9 @@ def evaluate_model_classification(
|
|
|
257
247
|
save_dir: str,
|
|
258
248
|
x_test_scaled: np.ndarray,
|
|
259
249
|
single_y_test: np.ndarray,
|
|
260
|
-
|
|
250
|
+
target_name: str,
|
|
261
251
|
figsize: tuple = (10, 8),
|
|
262
|
-
|
|
263
|
-
label_fontsize: int = 24,
|
|
252
|
+
base_fontsize: int = 24,
|
|
264
253
|
cmap: Colormap = plt.cm.Blues # type: ignore
|
|
265
254
|
) -> np.ndarray:
|
|
266
255
|
"""
|
|
@@ -271,8 +260,8 @@ def evaluate_model_classification(
|
|
|
271
260
|
model_name: Identifier for the model
|
|
272
261
|
save_dir: Directory where results are saved
|
|
273
262
|
x_test_scaled: Feature matrix for test set
|
|
274
|
-
single_y_test: True
|
|
275
|
-
|
|
263
|
+
single_y_test: True targets
|
|
264
|
+
target_name: Target name
|
|
276
265
|
figsize: Size of the confusion matrix figure (width, height)
|
|
277
266
|
fontsize: Font size used for title, axis labels and ticks
|
|
278
267
|
cmap: Color map for the confusion matrix. Examples include:
|
|
@@ -300,10 +289,10 @@ def evaluate_model_classification(
|
|
|
300
289
|
)
|
|
301
290
|
|
|
302
291
|
# Save text report
|
|
303
|
-
|
|
304
|
-
report_path = os.path.join(save_dir, f"Classification_Report_{
|
|
292
|
+
sanitized_target_name = sanitize_filename(target_name)
|
|
293
|
+
report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
|
|
305
294
|
with open(report_path, "w") as f:
|
|
306
|
-
f.write(f"{model_name} - {
|
|
295
|
+
f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
|
|
307
296
|
f.write("Classification Report:\n")
|
|
308
297
|
f.write(report) # type: ignore
|
|
309
298
|
|
|
@@ -318,20 +307,20 @@ def evaluate_model_classification(
|
|
|
318
307
|
ax=ax
|
|
319
308
|
)
|
|
320
309
|
|
|
321
|
-
ax.set_title(f"{model_name} - {
|
|
322
|
-
ax.tick_params(axis='both', labelsize=
|
|
323
|
-
ax.set_xlabel("Predicted label", fontsize=
|
|
324
|
-
ax.set_ylabel("True label", fontsize=
|
|
310
|
+
ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
|
|
311
|
+
ax.tick_params(axis='both', labelsize=base_fontsize)
|
|
312
|
+
ax.set_xlabel("Predicted label", fontsize=base_fontsize)
|
|
313
|
+
ax.set_ylabel("True label", fontsize=base_fontsize)
|
|
325
314
|
|
|
326
315
|
# Turn off gridlines
|
|
327
316
|
ax.grid(False)
|
|
328
317
|
|
|
329
318
|
# Manually update font size of cell texts
|
|
330
319
|
for text in ax.texts:
|
|
331
|
-
text.set_fontsize(
|
|
320
|
+
text.set_fontsize(base_fontsize+4)
|
|
332
321
|
|
|
333
322
|
fig.tight_layout()
|
|
334
|
-
fig_path = os.path.join(save_dir, f"Confusion_Matrix_{
|
|
323
|
+
fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
|
|
335
324
|
fig.savefig(fig_path, format="svg", bbox_inches="tight")
|
|
336
325
|
plt.close(fig)
|
|
337
326
|
|
|
@@ -356,7 +345,7 @@ def plot_roc_curve(
|
|
|
356
345
|
Parameters:
|
|
357
346
|
true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
|
|
358
347
|
probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
|
|
359
|
-
target_name: str,
|
|
348
|
+
target_name: str, Target name.
|
|
360
349
|
save_directory: str, path to directory where figure is saved.
|
|
361
350
|
color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
|
|
362
351
|
- Named colors: "darkorange", "blue", "red", "green", "black"
|
|
@@ -425,7 +414,7 @@ def plot_roc_curve(
|
|
|
425
414
|
def evaluate_model_regression(model, model_name: str,
|
|
426
415
|
save_dir: str,
|
|
427
416
|
x_test_scaled: np.ndarray, single_y_test: np.ndarray,
|
|
428
|
-
|
|
417
|
+
target_name: str,
|
|
429
418
|
figure_size: tuple = (12, 8),
|
|
430
419
|
alpha_transparency: float = 0.5,
|
|
431
420
|
base_fontsize: int = 24):
|
|
@@ -439,10 +428,10 @@ def evaluate_model_regression(model, model_name: str,
|
|
|
439
428
|
r2 = r2_score(single_y_test, y_pred)
|
|
440
429
|
|
|
441
430
|
# Create formatted report
|
|
442
|
-
|
|
443
|
-
report_path = os.path.join(save_dir, f"Regression_Report_{
|
|
431
|
+
sanitized_target_name = sanitize_filename(target_name)
|
|
432
|
+
report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
|
|
444
433
|
with open(report_path, "w") as f:
|
|
445
|
-
f.write(f"{model_name} - {
|
|
434
|
+
f.write(f"{model_name} - {target_name} Regression Performance\n")
|
|
446
435
|
f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
|
|
447
436
|
f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
|
|
448
437
|
f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
|
|
@@ -455,10 +444,10 @@ def evaluate_model_regression(model, model_name: str,
|
|
|
455
444
|
plt.axhline(0, color='red', linestyle='--')
|
|
456
445
|
plt.xlabel("Predicted Values", fontsize=base_fontsize)
|
|
457
446
|
plt.ylabel("Residuals", fontsize=base_fontsize)
|
|
458
|
-
plt.title(f"{model_name} - Residual Plot for {
|
|
447
|
+
plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
|
|
459
448
|
plt.grid(True)
|
|
460
449
|
plt.tight_layout()
|
|
461
|
-
plt.savefig(os.path.join(save_dir, f"Residual_Plot_{
|
|
450
|
+
plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
|
|
462
451
|
plt.close()
|
|
463
452
|
|
|
464
453
|
# Create true vs predicted values plot
|
|
@@ -469,9 +458,9 @@ def evaluate_model_regression(model, model_name: str,
|
|
|
469
458
|
'k--', lw=2)
|
|
470
459
|
plt.xlabel('True Values', fontsize=base_fontsize)
|
|
471
460
|
plt.ylabel('Predictions', fontsize=base_fontsize)
|
|
472
|
-
plt.title(f"{model_name} - True vs Predicted for {
|
|
461
|
+
plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
|
|
473
462
|
plt.grid(True)
|
|
474
|
-
plot_path = os.path.join(save_dir, f"Regression_Plot_{
|
|
463
|
+
plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
|
|
475
464
|
plt.savefig(plot_path, bbox_inches='tight', format="svg")
|
|
476
465
|
plt.close()
|
|
477
466
|
|
|
@@ -485,7 +474,7 @@ def get_shap_values(
|
|
|
485
474
|
save_dir: str,
|
|
486
475
|
features_to_explain: np.ndarray,
|
|
487
476
|
feature_names: list[str],
|
|
488
|
-
|
|
477
|
+
target_name: str,
|
|
489
478
|
task: Literal["classification", "regression"],
|
|
490
479
|
max_display_features: int = 10,
|
|
491
480
|
figsize: tuple = (16, 20),
|
|
@@ -504,7 +493,7 @@ def get_shap_values(
|
|
|
504
493
|
features_to_explain: Should match the model's training data format, including scaling.
|
|
505
494
|
save_dir: Directory to save visualizations
|
|
506
495
|
"""
|
|
507
|
-
|
|
496
|
+
sanitized_target_name = sanitize_filename(target_name)
|
|
508
497
|
|
|
509
498
|
def _apply_plot_style():
|
|
510
499
|
styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
|
|
@@ -567,9 +556,9 @@ def get_shap_values(
|
|
|
567
556
|
_create_shap_plot(
|
|
568
557
|
shap_values=class_shap,
|
|
569
558
|
features=features_to_explain,
|
|
570
|
-
save_path=os.path.join(save_dir, f"SHAP_{
|
|
559
|
+
save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
|
|
571
560
|
plot_type=plot_type,
|
|
572
|
-
title=f"{model_name} - {
|
|
561
|
+
title=f"{model_name} - {target_name} (Class {class_name})"
|
|
573
562
|
)
|
|
574
563
|
else:
|
|
575
564
|
values = shap_values[1] if isinstance(shap_values, list) else shap_values
|
|
@@ -577,9 +566,9 @@ def get_shap_values(
|
|
|
577
566
|
_create_shap_plot(
|
|
578
567
|
shap_values=values,
|
|
579
568
|
features=features_to_explain,
|
|
580
|
-
save_path=os.path.join(save_dir, f"SHAP_{
|
|
569
|
+
save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
|
|
581
570
|
plot_type=plot_type,
|
|
582
|
-
title=f"{model_name} - {
|
|
571
|
+
title=f"{model_name} - {target_name}"
|
|
583
572
|
)
|
|
584
573
|
|
|
585
574
|
def _plot_for_regression(shap_values):
|
|
@@ -587,9 +576,9 @@ def get_shap_values(
|
|
|
587
576
|
_create_shap_plot(
|
|
588
577
|
shap_values=shap_values,
|
|
589
578
|
features=features_to_explain,
|
|
590
|
-
save_path=os.path.join(save_dir, f"SHAP_{
|
|
579
|
+
save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
|
|
591
580
|
plot_type=plot_type,
|
|
592
|
-
title=f"{model_name} - {
|
|
581
|
+
title=f"{model_name} - {target_name}"
|
|
593
582
|
)
|
|
594
583
|
#START_O
|
|
595
584
|
|
|
@@ -610,7 +599,7 @@ def get_shap_values(
|
|
|
610
599
|
def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
|
|
611
600
|
train_features: np.ndarray, train_target: np.ndarray,
|
|
612
601
|
test_features: np.ndarray, test_target: np.ndarray,
|
|
613
|
-
feature_names: list[str],
|
|
602
|
+
feature_names: list[str], target_name: str,
|
|
614
603
|
save_dir: str,
|
|
615
604
|
debug: bool=False, save_model: bool=False):
|
|
616
605
|
'''
|
|
@@ -620,7 +609,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
|
|
|
620
609
|
|
|
621
610
|
Returns: Tuple(Trained model, Test-set Predictions)
|
|
622
611
|
'''
|
|
623
|
-
print(f"\tModel: {model_name} for Target: {
|
|
612
|
+
print(f"\tModel: {model_name} for Target: {target_name}...")
|
|
624
613
|
trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
|
|
625
614
|
if debug:
|
|
626
615
|
print(f"Trained model object: {type(trained_model)}")
|
|
@@ -628,42 +617,42 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
|
|
|
628
617
|
|
|
629
618
|
if save_model:
|
|
630
619
|
_save_model(trained_model=trained_model, model_name=model_name,
|
|
631
|
-
target_name=
|
|
632
|
-
save_directory=local_save_directory
|
|
620
|
+
target_name=target_name, feature_names=feature_names,
|
|
621
|
+
save_directory=local_save_directory)
|
|
633
622
|
|
|
634
623
|
if task == "classification":
|
|
635
624
|
y_pred = evaluate_model_classification(model=trained_model, model_name=model_name, save_dir=local_save_directory,
|
|
636
|
-
x_test_scaled=test_features, single_y_test=test_target,
|
|
625
|
+
x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
|
|
637
626
|
plot_roc_curve(true_labels=test_target,
|
|
638
627
|
probabilities_or_model=trained_model, model_name=model_name,
|
|
639
|
-
target_name=
|
|
628
|
+
target_name=target_name, save_directory=local_save_directory,
|
|
640
629
|
input_features=test_features)
|
|
641
630
|
elif task == "regression":
|
|
642
631
|
y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
|
|
643
|
-
x_test_scaled=test_features, single_y_test=test_target,
|
|
632
|
+
x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
|
|
644
633
|
else:
|
|
645
634
|
raise ValueError(f"Unrecognized task '{task}' for model training,")
|
|
646
635
|
if debug:
|
|
647
636
|
print(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
|
|
648
637
|
|
|
649
638
|
get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
|
|
650
|
-
features_to_explain=train_features, feature_names=feature_names,
|
|
639
|
+
features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
|
|
651
640
|
print("\t...done.")
|
|
652
641
|
return trained_model, y_pred
|
|
653
642
|
|
|
654
643
|
###### 5. Execution ######
|
|
655
644
|
def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
|
|
656
|
-
resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None,
|
|
645
|
+
resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, save_model: bool=False,
|
|
657
646
|
test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
|
|
658
647
|
#Check paths
|
|
659
648
|
_check_paths(datasets_dir, save_dir)
|
|
660
649
|
#Yield imputed dataset
|
|
661
650
|
for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
|
|
662
651
|
#Yield features dataframe and target dataframe
|
|
663
|
-
for df_features, df_target, feature_names, target_name in
|
|
652
|
+
for df_features, df_target, feature_names, target_name in _dataset_yielder(df=dataframe, target_cols=target_columns):
|
|
664
653
|
#Dataset pipeline
|
|
665
|
-
X_train, y_train, X_test, y_test
|
|
666
|
-
resample_strategy=resample_strategy,
|
|
654
|
+
X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
|
|
655
|
+
resample_strategy=resample_strategy,
|
|
667
656
|
test_size=test_size, debug=debug, random_state=random_state)
|
|
668
657
|
#Get models
|
|
669
658
|
models_dict = get_models(task=task, is_balanced=False if resample_strategy is None else True,
|
|
@@ -673,7 +662,7 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
|
|
|
673
662
|
train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
|
|
674
663
|
train_features=X_train, train_target=y_train, # type: ignore
|
|
675
664
|
test_features=X_test, test_target=y_test,
|
|
676
|
-
feature_names=feature_names,
|
|
665
|
+
feature_names=feature_names,target_name=target_name,
|
|
677
666
|
debug=debug, save_dir=save_dir, save_model=save_model)
|
|
678
667
|
print("\n✅ Training and evaluation complete.")
|
|
679
668
|
|
|
@@ -683,3 +672,7 @@ def _check_paths(datasets_dir: str, save_dir:str):
|
|
|
683
672
|
os.makedirs(save_dir)
|
|
684
673
|
if not os.path.isdir(datasets_dir):
|
|
685
674
|
raise IOError(f"Datasets directory '{datasets_dir}' not found.")
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
def info():
|
|
678
|
+
_script_info(__all__)
|
|
@@ -5,11 +5,10 @@ import xgboost as xgb
|
|
|
5
5
|
import lightgbm as lgb
|
|
6
6
|
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
|
|
7
7
|
from sklearn.base import ClassifierMixin
|
|
8
|
-
from
|
|
9
|
-
from typing import Literal, Union, Tuple, Dict
|
|
8
|
+
from typing import Literal, Union, Tuple, Dict, Optional
|
|
10
9
|
import polars as pl
|
|
11
10
|
from functools import partial
|
|
12
|
-
from .utilities import sanitize_filename, _script_info
|
|
11
|
+
from .utilities import sanitize_filename, _script_info, threshold_binary_values
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
__all__ = [
|
|
@@ -20,14 +19,14 @@ __all__ = [
|
|
|
20
19
|
|
|
21
20
|
class ObjectiveFunction():
|
|
22
21
|
"""
|
|
23
|
-
Callable objective function designed for optimizing continuous outputs from regression models.
|
|
22
|
+
Callable objective function designed for optimizing continuous outputs from tree-based regression models.
|
|
24
23
|
|
|
25
|
-
The target serialized file (joblib) must include a
|
|
24
|
+
The target serialized file (joblib) must include a trained tree-based 'model'. Additionally 'feature_names' and 'target_name' will be parsed if present.
|
|
26
25
|
|
|
27
26
|
Parameters
|
|
28
27
|
----------
|
|
29
28
|
trained_model_path : str
|
|
30
|
-
Path to a serialized model
|
|
29
|
+
Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
|
|
31
30
|
add_noise : bool
|
|
32
31
|
Whether to apply multiplicative noise to the input features during evaluation.
|
|
33
32
|
binary_features : int, default=0
|
|
@@ -35,15 +34,14 @@ class ObjectiveFunction():
|
|
|
35
34
|
task : Literal, default 'maximization'
|
|
36
35
|
Whether to maximize or minimize the target.
|
|
37
36
|
"""
|
|
38
|
-
def __init__(self, trained_model_path: str, add_noise: bool
|
|
37
|
+
def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
|
|
39
38
|
self.binary_features = binary_features
|
|
40
39
|
self.is_hybrid = False if binary_features <= 0 else True
|
|
41
40
|
self.use_noise = add_noise
|
|
42
41
|
self._artifact = joblib.load(trained_model_path)
|
|
43
42
|
self.model = self._get_from_artifact('model')
|
|
44
|
-
self.
|
|
45
|
-
self.
|
|
46
|
-
self.target_name: str = self._get_from_artifact('target_name') # type: ignore
|
|
43
|
+
self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
|
|
44
|
+
self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
|
|
47
45
|
self.task = task
|
|
48
46
|
self.check_model() # check for classification models and None values
|
|
49
47
|
|
|
@@ -51,16 +49,15 @@ class ObjectiveFunction():
|
|
|
51
49
|
if self.use_noise:
|
|
52
50
|
features_array = self.add_noise(features_array)
|
|
53
51
|
if self.is_hybrid:
|
|
54
|
-
features_array = self.
|
|
52
|
+
features_array = threshold_binary_values(input_array=features_array, binary_features=self.binary_features)
|
|
55
53
|
|
|
56
54
|
if features_array.ndim == 1:
|
|
57
55
|
features_array = features_array.reshape(1, -1)
|
|
58
56
|
|
|
59
|
-
# scale features as the model expects
|
|
60
|
-
features_array = self.scaler.transform(features_array) # type: ignore
|
|
61
|
-
|
|
62
57
|
result = self.model.predict(features_array) # type: ignore
|
|
63
58
|
scalar = result.item()
|
|
59
|
+
# print(f"[DEBUG] Model predicted: {scalar}")
|
|
60
|
+
|
|
64
61
|
# pso minimizes by default, so we return the negative value to maximize
|
|
65
62
|
if self.task == "maximization":
|
|
66
63
|
return -scalar
|
|
@@ -68,33 +65,22 @@ class ObjectiveFunction():
|
|
|
68
65
|
return scalar
|
|
69
66
|
|
|
70
67
|
def add_noise(self, features_array):
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
if self.binary_features == total_features:
|
|
82
|
-
feat_binary = (features_array > 0.5).astype(int)
|
|
83
|
-
return feat_binary
|
|
84
|
-
|
|
85
|
-
# Normal case: split into continuous and binary parts
|
|
86
|
-
feat_continuous = features_array[:-self.binary_features]
|
|
87
|
-
feat_binary = (features_array[-self.binary_features:] > 0.5).astype(int) #threshold binary values
|
|
88
|
-
new_feature_values = np.concatenate([feat_continuous, feat_binary])
|
|
89
|
-
return new_feature_values
|
|
68
|
+
if self.binary_features > 0:
|
|
69
|
+
split_idx = -self.binary_features
|
|
70
|
+
cont_part = features_array[:split_idx]
|
|
71
|
+
bin_part = features_array[split_idx:]
|
|
72
|
+
noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
|
|
73
|
+
cont_noised = cont_part * noise
|
|
74
|
+
return np.concatenate([cont_noised, bin_part])
|
|
75
|
+
else:
|
|
76
|
+
noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
|
|
77
|
+
return features_array * noise
|
|
90
78
|
|
|
91
79
|
def check_model(self):
|
|
92
80
|
if isinstance(self.model, ClassifierMixin) or isinstance(self.model, xgb.XGBClassifier) or isinstance(self.model, lgb.LGBMClassifier):
|
|
93
81
|
raise ValueError(f"[Model Check Failed] ❌\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")
|
|
94
82
|
if self.model is None:
|
|
95
83
|
raise ValueError("Loaded model is None")
|
|
96
|
-
if self.scaler is None:
|
|
97
|
-
raise ValueError("Loaded scaler is None")
|
|
98
84
|
|
|
99
85
|
def _get_from_artifact(self, key: str):
|
|
100
86
|
val = self._artifact.get(key)
|
|
@@ -105,7 +91,7 @@ class ObjectiveFunction():
|
|
|
105
91
|
return result
|
|
106
92
|
|
|
107
93
|
def __repr__(self):
|
|
108
|
-
return (f"<ObjectiveFunction(model={type(self.model).__name__},
|
|
94
|
+
return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
|
|
109
95
|
|
|
110
96
|
|
|
111
97
|
def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
|
|
@@ -142,11 +128,11 @@ def run_pso(lower_boundaries: list[float],
|
|
|
142
128
|
auto_binary_boundaries: bool=True,
|
|
143
129
|
target_name: Union[str, None]=None,
|
|
144
130
|
feature_names: Union[list[str], None]=None,
|
|
145
|
-
swarm_size: int=
|
|
146
|
-
max_iterations: int=
|
|
131
|
+
swarm_size: int=200,
|
|
132
|
+
max_iterations: int=400,
|
|
147
133
|
inequality_constrain_function=None,
|
|
148
|
-
post_hoc_analysis:
|
|
149
|
-
workers: int=
|
|
134
|
+
post_hoc_analysis: Optional[int]=3,
|
|
135
|
+
workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
|
|
150
136
|
"""
|
|
151
137
|
Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
|
|
152
138
|
|
|
@@ -157,7 +143,7 @@ def run_pso(lower_boundaries: list[float],
|
|
|
157
143
|
upper_boundaries : list[float]
|
|
158
144
|
Upper bounds for each feature in the search space (as many as features expected by the model).
|
|
159
145
|
objective_function : ObjectiveFunction
|
|
160
|
-
A callable object encapsulating a regression model
|
|
146
|
+
A callable object encapsulating a tree-based regression model.
|
|
161
147
|
save_results_dir : str
|
|
162
148
|
Directory path to save the results CSV file.
|
|
163
149
|
auto_binary_boundaries : bool
|
|
@@ -172,7 +158,7 @@ def run_pso(lower_boundaries: list[float],
|
|
|
172
158
|
Maximum number of iterations for the optimization algorithm.
|
|
173
159
|
inequality_constrain_function : callable or None, optional
|
|
174
160
|
Optional function defining inequality constraints to be respected by the optimization.
|
|
175
|
-
post_hoc_analysis : int or None
|
|
161
|
+
post_hoc_analysis : int or None
|
|
176
162
|
If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
|
|
177
163
|
workers : int
|
|
178
164
|
Number of parallel processes to use.
|
|
@@ -191,7 +177,6 @@ def run_pso(lower_boundaries: list[float],
|
|
|
191
177
|
Notes
|
|
192
178
|
-----
|
|
193
179
|
- PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
|
|
194
|
-
- Feature values are scaled before being passed to the model and inverse-transformed before result saving.
|
|
195
180
|
"""
|
|
196
181
|
# Append binary boundaries
|
|
197
182
|
binary_number = objective_function.binary_features
|
|
@@ -229,12 +214,15 @@ def run_pso(lower_boundaries: list[float],
|
|
|
229
214
|
best_features, best_target, *_ = _pso(**arguments)
|
|
230
215
|
# best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
|
|
231
216
|
|
|
232
|
-
#
|
|
233
|
-
|
|
234
|
-
|
|
217
|
+
# flip best_target if maximization was used
|
|
218
|
+
if objective_function.task == "maximization":
|
|
219
|
+
best_target = -best_target
|
|
220
|
+
|
|
221
|
+
# threshold binary features
|
|
222
|
+
best_features_threshold = threshold_binary_values(best_features, binary_number)
|
|
235
223
|
|
|
236
224
|
# name features
|
|
237
|
-
best_features_named = {name: value for name, value in zip(names,
|
|
225
|
+
best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
|
|
238
226
|
best_target_named = {target_name: best_target}
|
|
239
227
|
|
|
240
228
|
# save results
|
|
@@ -248,11 +236,14 @@ def run_pso(lower_boundaries: list[float],
|
|
|
248
236
|
best_features, best_target, *_ = _pso(**arguments)
|
|
249
237
|
# best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
|
|
250
238
|
|
|
251
|
-
#
|
|
252
|
-
|
|
253
|
-
|
|
239
|
+
# flip best_target if maximization was used
|
|
240
|
+
if objective_function.task == "maximization":
|
|
241
|
+
best_target = -best_target
|
|
242
|
+
|
|
243
|
+
# threshold binary features
|
|
244
|
+
best_features_threshold = threshold_binary_values(best_features, binary_number)
|
|
254
245
|
|
|
255
|
-
for i, best_feature in enumerate(
|
|
246
|
+
for i, best_feature in enumerate(best_features_threshold):
|
|
256
247
|
all_best_features[i].append(best_feature)
|
|
257
248
|
all_best_targets.append(best_target)
|
|
258
249
|
|
|
@@ -4,7 +4,7 @@ import pandas as pd
|
|
|
4
4
|
import os
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
import re
|
|
7
|
-
from typing import Literal
|
|
7
|
+
from typing import Literal, Union, Sequence
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
# Keep track of available tools
|
|
@@ -15,7 +15,8 @@ __all__ = [
|
|
|
15
15
|
"merge_dataframes",
|
|
16
16
|
"save_dataframe",
|
|
17
17
|
"normalize_mixed_list",
|
|
18
|
-
"sanitize_filename"
|
|
18
|
+
"sanitize_filename",
|
|
19
|
+
"threshold_binary_values"
|
|
19
20
|
]
|
|
20
21
|
|
|
21
22
|
|
|
@@ -263,6 +264,38 @@ def sanitize_filename(filename: str) -> str:
|
|
|
263
264
|
return sanitized
|
|
264
265
|
|
|
265
266
|
|
|
267
|
+
def threshold_binary_values(
|
|
268
|
+
input_array: Union[Sequence[float], np.ndarray],
|
|
269
|
+
binary_features: int
|
|
270
|
+
) -> np.ndarray:
|
|
271
|
+
"""
|
|
272
|
+
Thresholds binary features in a 1D numeric sequence. Binary features must be located at the end of the sequence.
|
|
273
|
+
|
|
274
|
+
Converts binary elements to values (0 or 1) using a threshold of 0.5. The rest of the array (assumed to be continuous features) is returned unchanged.
|
|
275
|
+
|
|
276
|
+
Parameters:
|
|
277
|
+
input_array (Union[Sequence[float], np.ndarray]) : A one-dimensional collection of numeric values. The binary features must be located at the end of the array.
|
|
278
|
+
|
|
279
|
+
binary_features (int) : Number of binary features to threshold from the end of the array. Must be between 0 and the total number of elements.
|
|
280
|
+
|
|
281
|
+
Returns:
|
|
282
|
+
np.ndarray : A 1D NumPy array where the final `binary_features` values have been binarized.
|
|
283
|
+
"""
|
|
284
|
+
array = np.asarray(input_array).flatten()
|
|
285
|
+
total = array.shape[0]
|
|
286
|
+
|
|
287
|
+
if binary_features < 0 or binary_features > total:
|
|
288
|
+
raise ValueError("Binary features must be between 0 and the total number of features.")
|
|
289
|
+
|
|
290
|
+
if binary_features == 0:
|
|
291
|
+
return array
|
|
292
|
+
|
|
293
|
+
cont_part = array[:-binary_features]
|
|
294
|
+
bin_part = (array[-binary_features:] > 0.5).astype(int)
|
|
295
|
+
|
|
296
|
+
return np.concatenate([cont_part, bin_part])
|
|
297
|
+
|
|
298
|
+
|
|
266
299
|
def _script_info(all_data: list[str]):
|
|
267
300
|
"""
|
|
268
301
|
List available names.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/dragon_ml_toolbox.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.2}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|