dragon-ml-toolbox 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/METADATA +18 -2
- dragon_ml_toolbox-1.4.2.dist-info/RECORD +19 -0
- ml_tools/MICE_imputation.py +17 -2
- ml_tools/VIF_factor.py +29 -14
- ml_tools/data_exploration.py +68 -140
- ml_tools/datasetmaster.py +13 -1
- ml_tools/ensemble_learning.py +83 -82
- ml_tools/handle_excel.py +32 -9
- ml_tools/logger.py +10 -1
- ml_tools/particle_swarm_optimization.py +92 -64
- ml_tools/pytorch_models.py +13 -1
- ml_tools/trainer.py +10 -30
- ml_tools/utilities.py +133 -18
- ml_tools/vision_helpers.py +14 -1
- dragon_ml_toolbox-1.4.0.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/top_level.txt +0 -0
ml_tools/ensemble_learning.py
CHANGED
|
@@ -17,11 +17,10 @@ import xgboost as xgb
|
|
|
17
17
|
import lightgbm as lgb
|
|
18
18
|
|
|
19
19
|
from sklearn.model_selection import train_test_split
|
|
20
|
-
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
|
|
21
20
|
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
|
|
22
21
|
import shap
|
|
23
22
|
|
|
24
|
-
from .utilities import yield_dataframes_from_dir
|
|
23
|
+
from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
|
|
25
24
|
|
|
26
25
|
import warnings # Ignore warnings
|
|
27
26
|
warnings.filterwarnings('ignore', category=DeprecationWarning)
|
|
@@ -29,9 +28,21 @@ warnings.filterwarnings('ignore', category=FutureWarning)
|
|
|
29
28
|
warnings.filterwarnings('ignore', category=UserWarning)
|
|
30
29
|
|
|
31
30
|
|
|
31
|
+
__all__ = [
|
|
32
|
+
"get_models",
|
|
33
|
+
"dataset_pipeline",
|
|
34
|
+
"evaluate_model_classification",
|
|
35
|
+
"plot_roc_curve",
|
|
36
|
+
"evaluate_model_regression",
|
|
37
|
+
"get_shap_values",
|
|
38
|
+
"train_test_pipeline",
|
|
39
|
+
"run_ensemble_pipeline"
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
|
|
32
43
|
###### 1. Dataset Loader ######
|
|
33
44
|
#Split a dataset into features and targets datasets
|
|
34
|
-
def
|
|
45
|
+
def _dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
|
|
35
46
|
'''
|
|
36
47
|
Yields one Tuple at a time: `(df_features, df_target, feature_names, target_name)`
|
|
37
48
|
'''
|
|
@@ -144,22 +155,8 @@ def _split_data(features, target, test_size, random_state, task):
|
|
|
144
155
|
stratify=target if task=="classification" else None)
|
|
145
156
|
return X_train, X_test, y_train, y_test
|
|
146
157
|
|
|
147
|
-
# function to standardize the data
|
|
148
|
-
def _standardize_data(train_features, test_features, scaler_code):
|
|
149
|
-
if scaler_code == "standard":
|
|
150
|
-
scaler = StandardScaler()
|
|
151
|
-
elif scaler_code == "minmax":
|
|
152
|
-
scaler = MinMaxScaler()
|
|
153
|
-
elif scaler_code == "maxabs":
|
|
154
|
-
scaler = MaxAbsScaler()
|
|
155
|
-
else:
|
|
156
|
-
raise ValueError(f"Unrecognized scaler {scaler_code}")
|
|
157
|
-
train_scaled = scaler.fit_transform(train_features)
|
|
158
|
-
test_scaled = scaler.transform(test_features)
|
|
159
|
-
return train_scaled, test_scaled, scaler
|
|
160
|
-
|
|
161
158
|
# Over-sample minority class (Positive cases) and return several single target datasets (Classification)
|
|
162
|
-
def _resample(
|
|
159
|
+
def _resample(X_train: np.ndarray, y_train: pd.Series,
|
|
163
160
|
strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], random_state):
|
|
164
161
|
'''
|
|
165
162
|
Oversample minority class or undersample majority class.
|
|
@@ -177,21 +174,20 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
|
|
|
177
174
|
else:
|
|
178
175
|
raise ValueError(f"Invalid resampling strategy: {strategy}")
|
|
179
176
|
|
|
180
|
-
X_res, y_res, *_ = resample_algorithm.fit_resample(
|
|
177
|
+
X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
|
|
181
178
|
return X_res, y_res
|
|
182
179
|
|
|
183
180
|
# DATASET PIPELINE
|
|
184
181
|
def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Literal["classification", "regression"],
|
|
185
|
-
resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None],
|
|
182
|
+
resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None],
|
|
186
183
|
test_size: float=0.2, debug: bool=False, random_state: int=101):
|
|
187
184
|
'''
|
|
188
185
|
1. Make Train/Test splits
|
|
189
|
-
2.
|
|
190
|
-
3. Oversample imbalanced classes (classification)
|
|
186
|
+
2. Oversample imbalanced classes (classification)
|
|
191
187
|
|
|
192
|
-
Return a processed Tuple: (X_train, y_train, X_test, y_test
|
|
188
|
+
Return a processed Tuple: (X_train, y_train, X_test, y_test)
|
|
193
189
|
|
|
194
|
-
`(nD-array, 1D-array, nD-array, Series
|
|
190
|
+
`(nD-array, 1D-array, nD-array, Series)`
|
|
195
191
|
'''
|
|
196
192
|
#DEBUG
|
|
197
193
|
if debug:
|
|
@@ -206,24 +202,18 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
|
|
|
206
202
|
if debug:
|
|
207
203
|
print(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
|
|
208
204
|
|
|
209
|
-
# Standardize
|
|
210
|
-
X_train_scaled, X_test_scaled, scaler_object = _standardize_data(train_features=X_train, test_features=X_test, scaler_code=scaler)
|
|
211
|
-
|
|
212
|
-
#DEBUG
|
|
213
|
-
if debug:
|
|
214
|
-
print(f"Shapes after scaling features - X_train: {X_train_scaled.shape}, y_train: {y_train.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
|
|
215
205
|
|
|
216
|
-
#
|
|
206
|
+
# Resample
|
|
217
207
|
if resample_strategy is None or task == "regression":
|
|
218
|
-
X_train_oversampled, y_train_oversampled =
|
|
208
|
+
X_train_oversampled, y_train_oversampled = X_train, y_train
|
|
219
209
|
else:
|
|
220
|
-
X_train_oversampled, y_train_oversampled = _resample(
|
|
210
|
+
X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)
|
|
221
211
|
|
|
222
212
|
#DEBUG
|
|
223
213
|
if debug:
|
|
224
|
-
print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {
|
|
214
|
+
print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
|
|
225
215
|
|
|
226
|
-
return X_train_oversampled, y_train_oversampled,
|
|
216
|
+
return X_train_oversampled, y_train_oversampled, X_test, y_test
|
|
227
217
|
|
|
228
218
|
###### 4. Train and Evaluation ######
|
|
229
219
|
# Trainer function
|
|
@@ -244,9 +234,11 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
|
|
|
244
234
|
return model_dir
|
|
245
235
|
|
|
246
236
|
# save model
|
|
247
|
-
def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str
|
|
248
|
-
|
|
249
|
-
|
|
237
|
+
def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
|
|
238
|
+
#Sanitize filenames to save
|
|
239
|
+
sanitized_target_name = sanitize_filename(target_name)
|
|
240
|
+
full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
|
|
241
|
+
joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)
|
|
250
242
|
|
|
251
243
|
# function to evaluate the model and save metrics (Classification)
|
|
252
244
|
def evaluate_model_classification(
|
|
@@ -255,10 +247,9 @@ def evaluate_model_classification(
|
|
|
255
247
|
save_dir: str,
|
|
256
248
|
x_test_scaled: np.ndarray,
|
|
257
249
|
single_y_test: np.ndarray,
|
|
258
|
-
|
|
250
|
+
target_name: str,
|
|
259
251
|
figsize: tuple = (10, 8),
|
|
260
|
-
|
|
261
|
-
label_fontsize: int = 24,
|
|
252
|
+
base_fontsize: int = 24,
|
|
262
253
|
cmap: Colormap = plt.cm.Blues # type: ignore
|
|
263
254
|
) -> np.ndarray:
|
|
264
255
|
"""
|
|
@@ -269,8 +260,8 @@ def evaluate_model_classification(
|
|
|
269
260
|
model_name: Identifier for the model
|
|
270
261
|
save_dir: Directory where results are saved
|
|
271
262
|
x_test_scaled: Feature matrix for test set
|
|
272
|
-
single_y_test: True
|
|
273
|
-
|
|
263
|
+
single_y_test: True targets
|
|
264
|
+
target_name: Target name
|
|
274
265
|
figsize: Size of the confusion matrix figure (width, height)
|
|
275
266
|
fontsize: Font size used for title, axis labels and ticks
|
|
276
267
|
cmap: Color map for the confusion matrix. Examples include:
|
|
@@ -298,9 +289,10 @@ def evaluate_model_classification(
|
|
|
298
289
|
)
|
|
299
290
|
|
|
300
291
|
# Save text report
|
|
301
|
-
|
|
292
|
+
sanitized_target_name = sanitize_filename(target_name)
|
|
293
|
+
report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
|
|
302
294
|
with open(report_path, "w") as f:
|
|
303
|
-
f.write(f"{model_name} - {
|
|
295
|
+
f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
|
|
304
296
|
f.write("Classification Report:\n")
|
|
305
297
|
f.write(report) # type: ignore
|
|
306
298
|
|
|
@@ -315,20 +307,20 @@ def evaluate_model_classification(
|
|
|
315
307
|
ax=ax
|
|
316
308
|
)
|
|
317
309
|
|
|
318
|
-
ax.set_title(f"{model_name} - {
|
|
319
|
-
ax.tick_params(axis='both', labelsize=
|
|
320
|
-
ax.set_xlabel("Predicted label", fontsize=
|
|
321
|
-
ax.set_ylabel("True label", fontsize=
|
|
310
|
+
ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
|
|
311
|
+
ax.tick_params(axis='both', labelsize=base_fontsize)
|
|
312
|
+
ax.set_xlabel("Predicted label", fontsize=base_fontsize)
|
|
313
|
+
ax.set_ylabel("True label", fontsize=base_fontsize)
|
|
322
314
|
|
|
323
315
|
# Turn off gridlines
|
|
324
316
|
ax.grid(False)
|
|
325
317
|
|
|
326
318
|
# Manually update font size of cell texts
|
|
327
319
|
for text in ax.texts:
|
|
328
|
-
text.set_fontsize(
|
|
320
|
+
text.set_fontsize(base_fontsize+4)
|
|
329
321
|
|
|
330
322
|
fig.tight_layout()
|
|
331
|
-
fig_path = os.path.join(save_dir, f"Confusion_Matrix_{
|
|
323
|
+
fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
|
|
332
324
|
fig.savefig(fig_path, format="svg", bbox_inches="tight")
|
|
333
325
|
plt.close(fig)
|
|
334
326
|
|
|
@@ -353,7 +345,7 @@ def plot_roc_curve(
|
|
|
353
345
|
Parameters:
|
|
354
346
|
true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
|
|
355
347
|
probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
|
|
356
|
-
target_name: str,
|
|
348
|
+
target_name: str, Target name.
|
|
357
349
|
save_directory: str, path to directory where figure is saved.
|
|
358
350
|
color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
|
|
359
351
|
- Named colors: "darkorange", "blue", "red", "green", "black"
|
|
@@ -411,7 +403,8 @@ def plot_roc_curve(
|
|
|
411
403
|
|
|
412
404
|
# Save figure
|
|
413
405
|
os.makedirs(save_directory, exist_ok=True)
|
|
414
|
-
|
|
406
|
+
sanitized_target_name = sanitize_filename(target_name)
|
|
407
|
+
save_path = os.path.join(save_directory, f"ROC_{sanitized_target_name}.svg")
|
|
415
408
|
fig.savefig(save_path, bbox_inches="tight", format="svg")
|
|
416
409
|
|
|
417
410
|
return fig
|
|
@@ -421,7 +414,7 @@ def plot_roc_curve(
|
|
|
421
414
|
def evaluate_model_regression(model, model_name: str,
|
|
422
415
|
save_dir: str,
|
|
423
416
|
x_test_scaled: np.ndarray, single_y_test: np.ndarray,
|
|
424
|
-
|
|
417
|
+
target_name: str,
|
|
425
418
|
figure_size: tuple = (12, 8),
|
|
426
419
|
alpha_transparency: float = 0.5,
|
|
427
420
|
base_fontsize: int = 24):
|
|
@@ -435,9 +428,10 @@ def evaluate_model_regression(model, model_name: str,
|
|
|
435
428
|
r2 = r2_score(single_y_test, y_pred)
|
|
436
429
|
|
|
437
430
|
# Create formatted report
|
|
438
|
-
|
|
431
|
+
sanitized_target_name = sanitize_filename(target_name)
|
|
432
|
+
report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
|
|
439
433
|
with open(report_path, "w") as f:
|
|
440
|
-
f.write(f"{model_name} - {
|
|
434
|
+
f.write(f"{model_name} - {target_name} Regression Performance\n")
|
|
441
435
|
f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
|
|
442
436
|
f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
|
|
443
437
|
f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
|
|
@@ -450,10 +444,10 @@ def evaluate_model_regression(model, model_name: str,
|
|
|
450
444
|
plt.axhline(0, color='red', linestyle='--')
|
|
451
445
|
plt.xlabel("Predicted Values", fontsize=base_fontsize)
|
|
452
446
|
plt.ylabel("Residuals", fontsize=base_fontsize)
|
|
453
|
-
plt.title(f"{model_name} - Residual Plot for {
|
|
447
|
+
plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
|
|
454
448
|
plt.grid(True)
|
|
455
449
|
plt.tight_layout()
|
|
456
|
-
plt.savefig(os.path.join(save_dir, f"Residual_Plot_{
|
|
450
|
+
plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
|
|
457
451
|
plt.close()
|
|
458
452
|
|
|
459
453
|
# Create true vs predicted values plot
|
|
@@ -464,14 +458,15 @@ def evaluate_model_regression(model, model_name: str,
|
|
|
464
458
|
'k--', lw=2)
|
|
465
459
|
plt.xlabel('True Values', fontsize=base_fontsize)
|
|
466
460
|
plt.ylabel('Predictions', fontsize=base_fontsize)
|
|
467
|
-
plt.title(f"{model_name} - True vs Predicted for {
|
|
461
|
+
plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
|
|
468
462
|
plt.grid(True)
|
|
469
|
-
plot_path = os.path.join(save_dir, f"Regression_Plot_{
|
|
463
|
+
plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
|
|
470
464
|
plt.savefig(plot_path, bbox_inches='tight', format="svg")
|
|
471
465
|
plt.close()
|
|
472
466
|
|
|
473
467
|
return y_pred
|
|
474
468
|
|
|
469
|
+
|
|
475
470
|
# Get SHAP values
|
|
476
471
|
def get_shap_values(
|
|
477
472
|
model,
|
|
@@ -479,7 +474,7 @@ def get_shap_values(
|
|
|
479
474
|
save_dir: str,
|
|
480
475
|
features_to_explain: np.ndarray,
|
|
481
476
|
feature_names: list[str],
|
|
482
|
-
|
|
477
|
+
target_name: str,
|
|
483
478
|
task: Literal["classification", "regression"],
|
|
484
479
|
max_display_features: int = 10,
|
|
485
480
|
figsize: tuple = (16, 20),
|
|
@@ -498,7 +493,8 @@ def get_shap_values(
|
|
|
498
493
|
features_to_explain: Should match the model's training data format, including scaling.
|
|
499
494
|
save_dir: Directory to save visualizations
|
|
500
495
|
"""
|
|
501
|
-
|
|
496
|
+
sanitized_target_name = sanitize_filename(target_name)
|
|
497
|
+
|
|
502
498
|
def _apply_plot_style():
|
|
503
499
|
styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
|
|
504
500
|
for style in styles:
|
|
@@ -560,9 +556,9 @@ def get_shap_values(
|
|
|
560
556
|
_create_shap_plot(
|
|
561
557
|
shap_values=class_shap,
|
|
562
558
|
features=features_to_explain,
|
|
563
|
-
save_path=os.path.join(save_dir, f"SHAP_{
|
|
559
|
+
save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
|
|
564
560
|
plot_type=plot_type,
|
|
565
|
-
title=f"{model_name} - {
|
|
561
|
+
title=f"{model_name} - {target_name} (Class {class_name})"
|
|
566
562
|
)
|
|
567
563
|
else:
|
|
568
564
|
values = shap_values[1] if isinstance(shap_values, list) else shap_values
|
|
@@ -570,9 +566,9 @@ def get_shap_values(
|
|
|
570
566
|
_create_shap_plot(
|
|
571
567
|
shap_values=values,
|
|
572
568
|
features=features_to_explain,
|
|
573
|
-
save_path=os.path.join(save_dir, f"SHAP_{
|
|
569
|
+
save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
|
|
574
570
|
plot_type=plot_type,
|
|
575
|
-
title=f"{model_name} - {
|
|
571
|
+
title=f"{model_name} - {target_name}"
|
|
576
572
|
)
|
|
577
573
|
|
|
578
574
|
def _plot_for_regression(shap_values):
|
|
@@ -580,10 +576,11 @@ def get_shap_values(
|
|
|
580
576
|
_create_shap_plot(
|
|
581
577
|
shap_values=shap_values,
|
|
582
578
|
features=features_to_explain,
|
|
583
|
-
save_path=os.path.join(save_dir, f"SHAP_{
|
|
579
|
+
save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
|
|
584
580
|
plot_type=plot_type,
|
|
585
|
-
title=f"{model_name} - {
|
|
581
|
+
title=f"{model_name} - {target_name}"
|
|
586
582
|
)
|
|
583
|
+
#START_O
|
|
587
584
|
|
|
588
585
|
explainer = shap.TreeExplainer(model)
|
|
589
586
|
shap_values = explainer.shap_values(features_to_explain)
|
|
@@ -602,7 +599,7 @@ def get_shap_values(
|
|
|
602
599
|
def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
|
|
603
600
|
train_features: np.ndarray, train_target: np.ndarray,
|
|
604
601
|
test_features: np.ndarray, test_target: np.ndarray,
|
|
605
|
-
feature_names: list[str],
|
|
602
|
+
feature_names: list[str], target_name: str,
|
|
606
603
|
save_dir: str,
|
|
607
604
|
debug: bool=False, save_model: bool=False):
|
|
608
605
|
'''
|
|
@@ -612,7 +609,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
|
|
|
612
609
|
|
|
613
610
|
Returns: Tuple(Trained model, Test-set Predictions)
|
|
614
611
|
'''
|
|
615
|
-
print(f"\tModel: {model_name} for Target: {
|
|
612
|
+
print(f"\tModel: {model_name} for Target: {target_name}...")
|
|
616
613
|
trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
|
|
617
614
|
if debug:
|
|
618
615
|
print(f"Trained model object: {type(trained_model)}")
|
|
@@ -620,42 +617,42 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
|
|
|
620
617
|
|
|
621
618
|
if save_model:
|
|
622
619
|
_save_model(trained_model=trained_model, model_name=model_name,
|
|
623
|
-
target_name=
|
|
624
|
-
save_directory=local_save_directory
|
|
620
|
+
target_name=target_name, feature_names=feature_names,
|
|
621
|
+
save_directory=local_save_directory)
|
|
625
622
|
|
|
626
623
|
if task == "classification":
|
|
627
624
|
y_pred = evaluate_model_classification(model=trained_model, model_name=model_name, save_dir=local_save_directory,
|
|
628
|
-
x_test_scaled=test_features, single_y_test=test_target,
|
|
625
|
+
x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
|
|
629
626
|
plot_roc_curve(true_labels=test_target,
|
|
630
627
|
probabilities_or_model=trained_model, model_name=model_name,
|
|
631
|
-
target_name=
|
|
628
|
+
target_name=target_name, save_directory=local_save_directory,
|
|
632
629
|
input_features=test_features)
|
|
633
630
|
elif task == "regression":
|
|
634
631
|
y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
|
|
635
|
-
x_test_scaled=test_features, single_y_test=test_target,
|
|
632
|
+
x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
|
|
636
633
|
else:
|
|
637
634
|
raise ValueError(f"Unrecognized task '{task}' for model training,")
|
|
638
635
|
if debug:
|
|
639
636
|
print(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
|
|
640
637
|
|
|
641
638
|
get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
|
|
642
|
-
features_to_explain=train_features, feature_names=feature_names,
|
|
639
|
+
features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
|
|
643
640
|
print("\t...done.")
|
|
644
641
|
return trained_model, y_pred
|
|
645
642
|
|
|
646
643
|
###### 5. Execution ######
|
|
647
644
|
def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
|
|
648
|
-
resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None,
|
|
645
|
+
resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, save_model: bool=False,
|
|
649
646
|
test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
|
|
650
647
|
#Check paths
|
|
651
648
|
_check_paths(datasets_dir, save_dir)
|
|
652
649
|
#Yield imputed dataset
|
|
653
650
|
for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
|
|
654
651
|
#Yield features dataframe and target dataframe
|
|
655
|
-
for df_features, df_target, feature_names, target_name in
|
|
652
|
+
for df_features, df_target, feature_names, target_name in _dataset_yielder(df=dataframe, target_cols=target_columns):
|
|
656
653
|
#Dataset pipeline
|
|
657
|
-
X_train, y_train, X_test, y_test
|
|
658
|
-
resample_strategy=resample_strategy,
|
|
654
|
+
X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
|
|
655
|
+
resample_strategy=resample_strategy,
|
|
659
656
|
test_size=test_size, debug=debug, random_state=random_state)
|
|
660
657
|
#Get models
|
|
661
658
|
models_dict = get_models(task=task, is_balanced=False if resample_strategy is None else True,
|
|
@@ -665,13 +662,17 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
|
|
|
665
662
|
train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
|
|
666
663
|
train_features=X_train, train_target=y_train, # type: ignore
|
|
667
664
|
test_features=X_test, test_target=y_test,
|
|
668
|
-
feature_names=feature_names,
|
|
665
|
+
feature_names=feature_names,target_name=target_name,
|
|
669
666
|
debug=debug, save_dir=save_dir, save_model=save_model)
|
|
670
667
|
print("\n✅ Training and evaluation complete.")
|
|
671
668
|
|
|
672
669
|
|
|
673
670
|
def _check_paths(datasets_dir: str, save_dir:str):
|
|
674
671
|
if not os.path.isdir(save_dir):
|
|
675
|
-
os.makedirs(save_dir)
|
|
672
|
+
os.makedirs(save_dir)
|
|
676
673
|
if not os.path.isdir(datasets_dir):
|
|
677
674
|
raise IOError(f"Datasets directory '{datasets_dir}' not found.")
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
def info():
|
|
678
|
+
_script_info(__all__)
|
ml_tools/handle_excel.py
CHANGED
|
@@ -2,6 +2,16 @@ import os
|
|
|
2
2
|
from openpyxl import load_workbook, Workbook
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from typing import List, Optional
|
|
5
|
+
from utilities import _script_info, sanitize_filename
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"unmerge_and_split_excel",
|
|
10
|
+
"unmerge_and_split_from_directory",
|
|
11
|
+
"validate_excel_schema",
|
|
12
|
+
"vertical_merge_transform_excel",
|
|
13
|
+
"horizontal_merge_transform_excel"
|
|
14
|
+
]
|
|
5
15
|
|
|
6
16
|
|
|
7
17
|
def unmerge_and_split_excel(filepath: str) -> None:
|
|
@@ -25,12 +35,12 @@ def unmerge_and_split_excel(filepath: str) -> None:
|
|
|
25
35
|
ws = wb[sheet_name]
|
|
26
36
|
new_wb = Workbook()
|
|
27
37
|
new_ws = new_wb.active
|
|
28
|
-
new_ws.title = sheet_name
|
|
38
|
+
new_ws.title = sheet_name # type: ignore
|
|
29
39
|
|
|
30
40
|
# Copy all cell values
|
|
31
41
|
for row in ws.iter_rows():
|
|
32
42
|
for cell in row:
|
|
33
|
-
new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
|
|
43
|
+
new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore
|
|
34
44
|
|
|
35
45
|
# Fill and unmerge merged regions
|
|
36
46
|
for merged_range in list(ws.merged_cells.ranges):
|
|
@@ -41,10 +51,10 @@ def unmerge_and_split_excel(filepath: str) -> None:
|
|
|
41
51
|
value = ws.cell(row=min_row, column=min_col).value
|
|
42
52
|
for row in range(min_row, max_row + 1):
|
|
43
53
|
for col in range(min_col, max_col + 1):
|
|
44
|
-
new_ws.cell(row=row, column=col, value=value)
|
|
54
|
+
new_ws.cell(row=row, column=col, value=value) # type: ignore
|
|
45
55
|
|
|
46
56
|
# Construct flat output file name
|
|
47
|
-
sanitized_sheet_name = sheet_name
|
|
57
|
+
sanitized_sheet_name = sanitize_filename(sheet_name)
|
|
48
58
|
output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
|
|
49
59
|
output_path = os.path.join(base_dir, output_filename)
|
|
50
60
|
new_wb.save(output_path)
|
|
@@ -85,12 +95,12 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
|
|
|
85
95
|
ws = wb[sheet_name]
|
|
86
96
|
new_wb = Workbook()
|
|
87
97
|
new_ws = new_wb.active
|
|
88
|
-
new_ws.title = sheet_name
|
|
98
|
+
new_ws.title = sheet_name # type: ignore
|
|
89
99
|
|
|
90
100
|
# Copy all cell values
|
|
91
101
|
for row in ws.iter_rows():
|
|
92
102
|
for cell in row:
|
|
93
|
-
new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
|
|
103
|
+
new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore
|
|
94
104
|
|
|
95
105
|
# Fill and unmerge merged regions
|
|
96
106
|
for merged_range in list(ws.merged_cells.ranges):
|
|
@@ -101,10 +111,10 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
|
|
|
101
111
|
value = ws.cell(row=min_row, column=min_col).value
|
|
102
112
|
for row in range(min_row, max_row + 1):
|
|
103
113
|
for col in range(min_col, max_col + 1):
|
|
104
|
-
new_ws.cell(row=row, column=col, value=value)
|
|
114
|
+
new_ws.cell(row=row, column=col, value=value) # type: ignore
|
|
105
115
|
|
|
106
116
|
# Construct flat output file name
|
|
107
|
-
sanitized_sheet_name = sheet_name
|
|
117
|
+
sanitized_sheet_name = sanitize_filename(sheet_name)
|
|
108
118
|
output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
|
|
109
119
|
output_path = os.path.join(output_dir, output_filename)
|
|
110
120
|
new_wb.save(output_path)
|
|
@@ -151,7 +161,7 @@ def validate_excel_schema(
|
|
|
151
161
|
wb = load_workbook(file_path, read_only=True)
|
|
152
162
|
ws = wb.active # Only check the first worksheet
|
|
153
163
|
|
|
154
|
-
header = [cell.value for cell in next(ws.iter_rows(max_row=1))]
|
|
164
|
+
header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
|
|
155
165
|
|
|
156
166
|
if strict:
|
|
157
167
|
if header != expected_columns:
|
|
@@ -202,6 +212,11 @@ def vertical_merge_transform_excel(
|
|
|
202
212
|
|
|
203
213
|
if not excel_files:
|
|
204
214
|
raise ValueError("No Excel files found in the target directory.")
|
|
215
|
+
|
|
216
|
+
# sanitize filename
|
|
217
|
+
csv_filename = sanitize_filename(csv_filename)
|
|
218
|
+
# make directory
|
|
219
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
205
220
|
|
|
206
221
|
csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
|
|
207
222
|
csv_path = os.path.join(output_dir, csv_filename)
|
|
@@ -260,6 +275,11 @@ def horizontal_merge_transform_excel(
|
|
|
260
275
|
excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
|
|
261
276
|
if not excel_files:
|
|
262
277
|
raise ValueError("No Excel files found in the target directory.")
|
|
278
|
+
|
|
279
|
+
# sanitize filename
|
|
280
|
+
csv_filename = sanitize_filename(csv_filename)
|
|
281
|
+
# make directory
|
|
282
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
263
283
|
|
|
264
284
|
csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
|
|
265
285
|
csv_path = os.path.join(output_dir, csv_filename)
|
|
@@ -308,3 +328,6 @@ def horizontal_merge_transform_excel(
|
|
|
308
328
|
if duplicate_columns:
|
|
309
329
|
print(f"⚠️ Duplicate columns: {duplicate_columns}")
|
|
310
330
|
|
|
331
|
+
|
|
332
|
+
def info():
|
|
333
|
+
_script_info(__all__)
|
ml_tools/logger.py
CHANGED
|
@@ -5,7 +5,12 @@ import pandas as pd
|
|
|
5
5
|
from openpyxl.styles import Font, PatternFill
|
|
6
6
|
import traceback
|
|
7
7
|
import json
|
|
8
|
-
from ml_tools.utilities import sanitize_filename
|
|
8
|
+
from ml_tools.utilities import sanitize_filename, _script_info
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"custom_logger"
|
|
13
|
+
]
|
|
9
14
|
|
|
10
15
|
|
|
11
16
|
def custom_logger(
|
|
@@ -143,3 +148,7 @@ def _log_exception_to_log(exc: BaseException, path: str) -> None:
|
|
|
143
148
|
def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
|
|
144
149
|
with open(path, 'w', encoding='utf-8') as f:
|
|
145
150
|
json.dump(data, f, indent=4, ensure_ascii=False)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def info():
|
|
154
|
+
_script_info(__all__)
|