dragon-ml-toolbox 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- {dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/METADATA +26 -9
- dragon_ml_toolbox-3.0.0.dist-info/RECORD +25 -0
- ml_tools/ETL_engineering.py +8 -7
- ml_tools/GUI_tools.py +495 -0
- ml_tools/MICE_imputation.py +8 -4
- ml_tools/ML_callbacks.py +341 -0
- ml_tools/ML_evaluation.py +255 -0
- ml_tools/ML_trainer.py +344 -0
- ml_tools/ML_tutorial.py +300 -0
- ml_tools/PSO_optimization.py +27 -20
- ml_tools/RNN_forecast.py +49 -0
- ml_tools/VIF_factor.py +6 -5
- ml_tools/datasetmaster.py +601 -527
- ml_tools/ensemble_learning.py +12 -9
- ml_tools/handle_excel.py +9 -10
- ml_tools/logger.py +45 -8
- ml_tools/utilities.py +18 -1
- dragon_ml_toolbox-2.3.0.dist-info/RECORD +0 -21
- ml_tools/trainer.py +0 -346
- ml_tools/vision_helpers.py +0 -231
- {dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/top_level.txt +0 -0
- /ml_tools/{pytorch_models.py → _pytorch_models.py} +0 -0
ml_tools/ensemble_learning.py
CHANGED

@@ -20,6 +20,7 @@ from sklearn.metrics import accuracy_score, classification_report, ConfusionMatr
 import shap
 
 from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object, make_fullpath
+from .logger import _LOGGER
 
 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)

@@ -438,16 +439,16 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Task
     '''
     #DEBUG
     if debug:
-
+        _LOGGER.info(f"Split Dataframes Shapes - Features DF: {df_features.shape}, Target DF: {df_target.shape}")
         unique_values = df_target.unique() # Get unique values for the target column
-
+        _LOGGER.info(f"\tUnique values for '{df_target.name}': {unique_values}")
 
     #Train test split
     X_train, X_test, y_train, y_test = _split_data(features=df_features, target=df_target, test_size=test_size, random_state=random_state, task=task)
 
     #DEBUG
     if debug:
-
+        _LOGGER.info(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
 
 
     # Resample

@@ -458,7 +459,7 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Task
 
     #DEBUG
     if debug:
-
+        _LOGGER.info(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
 
     return X_train_oversampled, y_train_oversampled, X_test, y_test
 

@@ -864,7 +865,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
     print(f"\tTraining model: {model_name} for Target: {target_name}...")
     trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
     if debug:
-
+        _LOGGER.info(f"Trained model object: {type(trained_model)}")
     local_save_directory = _local_directories(model_name=model_name, dataset_id=dataset_id, save_dir=save_dir)
 
     if save_model:

@@ -885,11 +886,11 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
     else:
         raise ValueError(f"Unrecognized task '{task}' for model training,")
     if debug:
-
+        _LOGGER.info(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
 
     get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
                     features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
-
+
     return trained_model, y_pred
 
 ###### 5. Execution ######

@@ -902,7 +903,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
     elif isinstance(model_object, ClassificationTreeModels):
         task = "classification"
         if handle_classification_imbalance is None:
-
+            _LOGGER.warning("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
         elif handle_classification_imbalance == "by_model":
             model_object.use_model_balance = True
         else:

@@ -914,6 +915,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
     datasets_path = make_fullpath(datasets_dir)
     save_path = make_fullpath(save_dir, make=True)
 
+    _LOGGER.info("Training starting...")
     #Yield imputed dataset
     for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
         #Yield features dataframe and target dataframe

@@ -931,7 +933,8 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
                             test_features=X_test, test_target=y_test,
                             feature_names=feature_names,target_name=target_name,
                             debug=debug, save_dir=save_path, save_model=save_model)
-        print("
+        print("")
+    _LOGGER.info("✅ Training and evaluation complete.")
 
 
 def info():
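Every hunk above follows the same migration: bare debug `print` calls become calls on the shared module logger. A minimal sketch of the pattern, assuming a hypothetical helper `_report_shapes` (the helper name is illustrative; `_LOGGER` and the debug-gated message come from the diff):

```python
import pandas as pd

from ml_tools.logger import _LOGGER  # module-level logger introduced in 3.0.0

def _report_shapes(df_features: pd.DataFrame, debug: bool = False) -> None:
    """Illustrative stand-in for the debug blocks in dataset_pipeline."""
    if debug:
        # 2.3.0 used a bare print() here; 3.0.0 routes it through the logger
        _LOGGER.info(f"Features DF: {df_features.shape}")

_report_shapes(pd.DataFrame({"a": [1, 2, 3]}), debug=True)
```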
ml_tools/handle_excel.py
CHANGED

@@ -3,6 +3,7 @@ from openpyxl import load_workbook, Workbook
 import pandas as pd
 from typing import List, Optional, Union
 from .utilities import _script_info, sanitize_filename, make_fullpath
+from .logger import _LOGGER
 
 
 __all__ = [

@@ -95,10 +96,9 @@ def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:
         output_path = base_dir / output_filename
         new_wb.save(output_path)
 
-        # print(f"Saved: {output_path}")
         total_output_files += 1
 
-
+    _LOGGER.info(f"✅ Processed file: {file_path} into {total_output_files} output file(s).")
     return None
 
 

@@ -152,10 +152,9 @@ def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Uni
         output_path = global_output_path / output_filename
         new_wb.save(output_path)
 
-        # print(f"Saved: {output_path}")
         total_output_files += 1
 
-
+    _LOGGER.info(f"✅ Processed {len(excel_files)} input Excel file(s) with a total of {total_output_files} output Excel file(s).")
     return None
 
 

@@ -199,13 +198,13 @@ def validate_excel_schema(
             invalid_files.append(file)
 
         except Exception as e:
-
+            _LOGGER.error(f"Error processing '{file}': {e}")
             invalid_files.append(file)
 
     valid_excel_number = len(excel_paths) - len(invalid_files)
-
+    _LOGGER.info(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
     if invalid_files:
-
+        _LOGGER.warning(f"⚠️ {len(invalid_files)} excel files are invalid:")
         for in_file in invalid_files:
            print(f"  - {in_file.name}")
 

@@ -266,7 +265,7 @@ def vertical_merge_transform_excel(
         merged_df.columns = rename_columns
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
-
+    _LOGGER.info(f"✅ Merged {len(dataframes)} excel files into '{csv_filename}'.")
 
 
 def horizontal_merge_transform_excel(

@@ -344,9 +343,9 @@ def horizontal_merge_transform_excel(
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
 
-
+    _LOGGER.info(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
     if duplicate_columns:
-
+        _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
 
 
 def info():
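The split and merge helpers now log one summary line instead of the old commented-out per-file prints. A hedged usage sketch (paths are placeholders; the function names and parameters are taken from the hunk headers above):

```python
from ml_tools.handle_excel import unmerge_and_split_excel, unmerge_and_split_from_directory

# One workbook: logs "✅ Processed file: ... into N output file(s)." on completion.
unmerge_and_split_excel("data/report.xlsx")

# A whole directory: logs a single summary covering all input and output files.
unmerge_and_split_from_directory(input_dir="data/raw", output_dir="data/split")
```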
ml_tools/logger.py
CHANGED

@@ -6,6 +6,9 @@ from openpyxl.styles import Font, PatternFill
 import traceback
 import json
 from .utilities import sanitize_filename, _script_info, make_fullpath
+import logging
+import sys
+
 
 
 __all__ = [

@@ -62,30 +65,30 @@ def custom_logger(
         base_path = save_path / f"{log_name}_{timestamp}"
 
         if isinstance(data, list):
-            _log_list_to_txt(data, base_path
+            _log_list_to_txt(data, base_path.with_suffix(".txt"))
 
         elif isinstance(data, dict):
             if all(isinstance(v, list) for v in data.values()):
-                _log_dict_to_csv(data, base_path
+                _log_dict_to_csv(data, base_path.with_suffix(".csv"))
             else:
-                _log_dict_to_json(data, base_path
+                _log_dict_to_json(data, base_path.with_suffix(".json"))
 
         elif isinstance(data, pd.DataFrame):
-            _log_dataframe_to_xlsx(data, base_path
+            _log_dataframe_to_xlsx(data, base_path.with_suffix(".xlsx"))
 
         elif isinstance(data, str):
-            _log_string_to_log(data, base_path
+            _log_string_to_log(data, base_path.with_suffix(".log"))
 
         elif isinstance(data, BaseException):
-            _log_exception_to_log(data, base_path
+            _log_exception_to_log(data, base_path.with_suffix(".log"))
 
         else:
             raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")
 
-
+        _LOGGER.info(f"Log saved to: '{base_path}'")
 
     except Exception as e:
-
+        _LOGGER.error(f"Log not saved: {e}")
 
 
 def _log_list_to_txt(data: List[Any], path: Path) -> None:

@@ -154,3 +157,37 @@ def _log_dict_to_json(data: Dict[Any, Any], path: Path) -> None:
 
 def info():
     _script_info(__all__)
+
+
+def _get_logger(name: str = "ml_tools", level: int = logging.INFO):
+    """
+    Initializes and returns a configured logger instance.
+
+    - `logger.info()`
+    - `logger.warning()`
+    - `logger.error()` the program can potentially recover.
+    - `logger.critical()` the program is going to crash.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+
+    # Prevents adding handlers multiple times if the function is called again
+    if not logger.handlers:
+        handler = logging.StreamHandler(sys.stdout)
+
+        # Define the format string and the date format separately
+        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        date_format = '%Y-%m-%d %H:%M' # Format: Year-Month-Day Hour:Minute
+
+        # Pass both the format and the date format to the Formatter
+        formatter = logging.Formatter(log_format, datefmt=date_format)
+
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
+    logger.propagate = False
+
+    return logger
+
+# Create a single logger instance to be imported by other modules
+_LOGGER = _get_logger()
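With `_get_logger` in place, every ml_tools module shares one idempotently configured stream logger, and `custom_logger` now derives the file suffix from the payload type (list → .txt, dict of lists → .csv, other dict → .json, DataFrame → .xlsx, str or exception → .log) via `Path.with_suffix`. A short sketch of what the configuration above emits; the timestamp is illustrative, while the format string and `%Y-%m-%d %H:%M` date format are the ones added in this diff:

```python
from ml_tools.logger import _LOGGER

_LOGGER.info("Training starting...")
# stdout: 2025-06-01 12:00 - ml_tools - INFO - Training starting...
_LOGGER.warning("⚠️ Duplicate columns: ['id']")
# stdout: 2025-06-01 12:00 - ml_tools - WARNING - ⚠️ Duplicate columns: ['id']
```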
ml_tools/utilities.py
CHANGED

@@ -221,7 +221,8 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
     """
     datasets_path = make_fullpath(datasets_dir)
     for df_name, df_path in list_csv_paths(datasets_path).items():
-        df
+        df: pd.DataFrame
+        df, _ = load_dataframe(df_path, kind="pandas") # type: ignore
         yield df, df_name
 
 

@@ -596,6 +597,22 @@ def distribute_datasets_by_target(
         yield target, subset
 
 
+class LogKeys:
+    """
+    Used for ML scripts only
+
+    Centralized keys for logging and history.
+    """
+    # --- Epoch Level ---
+    TRAIN_LOSS = 'train_loss'
+    VAL_LOSS = 'val_loss'
+
+    # --- Batch Level ---
+    BATCH_LOSS = 'loss'
+    BATCH_INDEX = 'batch'
+    BATCH_SIZE = 'size'
+
+
 def _script_info(all_data: list[str]):
     """
     List available names.
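`LogKeys` gives the new ML_* scripts one shared vocabulary for history and batch logs instead of scattered string literals. A hedged sketch of the intended shape (the dictionaries below are assumptions for illustration; only the class and its constants appear in the diff):

```python
from ml_tools.utilities import LogKeys

# Assumed epoch-level history, keyed by the shared constants
history = {LogKeys.TRAIN_LOSS: [0.92, 0.71], LogKeys.VAL_LOSS: [0.95, 0.78]}

# Assumed batch-level record
batch_log = {LogKeys.BATCH_LOSS: 0.69, LogKeys.BATCH_INDEX: 3, LogKeys.BATCH_SIZE: 32}

print(history[LogKeys.VAL_LOSS][-1])  # 0.78
```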
dragon_ml_toolbox-2.3.0.dist-info/RECORD
DELETED

@@ -1,21 +0,0 @@
-dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
-ml_tools/ETL_engineering.py,sha256=ns8HsLWZhByurvjtUUW10p7If1h1O5-btUfCRXxzkME,31568
-ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
-ml_tools/PSO_optimization.py,sha256=gi56mF-q6BApYwhAd9jix0xiYz595WTPcUh7afZsRJ4,25378
-ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
-ml_tools/data_exploration.py,sha256=Fzbz_DKZ7F2e3-JbahLqKr3aP6lt9aCK9rNOHvR7nlA,23665
-ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
-ml_tools/ensemble_learning.py,sha256=q9jbu7SupvXz61sURFQ9V2-7gUsLbA3cSgyb2MQFyyc,37351
-ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,12944
-ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
-ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=T6AnNEQjUDnMAMSIJ8yZqToAVESIlEKK0bGBEm3sAUU,20670
-ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-2.3.0.dist-info/METADATA,sha256=4wivV_JKPd83xNzf6xzSfCwxiZgvYL5uW4yE6Da8tnU,2974
-dragon_ml_toolbox-2.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-2.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-2.3.0.dist-info/RECORD,,
ml_tools/trainer.py
DELETED

@@ -1,346 +0,0 @@
-import time
-import numpy
-from typing import Literal
-from torch.utils.data import DataLoader, Dataset
-import matplotlib.pyplot as plt
-import torch
-from torch import nn
-from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error
-from .utilities import _script_info
-
-
-__all__ = [
-    "MyTrainer"
-]
-
-
-class MyTrainer():
-    def __init__(self, model, train_dataset: Dataset, test_dataset: Dataset, kind: Literal["regression", "classification"],
-                 criterion=None , shuffle: bool=True, batch_size: float=3, device: Literal["cpu", "cuda", "mps"]='cpu', learn_rate: float=0.001, dataloader_workers: int=2):
-        """
-        Automates the training process of a PyTorch Model using Adam optimization by default (`self.optimizer`).
-
-        `kind`: Will be used to compute and display metrics after training is complete.
-
-        `shuffle`: Whether to shuffle dataset batches at every epoch. Default is True.
-
-        `criterion`: Loss function. If 'None', defaults to `nn.NLLLoss` for classification or `nn.MSELoss` for regression.
-
-        `batch_size` Represents the fraction of the original dataset size to be used per batch. If an integer is passed, use that many samples, instead. Default is 3 samples at a time.
-
-        `learn_rate` Model learning rate. Default is 0.001.
-
-        `dataloader_workers` Subprocesses to use for data loading. Default is 2.
-        """
-        # Validate kind
-        if kind not in ["regression", "classification"]:
-            raise TypeError("Kind must be 'regression' or 'classification'.")
-        # Validate batch size
-        batch_error = "Batch must a float in range [0.01, 1) or an integer."
-        if isinstance(batch_size, (float, int)):
-            if (1.00 > batch_size >= 0.01):
-                train_batch = int(len(train_dataset) * batch_size)
-                test_batch = int(len(test_dataset) * batch_size)
-            elif batch_size > len(train_dataset) or batch_size > len(test_dataset):
-                raise ValueError(batch_error + " Size is greater than dataset size.")
-            elif batch_size >= 1:
-                train_batch = int(batch_size)
-                test_batch = int(batch_size)
-            else:
-                raise ValueError(batch_error)
-        else:
-            raise TypeError(batch_error)
-        # Validate device
-        if device == "cuda":
-            if not torch.cuda.is_available():
-                print("CUDA not available, switching to CPU.")
-                device = "cpu"
-        elif device == "mps":
-            if not torch.backends.mps.is_available():
-                print("MPS not available, switching to CPU.")
-                device = "cpu"
-        # Validate criterion
-        if criterion is None:
-            if kind == "regression":
-                self.criterion = nn.MSELoss()
-            else:
-                self.criterion = nn.NLLLoss()
-        else:
-            self.criterion = criterion
-        # Validate dataloader workers
-        if not isinstance(dataloader_workers, int):
-            raise TypeError("Dataloader workers must be an integer value.")
-
-        # Check last layer in the model, implementation pending
-        # last_layer_name, last_layer = next(reversed(model._modules.items()))
-        # if isinstance(last_layer, nn.Linear):
-        #     pass
-
-        self.train_loader = DataLoader(dataset=train_dataset, batch_size=train_batch, shuffle=shuffle, num_workers=dataloader_workers, pin_memory=True if device=="cuda" else False)
-        self.test_loader = DataLoader(dataset=test_dataset, batch_size=test_batch, shuffle=shuffle, num_workers=dataloader_workers, pin_memory=True if device=="cuda" else False)
-        self.kind = kind
-        self.device = torch.device(device)
-        self.model = model.to(self.device)
-        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=learn_rate)
-
-
-    def auto_train(self, epochs: int=200, patience: int=3, cmap: Literal["viridis", "Blues", "Greens", "Reds", "plasma", "coolwarm"]="Blues",
-                   roc: bool=False, **model_params):
-        """
-        Start training-validation process of the model.
-
-        `patience` is the number of consecutive times the Validation Loss is allowed to increase before early-stopping the training process.
-
-        `cmap` Color map to use for the confusion matrix.
-
-        `model_params` Keywords parameters specific to the model, if any.
-
-        `roc` Whether to display the Receiver Operating Characteristic (ROC) Curve, for binary classification only.
-        """
-        metric_name = "accuracy" if self.kind == "classification" else "RMSE"
-        previous_val_loss = None
-        epoch_tracker = 0
-        warnings = 0
-        feedback = None
-        val_losses = list()
-        train_losses = list()
-
-        # Validate inputs
-        if isinstance(epochs, int):
-            if epochs < 1:
-                print("Invalid number of epochs")
-                return None
-        else:
-            print("Invalid number of epochs")
-            return None
-
-        if isinstance(patience, int):
-            if patience < 0:
-                print("Invalid value for patience")
-                return None
-        else:
-            print("Invalid value for patience")
-            return None
-
-        if cmap not in ["viridis", "Blues", "Greens", "Reds", "plasma", "coolwarm"]:
-            print("Invalid cmap code, 'coolwarm' selected by default")
-            cmap = "coolwarm"
-
-        # Time training
-        start_time = time.time()
-
-        for epoch in range(1, epochs+1):
-            # Train model
-            self.model.train()
-            current_train_loss = 0
-            # Keep track of predictions and true labels on the last epoch to use later on scikit-learn
-            predictions_list = list()
-            true_labels_list = list()
-            probabilities_list = list()
-
-            for features, target in self.train_loader:
-                # features, targets to device
-                features = features.to(self.device)
-                target = target.to(self.device)
-                self.optimizer.zero_grad()
-                output = self.model(features, **model_params)
-                # check shapes
-                # print(features.shape, target.shape, output.shape)
-                # For Binary Cross Entropy
-                if isinstance(self.criterion, (nn.BCELoss, nn.BCEWithLogitsLoss)):
-                    target = target.to(torch.float32)
-                elif isinstance(self.criterion, (nn.MSELoss)):
-                    output = output.view_as(target)
-                train_loss = self.criterion(output, target)
-                # Cumulative loss for current epoch on all batches
-                current_train_loss += train_loss.item()
-                # Backpropagation
-                train_loss.backward()
-                self.optimizer.step()
-
-            # Average Train Loss per sample
-            current_train_loss /= len(self.train_loader.dataset)
-            train_losses.append(current_train_loss)
-
-            # Evaluate
-            self.model.eval()
-            current_val_loss = 0
-            correct = 0
-            with torch.no_grad():
-                for features, target in self.test_loader:
-                    # features, targets to device
-                    features = features.to(self.device)
-                    target = target.to(self.device)
-                    output = self.model(features, **model_params)
-                    # Save true labels for current batch (in case random shuffle was used)
-                    true_labels_list.append(target.view(-1,1).cpu().numpy())
-                    # For Binary Cross Entropy
-                    if isinstance(self.criterion, (nn.BCELoss, nn.BCEWithLogitsLoss)):
-                        target = target.to(torch.float32)
-                    elif isinstance(self.criterion, (nn.MSELoss)):
-                        output = output.view_as(target)
-                    current_val_loss += self.criterion(output, target).item()
-                    # Save predictions of current batch, get accuracy
-                    if self.kind == "classification":
-                        predictions_list.append(output.argmax(dim=1).view(-1,1).cpu().numpy())
-                        correct += output.argmax(dim=1).eq(target).sum().item()
-                        if roc:
-                            probabilities_local = nn.functional.softmax(output, dim=1)
-                            probabilities_list.append(probabilities_local.cpu().numpy())
-                    else: # Regression
-                        predictions_list.append(output.view(-1,1).cpu().numpy())
-
-            # Average Validation Loss per sample
-            current_val_loss /= len(self.test_loader.dataset)
-            val_losses.append(current_val_loss)
-
-            # Concatenate all predictions and true labels
-            predictions = numpy.concatenate(predictions_list, axis=0)
-            true_labels = numpy.concatenate(true_labels_list, axis=0)
-            if roc:
-                probabilities = numpy.concatenate(probabilities_list, axis=0)
-
-            # Accuracy / RMSE
-            if self.kind == "classification":
-                accuracy = correct / len(self.test_loader.dataset)
-                accuracy = str(round(100*accuracy, ndigits=1)) + "%"
-            else: # Regression
-                accuracy = numpy.sqrt(mean_squared_error(y_true=true_labels, y_pred=predictions))
-                accuracy = str(round(accuracy, ndigits=4))
-
-            # Print details
-            details_format = f'epoch {epoch:2}: training loss: {current_train_loss:6.4f} validation loss: {current_val_loss:6.4f} {metric_name}: {accuracy}'
-            if (epoch % max(1, int(0.05*epochs)) == 0) or epoch in [1, 3, 5]:
-                print(details_format)
-
-            # Compare validation loss per epoch
-            # First run
-            if previous_val_loss is None:
-                previous_val_loss = current_val_loss
-            # If validation loss is increasing or the same (not improving) use patience
-            elif current_val_loss >= previous_val_loss:
-                if epoch == epoch_tracker + 1:
-                    warnings += 1
-                else:
-                    warnings = 1
-                epoch_tracker = epoch
-            # If validation loss decreased
-            else:
-                warnings = 0
-
-            # If patience is exhausted
-            if warnings == patience:
-                feedback = f"👁️ Validation Loss has increased {patience} consecutive times."
-                break
-
-            # Training must continue for another epoch
-            previous_val_loss = current_val_loss
-
-        # if all epochs have been completed
-        else:
-            feedback = "Training has been completed without any early-stopping criteria."
-
-        # Print feedback message
-        print('\n', details_format)
-        print(feedback, f"\n")
-
-        # Show elapsed time
-        elapsed_time = time.time() - start_time
-        minutes, seconds = divmod(elapsed_time, 60)
-        print(f"Elapsed time: {minutes:.0f} minutes {seconds:2.0f} seconds {epoch} epochs")
-
-        # Plot losses
-        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10,4), dpi=150, sharey=False)
-
-        ax1.plot(range(2, epoch+1), train_losses[1:])
-        ax1.set_title("Training Loss")
-        ax1.set_xlabel("Epochs")
-        ax1.set_ylabel("Average loss per sample")
-
-        ax2.plot(range(2, epoch+1), val_losses[1:])
-        ax2.set_title("Validation Loss")
-        ax2.set_xlabel("Epochs")
-        ax2.set_ylabel("Average loss per sample")
-
-        plt.tight_layout()
-        plt.show()
-
-        # Metrics
-        # Display metrics
-        if self.kind == "regression":
-            rmse = numpy.sqrt(mean_squared_error(y_true=true_labels, y_pred=predictions))
-            r2 = r2_score(y_true=true_labels, y_pred=predictions)
-            medae = median_absolute_error(y_true=true_labels, y_pred=predictions)
-            print(f"Root Mean Squared Error (RMSE): {rmse:6.4f} (range 0 to \u221E)")
-            print(f"Median Absolute Error (MedAE): {medae:6.4f} (range: 0 to \u221E)")
-            print(f"Coefficient of Determination (R2 Score): {r2:4.2f} (range: -\u221E to 1)\n")
-
-        elif self.kind == "classification":
-            print(classification_report(y_true=true_labels, y_pred=predictions))
-            ConfusionMatrixDisplay.from_predictions(y_true=true_labels, y_pred=predictions, cmap=cmap)
-
-            # ROC curve & Area under the curve
-            if roc:
-                false_positives, true_positives, thresholds = roc_curve(y_true=true_labels, y_score=probabilities[:,1])
-                area_under_curve = roc_auc_score(y_true=true_labels, y_score=probabilities[:,1])
-
-                plt.figure(figsize=(4,4))
-                plt.plot(false_positives, true_positives)
-                plt.title("Receiver Operating Characteristic (ROC) Curve")
-                plt.xlabel("False Positive Rate")
-                plt.ylabel("True Positive Rate")
-                plt.show()
-
-                print(f"Area under the curve score: {area_under_curve:4.2f}")
-        else:
-            print("Error encountered while retrieving 'model.kind' attribute.")
-
-
-    def rnn_forecast(self, sequence: torch.Tensor, steps: int):
-        """
-        Runs a sequential forecast for a RNN, where each new prediction is obtained by feeding the previous prediction.
-
-        The input tensor representing a sequence must be of shape `(sequence length, number of features)` with normalized values (if needed).
-
-        Args:
-            `sequence`: Last subsequence of the sequence.
-
-            `steps`: Number of future time steps to predict.
-
-        Returns: Numpy array of predictions.
-        """
-        self.model.eval()
-        with torch.no_grad():
-            # send sequence to device
-            sequence = sequence.to(self.device)
-            # Make a dummy list in memory
-            sequences = [torch.zeros_like(sequence, device=self.device, requires_grad=False) for _ in range(steps)]
-            sequences[0] = sequence
-            # Store predictions
-            predictions = list()
-            # Get predictions
-            for i in range(steps):
-                in_seq = sequences[i]
-                output = self.model(in_seq)
-                # Last timestamp
-                output = output[-1].view(1,-1)
-                # Save prediction
-                # Check if it is a single feature, get value
-                if output.shape[1] == 1:
-                    predictions.append(output.item())
-                # Else, return a list of lists
-                else:
-                    predictions.append(output.squeeze().cpu().tolist())
-                # Create next sequence
-                if i < steps-1:
-                    current_seq = sequences[i]
-                    new_seq = torch.concatenate([current_seq[1:], output], dim=0).to(self.device)
-                    sequences[i+1] = new_seq
-
-        # Cast to array and return
-        predictions = numpy.array(predictions)
-        return predictions
-
-
-def info():
-    _script_info(__all__)