dragon-ml-toolbox 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,7 @@ from sklearn.metrics import accuracy_score, classification_report, ConfusionMatr
20
20
  import shap
21
21
 
22
22
  from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object, make_fullpath
23
+ from .logger import _LOGGER
23
24
 
24
25
  import warnings # Ignore warnings
25
26
  warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -438,16 +439,16 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Task
438
439
  '''
439
440
  #DEBUG
440
441
  if debug:
441
- print(f"Split Dataframes Shapes - Features DF: {df_features.shape}, Target DF: {df_target.shape}")
442
+ _LOGGER.info(f"Split Dataframes Shapes - Features DF: {df_features.shape}, Target DF: {df_target.shape}")
442
443
  unique_values = df_target.unique() # Get unique values for the target column
443
- print(f"\tUnique values for '{df_target.name}': {unique_values}")
444
+ _LOGGER.info(f"\tUnique values for '{df_target.name}': {unique_values}")
444
445
 
445
446
  #Train test split
446
447
  X_train, X_test, y_train, y_test = _split_data(features=df_features, target=df_target, test_size=test_size, random_state=random_state, task=task)
447
448
 
448
449
  #DEBUG
449
450
  if debug:
450
- print(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
451
+ _LOGGER.info(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
451
452
 
452
453
 
453
454
  # Resample
@@ -458,7 +459,7 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Task
458
459
 
459
460
  #DEBUG
460
461
  if debug:
461
- print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
462
+ _LOGGER.info(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
462
463
 
463
464
  return X_train_oversampled, y_train_oversampled, X_test, y_test
464
465
 
@@ -864,7 +865,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
864
865
  print(f"\tTraining model: {model_name} for Target: {target_name}...")
865
866
  trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
866
867
  if debug:
867
- print(f"Trained model object: {type(trained_model)}")
868
+ _LOGGER.info(f"Trained model object: {type(trained_model)}")
868
869
  local_save_directory = _local_directories(model_name=model_name, dataset_id=dataset_id, save_dir=save_dir)
869
870
 
870
871
  if save_model:
@@ -885,11 +886,11 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
885
886
  else:
886
887
  raise ValueError(f"Unrecognized task '{task}' for model training,")
887
888
  if debug:
888
- print(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
889
+ _LOGGER.info(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
889
890
 
890
891
  get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
891
892
  features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
892
- # print("\t...done.")
893
+
893
894
  return trained_model, y_pred
894
895
 
895
896
  ###### 5. Execution ######
@@ -902,7 +903,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
902
903
  elif isinstance(model_object, ClassificationTreeModels):
903
904
  task = "classification"
904
905
  if handle_classification_imbalance is None:
905
- print("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
906
+ _LOGGER.warning("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
906
907
  elif handle_classification_imbalance == "by_model":
907
908
  model_object.use_model_balance = True
908
909
  else:
@@ -914,6 +915,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
914
915
  datasets_path = make_fullpath(datasets_dir)
915
916
  save_path = make_fullpath(save_dir, make=True)
916
917
 
918
+ _LOGGER.info("Training starting...")
917
919
  #Yield imputed dataset
918
920
  for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
919
921
  #Yield features dataframe and target dataframe
@@ -931,7 +933,8 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
931
933
  test_features=X_test, test_target=y_test,
932
934
  feature_names=feature_names,target_name=target_name,
933
935
  debug=debug, save_dir=save_path, save_model=save_model)
934
- print("\n✅ Training and evaluation complete.")
936
+ print("")
937
+ _LOGGER.info("✅ Training and evaluation complete.")
935
938
 
936
939
 
937
940
  def info():
ml_tools/handle_excel.py CHANGED
@@ -3,6 +3,7 @@ from openpyxl import load_workbook, Workbook
3
3
  import pandas as pd
4
4
  from typing import List, Optional, Union
5
5
  from .utilities import _script_info, sanitize_filename, make_fullpath
6
+ from .logger import _LOGGER
6
7
 
7
8
 
8
9
  __all__ = [
@@ -95,10 +96,9 @@ def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:
95
96
  output_path = base_dir / output_filename
96
97
  new_wb.save(output_path)
97
98
 
98
- # print(f"Saved: {output_path}")
99
99
  total_output_files += 1
100
100
 
101
- print(f"✅ Processed file: {file_path} into {total_output_files} output file(s).")
101
+ _LOGGER.info(f"✅ Processed file: {file_path} into {total_output_files} output file(s).")
102
102
  return None
103
103
 
104
104
 
@@ -152,10 +152,9 @@ def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Uni
152
152
  output_path = global_output_path / output_filename
153
153
  new_wb.save(output_path)
154
154
 
155
- # print(f"Saved: {output_path}")
156
155
  total_output_files += 1
157
156
 
158
- print(f"✅ Processed {len(excel_files)} input Excel file(s) with a total of {total_output_files} output Excel file(s).")
157
+ _LOGGER.info(f"✅ Processed {len(excel_files)} input Excel file(s) with a total of {total_output_files} output Excel file(s).")
159
158
  return None
160
159
 
161
160
 
@@ -199,13 +198,13 @@ def validate_excel_schema(
199
198
  invalid_files.append(file)
200
199
 
201
200
  except Exception as e:
202
- print(f"Error processing '{file}': {e}")
201
+ _LOGGER.error(f"Error processing '{file}': {e}")
203
202
  invalid_files.append(file)
204
203
 
205
204
  valid_excel_number = len(excel_paths) - len(invalid_files)
206
- print(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
205
+ _LOGGER.info(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
207
206
  if invalid_files:
208
- print(f"⚠️ {len(invalid_files)} excel files are invalid:")
207
+ _LOGGER.warning(f"⚠️ {len(invalid_files)} excel files are invalid:")
209
208
  for in_file in invalid_files:
210
209
  print(f" - {in_file.name}")
211
210
 
@@ -266,7 +265,7 @@ def vertical_merge_transform_excel(
266
265
  merged_df.columns = rename_columns
267
266
 
268
267
  merged_df.to_csv(csv_path, index=False, encoding='utf-8')
269
- print(f"✅ Merged {len(dataframes)} excel files into '{csv_filename}'.")
268
+ _LOGGER.info(f"✅ Merged {len(dataframes)} excel files into '{csv_filename}'.")
270
269
 
271
270
 
272
271
  def horizontal_merge_transform_excel(
@@ -344,9 +343,9 @@ def horizontal_merge_transform_excel(
344
343
 
345
344
  merged_df.to_csv(csv_path, index=False, encoding='utf-8')
346
345
 
347
- print(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
346
+ _LOGGER.info(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
348
347
  if duplicate_columns:
349
- print(f"⚠️ Duplicate columns: {duplicate_columns}")
348
+ _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
350
349
 
351
350
 
352
351
  def info():
ml_tools/logger.py CHANGED
@@ -6,6 +6,9 @@ from openpyxl.styles import Font, PatternFill
6
6
  import traceback
7
7
  import json
8
8
  from .utilities import sanitize_filename, _script_info, make_fullpath
9
+ import logging
10
+ import sys
11
+
9
12
 
10
13
 
11
14
  __all__ = [
@@ -62,30 +65,30 @@ def custom_logger(
62
65
  base_path = save_path / f"{log_name}_{timestamp}"
63
66
 
64
67
  if isinstance(data, list):
65
- _log_list_to_txt(data, base_path + ".txt")
68
+ _log_list_to_txt(data, base_path.with_suffix(".txt"))
66
69
 
67
70
  elif isinstance(data, dict):
68
71
  if all(isinstance(v, list) for v in data.values()):
69
- _log_dict_to_csv(data, base_path + ".csv")
72
+ _log_dict_to_csv(data, base_path.with_suffix(".csv"))
70
73
  else:
71
- _log_dict_to_json(data, base_path + ".json")
74
+ _log_dict_to_json(data, base_path.with_suffix(".json"))
72
75
 
73
76
  elif isinstance(data, pd.DataFrame):
74
- _log_dataframe_to_xlsx(data, base_path + ".xlsx")
77
+ _log_dataframe_to_xlsx(data, base_path.with_suffix(".xlsx"))
75
78
 
76
79
  elif isinstance(data, str):
77
- _log_string_to_log(data, base_path + ".log")
80
+ _log_string_to_log(data, base_path.with_suffix(".log"))
78
81
 
79
82
  elif isinstance(data, BaseException):
80
- _log_exception_to_log(data, base_path + ".log")
83
+ _log_exception_to_log(data, base_path.with_suffix(".log"))
81
84
 
82
85
  else:
83
86
  raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")
84
87
 
85
- print(f"Log saved to: '{base_path}'")
88
+ _LOGGER.info(f"Log saved to: '{base_path}'")
86
89
 
87
90
  except Exception as e:
88
- print(f"Error in custom_logger: {e}")
91
+ _LOGGER.error(f"Log not saved: {e}")
89
92
 
90
93
 
91
94
  def _log_list_to_txt(data: List[Any], path: Path) -> None:
@@ -154,3 +157,37 @@ def _log_dict_to_json(data: Dict[Any, Any], path: Path) -> None:
154
157
 
155
158
  def info():
156
159
  _script_info(__all__)
160
+
161
+
162
+ def _get_logger(name: str = "ml_tools", level: int = logging.INFO):
163
+ """
164
+ Initializes and returns a configured logger instance.
165
+
166
+ - `logger.info()`
167
+ - `logger.warning()`
168
+ - `logger.error()` the program can potentially recover.
169
+ - `logger.critical()` the program is going to crash.
170
+ """
171
+ logger = logging.getLogger(name)
172
+ logger.setLevel(level)
173
+
174
+ # Prevents adding handlers multiple times if the function is called again
175
+ if not logger.handlers:
176
+ handler = logging.StreamHandler(sys.stdout)
177
+
178
+ # Define the format string and the date format separately
179
+ log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
180
+ date_format = '%Y-%m-%d %H:%M' # Format: Year-Month-Day Hour:Minute
181
+
182
+ # Pass both the format and the date format to the Formatter
183
+ formatter = logging.Formatter(log_format, datefmt=date_format)
184
+
185
+ handler.setFormatter(formatter)
186
+ logger.addHandler(handler)
187
+
188
+ logger.propagate = False
189
+
190
+ return logger
191
+
192
+ # Create a single logger instance to be imported by other modules
193
+ _LOGGER = _get_logger()
ml_tools/utilities.py CHANGED
@@ -221,7 +221,8 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
221
221
  """
222
222
  datasets_path = make_fullpath(datasets_dir)
223
223
  for df_name, df_path in list_csv_paths(datasets_path).items():
224
- df, _ = load_dataframe(df_path)
224
+ df: pd.DataFrame
225
+ df, _ = load_dataframe(df_path, kind="pandas") # type: ignore
225
226
  yield df, df_name
226
227
 
227
228
 
@@ -596,6 +597,22 @@ def distribute_datasets_by_target(
596
597
  yield target, subset
597
598
 
598
599
 
600
+ class LogKeys:
601
+ """
602
+ Used for ML scripts only
603
+
604
+ Centralized keys for logging and history.
605
+ """
606
+ # --- Epoch Level ---
607
+ TRAIN_LOSS = 'train_loss'
608
+ VAL_LOSS = 'val_loss'
609
+
610
+ # --- Batch Level ---
611
+ BATCH_LOSS = 'loss'
612
+ BATCH_INDEX = 'batch'
613
+ BATCH_SIZE = 'size'
614
+
615
+
599
616
  def _script_info(all_data: list[str]):
600
617
  """
601
618
  List available names.
@@ -1,21 +0,0 @@
1
- dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
- ml_tools/ETL_engineering.py,sha256=ns8HsLWZhByurvjtUUW10p7If1h1O5-btUfCRXxzkME,31568
4
- ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
5
- ml_tools/PSO_optimization.py,sha256=gi56mF-q6BApYwhAd9jix0xiYz595WTPcUh7afZsRJ4,25378
6
- ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
7
- ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
9
- ml_tools/data_exploration.py,sha256=Fzbz_DKZ7F2e3-JbahLqKr3aP6lt9aCK9rNOHvR7nlA,23665
10
- ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
11
- ml_tools/ensemble_learning.py,sha256=q9jbu7SupvXz61sURFQ9V2-7gUsLbA3cSgyb2MQFyyc,37351
12
- ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,12944
13
- ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
14
- ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
15
- ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
16
- ml_tools/utilities.py,sha256=T6AnNEQjUDnMAMSIJ8yZqToAVESIlEKK0bGBEm3sAUU,20670
17
- ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
18
- dragon_ml_toolbox-2.3.0.dist-info/METADATA,sha256=4wivV_JKPd83xNzf6xzSfCwxiZgvYL5uW4yE6Da8tnU,2974
19
- dragon_ml_toolbox-2.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- dragon_ml_toolbox-2.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
21
- dragon_ml_toolbox-2.3.0.dist-info/RECORD,,
ml_tools/trainer.py DELETED
@@ -1,346 +0,0 @@
1
- import time
2
- import numpy
3
- from typing import Literal
4
- from torch.utils.data import DataLoader, Dataset
5
- import matplotlib.pyplot as plt
6
- import torch
7
- from torch import nn
8
- from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error
9
- from .utilities import _script_info
10
-
11
-
12
- __all__ = [
13
- "MyTrainer"
14
- ]
15
-
16
-
17
- class MyTrainer():
18
- def __init__(self, model, train_dataset: Dataset, test_dataset: Dataset, kind: Literal["regression", "classification"],
19
- criterion=None , shuffle: bool=True, batch_size: float=3, device: Literal["cpu", "cuda", "mps"]='cpu', learn_rate: float=0.001, dataloader_workers: int=2):
20
- """
21
- Automates the training process of a PyTorch Model using Adam optimization by default (`self.optimizer`).
22
-
23
- `kind`: Will be used to compute and display metrics after training is complete.
24
-
25
- `shuffle`: Whether to shuffle dataset batches at every epoch. Default is True.
26
-
27
- `criterion`: Loss function. If 'None', defaults to `nn.NLLLoss` for classification or `nn.MSELoss` for regression.
28
-
29
- `batch_size` Represents the fraction of the original dataset size to be used per batch. If an integer is passed, use that many samples, instead. Default is 3 samples at a time.
30
-
31
- `learn_rate` Model learning rate. Default is 0.001.
32
-
33
- `dataloader_workers` Subprocesses to use for data loading. Default is 2.
34
- """
35
- # Validate kind
36
- if kind not in ["regression", "classification"]:
37
- raise TypeError("Kind must be 'regression' or 'classification'.")
38
- # Validate batch size
39
- batch_error = "Batch must a float in range [0.01, 1) or an integer."
40
- if isinstance(batch_size, (float, int)):
41
- if (1.00 > batch_size >= 0.01):
42
- train_batch = int(len(train_dataset) * batch_size)
43
- test_batch = int(len(test_dataset) * batch_size)
44
- elif batch_size > len(train_dataset) or batch_size > len(test_dataset):
45
- raise ValueError(batch_error + " Size is greater than dataset size.")
46
- elif batch_size >= 1:
47
- train_batch = int(batch_size)
48
- test_batch = int(batch_size)
49
- else:
50
- raise ValueError(batch_error)
51
- else:
52
- raise TypeError(batch_error)
53
- # Validate device
54
- if device == "cuda":
55
- if not torch.cuda.is_available():
56
- print("CUDA not available, switching to CPU.")
57
- device = "cpu"
58
- elif device == "mps":
59
- if not torch.backends.mps.is_available():
60
- print("MPS not available, switching to CPU.")
61
- device = "cpu"
62
- # Validate criterion
63
- if criterion is None:
64
- if kind == "regression":
65
- self.criterion = nn.MSELoss()
66
- else:
67
- self.criterion = nn.NLLLoss()
68
- else:
69
- self.criterion = criterion
70
- # Validate dataloader workers
71
- if not isinstance(dataloader_workers, int):
72
- raise TypeError("Dataloader workers must be an integer value.")
73
-
74
- # Check last layer in the model, implementation pending
75
- # last_layer_name, last_layer = next(reversed(model._modules.items()))
76
- # if isinstance(last_layer, nn.Linear):
77
- # pass
78
-
79
- self.train_loader = DataLoader(dataset=train_dataset, batch_size=train_batch, shuffle=shuffle, num_workers=dataloader_workers, pin_memory=True if device=="cuda" else False)
80
- self.test_loader = DataLoader(dataset=test_dataset, batch_size=test_batch, shuffle=shuffle, num_workers=dataloader_workers, pin_memory=True if device=="cuda" else False)
81
- self.kind = kind
82
- self.device = torch.device(device)
83
- self.model = model.to(self.device)
84
- self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=learn_rate)
85
-
86
-
87
- def auto_train(self, epochs: int=200, patience: int=3, cmap: Literal["viridis", "Blues", "Greens", "Reds", "plasma", "coolwarm"]="Blues",
88
- roc: bool=False, **model_params):
89
- """
90
- Start training-validation process of the model.
91
-
92
- `patience` is the number of consecutive times the Validation Loss is allowed to increase before early-stopping the training process.
93
-
94
- `cmap` Color map to use for the confusion matrix.
95
-
96
- `model_params` Keywords parameters specific to the model, if any.
97
-
98
- `roc` Whether to display the Receiver Operating Characteristic (ROC) Curve, for binary classification only.
99
- """
100
- metric_name = "accuracy" if self.kind == "classification" else "RMSE"
101
- previous_val_loss = None
102
- epoch_tracker = 0
103
- warnings = 0
104
- feedback = None
105
- val_losses = list()
106
- train_losses = list()
107
-
108
- # Validate inputs
109
- if isinstance(epochs, int):
110
- if epochs < 1:
111
- print("Invalid number of epochs")
112
- return None
113
- else:
114
- print("Invalid number of epochs")
115
- return None
116
-
117
- if isinstance(patience, int):
118
- if patience < 0:
119
- print("Invalid value for patience")
120
- return None
121
- else:
122
- print("Invalid value for patience")
123
- return None
124
-
125
- if cmap not in ["viridis", "Blues", "Greens", "Reds", "plasma", "coolwarm"]:
126
- print("Invalid cmap code, 'coolwarm' selected by default")
127
- cmap = "coolwarm"
128
-
129
- # Time training
130
- start_time = time.time()
131
-
132
- for epoch in range(1, epochs+1):
133
- # Train model
134
- self.model.train()
135
- current_train_loss = 0
136
- # Keep track of predictions and true labels on the last epoch to use later on scikit-learn
137
- predictions_list = list()
138
- true_labels_list = list()
139
- probabilities_list = list()
140
-
141
- for features, target in self.train_loader:
142
- # features, targets to device
143
- features = features.to(self.device)
144
- target = target.to(self.device)
145
- self.optimizer.zero_grad()
146
- output = self.model(features, **model_params)
147
- # check shapes
148
- # print(features.shape, target.shape, output.shape)
149
- # For Binary Cross Entropy
150
- if isinstance(self.criterion, (nn.BCELoss, nn.BCEWithLogitsLoss)):
151
- target = target.to(torch.float32)
152
- elif isinstance(self.criterion, (nn.MSELoss)):
153
- output = output.view_as(target)
154
- train_loss = self.criterion(output, target)
155
- # Cumulative loss for current epoch on all batches
156
- current_train_loss += train_loss.item()
157
- # Backpropagation
158
- train_loss.backward()
159
- self.optimizer.step()
160
-
161
- # Average Train Loss per sample
162
- current_train_loss /= len(self.train_loader.dataset)
163
- train_losses.append(current_train_loss)
164
-
165
- # Evaluate
166
- self.model.eval()
167
- current_val_loss = 0
168
- correct = 0
169
- with torch.no_grad():
170
- for features, target in self.test_loader:
171
- # features, targets to device
172
- features = features.to(self.device)
173
- target = target.to(self.device)
174
- output = self.model(features, **model_params)
175
- # Save true labels for current batch (in case random shuffle was used)
176
- true_labels_list.append(target.view(-1,1).cpu().numpy())
177
- # For Binary Cross Entropy
178
- if isinstance(self.criterion, (nn.BCELoss, nn.BCEWithLogitsLoss)):
179
- target = target.to(torch.float32)
180
- elif isinstance(self.criterion, (nn.MSELoss)):
181
- output = output.view_as(target)
182
- current_val_loss += self.criterion(output, target).item()
183
- # Save predictions of current batch, get accuracy
184
- if self.kind == "classification":
185
- predictions_list.append(output.argmax(dim=1).view(-1,1).cpu().numpy())
186
- correct += output.argmax(dim=1).eq(target).sum().item()
187
- if roc:
188
- probabilities_local = nn.functional.softmax(output, dim=1)
189
- probabilities_list.append(probabilities_local.cpu().numpy())
190
- else: # Regression
191
- predictions_list.append(output.view(-1,1).cpu().numpy())
192
-
193
- # Average Validation Loss per sample
194
- current_val_loss /= len(self.test_loader.dataset)
195
- val_losses.append(current_val_loss)
196
-
197
- # Concatenate all predictions and true labels
198
- predictions = numpy.concatenate(predictions_list, axis=0)
199
- true_labels = numpy.concatenate(true_labels_list, axis=0)
200
- if roc:
201
- probabilities = numpy.concatenate(probabilities_list, axis=0)
202
-
203
- # Accuracy / RMSE
204
- if self.kind == "classification":
205
- accuracy = correct / len(self.test_loader.dataset)
206
- accuracy = str(round(100*accuracy, ndigits=1)) + "%"
207
- else: # Regression
208
- accuracy = numpy.sqrt(mean_squared_error(y_true=true_labels, y_pred=predictions))
209
- accuracy = str(round(accuracy, ndigits=4))
210
-
211
- # Print details
212
- details_format = f'epoch {epoch:2}: training loss: {current_train_loss:6.4f} validation loss: {current_val_loss:6.4f} {metric_name}: {accuracy}'
213
- if (epoch % max(1, int(0.05*epochs)) == 0) or epoch in [1, 3, 5]:
214
- print(details_format)
215
-
216
- # Compare validation loss per epoch
217
- # First run
218
- if previous_val_loss is None:
219
- previous_val_loss = current_val_loss
220
- # If validation loss is increasing or the same (not improving) use patience
221
- elif current_val_loss >= previous_val_loss:
222
- if epoch == epoch_tracker + 1:
223
- warnings += 1
224
- else:
225
- warnings = 1
226
- epoch_tracker = epoch
227
- # If validation loss decreased
228
- else:
229
- warnings = 0
230
-
231
- # If patience is exhausted
232
- if warnings == patience:
233
- feedback = f"👁️ Validation Loss has increased {patience} consecutive times."
234
- break
235
-
236
- # Training must continue for another epoch
237
- previous_val_loss = current_val_loss
238
-
239
- # if all epochs have been completed
240
- else:
241
- feedback = "Training has been completed without any early-stopping criteria."
242
-
243
- # Print feedback message
244
- print('\n', details_format)
245
- print(feedback, f"\n")
246
-
247
- # Show elapsed time
248
- elapsed_time = time.time() - start_time
249
- minutes, seconds = divmod(elapsed_time, 60)
250
- print(f"Elapsed time: {minutes:.0f} minutes {seconds:2.0f} seconds {epoch} epochs")
251
-
252
- # Plot losses
253
- fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10,4), dpi=150, sharey=False)
254
-
255
- ax1.plot(range(2, epoch+1), train_losses[1:])
256
- ax1.set_title("Training Loss")
257
- ax1.set_xlabel("Epochs")
258
- ax1.set_ylabel("Average loss per sample")
259
-
260
- ax2.plot(range(2, epoch+1), val_losses[1:])
261
- ax2.set_title("Validation Loss")
262
- ax2.set_xlabel("Epochs")
263
- ax2.set_ylabel("Average loss per sample")
264
-
265
- plt.tight_layout()
266
- plt.show()
267
-
268
- # Metrics
269
- # Display metrics
270
- if self.kind == "regression":
271
- rmse = numpy.sqrt(mean_squared_error(y_true=true_labels, y_pred=predictions))
272
- r2 = r2_score(y_true=true_labels, y_pred=predictions)
273
- medae = median_absolute_error(y_true=true_labels, y_pred=predictions)
274
- print(f"Root Mean Squared Error (RMSE): {rmse:6.4f} (range 0 to \u221E)")
275
- print(f"Median Absolute Error (MedAE): {medae:6.4f} (range: 0 to \u221E)")
276
- print(f"Coefficient of Determination (R2 Score): {r2:4.2f} (range: -\u221E to 1)\n")
277
-
278
- elif self.kind == "classification":
279
- print(classification_report(y_true=true_labels, y_pred=predictions))
280
- ConfusionMatrixDisplay.from_predictions(y_true=true_labels, y_pred=predictions, cmap=cmap)
281
-
282
- # ROC curve & Area under the curve
283
- if roc:
284
- false_positives, true_positives, thresholds = roc_curve(y_true=true_labels, y_score=probabilities[:,1])
285
- area_under_curve = roc_auc_score(y_true=true_labels, y_score=probabilities[:,1])
286
-
287
- plt.figure(figsize=(4,4))
288
- plt.plot(false_positives, true_positives)
289
- plt.title("Receiver Operating Characteristic (ROC) Curve")
290
- plt.xlabel("False Positive Rate")
291
- plt.ylabel("True Positive Rate")
292
- plt.show()
293
-
294
- print(f"Area under the curve score: {area_under_curve:4.2f}")
295
- else:
296
- print("Error encountered while retrieving 'model.kind' attribute.")
297
-
298
-
299
- def rnn_forecast(self, sequence: torch.Tensor, steps: int):
300
- """
301
- Runs a sequential forecast for a RNN, where each new prediction is obtained by feeding the previous prediction.
302
-
303
- The input tensor representing a sequence must be of shape `(sequence length, number of features)` with normalized values (if needed).
304
-
305
- Args:
306
- `sequence`: Last subsequence of the sequence.
307
-
308
- `steps`: Number of future time steps to predict.
309
-
310
- Returns: Numpy array of predictions.
311
- """
312
- self.model.eval()
313
- with torch.no_grad():
314
- # send sequence to device
315
- sequence = sequence.to(self.device)
316
- # Make a dummy list in memory
317
- sequences = [torch.zeros_like(sequence, device=self.device, requires_grad=False) for _ in range(steps)]
318
- sequences[0] = sequence
319
- # Store predictions
320
- predictions = list()
321
- # Get predictions
322
- for i in range(steps):
323
- in_seq = sequences[i]
324
- output = self.model(in_seq)
325
- # Last timestamp
326
- output = output[-1].view(1,-1)
327
- # Save prediction
328
- # Check if it is a single feature, get value
329
- if output.shape[1] == 1:
330
- predictions.append(output.item())
331
- # Else, return a list of lists
332
- else:
333
- predictions.append(output.squeeze().cpu().tolist())
334
- # Create next sequence
335
- if i < steps-1:
336
- current_seq = sequences[i]
337
- new_seq = torch.concatenate([current_seq[1:], output], dim=0).to(self.device)
338
- sequences[i+1] = new_seq
339
-
340
- # Cast to array and return
341
- predictions = numpy.array(predictions)
342
- return predictions
343
-
344
-
345
- def info():
346
- _script_info(__all__)