dragon-ml-toolbox 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

ml_tools/trainer.py ADDED
@@ -0,0 +1,366 @@
import time
import numpy
from typing import Literal
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error


class MyTrainer:
    def __init__(self, model, train_dataset: Dataset, test_dataset: Dataset, kind: Literal["regression", "classification"],
                 criterion=None, shuffle: bool=True, batch_size: float=3, device: Literal["cpu", "cuda", "mps"]='cpu', learn_rate: float=0.001, dataloader_workers: int=2):
        """
        Automates the training process of a PyTorch model, using Adam optimization by default (`self.optimizer`).

        `kind`: Used to compute and display metrics after training is complete.

        `shuffle`: Whether to shuffle dataset batches at every epoch. Default is True.

        `criterion`: Loss function. If `None`, defaults to `nn.NLLLoss` for classification or `nn.MSELoss` for regression.

        `batch_size`: Fraction of the original dataset size to use per batch. If an integer is passed, that many samples are used per batch instead. Default is 3 samples at a time.

        `learn_rate`: Model learning rate. Default is 0.001.

        `dataloader_workers`: Number of subprocesses to use for data loading. Default is 2.
        """
        # Validate kind
        if kind not in ["regression", "classification"]:
            raise ValueError("Kind must be 'regression' or 'classification'.")
        # Validate batch size
        batch_error = "Batch size must be a float in the range [0.01, 1) or an integer."
        if isinstance(batch_size, (float, int)):
            if 1.00 > batch_size >= 0.01:
                train_batch = int(len(train_dataset) * batch_size)
                test_batch = int(len(test_dataset) * batch_size)
            elif batch_size > len(train_dataset) or batch_size > len(test_dataset):
                raise ValueError(batch_error + " Size is greater than dataset size.")
            elif batch_size >= 1:
                train_batch = int(batch_size)
                test_batch = int(batch_size)
            else:
                raise ValueError(batch_error)
        else:
            raise TypeError(batch_error)
        # Validate device
        if device == "cuda":
            if not torch.cuda.is_available():
                print("CUDA not available, switching to CPU.")
                device = "cpu"
        elif device == "mps":
            if not torch.backends.mps.is_available():
                print("MPS not available, switching to CPU.")
                device = "cpu"
        # Validate criterion
        if criterion is None:
            if kind == "regression":
                self.criterion = nn.MSELoss()
            else:
                self.criterion = nn.NLLLoss()
        else:
            self.criterion = criterion
        # Validate dataloader workers
        if not isinstance(dataloader_workers, int):
            raise TypeError("Dataloader workers must be an integer value.")

        # Check last layer in the model, implementation pending
        # last_layer_name, last_layer = next(reversed(model._modules.items()))
        # if isinstance(last_layer, nn.Linear):
        #     pass

        self.train_loader = DataLoader(dataset=train_dataset, batch_size=train_batch, shuffle=shuffle, num_workers=dataloader_workers, pin_memory=(device == "cuda"))
        self.test_loader = DataLoader(dataset=test_dataset, batch_size=test_batch, shuffle=shuffle, num_workers=dataloader_workers, pin_memory=(device == "cuda"))
        self.kind = kind
        self.device = torch.device(device)
        self.model = model.to(self.device)
        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=learn_rate)

    def auto_train(self, epochs: int=200, patience: int=3, cmap: Literal["viridis", "Blues", "Greens", "Reds", "plasma", "coolwarm"]="Blues",
                   roc: bool=False, **model_params):
        """
        Starts the training-validation process of the model.

        `patience`: Number of consecutive times the validation loss is allowed to increase before early-stopping the training process.

        `cmap`: Color map to use for the confusion matrix.

        `model_params`: Keyword parameters specific to the model, if any.

        `roc`: Whether to display the Receiver Operating Characteristic (ROC) curve, for binary classification only.
        """
        metric_name = "accuracy" if self.kind == "classification" else "RMSE"
        previous_val_loss = None
        epoch_tracker = 0
        warnings = 0
        feedback = None
        val_losses = list()
        train_losses = list()

        # Validate inputs
        if not isinstance(epochs, int) or epochs < 1:
            print("Invalid number of epochs")
            return None

        if not isinstance(patience, int) or patience < 0:
            print("Invalid value for patience")
            return None

        if cmap not in ["viridis", "Blues", "Greens", "Reds", "plasma", "coolwarm"]:
            print("Invalid cmap code, 'coolwarm' selected by default")
            cmap = "coolwarm"

        # Time training
        start_time = time.time()

        for epoch in range(1, epochs+1):
            # Train model
            self.model.train()
            current_train_loss = 0
            # Keep track of predictions and true labels on the last epoch to use later with scikit-learn
            predictions_list = list()
            true_labels_list = list()
            probabilities_list = list()

            for features, target in self.train_loader:
                # Move features and targets to the device
                features = features.to(self.device)
                target = target.to(self.device)
                self.optimizer.zero_grad()
                output = self.model(features, **model_params)
                # Check shapes
                # print(features.shape, target.shape, output.shape)
                # For binary cross-entropy
                if isinstance(self.criterion, (nn.BCELoss, nn.BCEWithLogitsLoss)):
                    target = target.to(torch.float32)
                elif isinstance(self.criterion, nn.MSELoss):
                    output = output.view_as(target)
                train_loss = self.criterion(output, target)
                # Cumulative loss for the current epoch over all batches
                current_train_loss += train_loss.item()
                # Backpropagation
                train_loss.backward()
                self.optimizer.step()

            # Average train loss per sample
            current_train_loss /= len(self.train_loader.dataset)
            train_losses.append(current_train_loss)

            # Evaluate
            self.model.eval()
            current_val_loss = 0
            correct = 0
            with torch.no_grad():
                for features, target in self.test_loader:
                    # Move features and targets to the device
                    features = features.to(self.device)
                    target = target.to(self.device)
                    output = self.model(features, **model_params)
                    # Save true labels of the current batch (in case random shuffle was used)
                    true_labels_list.append(target.view(-1,1).cpu().numpy())
                    # For binary cross-entropy
                    if isinstance(self.criterion, (nn.BCELoss, nn.BCEWithLogitsLoss)):
                        target = target.to(torch.float32)
                    elif isinstance(self.criterion, nn.MSELoss):
                        output = output.view_as(target)
                    current_val_loss += self.criterion(output, target).item()
                    # Save predictions of the current batch, get accuracy
                    if self.kind == "classification":
                        predictions_list.append(output.argmax(dim=1).view(-1,1).cpu().numpy())
                        correct += output.argmax(dim=1).eq(target).sum().item()
                        if roc:
                            probabilities_local = nn.functional.softmax(output, dim=1)
                            probabilities_list.append(probabilities_local.cpu().numpy())
                    else:  # Regression
                        predictions_list.append(output.view(-1,1).cpu().numpy())

            # Average validation loss per sample
            current_val_loss /= len(self.test_loader.dataset)
            val_losses.append(current_val_loss)

            # Concatenate all predictions and true labels
            predictions = numpy.concatenate(predictions_list, axis=0)
            true_labels = numpy.concatenate(true_labels_list, axis=0)
            if roc:
                probabilities = numpy.concatenate(probabilities_list, axis=0)

            # Accuracy / RMSE
            if self.kind == "classification":
                accuracy = correct / len(self.test_loader.dataset)
                accuracy = str(round(100*accuracy, ndigits=1)) + "%"
            else:  # Regression
                accuracy = numpy.sqrt(mean_squared_error(y_true=true_labels, y_pred=predictions))
                accuracy = str(round(accuracy, ndigits=4))

            # Print details
            details_format = f'epoch {epoch:2}: training loss: {current_train_loss:6.4f} validation loss: {current_val_loss:6.4f} {metric_name}: {accuracy}'
            if (epoch % max(1, int(0.05*epochs)) == 0) or epoch in [1, 3, 5]:
                print(details_format)

            # Compare validation loss per epoch
            # First run
            if previous_val_loss is None:
                previous_val_loss = current_val_loss
            # If validation loss is increasing or unchanged (not improving), use patience
            elif current_val_loss >= previous_val_loss:
                if epoch == epoch_tracker + 1:
                    warnings += 1
                else:
                    warnings = 1
                epoch_tracker = epoch
            # If validation loss decreased
            else:
                warnings = 0

            # If patience is exhausted
            if warnings == patience:
                feedback = f"👁️ Validation Loss has increased {patience} consecutive times."
                break

            # Training must continue for another epoch
            previous_val_loss = current_val_loss

        # If all epochs have been completed
        else:
            feedback = "Training completed without triggering any early-stopping criteria."

        # Print feedback message
        print('\n', details_format)
        print(feedback, "\n")

        # Show elapsed time
        elapsed_time = time.time() - start_time
        minutes, seconds = divmod(elapsed_time, 60)
        print(f"Elapsed time: {minutes:.0f} minutes {seconds:2.0f} seconds over {epoch} epochs")

        # Plot losses
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10,4), dpi=150, sharey=False)

        ax1.plot(range(2, epoch+1), train_losses[1:])
        ax1.set_title("Training Loss")
        ax1.set_xlabel("Epochs")
        ax1.set_ylabel("Average loss per sample")

        ax2.plot(range(2, epoch+1), val_losses[1:])
        ax2.set_title("Validation Loss")
        ax2.set_xlabel("Epochs")
        ax2.set_ylabel("Average loss per sample")

        plt.tight_layout()
        plt.show()

        # Display metrics
        if self.kind == "regression":
            rmse = numpy.sqrt(mean_squared_error(y_true=true_labels, y_pred=predictions))
            r2 = r2_score(y_true=true_labels, y_pred=predictions)
            medae = median_absolute_error(y_true=true_labels, y_pred=predictions)
            print(f"Root Mean Squared Error (RMSE): {rmse:6.4f} (range: 0 to \u221E)")
            print(f"Median Absolute Error (MedAE): {medae:6.4f} (range: 0 to \u221E)")
            print(f"Coefficient of Determination (R2 Score): {r2:4.2f} (range: -\u221E to 1)\n")

        elif self.kind == "classification":
            print(classification_report(y_true=true_labels, y_pred=predictions))
            ConfusionMatrixDisplay.from_predictions(y_true=true_labels, y_pred=predictions, cmap=cmap)

            # ROC curve & area under the curve (binary classification only)
            if roc:
                false_positives, true_positives, thresholds = roc_curve(y_true=true_labels, y_score=probabilities[:,1])
                area_under_curve = roc_auc_score(y_true=true_labels, y_score=probabilities[:,1])

                plt.figure(figsize=(4,4))
                plt.plot(false_positives, true_positives)
                plt.title("Receiver Operating Characteristic (ROC) Curve")
                plt.xlabel("False Positive Rate")
                plt.ylabel("True Positive Rate")
                plt.show()

                print(f"Area under the curve score: {area_under_curve:4.2f}")
        else:
            print("Error encountered while retrieving 'self.kind' attribute.")

    def forecast(self, samples_list: list[torch.Tensor], view_as: tuple[int,int]=(1,-1)):
        """
        DEPRECATED - Use `helpers.model_predict()` instead.

        Returns a list containing lists of predicted values, one for each sample.

        Each sample must be a tensor with the same shape and normalization expected by the model
        (this method adds the batch dimension automatically).

        Args:
            `samples_list`: List of tensors.

            `view_as`: Shape to reshape each output to; default is (1,-1).

        Returns: List of lists.
        """
        self.model.eval()
        results = list()
        with torch.no_grad():
            for data_point in samples_list:
                data_point = data_point.unsqueeze(0).to(self.device)
                output = self.model(data_point)
                if self.kind == "classification":
                    results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
                else:  # Regression
                    results.append(output.view(view_as).cpu().tolist())

        return results

    def rnn_forecast(self, sequence: torch.Tensor, steps: int):
        """
        Runs a sequential forecast for an RNN, where each new prediction is obtained by feeding back the previous prediction.

        The input tensor representing a sequence must be of shape `(sequence length, number of features)` with normalized values (if needed).

        Args:
            `sequence`: The last known subsequence of the series, used as the initial input.

            `steps`: Number of future time steps to predict.

        Returns: NumPy array of predictions.
        """
        self.model.eval()
        with torch.no_grad():
            # Send sequence to the device
            sequence = sequence.to(self.device)
            # Preallocate a list of input sequences
            sequences = [torch.zeros_like(sequence, device=self.device, requires_grad=False) for _ in range(steps)]
            sequences[0] = sequence
            # Store predictions
            predictions = list()
            # Get predictions
            for i in range(steps):
                in_seq = sequences[i]
                output = self.model(in_seq)
                # Keep only the last time step
                output = output[-1].view(1,-1)
                # Save the prediction; if it is a single feature, store the scalar value
                if output.shape[1] == 1:
                    predictions.append(output.item())
                # Otherwise, store a list of feature values
                else:
                    predictions.append(output.squeeze().cpu().tolist())
                # Create the next input sequence by sliding the window one step forward
                if i < steps-1:
                    current_seq = sequences[i]
                    new_seq = torch.cat([current_seq[1:], output], dim=0).to(self.device)
                    sequences[i+1] = new_seq

        # Cast to array and return
        predictions = numpy.array(predictions)
        return predictions
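
For context, a minimal usage sketch of `MyTrainer` on a toy regression problem. Everything below is illustrative and not part of the package: the random `TensorDataset`, the stand-in `nn.Sequential` model, and the hyperparameter values are assumptions.

import torch
from torch import nn
from torch.utils.data import TensorDataset
from ml_tools.trainer import MyTrainer

# Toy regression data: 100 samples, 5 features (illustrative only)
X = torch.randn(100, 5)
y = X.sum(dim=1, keepdim=True) + 0.1 * torch.randn(100, 1)
train_dataset = TensorDataset(X[:80], y[:80])
test_dataset = TensorDataset(X[80:], y[80:])

# Stand-in model; any nn.Module with a compatible forward() works
model = nn.Sequential(nn.Linear(5, 16), nn.ReLU(), nn.Linear(16, 1))

trainer = MyTrainer(model, train_dataset, test_dataset, kind="regression",
                    batch_size=0.25, device="cpu", learn_rate=0.001,
                    dataloader_workers=0)  # 0 sidesteps multiprocessing issues in notebooks
trainer.auto_train(epochs=50, patience=3)

For classification, the default `nn.NLLLoss` expects the model to output log-probabilities (e.g., end the network with `nn.LogSoftmax(dim=1)`). A trained sequence model can then be extended autoregressively with `trainer.rnn_forecast(last_window, steps=10)`, where `last_window` has shape `(sequence length, number of features)`.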
ml_tools/utilities.py ADDED
@@ -0,0 +1,168 @@
import math
import numpy as np
import pandas as pd
from pathlib import Path
import re

def list_csv_paths(directory: str) -> tuple[list[str], list[str]]:
    """
    Lists all CSV files in a given directory and returns their paths with corresponding base names.

    Parameters:
        directory (str): Path to the directory containing `.csv` files.

    Returns:
        Tuple (list[str], list[str]):
            - List of absolute paths to `.csv` files.
            - List of corresponding base names (without extensions).
    """
    dir_path = Path(directory).expanduser().resolve()

    if not dir_path.is_dir():
        raise FileNotFoundError(f"Directory not found: {dir_path}")

    csv_paths = list(dir_path.glob("*.csv"))
    if not csv_paths:
        raise IOError(f"No CSV files found in directory: {dir_path}")

    paths = [str(p) for p in csv_paths]
    names = [p.stem for p in csv_paths]

    return paths, names

def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
    """
    Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.

    Args:
        df_path (str): The path to the CSV file.

    Returns:
        Tuple (pd.DataFrame, str):
            The loaded pandas DataFrame and the base name of the file.
    """
    path = Path(df_path).expanduser().resolve()
    df = pd.read_csv(path, encoding='utf-8')
    df_name = path.stem
    if df.empty:
        raise ValueError(f"DataFrame '{df_name}' is empty.")
    print(f"Loaded dataset: '{df_name}' with shape: {df.shape}")
    return df, df_name

def yield_dataframes_from_dir(datasets_dir: str):
    """
    Iterates over all CSV files in a given directory, loading each into a pandas DataFrame.

    Parameters:
        datasets_dir (str):
            The path to the directory containing `.csv` dataset files.

    Yields:
        Tuple (pd.DataFrame, str):
            - The loaded pandas DataFrame.
            - The base name of the file (without extension).

    Notes:
        - Files are expected to have a `.csv` extension.
        - CSV files are read using UTF-8 encoding.
        - Output is streamed via a generator to support lazy loading of multiple datasets.
    """
    for df_path, df_name in list_csv_paths(datasets_dir):
        df = pd.read_csv(df_path, encoding='utf-8')
        print(f"Loaded dataset: '{df_name}' with shape: {df.shape}")
        yield df, df_name

def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
    """
    Normalize a mixed list of numeric values and strings so that the sum of the values equals 1.0,
    applying heuristic adjustments to correct for potential data-entry scale mismatches.

    Parameters:
        data (list):
            A list of values that may include strings, floats, integers, or None.
            None values are treated as 0.0.

        threshold (int, optional):
            The number of log10 orders of magnitude below the median scale
            beyond which a value is considered suspect and is scaled upward accordingly.
            Default is 2.

    Returns:
        List[float]: A list of normalized float values summing to 1.0.
            Values significantly smaller than the median scale are scaled up
            before normalization to correct likely input errors.

    Notes:
        - Zeros and None values remain zero.
        - If all input values are zero or None, the function returns a list of zeros.
        - Input strings are automatically cast to floats if possible.

    Example:
        >>> normalize_mixed_list([3, "0.02", 5, None])
        [0.3, 0.2, 0.5, 0.0]
    """
    # Step 1: Convert all values to float, treat None as 0.0
    float_list = [float(x) if x is not None else 0.0 for x in data]

    # Raise for negative values
    if any(x < 0 for x in float_list):
        raise ValueError("Negative values are not allowed in the input list.")

    # Step 2: Compute log10 of non-zero values
    nonzero = [x for x in float_list if x > 0]
    if not nonzero:
        return [0.0 for _ in float_list]

    log_scales = [math.log10(x) for x in nonzero]
    log_median = np.median(log_scales)

    # Step 3: Adjust values that are much smaller than the median
    adjusted = []
    for x in float_list:
        if x == 0.0:
            adjusted.append(0.0)
        else:
            log_x = math.log10(x)
            if log_median - log_x > threshold:
                scale_diff = round(log_median - log_x)
                adjusted.append(x * (10 ** scale_diff))
            else:
                adjusted.append(x)

    # Step 4: Normalize to sum to 1.0
    total = sum(adjusted)
    if total == 0:
        return [0.0 for _ in adjusted]

    return [x / total for x in adjusted]

def sanitize_filename(filename: str) -> str:
    """
    Sanitizes the name by:
    - Stripping leading/trailing whitespace.
    - Replacing all internal whitespace characters with underscores.
    - Removing or replacing characters invalid in filenames.

    Args:
        filename (str): Base filename.

    Returns:
        str: A sanitized string suitable to use as a filename.
    """
    # Strip leading/trailing whitespace
    sanitized = filename.strip()

    # Replace all whitespace sequences (space, tab, etc.) with underscores
    sanitized = re.sub(r'\s+', '_', sanitized)

    # Conservative filter to keep filenames safe across platforms
    sanitized = re.sub(r'[^\w\-.]', '', sanitized)

    return sanitized
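
A short sketch of the utilities in use; the `~/datasets` directory is hypothetical, and the expected outputs in the comments follow from the function definitions above.

from ml_tools.utilities import list_csv_paths, yield_dataframes_from_dir, normalize_mixed_list, sanitize_filename

# List CSV files in a (hypothetical) directory; raises if none are found
paths, names = list_csv_paths("~/datasets")

# Stream each CSV as a DataFrame instead of loading all of them at once
for df, df_name in yield_dataframes_from_dir("~/datasets"):
    print(df_name, df.shape)

# Normalize mixed weights; "0.02" sits more than 2 orders of magnitude below
# the median scale, so it is treated as a likely entry error and scaled up
weights = normalize_mixed_list([3, "0.02", 5, None])  # -> [0.3, 0.2, 0.5, 0.0]

# Make an arbitrary string safe to use as a filename
print(sanitize_filename("my results: v2"))  # -> my_results_v2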