dragon_ml_toolbox-1.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic.
- dragon_ml_toolbox-1.1.2.dist-info/METADATA +114 -0
- dragon_ml_toolbox-1.1.2.dist-info/RECORD +16 -0
- dragon_ml_toolbox-1.1.2.dist-info/WHEEL +5 -0
- dragon_ml_toolbox-1.1.2.dist-info/top_level.txt +1 -0
- ml_tools/MICE_imputation.py +178 -0
- ml_tools/__init__.py +0 -0
- ml_tools/data_exploration.py +751 -0
- ml_tools/datasetmaster.py +595 -0
- ml_tools/ensemble_learning.py +701 -0
- ml_tools/handle_excel.py +310 -0
- ml_tools/logger.py +145 -0
- ml_tools/particle_swarm_optimization.py +467 -0
- ml_tools/pytorch_models.py +227 -0
- ml_tools/trainer.py +366 -0
- ml_tools/utilities.py +168 -0
- ml_tools/vision_helpers.py +218 -0
ml_tools/trainer.py
ADDED
@@ -0,0 +1,366 @@

import time
import numpy
from typing import Literal
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error


class MyTrainer():
    def __init__(self, model, train_dataset: Dataset, test_dataset: Dataset, kind: Literal["regression", "classification"],
                 criterion=None, shuffle: bool=True, batch_size: float=3, device: Literal["cpu", "cuda", "mps"]='cpu', learn_rate: float=0.001, dataloader_workers: int=2):
        """
        Automates the training process of a PyTorch model using Adam optimization by default (`self.optimizer`).

        `kind`: Used to compute and display metrics after training is complete.

        `shuffle`: Whether to shuffle dataset batches at every epoch. Default is True.

        `criterion`: Loss function. If `None`, defaults to `nn.NLLLoss` for classification or `nn.MSELoss` for regression.

        `batch_size`: Fraction of the original dataset size to use per batch. If an integer is passed, that many samples are used per batch instead. Default is 3 samples at a time.

        `learn_rate`: Model learning rate. Default is 0.001.

        `dataloader_workers`: Subprocesses to use for data loading. Default is 2.
        """
        # Validate kind
        if kind not in ["regression", "classification"]:
            raise TypeError("Kind must be 'regression' or 'classification'.")
        # Validate batch size
        batch_error = "Batch must be a float in range [0.01, 1) or an integer."
        if isinstance(batch_size, (float, int)):
            if (1.00 > batch_size >= 0.01):
                train_batch = int(len(train_dataset) * batch_size)
                test_batch = int(len(test_dataset) * batch_size)
            elif batch_size > len(train_dataset) or batch_size > len(test_dataset):
                raise ValueError(batch_error + " Size is greater than dataset size.")
            elif batch_size >= 1:
                train_batch = int(batch_size)
                test_batch = int(batch_size)
            else:
                raise ValueError(batch_error)
        else:
            raise TypeError(batch_error)
        # Validate device
        if device == "cuda":
            if not torch.cuda.is_available():
                print("CUDA not available, switching to CPU.")
                device = "cpu"
        elif device == "mps":
            if not torch.backends.mps.is_available():
                print("MPS not available, switching to CPU.")
                device = "cpu"
        # Validate criterion
        if criterion is None:
            if kind == "regression":
                self.criterion = nn.MSELoss()
            else:
                self.criterion = nn.NLLLoss()
        else:
            self.criterion = criterion
        # Validate dataloader workers
        if not isinstance(dataloader_workers, int):
            raise TypeError("Dataloader workers must be an integer value.")

        # Check last layer in the model, implementation pending
        # last_layer_name, last_layer = next(reversed(model._modules.items()))
        # if isinstance(last_layer, nn.Linear):
        #     pass

        self.train_loader = DataLoader(dataset=train_dataset, batch_size=train_batch, shuffle=shuffle, num_workers=dataloader_workers, pin_memory=True if device=="cuda" else False)
        self.test_loader = DataLoader(dataset=test_dataset, batch_size=test_batch, shuffle=shuffle, num_workers=dataloader_workers, pin_memory=True if device=="cuda" else False)
        self.kind = kind
        self.device = torch.device(device)
        self.model = model.to(self.device)
        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=learn_rate)


    def auto_train(self, epochs: int=200, patience: int=3, cmap: Literal["viridis", "Blues", "Greens", "Reds", "plasma", "coolwarm"]="Blues",
                   roc: bool=False, **model_params):
        """
        Starts the training-validation process of the model.

        `patience`: Number of consecutive times the Validation Loss is allowed to increase before early-stopping the training process.

        `cmap`: Color map to use for the confusion matrix.

        `model_params`: Keyword parameters specific to the model, if any.

        `roc`: Whether to display the Receiver Operating Characteristic (ROC) Curve, for binary classification only.
        """
        metric_name = "accuracy" if self.kind == "classification" else "RMSE"
        previous_val_loss = None
        epoch_tracker = 0
        warnings = 0
        feedback = None
        val_losses = list()
        train_losses = list()

        # Validate inputs
        if isinstance(epochs, int):
            if epochs < 1:
                print("Invalid number of epochs")
                return None
        else:
            print("Invalid number of epochs")
            return None

        if isinstance(patience, int):
            if patience < 0:
                print("Invalid value for patience")
                return None
        else:
            print("Invalid value for patience")
            return None

        if cmap not in ["viridis", "Blues", "Greens", "Reds", "plasma", "coolwarm"]:
            print("Invalid cmap code, 'coolwarm' selected by default")
            cmap = "coolwarm"

        # Time training
        start_time = time.time()

        for epoch in range(1, epochs+1):
            # Train model
            self.model.train()
            current_train_loss = 0
            # Keep track of predictions and true labels on the last epoch to use later with scikit-learn
            predictions_list = list()
            true_labels_list = list()
            probabilities_list = list()

            for features, target in self.train_loader:
                # features, targets to device
                features = features.to(self.device)
                target = target.to(self.device)
                self.optimizer.zero_grad()
                output = self.model(features, **model_params)
                # check shapes
                # print(features.shape, target.shape, output.shape)
                # For Binary Cross Entropy
                if isinstance(self.criterion, (nn.BCELoss, nn.BCEWithLogitsLoss)):
                    target = target.to(torch.float32)
                elif isinstance(self.criterion, (nn.MSELoss)):
                    output = output.view_as(target)
                train_loss = self.criterion(output, target)
                # Cumulative loss for current epoch on all batches
                current_train_loss += train_loss.item()
                # Backpropagation
                train_loss.backward()
                self.optimizer.step()

            # Average Train Loss per sample
            current_train_loss /= len(self.train_loader.dataset)
            train_losses.append(current_train_loss)

            # Evaluate
            self.model.eval()
            current_val_loss = 0
            correct = 0
            with torch.no_grad():
                for features, target in self.test_loader:
                    # features, targets to device
                    features = features.to(self.device)
                    target = target.to(self.device)
                    output = self.model(features, **model_params)
                    # Save true labels for current batch (in case random shuffle was used)
                    true_labels_list.append(target.view(-1,1).cpu().numpy())
                    # For Binary Cross Entropy
                    if isinstance(self.criterion, (nn.BCELoss, nn.BCEWithLogitsLoss)):
                        target = target.to(torch.float32)
                    elif isinstance(self.criterion, (nn.MSELoss)):
                        output = output.view_as(target)
                    current_val_loss += self.criterion(output, target).item()
                    # Save predictions of current batch, get accuracy
                    if self.kind == "classification":
                        predictions_list.append(output.argmax(dim=1).view(-1,1).cpu().numpy())
                        correct += output.argmax(dim=1).eq(target).sum().item()
                        if roc:
                            probabilities_local = nn.functional.softmax(output, dim=1)
                            probabilities_list.append(probabilities_local.cpu().numpy())
                    else:  # Regression
                        predictions_list.append(output.view(-1,1).cpu().numpy())

            # Average Validation Loss per sample
            current_val_loss /= len(self.test_loader.dataset)
            val_losses.append(current_val_loss)

            # Concatenate all predictions and true labels
            predictions = numpy.concatenate(predictions_list, axis=0)
            true_labels = numpy.concatenate(true_labels_list, axis=0)
            if roc:
                probabilities = numpy.concatenate(probabilities_list, axis=0)

            # Accuracy / RMSE
            if self.kind == "classification":
                accuracy = correct / len(self.test_loader.dataset)
                accuracy = str(round(100*accuracy, ndigits=1)) + "%"
            else:  # Regression
                accuracy = numpy.sqrt(mean_squared_error(y_true=true_labels, y_pred=predictions))
                accuracy = str(round(accuracy, ndigits=4))

            # Print details
            details_format = f'epoch {epoch:2}: training loss: {current_train_loss:6.4f} validation loss: {current_val_loss:6.4f} {metric_name}: {accuracy}'
            if (epoch % max(1, int(0.05*epochs)) == 0) or epoch in [1, 3, 5]:
                print(details_format)

            # Compare validation loss per epoch
            # First run
            if previous_val_loss is None:
                previous_val_loss = current_val_loss
            # If validation loss is increasing or the same (not improving), use patience
            elif current_val_loss >= previous_val_loss:
                if epoch == epoch_tracker + 1:
                    warnings += 1
                else:
                    warnings = 1
                # Record the epoch of this non-improvement so only consecutive increases accumulate
                epoch_tracker = epoch
            # If validation loss decreased
            else:
                warnings = 0

            # If patience is exhausted
            if warnings == patience:
                feedback = f"👁️ Validation Loss has increased {patience} consecutive times."
                break

            # Training must continue for another epoch
            previous_val_loss = current_val_loss

        # if all epochs have been completed
        else:
            feedback = "Training has been completed without any early-stopping criteria."

        # Print feedback message
        print('\n', details_format)
        print(feedback, f"\n")

        # Show elapsed time
        elapsed_time = time.time() - start_time
        minutes, seconds = divmod(elapsed_time, 60)
        print(f"Elapsed time: {minutes:.0f} minutes {seconds:2.0f} seconds {epoch} epochs")

        # Plot losses (the first epoch is skipped, as it is usually an outlier)
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10,4), dpi=150, sharey=False)

        ax1.plot(range(2, epoch+1), train_losses[1:])
        ax1.set_title("Training Loss")
        ax1.set_xlabel("Epochs")
        ax1.set_ylabel("Average loss per sample")

        ax2.plot(range(2, epoch+1), val_losses[1:])
        ax2.set_title("Validation Loss")
        ax2.set_xlabel("Epochs")
        ax2.set_ylabel("Average loss per sample")

        plt.tight_layout()
        plt.show()

        # Display metrics
        if self.kind == "regression":
            rmse = numpy.sqrt(mean_squared_error(y_true=true_labels, y_pred=predictions))
            r2 = r2_score(y_true=true_labels, y_pred=predictions)
            medae = median_absolute_error(y_true=true_labels, y_pred=predictions)
            print(f"Root Mean Squared Error (RMSE): {rmse:6.4f} (range: 0 to \u221E)")
            print(f"Median Absolute Error (MedAE): {medae:6.4f} (range: 0 to \u221E)")
            print(f"Coefficient of Determination (R2 Score): {r2:4.2f} (range: -\u221E to 1)\n")

        elif self.kind == "classification":
            print(classification_report(y_true=true_labels, y_pred=predictions))
            ConfusionMatrixDisplay.from_predictions(y_true=true_labels, y_pred=predictions, cmap=cmap)

            # ROC curve & Area under the curve
            if roc:
                false_positives, true_positives, thresholds = roc_curve(y_true=true_labels, y_score=probabilities[:,1])
                area_under_curve = roc_auc_score(y_true=true_labels, y_score=probabilities[:,1])

                plt.figure(figsize=(4,4))
                plt.plot(false_positives, true_positives)
                plt.title("Receiver Operating Characteristic (ROC) Curve")
                plt.xlabel("False Positive Rate")
                plt.ylabel("True Positive Rate")
                plt.show()

                print(f"Area under the curve score: {area_under_curve:4.2f}")
        else:
            print("Error encountered while retrieving 'self.kind' attribute.")


    def forecast(self, samples_list: list[torch.Tensor], view_as: tuple[int,int]=(1,-1)):
        """
        DEPRECATED - Use `helpers.model_predict()` instead.

        Returns a list containing lists of predicted values, one for each sample.

        Each sample must be a tensor with the same shape and normalization expected by the model
        (this method adds the batch dimension automatically).

        Args:
            `samples_list`: List of tensors.

            `view_as`: Reshape each output, default is (1,-1).

        Returns: List of lists.
        """
        self.model.eval()
        results = list()
        with torch.no_grad():
            for data_point in samples_list:
                data_point = data_point.unsqueeze(0).to(self.device)
                output = self.model(data_point)
                if self.kind == "classification":
                    results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
                else:  # regression
                    results.append(output.view(view_as).cpu().tolist())

        return results


    def rnn_forecast(self, sequence: torch.Tensor, steps: int):
        """
        Runs a sequential forecast for an RNN, where each new prediction is obtained by feeding back the previous prediction.

        The input tensor representing a sequence must be of shape `(sequence length, number of features)` with normalized values (if needed).

        Args:
            `sequence`: Last subsequence of the sequence.

            `steps`: Number of future time steps to predict.

        Returns: Numpy array of predictions.
        """
        self.model.eval()
        with torch.no_grad():
            # Send sequence to device
            sequence = sequence.to(self.device)
            # Pre-allocate a list holding one rolling window per forecast step
            sequences = [torch.zeros_like(sequence, device=self.device, requires_grad=False) for _ in range(steps)]
            sequences[0] = sequence
            # Store predictions
            predictions = list()
            # Get predictions
            for i in range(steps):
                in_seq = sequences[i]
                output = self.model(in_seq)
                # Last timestamp
                output = output[-1].view(1,-1)
                # Save prediction
                # Check if it is a single feature, get value
                if output.shape[1] == 1:
                    predictions.append(output.item())
                # Else, return a list of lists
                else:
                    predictions.append(output.squeeze().cpu().tolist())
                # Create next sequence: drop the oldest timestep and append the new prediction
                if i < steps-1:
                    current_seq = sequences[i]
                    new_seq = torch.concatenate([current_seq[1:], output], dim=0).to(self.device)
                    sequences[i+1] = new_seq

        # Cast to array and return
        predictions = numpy.array(predictions)
        return predictions
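
For orientation, here is a minimal usage sketch (not shipped with the package) showing how `MyTrainer` might be driven. The toy network, synthetic tensors, and hyperparameters below are assumptions chosen purely to illustrate the API: the model ends in `nn.LogSoftmax` so its outputs match the default `nn.NLLLoss` criterion, `batch_size=0.1` exercises the fraction-of-dataset semantics, and `dataloader_workers=0` keeps data loading in the main process.

# Hypothetical usage sketch (not part of the package); ToyNet and the random tensors
# are invented here purely to demonstrate the MyTrainer API.
import torch
from torch import nn
from torch.utils.data import TensorDataset
from ml_tools.trainer import MyTrainer

class ToyNet(nn.Module):
    """Small classifier whose last layer emits log-probabilities for nn.NLLLoss."""
    def __init__(self, in_features: int = 4, classes: int = 3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 16),
            nn.ReLU(),
            nn.Linear(16, classes),
            nn.LogSoftmax(dim=1),
        )

    def forward(self, x):
        return self.net(x)

# Synthetic data: 120 training and 30 test samples, 4 features, 3 classes
X_train, y_train = torch.randn(120, 4), torch.randint(0, 3, (120,))
X_test, y_test = torch.randn(30, 4), torch.randint(0, 3, (30,))

trainer = MyTrainer(
    model=ToyNet(),
    train_dataset=TensorDataset(X_train, y_train),
    test_dataset=TensorDataset(X_test, y_test),
    kind="classification",
    batch_size=0.1,        # a float in [0.01, 1) means "fraction of the dataset per batch"
    device="cpu",
    dataloader_workers=0,  # keep data loading in the main process for this small example
)

# Early-stops once the validation loss fails to improve 3 epochs in a row,
# then prints a classification report and shows the confusion matrix.
trainer.auto_train(epochs=50, patience=3, cmap="Blues")

For regression the same pattern applies with `kind="regression"` and a single-output model, in which case the default criterion falls back to `nn.MSELoss`.
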
ml_tools/utilities.py
ADDED
@@ -0,0 +1,168 @@

import math
import numpy as np
import pandas as pd
import os
from pathlib import Path
import re


def list_csv_paths(directory: str) -> tuple[list[str], list[str]]:
    """
    Lists all CSV files in a given directory and returns their paths with corresponding base names.

    Parameters:
        directory (str): Path to the directory containing `.csv` files.

    Returns:
        Tuple (List[str], List[str]):
            - List of absolute paths to `.csv` files.
            - List of corresponding base names (without extensions).
    """
    dir_path = Path(directory).expanduser().resolve()

    if not dir_path.is_dir():
        raise FileNotFoundError(f"Directory not found: {dir_path}")

    csv_paths = list(dir_path.glob("*.csv"))
    if not csv_paths:
        raise IOError(f"No CSV files found in directory: {dir_path}")

    paths = [str(p) for p in csv_paths]
    names = [p.stem for p in csv_paths]

    return paths, names


def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
    """
    Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.

    Args:
        df_path (str): The path to the CSV file.

    Returns:
        Tuple (pd.DataFrame, str):
            The loaded pandas DataFrame and the base name of the file.
    """
    path = Path(df_path).expanduser().resolve()
    df = pd.read_csv(path, encoding='utf-8')
    df_name = path.stem
    if df.empty:
        raise ValueError(f"DataFrame '{df_name}' is empty.")
    print(f"Loaded dataset: '{df_name}' with shape: {df.shape}")
    return df, df_name


def yield_dataframes_from_dir(datasets_dir: str):
    """
    Iterates over all CSV files in a given directory, loading each into a pandas DataFrame.

    Parameters:
        datasets_dir (str): The path to the directory containing `.csv` dataset files.

    Yields:
        Tuple (pd.DataFrame, str):
            - The loaded pandas DataFrame.
            - The base name of the file (without extension).

    Notes:
        - Files are expected to have a `.csv` extension.
        - CSV files are read using UTF-8 encoding.
        - Output is streamed via a generator to support lazy loading of multiple datasets.
    """
    # Pair each path with its base name (list_csv_paths returns two parallel lists)
    for df_path, df_name in zip(*list_csv_paths(datasets_dir)):
        df = pd.read_csv(df_path)
        print(f"Loaded dataset: '{df_name}' with shape: {df.shape}")
        yield df, df_name


def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
    """
    Normalize a mixed list of numeric values and strings so that the sum of the values equals 1.0,
    applying heuristic adjustments to correct for potential data entry scale mismatches.

    Parameters:
        data (list):
            A list of values that may include strings, floats, integers, or None.
            None values are treated as 0.0.

        threshold (int, optional):
            The number of log10 orders of magnitude below the median scale
            at which a value is considered suspect and is scaled upward accordingly.
            Default is 2.

    Returns:
        List[float]: A list of normalized float values summing to 1.0.
            Values significantly smaller than the median scale are scaled up
            before normalization to correct likely input errors.

    Notes:
        - Zeros and None values remain zero.
        - If all input values are zero or None, the function returns a list of zeros.
        - Input strings are automatically cast to floats if possible.

    Example:
        >>> normalize_mixed_list([1, "0.001", 3, None])
        [0.2, 0.2, 0.6, 0.0]
    """
    # Step 1: Convert all values to float, treat None as 0.0
    float_list = [float(x) if x is not None else 0.0 for x in data]

    # Raise for negative values
    if any(x < 0 for x in float_list):
        raise ValueError("Negative values are not allowed in the input list.")

    # Step 2: Compute log10 of non-zero values
    nonzero = [x for x in float_list if x > 0]
    if not nonzero:
        return [0.0 for _ in float_list]

    log_scales = [math.log10(x) for x in nonzero]
    log_median = np.median(log_scales)

    # Step 3: Adjust values that are much smaller than the median
    adjusted = []
    for x in float_list:
        if x == 0.0:
            adjusted.append(0.0)
        else:
            log_x = math.log10(x)
            if log_median - log_x > threshold:
                scale_diff = round(log_median - log_x)
                adjusted.append(x * (10 ** scale_diff))
            else:
                adjusted.append(x)

    # Step 4: Normalize to sum to 1.0
    total = sum(adjusted)
    if total == 0:
        return [0.0 for _ in adjusted]

    return [x / total for x in adjusted]


def sanitize_filename(filename: str) -> str:
    """
    Sanitizes a filename by:
    - Stripping leading/trailing whitespace.
    - Replacing all internal whitespace characters with underscores.
    - Removing or replacing characters invalid in filenames.

    Args:
        filename (str): Base filename.

    Returns:
        str: A sanitized string suitable to use as a filename.
    """
    # Strip leading/trailing whitespace
    sanitized = filename.strip()

    # Replace all whitespace sequences (space, tab, etc.) with underscores
    sanitized = re.sub(r'\s+', '_', sanitized)

    # Conservative filter to keep filenames safe across platforms
    sanitized = re.sub(r'[^\w\-.]', '', sanitized)

    return sanitized
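
A short, hypothetical usage sketch (not part of the package) tying the utilities together; the directory path and the output file naming are invented for illustration. It streams every CSV in a folder via the generator, derives a filesystem-safe output name with `sanitize_filename`, and shows the scale-repair heuristic of `normalize_mixed_list` on the docstring example.

# Hypothetical example script; "~/datasets/raw" is a placeholder directory.
from ml_tools.utilities import yield_dataframes_from_dir, sanitize_filename, normalize_mixed_list

# Lazily load each CSV and write a cleaned copy under a filesystem-safe name
for df, name in yield_dataframes_from_dir("~/datasets/raw"):
    out_name = sanitize_filename(f"{name} cleaned v1.csv")  # e.g. "sales 2021.csv" -> "sales_2021_cleaned_v1.csv"
    df.dropna(how="all").to_csv(out_name, index=False)

# 0.001 sits 3 orders of magnitude below the median scale (more than the threshold of 2),
# so it is multiplied by 10**3 before the list is normalized to sum to 1.0
print(normalize_mixed_list([1, "0.001", 3, None]))  # [0.2, 0.2, 0.6, 0.0]

Whether a value counts as "suspect" depends on the threshold argument; raising it to 3 would leave 0.001 untouched in this example.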