radnn 0.0.8-py3-none-any.whl → 0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- radnn/__init__.py +5 -5
- radnn/benchmark/__init__.py +1 -0
- radnn/benchmark/latency.py +55 -0
- radnn/core.py +146 -2
- radnn/data/__init__.py +5 -10
- radnn/data/dataset_base.py +100 -260
- radnn/data/dataset_base_legacy.py +280 -0
- radnn/data/errors.py +32 -0
- radnn/data/sample_preprocessor.py +58 -0
- radnn/data/sample_set.py +203 -90
- radnn/data/sample_set_kind.py +126 -0
- radnn/data/sequence_dataset.py +25 -30
- radnn/data/structs/__init__.py +1 -0
- radnn/data/structs/tree.py +322 -0
- radnn/data_beta/__init__.py +12 -0
- radnn/{data → data_beta}/data_feed.py +1 -1
- radnn/data_beta/dataset_base.py +337 -0
- radnn/data_beta/sample_set.py +166 -0
- radnn/data_beta/sequence_dataset.py +134 -0
- radnn/data_beta/structures/__init__.py +2 -0
- radnn/data_beta/structures/dictionary.py +41 -0
- radnn/{data → data_beta}/tf_classification_data_feed.py +5 -2
- radnn/errors.py +10 -2
- radnn/experiment/__init__.py +2 -0
- radnn/experiment/identification.py +7 -0
- radnn/experiment/ml_experiment.py +7 -2
- radnn/experiment/ml_experiment_log.py +47 -0
- radnn/images/image_processor.py +4 -1
- radnn/learn/__init__.py +0 -7
- radnn/learn/keras/__init__.py +4 -0
- radnn/learn/{state → keras}/keras_best_state_saver.py +5 -1
- radnn/learn/{learning_algorithm.py → keras/keras_learning_algorithm.py} +5 -9
- radnn/learn/{keras_learning_rate_scheduler.py → keras/keras_learning_rate_scheduler.py} +4 -1
- radnn/learn/{keras_optimization_algorithm.py → keras/keras_optimization_combo.py} +7 -3
- radnn/learn/torch/__init__.py +3 -0
- radnn/learn/torch/ml_model_freezer.py +330 -0
- radnn/learn/torch/ml_trainer.py +461 -0
- radnn/learn/torch/staircase_lr_scheduler.py +21 -0
- radnn/ml_system.py +68 -52
- radnn/models/__init__.py +5 -0
- radnn/models/cnn/__init__.py +0 -0
- radnn/models/cnn/cnn_stem_setup.py +35 -0
- radnn/models/model_factory.py +85 -0
- radnn/models/model_hyperparams.py +128 -0
- radnn/models/model_info.py +91 -0
- radnn/plots/plot_learning_curve.py +19 -8
- radnn/system/__init__.py +1 -0
- radnn/system/files/__init__.py +1 -1
- radnn/system/files/csvfile.py +37 -5
- radnn/system/files/filelist.py +30 -0
- radnn/system/files/fileobject.py +11 -1
- radnn/system/files/imgfile.py +1 -1
- radnn/system/files/jsonfile.py +37 -9
- radnn/system/files/picklefile.py +3 -3
- radnn/system/files/textfile.py +39 -10
- radnn/system/files/zipfile.py +96 -0
- radnn/system/filestore.py +147 -47
- radnn/system/filesystem.py +3 -3
- radnn/test/__init__.py +1 -0
- radnn/test/tensor_hash.py +130 -0
- radnn/utils.py +16 -2
- radnn-0.1.0.dist-info/METADATA +30 -0
- radnn-0.1.0.dist-info/RECORD +99 -0
- {radnn-0.0.8.dist-info → radnn-0.1.0.dist-info}/WHEEL +1 -1
- {radnn-0.0.8.dist-info → radnn-0.1.0.dist-info/licenses}/LICENSE.txt +1 -1
- radnn/learn/state/__init__.py +0 -4
- radnn-0.0.8.dist-info/METADATA +0 -58
- radnn-0.0.8.dist-info/RECORD +0 -70
- /radnn/{data → data_beta}/dataset_folder.py +0 -0
- /radnn/{data → data_beta}/image_dataset.py +0 -0
- /radnn/{data → data_beta}/image_dataset_files.py +0 -0
- /radnn/{data → data_beta}/preprocess/__init__.py +0 -0
- /radnn/{data → data_beta}/preprocess/normalizer.py +0 -0
- /radnn/{data → data_beta}/preprocess/standardizer.py +0 -0
- /radnn/{data → data_beta}/subset_type.py +0 -0
- {radnn-0.0.8.dist-info → radnn-0.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from radnn import mlsys, FileStore
|
|
3
|
+
|
|
4
|
+
# -----------------------------
|
|
5
|
+
# Standard Libraries
|
|
6
|
+
# -----------------------------
|
|
7
|
+
import time
|
|
8
|
+
|
|
9
|
+
# -----------------------------
|
|
10
|
+
# PyTorch
|
|
11
|
+
# -----------------------------
|
|
12
|
+
import torch
|
|
13
|
+
import torch.nn as nn
|
|
14
|
+
import torch.optim as optim
|
|
15
|
+
from torch.optim import lr_scheduler
|
|
16
|
+
|
|
17
|
+
from radnn.evaluation import EvaluateClassification
|
|
18
|
+
from radnn.plots import PlotConfusionMatrix, PlotLearningCurve
|
|
19
|
+
from radnn.experiment import MLExperimentLog, experiment_fold_number, experiment_name_with_fold
|
|
20
|
+
import matplotlib.pyplot as plt
|
|
21
|
+
from radnn.learn.torch import StairCaseLR
|
|
22
|
+
from radnn.errors import *
|
|
23
|
+
|
|
24
|
+
# -----------------------------
|
|
25
|
+
# Progress Bar
|
|
26
|
+
# -----------------------------
|
|
27
|
+
from tqdm import tqdm
|
|
28
|
+
|
|
29
|
+
# ----------------------------------------------------------------------------------------------------------------------
|
|
30
|
+
def seed_everything(seed=42):
  """Seed all pseudo-random generators (python, numpy, torch) for reproducible runs.

  Also configures cuDNN for deterministic behavior.

  :param seed: integer seed applied to every generator (default 42).
  """
  import os
  import random
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  # BUGFIX: benchmark was True, which lets cuDNN auto-tune and pick potentially
  # non-deterministic algorithms, defeating the determinism requested above.
  # Per the PyTorch reproducibility notes it must be disabled.
  torch.backends.cudnn.benchmark = False
|
|
40
|
+
# ----------------------------------------------------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class MLModelTrainer():
  """Supervised training driver for torch classification models.

  Builds the loss criterion, optimizer and learning-rate scheduler from a
  hyperparameter collection, runs the train/validation loop with per-epoch
  logging, checkpoints the weights with the best validation F1 score, exports
  metric reports and can export the trained model to ONNX. All file locations
  are resolved through the global `mlsys` filesystem object.
  """
  # --------------------------------------------------------------------------------------------------------------------
  def __init__(self, hyperparams, dataset, model, device):
    """
    :param hyperparams: dict-like hyperparameter collection keyed by dotted paths (e.g. "Training.Epochs").
    :param dataset: dataset exposing `.ts`/`.vs` subsets with `.loader`, `.minibatch_count`, `.sample_count`.
    :param model: the torch.nn.Module to train.
    :param device: torch device minibatch tensors are moved to during training.
    """
    self.hprm = hyperparams

    # The python/numpy generators might have been used prior to the start of training.
    # We need to re-seed here to reset to the start of the pseudo-random sequence,
    # plus encapsulating the reproducibility for torch in case mlsys.random_seed_all has not been explicitly called
    seed_everything(self.hprm["Experiment.RandomSeed"])

    self.dataset = dataset
    self.model = model
    self.device = device
    self.criterion = None
    self.optimizer = None
    self.scheduler = None
    self.best_model_state = None  # filename stem shared by the .pth/.onnx state files
    self.best_model_state_file = None
    self.training_logs_file = None
    self.experiment_hyperparams_file = None
    self.get_model_paths()
    self.mlflow_run_id = None
    self.registered_model = None

  # --------------------------------------------------------------------------------------------------------------------
  def get_lr(self):
    """Returns the current learning rate of the first optimizer parameter group."""
    return self.optimizer.param_groups[0]["lr"]

  # --------------------------------------------------------------------------------------------------------------------
  def get_model_paths(self):
    """Resolves the experiment filestore and all output file paths for this run."""
    hprm = self.hprm
    sExperimentName = hprm["Experiment.Name"]
    self.best_model_state = f'{hprm["Dataset.Name"]}_{hprm["Model.Name"]}_pipeline{hprm["Data.Pipeline.Type"]}_{sExperimentName}'
    sExperimentWithFoldNumber = experiment_name_with_fold(hprm)
    self.experiment_fs: FileStore = mlsys.filesys.models.subfs(sExperimentWithFoldNumber)
    self.best_model_state_file = self.experiment_fs.file(f'{self.best_model_state}.pth')
    self.best_model_state_onnx_file = self.experiment_fs.file(f'{self.best_model_state}.onnx')
    self.training_logs_file = self.experiment_fs.file(f"training_logs_{sExperimentName}.json")
    self.experiment_hyperparams_file = self.experiment_fs.file(f"hyperparams_{sExperimentWithFoldNumber}.json")

  # --------------------------------------------------------------------------------------------------------------------
  def build_optimizer(self):
    """Creates self.optimizer according to "Training.Optimizer" (SGD / RMSProp / Adam / AdamW).

    :raises Exception: when the optimizer name is not one of the supported values.
    """
    hprm = self.hprm
    sExtra = ""
    sOptimizerName = hprm["Training.Optimizer"].upper()
    if sOptimizerName == "SGD":
      self.optimizer = optim.SGD(self.model.parameters(), lr=hprm["Training.LearningRate"],
                                 momentum=hprm.get("Training.Momentum", 0.0),
                                 nesterov=hprm.get("Training.Momentum.Nesterov", False),
                                 weight_decay=hprm["Training.Regularize.WeightDecay"])
      sExtra = f'momentum={self.optimizer.defaults["momentum"]}'
      if self.optimizer.defaults["nesterov"]:
        sExtra += " (Nesterov)"
    elif sOptimizerName == "RMSPROP":
      self.optimizer = optim.RMSprop(self.model.parameters(), lr=hprm["Training.LearningRate"],
                                     weight_decay=hprm["Training.Regularize.WeightDecay"],
                                     momentum=hprm.get("Training.Momentum", 0.0),
                                     eps=hprm.get("Training.RMSProp.Epsilon", 1e-8)
                                     )
    elif sOptimizerName == "ADAM":
      self.optimizer = optim.Adam(self.model.parameters(), lr=hprm["Training.LearningRate"],
                                  weight_decay=hprm["Training.Regularize.WeightDecay"])
    elif sOptimizerName == "ADAMW":
      self.optimizer = optim.AdamW(self.model.parameters(), lr=hprm["Training.LearningRate"],
                                   weight_decay=hprm["Training.Regularize.WeightDecay"])
    else:
      # BUGFIX: an unsupported name used to leave self.optimizer silently as None,
      # deferring the failure to the first scheduler/step call.
      raise Exception(f'Unsupported optimizer "{hprm["Training.Optimizer"]}"')

    print(f'Using {hprm["Training.Optimizer"].upper()} optimizer {sExtra}')
  # --------------------------------------------------------------------------------------------------------------------
  def build_lr_scheduler(self):
    """Creates self.scheduler according to "Training.LearningRateSchedule".

    Supported values (case-insensitive): "MultiStepDivisor" (torch MultiStepLR),
    "StairCase" (StairCaseLR with a [[epoch, lr], ...] setup) and "CosineAnnealing".

    :raises Exception: when the schedule setup keys are missing or malformed.
    """
    hprm = self.hprm
    sSchedulingType = hprm.get("Training.LearningRateSchedule", "MultiStepDivisor")
    sSchedulingType = sSchedulingType.upper()
    nDefaultSetup = [[0, hprm["Training.LearningRate"]], [hprm["Training.Epochs"], 0.00001]]
    # The last [epoch, lr] pair defines the end point of the cosine annealing curve.
    nFinalChangeEpoch, nFinalLR = hprm.get("Training.LearningRateSchedule.Setup", nDefaultSetup)[-1]

    self.scheduler = None
    if sSchedulingType == "MultiStepDivisor".upper():
      if "Training.LearningRateSchedule.Epochs" in hprm:
        self.scheduler = lr_scheduler.MultiStepLR(self.optimizer,
                                                  milestones=hprm["Training.LearningRateSchedule.Epochs"],
                                                  gamma=hprm["Training.LearningRateSchedule.StepRatio"])
      else:
        raise Exception(TRAINER_LR_SCHEDULER_INVALID_MILESTONE_SETUP)
    elif sSchedulingType == "StairCase".upper():
      if "Training.LearningRateSchedule.Setup" in hprm:
        oLRSetup = hprm["Training.LearningRateSchedule.Setup"]
        if not isinstance(oLRSetup, list):
          raise Exception(TRAINER_LR_SCHEDULER_INVALID_SETUP)
        self.scheduler = StairCaseLR(self.optimizer, oLRSetup)
      else:
        raise Exception(TRAINER_LR_SCHEDULER_INVALID_SETUP)

    # BUGFIX: this branch compared the upper-cased type to the mixed-case literal
    # "CosineAnnealing", which could never match, making cosine annealing unreachable.
    elif sSchedulingType == "CosineAnnealing".upper():
      self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
                                                                  T_max=nFinalChangeEpoch,
                                                                  eta_min=nFinalLR)

    assert self.scheduler is not None, TRAINER_LR_SCHEDULER_UNSUPPORTED
  # --------------------------------------------------------------------------------------------------------------------
  def prepare(self):
    """Builds the loss criterion, optimizer and LR scheduler; persists the hyperparams."""
    hprm = self.hprm
    hprm["Model.State.Best"] = self.best_model_state
    sExperimentName = hprm["Experiment.Name"]
    # NOTE(review): the "run_6classes" config subfolder looks experiment-specific/hard-coded — confirm.
    mlsys.filesys.configs.subfs("run_6classes").json.save(hprm, f"{sExperimentName}_hyperparams.json",
                                                          is_sorted_keys=False)

    if "Training.CrossEntropy.UseClassWeights" in hprm:
      class_weights_tensor = torch.tensor(self.dataset.ts.class_weights, dtype=torch.float)
      class_weights_tensor = class_weights_tensor.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
      self.criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
    else:
      self.criterion = nn.CrossEntropyLoss()

    self.build_optimizer()
    self.build_lr_scheduler()
  # --------------------------------------------------------------------------------------------------------------------
  def fit(self, device):
    """Runs the full training loop.

    Per epoch: one training pass, one validation pass, log update, evaluation
    report, and a checkpoint whenever the validation F1 score improves.

    :param device: device the model is moved to. NOTE(review): minibatch tensors
                   are moved to self.device — confirm both refer to the same device.
    """
    self.model.to(device)
    hprm = self.hprm
    dInfo = {
      "experiment_name": hprm["Experiment.Name"],
      "experiment_fold_number": experiment_name_with_fold(hprm),
      "model_name": hprm["Model.Name"],
      "model_variants": hprm["Model.Variants"]
    }
    oLog: MLExperimentLog = MLExperimentLog(self.training_logs_file, dInfo)

    best_val_f1_score = 0.0  # Track the best validation F1 score
    patience = 8  # Number of epochs to wait for improvement (early stopping currently disabled below)
    epochs_without_improvement = 0  # Counter for early stopping

    nTSBatchCount = self.dataset.ts.minibatch_count
    nVSBatchCount = self.dataset.vs.minibatch_count
    nEpochCount = hprm["Training.Epochs"]
    bInitialInfoSave = False

    nLR = hprm["Training.LearningRate"]
    self.experiment_fs.json.save(hprm, f'hyperparams_{hprm["Experiment.Name"]}.json')

    oStepLoss = []
    oStepAccuracy = []
    all_labels = None
    all_predictions = None

    for nEpochIndex in range(nEpochCount):
      print(f"\nEpoch {nEpochIndex + 1}/{nEpochCount}")

      # -------------------- Training --------------------
      self.model.train()
      train_loss, train_correct = 0.0, 0
      # BUGFIX: the running sample counter is now reset per epoch; previously it was
      # initialized once before the epoch loop and accumulated across epochs,
      # skewing the in-loop running accuracy from the second epoch onward.
      nEpochSampleCount = 0

      nLR = self.scheduler.get_last_lr()[0]
      progress_bar = tqdm(self.dataset.ts.loader, desc=f"Epoch {nEpochIndex + 1}/{nEpochCount} LR={nLR:.5f}", leave=False)
      nStart = time.perf_counter()

      for inputs, labels, ids in progress_bar:
        inputs, labels = inputs.to(self.device), labels.to(self.device)

        self.optimizer.zero_grad()
        outputs = self.model(inputs)
        loss = self.criterion(outputs, labels)
        loss.backward()
        self.optimizer.step()

        # Accumulate minibatch statistics
        mb_loss = loss.item()
        train_loss += mb_loss
        _, predicted = torch.max(outputs, 1)
        mb_correct = (predicted == labels).sum().item()
        mb_count = len(labels)
        mb_accuracy = mb_correct / mb_count

        oStepLoss.append(mb_loss)
        oStepAccuracy.append(mb_accuracy)

        train_correct += mb_correct
        nEpochSampleCount += mb_count
        train_accuracy = train_correct / nEpochSampleCount  # running accuracy within the epoch

        progress_bar.set_postfix(loss=f"{mb_loss:.4f}", accuracy=f"{mb_accuracy:.4f}")
        if not bInitialInfoSave:
          bInitialInfoSave = True

      nElapsedSecs = time.perf_counter() - nStart
      nStart = time.perf_counter()

      train_loss /= nTSBatchCount
      train_accuracy = train_correct / self.dataset.ts.sample_count
      print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_accuracy:.4f}")

      # -------------------- Validation --------------------
      self.model.eval()
      val_loss, val_correct = 0.0, 0

      all_labels = []
      all_predictions = []
      progress_bar = tqdm(self.dataset.vs.loader, desc=f"Validating {nEpochIndex + 1}/{nEpochCount}", leave=False)
      with torch.no_grad():
        for inputs, labels, ids in progress_bar:
          inputs, labels = inputs.to(self.device), labels.to(self.device)
          outputs = self.model(inputs)
          loss = self.criterion(outputs, labels).double()

          val_loss += loss.item()
          _, predicted = torch.max(outputs, 1)
          all_labels.extend(labels.cpu().numpy().tolist())
          all_predictions.extend(predicted.cpu().numpy().tolist())
          val_correct += (predicted == labels).sum().item()

          progress_bar.set_postfix(loss=loss.item())

      val_loss /= nVSBatchCount
      val_accuracy = val_correct / self.dataset.vs.sample_count

      if self.scheduler is not None:
        self.scheduler.step()
        nLR = self.scheduler.get_last_lr()[0]
      print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f} | LR: {nLR:.5f}")

      # -------------------- Update logs / Evaluation Report --------------------
      oLog.append(epoch=nEpochIndex+1,
                  epoch_time=nElapsedSecs,
                  train_loss=train_loss,
                  train_accuracy=train_accuracy,
                  val_loss=val_loss,
                  val_accuracy=val_accuracy,)
      oLog.assign_series(train_step_loss=oStepLoss, train_step_accuracy=oStepAccuracy)
      oLog.save(self.experiment_fs)
      print(f"📊 Training logs saved to {self.training_logs_file}")

      oEvaluator = self.evaluation_report(all_labels, all_predictions, oLog.logs, is_showing_plots=False)

      # -------------------- Checkpoint & Early Stopping --------------------
      val_f1_score = oEvaluator.average_f1score
      if (val_f1_score > best_val_f1_score):
        best_val_f1_score = val_f1_score
        # (removed a no-op self-assignment of self.best_model_state_file here)
        torch.save(self.model.state_dict(), self.best_model_state_file)
        hprm["Model.State.BestEpoch"] = f"Epoch{nEpochIndex + 1}"
        self.experiment_fs.json.save(hprm, f'hyperparams_{hprm["Experiment.Name"]}.json')
        print(f'✅ Best model updated with F1 score: {best_val_f1_score:.4f}')
        self.export_metrics(oEvaluator, nEpochIndex+1)

        epochs_without_improvement = 0
      else:
        epochs_without_improvement += 1

      nElapsedSecs = time.perf_counter() - nStart

      # Early stopping is deliberately disabled for now; `patience` is kept for when it returns.
      #if epochs_without_improvement >= patience:
      #  print(f'⏹ Early stopping after {nEpochIndex + 1} epochs without improvement.')
      #  break

    print("🎉 Training complete!")

  # --------------------------------------------------------------------------------------------------------------
  def print_trainable_blocks(self):
    """Prints the names of all model parameters that currently require gradients."""
    for sName, oParams in self.model.named_parameters():
      bIsTrainable = oParams.requires_grad
      if bIsTrainable:
        print(f" |__ TRAINABLE: {sName}")
  # --------------------------------------------------------------------------------------------------------------
  def export_per_class_metrics(self, evaluator, opened_file, class_names=None):
    """Writes a per-class recall/precision table to an open text file.

    NOTE(review): when `class_names` is given, the labels are read from
    `evaluator.class_names` rather than the argument — confirm this is intended.

    :param evaluator: EvaluateClassification holding the computed metrics.
    :param opened_file: an open, writable text file object.
    :param class_names: optional mapping used only as a presence flag (see note).
    """
    if class_names is not None:
      nClassCount = len(evaluator.class_names.keys())
      oClasses = [f"{evaluator.class_names[x]:7}" for x in list(range(nClassCount))]
    else:
      oClasses = sorted(np.unique(evaluator.actual_classes))
      nClassCount = len(oClasses)
      oClasses = [f"{x:^7}" for x in oClasses]
    evaluator.class_count = nClassCount

    sClasses = " |".join(oClasses)
    nRepeat = 28 + (7+2)*evaluator.class_count
    print(f" |{sClasses}|", file=opened_file)
    print("-"*nRepeat, file=opened_file)
    print(f"Per Class Recall % |{evaluator.format_series_as_pc(evaluator.recall[:])}|", file=opened_file)
    print(f"Per Class Precision % |{evaluator.format_series_as_pc(evaluator.precision[:])}|", file=opened_file)
    print("-" * nRepeat, file=opened_file)
  # --------------------------------------------------------------------------------------------------------------
  def export_overall_metrics(self, evaluator, opened_file):
    """Writes aggregate metrics (accuracy, F1, recall, precision; AUC for binary) to an open file."""
    print(f"Accuracy % :{evaluator.accuracy*100.0 :.3f}", file=opened_file)
    print(f"Average F1 Score % :{evaluator.average_f1score*100.0:.3f}", file=opened_file)
    print(f"Weighted Average Recall % :{evaluator.average_recall*100.0:.3f}", file=opened_file)
    print(f"Weighted Average Precision %:{evaluator.average_precision*100.0:.3f}", file=opened_file)
    if (evaluator.class_count == 2) and (evaluator.auc is not None):
      print(f"Area Under the Curve (AUC):{evaluator.auc:.4f}", file=opened_file)
    print("", file=opened_file)

  # --------------------------------------------------------------------------------------------------------------
  def export_metrics(self, evaluator, epoch=None):
    """Writes the metrics text report for the current experiment fold.

    :param evaluator: EvaluateClassification holding the computed metrics.
    :param epoch: best epoch number when called mid-training; None for the final report.
    """
    hprm = self.hprm
    nFoldNumber = experiment_fold_number(hprm)
    nRepeat = 80
    sMetricsFileName = self.experiment_fs.file(f'metrics_{experiment_name_with_fold(hprm)}.txt')
    with open(sMetricsFileName, "w") as oFile:
      print("="*nRepeat, file=oFile)
      if epoch is None:
        print(f'Experiment [{hprm["Experiment.Name"]}] fold {nFoldNumber} trained.', file=oFile)
      else:
        print(f'Experiment [{hprm["Experiment.Name"]}] fold {nFoldNumber} training in progress, best epoch {epoch}.', file=oFile)

      print("="*nRepeat, file=oFile)
      self.export_overall_metrics(evaluator, oFile)
      self.export_per_class_metrics(evaluator, oFile)
  # --------------------------------------------------------------------------------------------------------------
  def inspect_learned_params(self):
    """Collects and prints any learned "clip"/"temperature" parameters (diagnostic aid)."""
    oParams = dict()
    nClipCount = 0
    nTempCount = 0
    for nIndex, (name, tensor) in enumerate(self.model.state_dict().items()):
      if "clip" in name:
        nClipCount += 1
        oParams[f"clip{nClipCount}"] = tensor.detach().cpu().numpy()
      elif "temperature" in name:
        nTempCount += 1
        oParams[f"temp{nTempCount}"] = tensor.detach().cpu().numpy()

    print(oParams)
  # --------------------------------------------------------------------------------------------------------------
  def evaluation_report(self, all_labels, all_preds, logs: dict = None, is_showing_plots=False, class_names=None):
    """Prints metrics, saves the confusion-matrix and learning-curve plots, and
    returns the EvaluateClassification object.

    :param all_labels: ground-truth class indices.
    :param all_preds: predicted class indices.
    :param logs: training logs dict; when given, learning-curve plots are produced.
    :param is_showing_plots: also display the plots interactively.
    :param class_names: optional {class_index: name} mapping stored on the evaluator.
    """
    oEvaluator = EvaluateClassification(all_labels, all_preds)
    oEvaluator.print_overall()
    oEvaluator.print_confusion_matrix()
    oEvaluator.class_names = class_names

    oPlot = PlotConfusionMatrix(oEvaluator.confusion_matrix)
    oPlot = oPlot.prepare().save(self.experiment_fs.file("Confusion Matrix.png"))
    if is_showing_plots:
      oPlot.show()

    if logs is not None:
      oTrainingLogPlot = PlotLearningCurve(logs, f'Experiment {self.hprm["Experiment.Name"]}')
      oTrainingLogPlot = oTrainingLogPlot.prepare(metric_key="accuracy").save(self.experiment_fs.file("LearningCurve_Accuracy.png"))
      if is_showing_plots:
        oTrainingLogPlot.show()

      oTrainingLogPlot = PlotLearningCurve(logs, f'Experiment {self.hprm["Experiment.Name"]}')
      oTrainingLogPlot = oTrainingLogPlot.prepare(metric_key="loss").save(self.experiment_fs.file("LearningCurve_Loss.png"))
      if is_showing_plots:
        oTrainingLogPlot.show()
    plt.close()
    self.inspect_learned_params()

    return oEvaluator

  # --------------------------------------------------------------------------------------------------------------
  def load(self, filename=None):
    """Loads model weights from `filename` (defaults to the best checkpoint) and sets eval mode."""
    if filename is None:
      filename = self.best_model_state_file

    oCheckpoint = torch.load(filename)
    self.model.load_state_dict(oCheckpoint)
    self.model.eval()

  # --------------------------------------------------------------------------------------------------------------
  def evaluate(self, class_names: dict=None, filename=None):
    """Loads a checkpoint, runs inference on the validation set, and exports the metrics report.

    :param class_names: optional {class_index: name} mapping used in the report.
    :param filename: checkpoint file; defaults to the best model state file.
    """
    if filename is None:
      filename = self.best_model_state_file

    oCheckpoint = torch.load(filename)
    self.model.load_state_dict(oCheckpoint)
    self.model.eval()

    all_preds, all_labels = [], []
    with torch.no_grad():
      for inputs, labels, ids in tqdm(self.dataset.vs.loader, desc="Final Evaluation"):
        inputs, labels = inputs.to(self.device), labels.to(self.device)
        outputs = self.model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    oLog: MLExperimentLog = MLExperimentLog(self.training_logs_file)
    oLog.load(self.experiment_fs)

    oEvaluator = self.evaluation_report(all_labels, all_preds, oLog.logs, is_showing_plots=False, class_names=class_names)
    # TODO: Keep epoch number for best
    self.export_metrics(oEvaluator)

  # --------------------------------------------------------------------------------------------------------------
  def export_model(self):
    """Exports the model to ONNX (on CPU, with a dynamic batch axis) at self.best_model_state_onnx_file."""
    nInputDim = self.hprm["Data.ModelInputSize"]
    cpu_device = torch.device("cpu")
    self.model.to(cpu_device)
    self.model.eval()
    tInput = torch.randn(self.hprm["Training.BatchSize"], 3, nInputDim, nInputDim, requires_grad=True)
    # BUGFIX: Tensor.to() is not in-place; the returned tensor must be kept.
    tInput = tInput.to(cpu_device)
    # NOTE(review): output_names is empty while dynamic_axes references "output" —
    # confirm the exported graph's output name before relying on it downstream.
    torch.onnx.export(self.model, tInput, self.best_model_state_onnx_file,
                      export_params=True, opset_version=12, do_constant_folding=True,
                      input_names=['input'], output_names=[], dynamo=False,
                      dynamic_axes={
                        "input": {0: "batch"},
                        "output": {0: "batch"} }
                      )
    print('Model has been converted to ONNX')
  # --------------------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from torch.optim.lr_scheduler import LRScheduler
|
|
2
|
+
|
|
3
|
+
class StairCaseLR(LRScheduler):
  """Piecewise-constant ("staircase") learning-rate schedule.

  `setup` is a list of [epoch, lr] pairs; from each milestone epoch onward the
  paired learning rate applies, until the next milestone is reached. The same
  rate is applied to every optimizer parameter group.
  """
  def __init__(self, optimizer, setup, last_epoch=-1):
    # Keep milestones ascending so get_lr can scan them front to back.
    self.setup = sorted(setup, key=lambda pair: pair[0])
    self.lrs = [rate for _, rate in self.setup]
    self.lrs_count = len(self.lrs)
    super().__init__(optimizer, last_epoch)

  def get_lr(self):
    """Return, for each param group, the lr of the last milestone <= current epoch."""
    current_epoch = self.last_epoch if self.last_epoch > 0 else 0

    active_lr = self.setup[0][1]
    for milestone, rate in self.setup:
      if milestone > current_epoch:
        break
      active_lr = rate

    return [active_lr] * len(self.optimizer.param_groups)
|
|
21
|
+
|
radnn/ml_system.py
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
# ______________________________________________________________________________________
|
|
7
7
|
# ......................................................................................
|
|
8
8
|
|
|
9
|
-
# Copyright (c) 2018-
|
|
9
|
+
# Copyright (c) 2018-2026 Pantelis I. Kaplanoglou
|
|
10
10
|
|
|
11
11
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
12
|
# of this software and associated documentation files (the "Software"), to deal
|
|
@@ -30,35 +30,19 @@
|
|
|
30
30
|
import os
|
|
31
31
|
import random
|
|
32
32
|
import numpy as np
|
|
33
|
-
import
|
|
33
|
+
from .core import AIGridInfo, RequiredLibs
|
|
34
|
+
from .utils import classproperty
|
|
35
|
+
from radnn.system import FileSystem
|
|
34
36
|
|
|
35
37
|
class MLSystem(object):
|
|
36
|
-
# --------------------------------------------------------------------------------------
|
|
37
38
|
_instance = None
|
|
38
|
-
|
|
39
|
-
|
|
39
|
+
|
|
40
|
+
@classproperty
|
|
41
|
+
def instance(cls):
|
|
40
42
|
if cls._instance is None:
|
|
41
43
|
cls._instance = cls()
|
|
42
|
-
mlsys = cls._instance
|
|
43
44
|
return cls._instance
|
|
44
|
-
|
|
45
|
-
@property
|
|
46
|
-
def is_using_tensorflow(self):
|
|
47
|
-
return self.is_tensorflow_installed and self._is_using_tensorflow
|
|
48
|
-
# --------------------------------------------------------------------------------------
|
|
49
|
-
@is_using_tensorflow.setter
|
|
50
|
-
def is_using_tensorflow(self, value):
|
|
51
|
-
self._is_using_tensorflow = value
|
|
52
|
-
self._is_using_torch = not value
|
|
53
|
-
# --------------------------------------------------------------------------------------
|
|
54
|
-
@property
|
|
55
|
-
def is_using_torch(self):
|
|
56
|
-
return self.is_torch_installed and self.is_using_torch
|
|
57
|
-
# --------------------------------------------------------------------------------------
|
|
58
|
-
@is_using_torch.setter
|
|
59
|
-
def is_using_torch(self, value):
|
|
60
|
-
self._is_using_torch = value
|
|
61
|
-
self._is_using_tensorflow = not value
|
|
45
|
+
|
|
62
46
|
# --------------------------------------------------------------------------------------
|
|
63
47
|
def __init__(self):
|
|
64
48
|
self._is_random_seed_initialized = False
|
|
@@ -66,48 +50,87 @@ class MLSystem(object):
|
|
|
66
50
|
self._seed = None
|
|
67
51
|
self.switches = dict()
|
|
68
52
|
self.switches["IsDebuggable"] = False
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
self.
|
|
72
|
-
self.
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
self.
|
|
53
|
+
self.req_libs: RequiredLibs = RequiredLibs()
|
|
54
|
+
|
|
55
|
+
self.framework = "other"
|
|
56
|
+
self.device = "CPU"
|
|
57
|
+
|
|
58
|
+
# Ensure cuBLAS reproducibility for torch and/or tensorflow
|
|
59
|
+
if self.req_libs.is_torch_installed:
|
|
60
|
+
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
|
|
61
|
+
import torch
|
|
62
|
+
if self.req_libs.is_tensorflow_installed:
|
|
63
|
+
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
|
|
64
|
+
import tensorflow as tf
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
if self.req_libs.is_tensorflow_installed:
|
|
68
|
+
self.framework = "tensorflow"
|
|
69
|
+
# By priority use torch for model trainers and data iterators (overrides co-existing tensorflow)
|
|
70
|
+
if self.req_libs.is_torch_installed:
|
|
71
|
+
self.framework = "torch"
|
|
72
|
+
|
|
73
|
+
self._info = None
|
|
74
|
+
|
|
75
|
+
# Initialize default device
|
|
76
|
+
if self.framework == "torch":
|
|
77
|
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
78
|
+
elif self.framework == "tensorflow":
|
|
79
|
+
gpus = tf.config.list_physical_devices("GPU")
|
|
80
|
+
if gpus:
|
|
81
|
+
tf.config.set_visible_devices(gpus[0], "GPU")
|
|
82
|
+
tf.config.experimental.set_memory_growth(gpus[0], True)
|
|
83
|
+
self.device = "/GPU:0"
|
|
84
|
+
else:
|
|
85
|
+
self.device = "/CPU:0"
|
|
76
86
|
# --------------------------------------------------------------------------------------
|
|
77
87
|
@property
|
|
78
|
-
def
|
|
88
|
+
def info(self):
|
|
89
|
+
if self._info is None:
|
|
90
|
+
self._info = AIGridInfo()
|
|
91
|
+
self.info.discover_devices(self.framework)
|
|
92
|
+
return self._info
|
|
93
|
+
# --------------------------------------------------------------------------------------
|
|
94
|
+
@property
|
|
95
|
+
def filesys(self) -> FileSystem:
|
|
79
96
|
return self._filesys
|
|
80
|
-
|
|
97
|
+
|
|
81
98
|
@filesys.setter
|
|
82
99
|
def filesys(self, value):
|
|
83
100
|
self._filesys = value
|
|
84
|
-
|
|
85
101
|
# --------------------------------------------------------------------------------------
|
|
86
102
|
@property
|
|
87
103
|
def seed(self):
|
|
88
104
|
return self._seed
|
|
89
105
|
# --------------------------------------------------------------------------------------
|
|
90
|
-
# We are seeding the number generators to get some amount of determinism for the whole ML training process.
|
|
91
|
-
# For Tensorflow it is not ensuring 100% deterministic reproduction of an experiment on the GPU.
|
|
92
106
|
def random_seed_all(self, seed, is_done_once=False, is_parallel_deterministic=False):
|
|
107
|
+
'''
|
|
108
|
+
We are seeding the number generators to get some amount of determinism for the whole ML training process.
|
|
109
|
+
For Tensorflow it is not ensuring 100% deterministic reproduction of an experiment that runs on the GPU.
|
|
110
|
+
|
|
111
|
+
:param seed:
|
|
112
|
+
:param is_done_once:
|
|
113
|
+
:param is_parallel_deterministic:
|
|
114
|
+
:return:
|
|
115
|
+
'''
|
|
93
116
|
self._seed = seed
|
|
94
|
-
|
|
117
|
+
|
|
95
118
|
bContinue = True
|
|
96
119
|
if is_done_once:
|
|
97
120
|
bContinue = (not self._is_random_seed_initialized)
|
|
98
|
-
|
|
121
|
+
|
|
99
122
|
if bContinue:
|
|
100
123
|
random.seed(seed)
|
|
101
124
|
os.environ['PYTHONHASHSEED'] = str(seed)
|
|
102
125
|
np.random.seed(seed)
|
|
103
|
-
if
|
|
126
|
+
if self.req_libs.is_tensorflow_installed:
|
|
104
127
|
import tensorflow as tf
|
|
105
128
|
tf.compat.v1.reset_default_graph()
|
|
106
129
|
if is_parallel_deterministic:
|
|
107
130
|
tf.config.experimental.enable_op_determinism() # Enable determinism for num_parallel_calls
|
|
108
131
|
tf.random.set_seed(seed)
|
|
109
132
|
tf.keras.utils.set_random_seed(seed)
|
|
110
|
-
if
|
|
133
|
+
if self.req_libs.is_torch_installed:
|
|
111
134
|
import torch
|
|
112
135
|
torch.manual_seed(seed)
|
|
113
136
|
# GPU and multi-GPU
|
|
@@ -116,22 +139,15 @@ class MLSystem(object):
|
|
|
116
139
|
# For GPU determinism
|
|
117
140
|
torch.backends.cudnn.deterministic = True
|
|
118
141
|
torch.backends.cudnn.benchmark = False
|
|
119
|
-
|
|
142
|
+
torch.use_deterministic_algorithms(True)
|
|
143
|
+
|
|
120
144
|
self._is_random_seed_initialized = True
|
|
121
145
|
print("(>) Random seed set to %d" % seed)
|
|
122
146
|
# --------------------------------------------------------------------------------------
|
|
123
147
|
|
|
124
148
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
if not is_tensorflow_installed:
|
|
129
|
-
bIsInstalled = importlib.util.find_spec("tensorflow-gpu") is not None
|
|
130
|
-
return bIsInstalled
|
|
131
|
-
# ----------------------------------------------------------------------------------------------------------------------
|
|
149
|
+
mlsys: MLSystem = MLSystem.instance
|
|
150
|
+
|
|
151
|
+
|
|
132
152
|
|
|
133
153
|
|
|
134
|
-
mlsys: MLSystem = MLSystem.Instance()
|
|
135
|
-
mlsys.is_tensorflow_installed = is_tensorflow_installed()
|
|
136
|
-
mlsys.is_torch_installed = importlib.util.find_spec("torch") is not None
|
|
137
|
-
mlsys.is_opencv_installed = importlib.util.find_spec("cv2") is not None
|
radnn/models/__init__.py
ADDED
|
File without changes
|