radnn 0.0.9__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. radnn/__init__.py +4 -5
  2. radnn/benchmark/__init__.py +1 -0
  3. radnn/benchmark/latency.py +55 -0
  4. radnn/core.py +146 -2
  5. radnn/data/__init__.py +5 -10
  6. radnn/data/dataset_base.py +100 -272
  7. radnn/data/dataset_base_legacy.py +280 -0
  8. radnn/data/errors.py +32 -0
  9. radnn/data/sample_preprocessor.py +58 -0
  10. radnn/data/sample_set.py +203 -90
  11. radnn/data/sample_set_kind.py +126 -0
  12. radnn/data/sequence_dataset.py +25 -30
  13. radnn/data/structs/__init__.py +1 -0
  14. radnn/data/structs/tree.py +322 -0
  15. radnn/data_beta/__init__.py +12 -0
  16. radnn/{data → data_beta}/data_feed.py +1 -1
  17. radnn/data_beta/dataset_base.py +337 -0
  18. radnn/data_beta/sample_set.py +166 -0
  19. radnn/data_beta/sequence_dataset.py +134 -0
  20. radnn/data_beta/structures/__init__.py +2 -0
  21. radnn/data_beta/structures/dictionary.py +41 -0
  22. radnn/{data → data_beta}/tf_classification_data_feed.py +5 -2
  23. radnn/errors.py +10 -2
  24. radnn/experiment/__init__.py +2 -0
  25. radnn/experiment/identification.py +7 -0
  26. radnn/experiment/ml_experiment.py +7 -2
  27. radnn/experiment/ml_experiment_log.py +47 -0
  28. radnn/images/image_processor.py +4 -1
  29. radnn/learn/__init__.py +0 -7
  30. radnn/learn/keras/__init__.py +4 -0
  31. radnn/learn/{state → keras}/keras_best_state_saver.py +5 -1
  32. radnn/learn/{learning_algorithm.py → keras/keras_learning_algorithm.py} +5 -9
  33. radnn/learn/{keras_learning_rate_scheduler.py → keras/keras_learning_rate_scheduler.py} +4 -1
  34. radnn/learn/{keras_optimization_algorithm.py → keras/keras_optimization_combo.py} +7 -3
  35. radnn/learn/torch/__init__.py +3 -0
  36. radnn/learn/torch/ml_model_freezer.py +330 -0
  37. radnn/learn/torch/ml_trainer.py +465 -0
  38. radnn/learn/torch/staircase_lr_scheduler.py +21 -0
  39. radnn/ml_system.py +68 -52
  40. radnn/models/__init__.py +5 -0
  41. radnn/models/cnn/__init__.py +0 -0
  42. radnn/models/cnn/cnn_stem_setup.py +35 -0
  43. radnn/models/model_factory.py +85 -0
  44. radnn/models/model_hyperparams.py +128 -0
  45. radnn/models/model_info.py +91 -0
  46. radnn/plots/plot_learning_curve.py +19 -8
  47. radnn/system/__init__.py +1 -0
  48. radnn/system/files/__init__.py +1 -1
  49. radnn/system/files/csvfile.py +37 -5
  50. radnn/system/files/filelist.py +30 -0
  51. radnn/system/files/fileobject.py +11 -1
  52. radnn/system/files/imgfile.py +1 -1
  53. radnn/system/files/jsonfile.py +34 -9
  54. radnn/system/files/picklefile.py +3 -3
  55. radnn/system/files/textfile.py +48 -16
  56. radnn/system/files/zipfile.py +96 -0
  57. radnn/system/filestore.py +147 -47
  58. radnn/system/filesystem.py +3 -3
  59. radnn/test/__init__.py +1 -0
  60. radnn/test/tensor_hash.py +130 -0
  61. radnn/utils.py +16 -2
  62. {radnn-0.0.9.dist-info → radnn-0.1.1.dist-info}/METADATA +5 -11
  63. radnn-0.1.1.dist-info/RECORD +99 -0
  64. {radnn-0.0.9.dist-info → radnn-0.1.1.dist-info}/WHEEL +1 -1
  65. {radnn-0.0.9.dist-info → radnn-0.1.1.dist-info}/licenses/LICENSE.txt +1 -1
  66. radnn/learn/state/__init__.py +0 -4
  67. radnn-0.0.9.dist-info/RECORD +0 -70
  68. /radnn/{data → data_beta}/dataset_folder.py +0 -0
  69. /radnn/{data → data_beta}/image_dataset.py +0 -0
  70. /radnn/{data → data_beta}/image_dataset_files.py +0 -0
  71. /radnn/{data → data_beta}/preprocess/__init__.py +0 -0
  72. /radnn/{data → data_beta}/preprocess/normalizer.py +0 -0
  73. /radnn/{data → data_beta}/preprocess/standardizer.py +0 -0
  74. /radnn/{data → data_beta}/subset_type.py +0 -0
  75. {radnn-0.0.9.dist-info → radnn-0.1.1.dist-info}/top_level.txt +0 -0
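The headline change in this release is a reorganization of the package: the experimental data pipeline moves from radnn.data to radnn.data_beta, the Keras-specific learning helpers move under radnn.learn.keras, and a new radnn.learn.torch package contributes a PyTorch trainer, a model freezer, and a staircase learning-rate scheduler. A minimal sketch of the new import surface (module paths follow the file list above; StairCaseLR and MLModelTrainer are confirmed by the diffs below, while the exact re-exports of the new __init__.py files are not shown in this diff and are assumptions):

    from radnn import mlsys, FileStore                       # core singleton and file store (used by the trainer below)
    from radnn.learn.torch import StairCaseLR                # new staircase LR scheduler (diff below)
    from radnn.learn.torch.ml_trainer import MLModelTrainer  # new PyTorch trainer (diff below)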
radnn/learn/torch/ml_trainer.py ADDED
@@ -0,0 +1,465 @@
+ import numpy as np
+ from radnn import mlsys, FileStore
+
+ # -----------------------------
+ # Standard Libraries
+ # -----------------------------
+ import time
+
+ # -----------------------------
+ # PyTorch
+ # -----------------------------
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ from torch.optim import lr_scheduler
+
+ from radnn.evaluation import EvaluateClassification
+ from radnn.plots import PlotConfusionMatrix, PlotLearningCurve
+ from radnn.experiment import MLExperimentLog, experiment_fold_number, experiment_name_with_fold
+ import matplotlib.pyplot as plt
+ from radnn.learn.torch import StairCaseLR
+ from radnn.errors import *
+
+ # -----------------------------
+ # Progress Bar
+ # -----------------------------
+ from tqdm import tqdm
+
+ # ----------------------------------------------------------------------------------------------------------------------
+ def seed_everything(seed=42):
+   import os
+   import random
+   os.environ['PYTHONHASHSEED'] = str(seed)
+   np.random.seed(seed)
+   random.seed(seed)
+   torch.manual_seed(seed)
+   torch.cuda.manual_seed(seed)
+   torch.backends.cudnn.deterministic = True
+   torch.backends.cudnn.benchmark = False  # benchmark mode selects non-deterministic kernels; keep it off for reproducibility
+ # ----------------------------------------------------------------------------------------------------------------------
+
+
+
+ class MLModelTrainer():
+   # --------------------------------------------------------------------------------------------------------------------
+   def __init__(self, hyperparams, dataset, model, device):
+     self.hprm = hyperparams
+
+     # The python/numpy generators might have been used prior to the start of training.
+     # We need to re-seed here to reset to the start of the pseudo-random sequence,
+     # and to encapsulate torch reproducibility in case mlsys.random_seed_all has not been explicitly called.
+     seed_everything(self.hprm["Experiment.RandomSeed"])
+
+     self.dataset = dataset
+     self.model = model
+     self.device = device
+     self.criterion = None
+     self.optimizer = None
+     self.scheduler = None
+     self.best_model_state = None
+     self.best_model_state_file = None
+     self.training_logs_file = None
+     self.experiment_hyperparams_file = None
+     self.get_model_paths()
+     self.mlflow_run_id = None
+     self.registered_model = None
+
+   # --------------------------------------------------------------------------------------------------------------------
+   def get_lr(self):
+     return self.optimizer.param_groups[0]["lr"]
+
+   # --------------------------------------------------------------------------------------------------------------------
+   def get_model_paths(self):
+     hprm = self.hprm
+     sExperimentName = hprm["Experiment.Name"]
+     self.best_model_state = f'{hprm["Dataset.Name"]}_{hprm["Model.Name"]}_pipeline{hprm["Data.Pipeline.Type"]}_{sExperimentName}'
+     sExperimentWithFoldNumber = experiment_name_with_fold(hprm)
+     self.experiment_fs: FileStore = mlsys.filesys.models.subfs(sExperimentWithFoldNumber)
+     self.best_model_state_file = self.experiment_fs.file(f'{self.best_model_state}.pth')
+     self.best_model_state_onnx_file = self.experiment_fs.file(f'{self.best_model_state}.onnx')
+     self.training_logs_file = self.experiment_fs.file(f"training_logs_{sExperimentName}.json")
+     self.experiment_hyperparams_file = self.experiment_fs.file(f"hyperparams_{sExperimentWithFoldNumber}.json")
+
+   # --------------------------------------------------------------------------------------------------------------------
+   def build_optimizer(self):
+     hprm = self.hprm
+     sExtra = ""
+     if hprm["Training.Optimizer"].upper() == "SGD":
+       self.optimizer = optim.SGD(self.model.parameters(), lr=hprm["Training.LearningRate"],
+                                  momentum=hprm.get("Training.Momentum", 0.0),
+                                  nesterov=hprm.get("Training.Momentum.Nesterov", False),
+                                  weight_decay=hprm["Training.Regularize.WeightDecay"])
+       sExtra = f'momentum={self.optimizer.defaults["momentum"]}'
+       if self.optimizer.defaults["nesterov"]:
+         sExtra += " (Nesterov)"
+     elif hprm["Training.Optimizer"].upper() == "RMSPROP":
+       self.optimizer = optim.RMSprop(self.model.parameters(), lr=hprm["Training.LearningRate"],
+                                      weight_decay=hprm["Training.Regularize.WeightDecay"],
+                                      momentum=hprm.get("Training.Momentum", 0.0),
+                                      eps=hprm.get("Training.RMSProp.Epsilon", 1e-8))
+     elif hprm["Training.Optimizer"].upper() == "ADAM":
+       self.optimizer = optim.Adam(self.model.parameters(), lr=hprm["Training.LearningRate"],
+                                   weight_decay=hprm["Training.Regularize.WeightDecay"])
+     elif hprm["Training.Optimizer"].upper() == "ADAMW":
+       self.optimizer = optim.AdamW(self.model.parameters(), lr=hprm["Training.LearningRate"],
+                                    weight_decay=hprm["Training.Regularize.WeightDecay"])
+
+     print(f'Using {hprm["Training.Optimizer"].upper()} optimizer {sExtra}')
+   # --------------------------------------------------------------------------------------------------------------------
+   def build_lr_scheduler(self):
+     hprm = self.hprm
+     sSchedulingType = hprm.get("Training.LearningRateSchedule", "MultiStepDivisor")
+     sSchedulingType = sSchedulingType.upper()
+     nDefaultSetup = [[0, hprm["Training.LearningRate"]], [hprm["Training.Epochs"], 0.00001]]
+     nFinalChangeEpoch, nFinalLR = hprm.get("Training.LearningRateSchedule.Setup", nDefaultSetup)[-1]
+
+     self.scheduler = None
+     if sSchedulingType == "MultiStepDivisor".upper():
+       if "Training.LearningRateSchedule.Epochs" in hprm:
+         self.scheduler = lr_scheduler.MultiStepLR(self.optimizer,
+                                                   milestones=hprm["Training.LearningRateSchedule.Epochs"],
+                                                   gamma=hprm["Training.LearningRateSchedule.StepRatio"])
+       else:
+         raise Exception(TRAINER_LR_SCHEDULER_INVALID_MILESTONE_SETUP)
+     elif sSchedulingType == "StairCase".upper():
+       if "Training.LearningRateSchedule.Setup" in hprm:
+         oLRSetup = hprm["Training.LearningRateSchedule.Setup"]
+         if not isinstance(oLRSetup, list):
+           raise Exception(TRAINER_LR_SCHEDULER_INVALID_SETUP)
+         self.scheduler = StairCaseLR(self.optimizer, oLRSetup)
+       else:
+         raise Exception(TRAINER_LR_SCHEDULER_INVALID_SETUP)
+
+     elif sSchedulingType == "CosineAnnealing".upper():
+       self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
+                                                                   T_max=nFinalChangeEpoch,
+                                                                   eta_min=nFinalLR)
+
+     assert self.scheduler is not None, TRAINER_LR_SCHEDULER_UNSUPPORTED
+   # --------------------------------------------------------------------------------------------------------------------
+   def prepare(self):
+     hprm = self.hprm
+     hprm["Model.State.Best"] = self.best_model_state
+     sExperimentName = hprm["Experiment.Name"]
+     mlsys.filesys.configs.subfs("run_6classes").json.save(hprm, f"{sExperimentName}_hyperparams.json",
+                                                           is_sorted_keys=False)
+
+     if "Training.CrossEntropy.UseClassWeights" in hprm:
+       class_weights_tensor = torch.tensor(self.dataset.ts.class_weights, dtype=torch.float)
+       class_weights_tensor = class_weights_tensor.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+       self.criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
+     else:
+       self.criterion = nn.CrossEntropyLoss()
+
+     self.build_optimizer()
+     self.build_lr_scheduler()
+   # --------------------------------------------------------------------------------------------------------------------
+   def fit(self, device, **kwargs):
+     self.model.to(device)
+     bIsPreview = kwargs.get("is_preview", False)
+
+     hprm = self.hprm
+     dInfo = {
+       "experiment_name": hprm["Experiment.Name"],
+       "experiment_fold_number": experiment_name_with_fold(hprm),
+       "model_name": hprm["Model.Name"],
+       "model_variants": hprm["Model.Variants"]
+     }
+     oLog: MLExperimentLog = MLExperimentLog(self.training_logs_file, dInfo)
+
+     best_val_f1_score = 0.0         # Track the best validation F1 score
+     patience = 8                    # Number of epochs to wait for improvement
+     epochs_without_improvement = 0  # Counter for early stopping
+
+     nTSBatchCount = self.dataset.ts.minibatch_count
+     nVSBatchCount = self.dataset.vs.minibatch_count
+     nEpochCount = hprm["Training.Epochs"]
+     bInitialInfoSave = False
+
+     nLR = hprm["Training.LearningRate"]
+     self.experiment_fs.json.save(hprm, f'hyperparams_{hprm["Experiment.Name"]}.json')
+
+     oStepLoss = []
+     oStepAccuracy = []
+     all_labels = None
+     all_predictions = None
+
+     for nEpochIndex in range(nEpochCount):
+       print(f"\nEpoch {nEpochIndex + 1}/{nEpochCount}")
+
+       # -------------------- Training --------------------
+       self.model.train()
+       train_loss, train_correct = 0.0, 0
+       nEpochSampleCount = 0  # Reset per epoch so the running accuracy below only covers the current epoch
+
+       nLR = self.scheduler.get_last_lr()[0]
+       progress_bar = tqdm(self.dataset.ts.loader, desc=f"Epoch {nEpochIndex + 1}/{nEpochCount} LR={nLR:.5f}", leave=False)
+       nStart = time.perf_counter()
+
+       nDebugSteps = 0
+       for inputs, labels, ids in progress_bar:
+         nDebugSteps += 1
+         inputs, labels = inputs.to(self.device), labels.to(self.device)
+
+         self.optimizer.zero_grad()
+         outputs = self.model(inputs)
+         loss = self.criterion(outputs, labels).double()
+         loss.backward()
+         self.optimizer.step()
+
+         # Accumulate
+         mb_loss = loss.item()
+         train_loss += mb_loss
+         _, predicted = torch.max(outputs, 1)
+         mb_correct = (predicted == labels).sum().item()
+         mb_count = len(labels)
+         mb_accuracy = mb_correct / mb_count
+
+         oStepLoss.append(mb_loss)
+         oStepAccuracy.append(mb_accuracy)
+
+         train_correct += mb_correct
+         nEpochSampleCount += mb_count
+         train_accuracy = train_correct / nEpochSampleCount
+
+         progress_bar.set_postfix(loss=f"{mb_loss:.4f}", accuracy=f"{mb_accuracy:.4f}")
+         if not bInitialInfoSave:
+           bInitialInfoSave = True
+
+         if bIsPreview:
+           break
+
+       nElapsedSecs = time.perf_counter() - nStart
+       nStart = time.perf_counter()
+
+       train_loss /= nTSBatchCount
+       train_accuracy = train_correct / self.dataset.ts.sample_count
+       print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
+
+       # -------------------- Validation --------------------
+       self.model.eval()
+       val_loss, val_correct = 0.0, 0
+
+       all_labels = []
+       all_predictions = []
+       progress_bar = tqdm(self.dataset.vs.loader, desc=f"Validating {nEpochIndex + 1}/{nEpochCount}", leave=False)
+       with torch.no_grad():
+         for inputs, labels, ids in progress_bar:
+           inputs, labels = inputs.to(self.device), labels.to(self.device)
+           outputs = self.model(inputs)
+           loss = self.criterion(outputs, labels).double()
+
+           val_loss += loss.item()
+           _, predicted = torch.max(outputs, 1)
+           all_labels.extend(labels.cpu().numpy().tolist())
+           all_predictions.extend(predicted.cpu().numpy().tolist())
+           val_correct += (predicted == labels).sum().item()
+
+           progress_bar.set_postfix(loss=loss.item())
+
+       val_loss /= nVSBatchCount
+       val_accuracy = val_correct / self.dataset.vs.sample_count
+
+       if self.scheduler is not None:
+         self.scheduler.step()
+         nLR = self.scheduler.get_last_lr()[0]
+       print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f} | LR: {nLR:.5f}")
+
+       # -------------------- Update logs / Evaluation Report --------------------
+       oLog.append(epoch=nEpochIndex + 1,
+                   epoch_time=nElapsedSecs,
+                   train_loss=train_loss,
+                   train_accuracy=train_accuracy,
+                   val_loss=val_loss,
+                   val_accuracy=val_accuracy)
+       oLog.assign_series(train_step_loss=oStepLoss, train_step_accuracy=oStepAccuracy)
+       oLog.save(self.experiment_fs)
+       print(f"📊 Training logs saved to {self.training_logs_file}")
+
+       oEvaluator = self.evaluation_report(all_labels, all_predictions, oLog.logs, is_showing_plots=False)
+
+       # -------------------- Checkpoint & Early Stopping --------------------
+       val_f1_score = oEvaluator.average_f1score
+       if val_f1_score > best_val_f1_score:
+         best_val_f1_score = val_f1_score
+         torch.save(self.model.state_dict(), self.best_model_state_file)
+         hprm["Model.State.BestEpoch"] = f"Epoch{nEpochIndex + 1}"
+         self.experiment_fs.json.save(hprm, f'hyperparams_{hprm["Experiment.Name"]}.json')
+         print(f'✅ Best model updated with F1 score: {best_val_f1_score:.4f}')
+         self.export_metrics(oEvaluator, nEpochIndex + 1)
+
+         epochs_without_improvement = 0
+       else:
+         epochs_without_improvement += 1
+
+       if bIsPreview:
+         break
+       nElapsedSecs = time.perf_counter() - nStart
+
+       #if epochs_without_improvement >= patience:
+       #  print(f'⏹ Early stopping after {nEpochIndex + 1} epochs without improvement.')
+       #  break
+
+     print("🎉 Training complete!")
+
+   # --------------------------------------------------------------------------------------------------------------
+   def print_trainable_blocks(self):
+     for sName, oParams in self.model.named_parameters():
+       bIsTrainable = oParams.requires_grad
+       if bIsTrainable:
+         print(f" |__ TRAINABLE: {sName}")
+   # --------------------------------------------------------------------------------------------------------------
+   def export_per_class_metrics(self, evaluator, opened_file, class_names=None):
+     if class_names is not None:
+       nClassCount = len(evaluator.class_names.keys())
+       oClasses = [f"{evaluator.class_names[x]:7}" for x in list(range(nClassCount))]
+     else:
+       oClasses = sorted(np.unique(evaluator.actual_classes))
+       nClassCount = len(oClasses)
+       oClasses = [f"{x:^7}" for x in oClasses]
+     evaluator.class_count = nClassCount
+
+     sClasses = " |".join(oClasses)
+     nRepeat = 28 + (7 + 2) * evaluator.class_count
+     print(f" |{sClasses}|", file=opened_file)
+     print("-" * nRepeat, file=opened_file)
+     print(f"Per Class Recall % |{evaluator.format_series_as_pc(evaluator.recall[:])}|", file=opened_file)
+     print(f"Per Class Precision % |{evaluator.format_series_as_pc(evaluator.precision[:])}|", file=opened_file)
+     print("-" * nRepeat, file=opened_file)
+   # --------------------------------------------------------------------------------------------------------------
+   def export_overall_metrics(self, evaluator, opened_file):
+     print(f"Accuracy % :{evaluator.accuracy * 100.0:.3f}", file=opened_file)
+     print(f"Average F1 Score % :{evaluator.average_f1score * 100.0:.3f}", file=opened_file)
+     print(f"Weighted Average Recall % :{evaluator.average_recall * 100.0:.3f}", file=opened_file)
+     print(f"Weighted Average Precision %:{evaluator.average_precision * 100.0:.3f}", file=opened_file)
+     if (evaluator.class_count == 2) and (evaluator.auc is not None):
+       print(f"Area Under the Curve (AUC):{evaluator.auc:.4f}", file=opened_file)
+     print("", file=opened_file)
+
+   # --------------------------------------------------------------------------------------------------------------
+   def export_metrics(self, evaluator, epoch=None):
+     hprm = self.hprm
+     nFoldNumber = experiment_fold_number(hprm)
+     nRepeat = 80
+     sMetricsFileName = self.experiment_fs.file(f'metrics_{experiment_name_with_fold(hprm)}.txt')
+     with open(sMetricsFileName, "w") as oFile:
+       print("=" * nRepeat, file=oFile)
+       if epoch is None:
+         print(f'Experiment [{hprm["Experiment.Name"]}] fold {nFoldNumber} trained.', file=oFile)
+       else:
+         print(f'Experiment [{hprm["Experiment.Name"]}] fold {nFoldNumber} training in progress, best epoch {epoch}.', file=oFile)
+
+       print("=" * nRepeat, file=oFile)
+       self.export_overall_metrics(evaluator, oFile)
+       self.export_per_class_metrics(evaluator, oFile)
+   # --------------------------------------------------------------------------------------------------------------
+   def inspect_learned_params(self):
+     oParams = dict()
+     nClipCount = 0
+     nTempCount = 0
+     for nIndex, (name, tensor) in enumerate(self.model.state_dict().items()):
+       if "clip" in name:
+         nClipCount += 1
+         oParams[f"clip{nClipCount}"] = tensor.detach().cpu().numpy()
+       elif "temperature" in name:
+         nTempCount += 1
+         oParams[f"temp{nTempCount}"] = tensor.detach().cpu().numpy()
+
+     print(oParams)
+   # --------------------------------------------------------------------------------------------------------------
+   def evaluation_report(self, all_labels, all_preds, logs: dict = None, is_showing_plots=False, class_names=None):
+     oEvaluator = EvaluateClassification(all_labels, all_preds)
+     oEvaluator.print_overall()
+     oEvaluator.print_confusion_matrix()
+     oEvaluator.class_names = class_names
+
+     oPlot = PlotConfusionMatrix(oEvaluator.confusion_matrix)
+     oPlot = oPlot.prepare().save(self.experiment_fs.file("Confusion Matrix.png"))
+     if is_showing_plots:
+       oPlot.show()
+
+     if logs is not None:
+       oTrainingLogPlot = PlotLearningCurve(logs, f'Experiment {self.hprm["Experiment.Name"]}')
+       oTrainingLogPlot = oTrainingLogPlot.prepare(metric_key="accuracy").save(self.experiment_fs.file("LearningCurve_Accuracy.png"))
+       if is_showing_plots:
+         oTrainingLogPlot.show()
+
+       oTrainingLogPlot = PlotLearningCurve(logs, f'Experiment {self.hprm["Experiment.Name"]}')
+       oTrainingLogPlot = oTrainingLogPlot.prepare(metric_key="loss").save(self.experiment_fs.file("LearningCurve_Loss.png"))
+       if is_showing_plots:
+         oTrainingLogPlot.show()
+     plt.close()
+     self.inspect_learned_params()
+
+     return oEvaluator
+
+   # --------------------------------------------------------------------------------------------------------------
+   def load(self, filename=None):
+     if filename is None:
+       filename = self.best_model_state_file
+
+     oCheckpoint = torch.load(filename)
+     self.model.load_state_dict(oCheckpoint)
+     self.model.eval()
+
+   # --------------------------------------------------------------------------------------------------------------
+   def evaluate(self, class_names: dict = None, filename=None):
+     if filename is None:
+       filename = self.best_model_state_file
+
+     oCheckpoint = torch.load(filename)
+     self.model.load_state_dict(oCheckpoint)
+     self.model.eval()
+
+     all_preds, all_labels = [], []
+     with torch.no_grad():
+       for inputs, labels, ids in tqdm(self.dataset.vs.loader, desc="Final Evaluation"):
+         inputs, labels = inputs.to(self.device), labels.to(self.device)
+         outputs = self.model(inputs)
+         _, preds = torch.max(outputs, 1)
+         all_preds.extend(preds.cpu().numpy())
+         all_labels.extend(labels.cpu().numpy())
+
+     oLog: MLExperimentLog = MLExperimentLog(self.training_logs_file)
+     oLog.load(self.experiment_fs)
+
+     oEvaluator = self.evaluation_report(all_labels, all_preds, oLog.logs, is_showing_plots=False, class_names=class_names)
+     # TODO: Keep epoch number for best
+     self.export_metrics(oEvaluator)
+
+   # --------------------------------------------------------------------------------------------------------------
+   def export_model(self):
+     nInputDim = self.hprm["Data.ModelInputSize"]
+     cpu_device = torch.device("cpu")
+     self.model.to(cpu_device)
+     self.model.eval()
+     tInput = torch.randn(self.hprm["Training.BatchSize"], 3, nInputDim, nInputDim, requires_grad=True)
+     tInput = tInput.to(cpu_device)  # Tensor.to() is not in-place; keep the returned tensor
+     #TODO: Test
+     torch.onnx.export(self.model, tInput, self.best_model_state_onnx_file,
+                       export_params=True, opset_version=12, do_constant_folding=True,
+                       input_names=['input'], output_names=['output'], dynamo=False,
+                       dynamic_axes={
+                         "input": {0: "batch"},
+                         "output": {0: "batch"}}
+                       )
+     '''
+     # [TEMP] Guidance code for exporting the model
+     torch.onnx.export(self.model,                       # model being run
+                       oInput,                           # model input (or a tuple for multiple inputs)
+                       self.best_model_state_onnx_file,  # where to save the model
+                       export_params=True,               # store the trained parameter weights inside the model file
+                       opset_version=10,                 # the ONNX version to export the model to
+                       do_constant_folding=True,         # whether to execute constant folding for optimization
+                       input_names=['modelInput'],       # the model's input names
+                       output_names=['modelOutput'],     # the model's output names
+                       dynamic_axes={'modelInput': {0: 'batch_size'},    # variable length axes
+                                     'modelOutput': {0: 'batch_size'}})
+     '''
+     print('Model has been converted to ONNX')
+   # --------------------------------------------------------------------------------------------------------------
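Taken together, MLModelTrainer expects a dict-like hyperparameter object, a dataset whose ts/vs splits expose loader, minibatch_count, sample_count (and optionally class_weights), and a torch.nn.Module. A hedged usage sketch follows, assuming mlsys.filesys has already been configured with models/configs file stores, and using placeholder dataset and model objects; the hyperparameter keys are the ones the code above reads:

    import torch
    from radnn.learn.torch.ml_trainer import MLModelTrainer  # module path from this diff

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    hprm = {
      "Experiment.Name": "exp01", "Experiment.RandomSeed": 42,
      "Dataset.Name": "mydata", "Model.Name": "resnet18", "Model.Variants": "base",
      "Data.Pipeline.Type": 1, "Data.ModelInputSize": 224,
      "Training.Optimizer": "SGD", "Training.LearningRate": 0.1,
      "Training.Momentum": 0.9, "Training.Regularize.WeightDecay": 5e-4,
      "Training.Epochs": 90, "Training.BatchSize": 32,
      "Training.LearningRateSchedule": "StairCase",
      "Training.LearningRateSchedule.Setup": [[0, 0.1], [30, 0.01], [60, 0.001]],
    }

    trainer = MLModelTrainer(hprm, dataset, model, device)  # dataset/model: your own objects
    trainer.prepare()                     # builds criterion, optimizer and LR scheduler
    trainer.fit(device, is_preview=True)  # is_preview=True cuts each loop short for a dry run
    trainer.evaluate()                    # reloads the best checkpoint and reports metrics
    trainer.export_model()                # ONNX export sized by Data.ModelInputSize and Training.BatchSize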
radnn/learn/torch/staircase_lr_scheduler.py ADDED
@@ -0,0 +1,21 @@
+ from torch.optim.lr_scheduler import LRScheduler
+
+ class StairCaseLR(LRScheduler):
+   def __init__(self, optimizer, setup, last_epoch=-1):
+     self.setup = sorted(setup, key=lambda x: x[0])
+     self.lrs = [nLR for nEpochIndex, nLR in self.setup]
+     self.lrs_count = len(self.lrs)
+     super().__init__(optimizer, last_epoch)
+
+   def get_lr(self):
+     epoch = max(self.last_epoch, 0)
+
+     lr = self.setup[0][1]
+     for m, candidate_lr in self.setup:
+       if epoch >= m:
+         lr = candidate_lr
+       else:
+         break
+
+     return [lr for _ in self.optimizer.param_groups]
+
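StairCaseLR holds the learning rate piecewise constant: the setup is a list of [epoch, lr] milestones, and each lr takes effect from its epoch onward (LRScheduler applies get_lr once at construction, so the first milestone should start at epoch 0). A small sketch with placeholder model and optimizer:

    import torch
    from radnn.learn.torch import StairCaseLR

    model = torch.nn.Linear(10, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = StairCaseLR(optimizer, [[0, 0.1], [30, 0.01], [60, 0.001]])

    for epoch in range(90):
        # ... train one epoch ...
        scheduler.step()  # epochs 0-29 run at 0.1, 30-59 at 0.01, 60 onward at 0.001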
radnn/ml_system.py CHANGED
@@ -6,7 +6,7 @@
  # ______________________________________________________________________________________
  # ......................................................................................
 
- # Copyright (c) 2018-2025 Pantelis I. Kaplanoglou
+ # Copyright (c) 2018-2026 Pantelis I. Kaplanoglou
 
  # Permission is hereby granted, free of charge, to any person obtaining a copy
  # of this software and associated documentation files (the "Software"), to deal
@@ -30,35 +30,19 @@
  import os
  import random
  import numpy as np
- import importlib
+ from .core import AIGridInfo, RequiredLibs
+ from .utils import classproperty
+ from radnn.system import FileSystem
 
  class MLSystem(object):
-   # --------------------------------------------------------------------------------------
    _instance = None
-   @classmethod
-   def Instance(cls):
+
+   @classproperty
+   def instance(cls):
      if cls._instance is None:
        cls._instance = cls()
-       mlsys = cls._instance
      return cls._instance
-   # --------------------------------------------------------------------------------------
-   @property
-   def is_using_tensorflow(self):
-     return self.is_tensorflow_installed and self._is_using_tensorflow
-   # --------------------------------------------------------------------------------------
-   @is_using_tensorflow.setter
-   def is_using_tensorflow(self, value):
-     self._is_using_tensorflow = value
-     self._is_using_torch = not value
-   # --------------------------------------------------------------------------------------
-   @property
-   def is_using_torch(self):
-     return self.is_torch_installed and self.is_using_torch
-   # --------------------------------------------------------------------------------------
-   @is_using_torch.setter
-   def is_using_torch(self, value):
-     self._is_using_torch = value
-     self._is_using_tensorflow = not value
+
 
   # --------------------------------------------------------------------------------------
    def __init__(self):
      self._is_random_seed_initialized = False
@@ -66,48 +50,87 @@ class MLSystem(object):
      self._seed = None
      self.switches = dict()
      self.switches["IsDebuggable"] = False
-
-     self.is_tensorflow_installed = False
-     self.is_torch_installed = False
-     self.is_opencv_installed = False
-
-     self._is_using_tensorflow = False
-     self.is_using_torch = False
+     self.req_libs: RequiredLibs = RequiredLibs()
+
+     self.framework = "other"
+     self.device = "CPU"
+
+     # Ensure cuBLAS reproducibility for torch and/or tensorflow
+     if self.req_libs.is_torch_installed:
+       os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
+       import torch
+     if self.req_libs.is_tensorflow_installed:
+       os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+       import tensorflow as tf
+
+
+     if self.req_libs.is_tensorflow_installed:
+       self.framework = "tensorflow"
+     # By priority use torch for model trainers and data iterators (overrides co-existing tensorflow)
+     if self.req_libs.is_torch_installed:
+       self.framework = "torch"
+
+     self._info = None
+
+     # Initialize default device
+     if self.framework == "torch":
+       self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     elif self.framework == "tensorflow":
+       gpus = tf.config.list_physical_devices("GPU")
+       if gpus:
+         tf.config.set_visible_devices(gpus[0], "GPU")
+         tf.config.experimental.set_memory_growth(gpus[0], True)
+         self.device = "/GPU:0"
+       else:
+         self.device = "/CPU:0"
   # --------------------------------------------------------------------------------------
    @property
-   def filesys(self):
+   def info(self):
+     if self._info is None:
+       self._info = AIGridInfo()
+       self._info.discover_devices(self.framework)
+     return self._info
+   # --------------------------------------------------------------------------------------
+   @property
+   def filesys(self) -> FileSystem:
      return self._filesys
-   # ............................
+
    @filesys.setter
    def filesys(self, value):
      self._filesys = value
-
   # --------------------------------------------------------------------------------------
    @property
    def seed(self):
      return self._seed
   # --------------------------------------------------------------------------------------
-   # We are seeding the number generators to get some amount of determinism for the whole ML training process.
-   # For Tensorflow it is not ensuring 100% deterministic reproduction of an experiment on the GPU.
    def random_seed_all(self, seed, is_done_once=False, is_parallel_deterministic=False):
+     '''
+     We are seeding the number generators to get some amount of determinism for the whole ML training process.
+     For Tensorflow it is not ensuring 100% deterministic reproduction of an experiment that runs on the GPU.
+
+     :param seed:
+     :param is_done_once:
+     :param is_parallel_deterministic:
+     :return:
+     '''
      self._seed = seed
-
+
      bContinue = True
      if is_done_once:
        bContinue = (not self._is_random_seed_initialized)
-
+
      if bContinue:
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
-       if mlsys.is_tensorflow_installed:
+       if self.req_libs.is_tensorflow_installed:
          import tensorflow as tf
          tf.compat.v1.reset_default_graph()
          if is_parallel_deterministic:
            tf.config.experimental.enable_op_determinism()  # Enable determinism for num_parallel_calls
          tf.random.set_seed(seed)
          tf.keras.utils.set_random_seed(seed)
-       if mlsys.is_torch_installed:
+       if self.req_libs.is_torch_installed:
          import torch
          torch.manual_seed(seed)
          # GPU and multi-GPU
@@ -116,22 +139,15 @@ class MLSystem(object):
          # For GPU determinism
          torch.backends.cudnn.deterministic = True
          torch.backends.cudnn.benchmark = False
-
+         torch.use_deterministic_algorithms(True)
+
      self._is_random_seed_initialized = True
      print("(>) Random seed set to %d" % seed)
   # --------------------------------------------------------------------------------------
 
 
- # ----------------------------------------------------------------------------------------------------------------------
- def is_tensorflow_installed():
-   bIsInstalled = importlib.util.find_spec("tensorflow") is not None
-   if not is_tensorflow_installed:
-     bIsInstalled = importlib.util.find_spec("tensorflow-gpu") is not None
-   return bIsInstalled
- # ----------------------------------------------------------------------------------------------------------------------
+ mlsys: MLSystem = MLSystem.instance
+
+
 
 
- mlsys: MLSystem = MLSystem.Instance()
- mlsys.is_tensorflow_installed = is_tensorflow_installed()
- mlsys.is_torch_installed = importlib.util.find_spec("torch") is not None
- mlsys.is_opencv_installed = importlib.util.find_spec("cv2") is not None
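With this rewrite, framework detection moves into RequiredLibs and runs once when the singleton is constructed, and the module-level mlsys replaces the old MLSystem.Instance() call. A short sketch of the new surface (attribute and method names as they appear in this diff):

    from radnn import mlsys

    print(mlsys.framework)  # "torch", "tensorflow", or "other"; torch wins when both are installed
    print(mlsys.device)     # default device picked for the detected framework
    mlsys.random_seed_all(42, is_done_once=True)  # seeds python/numpy plus any installed framework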
radnn/models/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .cnn.cnn_stem_setup import CNNSizeFactor, CNNStemSetup
+ from .model_hyperparams import ModelHyperparams
+ from .model_info import ModelInfo
+ from .model_factory import ModelFactory, ModelBuildAdapter
+
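The new radnn.models package re-exports its public classes at the top level, so downstream code can import them directly:

    from radnn.models import CNNSizeFactor, CNNStemSetup, ModelHyperparams, ModelInfo, ModelFactory, ModelBuildAdapter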