nkululeko 0.90.0__py3-none-any.whl → 0.90.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. nkululeko/aug_train.py +6 -4
  2. nkululeko/augment.py +6 -4
  3. nkululeko/augmenting/augmenter.py +4 -4
  4. nkululeko/augmenting/randomsplicer.py +6 -6
  5. nkululeko/augmenting/randomsplicing.py +2 -3
  6. nkululeko/augmenting/resampler.py +9 -6
  7. nkululeko/autopredict/ap_age.py +4 -2
  8. nkululeko/autopredict/ap_arousal.py +4 -2
  9. nkululeko/autopredict/ap_dominance.py +3 -2
  10. nkululeko/autopredict/ap_gender.py +4 -2
  11. nkululeko/autopredict/ap_mos.py +5 -2
  12. nkululeko/autopredict/ap_pesq.py +5 -2
  13. nkululeko/autopredict/ap_sdr.py +5 -2
  14. nkululeko/autopredict/ap_snr.py +5 -2
  15. nkululeko/autopredict/ap_stoi.py +5 -2
  16. nkululeko/autopredict/ap_valence.py +4 -2
  17. nkululeko/autopredict/estimate_snr.py +10 -14
  18. nkululeko/cacheddataset.py +1 -1
  19. nkululeko/constants.py +1 -1
  20. nkululeko/data/dataset.py +11 -14
  21. nkululeko/data/dataset_csv.py +5 -3
  22. nkululeko/demo-ft.py +29 -0
  23. nkululeko/demo_feats.py +5 -4
  24. nkululeko/demo_predictor.py +3 -4
  25. nkululeko/ensemble.py +27 -28
  26. nkululeko/experiment.py +3 -5
  27. nkululeko/experiment_felix.py +728 -0
  28. nkululeko/explore.py +1 -0
  29. nkululeko/export.py +7 -5
  30. nkululeko/feat_extract/feats_agender.py +5 -4
  31. nkululeko/feat_extract/feats_agender_agender.py +7 -6
  32. nkululeko/feat_extract/feats_analyser.py +18 -16
  33. nkululeko/feat_extract/feats_ast.py +9 -8
  34. nkululeko/feat_extract/feats_auddim.py +3 -5
  35. nkululeko/feat_extract/feats_audmodel.py +2 -2
  36. nkululeko/feat_extract/feats_clap.py +9 -12
  37. nkululeko/feat_extract/feats_hubert.py +2 -3
  38. nkululeko/feat_extract/feats_import.py +5 -4
  39. nkululeko/feat_extract/feats_mld.py +3 -5
  40. nkululeko/feat_extract/feats_mos.py +4 -3
  41. nkululeko/feat_extract/feats_opensmile.py +4 -3
  42. nkululeko/feat_extract/feats_oxbow.py +5 -4
  43. nkululeko/feat_extract/feats_praat.py +4 -7
  44. nkululeko/feat_extract/feats_snr.py +3 -5
  45. nkululeko/feat_extract/feats_spectra.py +8 -9
  46. nkululeko/feat_extract/feats_spkrec.py +6 -11
  47. nkululeko/feat_extract/feats_squim.py +2 -4
  48. nkululeko/feat_extract/feats_trill.py +2 -5
  49. nkululeko/feat_extract/feats_wav2vec2.py +8 -4
  50. nkululeko/feat_extract/feats_wavlm.py +2 -3
  51. nkululeko/feat_extract/feats_whisper.py +4 -6
  52. nkululeko/feat_extract/featureset.py +4 -2
  53. nkululeko/feat_extract/feinberg_praat.py +1 -3
  54. nkululeko/feat_extract/transformer_feature_extractor.py +147 -0
  55. nkululeko/file_checker.py +3 -3
  56. nkululeko/filter_data.py +3 -1
  57. nkululeko/fixedsegment.py +83 -0
  58. nkululeko/models/model.py +3 -5
  59. nkululeko/models/model_bayes.py +1 -0
  60. nkululeko/models/model_cnn.py +4 -6
  61. nkululeko/models/model_gmm.py +13 -9
  62. nkululeko/models/model_knn.py +1 -0
  63. nkululeko/models/model_knn_reg.py +1 -0
  64. nkululeko/models/model_lin_reg.py +1 -0
  65. nkululeko/models/model_mlp.py +2 -3
  66. nkululeko/models/model_mlp_regression.py +1 -6
  67. nkululeko/models/model_svm.py +2 -2
  68. nkululeko/models/model_svr.py +1 -0
  69. nkululeko/models/model_tree.py +2 -3
  70. nkululeko/models/model_tree_reg.py +1 -0
  71. nkululeko/models/model_tuned.py +88 -41
  72. nkululeko/models/model_xgb.py +1 -0
  73. nkululeko/models/model_xgr.py +1 -0
  74. nkululeko/multidb.py +1 -0
  75. nkululeko/nkululeko.py +1 -1
  76. nkululeko/predict.py +4 -5
  77. nkululeko/reporting/defines.py +6 -8
  78. nkululeko/reporting/latex_writer.py +3 -3
  79. nkululeko/reporting/report.py +2 -2
  80. nkululeko/reporting/report_item.py +1 -0
  81. nkululeko/reporting/reporter.py +20 -19
  82. nkululeko/resample.py +8 -12
  83. nkululeko/resample_cli.py +99 -0
  84. nkululeko/runmanager.py +3 -1
  85. nkululeko/scaler.py +1 -1
  86. nkululeko/segment.py +6 -5
  87. nkululeko/segmenting/seg_inaspeechsegmenter.py +3 -3
  88. nkululeko/segmenting/seg_silero.py +4 -4
  89. nkululeko/syllable_nuclei.py +9 -22
  90. nkululeko/test_pretrain.py +6 -7
  91. nkululeko/utils/stats.py +0 -1
  92. nkululeko/utils/util.py +4 -5
  93. {nkululeko-0.90.0.dist-info → nkululeko-0.90.2.dist-info}/METADATA +11 -2
  94. nkululeko-0.90.2.dist-info/RECORD +119 -0
  95. {nkululeko-0.90.0.dist-info → nkululeko-0.90.2.dist-info}/WHEEL +1 -1
  96. nkululeko-0.90.0.dist-info/RECORD +0 -114
  97. {nkululeko-0.90.0.dist-info → nkululeko-0.90.2.dist-info}/LICENSE +0 -0
  98. {nkululeko-0.90.0.dist-info → nkululeko-0.90.2.dist-info}/top_level.txt +0 -0
nkululeko/models/model_tuned.py CHANGED
@@ -7,17 +7,18 @@ import os
7
7
  import pickle
8
8
  import typing
9
9
 
10
+ import audeer
11
+ import audiofile
12
+ import audmetric
10
13
  import datasets
11
14
  import numpy as np
12
15
  import pandas as pd
13
16
  import torch
14
17
  import transformers
15
- from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
16
- from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
17
-
18
- import audeer
19
- import audiofile
20
- import audmetric
18
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
19
+ Wav2Vec2Model,
20
+ Wav2Vec2PreTrainedModel,
21
+ )
21
22
 
22
23
  import nkululeko.glob_conf as glob_conf
23
24
  from nkululeko.models.model import Model as BaseModel
@@ -25,15 +26,20 @@ from nkululeko.reporting.reporter import Reporter
25
26
 
26
27
 
27
28
  class TunedModel(BaseModel):
28
-
29
29
  def __init__(self, df_train, df_test, feats_train, feats_test):
30
30
  """Constructor taking the configuration and all dataframes."""
31
31
  super().__init__(df_train, df_test, feats_train, feats_test)
32
32
  super().set_model_type("finetuned")
33
+ self.df_test, self.df_train, self.feats_test, self.feats_train = (
34
+ df_test,
35
+ df_train,
36
+ feats_test,
37
+ feats_train,
38
+ )
33
39
  self.name = "finetuned_wav2vec2"
34
40
  self.target = glob_conf.config["DATA"]["target"]
35
- labels = glob_conf.labels
36
- self.class_num = len(labels)
41
+ self.labels = glob_conf.labels
42
+ self.class_num = len(self.labels)
37
43
  device = self.util.config_val("MODEL", "device", False)
38
44
  if not device:
39
45
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -54,8 +60,7 @@ class TunedModel(BaseModel):
54
60
  self.learning_rate = float(
55
61
  self.util.config_val("MODEL", "learning_rate", "0.0001")
56
62
  )
57
- self.max_duration = float(
58
- self.util.config_val("MODEL", "max_duration", "8.0"))
63
+ self.max_duration = float(self.util.config_val("MODEL", "max_duration", "8.0"))
59
64
  self.df_train, self.df_test = df_train, df_test
60
65
  self.epoch_num = int(self.util.config_val("EXP", "epochs", 1))
61
66
  self.util.debug(f"num of epochs: {self.epoch_num}")
@@ -65,12 +70,12 @@ class TunedModel(BaseModel):
65
70
  self.drop = float(drop)
66
71
  self.util.debug(f"init: training with dropout: {self.drop}")
67
72
  self.push = eval(self.util.config_val("MODEL", "push_to_hub", "False"))
73
+ self.balancing = self.util.config_val("MODEL", "balancing", False)
68
74
  self._init_model()
69
75
 
70
76
  def _init_model(self):
71
77
  model_path = "facebook/wav2vec2-large-robust-ft-swbd-300h"
72
- pretrained_model = self.util.config_val(
73
- "MODEL", "pretrained_model", model_path)
78
+ pretrained_model = self.util.config_val("MODEL", "pretrained_model", model_path)
74
79
  self.num_layers = None
75
80
  self.sampling_rate = 16000
76
81
  self.max_duration_sec = self.max_duration
@@ -94,6 +99,42 @@ class TunedModel(BaseModel):
94
99
  df = y.reset_index()
95
100
  df.start = df.start.dt.total_seconds()
96
101
  df.end = df.end.dt.total_seconds()
102
+ # ds = datasets.Dataset.from_pandas(df)
103
+ # dataset[split] = ds
104
+
105
+ # self.dataset = datasets.DatasetDict(dataset)
106
+ if split == "train" and self.balancing:
107
+ if self.balancing == "ros":
108
+ from imblearn.over_sampling import RandomOverSampler
109
+
110
+ sampler = RandomOverSampler(random_state=42)
111
+ elif self.balancing == "smote":
112
+ from imblearn.over_sampling import SMOTE
113
+
114
+ sampler = SMOTE(random_state=42)
115
+ elif self.balancing == "adasyn":
116
+ from imblearn.over_sampling import ADASYN
117
+
118
+ sampler = ADASYN(random_state=42)
119
+ else:
120
+ self.util.error(f"Unknown balancing algorithm: {self.balancing}")
121
+
122
+ X_resampled, y_resampled = sampler.fit_resample(
123
+ df[["start", "end"]], df["targets"]
124
+ )
125
+ df = pd.DataFrame(
126
+ {
127
+ "start": X_resampled["start"],
128
+ "end": X_resampled["end"],
129
+ "targets": y_resampled,
130
+ }
131
+ )
132
+
133
+ # print the before and after class distribution
134
+ self.util.debug(
135
+ f"balanced with: {self.balancing}, new size: {len(df)}, was {len(data_sources[split])}"
136
+ )
137
+
97
138
  ds = datasets.Dataset.from_pandas(df)
98
139
  dataset[split] = ds
99
140
 
@@ -101,7 +142,7 @@ class TunedModel(BaseModel):
101
142
 
102
143
  # load pre-trained model
103
144
  if self.is_classifier:
104
- self.util.debug(f"Task is classification.")
145
+ self.util.debug("Task is classification.")
105
146
  le = glob_conf.label_encoder
106
147
  mapping = dict(zip(le.classes_, range(len(le.classes_))))
107
148
  target_mapping = {k: int(v) for k, v in mapping.items()}
@@ -116,7 +157,7 @@ class TunedModel(BaseModel):
116
157
  finetuning_task=target_name,
117
158
  )
118
159
  else:
119
- self.util.debug(f"Task is regression.")
160
+ self.util.debug("Task is regression.")
120
161
  self.config = transformers.AutoConfig.from_pretrained(
121
162
  pretrained_model,
122
163
  num_labels=1,
@@ -215,7 +256,6 @@ class TunedModel(BaseModel):
215
256
  return batch
216
257
 
217
258
  def compute_metrics(self, p: transformers.EvalPrediction):
218
-
219
259
  metrics = {
220
260
  "UAR": audmetric.unweighted_average_recall,
221
261
  "ACC": audmetric.accuracy,
@@ -268,10 +308,9 @@ class TunedModel(BaseModel):
268
308
  else:
269
309
  criterion = torch.nn.CrossEntropyLoss()
270
310
  else:
271
- self.util.error(
272
- f"criterion {criterion} not supported for classifier")
311
+ self.util.error(f"criterion {criterion} not supported for classifier")
273
312
  else:
274
- self.criterion = self.util.config_val("MODEL", "loss", "ccc")
313
+ criterion = self.util.config_val("MODEL", "loss", "1-ccc")
275
314
  if criterion == "1-ccc":
276
315
  criterion = ConcordanceCorCoeff()
277
316
  elif criterion == "mse":
@@ -279,8 +318,7 @@ class TunedModel(BaseModel):
279
318
  elif criterion == "mae":
280
319
  criterion = torch.nn.L1Loss()
281
320
  else:
282
- self.util.error(
283
- f"criterion {criterion} not supported for regressor")
321
+ self.util.error(f"criterion {criterion} not supported for regressor")
284
322
 
285
323
  # set push_to_hub value, default false
286
324
  # push = eval(self.util.config_val("MODEL", "push_to_hub", "False"))
@@ -319,8 +357,7 @@ class TunedModel(BaseModel):
319
357
  elif metrics_for_best_model == "MAE":
320
358
  greater_is_better = False
321
359
  else:
322
- self.util.error(
323
- f"unknown metric/measure: {metrics_for_best_model}")
360
+ self.util.error(f"unknown metric/measure: {metrics_for_best_model}")
324
361
 
325
362
  training_args = transformers.TrainingArguments(
326
363
  output_dir=model_root,
@@ -371,7 +408,7 @@ class TunedModel(BaseModel):
371
408
  self.load(self.run, self.epoch)
372
409
 
373
410
  def get_predictions(self):
374
- results = []
411
+ results = [[]].pop(0)
375
412
  for (file, start, end), _ in audeer.progress_bar(
376
413
  self.df_test.iterrows(),
377
414
  total=len(self.df_test),
@@ -384,18 +421,37 @@ class TunedModel(BaseModel):
384
421
  file, duration=end - start, offset=start, always_2d=True
385
422
  )
386
423
  assert sr == self.sampling_rate
387
- predictions = self.model.predict(signal)
388
- results.append(predictions.argmax())
389
- return results
424
+ prediction = self.model.predict(signal)
425
+ results.append(prediction)
426
+ # results.append(predictions.argmax())
427
+ predictions = np.asarray(results)
428
+ if self.util.exp_is_classification():
429
+ # make a dataframe for the class probabilities
430
+ proba_d = {}
431
+ for c in range(self.class_num):
432
+ proba_d[c] = []
433
+ # get the class probabilities
434
+ # predictions = self.clf.predict_proba(self.feats_test.to_numpy())
435
+ # pred = self.clf.predict(features)
436
+ for i in range(self.class_num):
437
+ proba_d[i] = list(predictions.T[i])
438
+ probas = pd.DataFrame(proba_d)
439
+ probas = probas.set_index(self.df_test.index)
440
+ predictions = probas.idxmax(axis=1).values
441
+ else:
442
+ predictions = predictions.flatten()
443
+ probas = None
444
+ return predictions, probas
390
445
 
391
446
  def predict(self):
392
447
  """Predict the whole eval feature set"""
393
- predictions = self.get_predictions()
448
+ predictions, probas = self.get_predictions()
394
449
  report = Reporter(
395
450
  self.df_test[self.target].to_numpy().astype(float),
396
451
  predictions,
397
452
  self.run,
398
453
  self.epoch_num,
454
+ probas=probas,
399
455
  )
400
456
  self._plot_epoch_progression(report)
401
457
  return report
@@ -407,6 +463,7 @@ class TunedModel(BaseModel):
407
463
  )
408
464
  with open(log_file, "r") as file:
409
465
  data = file.read()
466
+ data = data.strip().replace("nan", "0")
410
467
  list = ast.literal_eval(data)
411
468
  epochs, vals, loss = [], [], []
412
469
  for index, tp in enumerate(list):
@@ -453,17 +510,15 @@ class TunedModel(BaseModel):
453
510
  self.clf = pickle.load(handle)
454
511
 
455
512
 
456
- @ dataclasses.dataclass
513
+ @dataclasses.dataclass
457
514
  class ModelOutput(transformers.file_utils.ModelOutput):
458
-
459
515
  logits: torch.FloatTensor = None
460
516
  hidden_states: typing.Tuple[torch.FloatTensor] = None
461
517
  cnn_features: torch.FloatTensor = None
462
518
 
463
519
 
464
- @ dataclasses.dataclass
520
+ @dataclasses.dataclass
465
521
  class ModelOutputReg(transformers.file_utils.ModelOutput):
466
-
467
522
  logits: torch.FloatTensor
468
523
  hidden_states: typing.Tuple[torch.FloatTensor] = None
469
524
  attentions: typing.Tuple[torch.FloatTensor] = None
@@ -473,9 +528,7 @@ class ModelOutputReg(transformers.file_utils.ModelOutput):
473
528
 
474
529
 
475
530
  class ModelHead(torch.nn.Module):
476
-
477
531
  def __init__(self, config):
478
-
479
532
  super().__init__()
480
533
 
481
534
  self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
@@ -483,7 +536,6 @@ class ModelHead(torch.nn.Module):
483
536
  self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)
484
537
 
485
538
  def forward(self, features, **kwargs):
486
-
487
539
  x = features
488
540
  x = self.dropout(x)
489
541
  x = self.dense(x)
@@ -495,9 +547,7 @@ class ModelHead(torch.nn.Module):
495
547
 
496
548
 
497
549
  class Model(Wav2Vec2PreTrainedModel):
498
-
499
550
  def __init__(self, config):
500
-
501
551
  if not hasattr(config, "add_adapter"):
502
552
  setattr(config, "add_adapter", False)
503
553
 
@@ -516,7 +566,6 @@ class Model(Wav2Vec2PreTrainedModel):
516
566
  hidden_states,
517
567
  attention_mask,
518
568
  ):
519
-
520
569
  if attention_mask is None: # For evaluation with batch_size==1
521
570
  outputs = torch.mean(hidden_states, dim=1)
522
571
  else:
@@ -532,8 +581,7 @@ class Model(Wav2Vec2PreTrainedModel):
532
581
  attention_sum = torch.sum(attention_mask, dim=1)
533
582
 
534
583
  epsilon = 1e-6 # to avoid division by zero and numerical instability
535
- outputs = outputs / (torch.reshape(attention_sum, (-1, 1)) +
536
- epsilon)
584
+ outputs = outputs / (torch.reshape(attention_sum, (-1, 1)) + epsilon)
537
585
 
538
586
  return outputs
539
587
 
@@ -590,7 +638,6 @@ class Model(Wav2Vec2PreTrainedModel):
590
638
 
591
639
 
592
640
  class ConcordanceCorCoeff(torch.nn.Module):
593
-
594
641
  def __init__(self):
595
642
  super().__init__()
596
643
  self.mean = torch.mean
nkululeko/models/model_xgb.py CHANGED
@@ -1,6 +1,7 @@
1
1
  # xgbmodel.py
2
2
 
3
3
  from xgboost import XGBClassifier
4
+
4
5
  from nkululeko.models.model import Model
5
6
 
6
7
 
nkululeko/models/model_xgr.py CHANGED
@@ -1,6 +1,7 @@
1
1
  # xgrmodel.py
2
2
 
3
3
  from xgboost.sklearn import XGBRegressor
4
+
4
5
  from nkululeko.models.model import Model
5
6
 
6
7
 
nkululeko/multidb.py CHANGED
@@ -5,6 +5,7 @@ The `main` function is the entry point of the script, which parses command-line
5
5
 
6
6
  The `plot_heatmap` function generates a heatmap plot of the results and saves it to a file, along with some summary statistics.
7
7
  """
8
+
8
9
  # main.py
9
10
  # Demonstration code to use the ML-experiment framework
10
11
 
nkululeko/nkululeko.py CHANGED
@@ -6,8 +6,8 @@ import os.path
6
6
 
7
7
  import numpy as np
8
8
 
9
- from nkululeko.constants import VERSION
10
9
  import nkululeko.experiment as exp
10
+ from nkululeko.constants import VERSION
11
11
  from nkululeko.utils.util import Util
12
12
 
13
13
 
nkululeko/predict.py CHANGED
@@ -26,9 +26,9 @@ from nkululeko.utils.util import Util
26
26
 
27
27
  def main(src_dir):
28
28
  parser = argparse.ArgumentParser(
29
- description="Call the nkululeko PREDICT framework.")
30
- parser.add_argument("--config", default="exp.ini",
31
- help="The base configuration")
29
+ description="Call the nkululeko PREDICT framework."
30
+ )
31
+ parser.add_argument("--config", default="exp.ini", help="The base configuration")
32
32
  args = parser.parse_args()
33
33
  if args.config is not None:
34
34
  config_file = args.config
@@ -58,8 +58,7 @@ def main(src_dir):
58
58
 
59
59
  # split into train and test
60
60
  expr.fill_train_and_tests()
61
- util.debug(
62
- f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")
61
+ util.debug(f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")
63
62
 
64
63
  # process the data
65
64
  df = expr.autopredict()
nkululeko/reporting/defines.py CHANGED
@@ -1,5 +1,4 @@
1
1
  class DefineBase:
2
-
3
2
  @classmethod
4
3
  def _assert_has_attribute_value(cls, value):
5
4
  valid_values = cls._attribute_values()
@@ -8,17 +7,16 @@ class DefineBase:
8
7
 
9
8
  @classmethod
10
9
  def _attribute_values(cls):
11
- attributes = inspect.getmembers(
12
- cls, lambda x: not inspect.isroutine(x)
13
- )
10
+ attributes = inspect.getmembers(cls, lambda x: not inspect.isroutine(x))
14
11
  return sorted(
15
12
  [
16
- a[1] for a in attributes
17
- if not (a[0].startswith('__') and a[0].endswith('__'))
13
+ a[1]
14
+ for a in attributes
15
+ if not (a[0].startswith("__") and a[0].endswith("__"))
18
16
  ]
19
17
  )
20
18
 
21
19
 
22
20
  class Header(DefineBase):
23
- HEADER_RESULTS = 'Results'
24
- HEADER_EXPLORE = 'Data exploration'
21
+ HEADER_RESULTS = "Results"
22
+ HEADER_EXPLORE = "Data exploration"
nkululeko/reporting/latex_writer.py CHANGED
@@ -3,9 +3,9 @@ latex_writer.py
3
3
  print out report as latex file and pdf
4
4
  """
5
5
 
6
- from pylatex import Document, Section, Subsection, Command, Figure
7
- from pylatex.utils import italic, NoEscape
8
- from nkululeko.reporting.report_item import ReportItem
6
+ from pylatex import Command, Document, Figure, Section, Subsection
7
+ from pylatex.utils import NoEscape
8
+
9
9
  from nkululeko.utils.util import Util
10
10
 
11
11
 
nkululeko/reporting/report.py CHANGED
@@ -4,9 +4,9 @@ report.py
4
4
  Collector class for report items collected during module processing.
5
5
 
6
6
  """
7
- from nkululeko.utils.util import Util
8
- from nkululeko.reporting.report_item import ReportItem
7
+
9
8
  from nkululeko.reporting.latex_writer import LatexWriter
9
+ from nkululeko.utils.util import Util
10
10
 
11
11
 
12
12
  class Report:
nkululeko/reporting/report_item.py CHANGED
@@ -4,6 +4,7 @@ a basic report snippet.
4
4
 
5
5
  import os.path
6
6
 
7
+
7
8
  class ReportItem:
8
9
  def __init__(self, topic, caption, contents, image=None):
9
10
  self.topic = topic
nkululeko/reporting/reporter.py CHANGED
@@ -21,13 +21,13 @@ from scipy.special import softmax
21
21
  from scipy.stats import entropy, pearsonr
22
22
  from sklearn.metrics import (
23
23
  ConfusionMatrixDisplay,
24
+ RocCurveDisplay,
24
25
  auc,
25
26
  classification_report,
26
27
  confusion_matrix,
27
28
  r2_score,
28
29
  roc_auc_score,
29
30
  roc_curve,
30
- RocCurveDisplay,
31
31
  )
32
32
 
33
33
  import nkululeko.glob_conf as glob_conf
@@ -419,24 +419,25 @@ class Reporter:
419
419
  f1_per_class = (
420
420
  f"result per class (F1 score): {c_ress} from epoch: {epoch}"
421
421
  )
422
- if len(np.unique(self.truths)) == 2:
423
- fpr, tpr, _ = roc_curve(self.truths, self.preds)
424
- auc_score = auc(fpr, tpr)
425
- plot_path = f"{fig_dir}{self.util.get_exp_name()}_{epoch}{self.filenameadd}_roc.{self.format}"
426
- plt.figure()
427
- display = RocCurveDisplay(
428
- fpr=fpr,
429
- tpr=tpr,
430
- roc_auc=auc_score,
431
- estimator_name=f"{self.model_type} estimator",
432
- )
433
- display.plot(ax=None)
434
- plt.savefig(plot_path)
435
- plt.close()
436
- self.util.debug(f"Saved ROC curve to {plot_path}")
437
- pauc_score = roc_auc_score(self.truths, self.preds, max_fpr=0.1)
438
- auc_pauc = f"auc: {auc_score:.3f}, pauc: {pauc_score:.3f} from epoch: {epoch}"
439
- self.util.debug(auc_pauc)
422
+ # the following auc is buggy, preds should be probabilities
423
+ # if len(np.unique(self.truths)) == 2:
424
+ # fpr, tpr, _ = roc_curve(self.truths, self.preds)
425
+ # auc_score = auc(fpr, tpr)
426
+ # plot_path = f"{fig_dir}{self.util.get_exp_name()}_{epoch}{self.filenameadd}_roc.{self.format}"
427
+ # plt.figure()
428
+ # display = RocCurveDisplay(
429
+ # fpr=fpr,
430
+ # tpr=tpr,
431
+ # roc_auc=auc_score,
432
+ # estimator_name=f"{self.model_type} estimator",
433
+ # )
434
+ # display.plot(ax=None)
435
+ # plt.savefig(plot_path)
436
+ # plt.close()
437
+ # self.util.debug(f"Saved ROC curve to {plot_path}")
438
+ # pauc_score = roc_auc_score(self.truths, self.preds, max_fpr=0.1)
439
+ # auc_pauc = f"auc: {auc_score:.3f}, pauc: {pauc_score:.3f} from epoch: {epoch}"
440
+ # self.util.debug(auc_pauc)
440
441
  self.util.debug(f1_per_class)
441
442
  rpt_str = f"{json.dumps(rpt)}\n{f1_per_class}"
442
443
  # rpt_str += f"\n{auc_auc}"
nkululeko/resample.py CHANGED
@@ -4,24 +4,23 @@
4
4
  import argparse
5
5
  import configparser
6
6
  import os
7
- import pandas as pd
7
+
8
8
  import audformat
9
- from nkululeko.augmenting.resampler import Resampler
10
- from nkululeko.utils.util import Util
9
+ import pandas as pd
11
10
 
11
+ from nkululeko.augmenting.resampler import Resampler
12
12
  from nkululeko.constants import VERSION
13
13
  from nkululeko.experiment import Experiment
14
14
  from nkululeko.utils.files import find_files
15
+ from nkululeko.utils.util import Util
15
16
 
16
17
 
17
18
  def main(src_dir):
18
19
  parser = argparse.ArgumentParser(
19
20
  description="Call the nkululeko RESAMPLE framework."
20
21
  )
21
- parser.add_argument("--config", default=None,
22
- help="The base configuration")
23
- parser.add_argument("--file", default=None,
24
- help="The input audio file to resample")
22
+ parser.add_argument("--config", default=None, help="The base configuration")
23
+ parser.add_argument("--file", default=None, help="The input audio file to resample")
25
24
  parser.add_argument(
26
25
  "--folder",
27
26
  default=None,
@@ -34,9 +33,7 @@ def main(src_dir):
34
33
  args = parser.parse_args()
35
34
 
36
35
  if args.file is None and args.folder is None and args.config is None:
37
- print(
38
- "ERROR: Either --file, --folder, or --config argument must be provided."
39
- )
36
+ print("ERROR: Either --file, --folder, or --config argument must be provided.")
40
37
  exit()
41
38
 
42
39
  if args.file is not None:
@@ -102,8 +99,7 @@ def main(src_dir):
102
99
  f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}"
103
100
  )
104
101
 
105
- sample_selection = util.config_val(
106
- "RESAMPLE", "sample_selection", "all")
102
+ sample_selection = util.config_val("RESAMPLE", "sample_selection", "all")
107
103
  if sample_selection == "all":
108
104
  df = pd.concat([expr.df_train, expr.df_test])
109
105
  elif sample_selection == "train":
nkululeko/resample_cli.py ADDED
@@ -0,0 +1,99 @@
1
+ import argparse
2
+ import configparser
3
+ import os
4
+
5
+ import audformat
6
+ import pandas as pd
7
+
8
+ from nkululeko.augmenting.resampler import Resampler
9
+ from nkululeko.constants import VERSION
10
+ from nkululeko.experiment import Experiment
11
+ from nkululeko.utils.util import Util
12
+
13
+
14
+ def main(src_dir):
15
+ parser = argparse.ArgumentParser(
16
+ description="Call the nkululeko RESAMPLE framework."
17
+ )
18
+ parser.add_argument("--config", default=None, help="The base configuration")
19
+ parser.add_argument("--file", default=None, help="The input audio file to resample")
20
+ parser.add_argument(
21
+ "--replace", action="store_true", help="Replace the original audio file"
22
+ )
23
+
24
+ args = parser.parse_args()
25
+
26
+ if args.file is None and args.config is None:
27
+ print("ERROR: Either --file or --config argument must be provided.")
28
+ exit()
29
+
30
+ if args.file is not None:
31
+ # Load the audio file into a DataFrame
32
+ files = pd.Series([args.file])
33
+ df_sample = pd.DataFrame(index=files)
34
+ df_sample.index = audformat.utils.to_segmented_index(
35
+ df_sample.index, allow_nat=False
36
+ )
37
+
38
+ # Resample the audio file
39
+ util = Util("resampler", has_config=False)
40
+ util.debug(f"Resampling audio file: {args.file}")
41
+ rs = Resampler(df_sample, not_testing=True, replace=args.replace)
42
+ rs.resample()
43
+ else:
44
+ # Existing code for handling INI file
45
+ config_file = args.config
46
+
47
+ # Test if the configuration file exists
48
+ if not os.path.isfile(config_file):
49
+ print(f"ERROR: no such file: {config_file}")
50
+ exit()
51
+
52
+ # Load one configuration per experiment
53
+ config = configparser.ConfigParser()
54
+ config.read(config_file)
55
+ # Create a new experiment
56
+ expr = Experiment(config)
57
+ module = "resample"
58
+ expr.set_module(module)
59
+ util = Util(module)
60
+ util.debug(
61
+ f"running {expr.name} from config {config_file}, nkululeko version"
62
+ f" {VERSION}"
63
+ )
64
+
65
+ if util.config_val("EXP", "no_warnings", False):
66
+ import warnings
67
+
68
+ warnings.filterwarnings("ignore")
69
+
70
+ # Load the data
71
+ expr.load_datasets()
72
+
73
+ # Split into train and test
74
+ expr.fill_train_and_tests()
75
+ util.debug(
76
+ f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}"
77
+ )
78
+
79
+ sample_selection = util.config_val("RESAMPLE", "sample_selection", "all")
80
+ if sample_selection == "all":
81
+ df = pd.concat([expr.df_train, expr.df_test])
82
+ elif sample_selection == "train":
83
+ df = expr.df_train
84
+ elif sample_selection == "test":
85
+ df = expr.df_test
86
+ else:
87
+ util.error(
88
+ f"unknown selection specifier {sample_selection}, should be [all |"
89
+ " train | test]"
90
+ )
91
+ util.debug(f"resampling {sample_selection}: {df.shape[0]} samples")
92
+ replace = util.config_val("RESAMPLE", "replace", "False")
93
+ rs = Resampler(df, replace=replace)
94
+ rs.resample()
95
+
96
+
97
+ if __name__ == "__main__":
98
+ cwd = os.path.dirname(os.path.abspath(__file__))
99
+ main(cwd)
nkululeko/runmanager.py CHANGED
@@ -50,7 +50,9 @@ class Runmanager:
50
50
  self.last_epochs = [] # keep the epoch of best result per run
51
51
  # for all runs
52
52
  for run in range(int(self.util.config_val("EXP", "runs", 1))):
53
- self.util.debug(f"run {run} using model {glob_conf.config['MODEL']['type']}")
53
+ self.util.debug(
54
+ f"run {run} using model {glob_conf.config['MODEL']['type']}"
55
+ )
54
56
  # set the run index as global variable for reporting
55
57
  self.util.set_config_val("EXP", "run", run)
56
58
  self.modelrunner = Modelrunner(
nkululeko/scaler.py CHANGED
@@ -1,8 +1,8 @@
1
1
  # scaler.py
2
2
 
3
3
  import numpy as np
4
- from sklearn.preprocessing import StandardScaler, RobustScaler
5
4
  import pandas as pd
5
+ from sklearn.preprocessing import RobustScaler, StandardScaler
6
6
 
7
7
  from nkululeko.utils.util import Util
8
8