nkululeko 0.86.7__py3-none-any.whl → 0.87.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nkululeko/constants.py CHANGED
@@ -1,2 +1,2 @@
1
- VERSION="0.86.7"
1
+ VERSION="0.87.0"
2
2
  SAMPLING_RATE = 16000
nkululeko/data/dataset_csv.py CHANGED
@@ -23,6 +23,9 @@ class Dataset_CSV(Dataset):
23
23
  root = os.path.dirname(data_file)
24
24
  audio_path = self.util.config_val_data(self.name, "audio_path", "./")
25
25
  df = pd.read_csv(data_file)
26
+ # trim all string values
27
+ df_obj = df.select_dtypes("object")
28
+ df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
26
29
  # special treatment for segmented dataframes with only one column:
27
30
  if "start" in df.columns and len(df.columns) == 4:
28
31
  index = audformat.segmented_index(
@@ -49,8 +52,7 @@ class Dataset_CSV(Dataset):
49
52
  .map(lambda x: root + "/" + audio_path + "/" + x)
50
53
  .values
51
54
  )
52
- df = df.set_index(df.index.set_levels(
53
- file_index, level="file"))
55
+ df = df.set_index(df.index.set_levels(file_index, level="file"))
54
56
  else:
55
57
  if not isinstance(df, pd.DataFrame):
56
58
  df = pd.DataFrame(df)
@@ -59,27 +61,24 @@ class Dataset_CSV(Dataset):
59
61
  lambda x: root + "/" + audio_path + "/" + x
60
62
  )
61
63
  )
62
- else: # absolute path is True
64
+ else: # absolute path is True
63
65
  if audformat.index_type(df.index) == "segmented":
64
66
  file_index = (
65
- df.index.levels[0]
66
- .map(lambda x: audio_path + "/" + x)
67
- .values
67
+ df.index.levels[0].map(lambda x: audio_path + "/" + x).values
68
68
  )
69
- df = df.set_index(df.index.set_levels(
70
- file_index, level="file"))
69
+ df = df.set_index(df.index.set_levels(file_index, level="file"))
71
70
  else:
72
71
  if not isinstance(df, pd.DataFrame):
73
72
  df = pd.DataFrame(df)
74
- df = df.set_index(df.index.to_series().apply(
75
- lambda x: audio_path + "/" + x ))
73
+ df = df.set_index(
74
+ df.index.to_series().apply(lambda x: audio_path + "/" + x)
75
+ )
76
76
 
77
77
  self.df = df
78
78
  self.db = None
79
79
  self.got_target = True
80
80
  self.is_labeled = self.got_target
81
- self.start_fresh = eval(
82
- self.util.config_val("DATA", "no_reuse", "False"))
81
+ self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
83
82
  is_index = False
84
83
  try:
85
84
  if self.is_labeled and not "class_label" in self.df.columns:
@@ -106,8 +105,7 @@ class Dataset_CSV(Dataset):
106
105
  f" {self.got_gender}, got age: {self.got_age}"
107
106
  )
108
107
  self.util.debug(r_string)
109
- glob_conf.report.add_item(ReportItem(
110
- "Data", "Loaded report", r_string))
108
+ glob_conf.report.add_item(ReportItem("Data", "Loaded report", r_string))
111
109
 
112
110
  def prepare(self):
113
111
  super().prepare()
nkululeko/demo.py CHANGED
@@ -30,10 +30,8 @@ from transformers import pipeline
30
30
 
31
31
 
32
32
  def main(src_dir):
33
- parser = argparse.ArgumentParser(
34
- description="Call the nkululeko DEMO framework.")
35
- parser.add_argument("--config", default="exp.ini",
36
- help="The base configuration")
33
+ parser = argparse.ArgumentParser(description="Call the nkululeko DEMO framework.")
34
+ parser.add_argument("--config", default="exp.ini", help="The base configuration")
37
35
  parser.add_argument(
38
36
  "--file", help="A file that should be processed (16kHz mono wav)"
39
37
  )
@@ -84,8 +82,7 @@ def main(src_dir):
84
82
  )
85
83
 
86
84
  def print_pipe(files, outfile):
87
- """
88
- Prints the pipeline output for a list of files, and optionally writes the results to an output file.
85
+ """Prints the pipeline output for a list of files, and optionally writes the results to an output file.
89
86
 
90
87
  Args:
91
88
  files (list): A list of file paths to process through the pipeline.
@@ -108,8 +105,7 @@ def main(src_dir):
108
105
  f.write("\n".join(results))
109
106
 
110
107
  if util.get_model_type() == "finetune":
111
- model_path = os.path.join(
112
- util.get_exp_dir(), "models", "run_0", "torch")
108
+ model_path = os.path.join(util.get_exp_dir(), "models", "run_0", "torch")
113
109
  pipe = pipeline("audio-classification", model=model_path)
114
110
  if args.file is not None:
115
111
  print_pipe([args.file], args.outfile)
nkululeko/experiment.py CHANGED
@@ -5,13 +5,13 @@ import pickle
5
5
  import random
6
6
  import time
7
7
 
8
+ import audeer
9
+ import audformat
8
10
  import numpy as np
9
11
  import pandas as pd
10
12
  from sklearn.preprocessing import LabelEncoder
11
13
 
12
- import audeer
13
- import audformat
14
-
14
+ import nkululeko.glob_conf as glob_conf
15
15
  from nkululeko.data.dataset import Dataset
16
16
  from nkululeko.data.dataset_csv import Dataset_CSV
17
17
  from nkululeko.demo_predictor import Demo_predictor
@@ -19,8 +19,6 @@ from nkululeko.feat_extract.feats_analyser import FeatureAnalyser
19
19
  from nkululeko.feature_extractor import FeatureExtractor
20
20
  from nkululeko.file_checker import FileChecker
21
21
  from nkululeko.filter_data import DataFilter
22
- from nkululeko.filter_data import filter_min_dur
23
- import nkululeko.glob_conf as glob_conf
24
22
  from nkululeko.plots import Plots
25
23
  from nkululeko.reporting.report import Report
26
24
  from nkululeko.runmanager import Runmanager
@@ -109,7 +107,8 @@ class Experiment:
109
107
  # print keys/column
110
108
  dbs = ",".join(list(self.datasets.keys()))
111
109
  labels = self.util.config_val("DATA", "labels", False)
112
- auto_labels = list(next(iter(self.datasets.values())).df[self.target].unique())
110
+ auto_labels = list(
111
+ next(iter(self.datasets.values())).df[self.target].unique())
113
112
  if labels:
114
113
  self.labels = ast.literal_eval(labels)
115
114
  self.util.debug(f"Using target labels (from config): {labels}")
@@ -159,7 +158,8 @@ class Experiment:
159
158
  data.split()
160
159
  data.prepare_labels()
161
160
  self.df_test = pd.concat(
162
- [self.df_test, self.util.make_segmented_index(data.df_test)]
161
+ [self.df_test, self.util.make_segmented_index(
162
+ data.df_test)]
163
163
  )
164
164
  self.df_test.is_labeled = data.is_labeled
165
165
  self.df_test.got_gender = self.got_gender
@@ -260,7 +260,8 @@ class Experiment:
260
260
  test_cats = self.df_test[self.target].unique()
261
261
  else:
262
262
  # if there is no target, copy a dummy label
263
- self.df_test = self._add_random_target(self.df_test).astype("str")
263
+ self.df_test = self._add_random_target(
264
+ self.df_test).astype("str")
264
265
  train_cats = self.df_train[self.target].unique()
265
266
  # print(f"df_train: {pd.DataFrame(self.df_train[self.target])}")
266
267
  # print(f"train_cats with target {self.target}: {train_cats}")
@@ -268,7 +269,8 @@ class Experiment:
268
269
  if type(test_cats) == np.ndarray:
269
270
  self.util.debug(f"Categories test (nd.array): {test_cats}")
270
271
  else:
271
- self.util.debug(f"Categories test (list): {list(test_cats)}")
272
+ self.util.debug(
273
+ f"Categories test (list): {list(test_cats)}")
272
274
  if type(train_cats) == np.ndarray:
273
275
  self.util.debug(f"Categories train (nd.array): {train_cats}")
274
276
  else:
@@ -291,7 +293,8 @@ class Experiment:
291
293
 
292
294
  target_factor = self.util.config_val("DATA", "target_divide_by", False)
293
295
  if target_factor:
294
- self.df_test[self.target] = self.df_test[self.target] / float(target_factor)
296
+ self.df_test[self.target] = self.df_test[self.target] / \
297
+ float(target_factor)
295
298
  self.df_train[self.target] = self.df_train[self.target] / float(
296
299
  target_factor
297
300
  )
@@ -314,14 +317,16 @@ class Experiment:
314
317
  def plot_distribution(self, df_labels):
315
318
  """Plot the distribution of samples and speaker per target class and biological sex"""
316
319
  plot = Plots()
317
- sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
320
+ sample_selection = self.util.config_val(
321
+ "EXPL", "sample_selection", "all")
318
322
  plot.plot_distributions(df_labels)
319
323
  if self.got_speaker:
320
324
  plot.plot_distributions_speaker(df_labels)
321
325
 
322
326
  def extract_test_feats(self):
323
327
  self.feats_test = pd.DataFrame()
324
- feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["tests"]))
328
+ feats_name = "_".join(ast.literal_eval(
329
+ glob_conf.config["DATA"]["tests"]))
325
330
  feats_types = self.util.config_val_list("FEATS", "type", ["os"])
326
331
  self.feature_extractor = FeatureExtractor(
327
332
  self.df_test, feats_types, feats_name, "test"
@@ -338,9 +343,17 @@ class Experiment:
338
343
 
339
344
  """
340
345
  df_train, df_test = self.df_train, self.df_test
341
- feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
346
+ feats_name = "_".join(ast.literal_eval(
347
+ glob_conf.config["DATA"]["databases"]))
342
348
  self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
343
- feats_types = self.util.config_val_list("FEATS", "type", [])
349
+ feats_types = self.util.config_val("FEATS", "type", "os")
350
+ # Ensure feats_types is always a list of strings
351
+ if isinstance(feats_types, str):
352
+ if feats_types.startswith("[") and feats_types.endswith("]"):
353
+ feats_types = ast.literal_eval(feats_types)
354
+ else:
355
+ feats_types = [feats_types]
356
+ # print(f"feats_types: {feats_types}")
344
357
  # for some models no features are needed
345
358
  if len(feats_types) == 0:
346
359
  self.util.debug("no feature extractor specified.")
@@ -372,7 +385,8 @@ class Experiment:
372
385
  f"test feats ({self.feats_test.shape[0]}) != test labels"
373
386
  f" ({self.df_test.shape[0]})"
374
387
  )
375
- self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
388
+ self.df_test = self.df_test[self.df_test.index.isin(
389
+ self.feats_test.index)]
376
390
  self.util.warn(f"new test labels shape: {self.df_test.shape[0]}")
377
391
 
378
392
  self._check_scale()
@@ -387,7 +401,8 @@ class Experiment:
387
401
  """Augment the selected samples."""
388
402
  from nkululeko.augmenting.augmenter import Augmenter
389
403
 
390
- sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
404
+ sample_selection = self.util.config_val(
405
+ "AUGMENT", "sample_selection", "all")
391
406
  if sample_selection == "all":
392
407
  df = pd.concat([self.df_train, self.df_test])
393
408
  elif sample_selection == "train":
@@ -482,7 +497,8 @@ class Experiment:
482
497
  """
483
498
  from nkululeko.augmenting.randomsplicer import Randomsplicer
484
499
 
485
- sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
500
+ sample_selection = self.util.config_val(
501
+ "AUGMENT", "sample_selection", "all")
486
502
  if sample_selection == "all":
487
503
  df = pd.concat([self.df_train, self.df_test])
488
504
  elif sample_selection == "train":
@@ -503,7 +519,8 @@ class Experiment:
503
519
  plot_feats = eval(
504
520
  self.util.config_val("EXPL", "feature_distributions", "False")
505
521
  )
506
- sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
522
+ sample_selection = self.util.config_val(
523
+ "EXPL", "sample_selection", "all")
507
524
  # get the data labels
508
525
  if sample_selection == "all":
509
526
  df_labels = pd.concat([self.df_train, self.df_test])
@@ -566,7 +583,8 @@ class Experiment:
566
583
  for scat_target in scat_targets:
567
584
  if self.util.is_categorical(df_labels[scat_target]):
568
585
  for scatter in scatters:
569
- plots.scatter_plot(df_feats, df_labels, scat_target, scatter)
586
+ plots.scatter_plot(
587
+ df_feats, df_labels, scat_target, scatter)
570
588
  else:
571
589
  self.util.debug(
572
590
  f"{self.name}: binning continuous variable to categories"
@@ -657,7 +675,8 @@ class Experiment:
657
675
  preds = best.preds
658
676
  speakers = self.df_test.speaker.values
659
677
  print(f"{len(truths)} {len(preds)} {len(speakers) }")
660
- df = pd.DataFrame(data={"truth": truths, "pred": preds, "speaker": speakers})
678
+ df = pd.DataFrame(
679
+ data={"truth": truths, "pred": preds, "speaker": speakers})
661
680
  plot_name = "result_combined_per_speaker"
662
681
  self.util.debug(
663
682
  f"plotting speaker combination ({function}) confusion matrix to"
@@ -733,7 +752,6 @@ class Experiment:
733
752
  if model.is_ann():
734
753
  print("converting to onnx from torch")
735
754
  else:
736
- from skl2onnx import to_onnx
737
755
 
738
756
  print("converting to onnx from sklearn")
739
757
  # save the rest
nkululeko/feature_extractor.py CHANGED
@@ -39,16 +39,20 @@ class FeatureExtractor:
39
39
  self.feats = pd.DataFrame()
40
40
  for feats_type in self.feats_types:
41
41
  store_name = f"{self.data_name}_{feats_type}"
42
- self.feat_extractor = self._get_feat_extractor(store_name, feats_type)
42
+ self.feat_extractor = self._get_feat_extractor(
43
+ store_name, feats_type)
43
44
  self.feat_extractor.extract()
44
45
  self.feat_extractor.filter()
45
- self.feats = pd.concat([self.feats, self.feat_extractor.df], axis=1)
46
+ self.feats = pd.concat(
47
+ [self.feats, self.feat_extractor.df], axis=1)
46
48
  return self.feats
47
49
 
48
50
  def extract_sample(self, signal, sr):
49
51
  return self.feat_extractor.extract_sample(signal, sr)
50
52
 
51
53
  def _get_feat_extractor(self, store_name, feats_type):
54
+ if isinstance(feats_type, list) and len(feats_type) == 1:
55
+ feats_type = feats_type[0]
52
56
  feat_extractor_class = self._get_feat_extractor_class(feats_type)
53
57
  if feat_extractor_class is None:
54
58
  self.util.error(f"unknown feats_type: {feats_type}")
@@ -103,13 +107,15 @@ class FeatureExtractor:
103
107
  prefix, _, ext = feats_type.partition("-")
104
108
  from importlib import import_module
105
109
 
106
- module = import_module(f"nkululeko.feat_extract.feats_{prefix.lower()}")
110
+ module = import_module(
111
+ f"nkululeko.feat_extract.feats_{prefix.lower()}")
107
112
  class_name = f"{prefix.capitalize()}"
108
113
  return getattr(module, class_name)
109
114
 
110
115
  def _get_feat_extractor_by_name(self, feats_type):
111
116
  from importlib import import_module
112
117
 
113
- module = import_module(f"nkululeko.feat_extract.feats_{feats_type.lower()}")
118
+ module = import_module(
119
+ f"nkululeko.feat_extract.feats_{feats_type.lower()}")
114
120
  class_name = f"{feats_type.capitalize()}Set"
115
121
  return getattr(module, class_name)
nkululeko/modelrunner.py CHANGED
@@ -85,7 +85,7 @@ class Modelrunner:
85
85
  f"run: {self.run} epoch: {epoch}: result: {test_score_metric}"
86
86
  )
87
87
  # print(f"performance: {performance.split(' ')[1]}")
88
- performance = float(test_score_metric.split(' ')[1])
88
+ performance = float(test_score_metric.split(" ")[1])
89
89
  if performance > self.best_performance:
90
90
  self.best_performance = performance
91
91
  self.best_epoch = epoch
@@ -204,15 +204,15 @@ class Modelrunner:
204
204
  self.df_train, self.df_test, self.feats_train, self.feats_test
205
205
  )
206
206
  elif model_type == "cnn":
207
- from nkululeko.models.model_cnn import CNN_model
207
+ from nkululeko.models.model_cnn import CNNModel
208
208
 
209
- self.model = CNN_model(
209
+ self.model = CNNModel(
210
210
  self.df_train, self.df_test, self.feats_train, self.feats_test
211
211
  )
212
212
  elif model_type == "mlp":
213
- from nkululeko.models.model_mlp import MLP_model
213
+ from nkululeko.models.model_mlp import MLPModel
214
214
 
215
- self.model = MLP_model(
215
+ self.model = MLPModel(
216
216
  self.df_train, self.df_test, self.feats_train, self.feats_test
217
217
  )
218
218
  elif model_type == "mlp_reg":
nkululeko/models/model.py CHANGED
@@ -247,8 +247,25 @@ class Model:
247
247
  self.clf.fit(feats, labels)
248
248
 
249
249
  def get_predictions(self):
250
- predictions = self.clf.predict(self.feats_test.to_numpy())
251
- return predictions
250
+ # predictions = self.clf.predict(self.feats_test.to_numpy())
251
+ if self.util.exp_is_classification():
252
+ # make a dataframe for the class probabilities
253
+ proba_d = {}
254
+ for c in self.clf.classes_:
255
+ proba_d[c] = []
256
+ # get the class probabilities
257
+ predictions = self.clf.predict_proba(self.feats_test.to_numpy())
258
+ # pred = self.clf.predict(features)
259
+ for i, c in enumerate(self.clf.classes_):
260
+ proba_d[c] = list(predictions.T[i])
261
+ probas = pd.DataFrame(proba_d)
262
+ probas = probas.set_index(self.feats_test.index)
263
+ predictions = probas.idxmax(axis=1).values
264
+ else:
265
+ predictions = self.clf.predict(self.feats_test.to_numpy())
266
+ probas = None
267
+
268
+ return predictions, probas
252
269
 
253
270
  def predict(self):
254
271
  if self.feats_test.isna().to_numpy().any():
@@ -263,13 +280,16 @@ class Model:
263
280
  )
264
281
  return report
265
282
  """Predict the whole eval feature set"""
266
- predictions = self.get_predictions()
283
+ predictions, probas = self.get_predictions()
284
+
267
285
  report = Reporter(
268
286
  self.df_test[self.target].to_numpy().astype(float),
269
287
  predictions,
270
288
  self.run,
271
289
  self.epoch,
290
+ probas=probas,
272
291
  )
292
+ report.print_probabilities()
273
293
  return report
274
294
 
275
295
  def get_type(self):
nkululeko/models/model_cnn.py CHANGED
@@ -5,33 +5,40 @@ Inspired by code from Su Lei
5
5
 
6
6
  """
7
7
 
8
+ import ast
9
+ from collections import OrderedDict
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from PIL import Image
14
+ from sklearn.metrics import recall_score
8
15
  import torch
9
16
  import torch.nn as nn
10
17
  import torch.nn.functional as F
11
- import torchvision
12
- import torchvision.transforms as transforms
13
18
  from torch.utils.data import Dataset
14
- import ast
15
- import numpy as np
16
- from sklearn.metrics import recall_score
17
- from collections import OrderedDict
18
- from PIL import Image
19
- from traitlets import default
19
+ import torchvision.transforms as transforms
20
20
 
21
- from nkululeko.utils.util import Util
22
21
  import nkululeko.glob_conf as glob_conf
22
+ from nkululeko.losses.loss_softf1loss import SoftF1Loss
23
23
  from nkululeko.models.model import Model
24
24
  from nkululeko.reporting.reporter import Reporter
25
- from nkululeko.losses.loss_softf1loss import SoftF1Loss
25
+ from nkululeko.utils.util import Util
26
26
 
27
27
 
28
- class CNN_model(Model):
29
- """CNN = convolutional neural net"""
28
+ class CNNModel(Model):
29
+ """CNN = convolutional neural net."""
30
30
 
31
31
  is_classifier = True
32
32
 
33
33
  def __init__(self, df_train, df_test, feats_train, feats_test):
34
- """Constructor taking the configuration and all dataframes"""
34
+ """Constructor, taking all dataframes.
35
+
36
+ Args:
37
+ df_train (pd.DataFrame): The train labels.
38
+ df_test (pd.DataFrame): The test labels.
39
+ feats_train (pd.DataFrame): The train features.
40
+ feats_test (pd.DataFrame): The test features.
41
+ """
35
42
  super().__init__(df_train, df_test, feats_train, feats_test)
36
43
  super().set_model_type("ann")
37
44
  self.name = "cnn"
@@ -147,7 +154,20 @@ class CNN_model(Model):
147
154
  self.optimizer.step()
148
155
  self.loss = (np.asarray(losses)).mean()
149
156
 
150
- def evaluate_model(self, model, loader, device):
157
+ def get_probas(self, logits):
158
+ # make a dataframe for probabilities (logits)
159
+ proba_d = {}
160
+ classes = self.df_test[self.target].unique()
161
+ classes.sort()
162
+ for c in classes:
163
+ proba_d[c] = []
164
+ for i, c in enumerate(classes):
165
+ proba_d[c] = list(logits.numpy().T[i])
166
+ probas = pd.DataFrame(proba_d)
167
+ probas = probas.set_index(self.df_test.index)
168
+ return probas
169
+
170
+ def evaluate(self, model, loader, device):
151
171
  logits = torch.zeros(len(loader.dataset), self.class_num)
152
172
  targets = torch.zeros(len(loader.dataset))
153
173
  model.eval()
@@ -169,14 +189,15 @@ class CNN_model(Model):
169
189
  self.loss_eval = (np.asarray(losses)).mean()
170
190
  predictions = logits.argmax(dim=1)
171
191
  uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
172
- return uar, targets, predictions
192
+ return uar, targets, predictions, logits
173
193
 
174
194
  def predict(self):
175
- _, truths, predictions = self.evaluate_model(
195
+ _, truths, predictions, logits = self.evaluate(
176
196
  self.model, self.testloader, self.device
177
197
  )
178
- uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
179
- report = Reporter(truths, predictions, self.run, self.epoch)
198
+ uar, _, _, _ = self.evaluate(self.model, self.trainloader, self.device)
199
+ probas = self.get_probas(logits)
200
+ report = Reporter(truths, predictions, self.run, self.epoch, probas=probas)
180
201
  try:
181
202
  report.result.loss = self.loss
182
203
  except AttributeError: # if the model was loaded from disk the loss is unknown
@@ -189,13 +210,11 @@ class CNN_model(Model):
189
210
  return report
190
211
 
191
212
  def get_predictions(self):
192
- _, truths, predictions = self.evaluate_model(
193
- self.model, self.testloader, self.device
194
- )
213
+ _, _, predictions, _ = self.evaluate(self.model, self.testloader, self.device)
195
214
  return predictions.numpy()
196
215
 
197
216
  def predict_sample(self, features):
198
- """Predict one sample"""
217
+ """Predict one sample."""
199
218
  with torch.no_grad():
200
219
  logits = self.model(torch.from_numpy(features).to(self.device))
201
220
  a = logits.numpy()
nkululeko/models/model_mlp.py CHANGED
@@ -1,25 +1,33 @@
1
1
  # model_mlp.py
2
+ import ast
3
+ from collections import OrderedDict
4
+
5
+ import numpy as np
2
6
  import pandas as pd
7
+ from sklearn.metrics import recall_score
8
+ import torch
3
9
 
4
- from nkululeko.utils.util import Util
5
10
  import nkululeko.glob_conf as glob_conf
11
+ from nkululeko.losses.loss_softf1loss import SoftF1Loss
6
12
  from nkululeko.models.model import Model
7
13
  from nkululeko.reporting.reporter import Reporter
8
- import torch
9
- import ast
10
- import numpy as np
11
- from sklearn.metrics import recall_score
12
- from collections import OrderedDict
13
- from nkululeko.losses.loss_softf1loss import SoftF1Loss
14
+ from nkululeko.utils.util import Util
14
15
 
15
16
 
16
- class MLP_model(Model):
17
+ class MLPModel(Model):
17
18
  """MLP = multi layer perceptron."""
18
19
 
19
20
  is_classifier = True
20
21
 
21
22
  def __init__(self, df_train, df_test, feats_train, feats_test):
22
- """Constructor taking the configuration and all dataframes."""
23
+ """Constructor, taking all dataframes.
24
+
25
+ Args:
26
+ df_train (pd.DataFrame): The train labels.
27
+ df_test (pd.DataFrame): The test labels.
28
+ feats_train (pd.DataFrame): The train features.
29
+ feats_test (pd.DataFrame): The test features.
30
+ """
23
31
  super().__init__(df_train, df_test, feats_train, feats_test)
24
32
  super().set_model_type("ann")
25
33
  self.name = "mlp"
@@ -97,7 +105,7 @@ class MLP_model(Model):
97
105
  self.optimizer.step()
98
106
  self.loss = (np.asarray(losses)).mean()
99
107
 
100
- def evaluate_model(self, model, loader, device):
108
+ def evaluate(self, model, loader, device):
101
109
  logits = torch.zeros(len(loader.dataset), self.class_num)
102
110
  targets = torch.zeros(len(loader.dataset))
103
111
  model.eval()
@@ -119,14 +127,28 @@ class MLP_model(Model):
119
127
  self.loss_eval = (np.asarray(losses)).mean()
120
128
  predictions = logits.argmax(dim=1)
121
129
  uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
122
- return uar, targets, predictions
130
+ return uar, targets, predictions, logits
131
+
132
+ def get_probas(self, logits):
133
+ # make a dataframe for probabilities (logits)
134
+ proba_d = {}
135
+ classes = self.df_test[self.target].unique()
136
+ classes.sort()
137
+ for c in classes:
138
+ proba_d[c] = []
139
+ for i, c in enumerate(classes):
140
+ proba_d[c] = list(logits.numpy().T[i])
141
+ probas = pd.DataFrame(proba_d)
142
+ probas = probas.set_index(self.df_test.index)
143
+ return probas
123
144
 
124
145
  def predict(self):
125
- _, truths, predictions = self.evaluate_model(
146
+ _, truths, predictions, logits = self.evaluate(
126
147
  self.model, self.testloader, self.device
127
148
  )
128
- uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
129
- report = Reporter(truths, predictions, self.run, self.epoch)
149
+ uar, _, _, _ = self.evaluate(self.model, self.trainloader, self.device)
150
+ probas = self.get_probas(logits)
151
+ report = Reporter(truths, predictions, self.run, self.epoch, probas=probas)
130
152
  try:
131
153
  report.result.loss = self.loss
132
154
  except AttributeError: # if the model was loaded from disk the loss is unknown
@@ -139,9 +161,7 @@ class MLP_model(Model):
139
161
  return report
140
162
 
141
163
  def get_predictions(self):
142
- _, truths, predictions = self.evaluate_model(
143
- self.model, self.testloader, self.device
144
- )
164
+ _, _, predictions, _ = self.evaluate(self.model, self.testloader, self.device)
145
165
  return predictions.numpy()
146
166
 
147
167
  def get_loader(self, df_x, df_y, shuffle):
nkululeko/models/model_mlp_regression.py CHANGED
@@ -97,7 +97,9 @@ class MLP_Reg_model(Model):
97
97
  self.model, self.testloader, self.device
98
98
  )
99
99
  result, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
100
- report = Reporter(truths.numpy(), predictions.numpy(), self.run, self.epoch)
100
+ report = Reporter(
101
+ truths.numpy(), predictions.numpy(), None, self.run, self.epoch
102
+ )
101
103
  try:
102
104
  report.result.loss = self.loss
103
105
  except AttributeError: # if the model was loaded from disk the loss is unknown
nkululeko/plots.py CHANGED
@@ -48,7 +48,7 @@ class Plots:
48
48
  )
49
49
  ax.set_ylabel(f"number of speakers")
50
50
  ax.set_xlabel("number of samples")
51
- self._save_plot(
51
+ self.save_plot(
52
52
  ax,
53
53
  "Samples per speaker",
54
54
  f"Samples per speaker ({df_speakers.shape[0]})",
@@ -70,9 +70,9 @@ class Plots:
70
70
  rot=0,
71
71
  )
72
72
  )
73
- ax.set_ylabel(f"number of speakers")
73
+ ax.set_ylabel("number of speakers")
74
74
  ax.set_xlabel("number of samples")
75
- self._save_plot(
75
+ self.save_plot(
76
76
  ax,
77
77
  "Sample value counts",
78
78
  f"Samples per speaker ({df_speakers.shape[0]})",
@@ -96,7 +96,7 @@ class Plots:
96
96
  binned_data = self.util.continuous_to_categorical(df[class_label])
97
97
  ax = binned_data.value_counts().plot(kind="bar")
98
98
  filename_binned = f"{class_label}_discreet"
99
- self._save_plot(
99
+ self.save_plot(
100
100
  ax,
101
101
  "Sample value counts",
102
102
  filename_binned,
@@ -106,7 +106,7 @@ class Plots:
106
106
  dist_type = self.util.config_val("EXPL", "dist_type", "hist")
107
107
  ax = df[class_label].plot(kind=dist_type)
108
108
 
109
- self._save_plot(
109
+ self.save_plot(
110
110
  ax,
111
111
  "Sample value counts",
112
112
  filename,
@@ -131,17 +131,17 @@ class Plots:
131
131
  df, class_label, att1, self.target, type_s
132
132
  )
133
133
  else:
134
- ax, caption = self._plotcatcont(
134
+ ax, caption = self.plotcatcont(
135
135
  df, class_label, att1, att1, type_s
136
136
  )
137
137
  else:
138
138
  if self.util.is_categorical(df[att1]):
139
- ax, caption = self._plotcatcont(
139
+ ax, caption = self.plotcatcont(
140
140
  df, att1, class_label, att1, type_s
141
141
  )
142
142
  else:
143
143
  ax, caption = self._plot2cont(df, class_label, att1, type_s)
144
- self._save_plot(
144
+ self.save_plot(
145
145
  ax,
146
146
  caption,
147
147
  f"Correlation of {self.target} and {att[0]}",
@@ -171,15 +171,11 @@ class Plots:
171
171
  ax, caption = self._plot2cat(df, att1, att2, att1, type_s)
172
172
  else:
173
173
  # class_label = cat, att1 = cat, att2 = cont
174
- ax, caption = self._plotcatcont(
175
- df, att1, att2, att1, type_s
176
- )
174
+ ax, caption = self.plotcatcont(df, att1, att2, att1, type_s)
177
175
  else:
178
176
  if self.util.is_categorical(df[att2]):
179
177
  # class_label = cat, att1 = cont, att2 = cat
180
- ax, caption = self._plotcatcont(
181
- df, att2, att1, att2, type_s
182
- )
178
+ ax, caption = self.plotcatcont(df, att2, att1, att2, type_s)
183
179
  else:
184
180
  # class_label = cat, att1 = cont, att2 = cont
185
181
  ax, caption = self._plot2cont_cat(
@@ -205,7 +201,7 @@ class Plots:
205
201
  # class_label = cont, att1 = cont, att2 = cont
206
202
  ax, caption = self._plot2cont(df, att1, att2, type_s)
207
203
 
208
- self._save_plot(
204
+ self.save_plot(
209
205
  ax, caption, f"Correlation of {att1} and {att2}", filename, type_s
210
206
  )
211
207
 
@@ -215,16 +211,16 @@ class Plots:
215
211
  f" {att} has more than 2 values. Perhaps you forgot to state a list of lists?"
216
212
  )
217
213
 
218
- def _save_plot(self, ax, caption, header, filename, type_s):
214
+ def save_plot(self, ax, caption, header, filename, type_s):
219
215
  # one up because of the runs
220
216
  fig_dir = self.util.get_path("fig_dir") + "../"
221
- fig = ax.figure
217
+ fig_plots = ax.figure
222
218
  # avoid warning
223
219
  # plt.tight_layout()
224
220
  img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
225
221
  plt.savefig(img_path)
226
- plt.close(fig)
227
- # fig.clear() # avoid error
222
+ plt.close(fig_plots)
223
+ self.util.debug(f"Saved plot to {img_path}")
228
224
  glob_conf.report.add_item(
229
225
  ReportItem(
230
226
  Header.HEADER_EXPLORE,
@@ -244,35 +240,29 @@ class Plots:
244
240
  return att, df
245
241
 
246
242
  def _plot2cont_cat(self, df, cont1, cont2, cat, ylab):
247
- """
248
- plot relation of two continuous distributions with one categorical
249
- """
243
+ """Plot relation of two continuous distributions with one categorical."""
250
244
  pearson = stats.pearsonr(df[cont1], df[cont2])
251
245
  # trunc to three digits
252
246
  pearson = int(pearson[0] * 1000) / 1000
253
247
  pearson_string = f"PCC: {pearson}"
254
248
  ax = sns.lmplot(data=df, x=cont1, y=cont2, hue=cat)
255
249
  caption = f"{ylab} {df.shape[0]}. {pearson_string}"
256
- ax.fig.suptitle(caption)
250
+ ax.figure.suptitle(caption)
257
251
  return ax, caption
258
252
 
259
253
  def _plot2cont(self, df, col1, col2, ylab):
260
- """
261
- plot relation of two continuous distributions
262
- """
254
+ """Plot relation of two continuous distributions."""
263
255
  pearson = stats.pearsonr(df[col1], df[col2])
264
256
  # trunc to three digits
265
257
  pearson = int(pearson[0] * 1000) / 1000
266
258
  pearson_string = f"PCC: {pearson}"
267
259
  ax = sns.lmplot(data=df, x=col1, y=col2)
268
260
  caption = f"{ylab} {df.shape[0]}. {pearson_string}"
269
- ax.fig.suptitle(caption)
261
+ ax.figure.suptitle(caption)
270
262
  return ax, caption
271
263
 
272
- def _plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
273
- """
274
- plot relation of categorical distribution with continuous
275
- """
264
+ def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
265
+ """Plot relation of categorical distribution with continuous."""
276
266
  dist_type = self.util.config_val("EXPL", "dist_type", "hist")
277
267
  cats, cat_str, es = su.get_effect_size(df, cat_col, cont_col)
278
268
  if dist_type == "hist":
@@ -287,13 +277,11 @@ class Plots:
287
277
  )
288
278
  ax.set(xlabel=f"{cont_col}")
289
279
  caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
290
- ax.fig.suptitle(caption)
280
+ ax.figure.suptitle(caption)
291
281
  return ax, caption
292
282
 
293
283
  def _plot2cat(self, df, col1, col2, xlab, ylab):
294
- """
295
- plot relation of 2 categorical distributions
296
- """
284
+ """Plot relation of 2 categorical distributions."""
297
285
  crosstab = pd.crosstab(index=df[col1], columns=df[col2])
298
286
  res_pval = stats.chi2_contingency(crosstab)
299
287
  res_pval = int(res_pval[1] * 1000) / 1000
@@ -320,8 +308,8 @@ class Plots:
320
308
  max = self.util.to_3_digits(df.duration.max())
321
309
  title = f"Duration distr. for {sample_selection} {df.shape[0]}. min={min}, max={max}"
322
310
  ax.set_title(title)
323
- ax.set_xlabel(f"duration")
324
- ax.set_ylabel(f"number of samples")
311
+ ax.set_xlabel("duration")
312
+ ax.set_ylabel("number of samples")
325
313
  fig = ax.figure
326
314
  # plt.tight_layout()
327
315
  img_path = f"{fig_dir}{filename}_{sample_selection}.{self.format}"
nkululeko/reporting/reporter.py CHANGED
@@ -2,16 +2,21 @@ import ast
2
2
  import glob
3
3
  import json
4
4
  import math
5
+ import os
5
6
 
6
7
  from confidence_intervals import evaluate_with_conf_int
7
8
  import matplotlib.pyplot as plt
8
9
  import numpy as np
10
+ from scipy.special import softmax
11
+ from scipy.stats import entropy
9
12
  from scipy.stats import pearsonr
10
- from sklearn.metrics import ConfusionMatrixDisplay, roc_curve
13
+ from sklearn.metrics import ConfusionMatrixDisplay
14
+ from sklearn.metrics import auc
11
15
  from sklearn.metrics import classification_report
12
16
  from sklearn.metrics import confusion_matrix
13
17
  from sklearn.metrics import r2_score
14
- from sklearn.metrics import roc_curve, auc, roc_auc_score
18
+ from sklearn.metrics import roc_auc_score
19
+ from sklearn.metrics import roc_curve
15
20
  from torch import is_tensor
16
21
 
17
22
  from audmetric import accuracy
@@ -21,6 +26,7 @@ from audmetric import mean_squared_error
21
26
  from audmetric import unweighted_average_recall
22
27
 
23
28
  import nkululeko.glob_conf as glob_conf
29
+ from nkululeko.plots import Plots
24
30
  from nkululeko.reporting.defines import Header
25
31
  from nkululeko.reporting.report_item import ReportItem
26
32
  from nkululeko.reporting.result import Result
@@ -46,9 +52,18 @@ class Reporter:
46
52
  self.MEASURE = "CCC"
47
53
  self.result.measure = self.MEASURE
48
54
 
49
- def __init__(self, truths, preds, run, epoch):
50
- """Initialization with ground truth und predictions vector."""
55
+ def __init__(self, truths, preds, run, epoch, probas=None):
56
+ """Initialization with ground truth und predictions vector.
57
+
58
+ Args:
59
+ truths (list): the ground truth
60
+ preds (list): the predictions
61
+ run (int): number of run
62
+ epoch (int): number of epoch
63
+ probas (pd.DataFrame, optional): probabilities per class. Defaults to None.
64
+ """
51
65
  self.util = Util("reporter")
66
+ self.probas = probas
52
67
  self.format = self.util.config_val("PLOT", "format", "png")
53
68
  self.truths = np.asarray(truths)
54
69
  self.preds = np.asarray(preds)
@@ -108,6 +123,47 @@ class Reporter:
108
123
  self.result.test = test_result
109
124
  self.result.set_upper_lower(upper, lower)
110
125
  # train and loss are being set by the model
126
+ # print out the class probabilities
127
+
128
+ def print_probabilities(self):
129
+ """Print the probabilities per class to a file in the store."""
130
+ if (
131
+ self.util.exp_is_classification()
132
+ and self.probas is not None
133
+ and "uncertainty" not in self.probas
134
+ ):
135
+ probas = self.probas
136
+ probas["predicted"] = self.preds
137
+ probas["truth"] = self.truths
138
+ # softmax the probabilities or logits
139
+ uncertainty = probas.apply(softmax, axis=1)
140
+ try:
141
+ le = glob_conf.label_encoder
142
+ mapping = dict(zip(le.classes_, range(len(le.classes_))))
143
+ mapping_reverse = {value: key for key, value in mapping.items()}
144
+ probas = probas.rename(columns=mapping_reverse)
145
+ probas["predicted"] = probas["predicted"].map(mapping_reverse)
146
+ probas["truth"] = probas["truth"].map(mapping_reverse)
147
+ except AttributeError as ae:
148
+ self.util.debug(f"Can't label categories: {ae}")
149
+ # compute entropy per sample
150
+ uncertainty = uncertainty.apply(entropy)
151
+ # scale it to 0-1
152
+ max_ent = math.log(len(glob_conf.labels))
153
+ uncertainty = (uncertainty - uncertainty.min()) / (
154
+ max_ent - uncertainty.min()
155
+ )
156
+ probas["uncertainty"] = uncertainty
157
+ probas["correct"] = probas.predicted == probas.truth
158
+ sp = os.path.join(self.util.get_path("store"), "pred_df.csv")
159
+ self.probas = probas
160
+ probas.to_csv(sp)
161
+ self.util.debug(f"Saved probabilities to {sp}")
162
+ plots = Plots()
163
+ ax, caption = plots.plotcatcont(
164
+ probas, "correct", "uncertainty", "uncertainty", "correct"
165
+ )
166
+ plots.save_plot(ax, caption, "Uncertainty", "uncertainty", "samples")
111
167
 
112
168
  def set_id(self, run, epoch):
113
169
  """Make the report identifiable with run and epoch index."""
@@ -123,6 +179,12 @@ class Reporter:
123
179
  self.preds = np.digitize(self.preds, bins) - 1
124
180
 
125
181
  def plot_confmatrix(self, plot_name, epoch=None):
182
+ """Plot a confusionmatrix to the store.
183
+
184
+ Args:
185
+ plot_name (str): name for the image file.
186
+ epoch (int, optional): Number of epoch. Defaults to None.
187
+ """
126
188
  if not self.util.exp_is_classification():
127
189
  self.continuous_to_categorical()
128
190
  self._plot_confmat(self.truths, self.preds, plot_name, epoch)
@@ -212,10 +274,11 @@ class Reporter:
212
274
  )
213
275
  img_path = f"{fig_dir}{plot_name}{self.filenameadd}.{self.format}"
214
276
  plt.savefig(img_path)
277
+ self.util.debug(f"Saved confusion plot to {img_path}")
215
278
  fig.clear()
216
279
  plt.close(fig)
217
- plt.savefig(img_path)
218
- plt.close(fig)
280
+ plt.close()
281
+ plt.clf()
219
282
  glob_conf.report.add_item(
220
283
  ReportItem(
221
284
  Header.HEADER_RESULTS,
nkululeko/runmanager.py CHANGED
@@ -11,7 +11,7 @@ from nkululeko.utils.util import Util
11
11
 
12
12
 
13
13
  class Runmanager:
14
- """Class to manage the runs of the experiment (e.g. when results differ caused by random initialization)"""
14
+ """Class to manage the runs of the experiment (e.g. when results differ caused by random initialization)."""
15
15
 
16
16
  model = None # The underlying model
17
17
  df_train, df_test, feats_train, feats_test = (
@@ -23,15 +23,14 @@ class Runmanager:
23
23
  reports = []
24
24
 
25
25
  def __init__(self, df_train, df_test, feats_train, feats_test):
26
- """Constructor setting up the dataframes
26
+ """Constructor setting up the dataframes.
27
+
27
28
  Args:
28
29
  df_train: train dataframe
29
30
  df_test: test dataframe
30
31
  feats_train: train features
31
32
  feats_train: test features
32
33
 
33
- Returns:
34
-
35
34
  """
36
35
  self.df_train, self.df_test, self.feats_train, self.feats_test = (
37
36
  df_train,
@@ -46,7 +45,7 @@ class Runmanager:
46
45
  # self._select_model(model_type)
47
46
 
48
47
  def do_runs(self):
49
- """Start the runs"""
48
+ """Start the runs."""
50
49
  self.best_results = [] # keep the best result per run
51
50
  self.last_epochs = [] # keep the epoch of best result per run
52
51
  # for all runs
@@ -105,15 +104,13 @@ class Runmanager:
105
104
  )
106
105
  self.print_model(best_report, plot_name)
107
106
  # finally, print out the numbers for this run
108
- # self.reports[-1].print_results(
109
- # int(self.util.config_val("EXP", "epochs", 1))
110
- # )
111
107
  best_report.print_results(best_report.epoch)
108
+ best_report.print_probabilities()
112
109
  self.best_results.append(best_report)
113
110
  self.last_epochs.append(last_epoch)
114
111
 
115
112
  def print_best_result_runs(self):
116
- """Print the best result for all runs"""
113
+ """Print the best result for all runs."""
117
114
  best_report = self.get_best_result(self.best_results)
118
115
  self.util.debug(
119
116
  f"best result all runs with run {best_report.run} and"
@@ -177,7 +174,7 @@ class Runmanager:
177
174
  return self.load_model(best_report)
178
175
 
179
176
  def get_best_result(self, reports):
180
- best_r = Reporter([], [], 0, 0)
177
+ best_r = Reporter([], [], None, 0, 0)
181
178
  if self.util.high_is_good():
182
179
  best_r = self.search_best_result(reports, "ascending")
183
180
  else:
@@ -185,7 +182,7 @@ class Runmanager:
185
182
  return best_r
186
183
 
187
184
  def search_best_result(self, reports, order):
188
- best_r = Reporter([], [], 0, 0)
185
+ best_r = Reporter([], [], None, 0, 0)
189
186
  if order == "ascending":
190
187
  best_result = 0
191
188
  for r in reports:
nkululeko/test_predictor.py CHANGED
@@ -56,18 +56,13 @@ class TestPredictor:
56
56
  else:
57
57
  test_dbs = ast.literal_eval(glob_conf.config["DATA"]["tests"])
58
58
  test_dbs_string = "_".join(test_dbs)
59
- predictions = self.model.get_predictions()
59
+ predictions, _ = self.model.get_predictions()
60
60
  report = self.model.predict()
61
61
  result = report.result.get_result()
62
62
  report.set_filename_add(f"test-{test_dbs_string}")
63
63
  self.util.print_best_results([report])
64
64
  report.plot_confmatrix(self.util.get_plot_name(), 0)
65
65
  report.print_results(0)
66
- # print(predictions)
67
- # df = pd.DataFrame(index=self.orig_df.index)
68
- # df["speaker"] = self.orig_df["speaker"]
69
- # df["gender"] = self.orig_df["gender"]
70
- # df[self.target] = self.orig_df[self.target]
71
66
  df = self.orig_df.copy()
72
67
  df["predictions"] = self.label_encoder.inverse_transform(predictions)
73
68
  target = self.util.config_val("DATA", "target", "emotion")
nkululeko/utils/stats.py CHANGED
@@ -70,12 +70,16 @@ def get_effect_size(df, target, variable):
70
70
  cats[c] = df[df[target] == c][variable].values
71
71
  combos = all_combinations(categories)
72
72
  results = {}
73
- for combo in combos:
74
- one = combo[0]
75
- other = combo[1]
76
- results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
77
- max_cat = max(results, key=results.get)
78
- cat_s = cohens_D_to_string(float(results[max_cat]))
73
+ if len(categories) == 1:
74
+ cat_s = cohens_D_to_string(0)
75
+ return categories[0], cat_s, 0
76
+ else:
77
+ for combo in combos:
78
+ one = combo[0]
79
+ other = combo[1]
80
+ results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
81
+ max_cat = max(results, key=results.get)
82
+ cat_s = cohens_D_to_string(float(results[max_cat]))
79
83
  return max_cat, cat_s, results[max_cat]
80
84
 
81
85
 
@@ -92,7 +96,7 @@ def cohens_D_to_string(val):
92
96
 
93
97
 
94
98
  def normalize(values):
95
- """Do a z-transformation of a distribution.
99
+ """Do a z-transformation of a distribution.
96
100
 
97
101
  So that mean = 0 and variance = 1
98
102
  """
nkululeko/utils/util.py CHANGED
@@ -5,15 +5,15 @@ import os.path
5
5
  import pickle
6
6
  import sys
7
7
 
8
- import numpy as np
9
- import pandas as pd
10
-
11
8
  import audeer
12
9
  import audformat
10
+ import numpy as np
11
+ import pandas as pd
13
12
 
14
13
 
15
14
  class Util:
16
- # a list of words that need not to be warned upon if default values are used
15
+ # a list of words that need not to be warned upon if default values are
16
+ # used
17
17
  stopvals = [
18
18
  "all",
19
19
  False,
@@ -40,7 +40,8 @@ class Util:
40
40
  self.got_data_roots = self.config_val(
41
41
  "DATA", "root_folders", False)
42
42
  if self.got_data_roots:
43
- # if there is a global data rootfolder file, read from there
43
+ # if there is a global data rootfolder file, read from
44
+ # there
44
45
  if not os.path.isfile(self.got_data_roots):
45
46
  self.error(f"no such file: {self.got_data_roots}")
46
47
  self.data_roots = configparser.ConfigParser()
@@ -107,16 +108,17 @@ class Util:
107
108
  if self.got_data_roots:
108
109
  try:
109
110
  if len(key) > 0:
110
- return self.data_roots["DATA"][dataset + "." + key].strip("'\"")
111
+ return self.data_roots["DATA"][dataset +
112
+ "." + key].strip("'\"")
111
113
  else:
112
114
  return self.data_roots["DATA"][dataset].strip("'\"")
113
115
  except KeyError:
114
- if not default in self.stopvals:
116
+ if default not in self.stopvals:
115
117
  self.debug(
116
- f"value for {key} not found, using default:" f" {default}"
117
- )
118
+ f"value for {key} not found, using default:"
119
+ f" {default}")
118
120
  return default
119
- if not default in self.stopvals:
121
+ if default not in self.stopvals:
120
122
  self.debug(
121
123
  f"value for {key} not found, using default: {default}")
122
124
  return default
@@ -182,7 +184,7 @@ class Util:
182
184
 
183
185
  def get_feattype_name(self):
184
186
  """
185
- Get a string as name from all feature sets that are useed
187
+ Get a string as name from all feature sets that are used
186
188
  """
187
189
  return "_".join(ast.literal_eval(self.config["FEATS"]["type"]))
188
190
 
@@ -205,7 +207,12 @@ class Util:
205
207
  def get_model_description(self):
206
208
  mt = ""
207
209
  mt = f'{self.config["MODEL"]["type"]}'
208
- ft = "_".join(ast.literal_eval(self.config["FEATS"]["type"]))
210
+ # ft = "_".join(ast.literal_eval(self.config["FEATS"]["type"]))
211
+ ft_value = self.config["FEATS"]["type"]
212
+ if isinstance(ft_value, str) and ft_value.startswith("[") and ft_value.endswith("]"):
213
+ ft = "_".join(ast.literal_eval(ft_value))
214
+ else:
215
+ ft = ft_value
209
216
  ft += "_"
210
217
  layer_string = ""
211
218
  layer_s = self.config_val("MODEL", "layers", False)
@@ -230,9 +237,8 @@ class Util:
230
237
  ["FEATS", "wav2vec2.layer"],
231
238
  ]
232
239
  for option in options:
233
- return_string += self._get_value_descript(option[0], option[1]).replace(
234
- ".", "-"
235
- )
240
+ return_string += self._get_value_descript(
241
+ option[0], option[1]).replace(".", "-")
236
242
  return return_string
237
243
 
238
244
  def get_plot_name(self):
@@ -286,7 +292,7 @@ class Util:
286
292
  try:
287
293
  return ast.literal_eval(self.config[section][key])
288
294
  except KeyError:
289
- if not default in self.stopvals:
295
+ if default not in self.stopvals:
290
296
  self.debug(
291
297
  f"value for {key} not found, using default: {default}")
292
298
  return default
{nkululeko-0.86.7.dist-info → nkululeko-0.87.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nkululeko
3
- Version: 0.86.7
3
+ Version: 0.87.0
4
4
  Summary: Machine learning audio prediction experiments based on templates
5
5
  Home-page: https://github.com/felixbur/nkululeko
6
6
  Author: Felix Burkhardt
@@ -51,6 +51,7 @@ Requires-Dist: pylatex
51
51
  - [t-SNE plots](#t-sne-plots)
52
52
  - [Data distribution](#data-distribution)
53
53
  - [Bias checking](#bias-checking)
54
+ - [Uncertainty](#uncertainty)
54
55
  - [Documentation](#documentation)
55
56
  - [Installation](#installation)
56
57
  - [Usage](#usage)
@@ -113,6 +114,13 @@ In cases you might wonder if there's bias in your data. You can try to detect th
113
114
 
114
115
  <img src="meta/images/emotion-pesq.png" width="500px"/>
115
116
 
117
+ ### Uncertainty
118
+ Nkululeko estimates the uncertainty of model decisions (classifiers only) as the entropy over the class probabilities or logits per sample.
119
+
120
+ <img src="meta/images/uncertainty.png" width="500px"/>
121
+
122
+
123
+
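For illustration, here is a minimal sketch of such an entropy-based estimate. The function name and the flat `ent / max_ent` normalization are illustrative, not nkululeko's API; the reporter in this release additionally shifts by the minimum observed entropy before scaling, and writes the per-sample table (truth, prediction, uncertainty, correctness) to `pred_df.csv` in the experiment store.

```python
import numpy as np
from scipy.special import softmax


def sample_uncertainty(logits: np.ndarray) -> np.ndarray:
    """Per-sample uncertainty as normalized entropy over class probabilities.

    logits: array of shape (n_samples, n_classes) with raw classifier outputs.
    Returns values in [0, 1]: 0 = fully confident, 1 = maximally uncertain.
    """
    probs = softmax(logits, axis=1)             # logits -> class probabilities
    ent = -(probs * np.log(probs)).sum(axis=1)  # Shannon entropy per sample
    max_ent = np.log(logits.shape[1])           # entropy of the uniform distribution
    return ent / max_ent                        # scale to the 0-1 range


# A confident prediction scores near 0, an undecided one near 1:
print(sample_uncertainty(np.array([[5.0, 0.1, 0.1], [1.0, 1.0, 1.0]])))
```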
116
124
  ## Documentation
117
125
  The documentation, along with extensions of installation, usage, INI file format, and examples, can be found [nkululeko.readthedocs.io](https://nkululeko.readthedocs.io).
118
126
 
@@ -343,6 +351,14 @@ F. Burkhardt, Johannes Wagner, Hagen Wierstorf, Florian Eyben and Björn Schulle
343
351
  Changelog
344
352
  =========
345
353
 
354
+ Version 0.87.0
355
+ --------------
356
+ * added class probability output and uncertainty analysis
357
+
358
+ Version 0.86.8
359
+ --------------
360
+ * handle single feature sets as strings in the config
361
+
346
362
  Version 0.86.7
347
363
  --------------
348
364
  * handles now audformat tables where the target is in a file index
{nkululeko-0.86.7.dist-info → nkululeko-0.87.0.dist-info}/RECORD RENAMED
@@ -2,30 +2,30 @@ nkululeko/__init__.py,sha256=62f8HiEzJ8rG2QlTFJXUCMpvuH3fKI33DoJSj33mscc,63
2
2
  nkululeko/aug_train.py,sha256=YhuZnS_WVWnun9G-M6g5n6rbRxoVREz6Zh7k6qprFNQ,3194
3
3
  nkululeko/augment.py,sha256=4MG0apTAG5RgkuJrYEjGgDdbodZWi_HweSPNI1JJ5QA,3051
4
4
  nkululeko/cacheddataset.py,sha256=lIJ6hUo5LoxSrzXtWV8mzwO7wRtUETWnOQ4ws2XfL1E,969
5
- nkululeko/constants.py,sha256=CscqJhC7nceHk2wmZd2bBFSeFExtr0HkXt99qpAZU4E,39
6
- nkululeko/demo.py,sha256=WSKr-W5uJ9DQfemK923g7Hd5V3kgAn03Er0JX1Pa45I,5142
5
+ nkululeko/constants.py,sha256=qVowcvAZL-g-Bsp_4yBCOQDkCoW-S-1wrRG5XgnjnX0,39
6
+ nkululeko/demo.py,sha256=Sqbu3o6Pzdr_UlYxWM8Mn3l5uCXsw429yJbtkVDUYHU,5087
7
7
  nkululeko/demo_feats.py,sha256=sAeGFojhEj9WEDFtG3SzPBmyYJWLF2rkbpp65m8Ujo4,2025
8
8
  nkululeko/demo_predictor.py,sha256=es56xbT8ifkS_vnrlb5NTZT54gNmeUtNlA4zVA_gnN8,4757
9
- nkululeko/experiment.py,sha256=5nF-eDf8OCp6KRIU7KnryWL5SLJQUtr2BueHhEdcKw0,31040
9
+ nkululeko/experiment.py,sha256=s9PIjm45dR9yzmHu_69JpBjX9qMVzi5wIgPfMR3F44A,31530
10
10
  nkululeko/explore.py,sha256=lDzRoW_Taa5u4BBABZLD89BcQWnYlrftJR4jgt1yyj0,2609
11
11
  nkululeko/export.py,sha256=mHeEAAmtZuxdyebLlbSzPrHSi9OMgJHbk35d3DTxRBc,4632
12
- nkululeko/feature_extractor.py,sha256=8mssYKmo4LclVI-hiLmJEDZ0ZPyDavFG2YwtXcrGzwM,3976
12
+ nkululeko/feature_extractor.py,sha256=rL-TybLmjZz5uxT9LNTORaDat9FKp_1qloxbyMrinyE,4141
13
13
  nkululeko/file_checker.py,sha256=LoLnL8aHpW-axMQ46qbqrManTs5otG9ShpEZuz9iRSk,3474
14
14
  nkululeko/filter_data.py,sha256=w-X2mhKdYr5DxDIz50E5yzO6Jmzk4jjDBoXsgOOVtcA,7222
15
15
  nkululeko/glob_conf.py,sha256=KL9YJQTHvTztxo1vr25qRRgaPnx4NTg0XrdbovKGMmw,525
16
- nkululeko/modelrunner.py,sha256=OU35qwP94GxW_EtL4I2-RhqB-wxbjNvp8CIHNbtnt7Q,11155
16
+ nkululeko/modelrunner.py,sha256=rpWQRXERiDZ-i_7CwsqynI87vawtsaPihsonDMPe9PU,11151
17
17
  nkululeko/multidb.py,sha256=fG3VukEWP1vreVN4gB1IRXxwwg4jLftsSEYtu0o1f78,5634
18
18
  nkululeko/nkuluflag.py,sha256=PGWSmZz-PiiHLgcZJAoGOI_Y-sZDVI1ksB8p5r7riWM,3725
19
19
  nkululeko/nkululeko.py,sha256=Kn3s2E3yyH8cJ7z6lkMxrnqtCxTu7-qfe9Zr_ONTD5g,1968
20
- nkululeko/plots.py,sha256=C2mwQFK0Vxfl5ZM7CO87tULDoEf7G16ek0nU77bhOc4,23070
20
+ nkululeko/plots.py,sha256=WsI_dtPKfrYPsKymHRmIhqj33aZzTcE8fF_EwLkm_5A,22899
21
21
  nkululeko/predict.py,sha256=sF091sSSLnEWcISx9ZcULLie3tY5XeFsQJd6b3vrxFg,2409
22
22
  nkululeko/resample.py,sha256=2d9eao_0sLrGZ_KSl8OVKsPor3BkFrlmMhrpB9WelIs,4267
23
- nkululeko/runmanager.py,sha256=Na8oPn59lRFiNMsYChRHBRgw40mBcw0Rwl2Kz1RUsA0,7614
23
+ nkululeko/runmanager.py,sha256=eRMJidkoJhkU5NdIKoozv3vovU-8tqfn-7zqr2JZcnE,7533
24
24
  nkululeko/scaler.py,sha256=4nkIqoajkIkuTPK0Z02ifMN_awl6fP_i-GBYdoGYgGM,4101
25
25
  nkululeko/segment.py,sha256=YLKckX44tbvTb3LrdgYw9X4guzuF27sutl92z9DkpZU,4835
26
26
  nkululeko/syllable_nuclei.py,sha256=Sky-C__MeUDaxqHnDl2TGLLYOYvsahD35TUjWGeG31k,10047
27
27
  nkululeko/test.py,sha256=1w624vo5KTzmFC8BUStGlLDmIEAFuJUz7J0W-gp7AxI,1677
28
- nkululeko/test_predictor.py,sha256=_w5J8CxH6hmW3mLTKbdfmywl5QpdNAnW1Y8TE5GtlfE,3237
28
+ nkululeko/test_predictor.py,sha256=KaGef_r4mXW89f0aUiYDw8IiBe2ciGt14HNkR-S14lU,2985
29
29
  nkululeko/test_pretrain.py,sha256=ZWl-bR6nmeSmXkGAIE6zyfQEjN8Zg0rIxfaS-O6Zbas,8465
30
30
  nkululeko/augmenting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  nkululeko/augmenting/augmenter.py,sha256=XAt0dpmlnKxqyysqCgV3rcz-pRIvOz7rU7dmGDCVAzs,2905
@@ -46,7 +46,7 @@ nkululeko/autopredict/ap_valence.py,sha256=n-hctRKySzhmJtowuMOTUu0T_ld3uK5pnfOzW
46
46
  nkululeko/autopredict/estimate_snr.py,sha256=S-bpS0xFkwWc4Ch75UrjbS8y538lQ0U3g_iLRFXureY,5048
47
47
  nkululeko/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
48
  nkululeko/data/dataset.py,sha256=hUD0NqWCfRaSHG8JNs1MsPb0zjUZAf8FJkg_c0ebq0Q,28046
49
- nkululeko/data/dataset_csv.py,sha256=dzOrbKB8t0UATAIYaKAOqHTogmYPBqskt6Hak7VjbSM,4537
49
+ nkululeko/data/dataset_csv.py,sha256=UGEpi__eT2KFS6Fop6N4HkMrzO-u5VP71gt44kwZavo,4588
50
50
  nkululeko/feat_extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  nkululeko/feat_extract/feats_agender.py,sha256=Qm69G4kqAyTVVk7wwRgrXlNwGaDMGRYyKGpuf0vOEgM,3113
52
52
  nkululeko/feat_extract/feats_agender_agender.py,sha256=tgH2BnwcxpvuLmOkrMbVdBSX0Onfz2MG12FsddalRKI,3424
@@ -75,15 +75,15 @@ nkululeko/losses/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
75
75
  nkululeko/losses/loss_ccc.py,sha256=NOK0y0fxKUnU161B5geap6Fmn8QzoPl2MqtPiV8IuJE,976
76
76
  nkululeko/losses/loss_softf1loss.py,sha256=5gW-PuiqeAZcRgfwjueIOQtMokOjZWgQnVIv59HKTCo,1309
77
77
  nkululeko/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
- nkululeko/models/model.py,sha256=PUCqF2r_dEfmFsZn6Cgr1UIzYvxziLH6nSqZ5-vuN1o,11639
78
+ nkululeko/models/model.py,sha256=JXrd0fbU0JhTxUDrs0kOEHF9rtPJBxBeO6zcrHAzk8k,12475
79
79
  nkululeko/models/model_bayes.py,sha256=WJFZ8wFKwWATz6MhmjeZIi1Pal1viU549WL_PjXDSy8,406
80
- nkululeko/models/model_cnn.py,sha256=bJxqwe6FnVR2hFeqN6EXexYGgvKYFED1VOhBXVlLWaE,9954
80
+ nkululeko/models/model_cnn.py,sha256=NreR2LrKMyBYHyIJEL6wm3UQ4mA5HleZfpUyA5wNYpA,10629
81
81
  nkululeko/models/model_gmm.py,sha256=hZ9UO36KNf48qa3J-xkWIicIj9-TApmt21zNES2vEOs,649
82
82
  nkululeko/models/model_knn.py,sha256=KlnrJfwiVnmXZrAaYGFrKA2f5sznvTzSJQ8-5etOP0k,599
83
83
  nkululeko/models/model_knn_reg.py,sha256=j7YFfVm6xOR2d9yBYdQiwwqYfqkX0JynX_qLCvkr1fk,610
84
84
  nkululeko/models/model_lin_reg.py,sha256=0D7mSnSwK82lNWDMwHYRyq3FmGa6y-DHDGg4qUe85q4,422
85
- nkululeko/models/model_mlp.py,sha256=xMirtYax3bLBz_0kkC0M4Rc6-KQY05NNKHQGw7rbum8,9856
86
- nkululeko/models/model_mlp_regression.py,sha256=PO5qyfjgAJH8hawhmeXDaUThyXDYdM642dQHkO0NY7c,10204
85
+ nkululeko/models/model_mlp.py,sha256=VE0CI19qMyRbI-THDkMeJ7JbWf4z7CmZ4MMs1FIQgtM,10557
86
+ nkululeko/models/model_mlp_regression.py,sha256=7oK2zQhhCegSqiBUe6eU7Av8MJ_DPLA9skixJcHaVfg,10232
87
87
  nkululeko/models/model_svm.py,sha256=rsME3KvKvNG7bdE5lbvYUu85WZhaASZxxmdNDIVJRZ4,940
88
88
  nkululeko/models/model_svr.py,sha256=_YZeksqB3eBENGlg3g9RwYFlk9rQQ-XCeNBKLlGGVoE,725
89
89
  nkululeko/models/model_tree.py,sha256=rf16faUm4o2LJgkoYpeY998b8DQIvXZ73_m1IS3TnnE,417
@@ -96,17 +96,17 @@ nkululeko/reporting/defines.py,sha256=IsY1YgKRMaABpylVKjBJgJ5bNCEbGCVA_E6pivraqS
96
96
  nkululeko/reporting/latex_writer.py,sha256=qiCRSmB4KOD_za4oHu5x-PhwjZohzfo8wecMOwlXZwc,1886
97
97
  nkululeko/reporting/report.py,sha256=W0rcigDdjBvxZQ3pZja_gvToILYvaZ1BFtnN2qFRfYI,1060
98
98
  nkululeko/reporting/report_item.py,sha256=siWeGNgo4bAE46YBMNcsdf3jTMTy76BO9Fi6DTvDig4,533
99
- nkululeko/reporting/reporter.py,sha256=S9A62AxdMTEV-9XDUQNxdoevGLXBP52WiDmZ694QMV4,14161
99
+ nkululeko/reporting/reporter.py,sha256=6zW3PmQrwVJO5orBVA-fiaIhnzGrFymC861DSd8nSjc,16806
100
100
  nkululeko/reporting/result.py,sha256=nSN5or-Py2GPRWHkWpGRh7UCi1W0er7WLEHz8fYLk-A,742
101
101
  nkululeko/segmenting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
102
102
  nkululeko/segmenting/seg_inaspeechsegmenter.py,sha256=pmLHuXsaqvcdYxB4PSW9l1mbQWZZBJFhi_CGabqydas,1947
103
103
  nkululeko/segmenting/seg_silero.py,sha256=lLytS38KzARS17omwv8VBw-zz60RVSXGSvZ5EvWlcWQ,3301
104
104
  nkululeko/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
105
  nkululeko/utils/files.py,sha256=UiGAtZRWYjHSvlmPaTMtzyNNGE6qaLaxQkybctS7iRM,4021
106
- nkululeko/utils/stats.py,sha256=1yUq0FTOyqkU8TwUocJRYdJaqMU5SlOBBRUun9STo2M,2829
107
- nkululeko/utils/util.py,sha256=ILpfNuaeq-hy1bUkRhVrzO2wG9z9Upaozs9EBoIaMG0,14123
108
- nkululeko-0.86.7.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
109
- nkululeko-0.86.7.dist-info/METADATA,sha256=t5cI43YRp3qmyJj03ACfgCbKoAuLYImDCLS1QkYbMQM,38024
110
- nkululeko-0.86.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
111
- nkululeko-0.86.7.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
112
- nkululeko-0.86.7.dist-info/RECORD,,
106
+ nkululeko/utils/stats.py,sha256=eC9dMO-by6CDnGLHDBQu-2B4-BudZNJ0nnWGhKYdUMA,2968
107
+ nkululeko/utils/util.py,sha256=ZCS02mE2c3_h9_q4hpsSm4XAooCranqRF_5pY-6055E,14432
108
+ nkululeko-0.87.0.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
109
+ nkululeko-0.87.0.dist-info/METADATA,sha256=DPO61pORcuEhRsDwB5S5VJ8CK_piJeh-I5kKJc8eNJE,38442
110
+ nkululeko-0.87.0.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
111
+ nkululeko-0.87.0.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
112
+ nkululeko-0.87.0.dist-info/RECORD,,
{nkululeko-0.86.7.dist-info → nkululeko-0.87.0.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (70.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5