nkululeko 0.86.8__py3-none-any.whl → 0.88.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
nkululeko/models/model_mlp.py CHANGED
@@ -1,25 +1,33 @@
 # model_mlp.py
+import ast
+from collections import OrderedDict
+
+import numpy as np
 import pandas as pd
+from sklearn.metrics import recall_score
+import torch
 
-from nkululeko.utils.util import Util
 import nkululeko.glob_conf as glob_conf
+from nkululeko.losses.loss_softf1loss import SoftF1Loss
 from nkululeko.models.model import Model
 from nkululeko.reporting.reporter import Reporter
-import torch
-import ast
-import numpy as np
-from sklearn.metrics import recall_score
-from collections import OrderedDict
-from nkululeko.losses.loss_softf1loss import SoftF1Loss
+from nkululeko.utils.util import Util
 
 
-class MLP_model(Model):
+class MLPModel(Model):
     """MLP = multi layer perceptron."""
 
     is_classifier = True
 
     def __init__(self, df_train, df_test, feats_train, feats_test):
-        """Constructor taking the configuration and all dataframes."""
+        """Constructor, taking all dataframes.
+
+        Args:
+            df_train (pd.DataFrame): The train labels.
+            df_test (pd.DataFrame): The test labels.
+            feats_train (pd.DataFrame): The train features.
+            feats_test (pd.DataFrame): The test features.
+        """
         super().__init__(df_train, df_test, feats_train, feats_test)
         super().set_model_type("ann")
         self.name = "mlp"
@@ -97,7 +105,7 @@ class MLP_model(Model):
             self.optimizer.step()
         self.loss = (np.asarray(losses)).mean()
 
-    def evaluate_model(self, model, loader, device):
+    def evaluate(self, model, loader, device):
        logits = torch.zeros(len(loader.dataset), self.class_num)
        targets = torch.zeros(len(loader.dataset))
        model.eval()
@@ -119,14 +127,28 @@ class MLP_model(Model):
         self.loss_eval = (np.asarray(losses)).mean()
         predictions = logits.argmax(dim=1)
         uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
-        return uar, targets, predictions
+        return uar, targets, predictions, logits
+
+    def get_probas(self, logits):
+        # make a dataframe for probabilities (logits)
+        proba_d = {}
+        classes = self.df_test[self.target].unique()
+        classes.sort()
+        for c in classes:
+            proba_d[c] = []
+        for i, c in enumerate(classes):
+            proba_d[c] = list(logits.numpy().T[i])
+        probas = pd.DataFrame(proba_d)
+        probas = probas.set_index(self.df_test.index)
+        return probas
 
     def predict(self):
-        _, truths, predictions = self.evaluate_model(
+        _, truths, predictions, logits = self.evaluate(
             self.model, self.testloader, self.device
         )
-        uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
-        report = Reporter(truths, predictions, self.run, self.epoch)
+        uar, _, _, _ = self.evaluate(self.model, self.trainloader, self.device)
+        probas = self.get_probas(logits)
+        report = Reporter(truths, predictions, self.run, self.epoch, probas=probas)
         try:
             report.result.loss = self.loss
         except AttributeError:  # if the model was loaded from disk the loss is unknown
@@ -139,9 +161,7 @@ class MLP_model(Model):
         return report
 
     def get_predictions(self):
-        _, truths, predictions = self.evaluate_model(
-            self.model, self.testloader, self.device
-        )
+        _, _, predictions, _ = self.evaluate(self.model, self.testloader, self.device)
         return predictions.numpy()
 
     def get_loader(self, df_x, df_y, shuffle):
@@ -97,7 +97,9 @@ class MLP_Reg_model(Model):
             self.model, self.testloader, self.device
         )
         result, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
-        report = Reporter(truths.numpy(), predictions.numpy(), self.run, self.epoch)
+        report = Reporter(
+            truths.numpy(), predictions.numpy(), None, self.run, self.epoch
+        )
         try:
             report.result.loss = self.loss
         except AttributeError:  # if the model was loaded from disk the loss is unknown
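
The renamed evaluate method now also returns the raw logits, which get_probas reshapes into a per-class DataFrame aligned with the test index. A minimal standalone sketch of that reshaping; the class names, logit values, and index labels here are made up for illustration:

    import pandas as pd
    import torch

    # hypothetical logits for 3 samples and 2 classes
    logits = torch.tensor([[0.2, 1.1], [2.0, -0.5], [0.0, 0.3]])
    classes = ["neg", "pos"]

    # one column per class, one row per test sample, as in get_probas
    probas = pd.DataFrame(
        {c: logits.numpy().T[i] for i, c in enumerate(classes)},
        index=["s1", "s2", "s3"],  # stands in for df_test.index
    )
    print(probas)
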
nkululeko/plots.py CHANGED
@@ -48,7 +48,7 @@ class Plots:
         )
         ax.set_ylabel(f"number of speakers")
         ax.set_xlabel("number of samples")
-        self._save_plot(
+        self.save_plot(
             ax,
             "Samples per speaker",
             f"Samples per speaker ({df_speakers.shape[0]})",
@@ -70,9 +70,9 @@ class Plots:
                 rot=0,
             )
         )
-        ax.set_ylabel(f"number of speakers")
+        ax.set_ylabel("number of speakers")
         ax.set_xlabel("number of samples")
-        self._save_plot(
+        self.save_plot(
             ax,
             "Sample value counts",
             f"Samples per speaker ({df_speakers.shape[0]})",
@@ -96,7 +96,7 @@ class Plots:
             binned_data = self.util.continuous_to_categorical(df[class_label])
             ax = binned_data.value_counts().plot(kind="bar")
             filename_binned = f"{class_label}_discreet"
-            self._save_plot(
+            self.save_plot(
                 ax,
                 "Sample value counts",
                 filename_binned,
@@ -106,7 +106,7 @@ class Plots:
         dist_type = self.util.config_val("EXPL", "dist_type", "hist")
         ax = df[class_label].plot(kind=dist_type)
 
-        self._save_plot(
+        self.save_plot(
             ax,
             "Sample value counts",
             filename,
@@ -131,17 +131,17 @@ class Plots:
                     df, class_label, att1, self.target, type_s
                 )
             else:
-                ax, caption = self._plotcatcont(
+                ax, caption = self.plotcatcont(
                     df, class_label, att1, att1, type_s
                 )
         else:
             if self.util.is_categorical(df[att1]):
-                ax, caption = self._plotcatcont(
+                ax, caption = self.plotcatcont(
                     df, att1, class_label, att1, type_s
                 )
             else:
                 ax, caption = self._plot2cont(df, class_label, att1, type_s)
-        self._save_plot(
+        self.save_plot(
             ax,
             caption,
             f"Correlation of {self.target} and {att[0]}",
@@ -171,15 +171,11 @@ class Plots:
                 ax, caption = self._plot2cat(df, att1, att2, att1, type_s)
             else:
                 # class_label = cat, att1 = cat, att2 = cont
-                ax, caption = self._plotcatcont(
-                    df, att1, att2, att1, type_s
-                )
+                ax, caption = self.plotcatcont(df, att1, att2, att1, type_s)
         else:
             if self.util.is_categorical(df[att2]):
                 # class_label = cat, att1 = cont, att2 = cat
-                ax, caption = self._plotcatcont(
-                    df, att2, att1, att2, type_s
-                )
+                ax, caption = self.plotcatcont(df, att2, att1, att2, type_s)
             else:
                 # class_label = cat, att1 = cont, att2 = cont
                 ax, caption = self._plot2cont_cat(
@@ -205,7 +201,7 @@ class Plots:
             # class_label = cont, att1 = cont, att2 = cont
             ax, caption = self._plot2cont(df, att1, att2, type_s)
 
-        self._save_plot(
+        self.save_plot(
             ax, caption, f"Correlation of {att1} and {att2}", filename, type_s
         )
 
@@ -215,16 +211,16 @@ class Plots:
             f" {att} has more than 2 values. Perhaps you forgot to state a list of lists?"
         )
 
-    def _save_plot(self, ax, caption, header, filename, type_s):
+    def save_plot(self, ax, caption, header, filename, type_s):
         # one up because of the runs
         fig_dir = self.util.get_path("fig_dir") + "../"
-        fig = ax.figure
+        fig_plots = ax.figure
         # avoid warning
         # plt.tight_layout()
         img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
         plt.savefig(img_path)
-        plt.close(fig)
-        # fig.clear() # avoid error
+        plt.close(fig_plots)
+        self.util.debug(f"Saved plot to {img_path}")
         glob_conf.report.add_item(
             ReportItem(
                 Header.HEADER_EXPLORE,
@@ -244,35 +240,29 @@ class Plots:
         return att, df
 
     def _plot2cont_cat(self, df, cont1, cont2, cat, ylab):
-        """
-        plot relation of two continuous distributions with one categorical
-        """
+        """Plot relation of two continuous distributions with one categorical."""
         pearson = stats.pearsonr(df[cont1], df[cont2])
         # trunc to three digits
         pearson = int(pearson[0] * 1000) / 1000
         pearson_string = f"PCC: {pearson}"
         ax = sns.lmplot(data=df, x=cont1, y=cont2, hue=cat)
         caption = f"{ylab} {df.shape[0]}. {pearson_string}"
-        ax.fig.suptitle(caption)
+        ax.figure.suptitle(caption)
         return ax, caption
 
     def _plot2cont(self, df, col1, col2, ylab):
-        """
-        plot relation of two continuous distributions
-        """
+        """Plot relation of two continuous distributions."""
         pearson = stats.pearsonr(df[col1], df[col2])
         # trunc to three digits
         pearson = int(pearson[0] * 1000) / 1000
         pearson_string = f"PCC: {pearson}"
         ax = sns.lmplot(data=df, x=col1, y=col2)
         caption = f"{ylab} {df.shape[0]}. {pearson_string}"
-        ax.fig.suptitle(caption)
+        ax.figure.suptitle(caption)
         return ax, caption
 
-    def _plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
-        """
-        plot relation of categorical distribution with continuous
-        """
+    def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
+        """Plot relation of categorical distribution with continuous."""
         dist_type = self.util.config_val("EXPL", "dist_type", "hist")
         cats, cat_str, es = su.get_effect_size(df, cat_col, cont_col)
         if dist_type == "hist":
@@ -287,13 +277,11 @@ class Plots:
             )
         ax.set(xlabel=f"{cont_col}")
         caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
-        ax.fig.suptitle(caption)
+        ax.figure.suptitle(caption)
         return ax, caption
 
     def _plot2cat(self, df, col1, col2, xlab, ylab):
-        """
-        plot relation of 2 categorical distributions
-        """
+        """Plot relation of 2 categorical distributions."""
         crosstab = pd.crosstab(index=df[col1], columns=df[col2])
         res_pval = stats.chi2_contingency(crosstab)
         res_pval = int(res_pval[1] * 1000) / 1000
@@ -320,8 +308,8 @@ class Plots:
         max = self.util.to_3_digits(df.duration.max())
         title = f"Duration distr. for {sample_selection} {df.shape[0]}. min={min}, max={max}"
         ax.set_title(title)
-        ax.set_xlabel(f"duration")
-        ax.set_ylabel(f"number of samples")
+        ax.set_xlabel("duration")
+        ax.set_ylabel("number of samples")
         fig = ax.figure
         # plt.tight_layout()
         img_path = f"{fig_dir}{filename}_{sample_selection}.{self.format}"
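
Two API details in this file: _save_plot and _plotcatcont become public (save_plot, plotcatcont) so that the Reporter can call them, and ax.fig becomes ax.figure, following seaborn, where FacetGrid.figure was introduced as the replacement for the older .fig attribute (around seaborn 0.11.2, with .fig deprecated afterwards). A minimal sketch of the pattern with made-up data:

    import pandas as pd
    import seaborn as sns

    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1.1, 1.9, 3.2, 3.8]})
    grid = sns.lmplot(data=df, x="x", y="y")  # lmplot returns a FacetGrid
    grid.figure.suptitle("toy regression")    # .figure instead of the deprecated .fig
    grid.figure.savefig("toy.png")
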
nkululeko/reporting/reporter.py CHANGED
@@ -2,16 +2,21 @@ import ast
 import glob
 import json
 import math
+import os
 
 from confidence_intervals import evaluate_with_conf_int
 import matplotlib.pyplot as plt
 import numpy as np
+from scipy.special import softmax
+from scipy.stats import entropy
 from scipy.stats import pearsonr
-from sklearn.metrics import ConfusionMatrixDisplay, roc_curve
+from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.metrics import auc
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix
 from sklearn.metrics import r2_score
-from sklearn.metrics import roc_curve, auc, roc_auc_score
+from sklearn.metrics import roc_auc_score
+from sklearn.metrics import roc_curve
 from torch import is_tensor
 
 from audmetric import accuracy
@@ -21,6 +26,7 @@ from audmetric import mean_squared_error
 from audmetric import unweighted_average_recall
 
 import nkululeko.glob_conf as glob_conf
+from nkululeko.plots import Plots
 from nkululeko.reporting.defines import Header
 from nkululeko.reporting.report_item import ReportItem
 from nkululeko.reporting.result import Result
@@ -46,9 +52,18 @@ class Reporter:
             self.MEASURE = "CCC"
             self.result.measure = self.MEASURE
 
-    def __init__(self, truths, preds, run, epoch):
-        """Initialization with ground truth and predictions vector."""
+    def __init__(self, truths, preds, run, epoch, probas=None):
+        """Initialization with ground truth and predictions vector.
+
+        Args:
+            truths (list): the ground truth
+            preds (list): the predictions
+            run (int): number of run
+            epoch (int): number of epoch
+            probas (pd.DataFrame, optional): probabilities per class. Defaults to None.
+        """
         self.util = Util("reporter")
+        self.probas = probas
         self.format = self.util.config_val("PLOT", "format", "png")
         self.truths = np.asarray(truths)
         self.preds = np.asarray(preds)
@@ -108,6 +123,47 @@ class Reporter:
         self.result.test = test_result
         self.result.set_upper_lower(upper, lower)
         # train and loss are being set by the model
+        # print out the class probabilities
+
+    def print_probabilities(self):
+        """Print the probabilities per class to a file in the store."""
+        if (
+            self.util.exp_is_classification()
+            and self.probas is not None
+            and "uncertainty" not in self.probas
+        ):
+            probas = self.probas
+            probas["predicted"] = self.preds
+            probas["truth"] = self.truths
+            # softmax the probabilities or logits
+            uncertainty = probas.apply(softmax, axis=1)
+            try:
+                le = glob_conf.label_encoder
+                mapping = dict(zip(le.classes_, range(len(le.classes_))))
+                mapping_reverse = {value: key for key, value in mapping.items()}
+                probas = probas.rename(columns=mapping_reverse)
+                probas["predicted"] = probas["predicted"].map(mapping_reverse)
+                probas["truth"] = probas["truth"].map(mapping_reverse)
+            except AttributeError as ae:
+                self.util.debug(f"Can't label categories: {ae}")
+            # compute entropy per sample
+            uncertainty = uncertainty.apply(entropy)
+            # scale it to 0-1
+            max_ent = math.log(len(glob_conf.labels))
+            uncertainty = (uncertainty - uncertainty.min()) / (
+                max_ent - uncertainty.min()
+            )
+            probas["uncertainty"] = uncertainty
+            probas["correct"] = probas.predicted == probas.truth
+            sp = os.path.join(self.util.get_path("store"), "pred_df.csv")
+            self.probas = probas
+            probas.to_csv(sp)
+            self.util.debug(f"Saved probabilities to {sp}")
+            plots = Plots()
+            ax, caption = plots.plotcatcont(
+                probas, "correct", "uncertainty", "uncertainty", "correct"
+            )
+            plots.save_plot(ax, caption, "Uncertainty", "uncertainty", "samples")
 
     def set_id(self, run, epoch):
         """Make the report identifiable with run and epoch index."""
@@ -123,6 +179,12 @@ class Reporter:
         self.preds = np.digitize(self.preds, bins) - 1
 
     def plot_confmatrix(self, plot_name, epoch=None):
+        """Plot a confusion matrix to the store.
+
+        Args:
+            plot_name (str): name for the image file.
+            epoch (int, optional): Number of epoch. Defaults to None.
+        """
         if not self.util.exp_is_classification():
             self.continuous_to_categorical()
         self._plot_confmat(self.truths, self.preds, plot_name, epoch)
@@ -212,10 +274,11 @@ class Reporter:
         )
         img_path = f"{fig_dir}{plot_name}{self.filenameadd}.{self.format}"
         plt.savefig(img_path)
+        self.util.debug(f"Saved confusion plot to {img_path}")
         fig.clear()
         plt.close(fig)
-        plt.savefig(img_path)
-        plt.close(fig)
+        plt.close()
+        plt.clf()
         glob_conf.report.add_item(
             ReportItem(
                 Header.HEADER_RESULTS,
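
The new print_probabilities method derives a per-sample uncertainty from the logits: softmax, then Shannon entropy, then scaling against the maximum possible entropy log(n_classes). A standalone sketch of the core computation; the array values are made up:

    import math
    import numpy as np
    from scipy.special import softmax
    from scipy.stats import entropy

    logits = np.array([[2.0, 0.1, 0.1], [0.7, 0.6, 0.65]])  # confident vs. unsure
    probs = softmax(logits, axis=1)
    ent = entropy(probs, axis=1)         # per-sample entropy in nats
    max_ent = math.log(logits.shape[1])  # upper bound for 3 classes
    uncertainty = (ent - ent.min()) / (max_ent - ent.min())
    print(uncertainty)  # ~0 for the confident sample, ~1 for the unsure one
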
nkululeko/runmanager.py CHANGED
@@ -11,7 +11,7 @@ from nkululeko.utils.util import Util
 
 
 class Runmanager:
-    """Class to manage the runs of the experiment (e.g. when results differ caused by random initialization)"""
+    """Class to manage the runs of the experiment (e.g. when results differ caused by random initialization)."""
 
     model = None  # The underlying model
     df_train, df_test, feats_train, feats_test = (
@@ -23,15 +23,14 @@ class Runmanager:
     reports = []
 
     def __init__(self, df_train, df_test, feats_train, feats_test):
-        """Constructor setting up the dataframes
+        """Constructor setting up the dataframes.
+
         Args:
             df_train: train dataframe
             df_test: test dataframe
             feats_train: train features
             feats_train: test features
 
-        Returns:
-
         """
         self.df_train, self.df_test, self.feats_train, self.feats_test = (
             df_train,
@@ -46,7 +45,7 @@ class Runmanager:
         # self._select_model(model_type)
 
     def do_runs(self):
-        """Start the runs"""
+        """Start the runs."""
         self.best_results = []  # keep the best result per run
         self.last_epochs = []  # keep the epoch of best result per run
         # for all runs
@@ -105,15 +104,13 @@ class Runmanager:
             )
             self.print_model(best_report, plot_name)
         # finally, print out the numbers for this run
-        # self.reports[-1].print_results(
-        #     int(self.util.config_val("EXP", "epochs", 1))
-        # )
         best_report.print_results(best_report.epoch)
+        best_report.print_probabilities()
         self.best_results.append(best_report)
         self.last_epochs.append(last_epoch)
 
     def print_best_result_runs(self):
-        """Print the best result for all runs"""
+        """Print the best result for all runs."""
         best_report = self.get_best_result(self.best_results)
         self.util.debug(
             f"best result all runs with run {best_report.run} and"
@@ -177,7 +174,7 @@ class Runmanager:
         return self.load_model(best_report)
 
     def get_best_result(self, reports):
-        best_r = Reporter([], [], 0, 0)
+        best_r = Reporter([], [], None, 0, 0)
         if self.util.high_is_good():
             best_r = self.search_best_result(reports, "ascending")
         else:
@@ -185,7 +182,7 @@ class Runmanager:
         return best_r
 
     def search_best_result(self, reports, order):
-        best_r = Reporter([], [], 0, 0)
+        best_r = Reporter([], [], None, 0, 0)
         if order == "ascending":
             best_result = 0
             for r in reports:
nkululeko/test_predictor.py CHANGED
@@ -6,13 +6,12 @@
 
 import ast
 
-import numpy as np
 import pandas as pd
 from sklearn.preprocessing import LabelEncoder
 
+import nkululeko.glob_conf as glob_conf
 from nkululeko.data.dataset import Dataset
 from nkululeko.feature_extractor import FeatureExtractor
-import nkululeko.glob_conf as glob_conf
 from nkululeko.scaler import Scaler
 from nkululeko.utils.util import Util
 
@@ -42,7 +41,6 @@ class TestPredictor:
         scale = self.util.config_val("FEATS", "scale", False)
         labelenc = LabelEncoder()
         data_df[self.target] = labelenc.fit_transform(data_df[self.target])
-        # data_df[self.target] = self.label_encoder.fit_transform(data_df[self.target])
         if scale:
             self.scaler = Scaler(data_df, None, feats_df, None, scale)
             feats_df, _ = self.scaler.scale()
@@ -56,18 +54,13 @@ class TestPredictor:
         else:
             test_dbs = ast.literal_eval(glob_conf.config["DATA"]["tests"])
             test_dbs_string = "_".join(test_dbs)
-            predictions = self.model.get_predictions()
+            predictions, _ = self.model.get_predictions()
             report = self.model.predict()
            result = report.result.get_result()
             report.set_filename_add(f"test-{test_dbs_string}")
             self.util.print_best_results([report])
             report.plot_confmatrix(self.util.get_plot_name(), 0)
             report.print_results(0)
-            # print(predictions)
-            # df = pd.DataFrame(index=self.orig_df.index)
-            # df["speaker"] = self.orig_df["speaker"]
-            # df["gender"] = self.orig_df["gender"]
-            # df[self.target] = self.orig_df[self.target]
             df = self.orig_df.copy()
             df["predictions"] = self.label_encoder.inverse_transform(predictions)
             target = self.util.config_val("DATA", "target", "emotion")
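
The predictions written to the output dataframe pass through sklearn's LabelEncoder twice: fit_transform to integers before modeling, inverse_transform back to the original strings afterwards. A minimal round trip:

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    encoded = le.fit_transform(["happy", "sad", "happy", "neutral"])
    print(encoded)                        # [0 2 0 1] (classes sorted alphabetically)
    print(le.inverse_transform(encoded))  # back to the original strings
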
nkululeko/utils/stats.py CHANGED
@@ -70,12 +70,16 @@ def get_effect_size(df, target, variable):
         cats[c] = df[df[target] == c][variable].values
     combos = all_combinations(categories)
     results = {}
-    for combo in combos:
-        one = combo[0]
-        other = combo[1]
-        results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
-    max_cat = max(results, key=results.get)
-    cat_s = cohens_D_to_string(float(results[max_cat]))
+    if len(categories) == 1:
+        cat_s = cohens_D_to_string(0)
+        return categories[0], cat_s, 0
+    else:
+        for combo in combos:
+            one = combo[0]
+            other = combo[1]
+            results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
+        max_cat = max(results, key=results.get)
+        cat_s = cohens_D_to_string(float(results[max_cat]))
     return max_cat, cat_s, results[max_cat]
 
 
@@ -92,7 +96,7 @@ def cohens_D_to_string(val):
 
 
 def normalize(values):
-    """Do a z-transformation of a distribution.
+    """Do a z-transformation of a distribution.
 
     So that mean = 0 and variance = 1
     """
nkululeko/utils/util.py CHANGED
@@ -37,8 +37,7 @@ class Util:
         import nkululeko.glob_conf as glob_conf
 
         self.config = glob_conf.config
-        self.got_data_roots = self.config_val(
-            "DATA", "root_folders", False)
+        self.got_data_roots = self.config_val("DATA", "root_folders", False)
         if self.got_data_roots:
             # if there is a global data rootfolder file, read from
             # there
@@ -108,19 +107,17 @@ class Util:
         if self.got_data_roots:
             try:
                 if len(key) > 0:
-                    return self.data_roots["DATA"][dataset +
-                                                   "." + key].strip("'\"")
+                    return self.data_roots["DATA"][dataset + "." + key].strip("'\"")
                 else:
                     return self.data_roots["DATA"][dataset].strip("'\"")
             except KeyError:
                 if default not in self.stopvals:
                     self.debug(
-                        f"value for {key} not found, using default:"
-                        f" {default}")
+                        f"value for {key} not found, using default:" f" {default}"
+                    )
                 return default
         if default not in self.stopvals:
-            self.debug(
-                f"value for {key} not found, using default: {default}")
+            self.debug(f"value for {key} not found, using default: {default}")
         return default
 
     def set_config(self, config):
@@ -131,6 +128,10 @@ class Util:
         store = self.get_path("store")
         return f"{store}/{self.get_exp_name()}.pkl"
 
+    def get_pred_name(self):
+        store = self.get_path("store")
+        return f"{store}/pred_df.csv"
+
     def is_categorical(self, pd_series):
         """Check if a dataframe column is categorical"""
         return pd_series.dtype.name == "object" or isinstance(
@@ -163,10 +164,8 @@ class Util:
         if len(df) == 0:
             return df
         if not isinstance(df.index, pd.MultiIndex):
-            self.debug(
-                "converting to segmented index, this might take a while...")
-            df.index = audformat.utils.to_segmented_index(
-                df.index, allow_nat=False)
+            self.debug("converting to segmented index, this might take a while...")
+            df.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
         return df
 
     def _get_value_descript(self, section, name):
@@ -209,7 +208,11 @@ class Util:
         mt = f'{self.config["MODEL"]["type"]}'
         # ft = "_".join(ast.literal_eval(self.config["FEATS"]["type"]))
         ft_value = self.config["FEATS"]["type"]
-        if isinstance(ft_value, str) and ft_value.startswith("[") and ft_value.endswith("]"):
+        if (
+            isinstance(ft_value, str)
+            and ft_value.startswith("[")
+            and ft_value.endswith("]")
+        ):
             ft = "_".join(ast.literal_eval(ft_value))
         else:
             ft = ft_value
@@ -237,8 +240,9 @@ class Util:
             ["FEATS", "wav2vec2.layer"],
         ]
         for option in options:
-            return_string += self._get_value_descript(
-                option[0], option[1]).replace(".", "-")
+            return_string += self._get_value_descript(option[0], option[1]).replace(
+                ".", "-"
+            )
         return return_string
 
     def get_plot_name(self):
@@ -284,8 +288,7 @@ class Util:
             return self.config[section][key]
         except KeyError:
             if default not in self.stopvals:
-                self.debug(
-                    f"value for {key} not found, using default: {default}")
+                self.debug(f"value for {key} not found, using default: {default}")
             return default
 
     def config_val_list(self, section, key, default):
@@ -293,10 +296,12 @@ class Util:
             return ast.literal_eval(self.config[section][key])
         except KeyError:
             if default not in self.stopvals:
-                self.debug(
-                    f"value for {key} not found, using default: {default}")
+                self.debug(f"value for {key} not found, using default: {default}")
             return default
 
+    def get_labels(self):
+        return ast.literal_eval(self.config["DATA"]["labels"])
+
     def continuous_to_categorical(self, series):
         """
         discretize a categorical variable.
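
The new get_labels helper simply parses the labels entry of the [DATA] section of the INI configuration with ast.literal_eval; the same pattern works standalone:

    import ast
    import configparser

    config = configparser.ConfigParser()
    config.read_string("[DATA]\nlabels = ['angry', 'happy', 'neutral']\n")
    labels = ast.literal_eval(config["DATA"]["labels"])
    print(labels)  # ['angry', 'happy', 'neutral']
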