nkululeko 0.86.8__py3-none-any.whl → 0.88.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/constants.py +1 -1
- nkululeko/data/dataset_csv.py +12 -14
- nkululeko/demo.py +7 -10
- nkululeko/ensemble.py +158 -0
- nkululeko/feat_extract/feats_ast.py +118 -0
- nkululeko/feat_extract/feats_wav2vec2.py +2 -4
- nkululeko/feat_extract/feats_wavlm.py +7 -4
- nkululeko/feature_extractor.py +5 -9
- nkululeko/modelrunner.py +5 -5
- nkululeko/models/model.py +23 -3
- nkululeko/models/model_cnn.py +41 -22
- nkululeko/models/model_mlp.py +37 -17
- nkululeko/models/model_mlp_regression.py +3 -1
- nkululeko/plots.py +25 -37
- nkululeko/reporting/reporter.py +69 -6
- nkululeko/runmanager.py +8 -11
- nkululeko/test_predictor.py +2 -9
- nkululeko/utils/stats.py +11 -7
- nkululeko/utils/util.py +24 -19
- {nkululeko-0.86.8.dist-info → nkululeko-0.88.0.dist-info}/METADATA +22 -1
- {nkululeko-0.86.8.dist-info → nkululeko-0.88.0.dist-info}/RECORD +24 -22
- {nkululeko-0.86.8.dist-info → nkululeko-0.88.0.dist-info}/WHEEL +1 -1
- {nkululeko-0.86.8.dist-info → nkululeko-0.88.0.dist-info}/LICENSE +0 -0
- {nkululeko-0.86.8.dist-info → nkululeko-0.88.0.dist-info}/top_level.txt +0 -0
nkululeko/models/model_mlp.py
CHANGED
@@ -1,25 +1,33 @@
|
|
1
1
|
# model_mlp.py
|
2
|
+
import ast
|
3
|
+
from collections import OrderedDict
|
4
|
+
|
5
|
+
import numpy as np
|
2
6
|
import pandas as pd
|
7
|
+
from sklearn.metrics import recall_score
|
8
|
+
import torch
|
3
9
|
|
4
|
-
from nkululeko.utils.util import Util
|
5
10
|
import nkululeko.glob_conf as glob_conf
|
11
|
+
from nkululeko.losses.loss_softf1loss import SoftF1Loss
|
6
12
|
from nkululeko.models.model import Model
|
7
13
|
from nkululeko.reporting.reporter import Reporter
|
8
|
-
import
|
9
|
-
import ast
|
10
|
-
import numpy as np
|
11
|
-
from sklearn.metrics import recall_score
|
12
|
-
from collections import OrderedDict
|
13
|
-
from nkululeko.losses.loss_softf1loss import SoftF1Loss
|
14
|
+
from nkululeko.utils.util import Util
|
14
15
|
|
15
16
|
|
16
|
-
class
|
17
|
+
class MLPModel(Model):
|
17
18
|
"""MLP = multi layer perceptron."""
|
18
19
|
|
19
20
|
is_classifier = True
|
20
21
|
|
21
22
|
def __init__(self, df_train, df_test, feats_train, feats_test):
|
22
|
-
"""Constructor taking
|
23
|
+
"""Constructor, taking all dataframes.
|
24
|
+
|
25
|
+
Args:
|
26
|
+
df_train (pd.DataFrame): The train labels.
|
27
|
+
df_test (pd.DataFrame): The test labels.
|
28
|
+
feats_train (pd.DataFrame): The train features.
|
29
|
+
feats_test (pd.DataFrame): The test features.
|
30
|
+
"""
|
23
31
|
super().__init__(df_train, df_test, feats_train, feats_test)
|
24
32
|
super().set_model_type("ann")
|
25
33
|
self.name = "mlp"
|
@@ -97,7 +105,7 @@ class MLP_model(Model):
|
|
97
105
|
self.optimizer.step()
|
98
106
|
self.loss = (np.asarray(losses)).mean()
|
99
107
|
|
100
|
-
def
|
108
|
+
def evaluate(self, model, loader, device):
|
101
109
|
logits = torch.zeros(len(loader.dataset), self.class_num)
|
102
110
|
targets = torch.zeros(len(loader.dataset))
|
103
111
|
model.eval()
|
@@ -119,14 +127,28 @@ class MLP_model(Model):
|
|
119
127
|
self.loss_eval = (np.asarray(losses)).mean()
|
120
128
|
predictions = logits.argmax(dim=1)
|
121
129
|
uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
|
122
|
-
return uar, targets, predictions
|
130
|
+
return uar, targets, predictions, logits
|
131
|
+
|
132
|
+
def get_probas(self, logits):
|
133
|
+
# make a dataframe for probabilites (logits)
|
134
|
+
proba_d = {}
|
135
|
+
classes = self.df_test[self.target].unique()
|
136
|
+
classes.sort()
|
137
|
+
for c in classes:
|
138
|
+
proba_d[c] = []
|
139
|
+
for i, c in enumerate(classes):
|
140
|
+
proba_d[c] = list(logits.numpy().T[i])
|
141
|
+
probas = pd.DataFrame(proba_d)
|
142
|
+
probas = probas.set_index(self.df_test.index)
|
143
|
+
return probas
|
123
144
|
|
124
145
|
def predict(self):
|
125
|
-
_, truths, predictions = self.
|
146
|
+
_, truths, predictions, logits = self.evaluate(
|
126
147
|
self.model, self.testloader, self.device
|
127
148
|
)
|
128
|
-
uar, _, _ = self.
|
129
|
-
|
149
|
+
uar, _, _, _ = self.evaluate(self.model, self.trainloader, self.device)
|
150
|
+
probas = self.get_probas(logits)
|
151
|
+
report = Reporter(truths, predictions, self.run, self.epoch, probas=probas)
|
130
152
|
try:
|
131
153
|
report.result.loss = self.loss
|
132
154
|
except AttributeError: # if the model was loaded from disk the loss is unknown
|
@@ -139,9 +161,7 @@ class MLP_model(Model):
|
|
139
161
|
return report
|
140
162
|
|
141
163
|
def get_predictions(self):
|
142
|
-
_,
|
143
|
-
self.model, self.testloader, self.device
|
144
|
-
)
|
164
|
+
_, _, predictions, _ = self.evaluate(self.model, self.testloader, self.device)
|
145
165
|
return predictions.numpy()
|
146
166
|
|
147
167
|
def get_loader(self, df_x, df_y, shuffle):
|
@@ -97,7 +97,9 @@ class MLP_Reg_model(Model):
|
|
97
97
|
self.model, self.testloader, self.device
|
98
98
|
)
|
99
99
|
result, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
|
100
|
-
report = Reporter(
|
100
|
+
report = Reporter(
|
101
|
+
truths.numpy(), predictions.numpy(), None, self.run, self.epoch
|
102
|
+
)
|
101
103
|
try:
|
102
104
|
report.result.loss = self.loss
|
103
105
|
except AttributeError: # if the model was loaded from disk the loss is unknown
|
nkululeko/plots.py
CHANGED
@@ -48,7 +48,7 @@ class Plots:
|
|
48
48
|
)
|
49
49
|
ax.set_ylabel(f"number of speakers")
|
50
50
|
ax.set_xlabel("number of samples")
|
51
|
-
self.
|
51
|
+
self.save_plot(
|
52
52
|
ax,
|
53
53
|
"Samples per speaker",
|
54
54
|
f"Samples per speaker ({df_speakers.shape[0]})",
|
@@ -70,9 +70,9 @@ class Plots:
|
|
70
70
|
rot=0,
|
71
71
|
)
|
72
72
|
)
|
73
|
-
ax.set_ylabel(
|
73
|
+
ax.set_ylabel("number of speakers")
|
74
74
|
ax.set_xlabel("number of samples")
|
75
|
-
self.
|
75
|
+
self.save_plot(
|
76
76
|
ax,
|
77
77
|
"Sample value counts",
|
78
78
|
f"Samples per speaker ({df_speakers.shape[0]})",
|
@@ -96,7 +96,7 @@ class Plots:
|
|
96
96
|
binned_data = self.util.continuous_to_categorical(df[class_label])
|
97
97
|
ax = binned_data.value_counts().plot(kind="bar")
|
98
98
|
filename_binned = f"{class_label}_discreet"
|
99
|
-
self.
|
99
|
+
self.save_plot(
|
100
100
|
ax,
|
101
101
|
"Sample value counts",
|
102
102
|
filename_binned,
|
@@ -106,7 +106,7 @@ class Plots:
|
|
106
106
|
dist_type = self.util.config_val("EXPL", "dist_type", "hist")
|
107
107
|
ax = df[class_label].plot(kind=dist_type)
|
108
108
|
|
109
|
-
self.
|
109
|
+
self.save_plot(
|
110
110
|
ax,
|
111
111
|
"Sample value counts",
|
112
112
|
filename,
|
@@ -131,17 +131,17 @@ class Plots:
|
|
131
131
|
df, class_label, att1, self.target, type_s
|
132
132
|
)
|
133
133
|
else:
|
134
|
-
ax, caption = self.
|
134
|
+
ax, caption = self.plotcatcont(
|
135
135
|
df, class_label, att1, att1, type_s
|
136
136
|
)
|
137
137
|
else:
|
138
138
|
if self.util.is_categorical(df[att1]):
|
139
|
-
ax, caption = self.
|
139
|
+
ax, caption = self.plotcatcont(
|
140
140
|
df, att1, class_label, att1, type_s
|
141
141
|
)
|
142
142
|
else:
|
143
143
|
ax, caption = self._plot2cont(df, class_label, att1, type_s)
|
144
|
-
self.
|
144
|
+
self.save_plot(
|
145
145
|
ax,
|
146
146
|
caption,
|
147
147
|
f"Correlation of {self.target} and {att[0]}",
|
@@ -171,15 +171,11 @@ class Plots:
|
|
171
171
|
ax, caption = self._plot2cat(df, att1, att2, att1, type_s)
|
172
172
|
else:
|
173
173
|
# class_label = cat, att1 = cat, att2 = cont
|
174
|
-
ax, caption = self.
|
175
|
-
df, att1, att2, att1, type_s
|
176
|
-
)
|
174
|
+
ax, caption = self.plotcatcont(df, att1, att2, att1, type_s)
|
177
175
|
else:
|
178
176
|
if self.util.is_categorical(df[att2]):
|
179
177
|
# class_label = cat, att1 = cont, att2 = cat
|
180
|
-
ax, caption = self.
|
181
|
-
df, att2, att1, att2, type_s
|
182
|
-
)
|
178
|
+
ax, caption = self.plotcatcont(df, att2, att1, att2, type_s)
|
183
179
|
else:
|
184
180
|
# class_label = cat, att1 = cont, att2 = cont
|
185
181
|
ax, caption = self._plot2cont_cat(
|
@@ -205,7 +201,7 @@ class Plots:
|
|
205
201
|
# class_label = cont, att1 = cont, att2 = cont
|
206
202
|
ax, caption = self._plot2cont(df, att1, att2, type_s)
|
207
203
|
|
208
|
-
self.
|
204
|
+
self.save_plot(
|
209
205
|
ax, caption, f"Correlation of {att1} and {att2}", filename, type_s
|
210
206
|
)
|
211
207
|
|
@@ -215,16 +211,16 @@ class Plots:
|
|
215
211
|
f" {att} has more than 2 values. Perhaps you forgot to state a list of lists?"
|
216
212
|
)
|
217
213
|
|
218
|
-
def
|
214
|
+
def save_plot(self, ax, caption, header, filename, type_s):
|
219
215
|
# one up because of the runs
|
220
216
|
fig_dir = self.util.get_path("fig_dir") + "../"
|
221
|
-
|
217
|
+
fig_plots = ax.figure
|
222
218
|
# avoid warning
|
223
219
|
# plt.tight_layout()
|
224
220
|
img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
|
225
221
|
plt.savefig(img_path)
|
226
|
-
plt.close(
|
227
|
-
|
222
|
+
plt.close(fig_plots)
|
223
|
+
self.util.debug(f"Saved plot to {img_path}")
|
228
224
|
glob_conf.report.add_item(
|
229
225
|
ReportItem(
|
230
226
|
Header.HEADER_EXPLORE,
|
@@ -244,35 +240,29 @@ class Plots:
|
|
244
240
|
return att, df
|
245
241
|
|
246
242
|
def _plot2cont_cat(self, df, cont1, cont2, cat, ylab):
|
247
|
-
"""
|
248
|
-
plot relation of two continuous distributions with one categorical
|
249
|
-
"""
|
243
|
+
"""Plot relation of two continuous distributions with one categorical."""
|
250
244
|
pearson = stats.pearsonr(df[cont1], df[cont2])
|
251
245
|
# trunc to three digits
|
252
246
|
pearson = int(pearson[0] * 1000) / 1000
|
253
247
|
pearson_string = f"PCC: {pearson}"
|
254
248
|
ax = sns.lmplot(data=df, x=cont1, y=cont2, hue=cat)
|
255
249
|
caption = f"{ylab} {df.shape[0]}. {pearson_string}"
|
256
|
-
ax.
|
250
|
+
ax.figure.suptitle(caption)
|
257
251
|
return ax, caption
|
258
252
|
|
259
253
|
def _plot2cont(self, df, col1, col2, ylab):
|
260
|
-
"""
|
261
|
-
plot relation of two continuous distributions
|
262
|
-
"""
|
254
|
+
"""Plot relation of two continuous distributions."""
|
263
255
|
pearson = stats.pearsonr(df[col1], df[col2])
|
264
256
|
# trunc to three digits
|
265
257
|
pearson = int(pearson[0] * 1000) / 1000
|
266
258
|
pearson_string = f"PCC: {pearson}"
|
267
259
|
ax = sns.lmplot(data=df, x=col1, y=col2)
|
268
260
|
caption = f"{ylab} {df.shape[0]}. {pearson_string}"
|
269
|
-
ax.
|
261
|
+
ax.figure.suptitle(caption)
|
270
262
|
return ax, caption
|
271
263
|
|
272
|
-
def
|
273
|
-
"""
|
274
|
-
plot relation of categorical distribution with continuous
|
275
|
-
"""
|
264
|
+
def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
|
265
|
+
"""Plot relation of categorical distribution with continuous."""
|
276
266
|
dist_type = self.util.config_val("EXPL", "dist_type", "hist")
|
277
267
|
cats, cat_str, es = su.get_effect_size(df, cat_col, cont_col)
|
278
268
|
if dist_type == "hist":
|
@@ -287,13 +277,11 @@ class Plots:
|
|
287
277
|
)
|
288
278
|
ax.set(xlabel=f"{cont_col}")
|
289
279
|
caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
|
290
|
-
ax.
|
280
|
+
ax.figure.suptitle(caption)
|
291
281
|
return ax, caption
|
292
282
|
|
293
283
|
def _plot2cat(self, df, col1, col2, xlab, ylab):
|
294
|
-
"""
|
295
|
-
plot relation of 2 categorical distributions
|
296
|
-
"""
|
284
|
+
"""Plot relation of 2 categorical distributions."""
|
297
285
|
crosstab = pd.crosstab(index=df[col1], columns=df[col2])
|
298
286
|
res_pval = stats.chi2_contingency(crosstab)
|
299
287
|
res_pval = int(res_pval[1] * 1000) / 1000
|
@@ -320,8 +308,8 @@ class Plots:
|
|
320
308
|
max = self.util.to_3_digits(df.duration.max())
|
321
309
|
title = f"Duration distr. for {sample_selection} {df.shape[0]}. min={min}, max={max}"
|
322
310
|
ax.set_title(title)
|
323
|
-
ax.set_xlabel(
|
324
|
-
ax.set_ylabel(
|
311
|
+
ax.set_xlabel("duration")
|
312
|
+
ax.set_ylabel("number of samples")
|
325
313
|
fig = ax.figure
|
326
314
|
# plt.tight_layout()
|
327
315
|
img_path = f"{fig_dir}{filename}_{sample_selection}.{self.format}"
|
nkululeko/reporting/reporter.py
CHANGED
@@ -2,16 +2,21 @@ import ast
|
|
2
2
|
import glob
|
3
3
|
import json
|
4
4
|
import math
|
5
|
+
import os
|
5
6
|
|
6
7
|
from confidence_intervals import evaluate_with_conf_int
|
7
8
|
import matplotlib.pyplot as plt
|
8
9
|
import numpy as np
|
10
|
+
from scipy.special import softmax
|
11
|
+
from scipy.stats import entropy
|
9
12
|
from scipy.stats import pearsonr
|
10
|
-
from sklearn.metrics import ConfusionMatrixDisplay
|
13
|
+
from sklearn.metrics import ConfusionMatrixDisplay
|
14
|
+
from sklearn.metrics import auc
|
11
15
|
from sklearn.metrics import classification_report
|
12
16
|
from sklearn.metrics import confusion_matrix
|
13
17
|
from sklearn.metrics import r2_score
|
14
|
-
from sklearn.metrics import
|
18
|
+
from sklearn.metrics import roc_auc_score
|
19
|
+
from sklearn.metrics import roc_curve
|
15
20
|
from torch import is_tensor
|
16
21
|
|
17
22
|
from audmetric import accuracy
|
@@ -21,6 +26,7 @@ from audmetric import mean_squared_error
|
|
21
26
|
from audmetric import unweighted_average_recall
|
22
27
|
|
23
28
|
import nkululeko.glob_conf as glob_conf
|
29
|
+
from nkululeko.plots import Plots
|
24
30
|
from nkululeko.reporting.defines import Header
|
25
31
|
from nkululeko.reporting.report_item import ReportItem
|
26
32
|
from nkululeko.reporting.result import Result
|
@@ -46,9 +52,18 @@ class Reporter:
|
|
46
52
|
self.MEASURE = "CCC"
|
47
53
|
self.result.measure = self.MEASURE
|
48
54
|
|
49
|
-
def __init__(self, truths, preds, run, epoch):
|
50
|
-
"""Initialization with ground truth und predictions vector.
|
55
|
+
def __init__(self, truths, preds, run, epoch, probas=None):
|
56
|
+
"""Initialization with ground truth und predictions vector.
|
57
|
+
|
58
|
+
Args:
|
59
|
+
truths (list): the ground truth
|
60
|
+
preds (list): the predictions
|
61
|
+
run (int): number of run
|
62
|
+
epoch (int): number of epoch
|
63
|
+
probas (pd.Dataframe, optional): probabilities per class. Defaults to None.
|
64
|
+
"""
|
51
65
|
self.util = Util("reporter")
|
66
|
+
self.probas = probas
|
52
67
|
self.format = self.util.config_val("PLOT", "format", "png")
|
53
68
|
self.truths = np.asarray(truths)
|
54
69
|
self.preds = np.asarray(preds)
|
@@ -108,6 +123,47 @@ class Reporter:
|
|
108
123
|
self.result.test = test_result
|
109
124
|
self.result.set_upper_lower(upper, lower)
|
110
125
|
# train and loss are being set by the model
|
126
|
+
# print out the class probilities
|
127
|
+
|
128
|
+
def print_probabilities(self):
|
129
|
+
"""Print the probabilities per class to a file in the store."""
|
130
|
+
if (
|
131
|
+
self.util.exp_is_classification()
|
132
|
+
and self.probas is not None
|
133
|
+
and "uncertainty" not in self.probas
|
134
|
+
):
|
135
|
+
probas = self.probas
|
136
|
+
probas["predicted"] = self.preds
|
137
|
+
probas["truth"] = self.truths
|
138
|
+
# softmax the probabilities or logits
|
139
|
+
uncertainty = probas.apply(softmax, axis=1)
|
140
|
+
try:
|
141
|
+
le = glob_conf.label_encoder
|
142
|
+
mapping = dict(zip(le.classes_, range(len(le.classes_))))
|
143
|
+
mapping_reverse = {value: key for key, value in mapping.items()}
|
144
|
+
probas = probas.rename(columns=mapping_reverse)
|
145
|
+
probas["predicted"] = probas["predicted"].map(mapping_reverse)
|
146
|
+
probas["truth"] = probas["truth"].map(mapping_reverse)
|
147
|
+
except AttributeError as ae:
|
148
|
+
self.util.debug(f"Can't label categories: {ae}")
|
149
|
+
# compute entropy per sample
|
150
|
+
uncertainty = uncertainty.apply(entropy)
|
151
|
+
# scale it to 0-1
|
152
|
+
max_ent = math.log(len(glob_conf.labels))
|
153
|
+
uncertainty = (uncertainty - uncertainty.min()) / (
|
154
|
+
max_ent - uncertainty.min()
|
155
|
+
)
|
156
|
+
probas["uncertainty"] = uncertainty
|
157
|
+
probas["correct"] = probas.predicted == probas.truth
|
158
|
+
sp = os.path.join(self.util.get_path("store"), "pred_df.csv")
|
159
|
+
self.probas = probas
|
160
|
+
probas.to_csv(sp)
|
161
|
+
self.util.debug(f"Saved probabilities to {sp}")
|
162
|
+
plots = Plots()
|
163
|
+
ax, caption = plots.plotcatcont(
|
164
|
+
probas, "correct", "uncertainty", "uncertainty", "correct"
|
165
|
+
)
|
166
|
+
plots.save_plot(ax, caption, "Uncertainty", "uncertainty", "samples")
|
111
167
|
|
112
168
|
def set_id(self, run, epoch):
|
113
169
|
"""Make the report identifiable with run and epoch index."""
|
@@ -123,6 +179,12 @@ class Reporter:
|
|
123
179
|
self.preds = np.digitize(self.preds, bins) - 1
|
124
180
|
|
125
181
|
def plot_confmatrix(self, plot_name, epoch=None):
|
182
|
+
"""Plot a confusionmatrix to the store.
|
183
|
+
|
184
|
+
Args:
|
185
|
+
plot_name (str): name for the image file.
|
186
|
+
epoch (int, optional): Number of epoch. Defaults to None.
|
187
|
+
"""
|
126
188
|
if not self.util.exp_is_classification():
|
127
189
|
self.continuous_to_categorical()
|
128
190
|
self._plot_confmat(self.truths, self.preds, plot_name, epoch)
|
@@ -212,10 +274,11 @@ class Reporter:
|
|
212
274
|
)
|
213
275
|
img_path = f"{fig_dir}{plot_name}{self.filenameadd}.{self.format}"
|
214
276
|
plt.savefig(img_path)
|
277
|
+
self.util.debug(f"Saved confusion plot to {img_path}")
|
215
278
|
fig.clear()
|
216
279
|
plt.close(fig)
|
217
|
-
plt.
|
218
|
-
plt.
|
280
|
+
plt.close()
|
281
|
+
plt.clf()
|
219
282
|
glob_conf.report.add_item(
|
220
283
|
ReportItem(
|
221
284
|
Header.HEADER_RESULTS,
|
nkululeko/runmanager.py
CHANGED
@@ -11,7 +11,7 @@ from nkululeko.utils.util import Util
|
|
11
11
|
|
12
12
|
|
13
13
|
class Runmanager:
|
14
|
-
"""Class to manage the runs of the experiment (e.g. when results differ caused by random initialization)"""
|
14
|
+
"""Class to manage the runs of the experiment (e.g. when results differ caused by random initialization)."""
|
15
15
|
|
16
16
|
model = None # The underlying model
|
17
17
|
df_train, df_test, feats_train, feats_test = (
|
@@ -23,15 +23,14 @@ class Runmanager:
|
|
23
23
|
reports = []
|
24
24
|
|
25
25
|
def __init__(self, df_train, df_test, feats_train, feats_test):
|
26
|
-
"""Constructor setting up the dataframes
|
26
|
+
"""Constructor setting up the dataframes.
|
27
|
+
|
27
28
|
Args:
|
28
29
|
df_train: train dataframe
|
29
30
|
df_test: test dataframe
|
30
31
|
feats_train: train features
|
31
32
|
feats_train: test features
|
32
33
|
|
33
|
-
Returns:
|
34
|
-
|
35
34
|
"""
|
36
35
|
self.df_train, self.df_test, self.feats_train, self.feats_test = (
|
37
36
|
df_train,
|
@@ -46,7 +45,7 @@ class Runmanager:
|
|
46
45
|
# self._select_model(model_type)
|
47
46
|
|
48
47
|
def do_runs(self):
|
49
|
-
"""Start the runs"""
|
48
|
+
"""Start the runs."""
|
50
49
|
self.best_results = [] # keep the best result per run
|
51
50
|
self.last_epochs = [] # keep the epoch of best result per run
|
52
51
|
# for all runs
|
@@ -105,15 +104,13 @@ class Runmanager:
|
|
105
104
|
)
|
106
105
|
self.print_model(best_report, plot_name)
|
107
106
|
# finally, print out the numbers for this run
|
108
|
-
# self.reports[-1].print_results(
|
109
|
-
# int(self.util.config_val("EXP", "epochs", 1))
|
110
|
-
# )
|
111
107
|
best_report.print_results(best_report.epoch)
|
108
|
+
best_report.print_probabilities()
|
112
109
|
self.best_results.append(best_report)
|
113
110
|
self.last_epochs.append(last_epoch)
|
114
111
|
|
115
112
|
def print_best_result_runs(self):
|
116
|
-
"""Print the best result for all runs"""
|
113
|
+
"""Print the best result for all runs."""
|
117
114
|
best_report = self.get_best_result(self.best_results)
|
118
115
|
self.util.debug(
|
119
116
|
f"best result all runs with run {best_report.run} and"
|
@@ -177,7 +174,7 @@ class Runmanager:
|
|
177
174
|
return self.load_model(best_report)
|
178
175
|
|
179
176
|
def get_best_result(self, reports):
|
180
|
-
best_r = Reporter([], [], 0, 0)
|
177
|
+
best_r = Reporter([], [], None, 0, 0)
|
181
178
|
if self.util.high_is_good():
|
182
179
|
best_r = self.search_best_result(reports, "ascending")
|
183
180
|
else:
|
@@ -185,7 +182,7 @@ class Runmanager:
|
|
185
182
|
return best_r
|
186
183
|
|
187
184
|
def search_best_result(self, reports, order):
|
188
|
-
best_r = Reporter([], [], 0, 0)
|
185
|
+
best_r = Reporter([], [], None, 0, 0)
|
189
186
|
if order == "ascending":
|
190
187
|
best_result = 0
|
191
188
|
for r in reports:
|
nkululeko/test_predictor.py
CHANGED
@@ -6,13 +6,12 @@
|
|
6
6
|
|
7
7
|
import ast
|
8
8
|
|
9
|
-
import numpy as np
|
10
9
|
import pandas as pd
|
11
10
|
from sklearn.preprocessing import LabelEncoder
|
12
11
|
|
12
|
+
import nkululeko.glob_conf as glob_conf
|
13
13
|
from nkululeko.data.dataset import Dataset
|
14
14
|
from nkululeko.feature_extractor import FeatureExtractor
|
15
|
-
import nkululeko.glob_conf as glob_conf
|
16
15
|
from nkululeko.scaler import Scaler
|
17
16
|
from nkululeko.utils.util import Util
|
18
17
|
|
@@ -42,7 +41,6 @@ class TestPredictor:
|
|
42
41
|
scale = self.util.config_val("FEATS", "scale", False)
|
43
42
|
labelenc = LabelEncoder()
|
44
43
|
data_df[self.target] = labelenc.fit_transform(data_df[self.target])
|
45
|
-
# data_df[self.target] = self.label_encoder.fit_transform(data_df[self.target])
|
46
44
|
if scale:
|
47
45
|
self.scaler = Scaler(data_df, None, feats_df, None, scale)
|
48
46
|
feats_df, _ = self.scaler.scale()
|
@@ -56,18 +54,13 @@ class TestPredictor:
|
|
56
54
|
else:
|
57
55
|
test_dbs = ast.literal_eval(glob_conf.config["DATA"]["tests"])
|
58
56
|
test_dbs_string = "_".join(test_dbs)
|
59
|
-
predictions = self.model.get_predictions()
|
57
|
+
predictions, _ = self.model.get_predictions()
|
60
58
|
report = self.model.predict()
|
61
59
|
result = report.result.get_result()
|
62
60
|
report.set_filename_add(f"test-{test_dbs_string}")
|
63
61
|
self.util.print_best_results([report])
|
64
62
|
report.plot_confmatrix(self.util.get_plot_name(), 0)
|
65
63
|
report.print_results(0)
|
66
|
-
# print(predictions)
|
67
|
-
# df = pd.DataFrame(index=self.orig_df.index)
|
68
|
-
# df["speaker"] = self.orig_df["speaker"]
|
69
|
-
# df["gender"] = self.orig_df["gender"]
|
70
|
-
# df[self.target] = self.orig_df[self.target]
|
71
64
|
df = self.orig_df.copy()
|
72
65
|
df["predictions"] = self.label_encoder.inverse_transform(predictions)
|
73
66
|
target = self.util.config_val("DATA", "target", "emotion")
|
nkululeko/utils/stats.py
CHANGED
@@ -70,12 +70,16 @@ def get_effect_size(df, target, variable):
|
|
70
70
|
cats[c] = df[df[target] == c][variable].values
|
71
71
|
combos = all_combinations(categories)
|
72
72
|
results = {}
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
if len(categories) == 1:
|
74
|
+
cat_s = cohens_D_to_string(0)
|
75
|
+
return categories[0], cat_s, 0
|
76
|
+
else:
|
77
|
+
for combo in combos:
|
78
|
+
one = combo[0]
|
79
|
+
other = combo[1]
|
80
|
+
results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
|
81
|
+
max_cat = max(results, key=results.get)
|
82
|
+
cat_s = cohens_D_to_string(float(results[max_cat]))
|
79
83
|
return max_cat, cat_s, results[max_cat]
|
80
84
|
|
81
85
|
|
@@ -92,7 +96,7 @@ def cohens_D_to_string(val):
|
|
92
96
|
|
93
97
|
|
94
98
|
def normalize(values):
|
95
|
-
"""Do a z-transformation of a distribution.
|
99
|
+
"""Do a z-transformation of a distribution.
|
96
100
|
|
97
101
|
So that mean = 0 and variance = 1
|
98
102
|
"""
|
nkululeko/utils/util.py
CHANGED
@@ -37,8 +37,7 @@ class Util:
|
|
37
37
|
import nkululeko.glob_conf as glob_conf
|
38
38
|
|
39
39
|
self.config = glob_conf.config
|
40
|
-
self.got_data_roots = self.config_val(
|
41
|
-
"DATA", "root_folders", False)
|
40
|
+
self.got_data_roots = self.config_val("DATA", "root_folders", False)
|
42
41
|
if self.got_data_roots:
|
43
42
|
# if there is a global data rootfolder file, read from
|
44
43
|
# there
|
@@ -108,19 +107,17 @@ class Util:
|
|
108
107
|
if self.got_data_roots:
|
109
108
|
try:
|
110
109
|
if len(key) > 0:
|
111
|
-
return self.data_roots["DATA"][dataset +
|
112
|
-
"." + key].strip("'\"")
|
110
|
+
return self.data_roots["DATA"][dataset + "." + key].strip("'\"")
|
113
111
|
else:
|
114
112
|
return self.data_roots["DATA"][dataset].strip("'\"")
|
115
113
|
except KeyError:
|
116
114
|
if default not in self.stopvals:
|
117
115
|
self.debug(
|
118
|
-
f"value for {key} not found, using default:"
|
119
|
-
|
116
|
+
f"value for {key} not found, using default:" f" {default}"
|
117
|
+
)
|
120
118
|
return default
|
121
119
|
if default not in self.stopvals:
|
122
|
-
self.debug(
|
123
|
-
f"value for {key} not found, using default: {default}")
|
120
|
+
self.debug(f"value for {key} not found, using default: {default}")
|
124
121
|
return default
|
125
122
|
|
126
123
|
def set_config(self, config):
|
@@ -131,6 +128,10 @@ class Util:
|
|
131
128
|
store = self.get_path("store")
|
132
129
|
return f"{store}/{self.get_exp_name()}.pkl"
|
133
130
|
|
131
|
+
def get_pred_name(self):
|
132
|
+
store = self.get_path("store")
|
133
|
+
return f"{store}/pred_df.csv"
|
134
|
+
|
134
135
|
def is_categorical(self, pd_series):
|
135
136
|
"""Check if a dataframe column is categorical"""
|
136
137
|
return pd_series.dtype.name == "object" or isinstance(
|
@@ -163,10 +164,8 @@ class Util:
|
|
163
164
|
if len(df) == 0:
|
164
165
|
return df
|
165
166
|
if not isinstance(df.index, pd.MultiIndex):
|
166
|
-
self.debug(
|
167
|
-
|
168
|
-
df.index = audformat.utils.to_segmented_index(
|
169
|
-
df.index, allow_nat=False)
|
167
|
+
self.debug("converting to segmented index, this might take a while...")
|
168
|
+
df.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
|
170
169
|
return df
|
171
170
|
|
172
171
|
def _get_value_descript(self, section, name):
|
@@ -209,7 +208,11 @@ class Util:
|
|
209
208
|
mt = f'{self.config["MODEL"]["type"]}'
|
210
209
|
# ft = "_".join(ast.literal_eval(self.config["FEATS"]["type"]))
|
211
210
|
ft_value = self.config["FEATS"]["type"]
|
212
|
-
if
|
211
|
+
if (
|
212
|
+
isinstance(ft_value, str)
|
213
|
+
and ft_value.startswith("[")
|
214
|
+
and ft_value.endswith("]")
|
215
|
+
):
|
213
216
|
ft = "_".join(ast.literal_eval(ft_value))
|
214
217
|
else:
|
215
218
|
ft = ft_value
|
@@ -237,8 +240,9 @@ class Util:
|
|
237
240
|
["FEATS", "wav2vec2.layer"],
|
238
241
|
]
|
239
242
|
for option in options:
|
240
|
-
return_string += self._get_value_descript(
|
241
|
-
|
243
|
+
return_string += self._get_value_descript(option[0], option[1]).replace(
|
244
|
+
".", "-"
|
245
|
+
)
|
242
246
|
return return_string
|
243
247
|
|
244
248
|
def get_plot_name(self):
|
@@ -284,8 +288,7 @@ class Util:
|
|
284
288
|
return self.config[section][key]
|
285
289
|
except KeyError:
|
286
290
|
if default not in self.stopvals:
|
287
|
-
self.debug(
|
288
|
-
f"value for {key} not found, using default: {default}")
|
291
|
+
self.debug(f"value for {key} not found, using default: {default}")
|
289
292
|
return default
|
290
293
|
|
291
294
|
def config_val_list(self, section, key, default):
|
@@ -293,10 +296,12 @@ class Util:
|
|
293
296
|
return ast.literal_eval(self.config[section][key])
|
294
297
|
except KeyError:
|
295
298
|
if default not in self.stopvals:
|
296
|
-
self.debug(
|
297
|
-
f"value for {key} not found, using default: {default}")
|
299
|
+
self.debug(f"value for {key} not found, using default: {default}")
|
298
300
|
return default
|
299
301
|
|
302
|
+
def get_labels(self):
|
303
|
+
return ast.literal_eval(self.config["DATA"]["labels"])
|
304
|
+
|
300
305
|
def continuous_to_categorical(self, series):
|
301
306
|
"""
|
302
307
|
discretize a categorical variable.
|