PyPI - nkululeko - Versions diffs - 0.86.8__py3-none-any.whl → 0.87.0__py3-none-any.whl - Mend

nkululeko 0.86.8__py3-none-any.whl → 0.87.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

nkululeko/constants.py +1 -1
nkululeko/data/dataset_csv.py +12 -14
nkululeko/demo.py +4 -8
nkululeko/modelrunner.py +5 -5
nkululeko/models/model.py +23 -3
nkululeko/models/model_cnn.py +41 -22
nkululeko/models/model_mlp.py +37 -17
nkululeko/models/model_mlp_regression.py +3 -1
nkululeko/plots.py +25 -37
nkululeko/reporting/reporter.py +69 -6
nkululeko/runmanager.py +8 -11
nkululeko/test_predictor.py +1 -6
nkululeko/utils/stats.py +11 -7
{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/METADATA +13 -1
{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/RECORD +18 -18
{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/WHEEL +1 -1
{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/LICENSE +0 -0
{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/top_level.txt +0 -0

nkululeko/constants.py CHANGED Viewed

@@ -1,2 +1,2 @@
-VERSION="0.86.8"
+VERSION="0.87.0"
 SAMPLING_RATE = 16000

nkululeko/data/dataset_csv.py CHANGED Viewed

@@ -23,6 +23,9 @@ class Dataset_CSV(Dataset):
         root = os.path.dirname(data_file)
         audio_path = self.util.config_val_data(self.name, "audio_path", "./")
         df = pd.read_csv(data_file)
+        # trim all string values
+        df_obj = df.select_dtypes("object")
+        df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
         # special treatment for segmented dataframes with only one column:
         if "start" in df.columns and len(df.columns) == 4:
             index = audformat.segmented_index(
@@ -49,8 +52,7 @@ class Dataset_CSV(Dataset):
                     .map(lambda x: root + "/" + audio_path + "/" + x)
                     .values
                 )
-                df = df.set_index(df.index.set_levels(
-                    file_index, level="file"))
+                df = df.set_index(df.index.set_levels(file_index, level="file"))
             else:
                 if not isinstance(df, pd.DataFrame):
                     df = pd.DataFrame(df)
@@ -59,27 +61,24 @@ class Dataset_CSV(Dataset):
                         lambda x: root + "/" + audio_path + "/" + x
                     )
                 )
-        else: # absolute path is True
+        else:  # absolute path is True
             if audformat.index_type(df.index) == "segmented":
                 file_index = (
-                    df.index.levels[0]
-                    .map(lambda x: audio_path + "/" + x)
-                    .values
+                    df.index.levels[0].map(lambda x: audio_path + "/" + x).values
                 )
-                df = df.set_index(df.index.set_levels(
-                    file_index, level="file"))
+                df = df.set_index(df.index.set_levels(file_index, level="file"))
             else:
                 if not isinstance(df, pd.DataFrame):
                     df = pd.DataFrame(df)
-                df = df.set_index(df.index.to_series().apply(
-                    lambda x: audio_path + "/" + x ))
+                df = df.set_index(
+                    df.index.to_series().apply(lambda x: audio_path + "/" + x)
+                )
         self.df = df
         self.db = None
         self.got_target = True
         self.is_labeled = self.got_target
-        self.start_fresh = eval(
-            self.util.config_val("DATA", "no_reuse", "False"))
+        self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
         is_index = False
         try:
             if self.is_labeled and not "class_label" in self.df.columns:
@@ -106,8 +105,7 @@ class Dataset_CSV(Dataset):
                 f" {self.got_gender}, got age: {self.got_age}"
             )
         self.util.debug(r_string)
-        glob_conf.report.add_item(ReportItem(
-            "Data", "Loaded report", r_string))
+        glob_conf.report.add_item(ReportItem("Data", "Loaded report", r_string))
     def prepare(self):
         super().prepare()

nkululeko/demo.py CHANGED Viewed

@@ -30,10 +30,8 @@ from transformers import pipeline
 def main(src_dir):
-    parser = argparse.ArgumentParser(
-        description="Call the nkululeko DEMO framework.")
-    parser.add_argument("--config", default="exp.ini",
-                        help="The base configuration")
+    parser = argparse.ArgumentParser(description="Call the nkululeko DEMO framework.")
+    parser.add_argument("--config", default="exp.ini", help="The base configuration")
     parser.add_argument(
         "--file", help="A file that should be processed (16kHz mono wav)"
     )
@@ -84,8 +82,7 @@ def main(src_dir):
     )
     def print_pipe(files, outfile):
-        """
-        Prints the pipeline output for a list of files, and optionally writes the results to an output file.
+        """Prints the pipeline output for a list of files, and optionally writes the results to an output file.
         Args:
             files (list): A list of file paths to process through the pipeline.
@@ -108,8 +105,7 @@ def main(src_dir):
                 f.write("\n".join(results))
     if util.get_model_type() == "finetune":
-        model_path = os.path.join(
-            util.get_exp_dir(), "models", "run_0", "torch")
+        model_path = os.path.join(util.get_exp_dir(), "models", "run_0", "torch")
         pipe = pipeline("audio-classification", model=model_path)
         if args.file is not None:
             print_pipe([args.file], args.outfile)

nkululeko/modelrunner.py CHANGED Viewed

@@ -85,7 +85,7 @@ class Modelrunner:
                     f"run: {self.run} epoch: {epoch}: result: {test_score_metric}"
                 )
                 # print(f"performance: {performance.split(' ')[1]}")
-                performance = float(test_score_metric.split(' ')[1])
+                performance = float(test_score_metric.split(" ")[1])
                 if performance > self.best_performance:
                     self.best_performance = performance
                     self.best_epoch = epoch
@@ -204,15 +204,15 @@ class Modelrunner:
                 self.df_train, self.df_test, self.feats_train, self.feats_test
             )
         elif model_type == "cnn":
-            from nkululeko.models.model_cnn import CNN_model
+            from nkululeko.models.model_cnn import CNNModel
-            self.model = CNN_model(
+            self.model = CNNModel(
                 self.df_train, self.df_test, self.feats_train, self.feats_test
             )
         elif model_type == "mlp":
-            from nkululeko.models.model_mlp import MLP_model
+            from nkululeko.models.model_mlp import MLPModel
-            self.model = MLP_model(
+            self.model = MLPModel(
                 self.df_train, self.df_test, self.feats_train, self.feats_test
             )
         elif model_type == "mlp_reg":

nkululeko/models/model.py CHANGED Viewed

@@ -247,8 +247,25 @@ class Model:
                 self.clf.fit(feats, labels)
     def get_predictions(self):
-        predictions = self.clf.predict(self.feats_test.to_numpy())
-        return predictions
+        #        predictions = self.clf.predict(self.feats_test.to_numpy())
+        if self.util.exp_is_classification():
+            # make a dataframe for the class probabilities
+            proba_d = {}
+            for c in self.clf.classes_:
+                proba_d[c] = []
+            # get the class probabilities
+            predictions = self.clf.predict_proba(self.feats_test.to_numpy())
+            # pred = self.clf.predict(features)
+            for i, c in enumerate(self.clf.classes_):
+                proba_d[c] = list(predictions.T[i])
+            probas = pd.DataFrame(proba_d)
+            probas = probas.set_index(self.feats_test.index)
+            predictions = probas.idxmax(axis=1).values
+        else:
+            predictions = self.clf.predict(self.feats_test.to_numpy())
+            probas = None
+        return predictions, probas
     def predict(self):
         if self.feats_test.isna().to_numpy().any():
@@ -263,13 +280,16 @@ class Model:
             )
             return report
         """Predict the whole eval feature set"""
-        predictions = self.get_predictions()
+        predictions, probas = self.get_predictions()
         report = Reporter(
             self.df_test[self.target].to_numpy().astype(float),
             predictions,
             self.run,
             self.epoch,
+            probas=probas,
         )
+        report.print_probabilities()
         return report
     def get_type(self):

nkululeko/models/model_cnn.py CHANGED Viewed

@@ -5,33 +5,40 @@ Inspired by code from Su Lei
 """
+import ast
+from collections import OrderedDict
+import numpy as np
+import pandas as pd
+from PIL import Image
+from sklearn.metrics import recall_score
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import torchvision
-import torchvision.transforms as transforms
 from torch.utils.data import Dataset
-import ast
-import numpy as np
-from sklearn.metrics import recall_score
-from collections import OrderedDict
-from PIL import Image
-from traitlets import default
+import torchvision.transforms as transforms
-from nkululeko.utils.util import Util
 import nkululeko.glob_conf as glob_conf
+from nkululeko.losses.loss_softf1loss import SoftF1Loss
 from nkululeko.models.model import Model
 from nkululeko.reporting.reporter import Reporter
-from nkululeko.losses.loss_softf1loss import SoftF1Loss
+from nkululeko.utils.util import Util
-class CNN_model(Model):
-    """CNN = convolutional neural net"""
+class CNNModel(Model):
+    """CNN = convolutional neural net."""
     is_classifier = True
     def __init__(self, df_train, df_test, feats_train, feats_test):
-        """Constructor taking the configuration and all dataframes"""
+        """Constructor, taking all dataframes.
+        Args:
+            df_train (pd.DataFrame): The train labels.
+            df_test (pd.DataFrame): The test labels.
+            feats_train (pd.DataFrame): The train features.
+            feats_test (pd.DataFrame): The test features.
+        """
         super().__init__(df_train, df_test, feats_train, feats_test)
         super().set_model_type("ann")
         self.name = "cnn"
@@ -147,7 +154,20 @@ class CNN_model(Model):
             self.optimizer.step()
         self.loss = (np.asarray(losses)).mean()
-    def evaluate_model(self, model, loader, device):
+    def get_probas(self, logits):
+        # make a dataframe for probabilites (logits)
+        proba_d = {}
+        classes = self.df_test[self.target].unique()
+        classes.sort()
+        for c in classes:
+            proba_d[c] = []
+        for i, c in enumerate(classes):
+            proba_d[c] = list(logits.numpy().T[i])
+        probas = pd.DataFrame(proba_d)
+        probas = probas.set_index(self.df_test.index)
+        return probas
+    def evaluate(self, model, loader, device):
         logits = torch.zeros(len(loader.dataset), self.class_num)
         targets = torch.zeros(len(loader.dataset))
         model.eval()
@@ -169,14 +189,15 @@ class CNN_model(Model):
         self.loss_eval = (np.asarray(losses)).mean()
         predictions = logits.argmax(dim=1)
         uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
-        return uar, targets, predictions
+        return uar, targets, predictions, logits
     def predict(self):
-        _, truths, predictions = self.evaluate_model(
+        _, truths, predictions, logits = self.evaluate(
             self.model, self.testloader, self.device
         )
-        uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
-        report = Reporter(truths, predictions, self.run, self.epoch)
+        uar, _, _, _ = self.evaluate(self.model, self.trainloader, self.device)
+        probas = self.get_probas(logits)
+        report = Reporter(truths, predictions, self.run, self.epoch, probas=probas)
         try:
             report.result.loss = self.loss
         except AttributeError:  # if the model was loaded from disk the loss is unknown
@@ -189,13 +210,11 @@ class CNN_model(Model):
         return report
     def get_predictions(self):
-        _, truths, predictions = self.evaluate_model(
-            self.model, self.testloader, self.device
-        )
+        _, _, predictions, _ = self.evaluate(self.model, self.testloader, self.device)
         return predictions.numpy()
     def predict_sample(self, features):
-        """Predict one sample"""
+        """Predict one sample."""
         with torch.no_grad():
             logits = self.model(torch.from_numpy(features).to(self.device))
         a = logits.numpy()

nkululeko/models/model_mlp.py CHANGED Viewed

@@ -1,25 +1,33 @@
 # model_mlp.py
+import ast
+from collections import OrderedDict
+import numpy as np
 import pandas as pd
+from sklearn.metrics import recall_score
+import torch
-from nkululeko.utils.util import Util
 import nkululeko.glob_conf as glob_conf
+from nkululeko.losses.loss_softf1loss import SoftF1Loss
 from nkululeko.models.model import Model
 from nkululeko.reporting.reporter import Reporter
-import torch
-import ast
-import numpy as np
-from sklearn.metrics import recall_score
-from collections import OrderedDict
-from nkululeko.losses.loss_softf1loss import SoftF1Loss
+from nkululeko.utils.util import Util
-class MLP_model(Model):
+class MLPModel(Model):
     """MLP = multi layer perceptron."""
     is_classifier = True
     def __init__(self, df_train, df_test, feats_train, feats_test):
-        """Constructor taking the configuration and all dataframes."""
+        """Constructor, taking all dataframes.
+        Args:
+            df_train (pd.DataFrame): The train labels.
+            df_test (pd.DataFrame): The test labels.
+            feats_train (pd.DataFrame): The train features.
+            feats_test (pd.DataFrame): The test features.
+        """
         super().__init__(df_train, df_test, feats_train, feats_test)
         super().set_model_type("ann")
         self.name = "mlp"
@@ -97,7 +105,7 @@ class MLP_model(Model):
             self.optimizer.step()
         self.loss = (np.asarray(losses)).mean()
-    def evaluate_model(self, model, loader, device):
+    def evaluate(self, model, loader, device):
         logits = torch.zeros(len(loader.dataset), self.class_num)
         targets = torch.zeros(len(loader.dataset))
         model.eval()
@@ -119,14 +127,28 @@ class MLP_model(Model):
         self.loss_eval = (np.asarray(losses)).mean()
         predictions = logits.argmax(dim=1)
         uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
-        return uar, targets, predictions
+        return uar, targets, predictions, logits
+    def get_probas(self, logits):
+        # make a dataframe for probabilites (logits)
+        proba_d = {}
+        classes = self.df_test[self.target].unique()
+        classes.sort()
+        for c in classes:
+            proba_d[c] = []
+        for i, c in enumerate(classes):
+            proba_d[c] = list(logits.numpy().T[i])
+        probas = pd.DataFrame(proba_d)
+        probas = probas.set_index(self.df_test.index)
+        return probas
     def predict(self):
-        _, truths, predictions = self.evaluate_model(
+        _, truths, predictions, logits = self.evaluate(
             self.model, self.testloader, self.device
         )
-        uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
-        report = Reporter(truths, predictions, self.run, self.epoch)
+        uar, _, _, _ = self.evaluate(self.model, self.trainloader, self.device)
+        probas = self.get_probas(logits)
+        report = Reporter(truths, predictions, self.run, self.epoch, probas=probas)
         try:
             report.result.loss = self.loss
         except AttributeError:  # if the model was loaded from disk the loss is unknown
@@ -139,9 +161,7 @@ class MLP_model(Model):
         return report
     def get_predictions(self):
-        _, truths, predictions = self.evaluate_model(
-            self.model, self.testloader, self.device
-        )
+        _, _, predictions, _ = self.evaluate(self.model, self.testloader, self.device)
         return predictions.numpy()
     def get_loader(self, df_x, df_y, shuffle):

nkululeko/models/model_mlp_regression.py CHANGED Viewed

@@ -97,7 +97,9 @@ class MLP_Reg_model(Model):
             self.model, self.testloader, self.device
         )
         result, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
-        report = Reporter(truths.numpy(), predictions.numpy(), self.run, self.epoch)
+        report = Reporter(
+            truths.numpy(), predictions.numpy(), None, self.run, self.epoch
+        )
         try:
             report.result.loss = self.loss
         except AttributeError:  # if the model was loaded from disk the loss is unknown

nkululeko/plots.py CHANGED Viewed

@@ -48,7 +48,7 @@ class Plots:
             )
             ax.set_ylabel(f"number of speakers")
             ax.set_xlabel("number of samples")
-            self._save_plot(
+            self.save_plot(
                 ax,
                 "Samples per speaker",
                 f"Samples per speaker ({df_speakers.shape[0]})",
@@ -70,9 +70,9 @@ class Plots:
                     rot=0,
                 )
             )
-            ax.set_ylabel(f"number of speakers")
+            ax.set_ylabel("number of speakers")
             ax.set_xlabel("number of samples")
-            self._save_plot(
+            self.save_plot(
                 ax,
                 "Sample value counts",
                 f"Samples per speaker ({df_speakers.shape[0]})",
@@ -96,7 +96,7 @@ class Plots:
             binned_data = self.util.continuous_to_categorical(df[class_label])
             ax = binned_data.value_counts().plot(kind="bar")
             filename_binned = f"{class_label}_discreet"
-            self._save_plot(
+            self.save_plot(
                 ax,
                 "Sample value counts",
                 filename_binned,
@@ -106,7 +106,7 @@ class Plots:
             dist_type = self.util.config_val("EXPL", "dist_type", "hist")
             ax = df[class_label].plot(kind=dist_type)
-        self._save_plot(
+        self.save_plot(
             ax,
             "Sample value counts",
             filename,
@@ -131,17 +131,17 @@ class Plots:
                             df, class_label, att1, self.target, type_s
                         )
                     else:
-                        ax, caption = self._plotcatcont(
+                        ax, caption = self.plotcatcont(
                             df, class_label, att1, att1, type_s
                         )
                 else:
                     if self.util.is_categorical(df[att1]):
-                        ax, caption = self._plotcatcont(
+                        ax, caption = self.plotcatcont(
                             df, att1, class_label, att1, type_s
                         )
                     else:
                         ax, caption = self._plot2cont(df, class_label, att1, type_s)
-                self._save_plot(
+                self.save_plot(
                     ax,
                     caption,
                     f"Correlation of {self.target} and {att[0]}",
@@ -171,15 +171,11 @@ class Plots:
                             ax, caption = self._plot2cat(df, att1, att2, att1, type_s)
                         else:
                             # class_label = cat, att1 = cat, att2 = cont
-                            ax, caption = self._plotcatcont(
-                                df, att1, att2, att1, type_s
-                            )
+                            ax, caption = self.plotcatcont(df, att1, att2, att1, type_s)
                     else:
                         if self.util.is_categorical(df[att2]):
                             # class_label = cat, att1 = cont, att2 = cat
-                            ax, caption = self._plotcatcont(
-                                df, att2, att1, att2, type_s
-                            )
+                            ax, caption = self.plotcatcont(df, att2, att1, att2, type_s)
                         else:
                             # class_label = cat, att1 = cont, att2 = cont
                             ax, caption = self._plot2cont_cat(
@@ -205,7 +201,7 @@ class Plots:
                             # class_label = cont, att1 = cont, att2 = cont
                             ax, caption = self._plot2cont(df, att1, att2, type_s)
-                self._save_plot(
+                self.save_plot(
                     ax, caption, f"Correlation of {att1} and {att2}", filename, type_s
                 )
@@ -215,16 +211,16 @@ class Plots:
                     f" {att} has more than 2 values. Perhaps you forgot to state a list of lists?"
                 )
-    def _save_plot(self, ax, caption, header, filename, type_s):
+    def save_plot(self, ax, caption, header, filename, type_s):
         # one up because of the runs
         fig_dir = self.util.get_path("fig_dir") + "../"
-        fig = ax.figure
+        fig_plots = ax.figure
         # avoid warning
         # plt.tight_layout()
         img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
         plt.savefig(img_path)
-        plt.close(fig)
-        # fig.clear()   # avoid error
+        plt.close(fig_plots)
+        self.util.debug(f"Saved plot to {img_path}")
         glob_conf.report.add_item(
             ReportItem(
                 Header.HEADER_EXPLORE,
@@ -244,35 +240,29 @@ class Plots:
         return att, df
     def _plot2cont_cat(self, df, cont1, cont2, cat, ylab):
-        """
-        plot relation of two continuous distributions with one categorical
-        """
+        """Plot relation of two continuous distributions with one categorical."""
         pearson = stats.pearsonr(df[cont1], df[cont2])
         # trunc to three digits
         pearson = int(pearson[0] * 1000) / 1000
         pearson_string = f"PCC: {pearson}"
         ax = sns.lmplot(data=df, x=cont1, y=cont2, hue=cat)
         caption = f"{ylab} {df.shape[0]}. {pearson_string}"
-        ax.fig.suptitle(caption)
+        ax.figure.suptitle(caption)
         return ax, caption
     def _plot2cont(self, df, col1, col2, ylab):
-        """
-        plot relation of two continuous distributions
-        """
+        """Plot relation of two continuous distributions."""
         pearson = stats.pearsonr(df[col1], df[col2])
         # trunc to three digits
         pearson = int(pearson[0] * 1000) / 1000
         pearson_string = f"PCC: {pearson}"
         ax = sns.lmplot(data=df, x=col1, y=col2)
         caption = f"{ylab} {df.shape[0]}. {pearson_string}"
-        ax.fig.suptitle(caption)
+        ax.figure.suptitle(caption)
         return ax, caption
-    def _plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
-        """
-        plot relation of categorical distribution with continuous
-        """
+    def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
+        """Plot relation of categorical distribution with continuous."""
         dist_type = self.util.config_val("EXPL", "dist_type", "hist")
         cats, cat_str, es = su.get_effect_size(df, cat_col, cont_col)
         if dist_type == "hist":
@@ -287,13 +277,11 @@ class Plots:
             )
             ax.set(xlabel=f"{cont_col}")
             caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
-            ax.fig.suptitle(caption)
+            ax.figure.suptitle(caption)
         return ax, caption
     def _plot2cat(self, df, col1, col2, xlab, ylab):
-        """
-        plot relation of 2 categorical distributions
-        """
+        """Plot relation of 2 categorical distributions."""
         crosstab = pd.crosstab(index=df[col1], columns=df[col2])
         res_pval = stats.chi2_contingency(crosstab)
         res_pval = int(res_pval[1] * 1000) / 1000
@@ -320,8 +308,8 @@ class Plots:
         max = self.util.to_3_digits(df.duration.max())
         title = f"Duration distr. for {sample_selection} {df.shape[0]}. min={min}, max={max}"
         ax.set_title(title)
-        ax.set_xlabel(f"duration")
-        ax.set_ylabel(f"number of samples")
+        ax.set_xlabel("duration")
+        ax.set_ylabel("number of samples")
         fig = ax.figure
         # plt.tight_layout()
         img_path = f"{fig_dir}{filename}_{sample_selection}.{self.format}"

nkululeko/reporting/reporter.py CHANGED Viewed

@@ -2,16 +2,21 @@ import ast
 import glob
 import json
 import math
+import os
 from confidence_intervals import evaluate_with_conf_int
 import matplotlib.pyplot as plt
 import numpy as np
+from scipy.special import softmax
+from scipy.stats import entropy
 from scipy.stats import pearsonr
-from sklearn.metrics import ConfusionMatrixDisplay, roc_curve
+from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.metrics import auc
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix
 from sklearn.metrics import r2_score
-from sklearn.metrics import roc_curve, auc, roc_auc_score
+from sklearn.metrics import roc_auc_score
+from sklearn.metrics import roc_curve
 from torch import is_tensor
 from audmetric import accuracy
@@ -21,6 +26,7 @@ from audmetric import mean_squared_error
 from audmetric import unweighted_average_recall
 import nkululeko.glob_conf as glob_conf
+from nkululeko.plots import Plots
 from nkululeko.reporting.defines import Header
 from nkululeko.reporting.report_item import ReportItem
 from nkululeko.reporting.result import Result
@@ -46,9 +52,18 @@ class Reporter:
                 self.MEASURE = "CCC"
                 self.result.measure = self.MEASURE
-    def __init__(self, truths, preds, run, epoch):
-        """Initialization with ground truth und predictions vector."""
+    def __init__(self, truths, preds, run, epoch, probas=None):
+        """Initialization with ground truth und predictions vector.
+        Args:
+            truths (list): the ground truth
+            preds (list): the predictions
+            run (int): number of run
+            epoch (int): number of epoch
+            probas (pd.Dataframe, optional): probabilities per class. Defaults to None.
+        """
         self.util = Util("reporter")
+        self.probas = probas
         self.format = self.util.config_val("PLOT", "format", "png")
         self.truths = np.asarray(truths)
         self.preds = np.asarray(preds)
@@ -108,6 +123,47 @@ class Reporter:
                 self.result.test = test_result
                 self.result.set_upper_lower(upper, lower)
                 # train and loss are being set by the model
+        # print out the class  probilities
+    def print_probabilities(self):
+        """Print the probabilities per class to a file in the store."""
+        if (
+            self.util.exp_is_classification()
+            and self.probas is not None
+            and "uncertainty" not in self.probas
+        ):
+            probas = self.probas
+            probas["predicted"] = self.preds
+            probas["truth"] = self.truths
+            # softmax the probabilities or logits
+            uncertainty = probas.apply(softmax, axis=1)
+            try:
+                le = glob_conf.label_encoder
+                mapping = dict(zip(le.classes_, range(len(le.classes_))))
+                mapping_reverse = {value: key for key, value in mapping.items()}
+                probas = probas.rename(columns=mapping_reverse)
+                probas["predicted"] = probas["predicted"].map(mapping_reverse)
+                probas["truth"] = probas["truth"].map(mapping_reverse)
+            except AttributeError as ae:
+                self.util.debug(f"Can't label categories: {ae}")
+            # compute entropy per sample
+            uncertainty = uncertainty.apply(entropy)
+            # scale it to 0-1
+            max_ent = math.log(len(glob_conf.labels))
+            uncertainty = (uncertainty - uncertainty.min()) / (
+                max_ent - uncertainty.min()
+            )
+            probas["uncertainty"] = uncertainty
+            probas["correct"] = probas.predicted == probas.truth
+            sp = os.path.join(self.util.get_path("store"), "pred_df.csv")
+            self.probas = probas
+            probas.to_csv(sp)
+            self.util.debug(f"Saved probabilities to {sp}")
+            plots = Plots()
+            ax, caption = plots.plotcatcont(
+                probas, "correct", "uncertainty", "uncertainty", "correct"
+            )
+            plots.save_plot(ax, caption, "Uncertainty", "uncertainty", "samples")
     def set_id(self, run, epoch):
         """Make the report identifiable with run and epoch index."""
@@ -123,6 +179,12 @@ class Reporter:
         self.preds = np.digitize(self.preds, bins) - 1
     def plot_confmatrix(self, plot_name, epoch=None):
+        """Plot a confusionmatrix to the store.
+        Args:
+            plot_name (str): name for the image file.
+            epoch (int, optional): Number of epoch. Defaults to None.
+        """
         if not self.util.exp_is_classification():
             self.continuous_to_categorical()
         self._plot_confmat(self.truths, self.preds, plot_name, epoch)
@@ -212,10 +274,11 @@ class Reporter:
             )
         img_path = f"{fig_dir}{plot_name}{self.filenameadd}.{self.format}"
         plt.savefig(img_path)
+        self.util.debug(f"Saved confusion plot to {img_path}")
         fig.clear()
         plt.close(fig)
-        plt.savefig(img_path)
-        plt.close(fig)
+        plt.close()
+        plt.clf()
         glob_conf.report.add_item(
             ReportItem(
                 Header.HEADER_RESULTS,

nkululeko/runmanager.py CHANGED Viewed

@@ -11,7 +11,7 @@ from nkululeko.utils.util import Util
 class Runmanager:
-    """Class to manage the runs of the experiment (e.g. when results differ caused by random initialization)"""
+    """Class to manage the runs of the experiment (e.g. when results differ caused by random initialization)."""
     model = None  # The underlying model
     df_train, df_test, feats_train, feats_test = (
@@ -23,15 +23,14 @@ class Runmanager:
     reports = []
     def __init__(self, df_train, df_test, feats_train, feats_test):
-        """Constructor setting up the dataframes
+        """Constructor setting up the dataframes.
         Args:
             df_train: train dataframe
             df_test: test dataframe
             feats_train: train features
             feats_train: test features
-        Returns:
         """
         self.df_train, self.df_test, self.feats_train, self.feats_test = (
             df_train,
@@ -46,7 +45,7 @@ class Runmanager:
         # self._select_model(model_type)
     def do_runs(self):
-        """Start the runs"""
+        """Start the runs."""
         self.best_results = []  # keep the best result per run
         self.last_epochs = []  # keep the epoch of best result per run
         # for all runs
@@ -105,15 +104,13 @@ class Runmanager:
                 )
                 self.print_model(best_report, plot_name)
             # finally, print out the numbers for this run
-            # self.reports[-1].print_results(
-            #     int(self.util.config_val("EXP", "epochs", 1))
-            # )
             best_report.print_results(best_report.epoch)
+            best_report.print_probabilities()
             self.best_results.append(best_report)
             self.last_epochs.append(last_epoch)
     def print_best_result_runs(self):
-        """Print the best result for all runs"""
+        """Print the best result for all runs."""
         best_report = self.get_best_result(self.best_results)
         self.util.debug(
             f"best result all runs with run {best_report.run}             and"
@@ -177,7 +174,7 @@ class Runmanager:
         return self.load_model(best_report)
     def get_best_result(self, reports):
-        best_r = Reporter([], [], 0, 0)
+        best_r = Reporter([], [], None, 0, 0)
         if self.util.high_is_good():
             best_r = self.search_best_result(reports, "ascending")
         else:
@@ -185,7 +182,7 @@ class Runmanager:
         return best_r
     def search_best_result(self, reports, order):
-        best_r = Reporter([], [], 0, 0)
+        best_r = Reporter([], [], None, 0, 0)
         if order == "ascending":
             best_result = 0
             for r in reports:

nkululeko/test_predictor.py CHANGED Viewed

@@ -56,18 +56,13 @@ class TestPredictor:
         else:
             test_dbs = ast.literal_eval(glob_conf.config["DATA"]["tests"])
             test_dbs_string = "_".join(test_dbs)
-            predictions = self.model.get_predictions()
+            predictions, _ = self.model.get_predictions()
             report = self.model.predict()
             result = report.result.get_result()
             report.set_filename_add(f"test-{test_dbs_string}")
             self.util.print_best_results([report])
             report.plot_confmatrix(self.util.get_plot_name(), 0)
             report.print_results(0)
-            # print(predictions)
-            # df = pd.DataFrame(index=self.orig_df.index)
-            # df["speaker"] = self.orig_df["speaker"]
-            # df["gender"] = self.orig_df["gender"]
-            # df[self.target] = self.orig_df[self.target]
             df = self.orig_df.copy()
             df["predictions"] = self.label_encoder.inverse_transform(predictions)
             target = self.util.config_val("DATA", "target", "emotion")

nkululeko/utils/stats.py CHANGED Viewed

@@ -70,12 +70,16 @@ def get_effect_size(df, target, variable):
         cats[c] = df[df[target] == c][variable].values
     combos = all_combinations(categories)
     results = {}
-    for combo in combos:
-        one = combo[0]
-        other = combo[1]
-        results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
-    max_cat = max(results, key=results.get)
-    cat_s = cohens_D_to_string(float(results[max_cat]))
+    if len(categories) == 1:
+        cat_s = cohens_D_to_string(0)
+        return categories[0], cat_s, 0
+    else:
+        for combo in combos:
+            one = combo[0]
+            other = combo[1]
+            results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
+        max_cat = max(results, key=results.get)
+        cat_s = cohens_D_to_string(float(results[max_cat]))
     return max_cat, cat_s, results[max_cat]
@@ -92,7 +96,7 @@ def cohens_D_to_string(val):
 def normalize(values):
-    """Do a z-transformation of a distribution.
+    """Do a z-transformation of a distribution.
     So that mean = 0 and variance = 1
     """

{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nkululeko
-Version: 0.86.8
+Version: 0.87.0
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt
@@ -51,6 +51,7 @@ Requires-Dist: pylatex
   - [t-SNE plots](#t-sne-plots)
   - [Data distribution](#data-distribution)
   - [Bias checking](#bias-checking)
+  - [Uncertainty](#uncertainty)
 - [Documentation](#documentation)
 - [Installation](#installation)
 - [Usage](#usage)
@@ -113,6 +114,13 @@ In cases you might wonder if there's bias in your data. You can try to detect th
 <img src="meta/images/emotion-pesq.png" width="500px"/>
+### Uncertainty
+Nkululeko estimates the uncertainty of model decisions (for classifiers only) as the entropy over the class probabilities or logits per sample.
+<img src="meta/images/uncertainty.png" width="500px"/>
 ## Documentation
 The documentation, along with extensions of installation, usage, INI file format, and examples, can be found at [nkululeko.readthedocs.io](https://nkululeko.readthedocs.io).
@@ -343,6 +351,10 @@ F. Burkhardt, Johannes Wagner, Hagen Wierstorf, Florian Eyben and Björn Schulle
 Changelog
 =========
+Version 0.87.0
+--------------
+* added class probability output and uncertainty analysis
 Version 0.86.8
 --------------
 * handle single feature sets as strings in the config

{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/RECORD RENAMED Viewed

@@ -2,8 +2,8 @@ nkululeko/__init__.py,sha256=62f8HiEzJ8rG2QlTFJXUCMpvuH3fKI33DoJSj33mscc,63
 nkululeko/aug_train.py,sha256=YhuZnS_WVWnun9G-M6g5n6rbRxoVREz6Zh7k6qprFNQ,3194
 nkululeko/augment.py,sha256=4MG0apTAG5RgkuJrYEjGgDdbodZWi_HweSPNI1JJ5QA,3051
 nkululeko/cacheddataset.py,sha256=lIJ6hUo5LoxSrzXtWV8mzwO7wRtUETWnOQ4ws2XfL1E,969
-nkululeko/constants.py,sha256=FOK-XF_DHGNFHsO_OMLof3jwgrn2buWnPVfrHy5QBm8,39
-nkululeko/demo.py,sha256=WSKr-W5uJ9DQfemK923g7Hd5V3kgAn03Er0JX1Pa45I,5142
+nkululeko/constants.py,sha256=qVowcvAZL-g-Bsp_4yBCOQDkCoW-S-1wrRG5XgnjnX0,39
+nkululeko/demo.py,sha256=Sqbu3o6Pzdr_UlYxWM8Mn3l5uCXsw429yJbtkVDUYHU,5087
 nkululeko/demo_feats.py,sha256=sAeGFojhEj9WEDFtG3SzPBmyYJWLF2rkbpp65m8Ujo4,2025
 nkululeko/demo_predictor.py,sha256=es56xbT8ifkS_vnrlb5NTZT54gNmeUtNlA4zVA_gnN8,4757
 nkululeko/experiment.py,sha256=s9PIjm45dR9yzmHu_69JpBjX9qMVzi5wIgPfMR3F44A,31530
@@ -13,19 +13,19 @@ nkululeko/feature_extractor.py,sha256=rL-TybLmjZz5uxT9LNTORaDat9FKp_1qloxbyMriny
 nkululeko/file_checker.py,sha256=LoLnL8aHpW-axMQ46qbqrManTs5otG9ShpEZuz9iRSk,3474
 nkululeko/filter_data.py,sha256=w-X2mhKdYr5DxDIz50E5yzO6Jmzk4jjDBoXsgOOVtcA,7222
 nkululeko/glob_conf.py,sha256=KL9YJQTHvTztxo1vr25qRRgaPnx4NTg0XrdbovKGMmw,525
-nkululeko/modelrunner.py,sha256=OU35qwP94GxW_EtL4I2-RhqB-wxbjNvp8CIHNbtnt7Q,11155
+nkululeko/modelrunner.py,sha256=rpWQRXERiDZ-i_7CwsqynI87vawtsaPihsonDMPe9PU,11151
 nkululeko/multidb.py,sha256=fG3VukEWP1vreVN4gB1IRXxwwg4jLftsSEYtu0o1f78,5634
 nkululeko/nkuluflag.py,sha256=PGWSmZz-PiiHLgcZJAoGOI_Y-sZDVI1ksB8p5r7riWM,3725
 nkululeko/nkululeko.py,sha256=Kn3s2E3yyH8cJ7z6lkMxrnqtCxTu7-qfe9Zr_ONTD5g,1968
-nkululeko/plots.py,sha256=C2mwQFK0Vxfl5ZM7CO87tULDoEf7G16ek0nU77bhOc4,23070
+nkululeko/plots.py,sha256=WsI_dtPKfrYPsKymHRmIhqj33aZzTcE8fF_EwLkm_5A,22899
 nkululeko/predict.py,sha256=sF091sSSLnEWcISx9ZcULLie3tY5XeFsQJd6b3vrxFg,2409
 nkululeko/resample.py,sha256=2d9eao_0sLrGZ_KSl8OVKsPor3BkFrlmMhrpB9WelIs,4267
-nkululeko/runmanager.py,sha256=Na8oPn59lRFiNMsYChRHBRgw40mBcw0Rwl2Kz1RUsA0,7614
+nkululeko/runmanager.py,sha256=eRMJidkoJhkU5NdIKoozv3vovU-8tqfn-7zqr2JZcnE,7533
 nkululeko/scaler.py,sha256=4nkIqoajkIkuTPK0Z02ifMN_awl6fP_i-GBYdoGYgGM,4101
 nkululeko/segment.py,sha256=YLKckX44tbvTb3LrdgYw9X4guzuF27sutl92z9DkpZU,4835
 nkululeko/syllable_nuclei.py,sha256=Sky-C__MeUDaxqHnDl2TGLLYOYvsahD35TUjWGeG31k,10047
 nkululeko/test.py,sha256=1w624vo5KTzmFC8BUStGlLDmIEAFuJUz7J0W-gp7AxI,1677
-nkululeko/test_predictor.py,sha256=_w5J8CxH6hmW3mLTKbdfmywl5QpdNAnW1Y8TE5GtlfE,3237
+nkululeko/test_predictor.py,sha256=KaGef_r4mXW89f0aUiYDw8IiBe2ciGt14HNkR-S14lU,2985
 nkululeko/test_pretrain.py,sha256=ZWl-bR6nmeSmXkGAIE6zyfQEjN8Zg0rIxfaS-O6Zbas,8465
 nkululeko/augmenting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/augmenting/augmenter.py,sha256=XAt0dpmlnKxqyysqCgV3rcz-pRIvOz7rU7dmGDCVAzs,2905
@@ -46,7 +46,7 @@ nkululeko/autopredict/ap_valence.py,sha256=n-hctRKySzhmJtowuMOTUu0T_ld3uK5pnfOzW
 nkululeko/autopredict/estimate_snr.py,sha256=S-bpS0xFkwWc4Ch75UrjbS8y538lQ0U3g_iLRFXureY,5048
 nkululeko/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/data/dataset.py,sha256=hUD0NqWCfRaSHG8JNs1MsPb0zjUZAf8FJkg_c0ebq0Q,28046
-nkululeko/data/dataset_csv.py,sha256=dzOrbKB8t0UATAIYaKAOqHTogmYPBqskt6Hak7VjbSM,4537
+nkululeko/data/dataset_csv.py,sha256=UGEpi__eT2KFS6Fop6N4HkMrzO-u5VP71gt44kwZavo,4588
 nkululeko/feat_extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/feat_extract/feats_agender.py,sha256=Qm69G4kqAyTVVk7wwRgrXlNwGaDMGRYyKGpuf0vOEgM,3113
 nkululeko/feat_extract/feats_agender_agender.py,sha256=tgH2BnwcxpvuLmOkrMbVdBSX0Onfz2MG12FsddalRKI,3424
@@ -75,15 +75,15 @@ nkululeko/losses/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 nkululeko/losses/loss_ccc.py,sha256=NOK0y0fxKUnU161B5geap6Fmn8QzoPl2MqtPiV8IuJE,976
 nkululeko/losses/loss_softf1loss.py,sha256=5gW-PuiqeAZcRgfwjueIOQtMokOjZWgQnVIv59HKTCo,1309
 nkululeko/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nkululeko/models/model.py,sha256=PUCqF2r_dEfmFsZn6Cgr1UIzYvxziLH6nSqZ5-vuN1o,11639
+nkululeko/models/model.py,sha256=JXrd0fbU0JhTxUDrs0kOEHF9rtPJBxBeO6zcrHAzk8k,12475
 nkululeko/models/model_bayes.py,sha256=WJFZ8wFKwWATz6MhmjeZIi1Pal1viU549WL_PjXDSy8,406
-nkululeko/models/model_cnn.py,sha256=bJxqwe6FnVR2hFeqN6EXexYGgvKYFED1VOhBXVlLWaE,9954
+nkululeko/models/model_cnn.py,sha256=NreR2LrKMyBYHyIJEL6wm3UQ4mA5HleZfpUyA5wNYpA,10629
 nkululeko/models/model_gmm.py,sha256=hZ9UO36KNf48qa3J-xkWIicIj9-TApmt21zNES2vEOs,649
 nkululeko/models/model_knn.py,sha256=KlnrJfwiVnmXZrAaYGFrKA2f5sznvTzSJQ8-5etOP0k,599
 nkululeko/models/model_knn_reg.py,sha256=j7YFfVm6xOR2d9yBYdQiwwqYfqkX0JynX_qLCvkr1fk,610
 nkululeko/models/model_lin_reg.py,sha256=0D7mSnSwK82lNWDMwHYRyq3FmGa6y-DHDGg4qUe85q4,422
-nkululeko/models/model_mlp.py,sha256=xMirtYax3bLBz_0kkC0M4Rc6-KQY05NNKHQGw7rbum8,9856
-nkululeko/models/model_mlp_regression.py,sha256=PO5qyfjgAJH8hawhmeXDaUThyXDYdM642dQHkO0NY7c,10204
+nkululeko/models/model_mlp.py,sha256=VE0CI19qMyRbI-THDkMeJ7JbWf4z7CmZ4MMs1FIQgtM,10557
+nkululeko/models/model_mlp_regression.py,sha256=7oK2zQhhCegSqiBUe6eU7Av8MJ_DPLA9skixJcHaVfg,10232
 nkululeko/models/model_svm.py,sha256=rsME3KvKvNG7bdE5lbvYUu85WZhaASZxxmdNDIVJRZ4,940
 nkululeko/models/model_svr.py,sha256=_YZeksqB3eBENGlg3g9RwYFlk9rQQ-XCeNBKLlGGVoE,725
 nkululeko/models/model_tree.py,sha256=rf16faUm4o2LJgkoYpeY998b8DQIvXZ73_m1IS3TnnE,417
@@ -96,17 +96,17 @@ nkululeko/reporting/defines.py,sha256=IsY1YgKRMaABpylVKjBJgJ5bNCEbGCVA_E6pivraqS
 nkululeko/reporting/latex_writer.py,sha256=qiCRSmB4KOD_za4oHu5x-PhwjZohzfo8wecMOwlXZwc,1886
 nkululeko/reporting/report.py,sha256=W0rcigDdjBvxZQ3pZja_gvToILYvaZ1BFtnN2qFRfYI,1060
 nkululeko/reporting/report_item.py,sha256=siWeGNgo4bAE46YBMNcsdf3jTMTy76BO9Fi6DTvDig4,533
-nkululeko/reporting/reporter.py,sha256=S9A62AxdMTEV-9XDUQNxdoevGLXBP52WiDmZ694QMV4,14161
+nkululeko/reporting/reporter.py,sha256=6zW3PmQrwVJO5orBVA-fiaIhnzGrFymC861DSd8nSjc,16806
 nkululeko/reporting/result.py,sha256=nSN5or-Py2GPRWHkWpGRh7UCi1W0er7WLEHz8fYLk-A,742
 nkululeko/segmenting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/segmenting/seg_inaspeechsegmenter.py,sha256=pmLHuXsaqvcdYxB4PSW9l1mbQWZZBJFhi_CGabqydas,1947
 nkululeko/segmenting/seg_silero.py,sha256=lLytS38KzARS17omwv8VBw-zz60RVSXGSvZ5EvWlcWQ,3301
 nkululeko/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/utils/files.py,sha256=UiGAtZRWYjHSvlmPaTMtzyNNGE6qaLaxQkybctS7iRM,4021
-nkululeko/utils/stats.py,sha256=1yUq0FTOyqkU8TwUocJRYdJaqMU5SlOBBRUun9STo2M,2829
+nkululeko/utils/stats.py,sha256=eC9dMO-by6CDnGLHDBQu-2B4-BudZNJ0nnWGhKYdUMA,2968
 nkululeko/utils/util.py,sha256=ZCS02mE2c3_h9_q4hpsSm4XAooCranqRF_5pY-6055E,14432
-nkululeko-0.86.8.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
-nkululeko-0.86.8.dist-info/METADATA,sha256=5TQSWqzrN9E7XJGcVn5oPKGl6qy-RliYGEG2Ycl46qk,38109
-nkululeko-0.86.8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-nkululeko-0.86.8.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
-nkululeko-0.86.8.dist-info/RECORD,,
+nkululeko-0.87.0.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
+nkululeko-0.87.0.dist-info/METADATA,sha256=DPO61pORcuEhRsDwB5S5VJ8CK_piJeh-I5kKJc8eNJE,38442
+nkululeko-0.87.0.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
+nkululeko-0.87.0.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
+nkululeko-0.87.0.dist-info/RECORD,,

{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: setuptools (70.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{nkululeko-0.86.8.dist-info → nkululeko-0.87.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

nkululeko 0.86.8__py3-none-any.whl → 0.87.0__py3-none-any.whl

nkululeko 0.86.8py3-none-any.whl → 0.87.0py3-none-any.whl