nkululeko 0.86.7__tar.gz → 0.87.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. {nkululeko-0.86.7 → nkululeko-0.87.0}/CHANGELOG.md +8 -0
  2. {nkululeko-0.86.7/nkululeko.egg-info → nkululeko-0.87.0}/PKG-INFO +17 -1
  3. {nkululeko-0.86.7 → nkululeko-0.87.0}/README.md +8 -0
  4. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/constants.py +1 -1
  5. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/data/dataset_csv.py +12 -14
  6. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/demo.py +4 -8
  7. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/experiment.py +39 -21
  8. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feature_extractor.py +10 -4
  9. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/modelrunner.py +5 -5
  10. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model.py +23 -3
  11. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_cnn.py +41 -22
  12. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_mlp.py +37 -17
  13. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_mlp_regression.py +3 -1
  14. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/plots.py +25 -37
  15. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/reporting/reporter.py +69 -6
  16. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/runmanager.py +8 -11
  17. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/test_predictor.py +1 -6
  18. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/utils/stats.py +11 -7
  19. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/utils/util.py +22 -16
  20. {nkululeko-0.86.7 → nkululeko-0.87.0/nkululeko.egg-info}/PKG-INFO +17 -1
  21. {nkululeko-0.86.7 → nkululeko-0.87.0}/LICENSE +0 -0
  22. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/aesdd/process_database.py +0 -0
  23. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/androids/process_database.py +0 -0
  24. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/androids_orig/process_database.py +0 -0
  25. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/androids_test/process_database.py +0 -0
  26. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/ased/process_database.py +0 -0
  27. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/asvp-esd/process_database.py +0 -0
  28. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/baved/process_database.py +0 -0
  29. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/cafe/process_database.py +0 -0
  30. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/clac/process_database.py +0 -0
  31. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/cmu-mosei/process_database.py +0 -0
  32. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/demos/process_database.py +0 -0
  33. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/ekorpus/process_database.py +0 -0
  34. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/emns/process_database.py +0 -0
  35. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/emofilm/convert_to_16k.py +0 -0
  36. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/emofilm/process_database.py +0 -0
  37. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/emorynlp/process_database.py +0 -0
  38. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/emov-db/process_database.py +0 -0
  39. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/emovo/process_database.py +0 -0
  40. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/emozionalmente/create.py +0 -0
  41. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/enterface/process_database.py +0 -0
  42. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/esd/process_database.py +0 -0
  43. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/gerparas/process_database.py +0 -0
  44. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/iemocap/process_database.py +0 -0
  45. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/jl/process_database.py +0 -0
  46. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/jtes/process_database.py +0 -0
  47. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/meld/process_database.py +0 -0
  48. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/mesd/process_database.py +0 -0
  49. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/mess/process_database.py +0 -0
  50. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/mlendsnd/process_database.py +0 -0
  51. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/msp-improv/process_database2.py +0 -0
  52. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/msp-podcast/process_database.py +0 -0
  53. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/oreau2/process_database.py +0 -0
  54. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/portuguese/process_database.py +0 -0
  55. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/ravdess/process_database.py +0 -0
  56. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/ravdess/process_database_speaker.py +0 -0
  57. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/savee/process_database.py +0 -0
  58. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/shemo/process_database.py +0 -0
  59. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/subesco/process_database.py +0 -0
  60. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/tess/process_database.py +0 -0
  61. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/thorsten-emotional/process_database.py +0 -0
  62. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/urdu/process_database.py +0 -0
  63. {nkululeko-0.86.7 → nkululeko-0.87.0}/data/vivae/process_database.py +0 -0
  64. {nkululeko-0.86.7 → nkululeko-0.87.0}/docs/source/conf.py +0 -0
  65. {nkululeko-0.86.7 → nkululeko-0.87.0}/meta/demos/demo_best_model.py +0 -0
  66. {nkululeko-0.86.7 → nkululeko-0.87.0}/meta/demos/my_experiment.py +0 -0
  67. {nkululeko-0.86.7 → nkululeko-0.87.0}/meta/demos/my_experiment_local.py +0 -0
  68. {nkululeko-0.86.7 → nkululeko-0.87.0}/meta/demos/plot_faster_anim.py +0 -0
  69. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/__init__.py +0 -0
  70. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/aug_train.py +0 -0
  71. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/augment.py +0 -0
  72. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/augmenting/__init__.py +0 -0
  73. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/augmenting/augmenter.py +0 -0
  74. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/augmenting/randomsplicer.py +0 -0
  75. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/augmenting/randomsplicing.py +0 -0
  76. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/augmenting/resampler.py +0 -0
  77. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/__init__.py +0 -0
  78. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_age.py +0 -0
  79. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_arousal.py +0 -0
  80. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_dominance.py +0 -0
  81. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_gender.py +0 -0
  82. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_mos.py +0 -0
  83. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_pesq.py +0 -0
  84. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_sdr.py +0 -0
  85. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_snr.py +0 -0
  86. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_stoi.py +0 -0
  87. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/ap_valence.py +0 -0
  88. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/autopredict/estimate_snr.py +0 -0
  89. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/cacheddataset.py +0 -0
  90. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/data/__init__.py +0 -0
  91. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/data/dataset.py +0 -0
  92. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/demo_feats.py +0 -0
  93. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/demo_predictor.py +0 -0
  94. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/explore.py +0 -0
  95. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/export.py +0 -0
  96. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/__init__.py +0 -0
  97. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_agender.py +0 -0
  98. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_agender_agender.py +0 -0
  99. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_analyser.py +0 -0
  100. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_auddim.py +0 -0
  101. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_audmodel.py +0 -0
  102. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_clap.py +0 -0
  103. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_hubert.py +0 -0
  104. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_import.py +0 -0
  105. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_mld.py +0 -0
  106. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_mos.py +0 -0
  107. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_opensmile.py +0 -0
  108. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_oxbow.py +0 -0
  109. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_praat.py +0 -0
  110. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_snr.py +0 -0
  111. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_spectra.py +0 -0
  112. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_spkrec.py +0 -0
  113. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_squim.py +0 -0
  114. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_trill.py +0 -0
  115. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_wav2vec2.py +0 -0
  116. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_wavlm.py +0 -0
  117. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feats_whisper.py +0 -0
  118. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/featureset.py +0 -0
  119. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/feat_extract/feinberg_praat.py +0 -0
  120. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/file_checker.py +0 -0
  121. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/filter_data.py +0 -0
  122. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/glob_conf.py +0 -0
  123. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/losses/__init__.py +0 -0
  124. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/losses/loss_ccc.py +0 -0
  125. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/losses/loss_softf1loss.py +0 -0
  126. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/__init__.py +0 -0
  127. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_bayes.py +0 -0
  128. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_gmm.py +0 -0
  129. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_knn.py +0 -0
  130. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_knn_reg.py +0 -0
  131. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_lin_reg.py +0 -0
  132. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_svm.py +0 -0
  133. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_svr.py +0 -0
  134. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_tree.py +0 -0
  135. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_tree_reg.py +0 -0
  136. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_tuned.py +0 -0
  137. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_xgb.py +0 -0
  138. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/models/model_xgr.py +0 -0
  139. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/multidb.py +0 -0
  140. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/nkuluflag.py +0 -0
  141. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/nkululeko.py +0 -0
  142. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/predict.py +0 -0
  143. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/reporting/__init__.py +0 -0
  144. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/reporting/defines.py +0 -0
  145. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/reporting/latex_writer.py +0 -0
  146. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/reporting/report.py +0 -0
  147. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/reporting/report_item.py +0 -0
  148. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/reporting/result.py +0 -0
  149. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/resample.py +0 -0
  150. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/scaler.py +0 -0
  151. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/segment.py +0 -0
  152. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/segmenting/__init__.py +0 -0
  153. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/segmenting/seg_inaspeechsegmenter.py +0 -0
  154. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/segmenting/seg_silero.py +0 -0
  155. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/syllable_nuclei.py +0 -0
  156. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/test.py +0 -0
  157. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/test_pretrain.py +0 -0
  158. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/utils/__init__.py +0 -0
  159. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko/utils/files.py +0 -0
  160. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko.egg-info/SOURCES.txt +0 -0
  161. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko.egg-info/dependency_links.txt +0 -0
  162. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko.egg-info/requires.txt +0 -0
  163. {nkululeko-0.86.7 → nkululeko-0.87.0}/nkululeko.egg-info/top_level.txt +0 -0
  164. {nkululeko-0.86.7 → nkululeko-0.87.0}/pyproject.toml +0 -0
  165. {nkululeko-0.86.7 → nkululeko-0.87.0}/setup.cfg +0 -0
  166. {nkululeko-0.86.7 → nkululeko-0.87.0}/setup.py +0 -0
  167. {nkululeko-0.86.7 → nkululeko-0.87.0}/venv/bin/activate_this.py +0 -0
@@ -1,6 +1,14 @@
1
1
  Changelog
2
2
  =========
3
3
 
4
+ Version 0.87.0
5
+ --------------
6
+ * added class probability output and uncertainty analysis
7
+
8
+ Version 0.86.8
9
+ --------------
10
+ * handle single feature sets as strings in the config
11
+
4
12
  Version 0.86.7
5
13
  --------------
6
14
  * handles now audformat tables where the target is in a file index
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nkululeko
3
- Version: 0.86.7
3
+ Version: 0.87.0
4
4
  Summary: Machine learning audio prediction experiments based on templates
5
5
  Home-page: https://github.com/felixbur/nkululeko
6
6
  Author: Felix Burkhardt
@@ -51,6 +51,7 @@ Requires-Dist: pylatex
51
51
  - [t-SNE plots](#t-sne-plots)
52
52
  - [Data distribution](#data-distribution)
53
53
  - [Bias checking](#bias-checking)
54
+ - [Uncertainty](#uncertainty)
54
55
  - [Documentation](#documentation)
55
56
  - [Installation](#installation)
56
57
  - [Usage](#usage)
@@ -113,6 +114,13 @@ In cases you might wonder if there's bias in your data. You can try to detect th
113
114
 
114
115
  <img src="meta/images/emotion-pesq.png" width="500px"/>
115
116
 
117
+ ### Uncertainty
118
+ Nkululeko estimates uncertainty of model decision (only for classifiers) with entropy over the class-probabilities or logits per sample.
119
+
120
+ <img src="meta/images/uncertainty.png" width="500px"/>
121
+
122
+
123
+
116
124
  ## Documentation
117
125
  The documentation, along with extensions of installation, usage, INI file format, and examples, can be found [nkululeko.readthedocs.io](https://nkululeko.readthedocs.io).
118
126
 
@@ -343,6 +351,14 @@ F. Burkhardt, Johannes Wagner, Hagen Wierstorf, Florian Eyben and Björn Schulle
343
351
  Changelog
344
352
  =========
345
353
 
354
+ Version 0.87.0
355
+ --------------
356
+ * added class probability output and uncertainty analysis
357
+
358
+ Version 0.86.8
359
+ --------------
360
+ * handle single feature sets as strings in the config
361
+
346
362
  Version 0.86.7
347
363
  --------------
348
364
  * handles now audformat tables where the target is in a file index
@@ -7,6 +7,7 @@
7
7
  - [t-SNE plots](#t-sne-plots)
8
8
  - [Data distribution](#data-distribution)
9
9
  - [Bias checking](#bias-checking)
10
+ - [Uncertainty](#uncertainty)
10
11
  - [Documentation](#documentation)
11
12
  - [Installation](#installation)
12
13
  - [Usage](#usage)
@@ -69,6 +70,13 @@ In cases you might wonder if there's bias in your data. You can try to detect th
69
70
 
70
71
  <img src="meta/images/emotion-pesq.png" width="500px"/>
71
72
 
73
+ ### Uncertainty
74
+ Nkululeko estimates uncertainty of model decision (only for classifiers) with entropy over the class-probabilities or logits per sample.
75
+
76
+ <img src="meta/images/uncertainty.png" width="500px"/>
77
+
78
+
79
+
72
80
  ## Documentation
73
81
  The documentation, along with extensions of installation, usage, INI file format, and examples, can be found [nkululeko.readthedocs.io](https://nkululeko.readthedocs.io).
74
82
 
@@ -1,2 +1,2 @@
1
- VERSION="0.86.7"
1
+ VERSION="0.87.0"
2
2
  SAMPLING_RATE = 16000
@@ -23,6 +23,9 @@ class Dataset_CSV(Dataset):
23
23
  root = os.path.dirname(data_file)
24
24
  audio_path = self.util.config_val_data(self.name, "audio_path", "./")
25
25
  df = pd.read_csv(data_file)
26
+ # trim all string values
27
+ df_obj = df.select_dtypes("object")
28
+ df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
26
29
  # special treatment for segmented dataframes with only one column:
27
30
  if "start" in df.columns and len(df.columns) == 4:
28
31
  index = audformat.segmented_index(
@@ -49,8 +52,7 @@ class Dataset_CSV(Dataset):
49
52
  .map(lambda x: root + "/" + audio_path + "/" + x)
50
53
  .values
51
54
  )
52
- df = df.set_index(df.index.set_levels(
53
- file_index, level="file"))
55
+ df = df.set_index(df.index.set_levels(file_index, level="file"))
54
56
  else:
55
57
  if not isinstance(df, pd.DataFrame):
56
58
  df = pd.DataFrame(df)
@@ -59,27 +61,24 @@ class Dataset_CSV(Dataset):
59
61
  lambda x: root + "/" + audio_path + "/" + x
60
62
  )
61
63
  )
62
- else: # absolute path is True
64
+ else: # absolute path is True
63
65
  if audformat.index_type(df.index) == "segmented":
64
66
  file_index = (
65
- df.index.levels[0]
66
- .map(lambda x: audio_path + "/" + x)
67
- .values
67
+ df.index.levels[0].map(lambda x: audio_path + "/" + x).values
68
68
  )
69
- df = df.set_index(df.index.set_levels(
70
- file_index, level="file"))
69
+ df = df.set_index(df.index.set_levels(file_index, level="file"))
71
70
  else:
72
71
  if not isinstance(df, pd.DataFrame):
73
72
  df = pd.DataFrame(df)
74
- df = df.set_index(df.index.to_series().apply(
75
- lambda x: audio_path + "/" + x ))
73
+ df = df.set_index(
74
+ df.index.to_series().apply(lambda x: audio_path + "/" + x)
75
+ )
76
76
 
77
77
  self.df = df
78
78
  self.db = None
79
79
  self.got_target = True
80
80
  self.is_labeled = self.got_target
81
- self.start_fresh = eval(
82
- self.util.config_val("DATA", "no_reuse", "False"))
81
+ self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
83
82
  is_index = False
84
83
  try:
85
84
  if self.is_labeled and not "class_label" in self.df.columns:
@@ -106,8 +105,7 @@ class Dataset_CSV(Dataset):
106
105
  f" {self.got_gender}, got age: {self.got_age}"
107
106
  )
108
107
  self.util.debug(r_string)
109
- glob_conf.report.add_item(ReportItem(
110
- "Data", "Loaded report", r_string))
108
+ glob_conf.report.add_item(ReportItem("Data", "Loaded report", r_string))
111
109
 
112
110
  def prepare(self):
113
111
  super().prepare()
@@ -30,10 +30,8 @@ from transformers import pipeline
30
30
 
31
31
 
32
32
  def main(src_dir):
33
- parser = argparse.ArgumentParser(
34
- description="Call the nkululeko DEMO framework.")
35
- parser.add_argument("--config", default="exp.ini",
36
- help="The base configuration")
33
+ parser = argparse.ArgumentParser(description="Call the nkululeko DEMO framework.")
34
+ parser.add_argument("--config", default="exp.ini", help="The base configuration")
37
35
  parser.add_argument(
38
36
  "--file", help="A file that should be processed (16kHz mono wav)"
39
37
  )
@@ -84,8 +82,7 @@ def main(src_dir):
84
82
  )
85
83
 
86
84
  def print_pipe(files, outfile):
87
- """
88
- Prints the pipeline output for a list of files, and optionally writes the results to an output file.
85
+ """Prints the pipeline output for a list of files, and optionally writes the results to an output file.
89
86
 
90
87
  Args:
91
88
  files (list): A list of file paths to process through the pipeline.
@@ -108,8 +105,7 @@ def main(src_dir):
108
105
  f.write("\n".join(results))
109
106
 
110
107
  if util.get_model_type() == "finetune":
111
- model_path = os.path.join(
112
- util.get_exp_dir(), "models", "run_0", "torch")
108
+ model_path = os.path.join(util.get_exp_dir(), "models", "run_0", "torch")
113
109
  pipe = pipeline("audio-classification", model=model_path)
114
110
  if args.file is not None:
115
111
  print_pipe([args.file], args.outfile)
@@ -5,13 +5,13 @@ import pickle
5
5
  import random
6
6
  import time
7
7
 
8
+ import audeer
9
+ import audformat
8
10
  import numpy as np
9
11
  import pandas as pd
10
12
  from sklearn.preprocessing import LabelEncoder
11
13
 
12
- import audeer
13
- import audformat
14
-
14
+ import nkululeko.glob_conf as glob_conf
15
15
  from nkululeko.data.dataset import Dataset
16
16
  from nkululeko.data.dataset_csv import Dataset_CSV
17
17
  from nkululeko.demo_predictor import Demo_predictor
@@ -19,8 +19,6 @@ from nkululeko.feat_extract.feats_analyser import FeatureAnalyser
19
19
  from nkululeko.feature_extractor import FeatureExtractor
20
20
  from nkululeko.file_checker import FileChecker
21
21
  from nkululeko.filter_data import DataFilter
22
- from nkululeko.filter_data import filter_min_dur
23
- import nkululeko.glob_conf as glob_conf
24
22
  from nkululeko.plots import Plots
25
23
  from nkululeko.reporting.report import Report
26
24
  from nkululeko.runmanager import Runmanager
@@ -109,7 +107,8 @@ class Experiment:
109
107
  # print keys/column
110
108
  dbs = ",".join(list(self.datasets.keys()))
111
109
  labels = self.util.config_val("DATA", "labels", False)
112
- auto_labels = list(next(iter(self.datasets.values())).df[self.target].unique())
110
+ auto_labels = list(
111
+ next(iter(self.datasets.values())).df[self.target].unique())
113
112
  if labels:
114
113
  self.labels = ast.literal_eval(labels)
115
114
  self.util.debug(f"Using target labels (from config): {labels}")
@@ -159,7 +158,8 @@ class Experiment:
159
158
  data.split()
160
159
  data.prepare_labels()
161
160
  self.df_test = pd.concat(
162
- [self.df_test, self.util.make_segmented_index(data.df_test)]
161
+ [self.df_test, self.util.make_segmented_index(
162
+ data.df_test)]
163
163
  )
164
164
  self.df_test.is_labeled = data.is_labeled
165
165
  self.df_test.got_gender = self.got_gender
@@ -260,7 +260,8 @@ class Experiment:
260
260
  test_cats = self.df_test[self.target].unique()
261
261
  else:
262
262
  # if there is no target, copy a dummy label
263
- self.df_test = self._add_random_target(self.df_test).astype("str")
263
+ self.df_test = self._add_random_target(
264
+ self.df_test).astype("str")
264
265
  train_cats = self.df_train[self.target].unique()
265
266
  # print(f"df_train: {pd.DataFrame(self.df_train[self.target])}")
266
267
  # print(f"train_cats with target {self.target}: {train_cats}")
@@ -268,7 +269,8 @@ class Experiment:
268
269
  if type(test_cats) == np.ndarray:
269
270
  self.util.debug(f"Categories test (nd.array): {test_cats}")
270
271
  else:
271
- self.util.debug(f"Categories test (list): {list(test_cats)}")
272
+ self.util.debug(
273
+ f"Categories test (list): {list(test_cats)}")
272
274
  if type(train_cats) == np.ndarray:
273
275
  self.util.debug(f"Categories train (nd.array): {train_cats}")
274
276
  else:
@@ -291,7 +293,8 @@ class Experiment:
291
293
 
292
294
  target_factor = self.util.config_val("DATA", "target_divide_by", False)
293
295
  if target_factor:
294
- self.df_test[self.target] = self.df_test[self.target] / float(target_factor)
296
+ self.df_test[self.target] = self.df_test[self.target] / \
297
+ float(target_factor)
295
298
  self.df_train[self.target] = self.df_train[self.target] / float(
296
299
  target_factor
297
300
  )
@@ -314,14 +317,16 @@ class Experiment:
314
317
  def plot_distribution(self, df_labels):
315
318
  """Plot the distribution of samples and speaker per target class and biological sex"""
316
319
  plot = Plots()
317
- sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
320
+ sample_selection = self.util.config_val(
321
+ "EXPL", "sample_selection", "all")
318
322
  plot.plot_distributions(df_labels)
319
323
  if self.got_speaker:
320
324
  plot.plot_distributions_speaker(df_labels)
321
325
 
322
326
  def extract_test_feats(self):
323
327
  self.feats_test = pd.DataFrame()
324
- feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["tests"]))
328
+ feats_name = "_".join(ast.literal_eval(
329
+ glob_conf.config["DATA"]["tests"]))
325
330
  feats_types = self.util.config_val_list("FEATS", "type", ["os"])
326
331
  self.feature_extractor = FeatureExtractor(
327
332
  self.df_test, feats_types, feats_name, "test"
@@ -338,9 +343,17 @@ class Experiment:
338
343
 
339
344
  """
340
345
  df_train, df_test = self.df_train, self.df_test
341
- feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
346
+ feats_name = "_".join(ast.literal_eval(
347
+ glob_conf.config["DATA"]["databases"]))
342
348
  self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
343
- feats_types = self.util.config_val_list("FEATS", "type", [])
349
+ feats_types = self.util.config_val("FEATS", "type", "os")
350
+ # Ensure feats_types is always a list of strings
351
+ if isinstance(feats_types, str):
352
+ if feats_types.startswith("[") and feats_types.endswith("]"):
353
+ feats_types = ast.literal_eval(feats_types)
354
+ else:
355
+ feats_types = [feats_types]
356
+ # print(f"feats_types: {feats_types}")
344
357
  # for some models no features are needed
345
358
  if len(feats_types) == 0:
346
359
  self.util.debug("no feature extractor specified.")
@@ -372,7 +385,8 @@ class Experiment:
372
385
  f"test feats ({self.feats_test.shape[0]}) != test labels"
373
386
  f" ({self.df_test.shape[0]})"
374
387
  )
375
- self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
388
+ self.df_test = self.df_test[self.df_test.index.isin(
389
+ self.feats_test.index)]
376
390
  self.util.warn(f"new test labels shape: {self.df_test.shape[0]}")
377
391
 
378
392
  self._check_scale()
@@ -387,7 +401,8 @@ class Experiment:
387
401
  """Augment the selected samples."""
388
402
  from nkululeko.augmenting.augmenter import Augmenter
389
403
 
390
- sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
404
+ sample_selection = self.util.config_val(
405
+ "AUGMENT", "sample_selection", "all")
391
406
  if sample_selection == "all":
392
407
  df = pd.concat([self.df_train, self.df_test])
393
408
  elif sample_selection == "train":
@@ -482,7 +497,8 @@ class Experiment:
482
497
  """
483
498
  from nkululeko.augmenting.randomsplicer import Randomsplicer
484
499
 
485
- sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
500
+ sample_selection = self.util.config_val(
501
+ "AUGMENT", "sample_selection", "all")
486
502
  if sample_selection == "all":
487
503
  df = pd.concat([self.df_train, self.df_test])
488
504
  elif sample_selection == "train":
@@ -503,7 +519,8 @@ class Experiment:
503
519
  plot_feats = eval(
504
520
  self.util.config_val("EXPL", "feature_distributions", "False")
505
521
  )
506
- sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
522
+ sample_selection = self.util.config_val(
523
+ "EXPL", "sample_selection", "all")
507
524
  # get the data labels
508
525
  if sample_selection == "all":
509
526
  df_labels = pd.concat([self.df_train, self.df_test])
@@ -566,7 +583,8 @@ class Experiment:
566
583
  for scat_target in scat_targets:
567
584
  if self.util.is_categorical(df_labels[scat_target]):
568
585
  for scatter in scatters:
569
- plots.scatter_plot(df_feats, df_labels, scat_target, scatter)
586
+ plots.scatter_plot(
587
+ df_feats, df_labels, scat_target, scatter)
570
588
  else:
571
589
  self.util.debug(
572
590
  f"{self.name}: binning continuous variable to categories"
@@ -657,7 +675,8 @@ class Experiment:
657
675
  preds = best.preds
658
676
  speakers = self.df_test.speaker.values
659
677
  print(f"{len(truths)} {len(preds)} {len(speakers) }")
660
- df = pd.DataFrame(data={"truth": truths, "pred": preds, "speaker": speakers})
678
+ df = pd.DataFrame(
679
+ data={"truth": truths, "pred": preds, "speaker": speakers})
661
680
  plot_name = "result_combined_per_speaker"
662
681
  self.util.debug(
663
682
  f"plotting speaker combination ({function}) confusion matrix to"
@@ -733,7 +752,6 @@ class Experiment:
733
752
  if model.is_ann():
734
753
  print("converting to onnx from torch")
735
754
  else:
736
- from skl2onnx import to_onnx
737
755
 
738
756
  print("converting to onnx from sklearn")
739
757
  # save the rest
@@ -39,16 +39,20 @@ class FeatureExtractor:
39
39
  self.feats = pd.DataFrame()
40
40
  for feats_type in self.feats_types:
41
41
  store_name = f"{self.data_name}_{feats_type}"
42
- self.feat_extractor = self._get_feat_extractor(store_name, feats_type)
42
+ self.feat_extractor = self._get_feat_extractor(
43
+ store_name, feats_type)
43
44
  self.feat_extractor.extract()
44
45
  self.feat_extractor.filter()
45
- self.feats = pd.concat([self.feats, self.feat_extractor.df], axis=1)
46
+ self.feats = pd.concat(
47
+ [self.feats, self.feat_extractor.df], axis=1)
46
48
  return self.feats
47
49
 
48
50
  def extract_sample(self, signal, sr):
49
51
  return self.feat_extractor.extract_sample(signal, sr)
50
52
 
51
53
  def _get_feat_extractor(self, store_name, feats_type):
54
+ if isinstance(feats_type, list) and len(feats_type) == 1:
55
+ feats_type = feats_type[0]
52
56
  feat_extractor_class = self._get_feat_extractor_class(feats_type)
53
57
  if feat_extractor_class is None:
54
58
  self.util.error(f"unknown feats_type: {feats_type}")
@@ -103,13 +107,15 @@ class FeatureExtractor:
103
107
  prefix, _, ext = feats_type.partition("-")
104
108
  from importlib import import_module
105
109
 
106
- module = import_module(f"nkululeko.feat_extract.feats_{prefix.lower()}")
110
+ module = import_module(
111
+ f"nkululeko.feat_extract.feats_{prefix.lower()}")
107
112
  class_name = f"{prefix.capitalize()}"
108
113
  return getattr(module, class_name)
109
114
 
110
115
  def _get_feat_extractor_by_name(self, feats_type):
111
116
  from importlib import import_module
112
117
 
113
- module = import_module(f"nkululeko.feat_extract.feats_{feats_type.lower()}")
118
+ module = import_module(
119
+ f"nkululeko.feat_extract.feats_{feats_type.lower()}")
114
120
  class_name = f"{feats_type.capitalize()}Set"
115
121
  return getattr(module, class_name)
@@ -85,7 +85,7 @@ class Modelrunner:
85
85
  f"run: {self.run} epoch: {epoch}: result: {test_score_metric}"
86
86
  )
87
87
  # print(f"performance: {performance.split(' ')[1]}")
88
- performance = float(test_score_metric.split(' ')[1])
88
+ performance = float(test_score_metric.split(" ")[1])
89
89
  if performance > self.best_performance:
90
90
  self.best_performance = performance
91
91
  self.best_epoch = epoch
@@ -204,15 +204,15 @@ class Modelrunner:
204
204
  self.df_train, self.df_test, self.feats_train, self.feats_test
205
205
  )
206
206
  elif model_type == "cnn":
207
- from nkululeko.models.model_cnn import CNN_model
207
+ from nkululeko.models.model_cnn import CNNModel
208
208
 
209
- self.model = CNN_model(
209
+ self.model = CNNModel(
210
210
  self.df_train, self.df_test, self.feats_train, self.feats_test
211
211
  )
212
212
  elif model_type == "mlp":
213
- from nkululeko.models.model_mlp import MLP_model
213
+ from nkululeko.models.model_mlp import MLPModel
214
214
 
215
- self.model = MLP_model(
215
+ self.model = MLPModel(
216
216
  self.df_train, self.df_test, self.feats_train, self.feats_test
217
217
  )
218
218
  elif model_type == "mlp_reg":
@@ -247,8 +247,25 @@ class Model:
247
247
  self.clf.fit(feats, labels)
248
248
 
249
249
  def get_predictions(self):
250
- predictions = self.clf.predict(self.feats_test.to_numpy())
251
- return predictions
250
+ # predictions = self.clf.predict(self.feats_test.to_numpy())
251
+ if self.util.exp_is_classification():
252
+ # make a dataframe for the class probabilities
253
+ proba_d = {}
254
+ for c in self.clf.classes_:
255
+ proba_d[c] = []
256
+ # get the class probabilities
257
+ predictions = self.clf.predict_proba(self.feats_test.to_numpy())
258
+ # pred = self.clf.predict(features)
259
+ for i, c in enumerate(self.clf.classes_):
260
+ proba_d[c] = list(predictions.T[i])
261
+ probas = pd.DataFrame(proba_d)
262
+ probas = probas.set_index(self.feats_test.index)
263
+ predictions = probas.idxmax(axis=1).values
264
+ else:
265
+ predictions = self.clf.predict(self.feats_test.to_numpy())
266
+ probas = None
267
+
268
+ return predictions, probas
252
269
 
253
270
  def predict(self):
254
271
  if self.feats_test.isna().to_numpy().any():
@@ -263,13 +280,16 @@ class Model:
263
280
  )
264
281
  return report
265
282
  """Predict the whole eval feature set"""
266
- predictions = self.get_predictions()
283
+ predictions, probas = self.get_predictions()
284
+
267
285
  report = Reporter(
268
286
  self.df_test[self.target].to_numpy().astype(float),
269
287
  predictions,
270
288
  self.run,
271
289
  self.epoch,
290
+ probas=probas,
272
291
  )
292
+ report.print_probabilities()
273
293
  return report
274
294
 
275
295
  def get_type(self):
@@ -5,33 +5,40 @@ Inspired by code from Su Lei
5
5
 
6
6
  """
7
7
 
8
+ import ast
9
+ from collections import OrderedDict
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from PIL import Image
14
+ from sklearn.metrics import recall_score
8
15
  import torch
9
16
  import torch.nn as nn
10
17
  import torch.nn.functional as F
11
- import torchvision
12
- import torchvision.transforms as transforms
13
18
  from torch.utils.data import Dataset
14
- import ast
15
- import numpy as np
16
- from sklearn.metrics import recall_score
17
- from collections import OrderedDict
18
- from PIL import Image
19
- from traitlets import default
19
+ import torchvision.transforms as transforms
20
20
 
21
- from nkululeko.utils.util import Util
22
21
  import nkululeko.glob_conf as glob_conf
22
+ from nkululeko.losses.loss_softf1loss import SoftF1Loss
23
23
  from nkululeko.models.model import Model
24
24
  from nkululeko.reporting.reporter import Reporter
25
- from nkululeko.losses.loss_softf1loss import SoftF1Loss
25
+ from nkululeko.utils.util import Util
26
26
 
27
27
 
28
- class CNN_model(Model):
29
- """CNN = convolutional neural net"""
28
+ class CNNModel(Model):
29
+ """CNN = convolutional neural net."""
30
30
 
31
31
  is_classifier = True
32
32
 
33
33
  def __init__(self, df_train, df_test, feats_train, feats_test):
34
- """Constructor taking the configuration and all dataframes"""
34
+ """Constructor, taking all dataframes.
35
+
36
+ Args:
37
+ df_train (pd.DataFrame): The train labels.
38
+ df_test (pd.DataFrame): The test labels.
39
+ feats_train (pd.DataFrame): The train features.
40
+ feats_test (pd.DataFrame): The test features.
41
+ """
35
42
  super().__init__(df_train, df_test, feats_train, feats_test)
36
43
  super().set_model_type("ann")
37
44
  self.name = "cnn"
@@ -147,7 +154,20 @@ class CNN_model(Model):
147
154
  self.optimizer.step()
148
155
  self.loss = (np.asarray(losses)).mean()
149
156
 
150
- def evaluate_model(self, model, loader, device):
157
+ def get_probas(self, logits):
158
 + # make a dataframe for probabilities (logits)
159
+ proba_d = {}
160
+ classes = self.df_test[self.target].unique()
161
+ classes.sort()
162
+ for c in classes:
163
+ proba_d[c] = []
164
+ for i, c in enumerate(classes):
165
+ proba_d[c] = list(logits.numpy().T[i])
166
+ probas = pd.DataFrame(proba_d)
167
+ probas = probas.set_index(self.df_test.index)
168
+ return probas
169
+
170
+ def evaluate(self, model, loader, device):
151
171
  logits = torch.zeros(len(loader.dataset), self.class_num)
152
172
  targets = torch.zeros(len(loader.dataset))
153
173
  model.eval()
@@ -169,14 +189,15 @@ class CNN_model(Model):
169
189
  self.loss_eval = (np.asarray(losses)).mean()
170
190
  predictions = logits.argmax(dim=1)
171
191
  uar = recall_score(targets.numpy(), predictions.numpy(), average="macro")
172
- return uar, targets, predictions
192
+ return uar, targets, predictions, logits
173
193
 
174
194
  def predict(self):
175
- _, truths, predictions = self.evaluate_model(
195
+ _, truths, predictions, logits = self.evaluate(
176
196
  self.model, self.testloader, self.device
177
197
  )
178
- uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
179
- report = Reporter(truths, predictions, self.run, self.epoch)
198
+ uar, _, _, _ = self.evaluate(self.model, self.trainloader, self.device)
199
+ probas = self.get_probas(logits)
200
+ report = Reporter(truths, predictions, self.run, self.epoch, probas=probas)
180
201
  try:
181
202
  report.result.loss = self.loss
182
203
  except AttributeError: # if the model was loaded from disk the loss is unknown
@@ -189,13 +210,11 @@ class CNN_model(Model):
189
210
  return report
190
211
 
191
212
  def get_predictions(self):
192
- _, truths, predictions = self.evaluate_model(
193
- self.model, self.testloader, self.device
194
- )
213
+ _, _, predictions, _ = self.evaluate(self.model, self.testloader, self.device)
195
214
  return predictions.numpy()
196
215
 
197
216
  def predict_sample(self, features):
198
- """Predict one sample"""
217
+ """Predict one sample."""
199
218
  with torch.no_grad():
200
219
  logits = self.model(torch.from_numpy(features).to(self.device))
201
220
  a = logits.numpy()