PyPI - nkululeko - Versions diffs - 0.85.2__tar.gz → 0.86.1__tar.gz - Mend

nkululeko 0.85.2__tar.gz → 0.86.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (168) hide show

{nkululeko-0.85.2 → nkululeko-0.86.1}/CHANGELOG.md RENAMED Viewed

@@ -1,6 +1,17 @@
 Changelog
 =========
+Version 0.86.1
+--------------
+* functionality to push to hub
+* fixed bug that prevented wavlm finetuning
+Version 0.86.0
+--------------
+* added regression to finetuning
+* added other transformer models to finetuning
+* added output the train/dev features sets actually used by the model
 Version 0.85.2
 --------------
 * added data, and automatic task label detection

{nkululeko-0.85.2/nkululeko.egg-info → nkululeko-0.86.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nkululeko
-Version: 0.85.2
+Version: 0.86.1
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt
@@ -256,6 +256,7 @@ There's my [blog](http://blog.syntheticspeech.de/?s=nkululeko) with tutorials:
 * [Compare several databases](http://blog.syntheticspeech.de/2024/01/02/nkululeko-compare-several-databases/)
 * [Tweak the target variable for database comparison](http://blog.syntheticspeech.de/2024/03/13/nkululeko-how-to-tweak-the-target-variable-for-database-comparison/)
 * [How to run multiple experiments in one go](http://blog.syntheticspeech.de/2022/03/28/how-to-run-multiple-experiments-in-one-go-with-nkululeko/)
+* [How to finetune a transformer-model](http://blog.syntheticspeech.de/2024/05/29/nkululeko-how-to-finetune-a-transformer-model/)
 ### <a name="helloworld">Hello World example</a>
 * NEW: [Here's a Google colab that runs this example out-of-the-box](https://colab.research.google.com/drive/1GYNBd5cdZQ1QC3Jm58qoeMaJg3UuPhjw?usp=sharing#scrollTo=4G_SjuF9xeQf), and here is the same [with Kaggle](https://www.kaggle.com/felixburk/nkululeko-hello-world-example)
@@ -333,6 +334,17 @@ F. Burkhardt, Johannes Wagner, Hagen Wierstorf, Florian Eyben and Björn Schulle
 Changelog
 =========
+Version 0.86.1
+--------------
+* functionality to push to hub
+* fixed bug that prevented wavlm finetuning
+Version 0.86.0
+--------------
+* added regression to finetuning
+* added other transformer models to finetuning
+* added output the train/dev features sets actually used by the model
 Version 0.85.2
 --------------
 * added data, and automatic task label detection

{nkululeko-0.85.2 → nkululeko-0.86.1}/README.md RENAMED Viewed

@@ -212,6 +212,7 @@ There's my [blog](http://blog.syntheticspeech.de/?s=nkululeko) with tutorials:
 * [Compare several databases](http://blog.syntheticspeech.de/2024/01/02/nkululeko-compare-several-databases/)
 * [Tweak the target variable for database comparison](http://blog.syntheticspeech.de/2024/03/13/nkululeko-how-to-tweak-the-target-variable-for-database-comparison/)
 * [How to run multiple experiments in one go](http://blog.syntheticspeech.de/2022/03/28/how-to-run-multiple-experiments-in-one-go-with-nkululeko/)
+* [How to finetune a transformer-model](http://blog.syntheticspeech.de/2024/05/29/nkululeko-how-to-finetune-a-transformer-model/)
 ### <a name="helloworld">Hello World example</a>
 * NEW: [Here's a Google colab that runs this example out-of-the-box](https://colab.research.google.com/drive/1GYNBd5cdZQ1QC3Jm58qoeMaJg3UuPhjw?usp=sharing#scrollTo=4G_SjuF9xeQf), and here is the same [with Kaggle](https://www.kaggle.com/felixburk/nkululeko-hello-world-example)

{nkululeko-0.85.2 → nkululeko-0.86.1}/nkululeko/constants.py RENAMED Viewed

@@ -1,2 +1,2 @@
-VERSION="0.85.2"
+VERSION="0.86.1"
 SAMPLING_RATE = 16000

{nkululeko-0.85.2 → nkululeko-0.86.1}/nkululeko/experiment.py RENAMED Viewed

@@ -30,15 +30,14 @@ from nkululeko.utils.util import Util
 class Experiment:
-    """Main class specifying an experiment"""
+    """Main class specifying an experiment."""
     def __init__(self, config_obj):
-        """
-        Parameters
-        ----------
-        config_obj : a config parser object that sets the experiment parameters and being set as a global object.
-        """
+        """Constructor.
+        Args:
+            - config_obj : a config parser object that sets the experiment parameters and being set as a global object.
+        """
         self.set_globals(config_obj)
         self.name = glob_conf.config["EXP"]["name"]
         self.root = os.path.join(glob_conf.config["EXP"]["root"], "")
@@ -73,8 +72,9 @@ class Experiment:
         if self.util.config_val("REPORT", "latex", False):
             self.report.export_latex()
-    def get_name(self):
-        return self.util.get_exp_name()
+    # moved to util
+    # def get_name(self):
+    #     return self.util.get_exp_name()
     def set_globals(self, config_obj):
         """install a config object in the global space"""
@@ -109,15 +109,13 @@ class Experiment:
         # print keys/column
         dbs = ",".join(list(self.datasets.keys()))
         labels = self.util.config_val("DATA", "labels", False)
-        auto_labels = list(
-            next(iter(self.datasets.values())).df[self.target].unique()
-        )
+        auto_labels = list(next(iter(self.datasets.values())).df[self.target].unique())
         if labels:
             self.labels = ast.literal_eval(labels)
             self.util.debug(f"Target labels (from config): {labels}")
         else:
             self.labels = auto_labels
-        self.util.debug(f"Target labels (from database): {auto_labels}")
+            self.util.debug(f"Target labels (from database): {auto_labels}")
         glob_conf.set_labels(self.labels)
         self.util.debug(f"loaded databases {dbs}")
@@ -160,8 +158,7 @@ class Experiment:
                 data.split()
                 data.prepare_labels()
                 self.df_test = pd.concat(
-                    [self.df_test, self.util.make_segmented_index(
-                        data.df_test)]
+                    [self.df_test, self.util.make_segmented_index(data.df_test)]
                 )
                 self.df_test.is_labeled = data.is_labeled
             self.df_test.got_gender = self.got_gender
@@ -262,8 +259,7 @@ class Experiment:
                     test_cats = self.df_test[self.target].unique()
                 else:
                     # if there is no target, copy a dummy label
-                    self.df_test = self._add_random_target(
-                        self.df_test).astype("str")
+                    self.df_test = self._add_random_target(self.df_test).astype("str")
                 train_cats = self.df_train[self.target].unique()
                 # print(f"df_train: {pd.DataFrame(self.df_train[self.target])}")
                 # print(f"train_cats with target {self.target}: {train_cats}")
@@ -271,8 +267,7 @@ class Experiment:
                 if type(test_cats) == np.ndarray:
                     self.util.debug(f"Categories test (nd.array): {test_cats}")
                 else:
-                    self.util.debug(
-                        f"Categories test (list): {list(test_cats)}")
+                    self.util.debug(f"Categories test (list): {list(test_cats)}")
             if type(train_cats) == np.ndarray:
                 self.util.debug(f"Categories train (nd.array): {train_cats}")
             else:
@@ -295,8 +290,7 @@ class Experiment:
         target_factor = self.util.config_val("DATA", "target_divide_by", False)
         if target_factor:
-            self.df_test[self.target] = self.df_test[self.target] / \
-                float(target_factor)
+            self.df_test[self.target] = self.df_test[self.target] / float(target_factor)
             self.df_train[self.target] = self.df_train[self.target] / float(
                 target_factor
             )
@@ -319,16 +313,14 @@ class Experiment:
     def plot_distribution(self, df_labels):
         """Plot the distribution of samples and speaker per target class and biological sex"""
         plot = Plots()
-        sample_selection = self.util.config_val(
-            "EXPL", "sample_selection", "all")
+        sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
         plot.plot_distributions(df_labels)
         if self.got_speaker:
             plot.plot_distributions_speaker(df_labels)
     def extract_test_feats(self):
         self.feats_test = pd.DataFrame()
-        feats_name = "_".join(ast.literal_eval(
-            glob_conf.config["DATA"]["tests"]))
+        feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["tests"]))
         feats_types = self.util.config_val_list("FEATS", "type", ["os"])
         self.feature_extractor = FeatureExtractor(
             self.df_test, feats_types, feats_name, "test"
@@ -345,8 +337,7 @@ class Experiment:
         """
         df_train, df_test = self.df_train, self.df_test
-        feats_name = "_".join(ast.literal_eval(
-            glob_conf.config["DATA"]["databases"]))
+        feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
         self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
         feats_types = self.util.config_val_list("FEATS", "type", [])
         # for some models no features are needed
@@ -380,20 +371,22 @@ class Experiment:
                 f"test feats ({self.feats_test.shape[0]}) != test labels"
                 f" ({self.df_test.shape[0]})"
             )
-            self.df_test = self.df_test[self.df_test.index.isin(
-                self.feats_test.index)]
-            self.util.warn(f"mew test labels shape: {self.df_test.shape[0]}")
+            self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
+            self.util.warn(f"new test labels shape: {self.df_test.shape[0]}")
         self._check_scale()
+        # store = self.util.get_path("store")
+        # store_format = self.util.config_val("FEATS", "store_format", "pkl")
+        # storage = f"{store}test_feats.{store_format}"
+        # self.util.write_store(self.feats_test, storage, store_format)
+        # storage = f"{store}train_feats.{store_format}"
+        # self.util.write_store(self.feats_train, storage, store_format)
     def augment(self):
-        """
-        Augment the selected samples
-        """
+        """Augment the selected samples."""
         from nkululeko.augmenting.augmenter import Augmenter
-        sample_selection = self.util.config_val(
-            "AUGMENT", "sample_selection", "all")
+        sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
         if sample_selection == "all":
             df = pd.concat([self.df_train, self.df_test])
         elif sample_selection == "train":
@@ -488,8 +481,7 @@ class Experiment:
         """
         from nkululeko.augmenting.randomsplicer import Randomsplicer
-        sample_selection = self.util.config_val(
-            "AUGMENT", "sample_selection", "all")
+        sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
         if sample_selection == "all":
             df = pd.concat([self.df_train, self.df_test])
         elif sample_selection == "train":
@@ -510,8 +502,7 @@ class Experiment:
         plot_feats = eval(
             self.util.config_val("EXPL", "feature_distributions", "False")
         )
-        sample_selection = self.util.config_val(
-            "EXPL", "sample_selection", "all")
+        sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
         # get the data labels
         if sample_selection == "all":
             df_labels = pd.concat([self.df_train, self.df_test])
@@ -574,8 +565,7 @@ class Experiment:
             for scat_target in scat_targets:
                 if self.util.is_categorical(df_labels[scat_target]):
                     for scatter in scatters:
-                        plots.scatter_plot(
-                            df_feats, df_labels, scat_target, scatter)
+                        plots.scatter_plot(df_feats, df_labels, scat_target, scatter)
                 else:
                     self.util.debug(
                         f"{self.name}: binning continuous variable to categories"
@@ -590,6 +580,8 @@ class Experiment:
                         )
     def _check_scale(self):
+        self.util.save_to_store(self.feats_train, "feats_train")
+        self.util.save_to_store(self.feats_test, "feats_test")
         scale_feats = self.util.config_val("FEATS", "scale", False)
         # print the scale
         self.util.debug(f"scaler: {scale_feats}")
@@ -664,8 +656,7 @@ class Experiment:
         preds = best.preds
         speakers = self.df_test.speaker.values
         print(f"{len(truths)} {len(preds)} {len(speakers) }")
-        df = pd.DataFrame(
-            data={"truth": truths, "pred": preds, "speaker": speakers})
+        df = pd.DataFrame(data={"truth": truths, "pred": preds, "speaker": speakers})
         plot_name = "result_combined_per_speaker"
         self.util.debug(
             f"plotting speaker combination ({function}) confusion matrix to"

{nkululeko-0.85.2 → nkululeko-0.86.1}/nkululeko/feat_extract/feats_opensmile.py RENAMED Viewed

@@ -65,28 +65,28 @@ class Opensmileset(Featureset):
         feats = smile.process_signal(signal, sr)
         return feats.to_numpy()
-    def filter(self):
-        # use only the features that are indexed in the target dataframes
-        self.df = self.df[self.df.index.isin(self.data_df.index)]
-        try:
-            # use only some features
-            selected_features = ast.literal_eval(
-                glob_conf.config["FEATS"]["os.features"]
-            )
-            self.util.debug(f"selecting features from opensmile: {selected_features}")
-            sel_feats_df = pd.DataFrame()
-            hit = False
-            for feat in selected_features:
-                try:
-                    sel_feats_df[feat] = self.df[feat]
-                    hit = True
-                except KeyError:
-                    pass
-            if hit:
-                self.df = sel_feats_df
-                self.util.debug(
-                    "new feats shape after selecting opensmile features:"
-                    f" {self.df.shape}"
-                )
-        except KeyError:
-            pass
+    # def filter(self):
+    #     # use only the features that are indexed in the target dataframes
+    #     self.df = self.df[self.df.index.isin(self.data_df.index)]
+    #     try:
+    #         # use only some features
+    #         selected_features = ast.literal_eval(
+    #             glob_conf.config["FEATS"]["os.features"]
+    #         )
+    #         self.util.debug(f"selecting features from opensmile: {selected_features}")
+    #         sel_feats_df = pd.DataFrame()
+    #         hit = False
+    #         for feat in selected_features:
+    #             try:
+    #                 sel_feats_df[feat] = self.df[feat]
+    #                 hit = True
+    #             except KeyError:
+    #                 pass
+    #         if hit:
+    #             self.df = sel_feats_df
+    #             self.util.debug(
+    #                 "new feats shape after selecting opensmile features:"
+    #                 f" {self.df.shape}"
+    #             )
+    #     except KeyError:
+    #         pass

{nkululeko-0.85.2 → nkululeko-0.86.1}/nkululeko/feat_extract/featureset.py RENAMED Viewed

@@ -15,7 +15,7 @@ class Featureset:
         self.name = name
         self.data_df = data_df
         self.util = Util("featureset")
-        self.feats_types = feats_type
+        self.feats_type = feats_type
     def extract(self):
         pass
@@ -25,8 +25,7 @@ class Featureset:
         self.df = self.df[self.df.index.isin(self.data_df.index)]
         try:
             # use only some features
-            selected_features = ast.literal_eval(
-                glob_conf.config["FEATS"]["features"])
+            selected_features = ast.literal_eval(glob_conf.config["FEATS"]["features"])
             self.util.debug(f"selecting features: {selected_features}")
             sel_feats_df = pd.DataFrame()
             hit = False
@@ -35,11 +34,12 @@ class Featureset:
                     sel_feats_df[feat] = self.df[feat]
                     hit = True
                 except KeyError:
+                    self.util.warn(f"non existent feature in {self.feats_type}: {feat}")
                     pass
             if hit:
                 self.df = sel_feats_df
                 self.util.debug(
-                    f"new feats shape after selecting features: {self.df.shape}"
+                    f"new feats shape after selecting features for {self.feats_type}: {self.df.shape}"
                 )
         except KeyError:
             pass

nkululeko 0.85.2__tar.gz → 0.86.1__tar.gz

nkululeko 0.85.2__tar.gz → 0.86.1__tar.gz