PyPI - nkululeko - Versions diffs - 0.93.11__py3-none-any.whl → 0.93.12__py3-none-any.whl - Mend

nkululeko 0.93.11__py3-none-any.whl → 0.93.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

nkululeko/constants.py CHANGED Viewed

@@ -1,2 +1,2 @@
-VERSION="0.93.11"
+VERSION="0.93.12"
 SAMPLING_RATE = 16000

nkululeko/data/dataset.py CHANGED Viewed

@@ -34,9 +34,11 @@ class Dataset:
         self.plot = Plots()
         self.limit = int(self.util.config_val_data(self.name, "limit", 0))
         self.target_tables_append = eval(
-            self.util.config_val_data(self.name, "target_tables_append", "False")
+            self.util.config_val_data(
+                self.name, "target_tables_append", "False")
         )
-        self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
+        self.start_fresh = eval(
+            self.util.config_val("DATA", "no_reuse", "False"))
         self.is_labeled, self.got_speaker, self.got_gender, self.got_age = (
             False,
             False,
@@ -70,7 +72,8 @@ class Dataset:
         try:
             self.db = audformat.Database.load(root)
         except FileNotFoundError:
-            self.util.error(f"{self.name}: no audformat database found at {root}")
+            self.util.error(
+                f"{self.name}: no audformat database found at {root}")
         return root
     def _check_cols(self, df):
@@ -92,7 +95,8 @@ class Dataset:
         )
         self.util.debug(r_string)
         if glob_conf.report.initial:
-            glob_conf.report.add_item(ReportItem("Data", "Load report", r_string))
+            glob_conf.report.add_item(ReportItem(
+                "Data", "Load report", r_string))
             glob_conf.report.initial = False
     def load(self):
@@ -103,7 +107,8 @@ class Dataset:
         store_file = f"{store}{self.name}.{store_format}"
         self.root = self._load_db()
         if not self.start_fresh and os.path.isfile(store_file):
-            self.util.debug(f"{self.name}: reusing previously stored file {store_file}")
+            self.util.debug(
+                f"{self.name}: reusing previously stored file {store_file}")
             self.df = self.util.get_store(store_file, store_format)
             self.is_labeled = self.target in self.df
             self.got_gender = "gender" in self.df
@@ -118,10 +123,12 @@ class Dataset:
         # map the audio file paths
         self.db.map_files(lambda x: os.path.join(self.root, x))
         # the dataframes (potentially more than one) with at least the file names
-        df_files = self.util.config_val_data(self.name, "files_tables", "['files']")
+        df_files = self.util.config_val_data(
+            self.name, "files_tables", "['files']")
         df_files_tables = ast.literal_eval(df_files)
         # The label for the target column
-        self.col_label = self.util.config_val_data(self.name, "label", self.target)
+        self.col_label = self.util.config_val_data(
+            self.name, "label", self.target)
         (
             df,
             self.is_labeled,
@@ -157,7 +164,8 @@ class Dataset:
                 self.got_age = got_age2 or self.got_age
                 if audformat.is_filewise_index(df_target.index):
                     try:
-                        df_target = df_target.loc[df.index.get_level_values("file")]
+                        df_target = df_target.loc[df.index.get_level_values(
+                            "file")]
                         df_target = df_target.set_index(df.index)
                     except KeyError:
                         # just a try...
@@ -206,7 +214,8 @@ class Dataset:
             end = self.df.index.get_level_values(2)
             self.df["duration"] = (end - start).total_seconds()
         elif self.df.duration.dtype == "timedelta64[ns]":
-            self.df["duration"] = self.df["duration"].map(lambda x: x.total_seconds())
+            self.df["duration"] = self.df["duration"].map(
+                lambda x: x.total_seconds())
         # Perform some filtering if desired
         required = self.util.config_val_data(self.name, "required", False)
         if required:
@@ -236,15 +245,18 @@ class Dataset:
                 res.append(abs(n - max))
             return res
-        reverse = eval(self.util.config_val_data(self.name, "reverse", "False"))
+        reverse = eval(self.util.config_val_data(
+            self.name, "reverse", "False"))
         if reverse:
-            max = eval(self.util.config_val_data(self.name, "reverse.max", "False"))
+            max = eval(self.util.config_val_data(
+                self.name, "reverse.max", "False"))
             if max:
                 max = float(max)
             else:
                 max = self.df[self.target].values.max()
             self.util.debug(f"reversing target numbers with max values: {max}")
-            self.df[self.target] = reverse_array(self.df[self.target].values, max)
+            self.df[self.target] = reverse_array(
+                self.df[self.target].values, max)
         # check if the target variable should be scaled (z-transformed)
         scale = self.util.config_val_data(self.name, "scale", False)
@@ -317,13 +329,15 @@ class Dataset:
                 pass
             try:
                 # also it might be possible that the age is part of the speaker description
-                df_local["age"] = db[table]["speaker"].get(map="age").astype(int)
+                df_local["age"] = db[table]["speaker"].get(
+                    map="age").astype(int)
                 got_age = True
             except (ValueError, audformat.errors.BadKeyError):
                 pass
             try:
                 # same for the target, e.g. "age"
-                df_local[self.target] = db[table]["speaker"].get(map=self.target)
+                df_local[self.target] = db[table]["speaker"].get(
+                    map=self.target)
                 is_labeled = True
             except (ValueError, audformat.core.errors.BadKeyError):
                 pass
@@ -384,8 +398,10 @@ class Dataset:
             testdf = self.db.tables[self.target + ".test"].df
             traindf = self.db.tables[self.target + ".train"].df
             # use only the train and test samples that were not perhaps filtered out by an earlier processing step
-            self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
-            self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
+            self.df_test = self.df.loc[self.df.index.intersection(
+                testdf.index)]
+            self.df_train = self.df.loc[self.df.index.intersection(
+                traindf.index)]
         elif split_strategy == "train":
             self.df_train = self.df
             self.df_test = pd.DataFrame()
@@ -408,18 +424,23 @@ class Dataset:
             if entry_train_tables:
                 train_tables = ast.literal_eval(entry_train_tables)
                 for train_table in train_tables:
-                    traindf = pd.concat([traindf, self.db.tables[train_table].df])
+                    traindf = pd.concat(
+                        [traindf, self.db.tables[train_table].df])
             # use only the train and test samples that were not perhaps filtered out by an earlier processing step
             # testdf.index.map(lambda x: os.path.join(self.root, x))
             #            testdf.index = testdf.index.to_series().apply(lambda x: self.root+x)
             testdf = testdf.set_index(
-                audformat.utils.to_segmented_index(testdf.index, allow_nat=False)
+                audformat.utils.to_segmented_index(
+                    testdf.index, allow_nat=False)
             )
             traindf = traindf.set_index(
-                audformat.utils.to_segmented_index(traindf.index, allow_nat=False)
+                audformat.utils.to_segmented_index(
+                    traindf.index, allow_nat=False)
             )
-            self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
-            self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
+            self.df_test = self.df.loc[self.df.index.intersection(
+                testdf.index)]
+            self.df_train = self.df.loc[self.df.index.intersection(
+                traindf.index)]
             # it might be necessary to copy the target values
             try:
                 self.df_test[self.target] = testdf[self.target]
@@ -446,10 +467,12 @@ class Dataset:
             self.util.error(f"unknown split strategy: {split_strategy}")
         # check if train or test set should be ignored
-        as_test = eval(self.util.config_val_data(self.name, "as_test", "False"))
+        as_test = eval(self.util.config_val_data(
+            self.name, "as_test", "False"))
         if as_test:
             self.df_train = pd.DataFrame()
-        as_train = eval(self.util.config_val_data(self.name, "as_train", "False"))
+        as_train = eval(self.util.config_val_data(
+            self.name, "as_train", "False"))
         if as_train:
             self.df_test = pd.DataFrame()
@@ -480,7 +503,8 @@ class Dataset:
         seed = 42
         k = 30
-        test_size = int(self.util.config_val_data(self.name, "test_size", 20)) / 100.0
+        test_size = int(self.util.config_val_data(
+            self.name, "test_size", 20)) / 100.0
         df = self.df
         # split target
         targets = df[self.target].to_numpy()
@@ -496,7 +520,8 @@ class Dataset:
         stratif_vars = self.util.config_val("DATA", "balance", False)
         stratif_vars_array = {}
         if not stratif_vars:
-            self.util.error("balanced split needs stratif_vars to stratify the splits")
+            self.util.error(
+                "balanced split needs stratif_vars to stratify the splits")
         else:
             stratif_vars = ast.literal_eval(stratif_vars)
             for stratif_var in stratif_vars.keys():
@@ -505,7 +530,8 @@ class Dataset:
                     continue
                 else:
                     data = df[stratif_var].to_numpy()
-                    bins = self.util.config_val("DATA", f"{stratif_var}_bins", False)
+                    bins = self.util.config_val(
+                        "DATA", f"{stratif_var}_bins", False)
                     if bins:
                         data = binning(data, nbins=int(bins))
                     stratif_vars_array[stratif_var] = data
@@ -556,7 +582,8 @@ class Dataset:
     def split_speakers(self):
         """One way to split train and eval sets: Specify percentage of evaluation speakers"""
-        test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
+        test_percent = int(self.util.config_val_data(
+            self.name, "test_size", 20))
         df = self.df
         s_num = df.speaker.nunique()
         test_num = int(s_num * (test_percent / 100))
@@ -575,7 +602,8 @@ class Dataset:
     def random_split(self):
         """One way to split train and eval sets: Specify percentage of random samples"""
-        test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
+        test_percent = int(self.util.config_val_data(
+            self.name, "test_size", 20))
         df = self.df
         s_num = len(df)
         test_num = int(s_num * (test_percent / 100))
@@ -676,8 +704,11 @@ class Dataset:
     def map_continuous_classification(self, df):
         """Map labels to bins for continuous data that should be classified"""
+        if df.empty:
+            return
         if self.check_continuous_classification():
-            self.util.debug(f"{self.name}: binning continuous variable to categories")
+            self.util.debug(
+                f"{self.name}: binning continuous variable to categories")
             cat_vals = self.util.continuous_to_categorical(df[self.target])
             df[self.target] = cat_vals.values
             labels = ast.literal_eval(glob_conf.config["DATA"]["labels"])

{nkululeko-0.93.11.dist-info → nkululeko-0.93.12.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: nkululeko
-Version: 0.93.11
+Version: 0.93.12
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt
@@ -63,7 +63,7 @@ Requires-Dist: pylatex
 ## Overview
 A project to detect speaker characteristics by machine learning experiments with a high-level interface.
-The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
+The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
 * NEW with nkululeko: [Ensemble learning](http://blog.syntheticspeech.de/2024/06/25/nkululeko-ensemble-classifiers-with-late-fusion/)
 * NEW: [Finetune transformer-models](http://blog.syntheticspeech.de/2024/05/29/nkululeko-how-to-finetune-a-transformer-model/)
@@ -353,12 +353,16 @@ If you use it, please mention the Nkululeko paper:
 Changelog
 =========
+Version 0.93.12
+---------------
+* bugfix: map_continuous_to_cat crashed on empty data
 Version 0.93.11
---------------
+---------------
 * bugfix: silero segmenter assigned file duration values
 Version 0.93.10
---------------
+---------------
 * added nan check for imported features
 * added LOGO result output

{nkululeko-0.93.11.dist-info → nkululeko-0.93.12.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ nkululeko/__init__.py,sha256=62f8HiEzJ8rG2QlTFJXUCMpvuH3fKI33DoJSj33mscc,63
 nkululeko/aug_train.py,sha256=FoMbBrfyOZd4QAw7oIHl3X6-UpsqAKWVDIolCA7qOWs,3196
 nkululeko/augment.py,sha256=3RzaxB3gRxovgJVjHXi0glprW01J7RaHhUkqotW2T3U,2955
 nkululeko/cacheddataset.py,sha256=XFpWZmbJRg0pvhnIgYf0TkclxllD-Fctu-Ol0PF_00c,969
-nkululeko/constants.py,sha256=RFv_MnQuDew9o2GQ6vQEZEZj1JoIn68nAUZQ1_9S_yw,40
+nkululeko/constants.py,sha256=T9YZzqdg_ltKpuIf2XZdrqFmmlZQmhak97DpM5GQQhI,40
 nkululeko/demo-ft.py,sha256=iD9Pzp9QjyAv31q1cDZ75vPez7Ve8A4Cfukv5yfZdrQ,770
 nkululeko/demo.py,sha256=4Yzhg6pCPBYPGJrP7JX2TysVosl_R1llpVDKc2P_gUA,4955
 nkululeko/demo_feats.py,sha256=BvZjeNFTlERIRlq34OHM4Z96jdDQAhB01BGQAUcX9dM,2026
@@ -49,7 +49,7 @@ nkululeko/autopredict/ap_stoi.py,sha256=UEQg1ZV0meAsxgdWB8ieRs9GPXHqArmsaOyCGRwp
 nkululeko/autopredict/ap_valence.py,sha256=WrW4Ltqi_odW49_4QEVKkfnrcztLIVZ4cXIEHu4dBN8,1026
 nkululeko/autopredict/estimate_snr.py,sha256=1k9-XadABudnsNOeFZD_Fg0E64-GUQVS7JEp82MLQS4,4995
 nkululeko/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nkululeko/data/dataset.py,sha256=G4jzD2MvzB7d6Oja_pUIdShFr7Qsbs0ogGzuTcyQfLo,30041
+nkululeko/data/dataset.py,sha256=H65rvQ8sPwEwv_T-FqOEa7FeQ3JBn88v3xOzBOjARe4,30582
 nkululeko/data/dataset_csv.py,sha256=p2b4eS5R2Q5zdOIc56NRRU2PTFXSRt0qrdHGafHkWKo,4830
 nkululeko/feat_extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/feat_extract/feats_agender.py,sha256=onfAQ6-xx_mFMJXEF1IX8cHBmGtGeX6weJmxbkfh1_o,3184
@@ -112,9 +112,9 @@ nkululeko/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/utils/files.py,sha256=SrrYaU7AB80MZHiV1jcB0h_zigvYLYgSVNTXV4ao38g,4593
 nkululeko/utils/stats.py,sha256=vCRzhCR0Gx5SiJyAGbj1TIto8ocGz58CM5Pr3LltagA,2948
 nkululeko/utils/util.py,sha256=wFDslqxpCVDwi6LBakIFDDy1kYsxt5G7ykE38CocmtA,16880
-nkululeko-0.93.11.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
-nkululeko-0.93.11.dist-info/METADATA,sha256=pMKMenPiE34afU4cUaCXCsi6wqi8OJH5YsHp9Q2pmos,42733
-nkululeko-0.93.11.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
-nkululeko-0.93.11.dist-info/entry_points.txt,sha256=lNTkFEdh6Kjo5o95ZAWf_0Lq-4ztGoAoMVSDuPtuyS0,442
-nkululeko-0.93.11.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
-nkululeko-0.93.11.dist-info/RECORD,,
+nkululeko-0.93.12.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
+nkululeko-0.93.12.dist-info/METADATA,sha256=jjO-vG0POWF3v_v3QfliA4uT8jzZPikwmEj2O6v6rhg,42823
+nkululeko-0.93.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+nkululeko-0.93.12.dist-info/entry_points.txt,sha256=lNTkFEdh6Kjo5o95ZAWf_0Lq-4ztGoAoMVSDuPtuyS0,442
+nkululeko-0.93.12.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
+nkululeko-0.93.12.dist-info/RECORD,,

{nkululeko-0.93.11.dist-info → nkululeko-0.93.12.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.7.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{nkululeko-0.93.11.dist-info → nkululeko-0.93.12.dist-info}/LICENSE RENAMED Viewed

File without changes

{nkululeko-0.93.11.dist-info → nkululeko-0.93.12.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nkululeko-0.93.11.dist-info → nkululeko-0.93.12.dist-info}/top_level.txt RENAMED Viewed

File without changes

nkululeko 0.93.11__py3-none-any.whl → 0.93.12__py3-none-any.whl

nkululeko 0.93.11__py3-none-any.whl → 0.93.12__py3-none-any.whl