PyPI - nkululeko - Versions diffs - 0.93.10__py3-none-any.whl → 0.93.12__py3-none-any.whl - Mend

nkululeko 0.93.10__py3-none-any.whl → 0.93.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

nkululeko/constants.py CHANGED Viewed

@@ -1,2 +1,2 @@
-VERSION="0.93.10"
+VERSION="0.93.12"
 SAMPLING_RATE = 16000

nkululeko/data/dataset.py CHANGED Viewed

@@ -34,9 +34,11 @@ class Dataset:
         self.plot = Plots()
         self.limit = int(self.util.config_val_data(self.name, "limit", 0))
         self.target_tables_append = eval(
-            self.util.config_val_data(self.name, "target_tables_append", "False")
+            self.util.config_val_data(
+                self.name, "target_tables_append", "False")
         )
-        self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
+        self.start_fresh = eval(
+            self.util.config_val("DATA", "no_reuse", "False"))
         self.is_labeled, self.got_speaker, self.got_gender, self.got_age = (
             False,
             False,
@@ -70,7 +72,8 @@ class Dataset:
         try:
             self.db = audformat.Database.load(root)
         except FileNotFoundError:
-            self.util.error(f"{self.name}: no audformat database found at {root}")
+            self.util.error(
+                f"{self.name}: no audformat database found at {root}")
         return root
     def _check_cols(self, df):
@@ -92,7 +95,8 @@ class Dataset:
         )
         self.util.debug(r_string)
         if glob_conf.report.initial:
-            glob_conf.report.add_item(ReportItem("Data", "Load report", r_string))
+            glob_conf.report.add_item(ReportItem(
+                "Data", "Load report", r_string))
             glob_conf.report.initial = False
     def load(self):
@@ -103,7 +107,8 @@ class Dataset:
         store_file = f"{store}{self.name}.{store_format}"
         self.root = self._load_db()
         if not self.start_fresh and os.path.isfile(store_file):
-            self.util.debug(f"{self.name}: reusing previously stored file {store_file}")
+            self.util.debug(
+                f"{self.name}: reusing previously stored file {store_file}")
             self.df = self.util.get_store(store_file, store_format)
             self.is_labeled = self.target in self.df
             self.got_gender = "gender" in self.df
@@ -118,10 +123,12 @@ class Dataset:
         # map the audio file paths
         self.db.map_files(lambda x: os.path.join(self.root, x))
         # the dataframes (potentially more than one) with at least the file names
-        df_files = self.util.config_val_data(self.name, "files_tables", "['files']")
+        df_files = self.util.config_val_data(
+            self.name, "files_tables", "['files']")
         df_files_tables = ast.literal_eval(df_files)
         # The label for the target column
-        self.col_label = self.util.config_val_data(self.name, "label", self.target)
+        self.col_label = self.util.config_val_data(
+            self.name, "label", self.target)
         (
             df,
             self.is_labeled,
@@ -157,7 +164,8 @@ class Dataset:
                 self.got_age = got_age2 or self.got_age
                 if audformat.is_filewise_index(df_target.index):
                     try:
-                        df_target = df_target.loc[df.index.get_level_values("file")]
+                        df_target = df_target.loc[df.index.get_level_values(
+                            "file")]
                         df_target = df_target.set_index(df.index)
                     except KeyError:
                         # just a try...
@@ -206,7 +214,8 @@ class Dataset:
             end = self.df.index.get_level_values(2)
             self.df["duration"] = (end - start).total_seconds()
         elif self.df.duration.dtype == "timedelta64[ns]":
-            self.df["duration"] = self.df["duration"].map(lambda x: x.total_seconds())
+            self.df["duration"] = self.df["duration"].map(
+                lambda x: x.total_seconds())
         # Perform some filtering if desired
         required = self.util.config_val_data(self.name, "required", False)
         if required:
@@ -236,15 +245,18 @@ class Dataset:
                 res.append(abs(n - max))
             return res
-        reverse = eval(self.util.config_val_data(self.name, "reverse", "False"))
+        reverse = eval(self.util.config_val_data(
+            self.name, "reverse", "False"))
         if reverse:
-            max = eval(self.util.config_val_data(self.name, "reverse.max", "False"))
+            max = eval(self.util.config_val_data(
+                self.name, "reverse.max", "False"))
             if max:
                 max = float(max)
             else:
                 max = self.df[self.target].values.max()
             self.util.debug(f"reversing target numbers with max values: {max}")
-            self.df[self.target] = reverse_array(self.df[self.target].values, max)
+            self.df[self.target] = reverse_array(
+                self.df[self.target].values, max)
         # check if the target variable should be scaled (z-transformed)
         scale = self.util.config_val_data(self.name, "scale", False)
@@ -317,13 +329,15 @@ class Dataset:
                 pass
             try:
                 # also it might be possible that the age is part of the speaker description
-                df_local["age"] = db[table]["speaker"].get(map="age").astype(int)
+                df_local["age"] = db[table]["speaker"].get(
+                    map="age").astype(int)
                 got_age = True
             except (ValueError, audformat.errors.BadKeyError):
                 pass
             try:
                 # same for the target, e.g. "age"
-                df_local[self.target] = db[table]["speaker"].get(map=self.target)
+                df_local[self.target] = db[table]["speaker"].get(
+                    map=self.target)
                 is_labeled = True
             except (ValueError, audformat.core.errors.BadKeyError):
                 pass
@@ -384,8 +398,10 @@ class Dataset:
             testdf = self.db.tables[self.target + ".test"].df
             traindf = self.db.tables[self.target + ".train"].df
             # use only the train and test samples that were not perhaps filtered out by an earlier processing step
-            self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
-            self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
+            self.df_test = self.df.loc[self.df.index.intersection(
+                testdf.index)]
+            self.df_train = self.df.loc[self.df.index.intersection(
+                traindf.index)]
         elif split_strategy == "train":
             self.df_train = self.df
             self.df_test = pd.DataFrame()
@@ -408,18 +424,23 @@ class Dataset:
             if entry_train_tables:
                 train_tables = ast.literal_eval(entry_train_tables)
                 for train_table in train_tables:
-                    traindf = pd.concat([traindf, self.db.tables[train_table].df])
+                    traindf = pd.concat(
+                        [traindf, self.db.tables[train_table].df])
             # use only the train and test samples that were not perhaps filtered out by an earlier processing step
             # testdf.index.map(lambda x: os.path.join(self.root, x))
             #            testdf.index = testdf.index.to_series().apply(lambda x: self.root+x)
             testdf = testdf.set_index(
-                audformat.utils.to_segmented_index(testdf.index, allow_nat=False)
+                audformat.utils.to_segmented_index(
+                    testdf.index, allow_nat=False)
             )
             traindf = traindf.set_index(
-                audformat.utils.to_segmented_index(traindf.index, allow_nat=False)
+                audformat.utils.to_segmented_index(
+                    traindf.index, allow_nat=False)
             )
-            self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
-            self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
+            self.df_test = self.df.loc[self.df.index.intersection(
+                testdf.index)]
+            self.df_train = self.df.loc[self.df.index.intersection(
+                traindf.index)]
             # it might be necessary to copy the target values
             try:
                 self.df_test[self.target] = testdf[self.target]
@@ -446,10 +467,12 @@ class Dataset:
             self.util.error(f"unknown split strategy: {split_strategy}")
         # check if train or test set should be ignored
-        as_test = eval(self.util.config_val_data(self.name, "as_test", "False"))
+        as_test = eval(self.util.config_val_data(
+            self.name, "as_test", "False"))
         if as_test:
             self.df_train = pd.DataFrame()
-        as_train = eval(self.util.config_val_data(self.name, "as_train", "False"))
+        as_train = eval(self.util.config_val_data(
+            self.name, "as_train", "False"))
         if as_train:
             self.df_test = pd.DataFrame()
@@ -480,7 +503,8 @@ class Dataset:
         seed = 42
         k = 30
-        test_size = int(self.util.config_val_data(self.name, "test_size", 20)) / 100.0
+        test_size = int(self.util.config_val_data(
+            self.name, "test_size", 20)) / 100.0
         df = self.df
         # split target
         targets = df[self.target].to_numpy()
@@ -496,7 +520,8 @@ class Dataset:
         stratif_vars = self.util.config_val("DATA", "balance", False)
         stratif_vars_array = {}
         if not stratif_vars:
-            self.util.error("balanced split needs stratif_vars to stratify the splits")
+            self.util.error(
+                "balanced split needs stratif_vars to stratify the splits")
         else:
             stratif_vars = ast.literal_eval(stratif_vars)
             for stratif_var in stratif_vars.keys():
@@ -505,7 +530,8 @@ class Dataset:
                     continue
                 else:
                     data = df[stratif_var].to_numpy()
-                    bins = self.util.config_val("DATA", f"{stratif_var}_bins", False)
+                    bins = self.util.config_val(
+                        "DATA", f"{stratif_var}_bins", False)
                     if bins:
                         data = binning(data, nbins=int(bins))
                     stratif_vars_array[stratif_var] = data
@@ -556,7 +582,8 @@ class Dataset:
     def split_speakers(self):
         """One way to split train and eval sets: Specify percentage of evaluation speakers"""
-        test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
+        test_percent = int(self.util.config_val_data(
+            self.name, "test_size", 20))
         df = self.df
         s_num = df.speaker.nunique()
         test_num = int(s_num * (test_percent / 100))
@@ -575,7 +602,8 @@ class Dataset:
     def random_split(self):
         """One way to split train and eval sets: Specify percentage of random samples"""
-        test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
+        test_percent = int(self.util.config_val_data(
+            self.name, "test_size", 20))
         df = self.df
         s_num = len(df)
         test_num = int(s_num * (test_percent / 100))
@@ -676,8 +704,11 @@ class Dataset:
     def map_continuous_classification(self, df):
         """Map labels to bins for continuous data that should be classified"""
+        if df.empty:
+            return
         if self.check_continuous_classification():
-            self.util.debug(f"{self.name}: binning continuous variable to categories")
+            self.util.debug(
+                f"{self.name}: binning continuous variable to categories")
             cat_vals = self.util.continuous_to_categorical(df[self.target])
             df[self.target] = cat_vals.values
             labels = ast.literal_eval(glob_conf.config["DATA"]["labels"])

nkululeko/plots.py CHANGED Viewed

@@ -628,8 +628,7 @@ class Plots:
         # one up because of the runs
         fig_dir = self.util.get_path("fig_dir") + "../"
         exp_name = self.util.get_exp_name(only_data=True)
-        format = self.util.config_val("PLOT", "format", "png")
-        filename = f"{fig_dir}{exp_name}EXPL_tree-plot.{format}"
+        filename = f"{fig_dir}{exp_name}EXPL_tree-plot.{self.format}"
         fig = ax.figure
         fig.savefig(filename)
         fig.clear()

nkululeko/segment.py CHANGED Viewed

@@ -62,6 +62,11 @@ def main():
     expr.fill_train_and_tests()
     util.debug(f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")
+    def calc_dur(x):
+        starts = x[1]
+        ends = x[2]
+        return (ends - starts).total_seconds()
     # segment
     segmented_file = util.config_val("SEGMENT", "result", "segmented.csv")
@@ -104,16 +109,11 @@ def main():
             df_seg = df_seg.drop(columns=[target])
             df_seg = df_seg.rename(columns={"class_label": target})
         # save file
+        df_seg["duration"] = df_seg.index.to_series().map(lambda x: calc_dur(x))
         df_seg.to_csv(f"{expr.data_dir}/{segmented_file}")
-    def calc_dur(x):
-        starts = x[1]
-        ends = x[2]
-        return (ends - starts).total_seconds()
     if "duration" not in df.columns:
         df["duration"] = df.index.to_series().map(lambda x: calc_dur(x))
-    df_seg["duration"] = df_seg.index.to_series().map(lambda x: calc_dur(x))
     num_before = df.shape[0]
     num_after = df_seg.shape[0]
     util.debug(

{nkululeko-0.93.10.dist-info → nkululeko-0.93.12.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: nkululeko
-Version: 0.93.10
+Version: 0.93.12
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt
@@ -63,7 +63,7 @@ Requires-Dist: pylatex
 ## Overview
 A project to detect speaker characteristics by machine learning experiments with a high-level interface.
-The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
+The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
 * NEW with nkululeko: [Ensemble learning](http://blog.syntheticspeech.de/2024/06/25/nkululeko-ensemble-classifiers-with-late-fusion/)
 * NEW: [Finetune transformer-models](http://blog.syntheticspeech.de/2024/05/29/nkululeko-how-to-finetune-a-transformer-model/)
@@ -353,8 +353,16 @@ If you use it, please mention the Nkululeko paper:
 Changelog
 =========
+Version 0.93.12
+---------------
+* bugfix: map_continuous_to_cat crashed on empty data
+Version 0.93.11
+---------------
+* bugfix: silero segmenter assigned file duration values
 Version 0.93.10
---------------
+---------------
 * added nan check for imported features
 * added LOGO result output

{nkululeko-0.93.10.dist-info → nkululeko-0.93.12.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ nkululeko/__init__.py,sha256=62f8HiEzJ8rG2QlTFJXUCMpvuH3fKI33DoJSj33mscc,63
 nkululeko/aug_train.py,sha256=FoMbBrfyOZd4QAw7oIHl3X6-UpsqAKWVDIolCA7qOWs,3196
 nkululeko/augment.py,sha256=3RzaxB3gRxovgJVjHXi0glprW01J7RaHhUkqotW2T3U,2955
 nkululeko/cacheddataset.py,sha256=XFpWZmbJRg0pvhnIgYf0TkclxllD-Fctu-Ol0PF_00c,969
-nkululeko/constants.py,sha256=0grSx0I2K13N--2KpgQU90VQf94GJLPJXHFLbHVPDjI,40
+nkululeko/constants.py,sha256=T9YZzqdg_ltKpuIf2XZdrqFmmlZQmhak97DpM5GQQhI,40
 nkululeko/demo-ft.py,sha256=iD9Pzp9QjyAv31q1cDZ75vPez7Ve8A4Cfukv5yfZdrQ,770
 nkululeko/demo.py,sha256=4Yzhg6pCPBYPGJrP7JX2TysVosl_R1llpVDKc2P_gUA,4955
 nkululeko/demo_feats.py,sha256=BvZjeNFTlERIRlq34OHM4Z96jdDQAhB01BGQAUcX9dM,2026
@@ -20,12 +20,12 @@ nkululeko/modelrunner.py,sha256=lJy-xM4QfDDWeL0dLTE_VIb4sYrnd_Z_yJRK3wwohQA,1119
 nkululeko/multidb.py,sha256=sO6OwJn8sn1-C-ig3thsIL8QMWHdV9SnJhDodKjeKrI,6876
 nkululeko/nkuluflag.py,sha256=PGWSmZz-PiiHLgcZJAoGOI_Y-sZDVI1ksB8p5r7riWM,3725
 nkululeko/nkululeko.py,sha256=M7baIq2nAoi6dEoBL4ATEuqAs5U1fvl_hyqAl5DybAQ,2040
-nkululeko/plots.py,sha256=zHWZ8Ns_0SLOEdbDVulObpRPoXRw_qqPXJv2dM08EeE,26049
+nkululeko/plots.py,sha256=Mm30pDLBb55iE9SYaSg76KFBKnebZTlypFQIBo26wuY,25991
 nkululeko/predict.py,sha256=MLnHEyFmSiHLLs-HDczag8Vu3zKF5T1rXLKdZZJ6py8,2083
 nkululeko/resample.py,sha256=rn3-M1A-iwVGibfQNGyeYNa7briD24lIN9Szq_1uTJo,5194
 nkululeko/runmanager.py,sha256=AswmORVUkCIH0gTx6zEyufvFATQBS8C5TXo2erSNdVg,7611
 nkululeko/scaler.py,sha256=7VOZ4sREMoQtahfETt9RyuR29Fb7PCwxlYVjBbdCVFc,4101
-nkululeko/segment.py,sha256=DRjC6b7SeInYgwBcDPXpTXPvXPS-J8kFQO7H095bK80,4945
+nkululeko/segment.py,sha256=7UrJEwdLmh9wDL5iBwpdJyJm9dwSxidHrHt-_D2qtxw,4949
 nkululeko/syllable_nuclei.py,sha256=5w_naKxNxz66a_qLkraemi2fggM-gWesiiBPS47iFcE,9931
 nkululeko/test.py,sha256=1w624vo5KTzmFC8BUStGlLDmIEAFuJUz7J0W-gp7AxI,1677
 nkululeko/test_predictor.py,sha256=DEHE_D3A6m6KJTrpDKceA1n655t_UZV3WQd57K4a3Ho,2863
@@ -49,7 +49,7 @@ nkululeko/autopredict/ap_stoi.py,sha256=UEQg1ZV0meAsxgdWB8ieRs9GPXHqArmsaOyCGRwp
 nkululeko/autopredict/ap_valence.py,sha256=WrW4Ltqi_odW49_4QEVKkfnrcztLIVZ4cXIEHu4dBN8,1026
 nkululeko/autopredict/estimate_snr.py,sha256=1k9-XadABudnsNOeFZD_Fg0E64-GUQVS7JEp82MLQS4,4995
 nkululeko/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nkululeko/data/dataset.py,sha256=G4jzD2MvzB7d6Oja_pUIdShFr7Qsbs0ogGzuTcyQfLo,30041
+nkululeko/data/dataset.py,sha256=H65rvQ8sPwEwv_T-FqOEa7FeQ3JBn88v3xOzBOjARe4,30582
 nkululeko/data/dataset_csv.py,sha256=p2b4eS5R2Q5zdOIc56NRRU2PTFXSRt0qrdHGafHkWKo,4830
 nkululeko/feat_extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/feat_extract/feats_agender.py,sha256=onfAQ6-xx_mFMJXEF1IX8cHBmGtGeX6weJmxbkfh1_o,3184
@@ -112,9 +112,9 @@ nkululeko/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/utils/files.py,sha256=SrrYaU7AB80MZHiV1jcB0h_zigvYLYgSVNTXV4ao38g,4593
 nkululeko/utils/stats.py,sha256=vCRzhCR0Gx5SiJyAGbj1TIto8ocGz58CM5Pr3LltagA,2948
 nkululeko/utils/util.py,sha256=wFDslqxpCVDwi6LBakIFDDy1kYsxt5G7ykE38CocmtA,16880
-nkululeko-0.93.10.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
-nkululeko-0.93.10.dist-info/METADATA,sha256=vO975smqhzWtD-pm5SqiEqJV6RBxPy4UTHxGz4d6Ta4,42644
-nkululeko-0.93.10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-nkululeko-0.93.10.dist-info/entry_points.txt,sha256=lNTkFEdh6Kjo5o95ZAWf_0Lq-4ztGoAoMVSDuPtuyS0,442
-nkululeko-0.93.10.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
-nkululeko-0.93.10.dist-info/RECORD,,
+nkululeko-0.93.12.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
+nkululeko-0.93.12.dist-info/METADATA,sha256=jjO-vG0POWF3v_v3QfliA4uT8jzZPikwmEj2O6v6rhg,42823
+nkululeko-0.93.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+nkululeko-0.93.12.dist-info/entry_points.txt,sha256=lNTkFEdh6Kjo5o95ZAWf_0Lq-4ztGoAoMVSDuPtuyS0,442
+nkululeko-0.93.12.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
+nkululeko-0.93.12.dist-info/RECORD,,

{nkululeko-0.93.10.dist-info → nkululeko-0.93.12.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{nkululeko-0.93.10.dist-info → nkululeko-0.93.12.dist-info}/LICENSE RENAMED Viewed

File without changes

{nkululeko-0.93.10.dist-info → nkululeko-0.93.12.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nkululeko-0.93.10.dist-info → nkululeko-0.93.12.dist-info}/top_level.txt RENAMED Viewed

File without changes

nkululeko 0.93.10__py3-none-any.whl → 0.93.12__py3-none-any.whl

nkululeko 0.93.10__py3-none-any.whl → 0.93.12__py3-none-any.whl