PyPI - nkululeko - Versions diffs - 0.65.2__py3-none-any.whl → 0.65.5__py3-none-any.whl - Mend

nkululeko 0.65.2__py3-none-any.whl → 0.65.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

nkululeko/constants.py +2 -2
nkululeko/data/dataset.py +5 -2
nkululeko/data/dataset_csv.py +8 -6
nkululeko/export.py +35 -19
nkululeko/feat_extract/feats_wav2vec2.py +1 -1
nkululeko/plots.py +6 -0
nkululeko/utils/stats.py +13 -2
{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/METADATA +14 -1
{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/RECORD +12 -12
{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/LICENSE +0 -0
{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/WHEEL +0 -0
{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/top_level.txt +0 -0

nkululeko/constants.py CHANGED Viewed

@@ -1,2 +1,2 @@
-VERSION="0.65.2"
-SAMPLING_RATE=16000
+VERSION="0.65.5"
+SAMPLING_RATE=16000

nkululeko/data/dataset.py CHANGED Viewed

@@ -159,7 +159,7 @@ class Dataset:
             self.df["duration"] = (end - start).total_seconds()
         # Perform some filtering if desired
-        required = eval(self.util.config_val_data(self.name, "required", "False"))
+        required = self.util.config_val_data(self.name, "required", False)
         if required:
             pre = self.df.shape[0]
             self.df = self.df[self.df[required].notna()]
@@ -204,7 +204,10 @@ class Dataset:
                 pass
             try:
                 # try to get the gender values
-                df_local["gender"] = source_df["gender"]
+                if "gender" in source_df:
+                    df_local["gender"] = source_df["gender"]
+                else:
+                    df_local["gender"] = source_df["sex"]
                 got_gender = True
             except (KeyError, ValueError, audformat.errors.BadKeyError) as e:
                 pass

nkululeko/data/dataset_csv.py CHANGED Viewed

@@ -48,21 +48,23 @@ class Dataset_CSV(Dataset):
         self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
         if self.is_labeled and not "class_label" in self.df.columns:
             self.df["class_label"] = self.df[self.target]
-        if "gender" in df.columns:
+        if "gender" in self.df.columns:
             self.got_gender = True
-        if "age" in df.columns:
+        elif "sex" in self.df.columns:
+            self.df = self.df.rename(columns={'sex':'gender'})
+            self.got_gender = True
+        if "age" in self.df.columns:
             self.got_age = True
-        if "speaker" in df.columns:
+        if "speaker" in self.df.columns:
             self.got_speaker = True
-            ns = df["speaker"].nunique()
-            self.util.debug(f"num of speakers: {ns}")
         speaker_num = 0
         if self.got_speaker:
             speaker_num = self.df.speaker.nunique()
         self.util.debug(
             f"Loaded database {self.name} with {df.shape[0]} "
             f"samples: got targets: {self.got_target}, got speakers: {self.got_speaker} ({speaker_num}), "
-            f"got sexes: {self.got_gender}"
+            f"got sexes: {self.got_gender}, "
+            f"got age: {self.got_age}"
         )
     def prepare(self):

nkululeko/export.py CHANGED Viewed

@@ -10,7 +10,7 @@ import audiofile
 from nkululeko.experiment import Experiment
 from nkululeko.util import Util
 from nkululeko.constants import VERSION
+import shutil
 def main(src_dir):
     parser = argparse.ArgumentParser(description="Call the nkululeko framework.")
@@ -54,6 +54,7 @@ def main(src_dir):
     target_root = util.config_val("EXPORT", "root", "./exported_data/")
     orig_root = util.config_val("EXPORT", "orig_root", None)
     data_name = util.config_val("EXPORT", "data_name", "export")
+    segments_as_files = eval(util.config_val("EXPORT", "segments_as_files", "False"))
     audeer.mkdir(target_root)
     splits = {"train": df_train, "test": df_test}
     df_all = pd.DataFrame()
@@ -61,25 +62,40 @@ def main(src_dir):
         files = []
         df = splits[split]
         for idx, (file, start, end) in enumerate(df.index.to_list()):
-            signal, sampling_rate = audiofile.read(
-                file,
-                offset=start.total_seconds(),
-                duration=(end - start).total_seconds(),
-                always_2d=True,
-            )
             file_dir = os.path.dirname(file)
-            file_name = os.path.basename(file)
-            wav_folder = (
-                f"{target_root}/{os.path.basename(os.path.normpath(orig_root))}"
-            )
-            audeer.mkdir(wav_folder)
-            new_rel_path = file_dir[file_dir.index(orig_root)+1+len(orig_root):]
-            new_file_path = f"{wav_folder}/{new_rel_path}"
-            audeer.mkdir(new_file_path)
-            new_file_name = f"{new_file_path}/{file_name}"
-            audiofile.write(new_file_name, signal, sampling_rate)
-            new_file_name = os.path.relpath(new_file_name, target_root)
-            files.append(new_file_name)
+            if segments_as_files:
+                signal, sampling_rate = audiofile.read(
+                    file,
+                    offset=start.total_seconds(),
+                    duration=(end - start).total_seconds(),
+                    always_2d=True,
+                )
+                file_name = os.path.basename(file)[:-3]+'_'+start.total_seconds()+'.wav'
+                wav_folder = (
+                    f"{target_root}/{os.path.basename(os.path.normpath(orig_root))}"
+                )
+                audeer.mkdir(wav_folder)
+                new_rel_path = file_dir[file_dir.index(orig_root)+1+len(orig_root):]
+                new_file_path = f"{wav_folder}/{new_rel_path}"
+                audeer.mkdir(new_file_path)
+                new_file_name = f"{new_file_path}/{file_name}"
+                audiofile.write(new_file_name, signal, sampling_rate)
+                new_file_name = os.path.relpath(new_file_name, target_root)
+                files.append(new_file_name)
+            else:
+                file_name = os.path.basename(file)[:-4]+'_'+str(start.total_seconds())+'.wav'
+                wav_folder = (
+                    f"{target_root}/{os.path.basename(os.path.normpath(orig_root))}"
+                )
+                audeer.mkdir(wav_folder)
+                new_rel_path = file_dir[file_dir.index(orig_root)+1+len(orig_root):]
+                new_file_path = f"{wav_folder}/{new_rel_path}"
+                audeer.mkdir(new_file_path)
+                new_file_name = f"{new_file_path}/{file_name}"
+                if not os.path.exists(new_file_name):
+                    shutil.copyfile(file, new_file_name)
+                new_file_name = os.path.relpath(new_file_name, target_root)
+                files.append(new_file_name)
         df = df.set_index(df.index.set_levels(files, level="file"))
         df["split"] = split
         df_all = pd.concat([df_all, df])

nkululeko/feat_extract/feats_wav2vec2.py CHANGED Viewed

@@ -54,7 +54,7 @@ class Wav2vec2(Featureset):
                 signal, sampling_rate = torchaudio.load(file,
                     frame_offset=int(start.total_seconds()*16000),
                     num_frames=int((end - start).total_seconds()*16000))
-                assert sampling_rate == 16000
+                assert sampling_rate == 16000, f"got {sampling_rate} instead of 16000"
                 emb = self.get_embeddings(signal, sampling_rate, file)
                 emb_series[idx] = emb
                 if idx % 10 == 0:

nkululeko/plots.py CHANGED Viewed

@@ -57,6 +57,8 @@ class Plots():
         bin_reals = eval(self.util.config_val('EXPL', 'bin_reals', 'True'))
         for att in attributes:
             if len(att) == 1:
+                if att[0] not in df:
+                    self.util.error(f'unknown feature: {att[0]}')
                 self.util.debug(f'plotting {att[0]}')
                 filename = f'{self.target}-{att[0]}'
                 if self.util.is_categorical(df[att[0]]):
@@ -90,6 +92,10 @@ class Plots():
                 plt.close(fig)
                 # fig.clear()           # avoid error
             elif len(att) == 2:
+                if att[0] not in df:
+                    self.util.error(f'unknown feature: {att[0]}')
+                if att[1] not in df:
+                    self.util.error(f'unknown feature: {att[1]}')
                 self.util.debug(f'plotting {att}')
                 att1 = att[0]
                 att2 = att[1]

nkululeko/utils/stats.py CHANGED Viewed

@@ -3,6 +3,14 @@ import math
 import numpy as np
 import pandas as pd
+def check_na(a):
+    if np.isnan(a).any():
+        count = np.count_nonzero(np.isnan(a))
+        print(f'WARNING: got {count} Nans (of {len(a)}), setting to 0')
+        a[np.isnan(a)] = 0
+        return a
+    else:
+        return a
 def cohen_d(d1, d2):
     """
@@ -13,6 +21,9 @@ def cohen_d(d1, d2):
     Returns:
         Cohen's d with precision 3
     """
+    # Checks:
+    d1 = check_na(d1)
+    d2 = check_na(d2)
     # calculate the size of samples
     n1, n2 = len(d1), len(d2)
     # calculate the variance of the samples
@@ -22,8 +33,8 @@ def cohen_d(d1, d2):
     # calculate the means of the samples
     u1, u2 = np.mean(d1), np.mean(d2)
     # calculate the effect size
-    if math.isnan(s):
-        return 0
+    if math.isnan(s) or s == 0:
+        return -1
     return (int(1000 * np.abs((u1 - u2)) / s)) / 1000

{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nkululeko
-Version: 0.65.2
+Version: 0.65.5
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt
@@ -255,6 +255,19 @@ Nkululeko can be used under the [MIT license](https://choosealicense.com/license
 Changelog
 =========
+Version 0.65.5
+--------------
+* added fill_na in plot effect size
+Version 0.65.4
+--------------
+* added datasets to distribution
+* changes in wav2vec2
+Version 0.65.3
+--------------
+* various bugfixes
 Version 0.65.2
 --------------
 * fixed bug in dataset.csv that prevented correct paths for relative files

{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/RECORD RENAMED Viewed

@@ -2,19 +2,19 @@ nkululeko/__init__.py,sha256=62f8HiEzJ8rG2QlTFJXUCMpvuH3fKI33DoJSj33mscc,63
 nkululeko/augment.py,sha256=paq-vEf02XyaPsLjnCbDJbuZind6M6mm0NWAnK5_PKU,1751
 nkululeko/balancer.py,sha256=64ftZN68sMDfkvuovCDHpAHmSJgCO6Kdk9bwmpSisec,12
 nkululeko/cacheddataset.py,sha256=lIJ6hUo5LoxSrzXtWV8mzwO7wRtUETWnOQ4ws2XfL1E,969
-nkululeko/constants.py,sha256=Nf4gAGehioLVSeUIbSCUwTBi57qq1XZFekfbMmt65n4,37
+nkululeko/constants.py,sha256=KGYcdgRK9DOOWiYr-901PUaOgmccfRuUhfuYAEyFvZM,36
 nkululeko/demo.py,sha256=1JFayJBUqOwj8fDtNF6siuB_5aG6dkb_TeIFPhYv1YI,1858
 nkululeko/demo_predictor.py,sha256=f3tghu3KxrLFPSrMMqT3E_Owy8alAuHEbkoZuahikEE,2310
 nkululeko/experiment.py,sha256=fV-TdDPlFhS8DdALuzme0n9-3HnGwi_GxiTak3u_ZAY,24779
 nkululeko/explore.py,sha256=Ok4Cr1llMQhsP4zCJYWfr5I5HNVuWocGcDnE5QfE9a0,2093
-nkululeko/export.py,sha256=NXSjTLfeUk_T6h9zcIFc63scvSc2vvXDmp1EEUpcyGY,3509
+nkululeko/export.py,sha256=p3EalSrFIJCcZVv_630gY-7QW_WDInkrQzsMpYCL5Fs,4525
 nkululeko/feature_extractor.py,sha256=YRzFJjsE9omCFfr8DNsHj0uiKPd3_NCjHrWpRGmuhmA,6474
 nkululeko/file_checker.py,sha256=Jo3TN6sc_XJNF8x1UYHqfPyaMTues0iuj6_Sal4x0so,3428
 nkululeko/filter_data.py,sha256=LD46OLYfA6UuLvFgaA1LvCVSqmi6JXN2xd_fTZdr-ag,7029
 nkululeko/glob_conf.py,sha256=KOsmB2_9AVoKNyhtCkaamj-ZYCme6-NjTYF-4wlLgOY,241
 nkululeko/modelrunner.py,sha256=NYdV4z9TKhtC9LGhOxXa7aBC2AxRjnFJ42Ah01rsbqg,6025
 nkululeko/nkululeko.py,sha256=SVOY3CPvlmG-16kqV8YOvR2HYCgHkaiVo3GBiwu38W0,1681
-nkululeko/plots.py,sha256=p2J9dOHJxbo0_SSh_TtKBLxD5e96DpQ_TmUwn8GDTHo,12714
+nkululeko/plots.py,sha256=wwUvHIlpP6wYlukYNWtgqhxh6avqo_okslNgoT4S6u0,13023
 nkululeko/predict.py,sha256=3ei4wn2by0p9Vkv7cllMcszmEjSM2vX0T6x_5rlgT28,1851
 nkululeko/reporter.py,sha256=359aeQWt0ZGLseaJnOfafYG8BrwumiM2Q58DWiaoyWQ,10177
 nkululeko/resample.py,sha256=NDZLIhmGPcxSZwzIZul9oeNfbaHfAmzjTvRWs6isIr4,2120
@@ -44,8 +44,8 @@ nkululeko/autopredict/ap_stoi.py,sha256=sx2KSGSjCsjgUvZPFlKALhbZlVg2aTdGmvil-iNS
 nkululeko/autopredict/ap_valence.py,sha256=cyFrKRy68EU5gDiVg0_HxiwaKGbAJ9UuwuiXgVTalAU,997
 nkululeko/autopredict/estimate_snr.py,sha256=m3JZSRGXr2M9yVgovTSZ63rpho9hIUfHOXfAVRAHf1k,4186
 nkululeko/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nkululeko/data/dataset.py,sha256=2brz4yKjUogRy2z7S-VkJpb1G9iG6OtfdiO1yyVEiaw,21127
-nkululeko/data/dataset_csv.py,sha256=RElfRxKrI1IqdPp5pvq2lmL8NIzBsyLEaWEzqjz_zBg,2708
+nkululeko/data/dataset.py,sha256=3h3gL_zxWjkYdujETxJRJhqbFPNlEbG6iaPyr8CQ-oc,21245
+nkululeko/data/dataset_csv.py,sha256=YUAeTqCK-aaoRMTzYpMRB00J5E5P8BRH7Jve4RfEzvU,2818
 nkululeko/feat_extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/feat_extract/feats_agender.py,sha256=ZlfYbpL65SOTLsoqkcNsREa-EyHpigNf8LlFErQH9Ck,2818
 nkululeko/feat_extract/feats_agender_agender.py,sha256=zMY3GucOOx-aBav_9bUSdU63rc2VlFxVZHuVqUKQ6Dk,2904
@@ -63,7 +63,7 @@ nkululeko/feat_extract/feats_praat.py,sha256=VU9hwL0_1XJQW32CYaW0bqTbl9MryQ1hME9
 nkululeko/feat_extract/feats_snr.py,sha256=UHCNgdZj8g69rj-EyujC1gX-BgYkuIoIm63ZR7Xb78w,2790
 nkululeko/feat_extract/feats_squim.py,sha256=8AUNfjWf2MHCDIH0by543fwTVFSybBTYWdQpIlZIMi0,4359
 nkululeko/feat_extract/feats_trill.py,sha256=8BI_wL7YIwaKWTZ8DnLC5RUZ5HVXdLzLXs3QgNexXV8,2958
-nkululeko/feat_extract/feats_wav2vec2.py,sha256=n7eLvt8Zv_GF-w5CkMkts6NU-kh4bEOb-aDgnd2TfvQ,4421
+nkululeko/feat_extract/feats_wav2vec2.py,sha256=48_WK_JmUq0Gda800fJh3RadTxkHeCpOSTBsPIykEjw,4462
 nkululeko/feat_extract/feats_wavlm.py,sha256=06vRILRLG2-OzsLx-tuwOdtMhNtoPSxJwAE5B82upjk,4466
 nkululeko/feat_extract/featureset.py,sha256=7l1qC_nVGyyuYOdKJoS5hY673Fkm3Wpqtdu5mxtBP0E,1369
 nkululeko/feat_extract/feinberg_praat.py,sha256=fcp2863g5NLk7p27HlRISpTOVTaoxcrSGOST4mK5VXk,20496
@@ -90,9 +90,9 @@ nkululeko/segmenting/seg_inaspeechsegmenter.py,sha256=pmLHuXsaqvcdYxB4PSW9l1mbQW
 nkululeko/segmenting/seg_silero.py,sha256=bbLQxUste1lKEwNRHG4wOTCaFMevNt8TOe2DfmLtu5w,3306
 nkululeko/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/utils/files.py,sha256=82EsGvTN85lDdSzetm1ZHTVOUm0AcfQhTeurwsfHN_Q,3547
-nkululeko/utils/stats.py,sha256=EM2qN1HBh34wZ5FlK3fOyRhBig4W56_DPYY9wudx5aw,1939
-nkululeko-0.65.2.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
-nkululeko-0.65.2.dist-info/METADATA,sha256=4jY2iz2vnyDnJaiS7VE077x3VwKwRBv24dbdvDF8744,23255
-nkululeko-0.65.2.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-nkululeko-0.65.2.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
-nkululeko-0.65.2.dist-info/RECORD,,
+nkululeko/utils/stats.py,sha256=nRuZEbHpeRTAcvW8b12ZWi_kDnpC5Ztrmvy0_J621cE,2241
+nkululeko-0.65.5.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
+nkululeko-0.65.5.dist-info/METADATA,sha256=y2ttA1LEwXgflc2MLb1krzgq3zazj780wZ3fIWR8fAg,23458
+nkululeko-0.65.5.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+nkululeko-0.65.5.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
+nkululeko-0.65.5.dist-info/RECORD,,

{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/LICENSE RENAMED Viewed

File without changes

{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{nkululeko-0.65.2.dist-info → nkululeko-0.65.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

nkululeko 0.65.2__py3-none-any.whl → 0.65.5__py3-none-any.whl

nkululeko 0.65.2__py3-none-any.whl → 0.65.5__py3-none-any.whl