nkululeko 0.77.0__py3-none-any.whl → 0.77.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- nkululeko/augment.py +1 -1
- nkululeko/augmenting/augmenter.py +1 -1
- nkululeko/augmenting/randomsplicer.py +1 -1
- nkululeko/augmenting/resampler.py +4 -9
- nkululeko/autopredict/ap_age.py +2 -4
- nkululeko/autopredict/ap_arousal.py +2 -4
- nkululeko/autopredict/ap_dominance.py +2 -4
- nkululeko/autopredict/ap_gender.py +2 -4
- nkululeko/autopredict/ap_mos.py +2 -4
- nkululeko/autopredict/ap_pesq.py +2 -4
- nkululeko/autopredict/ap_sdr.py +2 -4
- nkululeko/autopredict/ap_snr.py +2 -4
- nkululeko/autopredict/ap_stoi.py +2 -4
- nkululeko/autopredict/ap_valence.py +2 -4
- nkululeko/constants.py +1 -1
- nkululeko/data/dataset.py +1 -1
- nkululeko/demo.py +4 -10
- nkululeko/demo_predictor.py +1 -1
- nkululeko/experiment.py +1 -1
- nkululeko/explore.py +6 -13
- nkululeko/export.py +14 -25
- nkululeko/feat_extract/feats_analyser.py +110 -18
- nkululeko/feat_extract/feats_clap.py +4 -10
- nkululeko/feat_extract/feats_import.py +2 -4
- nkululeko/feat_extract/feats_mld.py +4 -9
- nkululeko/feat_extract/feats_mos.py +5 -13
- nkululeko/feat_extract/feats_oxbow.py +5 -12
- nkululeko/feat_extract/feats_snr.py +3 -7
- nkululeko/feat_extract/feats_squim.py +5 -13
- nkululeko/feat_extract/feats_trill.py +5 -13
- nkululeko/feat_extract/featureset.py +2 -4
- nkululeko/feat_extract/feinberg_praat.py +1 -1
- nkululeko/feature_extractor.py +1 -1
- nkululeko/file_checker.py +5 -5
- nkululeko/filter_data.py +6 -16
- nkululeko/modelrunner.py +1 -1
- nkululeko/models/model.py +1 -1
- nkululeko/models/model_cnn.py +1 -1
- nkululeko/models/model_mlp.py +1 -1
- nkululeko/models/model_mlp_regression.py +1 -1
- nkululeko/nkululeko.py +5 -13
- nkululeko/plots.py +8 -4
- nkululeko/predict.py +5 -13
- nkululeko/reporter.py +1 -1
- nkululeko/reporting/latex_writer.py +7 -2
- nkululeko/reporting/report.py +2 -1
- nkululeko/resample.py +5 -13
- nkululeko/runmanager.py +1 -1
- nkululeko/scaler.py +1 -1
- nkululeko/segment.py +1 -1
- nkululeko/segmenting/seg_silero.py +3 -5
- nkululeko/test.py +4 -10
- nkululeko/test_predictor.py +1 -1
- nkululeko/utils/stats.py +8 -0
- {nkululeko-0.77.0.dist-info → nkululeko-0.77.1.dist-info}/METADATA +6 -1
- nkululeko-0.77.1.dist-info/RECORD +104 -0
- nkululeko/balancer.py +0 -1
- nkululeko-0.77.0.dist-info/RECORD +0 -105
- /nkululeko/{util.py → utils/util.py} +0 -0
- {nkululeko-0.77.0.dist-info → nkululeko-0.77.1.dist-info}/LICENSE +0 -0
- {nkululeko-0.77.0.dist-info → nkululeko-0.77.1.dist-info}/WHEEL +0 -0
- {nkululeko-0.77.0.dist-info → nkululeko-0.77.1.dist-info}/top_level.txt +0 -0
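Two changes cut across nearly every file above: the Util helper moved from nkululeko/util.py to nkululeko/utils/util.py (so every `from nkululeko.util import Util` becomes `from nkululeko.utils.util import Util`), and the new module nkululeko/utils/stats.py (+8 lines) contributes a normalize helper that feats_analyser.py uses to bring importance values from different models onto a comparable scale. The stats.py diff is not expanded in this view; judging only from its call site, `normalize(importance.reshape(-1, 1))`, a plausible sketch is the following — the choice of scaler is an assumption, not taken from this diff:

    # hypothetical reconstruction of normalize() in nkululeko/utils/stats.py;
    # the real implementation is not shown in this diff
    from sklearn.preprocessing import StandardScaler

    def normalize(values):
        # scale a (n_samples, 1) array so that importance values from
        # different models can be averaged on a common scale
        return StandardScaler().fit_transform(values)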
nkululeko/feat_extract/feats_analyser.py
CHANGED
@@ -1,13 +1,15 @@
 # feats_analyser.py
 import ast
 import pandas as pd
+from sklearn.inspection import permutation_importance
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.linear_model import LinearRegression
 from sklearn.tree import DecisionTreeRegressor
 import matplotlib.pyplot as plt
 from xgboost import XGBClassifier, XGBRegressor
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
+from nkululeko.utils.stats import normalize
 from nkululeko.plots import Plots
 import nkululeko.glob_conf as glob_conf
 from nkululeko.reporting.report_item import ReportItem
@@ -26,21 +28,48 @@ class FeatureAnalyser:
 
     def analyse(self):
         models = ast.literal_eval(self.util.config_val("EXPL", "model", "[log_reg]"))
+        model_name = "_".join(models)
         max_feat_num = int(self.util.config_val("EXPL", "max_feats", "10"))
+        # https://scikit-learn.org/stable/modules/permutation_importance.html
+        permutation = eval(self.util.config_val("EXPL", "permutation", "False"))
         importance = None
         self.util.debug("analysing features...")
         result_importances = {}
         if self.util.exp_is_classification():
             for model_s in models:
+                if permutation:
+                    self.util.debug(
+                        f"computing feature importance via permutation for {model_s}, might take longer..."
+                    )
                 if model_s == "log_reg":
                     model = LogisticRegression()
                     model.fit(self.features, self.labels)
-                    importance = model.coef_[0]
+                    if permutation:
+                        r = permutation_importance(
+                            model,
+                            self.features,
+                            self.labels,
+                            n_repeats=30,
+                            random_state=0,
+                        )
+                        importance = r["importances_mean"]
+                    else:
+                        importance = model.coef_[0]
                     result_importances[model_s] = importance
                 elif model_s == "tree":
                     model = DecisionTreeClassifier()
                     model.fit(self.features, self.labels)
-                    importance = model.feature_importances_
+                    if permutation:
+                        r = permutation_importance(
+                            model,
+                            self.features,
+                            self.labels,
+                            n_repeats=30,
+                            random_state=0,
+                        )
+                        importance = r["importances_mean"]
+                    else:
+                        importance = model.feature_importances_
                     result_importances[model_s] = importance
                     plot_tree = eval(self.util.config_val("EXPL", "plot_tree", "False"))
                     if plot_tree:
@@ -50,26 +79,70 @@ class FeatureAnalyser:
                     model = XGBClassifier(enable_categorical=True, tree_method="hist")
                     self.labels = self.labels.astype("category")
                     model.fit(self.features, self.labels)
-                    importance = model.feature_importances_
+                    if permutation:
+                        r = permutation_importance(
+                            model,
+                            self.features,
+                            self.labels,
+                            n_repeats=30,
+                            random_state=0,
+                        )
+                        importance = r["importances_mean"]
+                    else:
+                        importance = model.feature_importances_
                     result_importances[model_s] = importance
                 else:
                     self.util.error(f"invalid analysis method: {model}")
         else:  # regression experiment
             for model_s in models:
+                if permutation:
+                    self.util.debug(
+                        f"computing feature importance via permutation for {model_s}, might take longer..."
+                    )
                 if model_s == "lin_reg":
                     model = LinearRegression()
                     model.fit(self.features, self.labels)
-                    importance = model.coef_
+                    if permutation:
+                        r = permutation_importance(
+                            model,
+                            self.features,
+                            self.labels,
+                            n_repeats=30,
+                            random_state=0,
+                        )
+                        importance = r["importances_mean"]
+                    else:
+                        importance = model.coef_
                     result_importances[model_s] = importance
                 elif model_s == "tree":
                     model = DecisionTreeRegressor()
                     model.fit(self.features, self.labels)
-                    importance = model.feature_importances_
+                    if permutation:
+                        r = permutation_importance(
+                            model,
+                            self.features,
+                            self.labels,
+                            n_repeats=30,
+                            random_state=0,
+                        )
+                        importance = r["importances_mean"]
+                    else:
+                        importance = model.feature_importances_
                     result_importances[model_s] = importance
                 elif model_s == "xgb":
                     model = XGBRegressor()
                     model.fit(self.features, self.labels)
-                    importance = model.feature_importances_
+                    if permutation:
+                        r = permutation_importance(
+                            model,
+                            self.features,
+                            self.labels,
+                            n_repeats=30,
+                            random_state=0,
+                        )
+                        importance = r["importances_mean"]
+                    else:
+                        importance = model.feature_importances_
                     result_importances[model_s] = importance
                 else:
                     self.util.error(f"invalid analysis method: {model_s}")
@@ -79,7 +152,15 @@ class FeatureAnalyser:
             }
         )
         for model_s in result_importances:
-            df_imp[f"{model_s}_importance"] = result_importances[model_s]
+            if len(result_importances) == 1:
+                df_imp[f"{model_s}_importance"] = result_importances[model_s]
+            else:
+                # normalize the distributions because they might be different
+                self.util.debug(f"scaling importance values for {model_s}")
+                importance = result_importances[model_s]
+                importance = normalize(importance.reshape(-1, 1))
+                df_imp[f"{model_s}_importance"] = importance
+
         df_imp["importance"] = df_imp.iloc[:, 1:].mean(axis=1).values
         df_imp = df_imp.sort_values(by="importance", ascending=False).iloc[
             :max_feat_num
@@ -92,21 +173,31 @@ class FeatureAnalyser:
             ax.annotate(
                 str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005)
             )
-
+        title = (
+            f"Feature importance for {self.label} samples with model(s) {model_name}"
+        )
+        if permutation:
+            title += "\n based on feature permutation"
+        ax.set(title=title)
         plt.tight_layout()
         fig_dir = self.util.get_path("fig_dir") + "../"  # one up because of the runs
         exp_name = self.util.get_exp_name(only_data=True)
         format = self.util.config_val("PLOT", "format", "png")
-
-
+        filename = f"_EXPL_{model_name}"
+        if permutation:
+            filename += "_perm"
+        filename = f"{fig_dir}{exp_name}{filename}.{format}"
         plt.savefig(filename)
         fig = ax.figure
         fig.clear()
         plt.close(fig)
+        caption = f"Feature importance"
+        if permutation:
+            caption += " based on permutation of features."
         glob_conf.report.add_item(
             ReportItem(
                 Header.HEADER_EXPLORE,
-
+                caption,
                 f"using {model_name} models",
                 filename,
             )
@@ -114,16 +205,17 @@ class FeatureAnalyser:
 
         # result file
        res_dir = self.util.get_path("res_dir")
-
-
-
-
+        filename = f"_EXPL_{model_name}"
+        if permutation:
+            filename += "_perm"
+        filename = f"{res_dir}{self.util.get_exp_name(only_data=True)}{filename}_{model_name}.txt"
+        with open(filename, "w") as text_file:
             text_file.write(
                 "features in order of decreasing importance according to model"
-                f" {
+                f" {model_name}:\n" + f"{str(df_imp.feats.values)}\n"
             )
 
-        df_imp.to_csv(
+        df_imp.to_csv(filename, mode="a")
 
         # check if feature distributions should be plotted
         plot_feats = self.util.config_val("EXPL", "feature_distributions", False)
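The substantive change in feats_analyser.py is the new permutation option: instead of reading importances from a fitted model's coef_ or feature_importances_, analyse() can estimate them by shuffling each feature column and measuring the resulting score drop, via scikit-learn's permutation_importance. A minimal standalone sketch of that path on synthetic data — the data and feature count are made up, while n_repeats=30 and random_state=0 mirror the values hard-coded above:

    import numpy as np
    from sklearn.inspection import permutation_importance
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(0)
    features = rng.normal(size=(200, 5))  # 200 samples, 5 features
    labels = (features[:, 0] + features[:, 2] > 0).astype(int)

    model = LogisticRegression().fit(features, labels)
    # permute each column 30 times and average the score drop
    r = permutation_importance(model, features, labels, n_repeats=30, random_state=0)
    print(r["importances_mean"])  # one mean importance per feature

Judging from the config_val calls above, the option is switched on in the [EXPL] section of an experiment configuration (permutation = True next to model and max_feats, whose visible defaults are [log_reg] and 10).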
nkululeko/feat_extract/feats_clap.py
CHANGED
@@ -1,6 +1,6 @@
 # feats_clap.py
 
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 from nkululeko.feat_extract.featureset import Featureset
 import os
 import pandas as pd
@@ -32,16 +32,12 @@ class Clap(Featureset):
         store = self.util.get_path("store")
         store_format = self.util.config_val("FEATS", "store_format", "pkl")
         storage = f"{store}{self.name}.{store_format}"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             if not self.model_initialized:
                 self.init_model()
-            self.util.debug(
-                "extracting clap embeddings, this might take a while..."
-            )
+            self.util.debug("extracting clap embeddings, this might take a while...")
             emb_series = pd.Series(index=self.data_df.index, dtype=object)
             length = len(self.data_df.index)
             for idx, (file, start, end) in enumerate(
@@ -55,9 +51,7 @@ class Clap(Featureset):
                 )
                 emb = self.get_embeddings(signal, sampling_rate)
                 emb_series[idx] = emb
-            self.df = pd.DataFrame(
-                emb_series.values.tolist(), index=self.data_df.index
-            )
+            self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
             self.util.write_store(self.df, storage, store_format)
             try:
                 glob_conf.config["DATA"]["needs_feature_extraction"] = "false"
nkululeko/feat_extract/feats_import.py
CHANGED
@@ -1,6 +1,6 @@
 # feats_import.py
 
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 from nkululeko.feat_extract.featureset import Featureset
 import os
 import pandas as pd
@@ -17,9 +17,7 @@ class Importset(Featureset):
         """Import the features or load them from disk if present."""
         store = self.util.get_path("store")
         storage = f"{store}{self.name}.pkl"
-        extract = eval(
-            self.util.config_val("FEATS", "needs_feature_extraction", False)
-        )
+        extract = eval(self.util.config_val("FEATS", "needs_feature_extraction", False))
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         feat_import_file = self.util.config_val("FEATS", "import_file", False)
         if not os.path.isfile(feat_import_file):
nkululeko/feat_extract/feats_mld.py
CHANGED
@@ -4,7 +4,7 @@ import sys
 import os
 import pandas as pd
 import numpy as np
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 import nkululeko.glob_conf as glob_conf
 
 
@@ -22,19 +22,14 @@ class MLD_set(Featureset):
             os.remove(storage)
         if not os.path.isfile(storage):
             self.util.debug(
-                "extracting midleveldescriptor features, this might take a"
-                " while..."
+                "extracting midleveldescriptor features, this might take a" " while..."
             )
         else:
-            self.util.debug(
-                "reusing previously extracted midleveldescriptor features"
-            )
+            self.util.debug("reusing previously extracted midleveldescriptor features")
         import midlevel_descriptors as mld
 
         fex_mld = mld.MLD()
-        self.df = fex_mld.extract_from_index(
-            index=self.data_df, cache_path=storage
-        )
+        self.df = fex_mld.extract_from_index(index=self.data_df, cache_path=storage)
         self.util.debug(f"MLD feats shape: {self.df.shape}")
         # shouldn't happen
         # replace NANa with column means values
nkululeko/feat_extract/feats_mos.py
CHANGED
@@ -19,7 +19,7 @@ from torchaudio.pipelines import SQUIM_SUBJECTIVE
 from torchaudio.utils import download_asset
 import audiofile
 import nkululeko.glob_conf as glob_conf
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 from nkululeko.feat_extract.featureset import Featureset
 
 
@@ -36,9 +36,7 @@ class MOSSet(Featureset):
         # load model
         self.util.debug("loading MOS model...")
         self.subjective_model = SQUIM_SUBJECTIVE.get_model()
-        NMR_SPEECH = download_asset(
-            "tutorial-assets/ctc-decoding/1688-142285-0007.wav"
-        )
+        NMR_SPEECH = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
         self.WAVEFORM_NMR, SAMPLE_RATE_NMR = torchaudio.load(NMR_SPEECH)
         self.model_initialized = True
 
@@ -47,9 +45,7 @@ class MOSSet(Featureset):
         store = self.util.get_path("store")
         store_format = self.util.config_val("FEATS", "store_format", "pkl")
         storage = f"{store}{self.name}.{store_format}"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             if not self.model_initialized:
@@ -68,9 +64,7 @@ class MOSSet(Featureset):
                 )
                 emb = self.get_embeddings(signal, sampling_rate, file)
                 emb_series[idx] = emb
-            self.df = pd.DataFrame(
-                emb_series.values.tolist(), index=self.data_df.index
-            )
+            self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
             self.df.columns = ["mos"]
             self.util.write_store(self.df, storage, store_format)
             try:
@@ -91,9 +85,7 @@ class MOSSet(Featureset):
         tmp_audio_name = "mos_audio_tmp.wav"
         try:
             audiofile.write(tmp_audio_name, signal, sampling_rate)
-            WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(
-                tmp_audio_name
-            )
+            WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(tmp_audio_name)
             with torch.no_grad():
                 mos = self.subjective_model(WAVEFORM_SPEECH, self.WAVEFORM_NMR)
         except RuntimeError as re:
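For context, MOSSet wraps torchaudio's subjective SQUIM model, which estimates a mean opinion score for a speech waveform against a non-matching reference recording; only the formatting changed here. A condensed sketch of the same call pattern, reusing the asset from the code above (the input file name is hypothetical):

    import torch
    import torchaudio
    from torchaudio.pipelines import SQUIM_SUBJECTIVE
    from torchaudio.utils import download_asset

    # the subjective SQUIM model scores speech against a non-matching reference (NMR)
    model = SQUIM_SUBJECTIVE.get_model()
    nmr_path = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
    waveform_nmr, _ = torchaudio.load(nmr_path)
    waveform, _ = torchaudio.load("some_speech.wav")  # hypothetical input file
    with torch.no_grad():
        mos = model(waveform, waveform_nmr)  # estimated mean opinion score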
nkululeko/feat_extract/feats_oxbow.py
CHANGED
@@ -1,6 +1,6 @@
 # feats_oxbow.py
 
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 from nkululeko.feat_extract.featureset import Featureset
 import os
 import pandas as pd
@@ -21,15 +21,11 @@ class Openxbow(Featureset):
         self.feature_set = eval(f"opensmile.FeatureSet.{self.featset}")
         store = self.util.get_path("store")
         storage = f"{store}{self.name}_{self.featset}.pkl"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             # extract smile features first
-            self.util.debug(
-                "extracting openSmile features, this might take a while..."
-            )
+            self.util.debug("extracting openSmile features, this might take a while...")
             smile = opensmile.Smile(
                 feature_set=self.feature_set,
                 feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
@@ -52,9 +48,7 @@ class Openxbow(Featureset):
             # save the smile features
             smile_df.to_csv(lld_name, sep=";", header=False)
             # get the path of the xbow java jar file
-            xbow_path = self.util.config_val(
-                "FEATS", "xbow.model", "../openXBOW/"
-            )
+            xbow_path = self.util.config_val("FEATS", "xbow.model", "../openXBOW/")
             # get the size of the codebook
             size = self.util.config_val("FEATS", "size", 500)
             # get the number of assignements
@@ -83,8 +77,7 @@ class Openxbow(Featureset):
         if with_os:
             # extract smile functionals
             self.util.debug(
-                "extracting openSmile functionals, this might take a"
-                " while..."
+                "extracting openSmile functionals, this might take a" " while..."
             )
             smile = opensmile.Smile(
                 feature_set=opensmile.FeatureSet.eGeMAPSv02,  # always use eGemaps for this
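The openXBOW path above first extracts openSMILE low-level descriptors and dumps them to a semicolon-separated CSV that the openXBOW jar then quantizes into a bag-of-words. A condensed sketch of that first step — the audio file name is hypothetical, and eGeMAPSv02 is the set the functionals branch pins:

    import opensmile

    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    )
    # one row per LLD frame, indexed by (file, start, end)
    lld_df = smile.process_files(["sample.wav"])  # hypothetical audio file
    lld_df.to_csv("smile.lld.csv", sep=";", header=False)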
nkululeko/feat_extract/feats_snr.py
CHANGED
@@ -6,7 +6,7 @@ from tqdm import tqdm
 import pandas as pd
 import audiofile
 import nkululeko.glob_conf as glob_conf
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 from nkululeko.feat_extract.featureset import Featureset
 from nkululeko.autopredict.estimate_snr import SNREstimator
 
@@ -23,9 +23,7 @@ class SNRSet(Featureset):
         store = self.util.get_path("store")
         store_format = self.util.config_val("FEATS", "store_format", "pkl")
         storage = f"{store}{self.name}.{store_format}"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             self.util.debug("estimating SNR, this might take a while...")
@@ -42,9 +40,7 @@ class SNRSet(Featureset):
                 snr = self.get_snr(signal[0], sampling_rate)
                 snr_series[idx] = snr
             print("")
-            self.df = pd.DataFrame(
-                snr_series.values.tolist(), index=self.data_df.index
-            )
+            self.df = pd.DataFrame(snr_series.values.tolist(), index=self.data_df.index)
             self.df.columns = ["snr"]
             self.util.write_store(self.df, storage, store_format)
             try:
nkululeko/feat_extract/feats_squim.py
CHANGED
@@ -25,7 +25,7 @@ import torchaudio
 from torchaudio.pipelines import SQUIM_OBJECTIVE
 import audiofile
 import nkululeko.glob_conf as glob_conf
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 from nkululeko.feat_extract.featureset import Featureset
 
 
@@ -49,9 +49,7 @@ class SQUIMSet(Featureset):
         store = self.util.get_path("store")
         store_format = self.util.config_val("FEATS", "store_format", "pkl")
         storage = f"{store}{self.name}.{store_format}"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             if not self.model_initialized:
@@ -70,9 +68,7 @@ class SQUIMSet(Featureset):
                 )
                 emb = self.get_embeddings(signal, sampling_rate, file)
                 emb_series[idx] = emb
-            self.df = pd.DataFrame(
-                emb_series.values.tolist(), index=self.data_df.index
-            )
+            self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
             self.df.columns = ["pesq", "sdr", "stoi"]
             self.util.write_store(self.df, storage, store_format)
             try:
@@ -93,13 +89,9 @@ class SQUIMSet(Featureset):
         tmp_audio_name = "squim_audio_tmp.wav"
         try:
             audiofile.write(tmp_audio_name, signal, sampling_rate)
-            WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(
-                tmp_audio_name
-            )
+            WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(tmp_audio_name)
             with torch.no_grad():
-                stoi_hyp, pesq_hyp, si_sdr_hyp = self.objective_model(
-                    WAVEFORM_SPEECH
-                )
+                stoi_hyp, pesq_hyp, si_sdr_hyp = self.objective_model(WAVEFORM_SPEECH)
             pesq = float(pesq_hyp[0].numpy())
             stoi = float(stoi_hyp[0].numpy())
             sdr = float(si_sdr_hyp[0].numpy())
nkululeko/feat_extract/feats_trill.py
CHANGED
@@ -5,7 +5,7 @@ from numpy.core.numeric import tensordot
 from tqdm import tqdm
 import pandas as pd
 import audiofile as af
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 import nkululeko.glob_conf as glob_conf
 from nkululeko.feat_extract.featureset import Featureset
 
@@ -43,24 +43,16 @@ class TRILLset(Featureset):
     def extract(self):
         store = self.util.get_path("store")
         storage = f"{store}{self.name}.pkl"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
-            self.util.debug(
-                "extracting TRILL embeddings, this might take a while..."
-            )
+            self.util.debug("extracting TRILL embeddings, this might take a while...")
             emb_series = pd.Series(index=self.data_df.index, dtype=object)
             length = len(self.data_df.index)
-            for idx, file in enumerate(
-                tqdm(self.data_df.index.get_level_values(0))
-            ):
+            for idx, file in enumerate(tqdm(self.data_df.index.get_level_values(0))):
                 emb = self.getEmbeddings(file)
                 emb_series[idx] = emb
-            self.df = pd.DataFrame(
-                emb_series.values.tolist(), index=self.data_df.index
-            )
+            self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
             self.df.to_pickle(storage)
             try:
                 glob_conf.config["DATA"]["needs_feature_extraction"] = "false"
nkululeko/feat_extract/featureset.py
CHANGED
@@ -1,6 +1,6 @@
 # featureset.py
 import pandas as pd
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 import nkululeko.glob_conf as glob_conf
 import ast
 
@@ -23,9 +23,7 @@ class Featureset:
         self.df = self.df[self.df.index.isin(self.data_df.index)]
         try:
             # use only some features
-            selected_features = ast.literal_eval(
-                glob_conf.config["FEATS"]["features"]
-            )
+            selected_features = ast.literal_eval(glob_conf.config["FEATS"]["features"])
             self.util.debug(f"selecting features: {selected_features}")
             sel_feats_df = pd.DataFrame()
             hit = False
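A pattern worth noting in this hunk (and in analyse() further up): option values arrive from the INI configuration as plain strings, and ast.literal_eval safely turns a list-shaped string into a Python list. A tiny sketch with made-up feature names:

    import ast

    # what configparser hands back for a list-valued option is a string
    selected_features = ast.literal_eval("['mean_f0', 'jitter']")
    assert selected_features == ["mean_f0", "jitter"]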
nkululeko/feature_extractor.py
CHANGED
nkululeko/file_checker.py
CHANGED
@@ -1,5 +1,5 @@
 import pandas as pd
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 import os
 
 
@@ -39,7 +39,9 @@ class FileChecker:
         min = self.util.config_val_data(data_name, "check_size", False)
         if min:
             if min == "True":
-                min = 1000  # 1000 bytes would be a reasonable minimal size for 16 kHz sr
+                min = (
+                    1000  # 1000 bytes would be a reasonable minimal size for 16 kHz sr
+                )
             old_samples = self.df.shape[0]
             df = self.df.copy()
             for i in self.df.index:
@@ -66,9 +68,7 @@ class FileChecker:
         else:
             check = self.util.config_val_data(data_name, "check_vad", False)
             if check:
-                self.util.debug(
-                    f"{data_name}: checking for samples without speech."
-                )
+                self.util.debug(f"{data_name}: checking for samples without speech.")
                 SAMPLING_RATE = 16000
                 (
                     get_speech_timestamps,
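The reformatted size check drops samples whose audio files are implausibly small; 1000 bytes is the default minimum when check_size = True. A self-contained sketch of the per-file test the loop performs (file names are hypothetical):

    import os

    min_size = 1000  # bytes; the default when check_size = True
    files = ["a.wav", "b.wav"]  # hypothetical sample files
    keep = [f for f in files
            if os.path.exists(f) and os.path.getsize(f) >= min_size]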
nkululeko/filter_data.py
CHANGED
@@ -1,7 +1,7 @@
 import audformat
 import pandas as pd
 import nkululeko.glob_conf as glob_conf
-from nkululeko.util import Util
+from nkululeko.utils.util import Util
 import ast
 
 
@@ -45,9 +45,7 @@ class DataFilter:
         the samples are selected randomly
         """
         if data_name == "":
-            max = self.util.config_val(
-                "DATA", "limit_samples_per_speaker", False
-            )
+            max = self.util.config_val("DATA", "limit_samples_per_speaker", False)
         else:
             max = self.util.config_val_data(
                 data_name, "limit_samples_per_speaker", False
@@ -73,12 +71,8 @@ class DataFilter:
     def filter_duration(self, data_name=""):
         """remove all samples less than min_dur duration"""
         if data_name == "":
-            min_dur = self.util.config_val(
-                "DATA", "min_duration_of_sample", False
-            )
-            max_dur = self.util.config_val(
-                "DATA", "max_duration_of_sample", False
-            )
+            min_dur = self.util.config_val("DATA", "min_duration_of_sample", False)
+            max_dur = self.util.config_val("DATA", "max_duration_of_sample", False)
         else:
             min_dur = self.util.config_val_data(
                 data_name, "min_duration_of_sample", False
@@ -175,9 +169,7 @@ def filter_min_dur(df, min_dur):
     glob_conf.util.debug(
         "converting file index to multi index, this might take a while..."
     )
-    df_ret.index = audformat.utils.to_segmented_index(
-        df.index, allow_nat=False
-    )
+    df_ret.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
     for i in df_ret.index:
         start = i[1]
         end = i[2]
@@ -197,9 +189,7 @@ def filter_max_dur(df, max_dur):
     glob_conf.util.debug(
         "converting file index to multi index, this might take a while..."
     )
-    df_ret.index = audformat.utils.to_segmented_index(
-        df.index, allow_nat=False
-    )
+    df_ret.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
     for i in df_ret.index:
         start = i[1]
         end = i[2]
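filter_min_dur and filter_max_dur work on an audformat segmented index, where each row is a (file, start, end) triple with pandas.Timedelta boundaries, so the duration test reduces to subtracting the two timestamps. A minimal sketch with made-up values:

    import pandas as pd

    start = pd.Timedelta("0s")    # i[1] in the loops above
    end = pd.Timedelta("2.5s")    # i[2]
    duration = (end - start).total_seconds()
    keep = duration >= 1.0        # hypothetical min_dur of one second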
nkululeko/modelrunner.py
CHANGED
nkululeko/models/model.py
CHANGED