nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. nkululeko/augmenting/resampler.py +5 -2
  2. nkululeko/autopredict/ap_emotion.py +36 -0
  3. nkululeko/autopredict/ap_text.py +45 -0
  4. nkululeko/autopredict/tests/__init__.py +0 -0
  5. nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
  6. nkululeko/autopredict/whisper_transcriber.py +81 -0
  7. nkululeko/balance.py +222 -0
  8. nkululeko/constants.py +1 -1
  9. nkululeko/experiment.py +53 -3
  10. nkululeko/explore.py +32 -13
  11. nkululeko/feat_extract/feats_analyser.py +45 -17
  12. nkululeko/feat_extract/feats_emotion2vec.py +51 -26
  13. nkululeko/feat_extract/feats_praat.py +3 -3
  14. nkululeko/feat_extract/feats_praat_core.py +769 -0
  15. nkululeko/feat_extract/tests/__init__.py +1 -0
  16. nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
  17. nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
  18. nkululeko/glob_conf.py +9 -0
  19. nkululeko/modelrunner.py +15 -39
  20. nkululeko/models/model.py +4 -42
  21. nkululeko/models/model_tuned.py +416 -84
  22. nkululeko/models/model_xgb.py +148 -2
  23. nkululeko/models/tests/test_model_knn.py +49 -0
  24. nkululeko/models/tests/test_model_mlp.py +153 -0
  25. nkululeko/models/tests/test_model_xgb.py +33 -0
  26. nkululeko/nkululeko.py +0 -9
  27. nkululeko/plots.py +25 -19
  28. nkululeko/predict.py +8 -6
  29. nkululeko/reporting/report.py +7 -5
  30. nkululeko/reporting/reporter.py +20 -5
  31. nkululeko/test_predictor.py +7 -1
  32. nkululeko/tests/__init__.py +1 -0
  33. nkululeko/tests/test_balancing.py +270 -0
  34. nkululeko/utils/util.py +38 -6
  35. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
  36. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
  37. nkululeko/feat_extract/feats_opensmile copy.py +0 -93
  38. nkululeko/feat_extract/feinberg_praat.py +0 -628
  39. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
  40. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
  41. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
  42. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
nkululeko/explore.py CHANGED
@@ -8,6 +8,8 @@ The script supports the following configuration options:
  - `no_warnings`: If set to `True`, it will ignore all warnings during the exploration.
  - `feature_distributions`: If set to `True`, it will generate plots of the feature distributions.
  - `tsne`: If set to `True`, it will generate a t-SNE plot of the feature space.
+ - `umap`: If set to `True`, it will generate a UMAP plot of the feature space.
+ - `pca`: If set to `True`, it will generate a PCA plot of the feature space.
  - `scatter`: If set to `True`, it will generate a scatter plot of the feature space.
  - `spotlight`: If set to `True`, it will generate a 'spotlight' plot of the feature space.
  - `shap`: If set to `True`, it will generate SHAP feature importance plots.
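The two added options are read from the `[EXPL]` section of the experiment's INI configuration, like the existing ones. Below is a minimal sketch (not part of the package) of such a section and of the eval-based reading pattern explore.py uses via `util.config_val`; the other sections of a real configuration are omitted.

import configparser

# Hypothetical [EXPL] section enabling the new UMAP and PCA plots.
config = configparser.ConfigParser()
config.read_string("""\
[EXPL]
feature_distributions = True
tsne = True
umap = True
pca = True
shap = False
""")

# explore.py stores booleans as strings and eval()s them; mirrored here.
umap_plot = eval(config.get("EXPL", "umap", fallback="False"))
pca_plot = eval(config.get("EXPL", "pca", fallback="False"))
print(umap_plot, pca_plot)  # True True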
@@ -59,10 +61,12 @@ def main():
 
  warnings.filterwarnings("ignore")
  needs_feats = False
+ experiment_loaded = False
  try:
  # load the experiment
  expr.load(f"{util.get_save_name()}")
  needs_feats = True
+ experiment_loaded = True
  except FileNotFoundError:
  # first time: load the data
  expr.load_datasets()
@@ -73,20 +77,35 @@ def main():
  f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}"
  )
 
- plot_feats = eval(util.config_val("EXPL", "feature_distributions", "False"))
- tsne = eval(util.config_val("EXPL", "tsne", "False"))
- scatter = eval(util.config_val("EXPL", "scatter", "False"))
- shap = eval(util.config_val("EXPL", "shap", "False"))
- model_type = util.config_val("EXPL", "model", False)
- plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
- needs_feats = False
- if plot_feats or tsne or scatter or model_type or plot_tree or shap:
- # these investigations need features to explore
+ # Check exploration settings regardless of whether experiment was loaded or not
+ plot_feats = eval(util.config_val("EXPL", "feature_distributions", "False"))
+ tsne_plot = eval(util.config_val("EXPL", "tsne", "False"))
+ umap_plot = eval(util.config_val("EXPL", "umap", "False"))
+ pca_plot = eval(util.config_val("EXPL", "pca", "False"))
+ scatter = eval(util.config_val("EXPL", "scatter", "False"))
+ shap = eval(util.config_val("EXPL", "shap", "False"))
+ model_type = util.config_val("EXPL", "model", False)
+ plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
+
+ if (
+ plot_feats
+ or tsne_plot
+ or umap_plot
+ or pca_plot
+ or scatter
+ or model_type
+ or plot_tree
+ or shap
+ ):
+ # these investigations need features to explore
+ if not experiment_loaded or not needs_feats:
  expr.extract_feats()
- needs_feats = True
- # explore
- # expr.init_runmanager()
- # expr.runmgr.do_runs()
+ needs_feats = True
+ # explore
+ if shap:
+ # SHAP analysis requires a trained model
+ expr.init_runmanager()
+ expr.runmgr.do_runs()
  expr.analyse_features(needs_feats)
  expr.store_report()
  print("DONE")
nkululeko/feat_extract/feats_analyser.py CHANGED
@@ -1,5 +1,6 @@
  # feats_analyser.py
  import ast
+ import os
 
  import matplotlib.pyplot as plt
  import pandas as pd
@@ -76,17 +77,37 @@ class FeatureAnalyser:
  self.util.to_pickle(shap_values, name)
  else:
  shap_values = self.util.from_pickle(name)
- # plt.figure()
- plt.close("all")
- plt.tight_layout()
- shap.plots.bar(shap_values)
- fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
- exp_name = self.util.get_exp_name(only_data=True)
+ # Create SHAP summary plot instead
+ fig, ax = plt.subplots(figsize=(10, 6))
+ shap.plots.bar(shap_values, ax=ax, show=False)
+ fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
+
  format = self.util.config_val("PLOT", "format", "png")
- filename = f"_SHAP_{model.name}"
- filename = f"{fig_dir}{exp_name}{filename}.{format}"
- plt.savefig(filename)
- plt.close()
+ feat_type = self.util.get_feattype_name()
+ filename = f"SHAP_{feat_type}_{model.name}.{format}"
+ filename = os.path.join(fig_dir, filename)
+
+ fig.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.close(fig)
+
+ # print and save SHAP feature importance
+ max_feat_num = len(self.features.columns)
+ shap_importance_values = shap_values.abs.mean(0).values
+
+ feature_cols = self.features.columns
+ feature_importance = pd.DataFrame(
+ shap_importance_values[:max_feat_num],
+ index=feature_cols,
+ columns=["importance"],
+ ).sort_values("importance", ascending=False)
+
+ self.util.debug(
+ f"SHAP analysis, features = {feature_importance.index.tolist()}"
+ )
+ # Save to CSV (save all features, not just top ones)
+ csv_filename = os.path.join(fig_dir, f"SHAP_{feat_type}_importance_{model.name}.csv")
+ feature_importance.to_csv(csv_filename)
+ self.util.debug(f"Saved SHAP feature importance to {csv_filename}")
  self.util.debug(f"plotted SHAP feature importance to {filename}")
 
  def analyse(self):
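The new code above writes a `SHAP_{feat_type}_importance_{model.name}.csv` file one level above the figure directory, with one `importance` value (mean absolute SHAP value) per feature, sorted in descending order. A minimal sketch of inspecting that file follows; the directory and the `os`/`xgb` parts of the file name are placeholders that depend on the configured feature set and model.

import os
import pandas as pd

# Hypothetical experiment layout; adjust to your [EXP] settings.
fig_dir = "results/exp_emodb/images"
csv_filename = os.path.join(fig_dir, "..", "SHAP_os_importance_xgb.csv")

importance = pd.read_csv(csv_filename, index_col=0)
print(importance.head(10))  # the ten features with the highest mean |SHAP|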
@@ -120,6 +141,12 @@ class FeatureAnalyser:
  covariance_type = self.util.config_val(
  "MODEL", "GMM_covariance_type", "full"
  )
+ allowed_cov_types = ["full", "tied", "diag", "spherical"]
+ if covariance_type not in allowed_cov_types:
+ self.util.error(
+ f"Invalid covariance_type '{covariance_type}', must be one of {allowed_cov_types}. Using default 'full'."
+ )
+ covariance_type = "full"
  model = mixture.GaussianMixture(
  n_components=n_components, covariance_type=covariance_type
  )
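The added check constrains the `GMM_covariance_type` setting to the four covariance structures that scikit-learn's GaussianMixture supports. A small standalone sketch (not from nkululeko) fitting a mixture with each accepted value:

import numpy as np
from sklearn import mixture

# Synthetic data only; n_components=2 is arbitrary for illustration.
X = np.random.default_rng(42).normal(size=(200, 4))
for cov in ("full", "tied", "diag", "spherical"):
    gmm = mixture.GaussianMixture(n_components=2, covariance_type=cov, random_state=42)
    labels = gmm.fit_predict(X)
    print(cov, np.bincount(labels))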
@@ -156,7 +183,7 @@
  from sklearn.svm import SVC
 
  c = float(self.util.config_val("MODEL", "C_val", "1.0"))
- model = SVC(kernel="linear", C=c, gamma="scale")
+ model = SVC(kernel="linear", C=c, gamma="scale", random_state=42)
  result_importances[model_s] = self._get_importance(
  model, permutation
  )
@@ -165,7 +192,7 @@
  plots = Plots()
  plots.plot_tree(model, self.features)
  elif model_s == "tree":
- model = DecisionTreeClassifier()
+ model = DecisionTreeClassifier(random_state=42)
  result_importances[model_s] = self._get_importance(
  model, permutation
  )
@@ -176,7 +203,9 @@
  elif model_s == "xgb":
  from xgboost import XGBClassifier
 
- model = XGBClassifier(enable_categorical=True, tree_method="hist")
+ model = XGBClassifier(
+ enable_categorical=True, tree_method="hist", random_state=42
+ )
  self.labels = self.labels.astype("category")
  result_importances[model_s] = self._get_importance(
  model, permutation
@@ -263,13 +292,12 @@
  title += "\n based on feature permutation"
  ax.set(title=title)
  plt.tight_layout()
- fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
- exp_name = self.util.get_exp_name(only_data=True)
+ fig_dir = self.util.get_path("fig_dir")
  format = self.util.config_val("PLOT", "format", "png")
- filename = f"_EXPL_{model_name}"
+ filename = f"EXPL_{model_name}"
  if permutation:
  filename += "_perm"
- filename = f"{fig_dir}{exp_name}{filename}.{format}"
+ filename = f"{fig_dir}{filename}.{format}"
  plt.savefig(filename)
  fig = ax.figure
  fig.clear()
nkululeko/feat_extract/feats_emotion2vec.py CHANGED
@@ -3,7 +3,6 @@
  # choices for feat_type = "emotion2vec", "emotion2vec-large", "emotion2vec-base", "emotion2vec-seed"
 
  # requirements:
- # pip install "modelscope>=1.9.5,<2.0.0"
  # pip install funasr
 
  import os
@@ -43,27 +42,30 @@ class Emotion2vec(Featureset):
  except ImportError:
  self.util.error(
  "FunASR is required for emotion2vec features. "
- "Please install with: pip install funasr modelscope"
+ "Please install with: pip install funasr"
  )
 
- # Map feat_type to model names
+ # Map feat_type to model names on HuggingFace
  model_mapping = {
- "emotion2vec": "iic/emotion2vec_base",
- "emotion2vec-base": "iic/emotion2vec_base_finetuned",
- "emotion2vec-seed": "iic/emotion2vec_plus_seed",
- "emotion2vec-large": "iic/emotion2vec_plus_large",
+ "emotion2vec": "emotion2vec/emotion2vec_base",
+ "emotion2vec-base": "emotion2vec/emotion2vec_base",
+ "emotion2vec-seed": "emotion2vec/emotion2vec_plus_seed",
+ "emotion2vec-large": "emotion2vec/emotion2vec_plus_large",
  }
 
  # Get model path from config or use default mapping
  model_path = self.util.config_val(
  "FEATS",
  "emotion2vec.model",
- model_mapping.get(self.feat_type, "iic/emotion2vec_base"),
+ model_mapping.get(self.feat_type, "emotion2vec/emotion2vec_base"),
  )
 
  try:
- # Initialize the FunASR model for emotion2vec
- self.model = AutoModel(model=model_path)
+ # Initialize the FunASR model for emotion2vec using HuggingFace Hub
+ self.model = AutoModel(
+ model=model_path,
+ hub="hf" # Use HuggingFace Hub instead of ModelScope
+ )
  self.util.debug(f"initialized emotion2vec model: {model_path}")
  self.model_initialized = True
  except Exception as e:
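With this change the emotion2vec checkpoints are resolved on the HuggingFace Hub (repository IDs under `emotion2vec/`) instead of ModelScope, via FunASR's `hub="hf"` argument. A minimal standalone sketch of the same call pattern outside nkululeko; the model ID and the `generate()` arguments are taken from this diff, and `some.wav` is a placeholder audio file:

import numpy as np
from funasr import AutoModel

model = AutoModel(model="emotion2vec/emotion2vec_base", hub="hf")
res = model.generate("some.wav", granularity="utterance", extract_embedding=True)
if isinstance(res, list) and len(res) > 0:
    embedding = np.array(res[0].get("feats")).flatten()
    print(embedding.shape)  # 768-dimensional utterance embedding for the base model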
@@ -131,7 +133,9 @@
  import tempfile
  import soundfile as sf
 
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+ with tempfile.NamedTemporaryFile(
+ suffix=".wav", delete=False
+ ) as tmp_file:
  sf.write(tmp_file.name, signal_np, sampling_rate)
  audio_path = tmp_file.name
  else:
@@ -152,11 +156,20 @@
  embeddings = np.array(embeddings)
  return embeddings.flatten()
  else:
- # Fallback to create default embedding
- return np.array([0.0] * 768)
+ # Fallback based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
  else:
- self.util.error(f"No result from emotion2vec model for file: {file}")
- return np.array([0.0] * 768)
+ self.util.error(
+ f"No result from emotion2vec model for file: {file}"
+ )
+ # Fallback based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
 
  finally:
  # Clean up temporary file if we created one
@@ -166,36 +179,40 @@
  except Exception as e:
  print(f"Error processing {file}: {str(e)}")
  self.util.error(f"couldn't extract file: {file}, error: {str(e)}")
- return np.array([0.0] * 768)
+ # Return appropriate dimension based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
 
  def extract_sample(self, signal, sr):
  """Extract features from a single sample."""
  if not self.model_initialized:
  self.init_model()
-
+
  # Save signal as temporary file for emotion2vec
  import tempfile
  import soundfile as sf
-
+
  try:
  # Convert tensor to numpy if needed
  if torch.is_tensor(signal):
  signal_np = signal.squeeze().numpy()
  else:
  signal_np = signal.squeeze()
-
+
  # Handle multi-channel audio
  if signal_np.ndim > 1:
  signal_np = signal_np[0]
-
+
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
  sf.write(tmp_file.name, signal_np, sr)
-
+
  # Extract using the emotion2vec model
  res = self.model.generate(
  tmp_file.name, granularity="utterance", extract_embedding=True
  )
-
+
  # Get embeddings from result
  if isinstance(res, list) and len(res) > 0:
  embeddings = res[0].get("feats", None)
@@ -203,12 +220,20 @@
  if isinstance(embeddings, list):
  embeddings = np.array(embeddings)
  return embeddings.flatten()
-
- return np.array([0.0] * 768)
-
+
+ # Fallback based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
+
  except Exception as e:
  print(f"Error in extract_sample: {str(e)}")
- return np.array([0.0] * 768)
+ # Return appropriate dimension based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
  finally:
  # Clean up temporary file
  if tmp_file is not None: # Check if tmp_file was created
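The fallback logic repeated above always returns a zero vector sized to the embedding dimension of the selected emotion2vec variant: 1024 for the "large" models, 768 otherwise. A hypothetical helper (not part of nkululeko) expressing the same rule:

import numpy as np

def fallback_embedding(feat_type: str) -> np.ndarray:
    """Zero embedding matching the emotion2vec variant's dimension."""
    dim = 1024 if "large" in feat_type.lower() else 768
    return np.zeros(dim)

print(fallback_embedding("emotion2vec-large").shape)  # (1024,)
print(fallback_embedding("emotion2vec-base").shape)   # (768,)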
nkululeko/feat_extract/feats_praat.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
  import pandas as pd
 
  import nkululeko.glob_conf as glob_conf
- from nkululeko.feat_extract import feinberg_praat
+ from nkululeko.feat_extract import feats_praat_core
  from nkululeko.feat_extract.featureset import Featureset
 
 
@@ -29,7 +29,7 @@ class PraatSet(Featureset):
  no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
  if extract or no_reuse or not os.path.isfile(storage):
  self.util.debug("extracting Praat features, this might take a while...")
- self.df = feinberg_praat.compute_features(self.data_df.index)
+ self.df = feats_praat_core.compute_features(self.data_df.index)
  self.df = self.df.set_index(self.data_df.index)
  for i, col in enumerate(self.df.columns):
  if self.df[col].isnull().values.any():
@@ -58,7 +58,7 @@
  audiofile.write(tmp_audio_names[0], signal, sr)
  df = pd.DataFrame(index=tmp_audio_names)
  index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
- df = feinberg_praat.compute_features(index)
+ df = feats_praat_core.compute_features(index)
  df.set_index(index)
  for i, col in enumerate(df.columns):
  if df[col].isnull().values.any():
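The old feinberg_praat module is replaced by feats_praat_core (see files 14 and 38 in the list above); the call surface used here, `compute_features` on an audformat index, is unchanged. A minimal sketch of that standalone call pattern, assuming nkululeko >= 0.95.1 is installed and `sample.wav` is a placeholder file:

import audformat
import pandas as pd
from nkululeko.feat_extract import feats_praat_core

df = pd.DataFrame(index=["sample.wav"])
index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
feats = feats_praat_core.compute_features(index)
print(feats.shape)  # one row of Praat features per segment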