nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. nkululeko/augmenting/resampler.py +5 -2
  2. nkululeko/autopredict/ap_emotion.py +36 -0
  3. nkululeko/autopredict/ap_text.py +45 -0
  4. nkululeko/autopredict/tests/__init__.py +0 -0
  5. nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
  6. nkululeko/autopredict/whisper_transcriber.py +81 -0
  7. nkululeko/balance.py +222 -0
  8. nkululeko/constants.py +1 -1
  9. nkululeko/experiment.py +53 -3
  10. nkululeko/explore.py +32 -13
  11. nkululeko/feat_extract/feats_analyser.py +45 -17
  12. nkululeko/feat_extract/feats_emotion2vec.py +51 -26
  13. nkululeko/feat_extract/feats_praat.py +3 -3
  14. nkululeko/feat_extract/feats_praat_core.py +769 -0
  15. nkululeko/feat_extract/tests/__init__.py +1 -0
  16. nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
  17. nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
  18. nkululeko/glob_conf.py +9 -0
  19. nkululeko/modelrunner.py +15 -39
  20. nkululeko/models/model.py +4 -42
  21. nkululeko/models/model_tuned.py +416 -84
  22. nkululeko/models/model_xgb.py +148 -2
  23. nkululeko/models/tests/test_model_knn.py +49 -0
  24. nkululeko/models/tests/test_model_mlp.py +153 -0
  25. nkululeko/models/tests/test_model_xgb.py +33 -0
  26. nkululeko/nkululeko.py +0 -9
  27. nkululeko/plots.py +25 -19
  28. nkululeko/predict.py +8 -6
  29. nkululeko/reporting/report.py +7 -5
  30. nkululeko/reporting/reporter.py +20 -5
  31. nkululeko/test_predictor.py +7 -1
  32. nkululeko/tests/__init__.py +1 -0
  33. nkululeko/tests/test_balancing.py +270 -0
  34. nkululeko/utils/util.py +38 -6
  35. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
  36. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
  37. nkululeko/feat_extract/feats_opensmile copy.py +0 -93
  38. nkululeko/feat_extract/feinberg_praat.py +0 -628
  39. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
  40. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
  41. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
  42. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
nkululeko/explore.py CHANGED
@@ -8,6 +8,8 @@ The script supports the following configuration options:
  - `no_warnings`: If set to `True`, it will ignore all warnings during the exploration.
  - `feature_distributions`: If set to `True`, it will generate plots of the feature distributions.
  - `tsne`: If set to `True`, it will generate a t-SNE plot of the feature space.
+ - `umap`: If set to `True`, it will generate a UMAP plot of the feature space.
+ - `pca`: If set to `True`, it will generate a PCA plot of the feature space.
  - `scatter`: If set to `True`, it will generate a scatter plot of the feature space.
  - `spotlight`: If set to `True`, it will generate a 'spotlight' plot of the feature space.
  - `shap`: If set to `True`, it will generate SHAP feature importance plots.
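The two added options are read from the `[EXPL]` section of the experiment's INI configuration, like the existing ones. Below is a minimal sketch (not part of the package) of such a section and of the eval-based reading pattern explore.py uses via `util.config_val`; the other sections of a real configuration are omitted.

import configparser

# Hypothetical [EXPL] section enabling the new UMAP and PCA plots.
config = configparser.ConfigParser()
config.read_string("""\
[EXPL]
feature_distributions = True
tsne = True
umap = True
pca = True
shap = False
""")

# explore.py stores booleans as strings and eval()s them; mirrored here.
umap_plot = eval(config.get("EXPL", "umap", fallback="False"))
pca_plot = eval(config.get("EXPL", "pca", fallback="False"))
print(umap_plot, pca_plot)  # True True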
@@ -59,10 +61,12 @@ def main():
 
  warnings.filterwarnings("ignore")
  needs_feats = False
+ experiment_loaded = False
  try:
  # load the experiment
  expr.load(f"{util.get_save_name()}")
  needs_feats = True
+ experiment_loaded = True
  except FileNotFoundError:
  # first time: load the data
  expr.load_datasets()
@@ -73,20 +77,35 @@ def main():
  f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}"
  )
 
- plot_feats = eval(util.config_val("EXPL", "feature_distributions", "False"))
- tsne = eval(util.config_val("EXPL", "tsne", "False"))
- scatter = eval(util.config_val("EXPL", "scatter", "False"))
- shap = eval(util.config_val("EXPL", "shap", "False"))
- model_type = util.config_val("EXPL", "model", False)
- plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
- needs_feats = False
- if plot_feats or tsne or scatter or model_type or plot_tree or shap:
- # these investigations need features to explore
+ # Check exploration settings regardless of whether experiment was loaded or not
+ plot_feats = eval(util.config_val("EXPL", "feature_distributions", "False"))
+ tsne_plot = eval(util.config_val("EXPL", "tsne", "False"))
+ umap_plot = eval(util.config_val("EXPL", "umap", "False"))
+ pca_plot = eval(util.config_val("EXPL", "pca", "False"))
+ scatter = eval(util.config_val("EXPL", "scatter", "False"))
+ shap = eval(util.config_val("EXPL", "shap", "False"))
+ model_type = util.config_val("EXPL", "model", False)
+ plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
+
+ if (
+ plot_feats
+ or tsne_plot
+ or umap_plot
+ or pca_plot
+ or scatter
+ or model_type
+ or plot_tree
+ or shap
+ ):
+ # these investigations need features to explore
+ if not experiment_loaded or not needs_feats:
  expr.extract_feats()
- needs_feats = True
- # explore
- # expr.init_runmanager()
- # expr.runmgr.do_runs()
+ needs_feats = True
+ # explore
+ if shap:
+ # SHAP analysis requires a trained model
+ expr.init_runmanager()
+ expr.runmgr.do_runs()
  expr.analyse_features(needs_feats)
  expr.store_report()
  print("DONE")
nkululeko/feat_extract/feats_analyser.py CHANGED
@@ -1,5 +1,6 @@
  # feats_analyser.py
  import ast
+ import os
 
  import matplotlib.pyplot as plt
  import pandas as pd
@@ -76,17 +77,37 @@ class FeatureAnalyser:
  self.util.to_pickle(shap_values, name)
  else:
  shap_values = self.util.from_pickle(name)
- # plt.figure()
- plt.close("all")
- plt.tight_layout()
- shap.plots.bar(shap_values)
- fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
- exp_name = self.util.get_exp_name(only_data=True)
+ # Create SHAP summary plot instead
+ fig, ax = plt.subplots(figsize=(10, 6))
+ shap.plots.bar(shap_values, ax=ax, show=False)
+ fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
+
  format = self.util.config_val("PLOT", "format", "png")
- filename = f"_SHAP_{model.name}"
- filename = f"{fig_dir}{exp_name}{filename}.{format}"
- plt.savefig(filename)
- plt.close()
+ feat_type = self.util.get_feattype_name()
+ filename = f"SHAP_{feat_type}_{model.name}.{format}"
+ filename = os.path.join(fig_dir, filename)
+
+ fig.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.close(fig)
+
+ # print and save SHAP feature importance
+ max_feat_num = len(self.features.columns)
+ shap_importance_values = shap_values.abs.mean(0).values
+
+ feature_cols = self.features.columns
+ feature_importance = pd.DataFrame(
+ shap_importance_values[:max_feat_num],
+ index=feature_cols,
+ columns=["importance"],
+ ).sort_values("importance", ascending=False)
+
+ self.util.debug(
+ f"SHAP analysis, features = {feature_importance.index.tolist()}"
+ )
+ # Save to CSV (save all features, not just top ones)
+ csv_filename = os.path.join(fig_dir, f"SHAP_{feat_type}_importance_{model.name}.csv")
+ feature_importance.to_csv(csv_filename)
+ self.util.debug(f"Saved SHAP feature importance to {csv_filename}")
  self.util.debug(f"plotted SHAP feature importance to {filename}")
 
  def analyse(self):
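The new code above writes a `SHAP_{feat_type}_importance_{model.name}.csv` file one level above the figure directory, with one `importance` value (mean absolute SHAP value) per feature, sorted in descending order. A minimal sketch of inspecting that file follows; the directory and the `os`/`xgb` parts of the file name are placeholders that depend on the configured feature set and model.

import os
import pandas as pd

# Hypothetical experiment layout; adjust to your [EXP] settings.
fig_dir = "results/exp_emodb/images"
csv_filename = os.path.join(fig_dir, "..", "SHAP_os_importance_xgb.csv")

importance = pd.read_csv(csv_filename, index_col=0)
print(importance.head(10))  # the ten features with the highest mean |SHAP|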
@@ -120,6 +141,12 @@ class FeatureAnalyser:
  covariance_type = self.util.config_val(
  "MODEL", "GMM_covariance_type", "full"
  )
+ allowed_cov_types = ["full", "tied", "diag", "spherical"]
+ if covariance_type not in allowed_cov_types:
+ self.util.error(
+ f"Invalid covariance_type '{covariance_type}', must be one of {allowed_cov_types}. Using default 'full'."
+ )
+ covariance_type = "full"
  model = mixture.GaussianMixture(
  n_components=n_components, covariance_type=covariance_type
  )
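The added check constrains the `GMM_covariance_type` setting to the four covariance structures that scikit-learn's GaussianMixture supports. A small standalone sketch (not from nkululeko) fitting a mixture with each accepted value:

import numpy as np
from sklearn import mixture

# Synthetic data only; n_components=2 is arbitrary for illustration.
X = np.random.default_rng(42).normal(size=(200, 4))
for cov in ("full", "tied", "diag", "spherical"):
    gmm = mixture.GaussianMixture(n_components=2, covariance_type=cov, random_state=42)
    labels = gmm.fit_predict(X)
    print(cov, np.bincount(labels))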
@@ -156,7 +183,7 @@
  from sklearn.svm import SVC
 
  c = float(self.util.config_val("MODEL", "C_val", "1.0"))
- model = SVC(kernel="linear", C=c, gamma="scale")
+ model = SVC(kernel="linear", C=c, gamma="scale", random_state=42)
  result_importances[model_s] = self._get_importance(
  model, permutation
  )
@@ -165,7 +192,7 @@
  plots = Plots()
  plots.plot_tree(model, self.features)
  elif model_s == "tree":
- model = DecisionTreeClassifier()
+ model = DecisionTreeClassifier(random_state=42)
  result_importances[model_s] = self._get_importance(
  model, permutation
  )
@@ -176,7 +203,9 @@
  elif model_s == "xgb":
  from xgboost import XGBClassifier
 
- model = XGBClassifier(enable_categorical=True, tree_method="hist")
+ model = XGBClassifier(
+ enable_categorical=True, tree_method="hist", random_state=42
+ )
  self.labels = self.labels.astype("category")
  result_importances[model_s] = self._get_importance(
  model, permutation
@@ -263,13 +292,12 @@
  title += "\n based on feature permutation"
  ax.set(title=title)
  plt.tight_layout()
- fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
- exp_name = self.util.get_exp_name(only_data=True)
+ fig_dir = self.util.get_path("fig_dir")
  format = self.util.config_val("PLOT", "format", "png")
- filename = f"_EXPL_{model_name}"
+ filename = f"EXPL_{model_name}"
  if permutation:
  filename += "_perm"
- filename = f"{fig_dir}{exp_name}{filename}.{format}"
+ filename = f"{fig_dir}{filename}.{format}"
  plt.savefig(filename)
  fig = ax.figure
  fig.clear()
nkululeko/feat_extract/feats_emotion2vec.py CHANGED
@@ -3,7 +3,6 @@
  # choices for feat_type = "emotion2vec", "emotion2vec-large", "emotion2vec-base", "emotion2vec-seed"
 
  # requirements:
- # pip install "modelscope>=1.9.5,<2.0.0"
  # pip install funasr
 
  import os
@@ -43,27 +42,30 @@ class Emotion2vec(Featureset):
  except ImportError:
  self.util.error(
  "FunASR is required for emotion2vec features. "
- "Please install with: pip install funasr modelscope"
+ "Please install with: pip install funasr"
  )
 
- # Map feat_type to model names
+ # Map feat_type to model names on HuggingFace
  model_mapping = {
- "emotion2vec": "iic/emotion2vec_base",
- "emotion2vec-base": "iic/emotion2vec_base_finetuned",
- "emotion2vec-seed": "iic/emotion2vec_plus_seed",
- "emotion2vec-large": "iic/emotion2vec_plus_large",
+ "emotion2vec": "emotion2vec/emotion2vec_base",
+ "emotion2vec-base": "emotion2vec/emotion2vec_base",
+ "emotion2vec-seed": "emotion2vec/emotion2vec_plus_seed",
+ "emotion2vec-large": "emotion2vec/emotion2vec_plus_large",
  }
 
  # Get model path from config or use default mapping
  model_path = self.util.config_val(
  "FEATS",
  "emotion2vec.model",
- model_mapping.get(self.feat_type, "iic/emotion2vec_base"),
+ model_mapping.get(self.feat_type, "emotion2vec/emotion2vec_base"),
  )
 
  try:
- # Initialize the FunASR model for emotion2vec
- self.model = AutoModel(model=model_path)
+ # Initialize the FunASR model for emotion2vec using HuggingFace Hub
+ self.model = AutoModel(
+ model=model_path,
+ hub="hf" # Use HuggingFace Hub instead of ModelScope
+ )
  self.util.debug(f"initialized emotion2vec model: {model_path}")
  self.model_initialized = True
  except Exception as e:
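With this change the emotion2vec checkpoints are resolved on the HuggingFace Hub (repository IDs under `emotion2vec/`) instead of ModelScope, via FunASR's `hub="hf"` argument. A minimal standalone sketch of the same call pattern outside nkululeko; the model ID and the `generate()` arguments are taken from this diff, and `some.wav` is a placeholder audio file:

import numpy as np
from funasr import AutoModel

model = AutoModel(model="emotion2vec/emotion2vec_base", hub="hf")
res = model.generate("some.wav", granularity="utterance", extract_embedding=True)
if isinstance(res, list) and len(res) > 0:
    embedding = np.array(res[0].get("feats")).flatten()
    print(embedding.shape)  # 768-dimensional utterance embedding for the base model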
@@ -131,7 +133,9 @@
  import tempfile
  import soundfile as sf
 
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+ with tempfile.NamedTemporaryFile(
+ suffix=".wav", delete=False
+ ) as tmp_file:
  sf.write(tmp_file.name, signal_np, sampling_rate)
  audio_path = tmp_file.name
  else:
@@ -152,11 +156,20 @@
  embeddings = np.array(embeddings)
  return embeddings.flatten()
  else:
- # Fallback to create default embedding
- return np.array([0.0] * 768)
+ # Fallback based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
  else:
- self.util.error(f"No result from emotion2vec model for file: {file}")
- return np.array([0.0] * 768)
+ self.util.error(
+ f"No result from emotion2vec model for file: {file}"
+ )
+ # Fallback based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
 
  finally:
  # Clean up temporary file if we created one
@@ -166,36 +179,40 @@
  except Exception as e:
  print(f"Error processing {file}: {str(e)}")
  self.util.error(f"couldn't extract file: {file}, error: {str(e)}")
- return np.array([0.0] * 768)
+ # Return appropriate dimension based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
 
  def extract_sample(self, signal, sr):
  """Extract features from a single sample."""
  if not self.model_initialized:
  self.init_model()
-
+
  # Save signal as temporary file for emotion2vec
  import tempfile
  import soundfile as sf
-
+
  try:
  # Convert tensor to numpy if needed
  if torch.is_tensor(signal):
  signal_np = signal.squeeze().numpy()
  else:
  signal_np = signal.squeeze()
-
+
  # Handle multi-channel audio
  if signal_np.ndim > 1:
  signal_np = signal_np[0]
-
+
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
  sf.write(tmp_file.name, signal_np, sr)
-
+
  # Extract using the emotion2vec model
  res = self.model.generate(
  tmp_file.name, granularity="utterance", extract_embedding=True
  )
-
+
  # Get embeddings from result
  if isinstance(res, list) and len(res) > 0:
  embeddings = res[0].get("feats", None)
@@ -203,12 +220,20 @@
  if isinstance(embeddings, list):
  embeddings = np.array(embeddings)
  return embeddings.flatten()
-
- return np.array([0.0] * 768)
-
+
+ # Fallback based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
+
  except Exception as e:
  print(f"Error in extract_sample: {str(e)}")
- return np.array([0.0] * 768)
+ # Return appropriate dimension based on model type
+ if 'large' in self.feat_type.lower():
+ return np.array([0.0] * 1024)
+ else:
+ return np.array([0.0] * 768)
  finally:
  # Clean up temporary file
  if tmp_file is not None: # Check if tmp_file was created
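The fallback logic repeated above always returns a zero vector sized to the embedding dimension of the selected emotion2vec variant: 1024 for the "large" models, 768 otherwise. A hypothetical helper (not part of nkululeko) expressing the same rule:

import numpy as np

def fallback_embedding(feat_type: str) -> np.ndarray:
    """Zero embedding matching the emotion2vec variant's dimension."""
    dim = 1024 if "large" in feat_type.lower() else 768
    return np.zeros(dim)

print(fallback_embedding("emotion2vec-large").shape)  # (1024,)
print(fallback_embedding("emotion2vec-base").shape)   # (768,)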
nkululeko/feat_extract/feats_praat.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
  import pandas as pd
 
  import nkululeko.glob_conf as glob_conf
- from nkululeko.feat_extract import feinberg_praat
+ from nkululeko.feat_extract import feats_praat_core
  from nkululeko.feat_extract.featureset import Featureset
 
 
@@ -29,7 +29,7 @@ class PraatSet(Featureset):
  no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
  if extract or no_reuse or not os.path.isfile(storage):
  self.util.debug("extracting Praat features, this might take a while...")
- self.df = feinberg_praat.compute_features(self.data_df.index)
+ self.df = feats_praat_core.compute_features(self.data_df.index)
  self.df = self.df.set_index(self.data_df.index)
  for i, col in enumerate(self.df.columns):
  if self.df[col].isnull().values.any():
@@ -58,7 +58,7 @@
  audiofile.write(tmp_audio_names[0], signal, sr)
  df = pd.DataFrame(index=tmp_audio_names)
  index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
- df = feinberg_praat.compute_features(index)
+ df = feats_praat_core.compute_features(index)
  df.set_index(index)
  for i, col in enumerate(df.columns):
  if df[col].isnull().values.any():
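The old feinberg_praat module is replaced by feats_praat_core (see files 14 and 38 in the list above); the call surface used here, `compute_features` on an audformat index, is unchanged. A minimal sketch of that standalone call pattern, assuming nkululeko >= 0.95.1 is installed and `sample.wav` is a placeholder file:

import audformat
import pandas as pd
from nkululeko.feat_extract import feats_praat_core

df = pd.DataFrame(index=["sample.wav"])
index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
feats = feats_praat_core.compute_features(index)
print(feats.shape)  # one row of Praat features per segment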