nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/augmenting/resampler.py +5 -2
- nkululeko/autopredict/ap_emotion.py +36 -0
- nkululeko/autopredict/ap_text.py +45 -0
- nkululeko/autopredict/tests/__init__.py +0 -0
- nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
- nkululeko/autopredict/whisper_transcriber.py +81 -0
- nkululeko/balance.py +222 -0
- nkululeko/constants.py +1 -1
- nkululeko/experiment.py +53 -3
- nkululeko/explore.py +32 -13
- nkululeko/feat_extract/feats_analyser.py +45 -17
- nkululeko/feat_extract/feats_emotion2vec.py +51 -26
- nkululeko/feat_extract/feats_praat.py +3 -3
- nkululeko/feat_extract/feats_praat_core.py +769 -0
- nkululeko/feat_extract/tests/__init__.py +1 -0
- nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
- nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
- nkululeko/glob_conf.py +9 -0
- nkululeko/modelrunner.py +15 -39
- nkululeko/models/model.py +4 -42
- nkululeko/models/model_tuned.py +416 -84
- nkululeko/models/model_xgb.py +148 -2
- nkululeko/models/tests/test_model_knn.py +49 -0
- nkululeko/models/tests/test_model_mlp.py +153 -0
- nkululeko/models/tests/test_model_xgb.py +33 -0
- nkululeko/nkululeko.py +0 -9
- nkululeko/plots.py +25 -19
- nkululeko/predict.py +8 -6
- nkululeko/reporting/report.py +7 -5
- nkululeko/reporting/reporter.py +20 -5
- nkululeko/test_predictor.py +7 -1
- nkululeko/tests/__init__.py +1 -0
- nkululeko/tests/test_balancing.py +270 -0
- nkululeko/utils/util.py +38 -6
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
- nkululeko/feat_extract/feats_opensmile copy.py +0 -93
- nkululeko/feat_extract/feinberg_praat.py +0 -628
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
nkululeko/explore.py
CHANGED
@@ -8,6 +8,8 @@ The script supports the following configuration options:
|
|
8
8
|
- `no_warnings`: If set to `True`, it will ignore all warnings during the exploration.
|
9
9
|
- `feature_distributions`: If set to `True`, it will generate plots of the feature distributions.
|
10
10
|
- `tsne`: If set to `True`, it will generate a t-SNE plot of the feature space.
|
11
|
+
- `umap`: If set to `True`, it will generate a UMAP plot of the feature space.
|
12
|
+
- `pca`: If set to `True`, it will generate a PCA plot of the feature space.
|
11
13
|
- `scatter`: If set to `True`, it will generate a scatter plot of the feature space.
|
12
14
|
- `spotlight`: If set to `True`, it will generate a 'spotlight' plot of the feature space.
|
13
15
|
- `shap`: If set to `True`, it will generate SHAP feature importance plots.
|
@@ -59,10 +61,12 @@ def main():
|
|
59
61
|
|
60
62
|
warnings.filterwarnings("ignore")
|
61
63
|
needs_feats = False
|
64
|
+
experiment_loaded = False
|
62
65
|
try:
|
63
66
|
# load the experiment
|
64
67
|
expr.load(f"{util.get_save_name()}")
|
65
68
|
needs_feats = True
|
69
|
+
experiment_loaded = True
|
66
70
|
except FileNotFoundError:
|
67
71
|
# first time: load the data
|
68
72
|
expr.load_datasets()
|
@@ -73,20 +77,35 @@ def main():
|
|
73
77
|
f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}"
|
74
78
|
)
|
75
79
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
80
|
+
# Check exploration settings regardless of whether experiment was loaded or not
|
81
|
+
plot_feats = eval(util.config_val("EXPL", "feature_distributions", "False"))
|
82
|
+
tsne_plot = eval(util.config_val("EXPL", "tsne", "False"))
|
83
|
+
umap_plot = eval(util.config_val("EXPL", "umap", "False"))
|
84
|
+
pca_plot = eval(util.config_val("EXPL", "pca", "False"))
|
85
|
+
scatter = eval(util.config_val("EXPL", "scatter", "False"))
|
86
|
+
shap = eval(util.config_val("EXPL", "shap", "False"))
|
87
|
+
model_type = util.config_val("EXPL", "model", False)
|
88
|
+
plot_tree = eval(util.config_val("EXPL", "plot_tree", "False"))
|
89
|
+
|
90
|
+
if (
|
91
|
+
plot_feats
|
92
|
+
or tsne_plot
|
93
|
+
or umap_plot
|
94
|
+
or pca_plot
|
95
|
+
or scatter
|
96
|
+
or model_type
|
97
|
+
or plot_tree
|
98
|
+
or shap
|
99
|
+
):
|
100
|
+
# these investigations need features to explore
|
101
|
+
if not experiment_loaded or not needs_feats:
|
85
102
|
expr.extract_feats()
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
#
|
103
|
+
needs_feats = True
|
104
|
+
# explore
|
105
|
+
if shap:
|
106
|
+
# SHAP analysis requires a trained model
|
107
|
+
expr.init_runmanager()
|
108
|
+
expr.runmgr.do_runs()
|
90
109
|
expr.analyse_features(needs_feats)
|
91
110
|
expr.store_report()
|
92
111
|
print("DONE")
|
nkululeko/feat_extract/feats_analyser.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# feats_analyser.py
|
2
2
|
import ast
|
3
|
+
import os
|
3
4
|
|
4
5
|
import matplotlib.pyplot as plt
|
5
6
|
import pandas as pd
|
@@ -76,17 +77,37 @@ class FeatureAnalyser:
|
|
76
77
|
self.util.to_pickle(shap_values, name)
|
77
78
|
else:
|
78
79
|
shap_values = self.util.from_pickle(name)
|
79
|
-
#
|
80
|
-
plt.
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
exp_name = self.util.get_exp_name(only_data=True)
|
80
|
+
# Create SHAP summary plot instead
|
81
|
+
fig, ax = plt.subplots(figsize=(10, 6))
|
82
|
+
shap.plots.bar(shap_values, ax=ax, show=False)
|
83
|
+
fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
|
84
|
+
|
85
85
|
format = self.util.config_val("PLOT", "format", "png")
|
86
|
-
|
87
|
-
filename = f"{
|
88
|
-
|
89
|
-
|
86
|
+
feat_type = self.util.get_feattype_name()
|
87
|
+
filename = f"SHAP_{feat_type}_{model.name}.{format}"
|
88
|
+
filename = os.path.join(fig_dir, filename)
|
89
|
+
|
90
|
+
fig.savefig(filename, dpi=300, bbox_inches="tight")
|
91
|
+
plt.close(fig)
|
92
|
+
|
93
|
+
# print and save SHAP feature importance
|
94
|
+
max_feat_num = len(self.features.columns)
|
95
|
+
shap_importance_values = shap_values.abs.mean(0).values
|
96
|
+
|
97
|
+
feature_cols = self.features.columns
|
98
|
+
feature_importance = pd.DataFrame(
|
99
|
+
shap_importance_values[:max_feat_num],
|
100
|
+
index=feature_cols,
|
101
|
+
columns=["importance"],
|
102
|
+
).sort_values("importance", ascending=False)
|
103
|
+
|
104
|
+
self.util.debug(
|
105
|
+
f"SHAP analysis, features = {feature_importance.index.tolist()}"
|
106
|
+
)
|
107
|
+
# Save to CSV (save all features, not just top ones)
|
108
|
+
csv_filename = os.path.join(fig_dir, f"SHAP_{feat_type}_importance_{model.name}.csv")
|
109
|
+
feature_importance.to_csv(csv_filename)
|
110
|
+
self.util.debug(f"Saved SHAP feature importance to {csv_filename}")
|
90
111
|
self.util.debug(f"plotted SHAP feature importance to {filename}")
|
91
112
|
|
92
113
|
def analyse(self):
|
@@ -120,6 +141,12 @@ class FeatureAnalyser:
|
|
120
141
|
covariance_type = self.util.config_val(
|
121
142
|
"MODEL", "GMM_covariance_type", "full"
|
122
143
|
)
|
144
|
+
allowed_cov_types = ["full", "tied", "diag", "spherical"]
|
145
|
+
if covariance_type not in allowed_cov_types:
|
146
|
+
self.util.error(
|
147
|
+
f"Invalid covariance_type '{covariance_type}', must be one of {allowed_cov_types}. Using default 'full'."
|
148
|
+
)
|
149
|
+
covariance_type = "full"
|
123
150
|
model = mixture.GaussianMixture(
|
124
151
|
n_components=n_components, covariance_type=covariance_type
|
125
152
|
)
|
@@ -156,7 +183,7 @@ class FeatureAnalyser:
|
|
156
183
|
from sklearn.svm import SVC
|
157
184
|
|
158
185
|
c = float(self.util.config_val("MODEL", "C_val", "1.0"))
|
159
|
-
model = SVC(kernel="linear", C=c, gamma="scale")
|
186
|
+
model = SVC(kernel="linear", C=c, gamma="scale", random_state=42)
|
160
187
|
result_importances[model_s] = self._get_importance(
|
161
188
|
model, permutation
|
162
189
|
)
|
@@ -165,7 +192,7 @@ class FeatureAnalyser:
|
|
165
192
|
plots = Plots()
|
166
193
|
plots.plot_tree(model, self.features)
|
167
194
|
elif model_s == "tree":
|
168
|
-
model = DecisionTreeClassifier()
|
195
|
+
model = DecisionTreeClassifier(random_state=42)
|
169
196
|
result_importances[model_s] = self._get_importance(
|
170
197
|
model, permutation
|
171
198
|
)
|
@@ -176,7 +203,9 @@ class FeatureAnalyser:
|
|
176
203
|
elif model_s == "xgb":
|
177
204
|
from xgboost import XGBClassifier
|
178
205
|
|
179
|
-
model = XGBClassifier(
|
206
|
+
model = XGBClassifier(
|
207
|
+
enable_categorical=True, tree_method="hist", random_state=42
|
208
|
+
)
|
180
209
|
self.labels = self.labels.astype("category")
|
181
210
|
result_importances[model_s] = self._get_importance(
|
182
211
|
model, permutation
|
@@ -263,13 +292,12 @@ class FeatureAnalyser:
|
|
263
292
|
title += "\n based on feature permutation"
|
264
293
|
ax.set(title=title)
|
265
294
|
plt.tight_layout()
|
266
|
-
fig_dir = self.util.get_path("fig_dir")
|
267
|
-
exp_name = self.util.get_exp_name(only_data=True)
|
295
|
+
fig_dir = self.util.get_path("fig_dir")
|
268
296
|
format = self.util.config_val("PLOT", "format", "png")
|
269
|
-
filename = f"
|
297
|
+
filename = f"EXPL_{model_name}"
|
270
298
|
if permutation:
|
271
299
|
filename += "_perm"
|
272
|
-
filename = f"{fig_dir}{
|
300
|
+
filename = f"{fig_dir}{filename}.{format}"
|
273
301
|
plt.savefig(filename)
|
274
302
|
fig = ax.figure
|
275
303
|
fig.clear()
|
nkululeko/feat_extract/feats_emotion2vec.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
# choices for feat_type = "emotion2vec", "emotion2vec-large", "emotion2vec-base", "emotion2vec-seed"
|
4
4
|
|
5
5
|
# requirements:
|
6
|
-
# pip install "modelscope>=1.9.5,<2.0.0"
|
7
6
|
# pip install funasr
|
8
7
|
|
9
8
|
import os
|
@@ -43,27 +42,30 @@ class Emotion2vec(Featureset):
|
|
43
42
|
except ImportError:
|
44
43
|
self.util.error(
|
45
44
|
"FunASR is required for emotion2vec features. "
|
46
|
-
"Please install with: pip install funasr
|
45
|
+
"Please install with: pip install funasr"
|
47
46
|
)
|
48
47
|
|
49
|
-
# Map feat_type to model names
|
48
|
+
# Map feat_type to model names on HuggingFace
|
50
49
|
model_mapping = {
|
51
|
-
"emotion2vec": "
|
52
|
-
"emotion2vec-base": "
|
53
|
-
"emotion2vec-seed": "
|
54
|
-
"emotion2vec-large": "
|
50
|
+
"emotion2vec": "emotion2vec/emotion2vec_base",
|
51
|
+
"emotion2vec-base": "emotion2vec/emotion2vec_base",
|
52
|
+
"emotion2vec-seed": "emotion2vec/emotion2vec_plus_seed",
|
53
|
+
"emotion2vec-large": "emotion2vec/emotion2vec_plus_large",
|
55
54
|
}
|
56
55
|
|
57
56
|
# Get model path from config or use default mapping
|
58
57
|
model_path = self.util.config_val(
|
59
58
|
"FEATS",
|
60
59
|
"emotion2vec.model",
|
61
|
-
model_mapping.get(self.feat_type, "
|
60
|
+
model_mapping.get(self.feat_type, "emotion2vec/emotion2vec_base"),
|
62
61
|
)
|
63
62
|
|
64
63
|
try:
|
65
|
-
# Initialize the FunASR model for emotion2vec
|
66
|
-
self.model = AutoModel(
|
64
|
+
# Initialize the FunASR model for emotion2vec using HuggingFace Hub
|
65
|
+
self.model = AutoModel(
|
66
|
+
model=model_path,
|
67
|
+
hub="hf" # Use HuggingFace Hub instead of ModelScope
|
68
|
+
)
|
67
69
|
self.util.debug(f"initialized emotion2vec model: {model_path}")
|
68
70
|
self.model_initialized = True
|
69
71
|
except Exception as e:
|
@@ -131,7 +133,9 @@ class Emotion2vec(Featureset):
|
|
131
133
|
import tempfile
|
132
134
|
import soundfile as sf
|
133
135
|
|
134
|
-
with tempfile.NamedTemporaryFile(
|
136
|
+
with tempfile.NamedTemporaryFile(
|
137
|
+
suffix=".wav", delete=False
|
138
|
+
) as tmp_file:
|
135
139
|
sf.write(tmp_file.name, signal_np, sampling_rate)
|
136
140
|
audio_path = tmp_file.name
|
137
141
|
else:
|
@@ -152,11 +156,20 @@ class Emotion2vec(Featureset):
|
|
152
156
|
embeddings = np.array(embeddings)
|
153
157
|
return embeddings.flatten()
|
154
158
|
else:
|
155
|
-
# Fallback
|
156
|
-
|
159
|
+
# Fallback based on model type
|
160
|
+
if 'large' in self.feat_type.lower():
|
161
|
+
return np.array([0.0] * 1024)
|
162
|
+
else:
|
163
|
+
return np.array([0.0] * 768)
|
157
164
|
else:
|
158
|
-
self.util.error(
|
159
|
-
|
165
|
+
self.util.error(
|
166
|
+
f"No result from emotion2vec model for file: {file}"
|
167
|
+
)
|
168
|
+
# Fallback based on model type
|
169
|
+
if 'large' in self.feat_type.lower():
|
170
|
+
return np.array([0.0] * 1024)
|
171
|
+
else:
|
172
|
+
return np.array([0.0] * 768)
|
160
173
|
|
161
174
|
finally:
|
162
175
|
# Clean up temporary file if we created one
|
@@ -166,36 +179,40 @@ class Emotion2vec(Featureset):
|
|
166
179
|
except Exception as e:
|
167
180
|
print(f"Error processing {file}: {str(e)}")
|
168
181
|
self.util.error(f"couldn't extract file: {file}, error: {str(e)}")
|
169
|
-
|
182
|
+
# Return appropriate dimension based on model type
|
183
|
+
if 'large' in self.feat_type.lower():
|
184
|
+
return np.array([0.0] * 1024)
|
185
|
+
else:
|
186
|
+
return np.array([0.0] * 768)
|
170
187
|
|
171
188
|
def extract_sample(self, signal, sr):
|
172
189
|
"""Extract features from a single sample."""
|
173
190
|
if not self.model_initialized:
|
174
191
|
self.init_model()
|
175
|
-
|
192
|
+
|
176
193
|
# Save signal as temporary file for emotion2vec
|
177
194
|
import tempfile
|
178
195
|
import soundfile as sf
|
179
|
-
|
196
|
+
|
180
197
|
try:
|
181
198
|
# Convert tensor to numpy if needed
|
182
199
|
if torch.is_tensor(signal):
|
183
200
|
signal_np = signal.squeeze().numpy()
|
184
201
|
else:
|
185
202
|
signal_np = signal.squeeze()
|
186
|
-
|
203
|
+
|
187
204
|
# Handle multi-channel audio
|
188
205
|
if signal_np.ndim > 1:
|
189
206
|
signal_np = signal_np[0]
|
190
|
-
|
207
|
+
|
191
208
|
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
|
192
209
|
sf.write(tmp_file.name, signal_np, sr)
|
193
|
-
|
210
|
+
|
194
211
|
# Extract using the emotion2vec model
|
195
212
|
res = self.model.generate(
|
196
213
|
tmp_file.name, granularity="utterance", extract_embedding=True
|
197
214
|
)
|
198
|
-
|
215
|
+
|
199
216
|
# Get embeddings from result
|
200
217
|
if isinstance(res, list) and len(res) > 0:
|
201
218
|
embeddings = res[0].get("feats", None)
|
@@ -203,12 +220,20 @@ class Emotion2vec(Featureset):
|
|
203
220
|
if isinstance(embeddings, list):
|
204
221
|
embeddings = np.array(embeddings)
|
205
222
|
return embeddings.flatten()
|
206
|
-
|
207
|
-
|
208
|
-
|
223
|
+
|
224
|
+
# Fallback based on model type
|
225
|
+
if 'large' in self.feat_type.lower():
|
226
|
+
return np.array([0.0] * 1024)
|
227
|
+
else:
|
228
|
+
return np.array([0.0] * 768)
|
229
|
+
|
209
230
|
except Exception as e:
|
210
231
|
print(f"Error in extract_sample: {str(e)}")
|
211
|
-
|
232
|
+
# Return appropriate dimension based on model type
|
233
|
+
if 'large' in self.feat_type.lower():
|
234
|
+
return np.array([0.0] * 1024)
|
235
|
+
else:
|
236
|
+
return np.array([0.0] * 768)
|
212
237
|
finally:
|
213
238
|
# Clean up temporary file
|
214
239
|
if tmp_file is not None: # Check if tmp_file was created
|
nkululeko/feat_extract/feats_praat.py
CHANGED
@@ -5,7 +5,7 @@ import numpy as np
|
|
5
5
|
import pandas as pd
|
6
6
|
|
7
7
|
import nkululeko.glob_conf as glob_conf
|
8
|
-
from nkululeko.feat_extract import
|
8
|
+
from nkululeko.feat_extract import feats_praat_core
|
9
9
|
from nkululeko.feat_extract.featureset import Featureset
|
10
10
|
|
11
11
|
|
@@ -29,7 +29,7 @@ class PraatSet(Featureset):
|
|
29
29
|
no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
|
30
30
|
if extract or no_reuse or not os.path.isfile(storage):
|
31
31
|
self.util.debug("extracting Praat features, this might take a while...")
|
32
|
-
self.df =
|
32
|
+
self.df = feats_praat_core.compute_features(self.data_df.index)
|
33
33
|
self.df = self.df.set_index(self.data_df.index)
|
34
34
|
for i, col in enumerate(self.df.columns):
|
35
35
|
if self.df[col].isnull().values.any():
|
@@ -58,7 +58,7 @@ class PraatSet(Featureset):
|
|
58
58
|
audiofile.write(tmp_audio_names[0], signal, sr)
|
59
59
|
df = pd.DataFrame(index=tmp_audio_names)
|
60
60
|
index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
|
61
|
-
df =
|
61
|
+
df = feats_praat_core.compute_features(index)
|
62
62
|
df.set_index(index)
|
63
63
|
for i, col in enumerate(df.columns):
|
64
64
|
if df[col].isnull().values.any():
|