nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. nkululeko/augmenting/resampler.py +5 -2
  2. nkululeko/autopredict/ap_emotion.py +36 -0
  3. nkululeko/autopredict/ap_text.py +45 -0
  4. nkululeko/autopredict/tests/__init__.py +0 -0
  5. nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
  6. nkululeko/autopredict/whisper_transcriber.py +81 -0
  7. nkululeko/balance.py +222 -0
  8. nkululeko/constants.py +1 -1
  9. nkululeko/experiment.py +53 -3
  10. nkululeko/explore.py +32 -13
  11. nkululeko/feat_extract/feats_analyser.py +45 -17
  12. nkululeko/feat_extract/feats_emotion2vec.py +51 -26
  13. nkululeko/feat_extract/feats_praat.py +3 -3
  14. nkululeko/feat_extract/feats_praat_core.py +769 -0
  15. nkululeko/feat_extract/tests/__init__.py +1 -0
  16. nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
  17. nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
  18. nkululeko/glob_conf.py +9 -0
  19. nkululeko/modelrunner.py +15 -39
  20. nkululeko/models/model.py +4 -42
  21. nkululeko/models/model_tuned.py +416 -84
  22. nkululeko/models/model_xgb.py +148 -2
  23. nkululeko/models/tests/test_model_knn.py +49 -0
  24. nkululeko/models/tests/test_model_mlp.py +153 -0
  25. nkululeko/models/tests/test_model_xgb.py +33 -0
  26. nkululeko/nkululeko.py +0 -9
  27. nkululeko/plots.py +25 -19
  28. nkululeko/predict.py +8 -6
  29. nkululeko/reporting/report.py +7 -5
  30. nkululeko/reporting/reporter.py +20 -5
  31. nkululeko/test_predictor.py +7 -1
  32. nkululeko/tests/__init__.py +1 -0
  33. nkululeko/tests/test_balancing.py +270 -0
  34. nkululeko/utils/util.py +38 -6
  35. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
  36. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
  37. nkululeko/feat_extract/feats_opensmile copy.py +0 -93
  38. nkululeko/feat_extract/feinberg_praat.py +0 -628
  39. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
  40. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
  41. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
  42. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1 @@
1
+ # Tests for feat_extract module
@@ -0,0 +1,162 @@
1
+ import os
2
+ from unittest.mock import MagicMock, patch
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import pytest
7
+
8
+ import nkululeko.glob_conf as glob_conf
9
+ from nkululeko.feat_extract.feats_opensmile import Opensmileset
10
+
11
+
12
class DummyUtil:
    """Lightweight stand-in for the utility object attached to a feature set.

    Logging calls are no-ops, ``error`` raises, and configuration lookups are
    answered from a fixed table so no real configuration file is needed.
    """

    # Canned configuration answers, keyed by (section, key).
    _CONFIG_VALUES = {
        ("FEATS", "set"): "eGeMAPSv02",
        ("FEATS", "level"): "functionals",
        ("FEATS", "needs_feature_extraction"): "False",
        ("FEATS", "no_reuse"): "False",
        ("FEATS", "store_format"): "pkl",
        ("MODEL", "n_jobs"): "1",
    }

    def config_val(self, section, key, default=None):
        """Return the canned value for ``(section, key)`` or *default*."""
        return self._CONFIG_VALUES.get((section, key), default)

    def debug(self, msg):
        """Ignore debug messages."""
        return None

    def warning(self, msg):
        """Ignore warnings."""
        return None

    def error(self, msg):
        """Raise a plain ``Exception`` carrying *msg*."""
        raise Exception(msg)

    def get_path(self, key):
        """Return the same fixed store directory for every *key*."""
        return "/tmp/test_store/"

    def get_exp_name(self, only_train=False):
        """Return the fixed experiment name, regardless of *only_train*."""
        return "test_exp"

    def write_store(self, df, path, format):
        """Pretend to persist *df*; nothing is written."""
        return None

    def get_store(self, path, format):
        """Return an empty DataFrame instead of reading anything."""
        return pd.DataFrame()
32
+
33
+
34
@pytest.fixture
def mock_config():
    """Temporarily replace ``glob_conf.config`` with a small dict-based config.

    Yields the replacement dict; the original attribute is restored when the
    ``patch.object`` context exits at fixture teardown.
    """
    feats_section = {
        "features": "[]",  # Empty list for features filtering
        "set": "eGeMAPSv02",
        "level": "functionals",
        "needs_feature_extraction": "False",
        "no_reuse": "False",
        "store_format": "pkl",
    }
    fake_config = {
        "EXP": {"root": "/tmp/test_nkululeko", "name": "test_exp"},
        "FEATS": feats_section,
        "DATA": {"needs_feature_extraction": "False"},
        "MODEL": {"n_jobs": "1"},
    }

    # Swap the module-level config for the duration of the test
    with patch.object(glob_conf, "config", fake_config):
        yield fake_config
61
+
62
+
63
@pytest.fixture
def sample_data_df():
    """Build a three-row label frame indexed by (file, start, end) segments.

    The file entries are path strings under ``data/test/audio``; each segment
    spans 0 s to 1 s. Nothing here checks that the files exist.
    """
    wav_paths = [
        "data/test/audio/03a01Fa.wav",
        "data/test/audio/03a01Nc.wav",
        "data/test/audio/03a01Wa.wav",
    ]

    # One (file, start, end) entry per path, as expected by nkululeko
    segment_start = pd.Timedelta(0)
    segment_end = pd.Timedelta(seconds=1)
    segments = pd.MultiIndex.from_tuples(
        [(wav_path, segment_start, segment_end) for wav_path in wav_paths],
        names=["file", "start", "end"],
    )

    labels = {
        "speaker": ["speaker1", "speaker2", "speaker3"],
        "emotion": ["neutral", "happy", "sad"],
    }
    return pd.DataFrame(labels, index=segments)
81
+
82
+
83
@patch.object(Opensmileset, "__init__", return_value=None)
def test_extract(mock_init, sample_data_df, mock_config):
    """Check the shape of the frame returned by a stubbed ``extract``.

    NOTE(review): ``extract`` is itself replaced by a mock below, so this
    verifies only the stubbed return value, not the real extraction code.
    """
    # Build the object without running the real initializer
    feature_set = Opensmileset.__new__(Opensmileset)
    feature_set.name = "test"
    feature_set.data_df = sample_data_df
    feature_set.util = DummyUtil()
    feature_set.df = pd.DataFrame()

    # Frame the stubbed extract() will hand back
    stub_columns = {
        "F0semitoneFrom27.5Hz_sma3nz_amean": [100.0, 105.0, 95.0],
        "F0semitoneFrom27.5Hz_sma3nz_stddevNorm": [0.1, 0.15, 0.08],
        "loudness_sma3_amean": [50.0, 55.0, 45.0],
    }
    stub_features = pd.DataFrame(stub_columns, index=sample_data_df.index)

    with patch.object(feature_set, "extract", return_value=stub_features):
        extracted = feature_set.extract()

    # Three segments, three feature columns
    assert not extracted.empty
    assert len(extracted) == 3
    assert extracted.shape[1] == 3
107
+
108
+
109
@patch.object(Opensmileset, "__init__", return_value=None)
def test_extract_sample(mock_init, sample_data_df, mock_config):
    """Check the type and length of a stubbed ``extract_sample`` result.

    NOTE(review): ``extract_sample`` is itself replaced by a mock below, so
    this verifies only the stubbed return value, not the real method.
    """
    # Build the object without running the real initializer
    feature_set = Opensmileset.__new__(Opensmileset)
    feature_set.name = "test"
    feature_set.data_df = sample_data_df
    feature_set.util = DummyUtil()

    # Vector the stubbed extract_sample() will hand back
    stub_vector = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

    with patch.object(feature_set, "extract_sample", return_value=stub_vector):
        # Synthetic signal (5000 samples) and its sample rate
        sample_rate = 16000
        waveform = np.array([0.1, 0.2, 0.3, 0.4, 0.5] * 1000)

        feats = feature_set.extract_sample(waveform, sample_rate)

        # The result must be a five-element numpy array
        assert isinstance(feats, np.ndarray)
        assert len(feats) == 5
132
+
133
+
134
@patch.object(Opensmileset, "__init__", return_value=None)
def test_filter(mock_init, sample_data_df, mock_config):
    """Check the shape of the frame returned by a stubbed ``filter``.

    NOTE(review): ``filter`` is itself replaced by a mock below, so this
    verifies only the stubbed return value, not the real filtering logic.
    """
    segments = sample_data_df.index
    f0_mean_column = "F0semitoneFrom27.5Hz_sma3nz_amean"
    f0_mean_values = [100.0, 105.0, 95.0]

    # Build the object without running the real initializer
    feature_set = Opensmileset.__new__(Opensmileset)
    feature_set.name = "test"
    feature_set.data_df = sample_data_df
    feature_set.util = DummyUtil()

    # Unfiltered features: three columns
    feature_set.df = pd.DataFrame(
        {
            f0_mean_column: [100.0, 105.0, 95.0],
            "F0semitoneFrom27.5Hz_sma3nz_stddevNorm": [0.1, 0.15, 0.08],
            "loudness_sma3_amean": [50.0, 55.0, 45.0],
        },
        index=segments,
    )

    # Frame the stubbed filter() will hand back: a single column
    stub_filtered = pd.DataFrame({f0_mean_column: f0_mean_values}, index=segments)

    with patch.object(feature_set, "filter", return_value=stub_filtered):
        result = feature_set.filter()

        # All three rows are kept, one column remains
        assert not result.empty
        assert result.shape[0] == 3
        assert result.shape[1] == 1
@@ -0,0 +1,507 @@
1
+ import os
2
+ import tempfile
3
+ from unittest.mock import MagicMock, Mock, patch
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import parselmouth
8
+ import pytest
9
+ from scipy.stats import lognorm
10
+
11
+ from nkululeko.feat_extract.feats_praat_core import (AudioFeatureExtractor,
12
+ add_derived_features,
13
+ compute_features,
14
+ get_speech_rate, run_pca,
15
+ speech_rate)
16
+
17
+
18
class TestAudioFeatureExtractor:
    """Unit tests for ``AudioFeatureExtractor``.

    The pitch and formant tests patch ``feats_praat_core.call`` and feed it a
    fixed sequence of return values, so the order of each ``side_effect``
    list has to match the order in which the extractor issues its calls.
    """

    @pytest.fixture
    def extractor(self):
        """Extractor configured with f0min=75 and f0max=300."""
        return AudioFeatureExtractor(f0min=75, f0max=300)

    @pytest.fixture
    def mock_sound(self):
        """Mock sound object whose total duration is reported as 2.5."""
        sound = Mock()
        sound.get_total_duration.return_value = 2.5
        return sound

    def test_init(self):
        """Explicit pitch bounds are stored on the instance."""
        extractor = AudioFeatureExtractor(f0min=50, f0max=400)
        assert extractor.f0min == 50
        assert extractor.f0max == 400

    def test_init_default_values(self):
        """Without arguments the bounds default to 75 and 300."""
        extractor = AudioFeatureExtractor()
        assert extractor.f0min == 75
        assert extractor.f0max == 300

    @patch('nkululeko.feat_extract.feats_praat_core.call')
    def test_extract_pitch_features(self, mock_call, extractor, mock_sound):
        """Pitch, HNR, jitter and shimmer values are passed through to the result."""
        mock_pitch = Mock()
        mock_point_process = Mock()

        # Mock call return values, in call order
        mock_call.side_effect = [
            150.0,   # mean_f0
            25.0,    # stdev_f0
            Mock(),  # harmonicity object
            0.8,     # hnr
            0.01,    # local_jitter
            0.05,    # localabsolute_jitter
            0.02,    # rap_jitter
            0.03,    # ppq5_jitter
            0.04,    # ddp_jitter
            0.1,     # local_shimmer
            0.5,     # localdb_shimmer
            0.15,    # apq3_shimmer
            0.2,     # apq5_shimmer
            0.25,    # apq11_shimmer
            0.3,     # dda_shimmer
        ]

        result = extractor._extract_pitch_features(mock_sound, mock_pitch, mock_point_process)

        assert result['meanF0Hz'] == 150.0
        assert result['stdevF0Hz'] == 25.0
        assert result['HNR'] == 0.8
        assert result['localJitter'] == 0.01
        assert len(result) == 14

    @patch('nkululeko.feat_extract.feats_praat_core.call')
    def test_extract_formant_features(self, mock_call, extractor, mock_sound):
        """Mean and median are reported for each of the four formants."""
        mock_point_process = Mock()

        # Mock formant values: one (time, f1..f4) group per point
        mock_call.side_effect = [
            Mock(),  # formants object
            3,       # num_points
            0.5,     # time from index 1
            800.0,   # f1 at time 0.5
            1200.0,  # f2 at time 0.5
            2800.0,  # f3 at time 0.5
            3500.0,  # f4 at time 0.5
            1.0,     # time from index 2
            750.0,   # f1 at time 1.0
            1150.0,  # f2 at time 1.0
            2700.0,  # f3 at time 1.0
            3400.0,  # f4 at time 1.0
            1.5,     # time from index 3
            820.0,   # f1 at time 1.5
            1250.0,  # f2 at time 1.5
            2900.0,  # f3 at time 1.5
            3600.0,  # f4 at time 1.5
        ]

        result = extractor._extract_formant_features(mock_sound, mock_point_process)

        assert 'f1_mean' in result
        assert 'f2_mean' in result
        assert 'f3_mean' in result
        assert 'f4_mean' in result
        assert 'f1_median' in result
        assert 'f2_median' in result
        assert 'f3_median' in result
        assert 'f4_median' in result
        assert len(result) == 8

    @patch('nkululeko.feat_extract.feats_praat_core.call')
    def test_extract_formant_features_with_nan(self, mock_call, extractor, mock_sound):
        """NaN formant readings do not break the summary statistics."""
        mock_point_process = Mock()

        # Mock with some NaN values
        mock_call.side_effect = [
            Mock(),        # formants object
            2,             # num_points
            0.5,           # time from index 1
            float('nan'),  # f1 at time 0.5 (NaN)
            1200.0,        # f2 at time 0.5
            float('nan'),  # f3 at time 0.5 (NaN)
            3500.0,        # f4 at time 0.5
            1.0,           # time from index 2
            750.0,         # f1 at time 1.0
            1150.0,        # f2 at time 1.0
            2700.0,        # f3 at time 1.0
            float('nan'),  # f4 at time 1.0 (NaN)
        ]

        result = extractor._extract_formant_features(mock_sound, mock_point_process)

        # Should handle NaN values gracefully
        assert 'f1_mean' in result
        assert not np.isnan(result['f2_mean'])
        assert len(result) == 8

    def test_calculate_pause_distribution_empty_list(self, extractor):
        """With no pauses every pause statistic is NaN."""
        result = extractor._calculate_pause_distribution([])

        assert np.isnan(result['pause_lognorm_mu'])
        assert np.isnan(result['pause_lognorm_sigma'])
        assert np.isnan(result['pause_lognorm_ks_pvalue'])
        assert np.isnan(result['pause_mean_duration'])
        assert np.isnan(result['pause_std_duration'])
        assert np.isnan(result['pause_cv'])

    def test_calculate_pause_distribution_valid_data(self, extractor):
        """Valid pause durations yield finite summary statistics."""
        pause_durations = [0.1, 0.2, 0.3, 0.4, 0.5]
        result = extractor._calculate_pause_distribution(pause_durations)

        assert not np.isnan(result['pause_mean_duration'])
        assert not np.isnan(result['pause_std_duration'])
        assert not np.isnan(result['pause_cv'])
        # The mean is a computed float: compare with a tolerance so the test
        # does not depend on the summation order used by the implementation.
        assert result['pause_mean_duration'] == pytest.approx(0.3)
        assert len(result) == 6
155
+
156
+
157
class TestRunPCA:
    """Tests for ``run_pca`` on tables of jitter and shimmer measures."""

    # Three-row baseline of the jitter and shimmer columns passed to run_pca.
    _MEASURES = {
        'localJitter': (0.01, 0.02, 0.015),
        'localabsoluteJitter': (0.05, 0.06, 0.055),
        'rapJitter': (0.02, 0.03, 0.025),
        'ppq5Jitter': (0.03, 0.04, 0.035),
        'ddpJitter': (0.04, 0.05, 0.045),
        'localShimmer': (0.1, 0.2, 0.15),
        'localdbShimmer': (0.5, 0.6, 0.55),
        'apq3Shimmer': (0.15, 0.25, 0.2),
        'apq5Shimmer': (0.2, 0.3, 0.25),
        'apq11Shimmer': (0.25, 0.35, 0.3),
        'ddaShimmer': (0.3, 0.4, 0.35),
    }

    @classmethod
    def _frame(cls, n_rows=3, overrides=None):
        """Build the input frame from the first *n_rows* baseline rows.

        *overrides* maps a column name to a replacement list of values; the
        column keeps its original position.
        """
        table = {name: list(values[:n_rows]) for name, values in cls._MEASURES.items()}
        if overrides:
            table.update(overrides)
        return pd.DataFrame(table)

    def test_run_pca_valid_data(self):
        """Complete data yields both PCA columns and keeps all rows."""
        df = self._frame()

        result = run_pca(df)

        assert isinstance(result, pd.DataFrame)
        assert 'JitterPCA' in result.columns
        assert 'ShimmerPCA' in result.columns
        assert len(result) == 3

    def test_run_pca_with_nan_values(self):
        """NaN entries in the jitter columns still yield both PCA columns."""
        df = self._frame(
            overrides={
                'localJitter': [0.01, np.nan, 0.015],
                'localabsoluteJitter': [0.05, 0.06, np.nan],
            }
        )

        result = run_pca(df)

        assert isinstance(result, pd.DataFrame)
        assert 'JitterPCA' in result.columns
        assert 'ShimmerPCA' in result.columns

    def test_run_pca_single_file(self):
        """A single row (should handle ValueError) gives zero for both PCA columns."""
        df = self._frame(n_rows=1)

        result = run_pca(df)

        assert isinstance(result, pd.DataFrame)
        first_row = result.iloc[0]
        assert first_row['JitterPCA'] == 0
        assert first_row['ShimmerPCA'] == 0
228
+
229
+
230
class TestAddDerivedFeatures:
    """Tests for ``add_derived_features`` (PCA and formant-derived columns)."""

    # Jitter and shimmer columns shared by both tests.
    _MEASURES = {
        'localJitter': [0.01, 0.02, 0.015],
        'localabsoluteJitter': [0.05, 0.06, 0.055],
        'rapJitter': [0.02, 0.03, 0.025],
        'ppq5Jitter': [0.03, 0.04, 0.035],
        'ddpJitter': [0.04, 0.05, 0.045],
        'localShimmer': [0.1, 0.2, 0.15],
        'localdbShimmer': [0.5, 0.6, 0.55],
        'apq3Shimmer': [0.15, 0.25, 0.2],
        'apq5Shimmer': [0.2, 0.3, 0.25],
        'apq11Shimmer': [0.25, 0.35, 0.3],
        'ddaShimmer': [0.3, 0.4, 0.35],
    }

    @classmethod
    def _frame(cls, formant_medians):
        """Return a frame with the formant median columns first, then the measures."""
        columns = dict(formant_medians)
        columns.update({name: list(values) for name, values in cls._MEASURES.items()})
        return pd.DataFrame(columns)

    def test_add_derived_features(self):
        """All PCA and vocal tract columns are added for complete input."""
        df = self._frame(
            {
                'f1_median': [800, 750, 820],
                'f2_median': [1200, 1150, 1250],
                'f3_median': [2800, 2700, 2900],
                'f4_median': [3500, 3400, 3600],
            }
        )

        result = add_derived_features(df)

        # Check PCA columns are added
        for pca_column in ('JitterPCA', 'ShimmerPCA'):
            assert pca_column in result.columns

        # Check vocal tract features are added
        for derived_column in (
            'pF',
            'fdisp',
            'avgFormant',
            'mff',
            'fitch_vtl',
            'delta_f',
            'vtl_delta_f',
        ):
            assert derived_column in result.columns

    def test_add_derived_features_with_nan(self):
        """NaN formant medians do not raise and do not change the row count."""
        df = self._frame(
            {
                'f1_median': [np.nan, 750, 820],
                'f2_median': [1200, np.nan, 1250],
                'f3_median': [2800, 2700, np.nan],
                'f4_median': [3500, 3400, 3600],
            }
        )

        result = add_derived_features(df)

        # Should handle NaN values without raising errors
        assert 'pF' in result.columns
        assert 'fdisp' in result.columns
        assert len(result) == len(df)
295
+
296
+
297
class TestComputeFeatures:
    """Smoke test for the ``compute_features`` import."""

    def test_compute_features_function_exists(self):
        """``compute_features`` was imported and can be called."""
        is_callable = callable(compute_features)
        assert is_callable
302
+
303
+
304
class TestSpeechRate:
    """Smoke test for the ``speech_rate`` import."""

    def test_speech_rate_function_exists(self):
        """``speech_rate`` was imported and can be called."""
        is_callable = callable(speech_rate)
        assert is_callable
309
+
310
+
311
class TestGetSpeechRate:
    """Smoke test for the ``get_speech_rate`` import."""

    def test_get_speech_rate_function_exists(self):
        """``get_speech_rate`` was imported and can be called."""
        is_callable = callable(get_speech_rate)
        assert is_callable
316
+
317
+
318
class TestPraatIntegration:
    """Integration tests for complete Praat feature extraction pipeline.

    These tests read audio from ``./data/test/audio`` relative to the current
    working directory, so they must be run from a directory that contains
    that data.
    """

    AUDIO_DIR = "./data/test/audio"
    DEBATE_SAMPLE = "./data/test/audio/debate_sample.wav"

    @staticmethod
    def _segment_index(audio_files, duration_s):
        """Return a (file, start, end) index with one 0..duration_s segment per file.

        Mirrors the file index format nkululeko uses: (file_path, start_time,
        end_time).
        """
        import datetime  # not imported at module level in this file

        rows = [
            (audio_file, datetime.timedelta(seconds=0), datetime.timedelta(seconds=duration_s))
            for audio_file in audio_files
        ]
        frame = pd.DataFrame(rows, columns=['file', 'start', 'end'])
        # compute_features is given the index only, not the frame
        return frame.set_index(['file', 'start', 'end']).index

    def test_compute_features_with_real_audio_file(self):
        """Test that roughly 45 features can be extracted from a real audio file."""
        # Use a real audio file from the test data
        audio_file = self.DEBATE_SAMPLE

        # Verify the test audio file exists
        assert os.path.exists(audio_file), f"Test audio file not found: {audio_file}"

        file_index = self._segment_index([audio_file], duration_s=5)

        # Extract features using the main compute_features function
        features_df = compute_features(file_index)

        # Verify the result is a DataFrame
        assert isinstance(features_df, pd.DataFrame), "compute_features should return a DataFrame"

        # Verify we have exactly one row (one audio file)
        assert len(features_df) == 1, f"Expected 1 row, got {len(features_df)}"

        # Verify we have approximately 45 features (exact count may vary with optimizations)
        expected_min_features = 40  # Allow some tolerance
        expected_max_features = 50  # Allow some tolerance
        actual_features = len(features_df.columns)

        assert expected_min_features <= actual_features <= expected_max_features, \
            f"Expected ~45 features (range {expected_min_features}-{expected_max_features}), got {actual_features}. " \
            f"Features: {list(features_df.columns)}"

        # Verify that all expected core features are present
        expected_core_features = [
            'duration', 'meanF0Hz', 'stdevF0Hz', 'HNR',
            'f1_mean', 'f1_median', 'f2_mean', 'f2_median',
            'f3_mean', 'f3_median', 'f4_mean', 'f4_median',
            'localJitter', 'localabsoluteJitter', 'rapJitter', 'ppq5Jitter', 'ddpJitter',
            'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer', 'apq11Shimmer', 'ddaShimmer',
            'JitterPCA', 'ShimmerPCA',  # From PCA
            'pF', 'fdisp', 'avgFormant', 'mff', 'fitch_vtl', 'delta_f', 'vtl_delta_f',  # Vocal tract
            'nsyll', 'npause', 'phonationtime_s', 'speechrate_nsyll_dur',
            'articulation_rate_nsyll_phonationtime', 'ASD_speakingtime_nsyll',  # Speech rate
        ]

        missing_features = [feat for feat in expected_core_features if feat not in features_df.columns]
        assert len(missing_features) == 0, f"Missing expected features: {missing_features}"

        # Verify that most features are not NaN (allowing some tolerance for edge cases)
        non_nan_features = features_df.notna().sum(axis=1).iloc[0]
        total_features = len(features_df.columns)

        # At least 80% of features should be non-NaN for a valid audio file
        min_valid_features = int(0.8 * total_features)
        assert non_nan_features >= min_valid_features, \
            f"Too many NaN features: {non_nan_features}/{total_features} are valid, " \
            f"expected at least {min_valid_features}"

        # Verify that specific features have reasonable values
        row = features_df.iloc[0]

        # Duration should be positive and approximately 5 seconds (with some tolerance)
        assert 3.0 <= row['duration'] <= 7.0, f"Duration seems unreasonable: {row['duration']}"

        # F0 values should be in human speech range if detected
        if not pd.isna(row['meanF0Hz']):
            assert 50 <= row['meanF0Hz'] <= 500, f"Mean F0 seems unreasonable: {row['meanF0Hz']}"

        # Formant values should be in typical ranges if detected
        for i in range(1, 5):
            formant_mean = row[f'f{i}_mean']
            if not pd.isna(formant_mean):
                assert 200 <= formant_mean <= 4000, f"Formant F{i} mean seems unreasonable: {formant_mean}"

        print(f"SUCCESS: Extracted {actual_features} features from real audio file")
        print(f"Feature names: {list(features_df.columns)}")
        print(f"Non-NaN features: {non_nan_features}/{total_features}")

    def test_feature_extraction_robustness_multiple_files(self):
        """Test feature extraction with multiple real audio files."""
        candidate_names = [
            "debate_sample.wav",
            "03a01Fa.wav",
            "03a01Nc.wav",
        ]

        # Filter to only files that actually exist
        candidate_paths = [os.path.join(self.AUDIO_DIR, fname) for fname in candidate_names]
        test_files = [fpath for fpath in candidate_paths if os.path.exists(fpath)]

        assert len(test_files) >= 1, "Need at least one test audio file"

        # Create file index for multiple files
        file_index = self._segment_index(test_files, duration_s=3)

        # Extract features
        features_df = compute_features(file_index)

        # Verify we have the correct number of rows
        assert len(features_df) == len(test_files), f"Expected {len(test_files)} rows, got {len(features_df)}"

        # Verify all files produced some valid features
        total_features = len(features_df.columns)
        min_valid = int(0.5 * total_features)  # More lenient for multiple files
        for i, test_file in enumerate(test_files):
            non_nan_count = features_df.iloc[i].notna().sum()

            # Each file should have at least some valid features
            assert non_nan_count >= min_valid, \
                f"File {test_file} has too few valid features: {non_nan_count}/{total_features}"

        print(f"SUCCESS: Extracted features from {len(test_files)} files")
        print(f"Total features per file: {len(features_df.columns)}")

    def test_expected_feature_count_matches_documentation(self):
        """Test that the actual feature count matches the documented count in the code."""
        audio_file = self.DEBATE_SAMPLE
        assert os.path.exists(audio_file), f"Test audio file not found: {audio_file}"

        file_index = self._segment_index([audio_file], duration_s=2)

        features_df = compute_features(file_index)
        actual_count = len(features_df.columns)

        # According to the docstring, we expect ~43-45 features
        # The exact count may vary based on optimization and implementation details
        expected_range = (42, 47)  # Allow some tolerance

        assert expected_range[0] <= actual_count <= expected_range[1], \
            f"Feature count {actual_count} is outside expected range {expected_range}. " \
            "This may indicate changes to the feature extraction implementation."

        # Group the actual features for documentation/debugging
        vocal_tract_names = ['pF', 'fdisp', 'avgFormant', 'mff', 'fitch_vtl', 'delta_f', 'vtl_delta_f']
        speech_rate_names = [
            'nsyll', 'npause', 'phonationtime_s', 'speechrate_nsyll_dur',
            'articulation_rate_nsyll_phonationtime', 'ASD_speakingtime_nsyll',
        ]
        columns = features_df.columns
        feature_categories = {
            'basic': ['duration', 'meanF0Hz', 'stdevF0Hz', 'HNR'],
            'formants': [col for col in columns if col.startswith('f') and ('_mean' in col or '_median' in col)],
            'jitter': [col for col in columns if 'Jitter' in col],
            'shimmer': [col for col in columns if 'Shimmer' in col],
            'pca': [col for col in columns if 'PCA' in col],
            'vocal_tract': [col for col in columns if col in vocal_tract_names],
            'speech_rate': [col for col in columns if col in speech_rate_names],
            'pause_distribution': [col for col in columns if 'pause' in col.lower()],
            'other': [],
        }

        # Everything not claimed by a category above goes into 'other'
        classified_features = set()
        for features in feature_categories.values():
            classified_features.update(features)

        feature_categories['other'] = [col for col in columns if col not in classified_features]

        print(f"\nFeature breakdown (total: {actual_count}):")
        for category, features in feature_categories.items():
            if features:
                print(f" {category}: {len(features)} features - {features}")

        # Verify we have features in all major categories
        required_categories = ['basic', 'formants', 'jitter', 'shimmer']
        for category in required_categories:
            assert len(feature_categories[category]) > 0, f"No features found in {category} category"
nkululeko/glob_conf.py CHANGED
@@ -1,5 +1,14 @@
1
1
  # glob_conf.py
2
2
 
3
# Module-level shared state: every slot exists from import time and starts
# out unset (None), so the attributes can be read or patched before anything
# has been assigned to them.
config = label_encoder = util = module = report = labels = target = None
11
+
3
12
 
4
13
  def init_config(config_obj):
5
14
  global config