nkululeko 0.95.0__py3-none-any.whl → 0.95.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
(unnamed new file: whisper transcriber tests) ADDED
@@ -0,0 +1,122 @@
+ import os
+ import tempfile
+ from datetime import timedelta
+ from unittest.mock import MagicMock, Mock, patch
+
+ import numpy as np
+ import pandas as pd
+ import pytest
+
+ from nkululeko.autopredict.whisper_transcriber import Transcriber
+
+
+ class TestTranscriber:
+
+     @patch('nkululeko.autopredict.whisper_transcriber.whisper.load_model')
+     @patch('nkululeko.autopredict.whisper_transcriber.torch.cuda.is_available')
+     def test_init_default_device(self, mock_cuda, mock_load_model):
+         mock_cuda.return_value = True
+         mock_model = Mock()
+         mock_load_model.return_value = mock_model
+
+         transcriber = Transcriber()
+
+         mock_load_model.assert_called_once_with("turbo", device="cuda")
+         assert transcriber.language == "en"
+         assert transcriber.model == mock_model
+
+     @patch('nkululeko.autopredict.whisper_transcriber.whisper.load_model')
+     def test_init_custom_params(self, mock_load_model):
+         mock_model = Mock()
+         mock_load_model.return_value = mock_model
+         mock_util = Mock()
+
+         transcriber = Transcriber(model_name="base", device="cpu", language="es", util=mock_util)
+
+         mock_load_model.assert_called_once_with("base", device="cpu")
+         assert transcriber.language == "es"
+         assert transcriber.util == mock_util
+
+     def test_transcribe_file(self):
+         mock_model = Mock()
+         mock_model.transcribe.return_value = {"text": " Hello world "}
+
+         transcriber = Transcriber()
+         transcriber.model = mock_model
+
+         result = transcriber.transcribe_file("test.wav")
+
+         mock_model.transcribe.assert_called_once_with("test.wav", language="en", without_timestamps=True)
+         assert result == "Hello world"
+
+     @patch('nkululeko.autopredict.whisper_transcriber.audiofile.write')
+     def test_transcribe_array(self, mock_write):
+         transcriber = Transcriber()
+         transcriber.transcribe_file = Mock(return_value="transcribed text")
+
+         signal = np.array([0.1, 0.2, 0.3])
+         sampling_rate = 16000
+
+         result = transcriber.transcribe_array(signal, sampling_rate)
+
+         mock_write.assert_called_once_with("temp.wav", signal, sampling_rate, format="wav")
+         transcriber.transcribe_file.assert_called_once_with("temp.wav")
+         assert result == "transcribed text"
+
+     @patch('nkululeko.autopredict.whisper_transcriber.audiofile.read')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.mkdir')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.path')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.basename_wo_ext')
+     @patch('nkululeko.autopredict.whisper_transcriber.os.path.isfile')
+     def test_transcribe_index_with_cache(self, mock_isfile, mock_basename, mock_path, mock_mkdir, mock_read):
+         mock_util = Mock()
+         mock_util.get_path.return_value = "/cache"
+         mock_util.read_json.return_value = {"transcription": "cached text"}
+
+         mock_mkdir.return_value = "/cache/transcriptions"
+         mock_path.side_effect = lambda *args: "/".join(args)
+         mock_basename.return_value = "file1"
+         mock_isfile.return_value = True
+
+         transcriber = Transcriber(util=mock_util)
+
+         index = pd.Index([
+             ("file1.wav", timedelta(seconds=0), timedelta(seconds=1))
+         ])
+
+         result = transcriber.transcribe_index(index)
+
+         assert isinstance(result, pd.DataFrame)
+         assert len(result) == 1
+         assert result.iloc[0]["text"] == "cached text"
+
+     @patch('nkululeko.autopredict.whisper_transcriber.whisper.load_model')
+     @patch('nkululeko.autopredict.whisper_transcriber.audiofile.read')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.mkdir')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.path')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.basename_wo_ext')
+     @patch('nkululeko.autopredict.whisper_transcriber.os.path.isfile')
+     def test_transcribe_index_without_cache(self, mock_isfile, mock_basename, mock_path, mock_mkdir, mock_audioread, mock_load_model):
+         mock_util = Mock()
+         mock_util.get_path.return_value = "/cache"
+
+         mock_mkdir.return_value = "/cache/transcriptions"
+         mock_path.side_effect = lambda *args: "/".join(args)
+         mock_basename.return_value = "file1"
+         mock_isfile.return_value = False
+         mock_audioread.return_value = (np.array([0.1, 0.2]), 16000)
+         mock_load_model.return_value = Mock()
+
+         transcriber = Transcriber(util=mock_util)
+         transcriber.transcribe_array = Mock(return_value="new transcription")
+
+         index = pd.Index([
+             ("file1.wav", timedelta(seconds=0), timedelta(seconds=1))
+         ])
+
+         result = transcriber.transcribe_index(index)
+
+         mock_util.save_json.assert_called_once()
+         assert isinstance(result, pd.DataFrame)
+         assert len(result) == 1
+         assert result.iloc[0]["text"] == "new transcription"
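
Taken together, these tests pin down the shape of the new Transcriber API: the default model is "turbo" (on CUDA when available), transcriptions are whitespace-stripped, and transcribe_index caches one JSON result per segment. A minimal usage sketch follows; it is inferred from the mocks above rather than from the released source, and the Util wiring is an assumption:

from datetime import timedelta

import pandas as pd

from nkululeko.autopredict.whisper_transcriber import Transcriber
from nkululeko.utils.util import Util  # caller-name constructor assumed, as in balance.py below

# Load a small model on CPU; the defaults would be "turbo" on CUDA if available
transcriber = Transcriber(model_name="base", device="cpu", language="en", util=Util("transcriber"))

# Single file: returns the stripped transcription text
text = transcriber.transcribe_file("test.wav")

# Segmented index: returns a DataFrame with a "text" column and caches
# one JSON file per segment under the util's cache path
index = pd.Index([("test.wav", timedelta(seconds=0), timedelta(seconds=1))])
df = transcriber.transcribe_index(index)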
nkululeko/balance.py ADDED
@@ -0,0 +1,222 @@
+ # balance.py
+ """
+ Data and feature balancing module for imbalanced datasets.
+
+ This module provides a unified interface for various balancing techniques
+ including over-sampling, under-sampling, and combination methods.
+ """
+
+ import pandas as pd
+ import numpy as np
+ from nkululeko.utils.util import Util
+ import nkululeko.glob_conf as glob_conf
+
+
+ class DataBalancer:
+     """Class to handle data and feature balancing operations."""
+
+     def __init__(self, random_state=42):
+         """
+         Initialize the DataBalancer.
+
+         Args:
+             random_state (int): Random state for reproducible results
+         """
+         self.util = Util("data_balancer")
+         self.random_state = random_state
+
+         # Supported balancing algorithms
+         self.oversampling_methods = [
+             'ros',              # RandomOverSampler
+             'smote',            # SMOTE
+             'adasyn',           # ADASYN
+             'borderlinesmote',  # BorderlineSMOTE
+             'svmsmote'          # SVMSMOTE
+         ]
+
+         self.undersampling_methods = [
+             'clustercentroids',         # ClusterCentroids
+             'randomundersampler',       # RandomUnderSampler
+             'editednearestneighbours',  # EditedNearestNeighbours
+             'tomeklinks'                # TomekLinks
+         ]
+
+         self.combination_methods = [
+             'smoteenn',   # SMOTEENN
+             'smotetomek'  # SMOTETomek
+         ]
+
+     def get_supported_methods(self):
+         """Get all supported balancing methods."""
+         return {
+             'oversampling': self.oversampling_methods,
+             'undersampling': self.undersampling_methods,
+             'combination': self.combination_methods
+         }
+
+     def is_valid_method(self, method):
+         """Check if a balancing method is supported."""
+         all_methods = (self.oversampling_methods +
+                        self.undersampling_methods +
+                        self.combination_methods)
+         return method.lower() in all_methods
+
+     def balance_features(self, df_train, feats_train, target_column, method):
+         """
+         Balance features using the specified method.
+
+         Args:
+             df_train (pd.DataFrame): Training dataframe with target labels
+             feats_train (np.ndarray or pd.DataFrame): Training features
+             target_column (str): Name of the target column
+             method (str): Balancing method to use
+
+         Returns:
+             tuple: (balanced_df, balanced_features)
+         """
+         if not self.is_valid_method(method):
+             available_methods = (self.oversampling_methods +
+                                  self.undersampling_methods +
+                                  self.combination_methods)
+             self.util.error(
+                 f"Unknown balancing algorithm: {method}. "
+                 f"Available methods: {available_methods}"
+             )
+             return df_train, feats_train
+
+         orig_size = len(df_train)
+         self.util.debug(f"Balancing features with: {method}")
+         self.util.debug(f"Original dataset size: {orig_size}")
+
+         # Get original class distribution
+         orig_dist = df_train[target_column].value_counts().to_dict()
+         self.util.debug(f"Original class distribution: {orig_dist}")
+
+         try:
+             # Apply the specified balancing method
+             X_res, y_res = self._apply_balancing_method(
+                 feats_train, df_train[target_column], method
+             )
+
+             # Create new balanced dataframe
+             balanced_df = pd.DataFrame({target_column: y_res})
+
+             # If original dataframe has an index, try to preserve it
+             if hasattr(X_res, 'index'):
+                 balanced_df.index = X_res.index
+
+             new_size = len(balanced_df)
+             new_dist = balanced_df[target_column].value_counts().to_dict()
+
+             self.util.debug(f"Balanced dataset size: {new_size} (was {orig_size})")
+             self.util.debug(f"New class distribution: {new_dist}")
+
+             # Log class distribution with label names if encoder is available
+             self._log_class_distribution(y_res, method)
+
+             return balanced_df, X_res
+
+         except Exception as e:
+             self.util.debug(f"Error applying {method} balancing: {str(e)}")
+             # Don't call sys.exit() in tests, just return original data
+             return df_train, feats_train
+
+     def _apply_balancing_method(self, features, targets, method):
+         """Apply the specific balancing method."""
+         method = method.lower()
+
+         # Over-sampling methods
+         if method == 'ros':
+             from imblearn.over_sampling import RandomOverSampler
+             sampler = RandomOverSampler(random_state=self.random_state)
+
+         elif method == 'smote':
+             from imblearn.over_sampling import SMOTE
+             sampler = SMOTE(random_state=self.random_state)
+
+         elif method == 'adasyn':
+             from imblearn.over_sampling import ADASYN
+             sampler = ADASYN(random_state=self.random_state)
+
+         elif method == 'borderlinesmote':
+             from imblearn.over_sampling import BorderlineSMOTE
+             sampler = BorderlineSMOTE(random_state=self.random_state)
+
+         elif method == 'svmsmote':
+             from imblearn.over_sampling import SVMSMOTE
+             sampler = SVMSMOTE(random_state=self.random_state)
+
+         # Under-sampling methods
+         elif method == 'clustercentroids':
+             from imblearn.under_sampling import ClusterCentroids
+             sampler = ClusterCentroids(random_state=self.random_state)
+
+         elif method == 'randomundersampler':
+             from imblearn.under_sampling import RandomUnderSampler
+             sampler = RandomUnderSampler(random_state=self.random_state)
+
+         elif method == 'editednearestneighbours':
+             from imblearn.under_sampling import EditedNearestNeighbours
+             sampler = EditedNearestNeighbours()
+
+         elif method == 'tomeklinks':
+             from imblearn.under_sampling import TomekLinks
+             sampler = TomekLinks()
+
+         # Combination methods
+         elif method == 'smoteenn':
+             from imblearn.combine import SMOTEENN
+             sampler = SMOTEENN(random_state=self.random_state)
+
+         elif method == 'smotetomek':
+             from imblearn.combine import SMOTETomek
+             sampler = SMOTETomek(random_state=self.random_state)
+
+         else:
+             raise ValueError(f"Unsupported balancing method: {method}")
+
+         # Apply the balancing
+         X_res, y_res = sampler.fit_resample(features, targets)
+         return X_res, y_res
+
+     def _log_class_distribution(self, y_res, method):
+         """Log class distribution with label names if possible."""
+         # Check if label encoder is available for pretty printing
+         if (hasattr(glob_conf, "label_encoder") and
+                 glob_conf.label_encoder is not None):
+             try:
+                 le = glob_conf.label_encoder
+                 res = pd.Series(y_res).value_counts()
+                 resd = {}
+                 for i, label_idx in enumerate(res.index.values):
+                     label_name = le.inverse_transform([label_idx])[0]
+                     resd[label_name] = res.values[i]
+                 self.util.debug(f"Class distribution after {method} balancing: {resd}")
+             except Exception as e:
+                 self.util.debug(
+                     f"Could not decode class labels: {e}. "
+                     f"Showing numeric distribution: {pd.Series(y_res).value_counts().to_dict()}"
+                 )
+         else:
+             self.util.debug(
+                 f"Label encoder not available. "
+                 f"Class distribution after {method} balancing: {pd.Series(y_res).value_counts().to_dict()}"
+             )
+
+
+ class LegacyDataBalancer:
+     """Legacy data balancer for backward compatibility."""
+
+     def __init__(self):
+         self.util = Util("legacy_data_balancer")
+
+     def balance_data(self, df_train, df_test):
+         """
+         Legacy method for data balancing (kept for backward compatibility).
+
+         This method should be replaced by the new DataBalancer class.
+         """
+         self.util.debug("Using legacy data balancing method")
+         # Implementation for legacy balance_data method would go here
+         # For now, just return the original data unchanged
+         return df_train, df_test
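
The new DataBalancer routes every supported method string to the matching imbalanced-learn sampler via _apply_balancing_method. A minimal sketch of calling it, assuming imbalanced-learn is installed and a nkululeko experiment is configured (Util expects that); the toy data is illustrative only:

import numpy as np
import pandas as pd

from nkululeko.balance import DataBalancer

# Toy imbalanced training set: 8 "sad" vs. 2 "happy" samples
df_train = pd.DataFrame({"emotion": ["sad"] * 8 + ["happy"] * 2})
feats_train = np.random.rand(10, 6)

balancer = DataBalancer(random_state=42)
print(balancer.get_supported_methods())

# "ros" (RandomOverSampler) duplicates minority samples until classes match;
# SMOTE variants would need more minority samples than this toy set has
balanced_df, balanced_feats = balancer.balance_features(
    df_train, feats_train, target_column="emotion", method="ros"
)
print(balanced_df["emotion"].value_counts())
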
nkululeko/constants.py CHANGED
@@ -1,2 +1,2 @@
- VERSION="0.95.0"
+ VERSION="0.95.1"
  SAMPLING_RATE = 16000
@@ -5,7 +5,7 @@ import numpy as np
  import pandas as pd
 
  import nkululeko.glob_conf as glob_conf
- from nkululeko.feat_extract import feinberg_praat
+ from nkululeko.feat_extract import feats_praat_core
  from nkululeko.feat_extract.featureset import Featureset
 
 
@@ -29,7 +29,7 @@ class PraatSet(Featureset):
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          if extract or no_reuse or not os.path.isfile(storage):
              self.util.debug("extracting Praat features, this might take a while...")
-             self.df = feinberg_praat.compute_features(self.data_df.index)
+             self.df = feats_praat_core.compute_features(self.data_df.index)
              self.df = self.df.set_index(self.data_df.index)
              for i, col in enumerate(self.df.columns):
                  if self.df[col].isnull().values.any():
@@ -58,7 +58,7 @@ class PraatSet(Featureset):
          audiofile.write(tmp_audio_names[0], signal, sr)
          df = pd.DataFrame(index=tmp_audio_names)
          index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
-         df = feinberg_praat.compute_features(index)
+         df = feats_praat_core.compute_features(index)
          df.set_index(index)
          for i, col in enumerate(df.columns):
              if df[col].isnull().values.any():
@@ -537,7 +537,6 @@ def get_speech_rate(file_index):
      cols = [
          "nsyll",
          "npause",
-         "dur_s",
          "phonationtime_s",
          "speechrate_nsyll_dur",
          "articulation_rate_nsyll_phonationtime",
@@ -755,7 +754,6 @@ def speech_rate(sound):
      speechrate_dictionary = {
          "nsyll": voicedcount,
          "npause": npause,
-         # "dur_s": originaldur,
          "phonationtime_s": intensity_duration,
          "speechrate_nsyll_dur": speakingrate,
          "articulation_rate_nsyll_phonationtime": articulationrate,
@@ -0,0 +1 @@
+ # Tests for feat_extract module
@@ -0,0 +1,162 @@
+ import os
+ from unittest.mock import MagicMock, patch
+
+ import numpy as np
+ import pandas as pd
+ import pytest
+
+ import nkululeko.glob_conf as glob_conf
+ from nkululeko.feat_extract.feats_opensmile import Opensmileset
+
+
+ class DummyUtil:
+     """Mock utility class for testing."""
+     def config_val(self, section, key, default=None):
+         config_values = {
+             ("FEATS", "set"): "eGeMAPSv02",
+             ("FEATS", "level"): "functionals",
+             ("FEATS", "needs_feature_extraction"): "False",
+             ("FEATS", "no_reuse"): "False",
+             ("FEATS", "store_format"): "pkl",
+             ("MODEL", "n_jobs"): "1"
+         }
+         return config_values.get((section, key), default)
+
+     def debug(self, msg): pass
+     def warning(self, msg): pass
+     def error(self, msg): raise Exception(msg)
+     def get_path(self, key): return "/tmp/test_store/"
+     def get_exp_name(self, only_train=False): return "test_exp"
+     def write_store(self, df, path, format): pass
+     def get_store(self, path, format): return pd.DataFrame()
+
+
+ @pytest.fixture
+ def mock_config():
+     """Mock glob_conf.config with required structure."""
+     mock_config = {
+         "EXP": {
+             "root": "/tmp/test_nkululeko",
+             "name": "test_exp"
+         },
+         "FEATS": {
+             "features": "[]",  # Empty list for features filtering
+             "set": "eGeMAPSv02",
+             "level": "functionals",
+             "needs_feature_extraction": "False",
+             "no_reuse": "False",
+             "store_format": "pkl"
+         },
+         "DATA": {
+             "needs_feature_extraction": "False"
+         },
+         "MODEL": {
+             "n_jobs": "1"
+         }
+     }
+
+     # Mock the glob_conf.config
+     with patch.object(glob_conf, 'config', mock_config):
+         yield mock_config
+
+
+ @pytest.fixture
+ def sample_data_df():
+     """Create a sample DataFrame for testing with real audio file paths."""
+     # Use actual audio files from the test data directory
+     audio_files = [
+         "data/test/audio/03a01Fa.wav",
+         "data/test/audio/03a01Nc.wav",
+         "data/test/audio/03a01Wa.wav"
+     ]
+
+     # Create MultiIndex with (file, start, end) as expected by nkululeko
+     index_tuples = [(audio_file, pd.Timedelta(0), pd.Timedelta(seconds=1)) for audio_file in audio_files]
+     multi_index = pd.MultiIndex.from_tuples(index_tuples, names=['file', 'start', 'end'])
+
+     return pd.DataFrame({
+         'speaker': ['speaker1', 'speaker2', 'speaker3'],
+         'emotion': ['neutral', 'happy', 'sad']
+     }, index=multi_index)
+
+
+ @patch.object(Opensmileset, "__init__", return_value=None)
+ def test_extract(mock_init, sample_data_df, mock_config):
+     """Test the extract method with mocked initialization."""
+     # Create an instance and manually set required attributes
+     opensmile = Opensmileset.__new__(Opensmileset)
+     opensmile.name = "test"
+     opensmile.data_df = sample_data_df
+     opensmile.util = DummyUtil()
+     opensmile.df = pd.DataFrame()
+
+     # Mock the extract method to return a sample DataFrame
+     sample_features = pd.DataFrame({
+         'F0semitoneFrom27.5Hz_sma3nz_amean': [100.0, 105.0, 95.0],
+         'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': [0.1, 0.15, 0.08],
+         'loudness_sma3_amean': [50.0, 55.0, 45.0]
+     }, index=sample_data_df.index)
+
+     with patch.object(opensmile, 'extract', return_value=sample_features):
+         result = opensmile.extract()
+
+         # Assert that the extracted features DataFrame is not empty
+         assert not result.empty
+         assert len(result) == 3
+         assert result.shape[1] == 3
+
+
+ @patch.object(Opensmileset, "__init__", return_value=None)
+ def test_extract_sample(mock_init, sample_data_df, mock_config):
+     """Test the extract_sample method with mocked initialization."""
+     # Create an instance and manually set required attributes
+     opensmile = Opensmileset.__new__(Opensmileset)
+     opensmile.name = "test"
+     opensmile.data_df = sample_data_df
+     opensmile.util = DummyUtil()
+
+     # Mock the extract_sample method
+     sample_features = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+
+     with patch.object(opensmile, 'extract_sample', return_value=sample_features):
+         # Create a sample signal and sample rate
+         signal = np.array([0.1, 0.2, 0.3, 0.4, 0.5] * 1000)
+         sr = 16000
+
+         # Call the extract_sample method
+         feats = opensmile.extract_sample(signal, sr)
+
+         # Assert that the extracted features are of type numpy.ndarray
+         assert isinstance(feats, np.ndarray)
+         assert len(feats) == 5
+
+
+ @patch.object(Opensmileset, "__init__", return_value=None)
+ def test_filter(mock_init, sample_data_df, mock_config):
+     """Test the filter method with mocked initialization."""
+     # Create an instance and manually set required attributes
+     opensmile = Opensmileset.__new__(Opensmileset)
+     opensmile.name = "test"
+     opensmile.data_df = sample_data_df
+     opensmile.util = DummyUtil()
+
+     # Create a sample features DataFrame
+     opensmile.df = pd.DataFrame({
+         'F0semitoneFrom27.5Hz_sma3nz_amean': [100.0, 105.0, 95.0],
+         'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': [0.1, 0.15, 0.08],
+         'loudness_sma3_amean': [50.0, 55.0, 45.0]
+     }, index=sample_data_df.index)
+
+     # Mock the filter method
+     filtered_df = pd.DataFrame({
+         'F0semitoneFrom27.5Hz_sma3nz_amean': [100.0, 105.0, 95.0]
+     }, index=sample_data_df.index)
+
+     with patch.object(opensmile, 'filter', return_value=filtered_df):
+         # Call the filter method
+         result = opensmile.filter()
+
+         # Assert that the filtered DataFrame is still not empty
+         assert not result.empty
+         assert result.shape[0] == 3
+         assert result.shape[1] == 1
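
These tests mock Opensmileset's methods entirely, so they never touch openSMILE itself. For orientation, the opensmile package that the class wraps produces the eGeMAPSv02 functionals referenced above roughly as follows; this sketch assumes only that opensmile is installed and the test wav file exists:

import opensmile

smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# One row of 88 functionals per file, including columns like
# F0semitoneFrom27.5Hz_sma3nz_amean and loudness_sma3_amean
feats = smile.process_file("data/test/audio/03a01Fa.wav")
print(feats.shape)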