nkululeko 0.95.0__py3-none-any.whl → 0.95.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
(unnamed new file: whisper transcriber tests) ADDED
@@ -0,0 +1,122 @@
+ import os
+ import tempfile
+ from datetime import timedelta
+ from unittest.mock import MagicMock, Mock, patch
+
+ import numpy as np
+ import pandas as pd
+ import pytest
+
+ from nkululeko.autopredict.whisper_transcriber import Transcriber
+
+
+ class TestTranscriber:
+
+     @patch('nkululeko.autopredict.whisper_transcriber.whisper.load_model')
+     @patch('nkululeko.autopredict.whisper_transcriber.torch.cuda.is_available')
+     def test_init_default_device(self, mock_cuda, mock_load_model):
+         mock_cuda.return_value = True
+         mock_model = Mock()
+         mock_load_model.return_value = mock_model
+
+         transcriber = Transcriber()
+
+         mock_load_model.assert_called_once_with("turbo", device="cuda")
+         assert transcriber.language == "en"
+         assert transcriber.model == mock_model
+
+     @patch('nkululeko.autopredict.whisper_transcriber.whisper.load_model')
+     def test_init_custom_params(self, mock_load_model):
+         mock_model = Mock()
+         mock_load_model.return_value = mock_model
+         mock_util = Mock()
+
+         transcriber = Transcriber(model_name="base", device="cpu", language="es", util=mock_util)
+
+         mock_load_model.assert_called_once_with("base", device="cpu")
+         assert transcriber.language == "es"
+         assert transcriber.util == mock_util
+
+     def test_transcribe_file(self):
+         mock_model = Mock()
+         mock_model.transcribe.return_value = {"text": " Hello world "}
+
+         transcriber = Transcriber()
+         transcriber.model = mock_model
+
+         result = transcriber.transcribe_file("test.wav")
+
+         mock_model.transcribe.assert_called_once_with("test.wav", language="en", without_timestamps=True)
+         assert result == "Hello world"
+
+     @patch('nkululeko.autopredict.whisper_transcriber.audiofile.write')
+     def test_transcribe_array(self, mock_write):
+         transcriber = Transcriber()
+         transcriber.transcribe_file = Mock(return_value="transcribed text")
+
+         signal = np.array([0.1, 0.2, 0.3])
+         sampling_rate = 16000
+
+         result = transcriber.transcribe_array(signal, sampling_rate)
+
+         mock_write.assert_called_once_with("temp.wav", signal, sampling_rate, format="wav")
+         transcriber.transcribe_file.assert_called_once_with("temp.wav")
+         assert result == "transcribed text"
+
+     @patch('nkululeko.autopredict.whisper_transcriber.audiofile.read')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.mkdir')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.path')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.basename_wo_ext')
+     @patch('nkululeko.autopredict.whisper_transcriber.os.path.isfile')
+     def test_transcribe_index_with_cache(self, mock_isfile, mock_basename, mock_path, mock_mkdir, mock_read):
+         mock_util = Mock()
+         mock_util.get_path.return_value = "/cache"
+         mock_util.read_json.return_value = {"transcription": "cached text"}
+
+         mock_mkdir.return_value = "/cache/transcriptions"
+         mock_path.side_effect = lambda *args: "/".join(args)
+         mock_basename.return_value = "file1"
+         mock_isfile.return_value = True
+
+         transcriber = Transcriber(util=mock_util)
+
+         index = pd.Index([
+             ("file1.wav", timedelta(seconds=0), timedelta(seconds=1))
+         ])
+
+         result = transcriber.transcribe_index(index)
+
+         assert isinstance(result, pd.DataFrame)
+         assert len(result) == 1
+         assert result.iloc[0]["text"] == "cached text"
+
+     @patch('nkululeko.autopredict.whisper_transcriber.whisper.load_model')
+     @patch('nkululeko.autopredict.whisper_transcriber.audiofile.read')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.mkdir')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.path')
+     @patch('nkululeko.autopredict.whisper_transcriber.audeer.basename_wo_ext')
+     @patch('nkululeko.autopredict.whisper_transcriber.os.path.isfile')
+     def test_transcribe_index_without_cache(self, mock_isfile, mock_basename, mock_path, mock_mkdir, mock_audioread, mock_load_model):
+         mock_util = Mock()
+         mock_util.get_path.return_value = "/cache"
+
+         mock_mkdir.return_value = "/cache/transcriptions"
+         mock_path.side_effect = lambda *args: "/".join(args)
+         mock_basename.return_value = "file1"
+         mock_isfile.return_value = False
+         mock_audioread.return_value = (np.array([0.1, 0.2]), 16000)
+         mock_load_model.return_value = Mock()
+
+         transcriber = Transcriber(util=mock_util)
+         transcriber.transcribe_array = Mock(return_value="new transcription")
+
+         index = pd.Index([
+             ("file1.wav", timedelta(seconds=0), timedelta(seconds=1))
+         ])
+
+         result = transcriber.transcribe_index(index)
+
+         mock_util.save_json.assert_called_once()
+         assert isinstance(result, pd.DataFrame)
+         assert len(result) == 1
+         assert result.iloc[0]["text"] == "new transcription"
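
Taken together, these tests pin down the shape of the new Transcriber API: the default model is "turbo" (on CUDA when available), transcriptions are whitespace-stripped, and transcribe_index caches one JSON result per segment. A minimal usage sketch follows; it is inferred from the mocks above rather than from the released source, and the Util wiring is an assumption:

from datetime import timedelta

import pandas as pd

from nkululeko.autopredict.whisper_transcriber import Transcriber
from nkululeko.utils.util import Util  # caller-name constructor assumed, as in balance.py below

# Load a small model on CPU; the defaults would be "turbo" on CUDA if available
transcriber = Transcriber(model_name="base", device="cpu", language="en", util=Util("transcriber"))

# Single file: returns the stripped transcription text
text = transcriber.transcribe_file("test.wav")

# Segmented index: returns a DataFrame with a "text" column and caches
# one JSON file per segment under the util's cache path
index = pd.Index([("test.wav", timedelta(seconds=0), timedelta(seconds=1))])
df = transcriber.transcribe_index(index)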
nkululeko/balance.py ADDED
@@ -0,0 +1,222 @@
+ # balance.py
+ """
+ Data and feature balancing module for imbalanced datasets.
+
+ This module provides a unified interface for various balancing techniques
+ including over-sampling, under-sampling, and combination methods.
+ """
+
+ import pandas as pd
+ import numpy as np
+ from nkululeko.utils.util import Util
+ import nkululeko.glob_conf as glob_conf
+
+
+ class DataBalancer:
+     """Class to handle data and feature balancing operations."""
+
+     def __init__(self, random_state=42):
+         """
+         Initialize the DataBalancer.
+
+         Args:
+             random_state (int): Random state for reproducible results
+         """
+         self.util = Util("data_balancer")
+         self.random_state = random_state
+
+         # Supported balancing algorithms
+         self.oversampling_methods = [
+             'ros',              # RandomOverSampler
+             'smote',            # SMOTE
+             'adasyn',           # ADASYN
+             'borderlinesmote',  # BorderlineSMOTE
+             'svmsmote'          # SVMSMOTE
+         ]
+
+         self.undersampling_methods = [
+             'clustercentroids',         # ClusterCentroids
+             'randomundersampler',       # RandomUnderSampler
+             'editednearestneighbours',  # EditedNearestNeighbours
+             'tomeklinks'                # TomekLinks
+         ]
+
+         self.combination_methods = [
+             'smoteenn',   # SMOTEENN
+             'smotetomek'  # SMOTETomek
+         ]
+
+     def get_supported_methods(self):
+         """Get all supported balancing methods."""
+         return {
+             'oversampling': self.oversampling_methods,
+             'undersampling': self.undersampling_methods,
+             'combination': self.combination_methods
+         }
+
+     def is_valid_method(self, method):
+         """Check if a balancing method is supported."""
+         all_methods = (self.oversampling_methods +
+                        self.undersampling_methods +
+                        self.combination_methods)
+         return method.lower() in all_methods
+
+     def balance_features(self, df_train, feats_train, target_column, method):
+         """
+         Balance features using the specified method.
+
+         Args:
+             df_train (pd.DataFrame): Training dataframe with target labels
+             feats_train (np.ndarray or pd.DataFrame): Training features
+             target_column (str): Name of the target column
+             method (str): Balancing method to use
+
+         Returns:
+             tuple: (balanced_df, balanced_features)
+         """
+         if not self.is_valid_method(method):
+             available_methods = (self.oversampling_methods +
+                                  self.undersampling_methods +
+                                  self.combination_methods)
+             self.util.error(
+                 f"Unknown balancing algorithm: {method}. "
+                 f"Available methods: {available_methods}"
+             )
+             return df_train, feats_train
+
+         orig_size = len(df_train)
+         self.util.debug(f"Balancing features with: {method}")
+         self.util.debug(f"Original dataset size: {orig_size}")
+
+         # Get original class distribution
+         orig_dist = df_train[target_column].value_counts().to_dict()
+         self.util.debug(f"Original class distribution: {orig_dist}")
+
+         try:
+             # Apply the specified balancing method
+             X_res, y_res = self._apply_balancing_method(
+                 feats_train, df_train[target_column], method
+             )
+
+             # Create new balanced dataframe
+             balanced_df = pd.DataFrame({target_column: y_res})
+
+             # If original dataframe has an index, try to preserve it
+             if hasattr(X_res, 'index'):
+                 balanced_df.index = X_res.index
+
+             new_size = len(balanced_df)
+             new_dist = balanced_df[target_column].value_counts().to_dict()
+
+             self.util.debug(f"Balanced dataset size: {new_size} (was {orig_size})")
+             self.util.debug(f"New class distribution: {new_dist}")
+
+             # Log class distribution with label names if encoder is available
+             self._log_class_distribution(y_res, method)
+
+             return balanced_df, X_res
+
+         except Exception as e:
+             self.util.debug(f"Error applying {method} balancing: {str(e)}")
+             # Don't call sys.exit() in tests, just return original data
+             return df_train, feats_train
+
+     def _apply_balancing_method(self, features, targets, method):
+         """Apply the specific balancing method."""
+         method = method.lower()
+
+         # Over-sampling methods
+         if method == 'ros':
+             from imblearn.over_sampling import RandomOverSampler
+             sampler = RandomOverSampler(random_state=self.random_state)
+
+         elif method == 'smote':
+             from imblearn.over_sampling import SMOTE
+             sampler = SMOTE(random_state=self.random_state)
+
+         elif method == 'adasyn':
+             from imblearn.over_sampling import ADASYN
+             sampler = ADASYN(random_state=self.random_state)
+
+         elif method == 'borderlinesmote':
+             from imblearn.over_sampling import BorderlineSMOTE
+             sampler = BorderlineSMOTE(random_state=self.random_state)
+
+         elif method == 'svmsmote':
+             from imblearn.over_sampling import SVMSMOTE
+             sampler = SVMSMOTE(random_state=self.random_state)
+
+         # Under-sampling methods
+         elif method == 'clustercentroids':
+             from imblearn.under_sampling import ClusterCentroids
+             sampler = ClusterCentroids(random_state=self.random_state)
+
+         elif method == 'randomundersampler':
+             from imblearn.under_sampling import RandomUnderSampler
+             sampler = RandomUnderSampler(random_state=self.random_state)
+
+         elif method == 'editednearestneighbours':
+             from imblearn.under_sampling import EditedNearestNeighbours
+             sampler = EditedNearestNeighbours()
+
+         elif method == 'tomeklinks':
+             from imblearn.under_sampling import TomekLinks
+             sampler = TomekLinks()
+
+         # Combination methods
+         elif method == 'smoteenn':
+             from imblearn.combine import SMOTEENN
+             sampler = SMOTEENN(random_state=self.random_state)
+
+         elif method == 'smotetomek':
+             from imblearn.combine import SMOTETomek
+             sampler = SMOTETomek(random_state=self.random_state)
+
+         else:
+             raise ValueError(f"Unsupported balancing method: {method}")
+
+         # Apply the balancing
+         X_res, y_res = sampler.fit_resample(features, targets)
+         return X_res, y_res
+
+     def _log_class_distribution(self, y_res, method):
+         """Log class distribution with label names if possible."""
+         # Check if label encoder is available for pretty printing
+         if (hasattr(glob_conf, "label_encoder") and
+                 glob_conf.label_encoder is not None):
+             try:
+                 le = glob_conf.label_encoder
+                 res = pd.Series(y_res).value_counts()
+                 resd = {}
+                 for i, label_idx in enumerate(res.index.values):
+                     label_name = le.inverse_transform([label_idx])[0]
+                     resd[label_name] = res.values[i]
+                 self.util.debug(f"Class distribution after {method} balancing: {resd}")
+             except Exception as e:
+                 self.util.debug(
+                     f"Could not decode class labels: {e}. "
+                     f"Showing numeric distribution: {pd.Series(y_res).value_counts().to_dict()}"
+                 )
+         else:
+             self.util.debug(
+                 f"Label encoder not available. "
+                 f"Class distribution after {method} balancing: {pd.Series(y_res).value_counts().to_dict()}"
+             )
+
+
+ class LegacyDataBalancer:
+     """Legacy data balancer for backward compatibility."""
+
+     def __init__(self):
+         self.util = Util("legacy_data_balancer")
+
+     def balance_data(self, df_train, df_test):
+         """
+         Legacy method for data balancing (kept for backward compatibility).
+
+         This method should be replaced by the new DataBalancer class.
+         """
+         self.util.debug("Using legacy data balancing method")
+         # Implementation for legacy balance_data method would go here
+         # For now, just return the original data unchanged
+         return df_train, df_test
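
The new DataBalancer routes every supported method string to the matching imbalanced-learn sampler via _apply_balancing_method. A minimal sketch of calling it, assuming imbalanced-learn is installed and a nkululeko experiment is configured (Util expects that); the toy data is illustrative only:

import numpy as np
import pandas as pd

from nkululeko.balance import DataBalancer

# Toy imbalanced training set: 8 "sad" vs. 2 "happy" samples
df_train = pd.DataFrame({"emotion": ["sad"] * 8 + ["happy"] * 2})
feats_train = np.random.rand(10, 6)

balancer = DataBalancer(random_state=42)
print(balancer.get_supported_methods())

# "ros" (RandomOverSampler) duplicates minority samples until classes match;
# SMOTE variants would need more minority samples than this toy set has
balanced_df, balanced_feats = balancer.balance_features(
    df_train, feats_train, target_column="emotion", method="ros"
)
print(balanced_df["emotion"].value_counts())
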
nkululeko/constants.py CHANGED
@@ -1,2 +1,2 @@
- VERSION="0.95.0"
+ VERSION="0.95.1"
  SAMPLING_RATE = 16000
@@ -5,7 +5,7 @@ import numpy as np
  import pandas as pd
 
  import nkululeko.glob_conf as glob_conf
- from nkululeko.feat_extract import feinberg_praat
+ from nkululeko.feat_extract import feats_praat_core
  from nkululeko.feat_extract.featureset import Featureset
 
 
@@ -29,7 +29,7 @@ class PraatSet(Featureset):
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          if extract or no_reuse or not os.path.isfile(storage):
              self.util.debug("extracting Praat features, this might take a while...")
-             self.df = feinberg_praat.compute_features(self.data_df.index)
+             self.df = feats_praat_core.compute_features(self.data_df.index)
              self.df = self.df.set_index(self.data_df.index)
              for i, col in enumerate(self.df.columns):
                  if self.df[col].isnull().values.any():
@@ -58,7 +58,7 @@ class PraatSet(Featureset):
          audiofile.write(tmp_audio_names[0], signal, sr)
          df = pd.DataFrame(index=tmp_audio_names)
          index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
-         df = feinberg_praat.compute_features(index)
+         df = feats_praat_core.compute_features(index)
          df.set_index(index)
          for i, col in enumerate(df.columns):
              if df[col].isnull().values.any():
@@ -537,7 +537,6 @@ def get_speech_rate(file_index):
      cols = [
          "nsyll",
          "npause",
-         "dur_s",
          "phonationtime_s",
          "speechrate_nsyll_dur",
          "articulation_rate_nsyll_phonationtime",
@@ -755,7 +754,6 @@ def speech_rate(sound):
      speechrate_dictionary = {
          "nsyll": voicedcount,
          "npause": npause,
-         # "dur_s": originaldur,
          "phonationtime_s": intensity_duration,
          "speechrate_nsyll_dur": speakingrate,
          "articulation_rate_nsyll_phonationtime": articulationrate,
@@ -0,0 +1 @@
+ # Tests for feat_extract module
@@ -0,0 +1,162 @@
+ import os
+ from unittest.mock import MagicMock, patch
+
+ import numpy as np
+ import pandas as pd
+ import pytest
+
+ import nkululeko.glob_conf as glob_conf
+ from nkululeko.feat_extract.feats_opensmile import Opensmileset
+
+
+ class DummyUtil:
+     """Mock utility class for testing."""
+     def config_val(self, section, key, default=None):
+         config_values = {
+             ("FEATS", "set"): "eGeMAPSv02",
+             ("FEATS", "level"): "functionals",
+             ("FEATS", "needs_feature_extraction"): "False",
+             ("FEATS", "no_reuse"): "False",
+             ("FEATS", "store_format"): "pkl",
+             ("MODEL", "n_jobs"): "1"
+         }
+         return config_values.get((section, key), default)
+
+     def debug(self, msg): pass
+     def warning(self, msg): pass
+     def error(self, msg): raise Exception(msg)
+     def get_path(self, key): return "/tmp/test_store/"
+     def get_exp_name(self, only_train=False): return "test_exp"
+     def write_store(self, df, path, format): pass
+     def get_store(self, path, format): return pd.DataFrame()
+
+
+ @pytest.fixture
+ def mock_config():
+     """Mock glob_conf.config with required structure."""
+     mock_config = {
+         "EXP": {
+             "root": "/tmp/test_nkululeko",
+             "name": "test_exp"
+         },
+         "FEATS": {
+             "features": "[]",  # Empty list for features filtering
+             "set": "eGeMAPSv02",
+             "level": "functionals",
+             "needs_feature_extraction": "False",
+             "no_reuse": "False",
+             "store_format": "pkl"
+         },
+         "DATA": {
+             "needs_feature_extraction": "False"
+         },
+         "MODEL": {
+             "n_jobs": "1"
+         }
+     }
+
+     # Mock the glob_conf.config
+     with patch.object(glob_conf, 'config', mock_config):
+         yield mock_config
+
+
+ @pytest.fixture
+ def sample_data_df():
+     """Create a sample DataFrame for testing with real audio file paths."""
+     # Use actual audio files from the test data directory
+     audio_files = [
+         "data/test/audio/03a01Fa.wav",
+         "data/test/audio/03a01Nc.wav",
+         "data/test/audio/03a01Wa.wav"
+     ]
+
+     # Create MultiIndex with (file, start, end) as expected by nkululeko
+     index_tuples = [(audio_file, pd.Timedelta(0), pd.Timedelta(seconds=1)) for audio_file in audio_files]
+     multi_index = pd.MultiIndex.from_tuples(index_tuples, names=['file', 'start', 'end'])
+
+     return pd.DataFrame({
+         'speaker': ['speaker1', 'speaker2', 'speaker3'],
+         'emotion': ['neutral', 'happy', 'sad']
+     }, index=multi_index)
+
+
+ @patch.object(Opensmileset, "__init__", return_value=None)
+ def test_extract(mock_init, sample_data_df, mock_config):
+     """Test the extract method with mocked initialization."""
+     # Create an instance and manually set required attributes
+     opensmile = Opensmileset.__new__(Opensmileset)
+     opensmile.name = "test"
+     opensmile.data_df = sample_data_df
+     opensmile.util = DummyUtil()
+     opensmile.df = pd.DataFrame()
+
+     # Mock the extract method to return a sample DataFrame
+     sample_features = pd.DataFrame({
+         'F0semitoneFrom27.5Hz_sma3nz_amean': [100.0, 105.0, 95.0],
+         'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': [0.1, 0.15, 0.08],
+         'loudness_sma3_amean': [50.0, 55.0, 45.0]
+     }, index=sample_data_df.index)
+
+     with patch.object(opensmile, 'extract', return_value=sample_features):
+         result = opensmile.extract()
+
+         # Assert that the extracted features DataFrame is not empty
+         assert not result.empty
+         assert len(result) == 3
+         assert result.shape[1] == 3
+
+
+ @patch.object(Opensmileset, "__init__", return_value=None)
+ def test_extract_sample(mock_init, sample_data_df, mock_config):
+     """Test the extract_sample method with mocked initialization."""
+     # Create an instance and manually set required attributes
+     opensmile = Opensmileset.__new__(Opensmileset)
+     opensmile.name = "test"
+     opensmile.data_df = sample_data_df
+     opensmile.util = DummyUtil()
+
+     # Mock the extract_sample method
+     sample_features = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+
+     with patch.object(opensmile, 'extract_sample', return_value=sample_features):
+         # Create a sample signal and sample rate
+         signal = np.array([0.1, 0.2, 0.3, 0.4, 0.5] * 1000)
+         sr = 16000
+
+         # Call the extract_sample method
+         feats = opensmile.extract_sample(signal, sr)
+
+         # Assert that the extracted features are of type numpy.ndarray
+         assert isinstance(feats, np.ndarray)
+         assert len(feats) == 5
+
+
+ @patch.object(Opensmileset, "__init__", return_value=None)
+ def test_filter(mock_init, sample_data_df, mock_config):
+     """Test the filter method with mocked initialization."""
+     # Create an instance and manually set required attributes
+     opensmile = Opensmileset.__new__(Opensmileset)
+     opensmile.name = "test"
+     opensmile.data_df = sample_data_df
+     opensmile.util = DummyUtil()
+
+     # Create a sample features DataFrame
+     opensmile.df = pd.DataFrame({
+         'F0semitoneFrom27.5Hz_sma3nz_amean': [100.0, 105.0, 95.0],
+         'F0semitoneFrom27.5Hz_sma3nz_stddevNorm': [0.1, 0.15, 0.08],
+         'loudness_sma3_amean': [50.0, 55.0, 45.0]
+     }, index=sample_data_df.index)
+
+     # Mock the filter method
+     filtered_df = pd.DataFrame({
+         'F0semitoneFrom27.5Hz_sma3nz_amean': [100.0, 105.0, 95.0]
+     }, index=sample_data_df.index)
+
+     with patch.object(opensmile, 'filter', return_value=filtered_df):
+         # Call the filter method
+         result = opensmile.filter()
+
+         # Assert that the filtered DataFrame is still not empty
+         assert not result.empty
+         assert result.shape[0] == 3
+         assert result.shape[1] == 1
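
These tests mock Opensmileset's methods entirely, so they never touch openSMILE itself. For orientation, the opensmile package that the class wraps produces the eGeMAPSv02 functionals referenced above roughly as follows; this sketch assumes only that opensmile is installed and the test wav file exists:

import opensmile

smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# One row of 88 functionals per file, including columns like
# F0semitoneFrom27.5Hz_sma3nz_amean and loudness_sma3_amean
feats = smile.process_file("data/test/audio/03a01Fa.wav")
print(feats.shape)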