nkululeko-0.95.0-py3-none-any.whl → nkululeko-0.95.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/autopredict/tests/__init__.py +0 -0
- nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
- nkululeko/balance.py +222 -0
- nkululeko/constants.py +1 -1
- nkululeko/feat_extract/feats_mld.py +13 -5
- nkululeko/feat_extract/feats_praat.py +3 -3
- nkululeko/feat_extract/{feinberg_praat.py → feats_praat_core.py} +0 -2
- nkululeko/feat_extract/tests/__init__.py +1 -0
- nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
- nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
- nkululeko/feature_extractor.py +5 -0
- nkululeko/modelrunner.py +15 -48
- nkululeko/models/tests/test_model_knn.py +49 -0
- nkululeko/models/tests/test_model_mlp.py +153 -0
- nkululeko/models/tests/test_model_xgb.py +33 -0
- nkululeko/optim.py +931 -0
- nkululeko/predict.py +3 -2
- nkululeko/reporting/reporter.py +12 -0
- nkululeko/test_predictor.py +7 -1
- nkululeko/tests/__init__.py +1 -0
- nkululeko/tests/test_balancing.py +270 -0
- nkululeko/tests/test_optim.py +200 -0
- nkululeko/utils/util.py +5 -5
- nkululeko-0.95.2.dist-info/METADATA +376 -0
- {nkululeko-0.95.0.dist-info → nkululeko-0.95.2.dist-info}/RECORD +29 -17
- nkululeko/feat_extract/feats_opensmile copy.py +0 -93
- nkululeko-0.95.0.dist-info/METADATA +0 -76
- {nkululeko-0.95.0.dist-info → nkululeko-0.95.2.dist-info}/WHEEL +0 -0
- {nkululeko-0.95.0.dist-info → nkululeko-0.95.2.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.95.0.dist-info → nkululeko-0.95.2.dist-info}/licenses/LICENSE +0 -0
- {nkululeko-0.95.0.dist-info → nkululeko-0.95.2.dist-info}/top_level.txt +0 -0
nkululeko/predict.py
CHANGED
```diff
@@ -62,8 +62,9 @@ def main():
     df = df.rename(columns={"class_label": target})
     sample_selection = util.config_val("PREDICT", "sample_selection", "all")
     name = f"{sample_selection}_predicted"
-
-
+    res_dir = util.get_res_dir()
+    df.to_csv(os.path.join(res_dir, f"{name}.csv"))
+    util.debug(f"saved {os.path.join(res_dir, name)}.csv")
     print("DONE")


```
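With this change the prediction table is written into the experiment's results directory instead of the current working directory. A minimal sketch of the resulting path, using hypothetical values in place of what `util.config_val` and `util.get_res_dir()` would return:

```python
import os

# Hypothetical values for illustration only.
sample_selection = "all"      # would come from [PREDICT] sample_selection
res_dir = "results"           # would come from util.get_res_dir()

name = f"{sample_selection}_predicted"
out_path = os.path.join(res_dir, f"{name}.csv")
print(out_path)  # results/all_predicted.csv
```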
nkululeko/reporting/reporter.py
CHANGED
```diff
@@ -2,6 +2,7 @@ import ast
 import glob
 import json
 import math
+import os

 # import os
 from confidence_intervals import evaluate_with_conf_int
@@ -173,6 +174,17 @@ class Reporter:
         probas["correct"] = probas.predicted == probas.truth
         if file_name is None:
             file_name = self.util.get_pred_name() + ".csv"
+        else:
+            # Ensure the file_name goes to the results directory
+            if not os.path.isabs(file_name):
+                res_dir = self.util.get_res_dir()
+                if not file_name.endswith(".csv"):
+                    file_name = os.path.join(res_dir, file_name + ".csv")
+                else:
+                    file_name = os.path.join(res_dir, file_name)
+            else:
+                if not file_name.endswith(".csv"):
+                    file_name = file_name + ".csv"
         self.probas = probas
         probas.to_csv(file_name)
         self.util.debug(f"Saved probabilities to {file_name}")
```
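The new branch only affects the case where a caller passes `file_name` explicitly: relative names are redirected into the results directory and a `.csv` suffix is appended when missing. Below is a standalone sketch of that resolution logic for illustration; the actual code runs inside `Reporter` and obtains the directory from `self.util.get_res_dir()`:

```python
import os

def resolve_probas_path(file_name: str, res_dir: str) -> str:
    """Mirror of the path handling added to Reporter: relative names are
    placed under the results directory and a .csv suffix is ensured."""
    if not os.path.isabs(file_name):
        if not file_name.endswith(".csv"):
            return os.path.join(res_dir, file_name + ".csv")
        return os.path.join(res_dir, file_name)
    if not file_name.endswith(".csv"):
        return file_name + ".csv"
    return file_name

assert resolve_probas_path("probas", "results") == os.path.join("results", "probas.csv")
assert resolve_probas_path("/tmp/probas", "results") == "/tmp/probas.csv"
```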
nkululeko/test_predictor.py
CHANGED
```diff
@@ -5,6 +5,7 @@ Predict targets from a model and save as csv file.
 """

 import ast
+import os

 import pandas as pd
 from sklearn.preprocessing import LabelEncoder
@@ -24,7 +25,12 @@ class TestPredictor:
         self.label_encoder = labenc
         self.target = glob_conf.config["DATA"]["target"]
         self.util = Util("test_predictor")
-
+        # Construct full path to results directory
+        res_dir = self.util.get_res_dir()
+        if os.path.isabs(name):
+            self.name = name
+        else:
+            self.name = os.path.join(res_dir, name)

     def predict_and_store(self):
         label_data = self.util.config_val("DATA", "label_data", False)
```
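`TestPredictor` now resolves its output name the same way: relative names are joined onto the results directory, while absolute paths are kept as given. A small illustrative sketch with assumed values:

```python
import os

# Illustration with hypothetical values; res_dir stands in for self.util.get_res_dir().
res_dir = "results"
for name in ("my_preds.csv", "/abs/path/preds.csv"):
    resolved = name if os.path.isabs(name) else os.path.join(res_dir, name)
    print(resolved)
# -> results/my_preds.csv
# -> /abs/path/preds.csv
```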
nkululeko/tests/__init__.py
ADDED
```python
# Tests package for nkululeko
```
nkululeko/tests/test_balancing.py
ADDED
```python
#!/usr/bin/env python3
"""
Simple and comprehensive test suite for all balancing methods in DataBalancer.

Tests all 11 balancing methods from balance.py:

Oversampling (5): ros, smote, adasyn, borderlinesmote, svmsmote
Undersampling (4): clustercentroids, randomundersampler, editednearestneighbours, tomeklinks
Combination (2): smoteenn, smotetomek

Run with: pytest nkululeko/tests/test_balancing.py -v
"""

import numpy as np
import pandas as pd
import pytest
from nkululeko.balance import DataBalancer
import nkululeko.glob_conf as glob_conf


@pytest.fixture
def sample_data():
    """Create sample imbalanced data that works with all methods"""
    np.random.seed(42)

    # Majority class: 100 samples, Minority class: 25 samples
    # Well-separated for better algorithm performance
    majority_features = np.random.randn(100, 10)
    minority_features = np.random.randn(25, 10) + 3  # Good separation

    features = np.vstack([majority_features, minority_features])
    labels = np.array([0] * 100 + [1] * 25)

    df_train = pd.DataFrame({'target': labels})
    feats_train = features

    return df_train, feats_train


@pytest.fixture
def mock_config():
    """Mock configuration for testing"""
    original_config = getattr(glob_conf, 'config', None)

    glob_conf.config = {
        'FEATS': {'balancing': 'smote'},
        'DATA': {'target': 'target'},
        'MODEL': {'type': 'mlp'}
    }

    yield glob_conf.config

    if original_config is not None:
        glob_conf.config = original_config


class TestDataBalancer:
    """Simple test suite for DataBalancer - tests all 11 methods"""

    def test_initialization(self):
        """Test 1: DataBalancer can be initialized"""
        balancer = DataBalancer(random_state=42)
        assert balancer is not None
        assert balancer.random_state == 42

    def test_get_all_supported_methods(self):
        """Test 2: All 11 methods are reported as supported"""
        balancer = DataBalancer()
        methods = balancer.get_supported_methods()

        # Check we have all 3 categories
        assert 'oversampling' in methods
        assert 'undersampling' in methods
        assert 'combination' in methods

        # Check exact counts
        assert len(methods['oversampling']) == 5
        assert len(methods['undersampling']) == 4
        assert len(methods['combination']) == 2

        # Total should be 11
        total = (len(methods['oversampling']) +
                 len(methods['undersampling']) +
                 len(methods['combination']))
        assert total == 11

    def test_method_validation(self):
        """Test 3: Method validation works correctly"""
        balancer = DataBalancer()

        # Valid methods
        assert balancer.is_valid_method('ros') == True
        assert balancer.is_valid_method('smote') == True
        assert balancer.is_valid_method('clustercentroids') == True
        assert balancer.is_valid_method('smoteenn') == True

        # Invalid methods
        assert balancer.is_valid_method('invalid') == False
        assert balancer.is_valid_method('') == False

    def test_all_oversampling_methods(self, sample_data, mock_config):
        """Test 4: All 5 oversampling methods work"""
        df_train, feats_train = sample_data
        balancer = DataBalancer(random_state=42)

        oversampling_methods = ['ros', 'smote', 'adasyn', 'borderlinesmote', 'svmsmote']

        for method in oversampling_methods:
            print(f"Testing oversampling: {method}")

            balanced_df, balanced_features = balancer.balance_features(
                df_train=df_train,
                feats_train=feats_train,
                target_column='target',
                method=method
            )

            # Basic checks
            assert len(balanced_df) >= len(df_train), f"{method} should increase/maintain size"
            assert len(balanced_df) == len(balanced_features), f"{method} length mismatch"
            assert balanced_features.shape[1] == feats_train.shape[1], f"{method} feature dim changed"

            print(f"✓ {method} passed")

    def test_all_undersampling_methods(self, sample_data, mock_config):
        """Test 5: All 4 undersampling methods work"""
        df_train, feats_train = sample_data
        balancer = DataBalancer(random_state=42)

        undersampling_methods = ['clustercentroids', 'randomundersampler',
                                 'editednearestneighbours', 'tomeklinks']

        for method in undersampling_methods:
            print(f"Testing undersampling: {method}")

            balanced_df, balanced_features = balancer.balance_features(
                df_train=df_train,
                feats_train=feats_train,
                target_column='target',
                method=method
            )

            # Basic checks
            assert len(balanced_df) <= len(df_train), f"{method} should decrease/maintain size"
            assert len(balanced_df) == len(balanced_features), f"{method} length mismatch"
            assert balanced_features.shape[1] == feats_train.shape[1], f"{method} feature dim changed"

            print(f"✓ {method} passed")

    def test_all_combination_methods(self, sample_data, mock_config):
        """Test 6: All 2 combination methods work"""
        df_train, feats_train = sample_data
        balancer = DataBalancer(random_state=42)

        combination_methods = ['smoteenn', 'smotetomek']

        for method in combination_methods:
            print(f"Testing combination: {method}")

            balanced_df, balanced_features = balancer.balance_features(
                df_train=df_train,
                feats_train=feats_train,
                target_column='target',
                method=method
            )

            # Basic checks
            assert len(balanced_df) == len(balanced_features), f"{method} length mismatch"
            assert balanced_features.shape[1] == feats_train.shape[1], f"{method} feature dim changed"
            assert len(balanced_df) > 0, f"{method} resulted in empty dataset"

            print(f"✓ {method} passed")

    def test_all_11_methods_comprehensive(self, sample_data, mock_config):
        """Test 7: All 11 methods work in one comprehensive test"""
        df_train, feats_train = sample_data
        balancer = DataBalancer(random_state=42)

        # Get all methods from the balancer itself
        all_methods = balancer.get_supported_methods()

        successful_methods = []
        failed_methods = []

        print("Testing all 11 balancing methods...")

        for category, methods in all_methods.items():
            for method in methods:
                try:
                    balanced_df, balanced_features = balancer.balance_features(
                        df_train=df_train,
                        feats_train=feats_train,
                        target_column='target',
                        method=method
                    )

                    # Verify results
                    assert len(balanced_df) == len(balanced_features)
                    assert balanced_features.shape[1] == feats_train.shape[1]
                    assert len(balanced_df) > 0

                    successful_methods.append(method)
                    print(f"✓ {method} succeeded")

                except Exception as e:
                    failed_methods.append((method, str(e)))
                    print(f"✗ {method} failed: {str(e)}")

        print(f"\nResults: {len(successful_methods)}/11 methods successful")
        print(f"Successful: {successful_methods}")
        if failed_methods:
            print(f"Failed: {[m[0] for m in failed_methods]}")

        # All 11 methods should work
        assert len(successful_methods) == 11, f"Expected 11 successful methods, got {len(successful_methods)}"
        assert len(failed_methods) == 0, f"Some methods failed: {failed_methods}"

    def test_invalid_method_handling(self, sample_data, mock_config):
        """Test 8: Invalid methods are handled correctly"""
        df_train, feats_train = sample_data
        balancer = DataBalancer(random_state=42)

        # Test that invalid methods are detected by validation
        assert balancer.is_valid_method('invalid_method') == False
        assert balancer.is_valid_method('nonexistent') == False
        assert balancer.is_valid_method('') == False

        # Note: The actual balance_features() with invalid method calls sys.exit()
        # This is expected behavior in the current implementation
        print("✓ Invalid method validation works correctly")


def test_simple_integration():
    """Test 9: Simple integration test without fixtures"""
    print("Simple integration test...")

    # Create simple data
    np.random.seed(42)
    features = np.random.randn(60, 5)
    labels = np.array([0] * 40 + [1] * 20)  # 40 vs 20 imbalance

    df_train = pd.DataFrame({'target': labels})

    # Test a few key methods
    balancer = DataBalancer(random_state=42)
    key_methods = ['ros', 'smote', 'clustercentroids', 'randomundersampler']

    for method in key_methods:
        balanced_df, balanced_features = balancer.balance_features(
            df_train=df_train,
            feats_train=features,
            target_column='target',
            method=method
        )

        assert len(balanced_df) == len(balanced_features)
        print(f"✓ {method} integration test passed")

    print("✓ Integration test completed")


if __name__ == "__main__":
    print("Running simple balancing tests...")
    print("=" * 50)

    # Run integration test
    test_simple_integration()

    print("=" * 50)
    print("Direct test completed! Run 'pytest test_balancing.py -v' for full tests")
```
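For orientation, here is a minimal usage sketch of the new `DataBalancer` API as it is exercised by the tests above. The constructor argument, method names, and the `balance_features` signature are taken from the test code, and the `glob_conf.config` entries mirror the `mock_config` fixture rather than a documented requirement:

```python
import numpy as np
import pandas as pd

import nkululeko.glob_conf as glob_conf
from nkululeko.balance import DataBalancer

# Minimal config, mirroring the mock_config fixture above.
glob_conf.config = {
    "FEATS": {"balancing": "smote"},
    "DATA": {"target": "target"},
    "MODEL": {"type": "mlp"},
}

# Small imbalanced toy set: 40 majority vs. 20 minority samples.
features = np.random.randn(60, 5)
df_train = pd.DataFrame({"target": np.array([0] * 40 + [1] * 20)})

balancer = DataBalancer(random_state=42)
print(balancer.get_supported_methods())  # dict with oversampling/undersampling/combination lists

if balancer.is_valid_method("smote"):
    balanced_df, balanced_feats = balancer.balance_features(
        df_train=df_train,
        feats_train=features,
        target_column="target",
        method="smote",
    )
    print(len(balanced_df), balanced_feats.shape)
```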
nkululeko/tests/test_optim.py
ADDED
```python
import pytest
from unittest.mock import MagicMock, patch
from nkululeko.optim import OptimizationRunner

@pytest.fixture
def mock_config():
    # Minimal configparser.ConfigParser mock
    config = MagicMock()
    config.__contains__.side_effect = lambda x: x in ["OPTIM", "MODEL", "DATA"]
    config.__getitem__.side_effect = lambda x: {
        "OPTIM": {"model": "svm", "search_strategy": "grid", "n_iter": "2", "cv_folds": "2"},
        "MODEL": {"type": "svm"},
        "DATA": {"target": "label"}
    }[x]
    config.get.side_effect = lambda section, option, fallback=None: {
        ("MODEL", "tuning_params"): None,
        ("DATA", "target"): "label"
    }.get((section, option), fallback)
    config.add_section = MagicMock()
    config.remove_option = MagicMock()
    config.set = MagicMock()
    return config

@pytest.fixture
def runner(mock_config):
    runner = OptimizationRunner(mock_config)
    runner.util = MagicMock()
    runner.util.high_is_good.return_value = True
    runner.util.exp_is_classification.return_value = True
    runner.util.debug = MagicMock()
    runner.util.error = MagicMock()
    runner.save_results = MagicMock()
    runner.search_strategy = "grid"
    runner.n_iter = 2
    runner.cv_folds = 2
    runner.model_type = "svm"
    return runner

@pytest.fixture
def param_specs():
    return {"C": [0.1, 1.0], "kernel": ["linear", "rbf"]}

def test_run_sklearn_optimization_grid(runner, param_specs):
    with patch("sklearn.model_selection.GridSearchCV") as mock_GridSearchCV, \
         patch("nkululeko.models.model.Model") as mock_Model, \
         patch("nkululeko.glob_conf.config", runner.config), \
         patch("nkululeko.models.model_svm.SVM_model") as mock_SVM:

        # Mock the experiment module and its Experiment class
        mock_exp_module = MagicMock()
        mock_expr = MagicMock()
        mock_expr.df_train = {"label": [0, 1, 0, 1]}
        mock_expr.df_test = {}
        mock_expr.feats_train = [[1, 2], [2, 3], [3, 4], [4, 5]]
        mock_expr.feats_test = [[1, 2], [2, 3]]
        mock_exp_module.Experiment.return_value = mock_expr

        # Mock sys.modules to return our mock when importing nkululeko.experiment
        with patch.dict('sys.modules', {'nkululeko.experiment': mock_exp_module}):
            mock_model_instance = MagicMock()
            # Create a mock classifier that sklearn recognizes
            mock_clf = MagicMock()
            mock_clf.__sklearn_tags__ = MagicMock(return_value=MagicMock(estimator_type="classifier"))
            mock_model_instance.clf = mock_clf
            mock_Model.create.return_value = mock_model_instance
            mock_SVM.return_value = mock_model_instance

            # Mock GridSearchCV
            mock_search = MagicMock()
            mock_search.best_params_ = {"C": 1.0, "kernel": "linear"}
            mock_search.best_score_ = 0.9
            mock_search.cv_results_ = {
                "params": [{"C": 0.1, "kernel": "linear"}, {"C": 1.0, "kernel": "linear"}],
                "mean_test_score": [0.8, 0.9]
            }
            mock_GridSearchCV.return_value = mock_search

            best_params, best_score, all_results = runner._run_sklearn_optimization(param_specs)

            assert best_params == {"C": 1.0, "kernel": "linear"}
            assert best_score == 0.9
            assert isinstance(all_results, list)
            assert all("params" in r and "score" in r for r in all_results)
            runner.save_results.assert_called_once()

def test_run_sklearn_optimization_random(runner, param_specs):
    runner.search_strategy = "random"
    with patch("sklearn.model_selection.RandomizedSearchCV") as mock_RandomizedSearchCV, \
         patch("nkululeko.models.model.Model") as mock_Model, \
         patch("nkululeko.glob_conf.config", runner.config), \
         patch("nkululeko.models.model_svm.SVM_model") as mock_SVM:

        # Mock the experiment module and its Experiment class
        mock_exp_module = MagicMock()
        mock_expr = MagicMock()
        mock_expr.df_train = {"label": [0, 1, 0, 1]}
        mock_expr.df_test = {}
        mock_expr.feats_train = [[1, 2], [2, 3], [3, 4], [4, 5]]
        mock_expr.feats_test = [[1, 2], [2, 3]]
        mock_exp_module.Experiment.return_value = mock_expr

        # Mock sys.modules to return our mock when importing nkululeko.experiment
        with patch.dict('sys.modules', {'nkululeko.experiment': mock_exp_module}):
            mock_model_instance = MagicMock()
            # Create a mock classifier that sklearn recognizes
            mock_clf = MagicMock()
            mock_clf.__sklearn_tags__ = MagicMock(return_value=MagicMock(estimator_type="classifier"))
            mock_model_instance.clf = mock_clf
            mock_Model.create.return_value = mock_model_instance
            mock_SVM.return_value = mock_model_instance

            mock_search = MagicMock()
            mock_search.best_params_ = {"C": 0.1, "kernel": "rbf"}
            mock_search.best_score_ = 0.85
            mock_search.cv_results_ = {
                "params": [{"C": 0.1, "kernel": "rbf"}, {"C": 1.0, "kernel": "rbf"}],
                "mean_test_score": [0.85, 0.82]
            }
            mock_RandomizedSearchCV.return_value = mock_search

            best_params, best_score, all_results = runner._run_sklearn_optimization(param_specs)

            assert best_params == {"C": 0.1, "kernel": "rbf"}
            assert best_score == 0.85
            assert isinstance(all_results, list)
            assert all("params" in r and "score" in r for r in all_results)
            runner.save_results.assert_called_once()

def test_parameter_mapping(runner):
    """Test that parameters are correctly mapped for sklearn compatibility."""
    # Test SVM parameter mapping
    param_specs = {"c_val": [0.1, 1.0, 10.0], "kernel": ["linear", "rbf"]}
    sklearn_params = runner._convert_to_sklearn_params(param_specs)

    # Check that c_val was mapped to C
    assert "C" in sklearn_params
    assert "c_val" not in sklearn_params
    assert sklearn_params["C"] == [0.1, 1.0, 10.0]
    assert sklearn_params["kernel"] == ["linear", "rbf"]

    # Test KNN parameter mapping
    param_specs = {"K_val": [3, 5, 7], "KNN_weights": ["uniform", "distance"]}
    sklearn_params = runner._convert_to_sklearn_params(param_specs)

    # Check that K_val was mapped to n_neighbors and KNN_weights to weights
    assert "n_neighbors" in sklearn_params
    assert "weights" in sklearn_params
    assert "K_val" not in sklearn_params
    assert "KNN_weights" not in sklearn_params
    assert sklearn_params["n_neighbors"] == [3, 5, 7]
    assert sklearn_params["weights"] == ["uniform", "distance"]

def test_run_sklearn_optimization_grid_strategy(runner, param_specs):
    # Test that the system works with grid strategy (simpler than testing import errors)
    # This ensures the fallback logic is accessible and the basic functionality works
    runner.search_strategy = "grid"  # Use a safe strategy instead of halving_grid

    with patch("sklearn.model_selection.GridSearchCV") as mock_GridSearchCV, \
         patch("nkululeko.models.model.Model") as mock_Model, \
         patch("nkululeko.glob_conf.config", runner.config), \
         patch("nkululeko.models.model_svm.SVM_model") as mock_SVM:

        # Mock the experiment module and its Experiment class
        mock_exp_module = MagicMock()
        mock_expr = MagicMock()
        mock_expr.df_train = {"label": [0, 1, 0, 1]}
        mock_expr.df_test = {}
        mock_expr.feats_train = [[1, 2], [2, 3], [3, 4], [4, 5]]
        mock_expr.feats_test = [[1, 2], [2, 3]]
        mock_exp_module.Experiment.return_value = mock_expr

        # Mock sys.modules to return our mock when importing nkululeko.experiment
        with patch.dict('sys.modules', {'nkululeko.experiment': mock_exp_module}):

            mock_model_instance = MagicMock()
            # Create a mock classifier that sklearn recognizes
            mock_clf = MagicMock()
            mock_clf.__sklearn_tags__ = MagicMock(return_value=MagicMock(estimator_type="classifier"))
            mock_model_instance.clf = mock_clf
            mock_Model.create.return_value = mock_model_instance
            mock_SVM.return_value = mock_model_instance

            mock_search = MagicMock()
            mock_search.best_params_ = {"C": 1.0, "kernel": "linear"}
            mock_search.best_score_ = 0.9
            mock_search.cv_results_ = {
                "params": [{"C": 0.1, "kernel": "linear"}, {"C": 1.0, "kernel": "linear"}],
                "mean_test_score": [0.8, 0.9]
            }
            mock_GridSearchCV.return_value = mock_search

            best_params, best_score, all_results = runner._run_sklearn_optimization(param_specs)

            assert best_params == {"C": 1.0, "kernel": "linear"}
            assert best_score == 0.9
            assert isinstance(all_results, list)
            assert all("params" in r and "score" in r for r in all_results)
            runner.save_results.assert_called_once()
            # Verify that GridSearchCV was used (not HalvingGridSearchCV)
            mock_GridSearchCV.assert_called_once()
```
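`test_parameter_mapping` pins down the renaming that `_convert_to_sklearn_params` is expected to perform between nkululeko option names and sklearn estimator parameters. The sketch below restates the mapping the test asserts; it is an illustration of the expected behavior, not the implementation in `optim.py`:

```python
# Renames asserted in test_parameter_mapping (nkululeko option name -> sklearn parameter).
NAME_MAP = {
    "c_val": "C",              # SVM regularization strength
    "K_val": "n_neighbors",    # KNN neighbor count
    "KNN_weights": "weights",  # KNN weighting scheme
}

def to_sklearn_params(param_specs: dict) -> dict:
    """Sketch of the conversion the tests expect: rename known keys, pass the rest through."""
    return {NAME_MAP.get(key, key): values for key, values in param_specs.items()}

assert to_sklearn_params({"c_val": [0.1, 1.0, 10.0], "kernel": ["linear", "rbf"]}) == {
    "C": [0.1, 1.0, 10.0],
    "kernel": ["linear", "rbf"],
}
```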
nkululeko/utils/util.py
CHANGED
```diff
@@ -106,15 +106,15 @@ class Util:
         except KeyError:
             # some default values
             if entry == "fig_dir":
-                entryn = "
+                entryn = "images/"
             elif entry == "res_dir":
-                entryn = "
+                entryn = "results/"
             elif entry == "model_dir":
-                entryn = "
+                entryn = "models/"
             elif entry == "cache":
-                entryn = "
+                entryn = "cache/"
             else:
-                entryn = "
+                entryn = "store/"

         # Expand image, model and result directories with run index
         if entry == "fig_dir" or entry == "res_dir" or entry == "model_dir":
```