nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. nkululeko/augmenting/resampler.py +5 -2
  2. nkululeko/autopredict/ap_emotion.py +36 -0
  3. nkululeko/autopredict/ap_text.py +45 -0
  4. nkululeko/autopredict/tests/__init__.py +0 -0
  5. nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
  6. nkululeko/autopredict/whisper_transcriber.py +81 -0
  7. nkululeko/balance.py +222 -0
  8. nkululeko/constants.py +1 -1
  9. nkululeko/experiment.py +53 -3
  10. nkululeko/explore.py +32 -13
  11. nkululeko/feat_extract/feats_analyser.py +45 -17
  12. nkululeko/feat_extract/feats_emotion2vec.py +51 -26
  13. nkululeko/feat_extract/feats_praat.py +3 -3
  14. nkululeko/feat_extract/feats_praat_core.py +769 -0
  15. nkululeko/feat_extract/tests/__init__.py +1 -0
  16. nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
  17. nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
  18. nkululeko/glob_conf.py +9 -0
  19. nkululeko/modelrunner.py +15 -39
  20. nkululeko/models/model.py +4 -42
  21. nkululeko/models/model_tuned.py +416 -84
  22. nkululeko/models/model_xgb.py +148 -2
  23. nkululeko/models/tests/test_model_knn.py +49 -0
  24. nkululeko/models/tests/test_model_mlp.py +153 -0
  25. nkululeko/models/tests/test_model_xgb.py +33 -0
  26. nkululeko/nkululeko.py +0 -9
  27. nkululeko/plots.py +25 -19
  28. nkululeko/predict.py +8 -6
  29. nkululeko/reporting/report.py +7 -5
  30. nkululeko/reporting/reporter.py +20 -5
  31. nkululeko/test_predictor.py +7 -1
  32. nkululeko/tests/__init__.py +1 -0
  33. nkululeko/tests/test_balancing.py +270 -0
  34. nkululeko/utils/util.py +38 -6
  35. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
  36. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
  37. nkululeko/feat_extract/feats_opensmile copy.py +0 -93
  38. nkululeko/feat_extract/feinberg_praat.py +0 -628
  39. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
  40. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
  41. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
  42. {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
nkululeko/models/model_xgb.py CHANGED
@@ -1,18 +1,164 @@
 # model_xgb.py
 
+import os
 from xgboost import XGBClassifier
 
+import nkululeko.glob_conf as glob_conf
 from nkululeko.models.model import Model
 
 
 class XGB_model(Model):
-    """An XGBoost model"""
+    """An XGBoost model with early stopping support"""
 
     def __init__(self, df_train, df_test, feats_train, feats_test):
         super().__init__(df_train, df_test, feats_train, feats_test)
         self.name = "xgb"
         self.is_classifier = True
-        self.clf = XGBClassifier()  # set up the classifier
+
+        # Configure XGBoost parameters
+        xgb_params = {}
+
+        # Get early stopping configuration
+        self.early_stopping_rounds = self.util.config_val(
+            "MODEL", "early_stopping_rounds", False
+        )
+        self.eval_metric = self.util.config_val("MODEL", "eval_metric", "logloss")
+
+        # Set up other XGBoost parameters that can be configured
+        n_estimators = self.util.config_val("MODEL", "n_estimators", 100)
+        max_depth = self.util.config_val("MODEL", "max_depth", 6)
+        learning_rate = self.util.config_val("MODEL", "learning_rate", 0.3)
+        subsample = self.util.config_val("MODEL", "subsample", 1.0)
+
+        xgb_params["n_estimators"] = int(n_estimators)
+        xgb_params["max_depth"] = int(max_depth)
+        xgb_params["learning_rate"] = float(learning_rate)
+        xgb_params["subsample"] = float(subsample)
+
+        # Set random state for reproducibility
+        xgb_params["random_state"] = 42
+
+        # Add early stopping parameters to model initialization if configured
+        if self.early_stopping_rounds:
+            xgb_params["early_stopping_rounds"] = int(self.early_stopping_rounds)
+            xgb_params["eval_metric"] = self.eval_metric
+
+        # Initialize classifier with parameters
+        self.clf = XGBClassifier(**xgb_params)
+
+    def train(self):
+        """Train the XGBoost model with optional early stopping."""
+        # Check if NANs in features and handle them
+        if self.feats_train.isna().to_numpy().any():
+            self.util.debug(
+                "Model, train: replacing"
+                f" {self.feats_train.isna().sum().sum()} NANs with 0"
+            )
+            self.feats_train = self.feats_train.fillna(0)
+
+        feats = self.feats_train.to_numpy()
+        labels = self.df_train[self.target]
+
+        # Configure fitting parameters
+        fit_params = {}
+
+        # Check if early stopping is configured
+        if self.early_stopping_rounds:
+            # Check if we're in split3 mode (train/dev/test) where validation data is available
+            import ast
+
+            split3 = ast.literal_eval(
+                self.util.config_val("EXP", "traindevtest", "False")
+            )
+
+            if split3 and self.feats_test is not None and self.df_test is not None:
+                # In split3 mode, self.feats_test and self.df_test are actually the dev set
+                feats_dev = self.feats_test.to_numpy()
+                labels_dev = self.df_test[self.target]
+
+                # Handle NANs in dev features
+                if self.feats_test.isna().to_numpy().any():
+                    self.util.debug(
+                        "Model, dev: replacing"
+                        f" {self.feats_test.isna().sum().sum()} NANs with 0"
+                    )
+                    feats_dev = self.feats_test.fillna(0).to_numpy()
+
+                # Set up early stopping with validation data
+                eval_set = [(feats, labels), (feats_dev, labels_dev)]
+                fit_params["eval_set"] = eval_set
+                fit_params["verbose"] = True
+
+                self.util.debug(
+                    f"Training XGBoost with early stopping (using dev set):"
+                )
+                self.util.debug(
+                    f" - early_stopping_rounds: {self.early_stopping_rounds}"
+                )
+                self.util.debug(f" - eval_metric: {self.eval_metric}")
+                self.util.debug(f" - validation set size: {feats_dev.shape[0]}")
+            else:
+                # For train/test split only: use a portion of training data for validation
+                from sklearn.model_selection import train_test_split
+
+                # Get validation split ratio (default 0.2 = 20% of training data)
+                val_split = float(
+                    self.util.config_val("MODEL", "validation_split", 0.2)
+                )
+
+                # Split training data into train and validation
+                feats_train_split, feats_val, labels_train_split, labels_val = (
+                    train_test_split(
+                        feats,
+                        labels,
+                        test_size=val_split,
+                        random_state=42,
+                        stratify=labels,
+                    )
+                )
+
+                # Set up early stopping with validation split
+                eval_set = [
+                    (feats_train_split, labels_train_split),
+                    (feats_val, labels_val),
+                ]
+                fit_params["eval_set"] = eval_set
+                fit_params["verbose"] = True
+
+                # Use the split training data for actual training
+                feats = feats_train_split
+                labels = labels_train_split
+
+                self.util.debug(
+                    f"Training XGBoost with early stopping (using validation split):"
+                )
+                self.util.debug(
+                    f" - early_stopping_rounds: {self.early_stopping_rounds}"
+                )
+                self.util.debug(f" - eval_metric: {self.eval_metric}")
+                self.util.debug(f" - validation_split: {val_split}")
+                self.util.debug(f" - training set size: {feats_train_split.shape[0]}")
+                self.util.debug(f" - validation set size: {feats_val.shape[0]}")
+
+        # Handle class weights if configured
+        class_weight = self.util.config_val("MODEL", "class_weight", False)
+        if class_weight:
+            import sklearn.utils.class_weight
+
+            self.util.debug("using class weight")
+            classes_weights = sklearn.utils.class_weight.compute_sample_weight(
+                class_weight="balanced", y=labels
+            )
+            fit_params["sample_weight"] = classes_weights
+
+        # Train the model
+        self.clf.fit(feats, labels, **fit_params)
+
+        # Log information about the trained model
+        if hasattr(self.clf, "best_iteration"):
+            self.util.debug(f"Best iteration: {self.clf.best_iteration}")
+        if hasattr(self.clf, "best_score"):
+            self.util.debug(f"Best score: {self.clf.best_score}")
 
     def get_type(self):
         return "xgb"
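The new early stopping path is driven entirely by the experiment's INI configuration; every key below appears in a config_val call in the diff above. A minimal sketch (the concrete values are illustrative, not nkululeko defaults beyond those visible in the code):

    [EXP]
    # when True, nkululeko uses train/dev/test splits and the dev set drives early stopping
    traindevtest = True

    [MODEL]
    type = xgb
    # stop when the eval metric has not improved for this many boosting rounds
    early_stopping_rounds = 10
    eval_metric = logloss
    n_estimators = 100
    max_depth = 6
    learning_rate = 0.3
    subsample = 1.0
    # only consulted when traindevtest is False: share of training data held out for validation
    validation_split = 0.2

If early_stopping_rounds is left unset, the classifier is still built from the configured parameters but trains on the full training set, as before.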
nkululeko/models/tests/test_model_knn.py ADDED
@@ -0,0 +1,49 @@
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from nkululeko.models.model_knn import KNN_model
+
+
+@pytest.fixture
+def mock_util():
+    mock = MagicMock()
+    mock.config_val.side_effect = lambda section, key, default: {
+        ("MODEL", "KNN_weights", "uniform"): "distance",
+        ("MODEL", "K_val", "5"): "3"
+    }[(section, key, default)]
+    return mock
+
+@pytest.fixture
+def dummy_data():
+    df_train = MagicMock()
+    df_test = MagicMock()
+    feats_train = MagicMock()
+    feats_test = MagicMock()
+    return df_train, df_test, feats_train, feats_test
+
+def test_knn_model_initialization(monkeypatch, mock_util, dummy_data):
+    with patch.object(KNN_model, "__init__", return_value=None):
+        model = KNN_model(*dummy_data)
+        model.util = mock_util
+        model.name = "knn"
+        from sklearn.neighbors import KNeighborsClassifier
+        model.clf = KNeighborsClassifier(n_neighbors=3, weights="distance")
+        model.is_classifier = True
+        assert model.name == "knn"
+        assert model.clf.get_params()["n_neighbors"] == 3
+        assert model.clf.get_params()["weights"] == "distance"
+        assert model.is_classifier is True
+
+def test_knn_model_default_params(monkeypatch, dummy_data):
+    mock_util = MagicMock()
+    mock_util.config_val.side_effect = lambda section, key, default: default
+    with patch.object(KNN_model, "__init__", return_value=None):
+        model = KNN_model(*dummy_data)
+        model.util = mock_util
+        model.name = "knn"
+        from sklearn.neighbors import KNeighborsClassifier
+        model.clf = KNeighborsClassifier(n_neighbors=5, weights="uniform")
+        model.is_classifier = True
+        assert model.clf.get_params()["n_neighbors"] == 5
+        assert model.clf.get_params()["weights"] == "uniform"
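The mocked config values in the first test mirror the KNN options that the real model class reads via config_val. As a sketch, the equivalent INI fragment would be (values taken from the mock above, for illustration only):

    [MODEL]
    type = knn
    # neighbour weighting: uniform or distance
    KNN_weights = distance
    # number of neighbours
    K_val = 3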
nkululeko/models/tests/test_model_mlp.py ADDED
@@ -0,0 +1,153 @@
+import numpy as np
+import pandas as pd
+import pytest
+import torch
+from unittest.mock import patch
+
+from nkululeko.models.model_mlp import MLPModel
+
+
+class DummyUtil:
+    def config_val(self, section, key, default=None):
+        # Provide defaults for required config values
+        if key == "manual_seed":
+            return True
+        if key == "loss":
+            return "cross"
+        if key == "device":
+            return "cpu"
+        if key == "learning_rate":
+            return 0.001
+        if key == "batch_size":
+            return 2
+        if key == "drop":
+            return False
+        return default
+    def debug(self, msg): pass
+    def error(self, msg): raise Exception(msg)
+    def get_path(self, key): return "./"
+    def get_exp_name(self, only_train=False): return "exp"
+
+@pytest.fixture(autouse=True)
+def patch_globals(monkeypatch):
+    # Patch global config and labels
+    import nkululeko.glob_conf as glob_conf
+    glob_conf.config = {
+        "DATA": {"target": "label"},
+        "MODEL": {"layers": "{'a': 8, 'b': 4}"}
+    }
+    glob_conf.labels = [0, 1]
+    yield
+
+@pytest.fixture
+def dummy_data():
+    # 4 samples, 3 features
+    feats_train = pd.DataFrame(np.random.rand(4, 3), columns=['f1', 'f2', 'f3'])
+    feats_test = pd.DataFrame(np.random.rand(2, 3), columns=['f1', 'f2', 'f3'])
+    df_train = pd.DataFrame({'label': [0, 1, 0, 1]})
+    df_test = pd.DataFrame({'label': [1, 0]})
+    return df_train, df_test, feats_train, feats_test
+
+@pytest.fixture
+def mlp_model(dummy_data, monkeypatch):
+    df_train, df_test, feats_train, feats_test = dummy_data
+    with patch.object(MLPModel, "__init__", return_value=None):
+        model = MLPModel(df_train, df_test, feats_train, feats_test)
+        model.util = DummyUtil()
+        model.n_jobs = 1
+        model.target = "label"
+        model.class_num = 2
+        model.criterion = torch.nn.CrossEntropyLoss()
+        model.device = "cpu"
+        model.learning_rate = 0.001
+        model.batch_size = 2
+        model.num_workers = 1
+        model.loss = 0.0
+        model.loss_eval = 0.0
+        model.run = 0
+        model.epoch = 0
+        model.df_test = df_test
+        model.feats_test = feats_test
+        model.feats_train = feats_train
+
+        # Create a simple MLP model for testing
+        model.model = MLPModel.MLP(3, {'a': 8, 'b': 4}, 2, False).to("cpu")
+        model.optimizer = torch.optim.Adam(model.model.parameters(), lr=0.001)
+
+        # Create data loaders
+        model.trainloader = model.get_loader(feats_train, df_train, True)
+        model.testloader = model.get_loader(feats_test, df_test, False)
+        model.store_path = "/tmp/test_model.pt"
+
+        return model
+
+def test_mlpmodel_init(mlp_model):
+    assert hasattr(mlp_model, "model")
+    assert hasattr(mlp_model, "trainloader")
+    assert hasattr(mlp_model, "testloader")
+    assert mlp_model.model is not None
+
+def test_train_and_predict(mlp_model):
+    mlp_model.train()
+    report = mlp_model.predict()
+    assert hasattr(report, "result")
+    assert hasattr(report.result, "train")
+
+def test_get_predictions(mlp_model):
+    mlp_model.train()
+    preds = mlp_model.get_predictions()
+    assert isinstance(preds, np.ndarray)
+    assert preds.shape[0] == 2
+
+def test_get_probas(mlp_model):
+    mlp_model.train()
+    _, _, _, logits = mlp_model.evaluate(mlp_model.model, mlp_model.testloader, mlp_model.device)
+    probas = mlp_model.get_probas(logits)
+    assert isinstance(probas, pd.DataFrame)
+    assert set(probas.columns) == set([0, 1])
+
+def test_predict_sample(mlp_model):
+    mlp_model.train()
+    feats = np.random.rand(3)
+    res = mlp_model.predict_sample(feats)
+    assert isinstance(res, dict)
+    assert set(res.keys()) == set([0, 1])
+
+def test_predict_shap(mlp_model):
+    mlp_model.train()
+    feats = pd.DataFrame(np.random.rand(2, 3))
+    results = mlp_model.predict_shap(feats)
+    assert len(results) == 2
+
+def test_store_and_load(tmp_path, mlp_model, monkeypatch):
+    mlp_model.train()
+
+    # Mock the util methods that load() uses to construct the path
+    def mock_get_path(key):
+        if key == "model_dir":
+            return str(tmp_path) + "/"
+        return "./"
+
+    def mock_get_exp_name(only_train=False):
+        return "model"
+
+    mlp_model.util.get_path = mock_get_path
+    mlp_model.util.get_exp_name = mock_get_exp_name
+
+    # Set store path to match what load() will construct
+    mlp_model.store_path = str(tmp_path) + "/model_0_000.model"
+    mlp_model.store()
+
+    # Simulate loading
+    mlp_model.load(0, 0)
+    assert mlp_model.model is not None
+
+def test_set_testdata(mlp_model, dummy_data):
+    _, df_test, _, feats_test = dummy_data
+    mlp_model.set_testdata(df_test, feats_test)
+    assert mlp_model.testloader is not None
+
+def test_reset_test(mlp_model, dummy_data):
+    _, df_test, _, feats_test = dummy_data
+    mlp_model.reset_test(df_test, feats_test)
+    assert mlp_model.testloader is not None
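The DummyUtil and patch_globals fixtures together stand in for the experiment configuration that MLPModel normally reads. A sketch of an equivalent INI fragment, with the values lifted from the fixtures above (illustrative only):

    [MODEL]
    type = mlp
    # parsed with ast.literal_eval: mapping of layer name to layer width
    layers = {'a': 8, 'b': 4}
    loss = cross
    device = cpu
    learning_rate = 0.001
    batch_size = 2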
nkululeko/models/tests/test_model_xgb.py ADDED
@@ -0,0 +1,33 @@
+import pandas as pd
+import pytest
+
+from ..model_xgb import XGB_model
+
+
+class DummyUtil:
+    def config_val(self, section, key, default):
+        return default
+    def debug(self, msg):
+        pass
+
+class DummyModel(XGB_model):
+    def __init__(self, df_train, df_test, feats_train, feats_test):
+        # Patch util before calling super().__init__
+        self.util = DummyUtil()
+        self.target = "label"
+        super().__init__(df_train, df_test, feats_train, feats_test)
+        self.util = DummyUtil()
+        self.target = "label"
+
+@pytest.fixture
+def dummy_data():
+    df_train = pd.DataFrame({"label": [0, 1], "f1": [1.0, 2.0]})
+    df_test = pd.DataFrame({"label": [0, 1], "f1": [1.5, 2.5]})
+    feats_train = df_train[["f1"]]
+    feats_test = df_test[["f1"]]
+    return df_train, df_test, feats_train, feats_test
+
+def test_get_type_returns_xgb(dummy_data):
+    df_train, df_test, feats_train, feats_test = dummy_data
+    model = DummyModel(df_train, df_test, feats_train, feats_test)
+    assert model.get_type() == "xgb"
nkululeko/nkululeko.py CHANGED
@@ -54,15 +54,6 @@ def doit(config_file):
     reports, last_epochs = expr.run()
     result = expr.get_best_report(reports).result.test
     expr.store_report()
-
-    # check if we want to export the model
-    o_path = util.config_val("EXP", "export_onnx", "False")
-    if eval(o_path):
-        print(f"Exporting ONNX model to {o_path}")
-        o_path = o_path.replace('"', '')
-        expr.runmgr.get_best_model().export_onnx(str(o_path))
-
-
     print("DONE")
     return result, int(np.asarray(last_epochs).min())
 
nkululeko/plots.py CHANGED
@@ -1,5 +1,6 @@
 # plots.py
 import ast
+import os
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -87,9 +88,10 @@ class Plots:
 
     def plot_distributions(self, df, type_s="samples"):
         class_label, df = self._check_binning("class_label", df)
-        attributes = ast.literal_eval(
-            self.util.config_val("EXPL", "value_counts", False)
-        )
+        value_counts_conf = self.util.config_val("EXPL", "value_counts", False)
+        if not isinstance(value_counts_conf, str):
+            value_counts_conf = str(value_counts_conf)
+        attributes = ast.literal_eval(value_counts_conf)
         # always plot the distribution of the main attribute
         filename = f"{class_label}_distribution"
         if self.util.is_categorical(df[class_label]):
@@ -216,11 +218,11 @@ class Plots:
 
     def save_plot(self, ax, caption, header, filename, type_s):
         # one up because of the runs
-        fig_dir = self.util.get_path("fig_dir") + "../"
+        fig_dir = os.path.dirname(self.util.get_path("fig_dir"))
         fig_plots = ax.figure
         # avoid warning
         # plt.tight_layout()
-        img_path = f"{fig_dir}{filename}_{type_s}.{self.format}"
+        img_path = os.path.join(fig_dir, f"{filename}_{type_s}.{self.format}")
         plt.savefig(img_path)
         plt.close(fig_plots)
         self.util.debug(f"Saved plot to {img_path}")
@@ -359,7 +361,7 @@ class Plots:
 
     def plot_durations(self, df, filename, sample_selection, caption=""):
         # one up because of the runs
-        fig_dir = self.util.get_path("fig_dir") + "../"
+        fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
         try:
             ax = sns.histplot(df, x="duration", hue="class_label", kde=True)
         except AttributeError as ae:
@@ -376,7 +378,7 @@ class Plots:
         ax.set_ylabel("number of samples")
         fig = ax.figure
         # plt.tight_layout()
-        img_path = f"{fig_dir}{filename}_{sample_selection}.{self.format}"
+        img_path = os.path.join(fig_dir, f"{filename}_{sample_selection}.{self.format}")
         plt.savefig(img_path)
         plt.close(fig)
         self.util.debug(f"plotted durations to {img_path}")
@@ -393,14 +395,14 @@ class Plots:
         filename = "speakers"
         caption = "speakers"
         # one up because of the runs
-        fig_dir = self.util.get_path("fig_dir") + "../"
+        fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
         sns.set_style("whitegrid")  # Set style for chart
         ax = df["speaker"].value_counts().plot(kind="pie", autopct="%1.1f%%")
         title = f"Speaker distr. for {sample_selection} {df.shape[0]}."
         ax.set_title(title)
         fig = ax.figure
         # plt.tight_layout()
-        img_path = f"{fig_dir}{filename}_{sample_selection}.{self.format}"
+        img_path = os.path.join(fig_dir, f"{filename}_{sample_selection}.{self.format}")
         plt.savefig(img_path)
         plt.close(fig)
         self.util.debug(f"plotted speakers to {img_path}")
@@ -415,7 +417,7 @@ class Plots:
 
     def describe_df(self, name, df, target, filename):
         """Make a stacked barplot of samples and speakers per sex and target values. speaker, gender and target columns must be present"""
-        fig_dir = self.util.get_path("fig_dir") + "../"  # one up because of the runs
+        fig_dir = self.util.get_path("fig_dir")  # + "../"  # one up because of the runs
         sampl_num = df.shape[0]
         sex_col = "gender"
         if target == "gender":
@@ -447,7 +449,7 @@ class Plots:
             kind="bar", ax=axes, title=f"samples ({sampl_num})"
         )
         # plt.tight_layout()
-        img_path = f"{fig_dir}{filename}.{self.format}"
+        img_path = os.path.join(fig_dir, f"{filename}.{self.format}")
         plt.savefig(img_path)
         fig.clear()
         plt.close(fig)
@@ -462,11 +464,12 @@ class Plots:
 
     def scatter_plot(self, feats, label_df, label, dimred_type):
         dim_num = int(self.util.config_val("EXPL", "scatter.dim", 2))
-        # one up because of the runs
-        fig_dir = self.util.get_path("fig_dir") + "../"
+        # one up because of the runs (for explore module)
+        fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
         sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
-        filename = f"{label}_{self.util.get_feattype_name()}_{sample_selection}_{dimred_type}_{str(dim_num)}d"
-        filename = f"{fig_dir}{filename}.{self.format}"
+        exp_name = self.util.get_name()
+        filename = f"{label}_{exp_name}_{self.util.get_feattype_name()}_{sample_selection}_{dimred_type}_{str(dim_num)}d"
+        filename = os.path.join(fig_dir, f"{filename}.{self.format}")
         self.util.debug(f"computing {dimred_type}, this might take a while...")
         data = None
         labels = label_df[label]
@@ -573,6 +576,7 @@ class Plots:
             self.util.error(f"wrong dimension number: {dim_num}")
         fig = ax.figure
         plt.savefig(filename)
+        self.util.debug(f"plotted {dimred_type} scatter plot to {filename}")
         fig.clear()
         plt.close(fig)
         glob_conf.report.add_item(
@@ -599,8 +603,10 @@ class Plots:
         # remove fullstops in the name
         feature_name = feature.replace(".", "-")
         # one up because of the runs
-        fig_dir = self.util.get_path("fig_dir") + "../"
-        filename = f"{fig_dir}feat_dist_{title}_{feature_name}.{self.format}"
+        fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
+        filename = os.path.join(
+            fig_dir, f"feat_dist_{title}_{feature_name}.{self.format}"
+        )
         if self.util.is_categorical(df_labels[label]):
             df_plot = pd.DataFrame(
                 {label: df_labels[label], feature: df_features[feature]}
@@ -647,9 +653,9 @@ class Plots:
         # plt.tight_layout()
         # print(ax)
         # one up because of the runs
-        fig_dir = self.util.get_path("fig_dir") + "../"
+        fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
         exp_name = self.util.get_exp_name(only_data=True)
-        filename = f"{fig_dir}{exp_name}EXPL_tree-plot.{self.format}"
+        filename = os.path.join(fig_dir, f"{exp_name}EXPL_tree-plot.{self.format}")
         fig = ax.figure
         fig.savefig(filename)
         fig.clear()
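For context, the [EXPL] keys consulted in this file can be sketched as follows; value_counts is passed through ast.literal_eval, hence the Python list syntax, and the attribute names here are illustrative assumptions rather than values taken from the diff:

    [EXPL]
    value_counts = [['gender'], ['age']]
    sample_selection = all
    scatter.dim = 2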
nkululeko/predict.py CHANGED
@@ -1,8 +1,8 @@
 # predict.py
-# use some model and add automatically predicted labels to train and test splits
-# then save as a new dataset
+# use some model and add automatically predicted labels
+# also can labels train and test splits then save as a new dataset
 
-r"""This script is used to call the nkululeko PREDICT framework.
+r"""This script is used to call the nkululeko PREDICT module.
 
 It loads a configuration file, creates a new experiment,
 and performs automatic prediction on the train and test datasets. The predicted labels are added to the datasets and
@@ -60,9 +60,11 @@ def main():
     if "class_label" in df.columns:
         df = df.drop(columns=[target])
         df = df.rename(columns={"class_label": target})
-    name = util.get_data_name() + "_predicted"
-    df.to_csv(f"{expr.data_dir}/{name}.csv")
-    util.debug(f"saved {name}.csv to {expr.data_dir}")
+    sample_selection = util.config_val("PREDICT", "sample_selection", "all")
+    name = f"{sample_selection}_predicted"
+    res_dir = util.get_res_dir()
+    df.to_csv(os.path.join(res_dir, f"{name}.csv"))
+    util.debug(f"saved {os.path.join(res_dir, name)}.csv")
     print("DONE")
 
nkululeko/reporting/report.py CHANGED
@@ -5,7 +5,6 @@ Collector class for report items collected during module processing.
 
 """
 
-from nkululeko.reporting.latex_writer import LatexWriter
 from nkululeko.utils.util import Util
 
 
@@ -31,7 +30,10 @@ class Report:
             print("\t" + c.contents)
 
     def export_latex(self):
-        lw = LatexWriter()
-        for topic in self.report_items:
-            lw.add_items_for_section(topic, self.report_items[topic])
-        lw.finish_doc()
+        if str(self.util.config_val("REPORT", "show", "False")).lower() == "true":
+            from nkululeko.reporting.latex_writer import LatexWriter
+
+            lw = LatexWriter()
+            for topic in self.report_items:
+                lw.add_items_for_section(topic, self.report_items[topic])
+            lw.finish_doc()
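With this change, LatexWriter is imported lazily and only when report output is actually requested, so environments without the LaTeX tooling no longer pay for the import. A sketch of the gating option, using the key and default from the config_val call above:

    [REPORT]
    # export_latex() only builds the LaTeX document when this is true
    show = True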
nkululeko/reporting/reporter.py CHANGED
@@ -2,6 +2,7 @@ import ast
 import glob
 import json
 import math
+import os
 
 # import os
 from confidence_intervals import evaluate_with_conf_int
@@ -152,11 +153,14 @@ class Reporter:
         probas["truth"] = self.truths
         try:
             le = glob_conf.label_encoder
-            mapping = dict(zip(le.classes_, range(len(le.classes_))))
-            mapping_reverse = {value: key for key, value in mapping.items()}
-            probas = probas.rename(columns=mapping_reverse)
-            probas["predicted"] = probas["predicted"].map(mapping_reverse)
-            probas["truth"] = probas["truth"].map(mapping_reverse)
+            if le is not None:
+                mapping = dict(zip(le.classes_, range(len(le.classes_))))
+                mapping_reverse = {value: key for key, value in mapping.items()}
+                probas = probas.rename(columns=mapping_reverse)
+                probas["predicted"] = probas["predicted"].map(mapping_reverse)
+                probas["truth"] = probas["truth"].map(mapping_reverse)
+            else:
+                self.util.debug("Label encoder is None, skipping label mapping")
         except AttributeError as ae:
             self.util.debug(f"Can't label categories: {ae}")
         # compute entropy per sample
@@ -170,6 +174,17 @@ class Reporter:
         probas["correct"] = probas.predicted == probas.truth
         if file_name is None:
             file_name = self.util.get_pred_name() + ".csv"
+        else:
+            # Ensure the file_name goes to the results directory
+            if not os.path.isabs(file_name):
+                res_dir = self.util.get_res_dir()
+                if not file_name.endswith(".csv"):
+                    file_name = os.path.join(res_dir, file_name + ".csv")
+                else:
+                    file_name = os.path.join(res_dir, file_name)
+            else:
+                if not file_name.endswith(".csv"):
+                    file_name = file_name + ".csv"
         self.probas = probas
         probas.to_csv(file_name)
         self.util.debug(f"Saved probabilities to {file_name}")
nkululeko/test_predictor.py CHANGED
@@ -5,6 +5,7 @@ Predict targets from a model and save as csv file.
 """
 
 import ast
+import os
 
 import pandas as pd
 from sklearn.preprocessing import LabelEncoder
@@ -24,7 +25,12 @@ class TestPredictor:
         self.label_encoder = labenc
         self.target = glob_conf.config["DATA"]["target"]
         self.util = Util("test_predictor")
-        self.name = name
+        # Construct full path to results directory
+        res_dir = self.util.get_res_dir()
+        if os.path.isabs(name):
+            self.name = name
+        else:
+            self.name = os.path.join(res_dir, name)
 
     def predict_and_store(self):
         label_data = self.util.config_val("DATA", "label_data", False)
nkululeko/tests/__init__.py ADDED
@@ -0,0 +1 @@
+# Tests package for nkululeko