nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/augmenting/resampler.py +5 -2
- nkululeko/autopredict/ap_emotion.py +36 -0
- nkululeko/autopredict/ap_text.py +45 -0
- nkululeko/autopredict/tests/__init__.py +0 -0
- nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
- nkululeko/autopredict/whisper_transcriber.py +81 -0
- nkululeko/balance.py +222 -0
- nkululeko/constants.py +1 -1
- nkululeko/experiment.py +53 -3
- nkululeko/explore.py +32 -13
- nkululeko/feat_extract/feats_analyser.py +45 -17
- nkululeko/feat_extract/feats_emotion2vec.py +51 -26
- nkululeko/feat_extract/feats_praat.py +3 -3
- nkululeko/feat_extract/feats_praat_core.py +769 -0
- nkululeko/feat_extract/tests/__init__.py +1 -0
- nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
- nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
- nkululeko/glob_conf.py +9 -0
- nkululeko/modelrunner.py +15 -39
- nkululeko/models/model.py +4 -42
- nkululeko/models/model_tuned.py +416 -84
- nkululeko/models/model_xgb.py +148 -2
- nkululeko/models/tests/test_model_knn.py +49 -0
- nkululeko/models/tests/test_model_mlp.py +153 -0
- nkululeko/models/tests/test_model_xgb.py +33 -0
- nkululeko/nkululeko.py +0 -9
- nkululeko/plots.py +25 -19
- nkululeko/predict.py +8 -6
- nkululeko/reporting/report.py +7 -5
- nkululeko/reporting/reporter.py +20 -5
- nkululeko/test_predictor.py +7 -1
- nkululeko/tests/__init__.py +1 -0
- nkululeko/tests/test_balancing.py +270 -0
- nkululeko/utils/util.py +38 -6
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
- nkululeko/feat_extract/feats_opensmile copy.py +0 -93
- nkululeko/feat_extract/feinberg_praat.py +0 -628
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
nkululeko/models/model_xgb.py
CHANGED
@@ -1,18 +1,164 @@
|
|
1
1
|
# model_xgb.py
|
2
2
|
|
3
|
+
import os
|
3
4
|
from xgboost import XGBClassifier
|
4
5
|
|
6
|
+
import nkululeko.glob_conf as glob_conf
|
5
7
|
from nkululeko.models.model import Model
|
6
8
|
|
7
9
|
|
8
10
|
class XGB_model(Model):
|
9
|
-
"""An XGBoost model"""
|
11
|
+
"""An XGBoost model with early stopping support"""
|
10
12
|
|
11
13
|
def __init__(self, df_train, df_test, feats_train, feats_test):
|
12
14
|
super().__init__(df_train, df_test, feats_train, feats_test)
|
13
15
|
self.name = "xgb"
|
14
16
|
self.is_classifier = True
|
15
|
-
|
17
|
+
|
18
|
+
# Configure XGBoost parameters
|
19
|
+
xgb_params = {}
|
20
|
+
|
21
|
+
# Get early stopping configuration
|
22
|
+
self.early_stopping_rounds = self.util.config_val(
|
23
|
+
"MODEL", "early_stopping_rounds", False
|
24
|
+
)
|
25
|
+
self.eval_metric = self.util.config_val("MODEL", "eval_metric", "logloss")
|
26
|
+
|
27
|
+
# Set up other XGBoost parameters that can be configured
|
28
|
+
n_estimators = self.util.config_val("MODEL", "n_estimators", 100)
|
29
|
+
max_depth = self.util.config_val("MODEL", "max_depth", 6)
|
30
|
+
learning_rate = self.util.config_val("MODEL", "learning_rate", 0.3)
|
31
|
+
subsample = self.util.config_val("MODEL", "subsample", 1.0)
|
32
|
+
|
33
|
+
xgb_params["n_estimators"] = int(n_estimators)
|
34
|
+
xgb_params["max_depth"] = int(max_depth)
|
35
|
+
xgb_params["learning_rate"] = float(learning_rate)
|
36
|
+
xgb_params["subsample"] = float(subsample)
|
37
|
+
|
38
|
+
# Set random state for reproducibility
|
39
|
+
xgb_params["random_state"] = 42
|
40
|
+
|
41
|
+
# Add early stopping parameters to model initialization if configured
|
42
|
+
if self.early_stopping_rounds:
|
43
|
+
xgb_params["early_stopping_rounds"] = int(self.early_stopping_rounds)
|
44
|
+
xgb_params["eval_metric"] = self.eval_metric
|
45
|
+
|
46
|
+
# Initialize classifier with parameters
|
47
|
+
self.clf = XGBClassifier(**xgb_params)
|
48
|
+
|
49
|
+
def train(self):
|
50
|
+
"""Train the XGBoost model with optional early stopping."""
|
51
|
+
# Check if NANs in features and handle them
|
52
|
+
if self.feats_train.isna().to_numpy().any():
|
53
|
+
self.util.debug(
|
54
|
+
"Model, train: replacing"
|
55
|
+
f" {self.feats_train.isna().sum().sum()} NANs with 0"
|
56
|
+
)
|
57
|
+
self.feats_train = self.feats_train.fillna(0)
|
58
|
+
|
59
|
+
feats = self.feats_train.to_numpy()
|
60
|
+
labels = self.df_train[self.target]
|
61
|
+
|
62
|
+
# Configure fitting parameters
|
63
|
+
fit_params = {}
|
64
|
+
|
65
|
+
# Check if early stopping is configured
|
66
|
+
if self.early_stopping_rounds:
|
67
|
+
# Check if we're in split3 mode (train/dev/test) where validation data is available
|
68
|
+
import ast
|
69
|
+
|
70
|
+
split3 = ast.literal_eval(
|
71
|
+
self.util.config_val("EXP", "traindevtest", "False")
|
72
|
+
)
|
73
|
+
|
74
|
+
if split3 and self.feats_test is not None and self.df_test is not None:
|
75
|
+
# In split3 mode, self.feats_test and self.df_test are actually the dev set
|
76
|
+
feats_dev = self.feats_test.to_numpy()
|
77
|
+
labels_dev = self.df_test[self.target]
|
78
|
+
|
79
|
+
# Handle NANs in dev features
|
80
|
+
if self.feats_test.isna().to_numpy().any():
|
81
|
+
self.util.debug(
|
82
|
+
"Model, dev: replacing"
|
83
|
+
f" {self.feats_test.isna().sum().sum()} NANs with 0"
|
84
|
+
)
|
85
|
+
feats_dev = self.feats_test.fillna(0).to_numpy()
|
86
|
+
|
87
|
+
# Set up early stopping with validation data
|
88
|
+
eval_set = [(feats, labels), (feats_dev, labels_dev)]
|
89
|
+
fit_params["eval_set"] = eval_set
|
90
|
+
fit_params["verbose"] = True
|
91
|
+
|
92
|
+
self.util.debug(
|
93
|
+
f"Training XGBoost with early stopping (using dev set):"
|
94
|
+
)
|
95
|
+
self.util.debug(
|
96
|
+
f" - early_stopping_rounds: {self.early_stopping_rounds}"
|
97
|
+
)
|
98
|
+
self.util.debug(f" - eval_metric: {self.eval_metric}")
|
99
|
+
self.util.debug(f" - validation set size: {feats_dev.shape[0]}")
|
100
|
+
else:
|
101
|
+
# For train/test split only: use a portion of training data for validation
|
102
|
+
from sklearn.model_selection import train_test_split
|
103
|
+
|
104
|
+
# Get validation split ratio (default 0.2 = 20% of training data)
|
105
|
+
val_split = float(
|
106
|
+
self.util.config_val("MODEL", "validation_split", 0.2)
|
107
|
+
)
|
108
|
+
|
109
|
+
# Split training data into train and validation
|
110
|
+
feats_train_split, feats_val, labels_train_split, labels_val = (
|
111
|
+
train_test_split(
|
112
|
+
feats,
|
113
|
+
labels,
|
114
|
+
test_size=val_split,
|
115
|
+
random_state=42,
|
116
|
+
stratify=labels,
|
117
|
+
)
|
118
|
+
)
|
119
|
+
|
120
|
+
# Set up early stopping with validation split
|
121
|
+
eval_set = [
|
122
|
+
(feats_train_split, labels_train_split),
|
123
|
+
(feats_val, labels_val),
|
124
|
+
]
|
125
|
+
fit_params["eval_set"] = eval_set
|
126
|
+
fit_params["verbose"] = True
|
127
|
+
|
128
|
+
# Use the split training data for actual training
|
129
|
+
feats = feats_train_split
|
130
|
+
labels = labels_train_split
|
131
|
+
|
132
|
+
self.util.debug(
|
133
|
+
f"Training XGBoost with early stopping (using validation split):"
|
134
|
+
)
|
135
|
+
self.util.debug(
|
136
|
+
f" - early_stopping_rounds: {self.early_stopping_rounds}"
|
137
|
+
)
|
138
|
+
self.util.debug(f" - eval_metric: {self.eval_metric}")
|
139
|
+
self.util.debug(f" - validation_split: {val_split}")
|
140
|
+
self.util.debug(f" - training set size: {feats_train_split.shape[0]}")
|
141
|
+
self.util.debug(f" - validation set size: {feats_val.shape[0]}")
|
142
|
+
|
143
|
+
# Handle class weights if configured
|
144
|
+
class_weight = self.util.config_val("MODEL", "class_weight", False)
|
145
|
+
if class_weight:
|
146
|
+
import sklearn.utils.class_weight
|
147
|
+
|
148
|
+
self.util.debug("using class weight")
|
149
|
+
classes_weights = sklearn.utils.class_weight.compute_sample_weight(
|
150
|
+
class_weight="balanced", y=labels
|
151
|
+
)
|
152
|
+
fit_params["sample_weight"] = classes_weights
|
153
|
+
|
154
|
+
# Train the model
|
155
|
+
self.clf.fit(feats, labels, **fit_params)
|
156
|
+
|
157
|
+
# Log information about the trained model
|
158
|
+
if hasattr(self.clf, "best_iteration"):
|
159
|
+
self.util.debug(f"Best iteration: {self.clf.best_iteration}")
|
160
|
+
if hasattr(self.clf, "best_score"):
|
161
|
+
self.util.debug(f"Best score: {self.clf.best_score}")
|
16
162
|
|
17
163
|
def get_type(self):
|
18
164
|
return "xgb"
|
@@ -0,0 +1,49 @@
|
|
1
|
+
from unittest.mock import MagicMock, patch
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
from nkululeko.models.model_knn import KNN_model
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.fixture
|
9
|
+
def mock_util():
|
10
|
+
mock = MagicMock()
|
11
|
+
mock.config_val.side_effect = lambda section, key, default: {
|
12
|
+
("MODEL", "KNN_weights", "uniform"): "distance",
|
13
|
+
("MODEL", "K_val", "5"): "3"
|
14
|
+
}[(section, key, default)]
|
15
|
+
return mock
|
16
|
+
|
17
|
+
@pytest.fixture
|
18
|
+
def dummy_data():
|
19
|
+
df_train = MagicMock()
|
20
|
+
df_test = MagicMock()
|
21
|
+
feats_train = MagicMock()
|
22
|
+
feats_test = MagicMock()
|
23
|
+
return df_train, df_test, feats_train, feats_test
|
24
|
+
|
25
|
+
def test_knn_model_initialization(monkeypatch, mock_util, dummy_data):
|
26
|
+
with patch.object(KNN_model, "__init__", return_value=None):
|
27
|
+
model = KNN_model(*dummy_data)
|
28
|
+
model.util = mock_util
|
29
|
+
model.name = "knn"
|
30
|
+
from sklearn.neighbors import KNeighborsClassifier
|
31
|
+
model.clf = KNeighborsClassifier(n_neighbors=3, weights="distance")
|
32
|
+
model.is_classifier = True
|
33
|
+
assert model.name == "knn"
|
34
|
+
assert model.clf.get_params()["n_neighbors"] == 3
|
35
|
+
assert model.clf.get_params()["weights"] == "distance"
|
36
|
+
assert model.is_classifier is True
|
37
|
+
|
38
|
+
def test_knn_model_default_params(monkeypatch, dummy_data):
|
39
|
+
mock_util = MagicMock()
|
40
|
+
mock_util.config_val.side_effect = lambda section, key, default: default
|
41
|
+
with patch.object(KNN_model, "__init__", return_value=None):
|
42
|
+
model = KNN_model(*dummy_data)
|
43
|
+
model.util = mock_util
|
44
|
+
model.name = "knn"
|
45
|
+
from sklearn.neighbors import KNeighborsClassifier
|
46
|
+
model.clf = KNeighborsClassifier(n_neighbors=5, weights="uniform")
|
47
|
+
model.is_classifier = True
|
48
|
+
assert model.clf.get_params()["n_neighbors"] == 5
|
49
|
+
assert model.clf.get_params()["weights"] == "uniform"
|
@@ -0,0 +1,153 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
import pytest
|
4
|
+
import torch
|
5
|
+
from unittest.mock import patch
|
6
|
+
|
7
|
+
from nkululeko.models.model_mlp import MLPModel
|
8
|
+
|
9
|
+
|
10
|
+
class DummyUtil:
|
11
|
+
def config_val(self, section, key, default=None):
|
12
|
+
# Provide defaults for required config values
|
13
|
+
if key == "manual_seed":
|
14
|
+
return True
|
15
|
+
if key == "loss":
|
16
|
+
return "cross"
|
17
|
+
if key == "device":
|
18
|
+
return "cpu"
|
19
|
+
if key == "learning_rate":
|
20
|
+
return 0.001
|
21
|
+
if key == "batch_size":
|
22
|
+
return 2
|
23
|
+
if key == "drop":
|
24
|
+
return False
|
25
|
+
return default
|
26
|
+
def debug(self, msg): pass
|
27
|
+
def error(self, msg): raise Exception(msg)
|
28
|
+
def get_path(self, key): return "./"
|
29
|
+
def get_exp_name(self, only_train=False): return "exp"
|
30
|
+
|
31
|
+
@pytest.fixture(autouse=True)
|
32
|
+
def patch_globals(monkeypatch):
|
33
|
+
# Patch global config and labels
|
34
|
+
import nkululeko.glob_conf as glob_conf
|
35
|
+
glob_conf.config = {
|
36
|
+
"DATA": {"target": "label"},
|
37
|
+
"MODEL": {"layers": "{'a': 8, 'b': 4}"}
|
38
|
+
}
|
39
|
+
glob_conf.labels = [0, 1]
|
40
|
+
yield
|
41
|
+
|
42
|
+
@pytest.fixture
|
43
|
+
def dummy_data():
|
44
|
+
# 4 samples, 3 features
|
45
|
+
feats_train = pd.DataFrame(np.random.rand(4, 3), columns=['f1', 'f2', 'f3'])
|
46
|
+
feats_test = pd.DataFrame(np.random.rand(2, 3), columns=['f1', 'f2', 'f3'])
|
47
|
+
df_train = pd.DataFrame({'label': [0, 1, 0, 1]})
|
48
|
+
df_test = pd.DataFrame({'label': [1, 0]})
|
49
|
+
return df_train, df_test, feats_train, feats_test
|
50
|
+
|
51
|
+
@pytest.fixture
|
52
|
+
def mlp_model(dummy_data, monkeypatch):
|
53
|
+
df_train, df_test, feats_train, feats_test = dummy_data
|
54
|
+
with patch.object(MLPModel, "__init__", return_value=None):
|
55
|
+
model = MLPModel(df_train, df_test, feats_train, feats_test)
|
56
|
+
model.util = DummyUtil()
|
57
|
+
model.n_jobs = 1
|
58
|
+
model.target = "label"
|
59
|
+
model.class_num = 2
|
60
|
+
model.criterion = torch.nn.CrossEntropyLoss()
|
61
|
+
model.device = "cpu"
|
62
|
+
model.learning_rate = 0.001
|
63
|
+
model.batch_size = 2
|
64
|
+
model.num_workers = 1
|
65
|
+
model.loss = 0.0
|
66
|
+
model.loss_eval = 0.0
|
67
|
+
model.run = 0
|
68
|
+
model.epoch = 0
|
69
|
+
model.df_test = df_test
|
70
|
+
model.feats_test = feats_test
|
71
|
+
model.feats_train = feats_train
|
72
|
+
|
73
|
+
# Create a simple MLP model for testing
|
74
|
+
model.model = MLPModel.MLP(3, {'a': 8, 'b': 4}, 2, False).to("cpu")
|
75
|
+
model.optimizer = torch.optim.Adam(model.model.parameters(), lr=0.001)
|
76
|
+
|
77
|
+
# Create data loaders
|
78
|
+
model.trainloader = model.get_loader(feats_train, df_train, True)
|
79
|
+
model.testloader = model.get_loader(feats_test, df_test, False)
|
80
|
+
model.store_path = "/tmp/test_model.pt"
|
81
|
+
|
82
|
+
return model
|
83
|
+
|
84
|
+
def test_mlpmodel_init(mlp_model):
|
85
|
+
assert hasattr(mlp_model, "model")
|
86
|
+
assert hasattr(mlp_model, "trainloader")
|
87
|
+
assert hasattr(mlp_model, "testloader")
|
88
|
+
assert mlp_model.model is not None
|
89
|
+
|
90
|
+
def test_train_and_predict(mlp_model):
|
91
|
+
mlp_model.train()
|
92
|
+
report = mlp_model.predict()
|
93
|
+
assert hasattr(report, "result")
|
94
|
+
assert hasattr(report.result, "train")
|
95
|
+
|
96
|
+
def test_get_predictions(mlp_model):
|
97
|
+
mlp_model.train()
|
98
|
+
preds = mlp_model.get_predictions()
|
99
|
+
assert isinstance(preds, np.ndarray)
|
100
|
+
assert preds.shape[0] == 2
|
101
|
+
|
102
|
+
def test_get_probas(mlp_model):
|
103
|
+
mlp_model.train()
|
104
|
+
_, _, _, logits = mlp_model.evaluate(mlp_model.model, mlp_model.testloader, mlp_model.device)
|
105
|
+
probas = mlp_model.get_probas(logits)
|
106
|
+
assert isinstance(probas, pd.DataFrame)
|
107
|
+
assert set(probas.columns) == set([0, 1])
|
108
|
+
|
109
|
+
def test_predict_sample(mlp_model):
|
110
|
+
mlp_model.train()
|
111
|
+
feats = np.random.rand(3)
|
112
|
+
res = mlp_model.predict_sample(feats)
|
113
|
+
assert isinstance(res, dict)
|
114
|
+
assert set(res.keys()) == set([0, 1])
|
115
|
+
|
116
|
+
def test_predict_shap(mlp_model):
|
117
|
+
mlp_model.train()
|
118
|
+
feats = pd.DataFrame(np.random.rand(2, 3))
|
119
|
+
results = mlp_model.predict_shap(feats)
|
120
|
+
assert len(results) == 2
|
121
|
+
|
122
|
+
def test_store_and_load(tmp_path, mlp_model, monkeypatch):
|
123
|
+
mlp_model.train()
|
124
|
+
|
125
|
+
# Mock the util methods that load() uses to construct the path
|
126
|
+
def mock_get_path(key):
|
127
|
+
if key == "model_dir":
|
128
|
+
return str(tmp_path) + "/"
|
129
|
+
return "./"
|
130
|
+
|
131
|
+
def mock_get_exp_name(only_train=False):
|
132
|
+
return "model"
|
133
|
+
|
134
|
+
mlp_model.util.get_path = mock_get_path
|
135
|
+
mlp_model.util.get_exp_name = mock_get_exp_name
|
136
|
+
|
137
|
+
# Set store path to match what load() will construct
|
138
|
+
mlp_model.store_path = str(tmp_path) + "/model_0_000.model"
|
139
|
+
mlp_model.store()
|
140
|
+
|
141
|
+
# Simulate loading
|
142
|
+
mlp_model.load(0, 0)
|
143
|
+
assert mlp_model.model is not None
|
144
|
+
|
145
|
+
def test_set_testdata(mlp_model, dummy_data):
|
146
|
+
_, df_test, _, feats_test = dummy_data
|
147
|
+
mlp_model.set_testdata(df_test, feats_test)
|
148
|
+
assert mlp_model.testloader is not None
|
149
|
+
|
150
|
+
def test_reset_test(mlp_model, dummy_data):
|
151
|
+
_, df_test, _, feats_test = dummy_data
|
152
|
+
mlp_model.reset_test(df_test, feats_test)
|
153
|
+
assert mlp_model.testloader is not None
|
@@ -0,0 +1,33 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import pytest
|
3
|
+
|
4
|
+
from ..model_xgb import XGB_model
|
5
|
+
|
6
|
+
|
7
|
+
class DummyUtil:
|
8
|
+
def config_val(self, section, key, default):
|
9
|
+
return default
|
10
|
+
def debug(self, msg):
|
11
|
+
pass
|
12
|
+
|
13
|
+
class DummyModel(XGB_model):
|
14
|
+
def __init__(self, df_train, df_test, feats_train, feats_test):
|
15
|
+
# Patch util before calling super().__init__
|
16
|
+
self.util = DummyUtil()
|
17
|
+
self.target = "label"
|
18
|
+
super().__init__(df_train, df_test, feats_train, feats_test)
|
19
|
+
self.util = DummyUtil()
|
20
|
+
self.target = "label"
|
21
|
+
|
22
|
+
@pytest.fixture
|
23
|
+
def dummy_data():
|
24
|
+
df_train = pd.DataFrame({"label": [0, 1], "f1": [1.0, 2.0]})
|
25
|
+
df_test = pd.DataFrame({"label": [0, 1], "f1": [1.5, 2.5]})
|
26
|
+
feats_train = df_train[["f1"]]
|
27
|
+
feats_test = df_test[["f1"]]
|
28
|
+
return df_train, df_test, feats_train, feats_test
|
29
|
+
|
30
|
+
def test_get_type_returns_xgb(dummy_data):
|
31
|
+
df_train, df_test, feats_train, feats_test = dummy_data
|
32
|
+
model = DummyModel(df_train, df_test, feats_train, feats_test)
|
33
|
+
assert model.get_type() == "xgb"
|
nkululeko/nkululeko.py
CHANGED
@@ -54,15 +54,6 @@ def doit(config_file):
|
|
54
54
|
reports, last_epochs = expr.run()
|
55
55
|
result = expr.get_best_report(reports).result.test
|
56
56
|
expr.store_report()
|
57
|
-
|
58
|
-
# check if we want to export the model
|
59
|
-
o_path = util.config_val("EXP", "export_onnx", "False")
|
60
|
-
if eval(o_path):
|
61
|
-
print(f"Exporting ONNX model to {o_path}")
|
62
|
-
o_path = o_path.replace('"', '')
|
63
|
-
expr.runmgr.get_best_model().export_onnx(str(o_path))
|
64
|
-
|
65
|
-
|
66
57
|
print("DONE")
|
67
58
|
return result, int(np.asarray(last_epochs).min())
|
68
59
|
|
nkululeko/plots.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# plots.py
|
2
2
|
import ast
|
3
|
+
import os
|
3
4
|
|
4
5
|
import matplotlib.pyplot as plt
|
5
6
|
import numpy as np
|
@@ -87,9 +88,10 @@ class Plots:
|
|
87
88
|
|
88
89
|
def plot_distributions(self, df, type_s="samples"):
|
89
90
|
class_label, df = self._check_binning("class_label", df)
|
90
|
-
|
91
|
-
|
92
|
-
|
91
|
+
value_counts_conf = self.util.config_val("EXPL", "value_counts", False)
|
92
|
+
if not isinstance(value_counts_conf, str):
|
93
|
+
value_counts_conf = str(value_counts_conf)
|
94
|
+
attributes = ast.literal_eval(value_counts_conf)
|
93
95
|
# always plot the distribution of the main attribute
|
94
96
|
filename = f"{class_label}_distribution"
|
95
97
|
if self.util.is_categorical(df[class_label]):
|
@@ -216,11 +218,11 @@ class Plots:
|
|
216
218
|
|
217
219
|
def save_plot(self, ax, caption, header, filename, type_s):
|
218
220
|
# one up because of the runs
|
219
|
-
fig_dir = self.util.get_path("fig_dir")
|
221
|
+
fig_dir = os.path.dirname(self.util.get_path("fig_dir"))
|
220
222
|
fig_plots = ax.figure
|
221
223
|
# avoid warning
|
222
224
|
# plt.tight_layout()
|
223
|
-
img_path = f"{
|
225
|
+
img_path = os.path.join(fig_dir, f"{filename}_{type_s}.{self.format}")
|
224
226
|
plt.savefig(img_path)
|
225
227
|
plt.close(fig_plots)
|
226
228
|
self.util.debug(f"Saved plot to {img_path}")
|
@@ -359,7 +361,7 @@ class Plots:
|
|
359
361
|
|
360
362
|
def plot_durations(self, df, filename, sample_selection, caption=""):
|
361
363
|
# one up because of the runs
|
362
|
-
fig_dir = self.util.get_path("fig_dir")
|
364
|
+
fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
|
363
365
|
try:
|
364
366
|
ax = sns.histplot(df, x="duration", hue="class_label", kde=True)
|
365
367
|
except AttributeError as ae:
|
@@ -376,7 +378,7 @@ class Plots:
|
|
376
378
|
ax.set_ylabel("number of samples")
|
377
379
|
fig = ax.figure
|
378
380
|
# plt.tight_layout()
|
379
|
-
img_path = f"{
|
381
|
+
img_path = os.path.join(fig_dir, f"{filename}_{sample_selection}.{self.format}")
|
380
382
|
plt.savefig(img_path)
|
381
383
|
plt.close(fig)
|
382
384
|
self.util.debug(f"plotted durations to {img_path}")
|
@@ -393,14 +395,14 @@ class Plots:
|
|
393
395
|
filename = "speakers"
|
394
396
|
caption = "speakers"
|
395
397
|
# one up because of the runs
|
396
|
-
fig_dir = self.util.get_path("fig_dir")
|
398
|
+
fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
|
397
399
|
sns.set_style("whitegrid") # Set style for chart
|
398
400
|
ax = df["speaker"].value_counts().plot(kind="pie", autopct="%1.1f%%")
|
399
401
|
title = f"Speaker distr. for {sample_selection} {df.shape[0]}."
|
400
402
|
ax.set_title(title)
|
401
403
|
fig = ax.figure
|
402
404
|
# plt.tight_layout()
|
403
|
-
img_path = f"{
|
405
|
+
img_path = os.path.join(fig_dir, f"{filename}_{sample_selection}.{self.format}")
|
404
406
|
plt.savefig(img_path)
|
405
407
|
plt.close(fig)
|
406
408
|
self.util.debug(f"plotted speakers to {img_path}")
|
@@ -415,7 +417,7 @@ class Plots:
|
|
415
417
|
|
416
418
|
def describe_df(self, name, df, target, filename):
|
417
419
|
"""Make a stacked barplot of samples and speakers per sex and target values. speaker, gender and target columns must be present"""
|
418
|
-
fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs
|
420
|
+
fig_dir = self.util.get_path("fig_dir") # + "../" # one up because of the runs
|
419
421
|
sampl_num = df.shape[0]
|
420
422
|
sex_col = "gender"
|
421
423
|
if target == "gender":
|
@@ -447,7 +449,7 @@ class Plots:
|
|
447
449
|
kind="bar", ax=axes, title=f"samples ({sampl_num})"
|
448
450
|
)
|
449
451
|
# plt.tight_layout()
|
450
|
-
img_path = f"{
|
452
|
+
img_path = os.path.join(fig_dir, f"{filename}.{self.format}")
|
451
453
|
plt.savefig(img_path)
|
452
454
|
fig.clear()
|
453
455
|
plt.close(fig)
|
@@ -462,11 +464,12 @@ class Plots:
|
|
462
464
|
|
463
465
|
def scatter_plot(self, feats, label_df, label, dimred_type):
|
464
466
|
dim_num = int(self.util.config_val("EXPL", "scatter.dim", 2))
|
465
|
-
# one up because of the runs
|
466
|
-
fig_dir = self.util.get_path("fig_dir")
|
467
|
+
# one up because of the runs (for explore module)
|
468
|
+
fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
|
467
469
|
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
|
468
|
-
|
469
|
-
filename = f"{
|
470
|
+
exp_name = self.util.get_name()
|
471
|
+
filename = f"{label}_{exp_name}_{self.util.get_feattype_name()}_{sample_selection}_{dimred_type}_{str(dim_num)}d"
|
472
|
+
filename = os.path.join(fig_dir, f"{filename}.{self.format}")
|
470
473
|
self.util.debug(f"computing {dimred_type}, this might take a while...")
|
471
474
|
data = None
|
472
475
|
labels = label_df[label]
|
@@ -573,6 +576,7 @@ class Plots:
|
|
573
576
|
self.util.error(f"wrong dimension number: {dim_num}")
|
574
577
|
fig = ax.figure
|
575
578
|
plt.savefig(filename)
|
579
|
+
self.util.debug(f"plotted {dimred_type} scatter plot to {filename}")
|
576
580
|
fig.clear()
|
577
581
|
plt.close(fig)
|
578
582
|
glob_conf.report.add_item(
|
@@ -599,8 +603,10 @@ class Plots:
|
|
599
603
|
# remove fullstops in the name
|
600
604
|
feature_name = feature.replace(".", "-")
|
601
605
|
# one up because of the runs
|
602
|
-
fig_dir = self.util.get_path("fig_dir")
|
603
|
-
filename =
|
606
|
+
fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
|
607
|
+
filename = os.path.join(
|
608
|
+
fig_dir, f"feat_dist_{title}_{feature_name}.{self.format}"
|
609
|
+
)
|
604
610
|
if self.util.is_categorical(df_labels[label]):
|
605
611
|
df_plot = pd.DataFrame(
|
606
612
|
{label: df_labels[label], feature: df_features[feature]}
|
@@ -647,9 +653,9 @@ class Plots:
|
|
647
653
|
# plt.tight_layout()
|
648
654
|
# print(ax)
|
649
655
|
# one up because of the runs
|
650
|
-
fig_dir = self.util.get_path("fig_dir")
|
656
|
+
fig_dir = os.path.join(self.util.get_path("fig_dir"), "..")
|
651
657
|
exp_name = self.util.get_exp_name(only_data=True)
|
652
|
-
filename = f"{
|
658
|
+
filename = os.path.join(fig_dir, f"{exp_name}EXPL_tree-plot.{self.format}")
|
653
659
|
fig = ax.figure
|
654
660
|
fig.savefig(filename)
|
655
661
|
fig.clear()
|
nkululeko/predict.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# predict.py
|
2
|
-
# use some model and add automatically predicted labels
|
3
|
-
# then save as a new dataset
|
2
|
+
# use some model and add automatically predicted labels
|
3
|
+
# also can labels train and test splits then save as a new dataset
|
4
4
|
|
5
|
-
r"""This script is used to call the nkululeko PREDICT
|
5
|
+
r"""This script is used to call the nkululeko PREDICT module.
|
6
6
|
|
7
7
|
It loads a configuration file, creates a new experiment,
|
8
8
|
and performs automatic prediction on the train and test datasets. The predicted labels are added to the datasets and
|
@@ -60,9 +60,11 @@ def main():
|
|
60
60
|
if "class_label" in df.columns:
|
61
61
|
df = df.drop(columns=[target])
|
62
62
|
df = df.rename(columns={"class_label": target})
|
63
|
-
|
64
|
-
|
65
|
-
util.
|
63
|
+
sample_selection = util.config_val("PREDICT", "sample_selection", "all")
|
64
|
+
name = f"{sample_selection}_predicted"
|
65
|
+
res_dir = util.get_res_dir()
|
66
|
+
df.to_csv(os.path.join(res_dir, f"{name}.csv"))
|
67
|
+
util.debug(f"saved {os.path.join(res_dir, name)}.csv")
|
66
68
|
print("DONE")
|
67
69
|
|
68
70
|
|
nkululeko/reporting/report.py
CHANGED
@@ -5,7 +5,6 @@ Collector class for report items collected during module processing.
|
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
8
|
-
from nkululeko.reporting.latex_writer import LatexWriter
|
9
8
|
from nkululeko.utils.util import Util
|
10
9
|
|
11
10
|
|
@@ -31,7 +30,10 @@ class Report:
|
|
31
30
|
print("\t" + c.contents)
|
32
31
|
|
33
32
|
def export_latex(self):
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
33
|
+
if str(self.util.config_val("REPORT", "show", "False")).lower() == "true":
|
34
|
+
from nkululeko.reporting.latex_writer import LatexWriter
|
35
|
+
|
36
|
+
lw = LatexWriter()
|
37
|
+
for topic in self.report_items:
|
38
|
+
lw.add_items_for_section(topic, self.report_items[topic])
|
39
|
+
lw.finish_doc()
|
nkululeko/reporting/reporter.py
CHANGED
@@ -2,6 +2,7 @@ import ast
|
|
2
2
|
import glob
|
3
3
|
import json
|
4
4
|
import math
|
5
|
+
import os
|
5
6
|
|
6
7
|
# import os
|
7
8
|
from confidence_intervals import evaluate_with_conf_int
|
@@ -152,11 +153,14 @@ class Reporter:
|
|
152
153
|
probas["truth"] = self.truths
|
153
154
|
try:
|
154
155
|
le = glob_conf.label_encoder
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
156
|
+
if le is not None:
|
157
|
+
mapping = dict(zip(le.classes_, range(len(le.classes_))))
|
158
|
+
mapping_reverse = {value: key for key, value in mapping.items()}
|
159
|
+
probas = probas.rename(columns=mapping_reverse)
|
160
|
+
probas["predicted"] = probas["predicted"].map(mapping_reverse)
|
161
|
+
probas["truth"] = probas["truth"].map(mapping_reverse)
|
162
|
+
else:
|
163
|
+
self.util.debug("Label encoder is None, skipping label mapping")
|
160
164
|
except AttributeError as ae:
|
161
165
|
self.util.debug(f"Can't label categories: {ae}")
|
162
166
|
# compute entropy per sample
|
@@ -170,6 +174,17 @@ class Reporter:
|
|
170
174
|
probas["correct"] = probas.predicted == probas.truth
|
171
175
|
if file_name is None:
|
172
176
|
file_name = self.util.get_pred_name() + ".csv"
|
177
|
+
else:
|
178
|
+
# Ensure the file_name goes to the results directory
|
179
|
+
if not os.path.isabs(file_name):
|
180
|
+
res_dir = self.util.get_res_dir()
|
181
|
+
if not file_name.endswith(".csv"):
|
182
|
+
file_name = os.path.join(res_dir, file_name + ".csv")
|
183
|
+
else:
|
184
|
+
file_name = os.path.join(res_dir, file_name)
|
185
|
+
else:
|
186
|
+
if not file_name.endswith(".csv"):
|
187
|
+
file_name = file_name + ".csv"
|
173
188
|
self.probas = probas
|
174
189
|
probas.to_csv(file_name)
|
175
190
|
self.util.debug(f"Saved probabilities to {file_name}")
|
nkululeko/test_predictor.py
CHANGED
@@ -5,6 +5,7 @@ Predict targets from a model and save as csv file.
|
|
5
5
|
"""
|
6
6
|
|
7
7
|
import ast
|
8
|
+
import os
|
8
9
|
|
9
10
|
import pandas as pd
|
10
11
|
from sklearn.preprocessing import LabelEncoder
|
@@ -24,7 +25,12 @@ class TestPredictor:
|
|
24
25
|
self.label_encoder = labenc
|
25
26
|
self.target = glob_conf.config["DATA"]["target"]
|
26
27
|
self.util = Util("test_predictor")
|
27
|
-
|
28
|
+
# Construct full path to results directory
|
29
|
+
res_dir = self.util.get_res_dir()
|
30
|
+
if os.path.isabs(name):
|
31
|
+
self.name = name
|
32
|
+
else:
|
33
|
+
self.name = os.path.join(res_dir, name)
|
28
34
|
|
29
35
|
def predict_and_store(self):
|
30
36
|
label_data = self.util.config_val("DATA", "label_data", False)
|
@@ -0,0 +1 @@
|
|
1
|
+
# Tests package for nkululeko
|