nkululeko 0.59.1__py3-none-any.whl → 0.61.0__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- nkululeko/constants.py +1 -1
- nkululeko/experiment.py +43 -43
- nkululeko/feature_extractor.py +101 -58
- nkululeko/modelrunner.py +14 -14
- nkululeko/plots.py +11 -0
- nkululeko/segment.py +23 -27
- nkululeko/test_predictor.py +1 -1
- {nkululeko-0.59.1.dist-info → nkululeko-0.61.0.dist-info}/METADATA +13 -1
- nkululeko-0.61.0.dist-info/RECORD +31 -0
- {nkululeko-0.59.1.dist-info → nkululeko-0.61.0.dist-info}/WHEEL +1 -1
- nkululeko/ap_age.py +0 -31
- nkululeko/ap_arousal.py +0 -30
- nkululeko/ap_dominance.py +0 -29
- nkululeko/ap_gender.py +0 -29
- nkululeko/ap_mos.py +0 -35
- nkululeko/ap_pesq.py +0 -35
- nkululeko/ap_sdr.py +0 -36
- nkululeko/ap_snr.py +0 -35
- nkululeko/ap_stoi.py +0 -34
- nkululeko/ap_valence.py +0 -30
- nkululeko/augmenter.py +0 -64
- nkululeko/dataset.py +0 -415
- nkululeko/dataset_csv.py +0 -49
- nkululeko/dataset_ravdess.py +0 -19
- nkululeko/estimate_snr.py +0 -89
- nkululeko/feats_agender.py +0 -63
- nkululeko/feats_agender_agender.py +0 -65
- nkululeko/feats_analyser.py +0 -87
- nkululeko/feats_audmodel.py +0 -63
- nkululeko/feats_audmodel_dim.py +0 -63
- nkululeko/feats_clap.py +0 -74
- nkululeko/feats_import.py +0 -44
- nkululeko/feats_mld.py +0 -47
- nkululeko/feats_mos.py +0 -92
- nkululeko/feats_opensmile.py +0 -84
- nkululeko/feats_oxbow.py +0 -87
- nkululeko/feats_praat.py +0 -72
- nkululeko/feats_snr.py +0 -63
- nkululeko/feats_squim.py +0 -99
- nkululeko/feats_trill.py +0 -74
- nkululeko/feats_wav2vec2.py +0 -94
- nkululeko/featureset.py +0 -41
- nkululeko/feinberg_praat.py +0 -430
- nkululeko/loss_ccc.py +0 -28
- nkululeko/loss_softf1loss.py +0 -40
- nkululeko/model.py +0 -256
- nkululeko/model_bayes.py +0 -14
- nkululeko/model_cnn.py +0 -118
- nkululeko/model_gmm.py +0 -16
- nkululeko/model_knn.py +0 -16
- nkululeko/model_knn_reg.py +0 -16
- nkululeko/model_mlp.py +0 -175
- nkululeko/model_mlp_regression.py +0 -197
- nkululeko/model_svm.py +0 -18
- nkululeko/model_svr.py +0 -18
- nkululeko/model_tree.py +0 -14
- nkululeko/model_tree_reg.py +0 -14
- nkululeko/model_xgb.py +0 -12
- nkululeko/model_xgr.py +0 -12
- nkululeko/randomsplicer.py +0 -76
- nkululeko/randomsplicing.py +0 -74
- nkululeko-0.59.1.dist-info/RECORD +0 -82
- {nkululeko-0.59.1.dist-info → nkululeko-0.61.0.dist-info}/LICENSE +0 -0
- {nkululeko-0.59.1.dist-info → nkululeko-0.61.0.dist-info}/top_level.txt +0 -0
nkululeko/model.py
DELETED
@@ -1,256 +0,0 @@
-# model.py
-from nkululeko.util import Util
-import pandas as pd
-import numpy as np
-import nkululeko.glob_conf as glob_conf
-import sklearn.utils
-from nkululeko.reporter import Reporter
-import ast
-from sklearn.model_selection import GridSearchCV
-import pickle
-import random
-from sklearn.model_selection import LeaveOneGroupOut
-from sklearn.model_selection import StratifiedKFold
-
-
-class Model:
-    """Generic model class for linear (non-neural) algorithms"""
-
-    def __init__(self, df_train, df_test, feats_train, feats_test):
-        """Constructor taking the configuration and all dataframes"""
-        self.df_train, self.df_test, self.feats_train, self.feats_test = df_train, df_test, feats_train, feats_test
-        self.util = Util('model')
-        self.target = self.util.config_val('DATA', 'target', 'emotion')
-        self.run = 0
-        self.epoch = 0
-        self.logo = self.util.config_val('MODEL', 'logo', False)
-        self.xfoldx = self.util.config_val('MODEL', 'k_fold_cross', False)
-
-    def set_testdata(self, data_df, feats_df):
-        self.df_test, self.feats_test = data_df, feats_df
-
-    def reset_test(self, df_test, feats_test):
-        self.df_test, self.feats_test = df_test, feats_test
-
-
-    def set_id(self, run, epoch):
-        self.run = run
-        self.epoch = epoch
-        dir = self.util.get_path('model_dir')
-        name = f'{self.util.get_exp_name(only_train=True)}_{self.run}_{self.epoch:03d}.model'
-        self.store_path = dir+name
-
-
-    def _x_fold_cross(self):
-        # ignore train and test sets and do a x-fold-cross evaluation
-        self.util.debug(f'ignoring splits and doing {self.xfoldx} fold cross validation')
-        feats = pd.concat([self.feats_train, self.feats_test])
-        annos = pd.concat([self.df_train, self.df_test])
-        targets = annos[self.target]
-        _skf = StratifiedKFold(n_splits=int(self.xfoldx))
-        truths, preds, results = [], [], []
-        g_index = 0
-        # leave-one-speaker loop
-        for train_index, test_index in _skf.split(
-            feats,
-            targets,
-        ):
-            train_x = feats.iloc[train_index].to_numpy()
-            train_y = targets[train_index]
-            self.clf.fit(train_x, train_y)
-            truth_x = feats.iloc[test_index].to_numpy()
-            truth_y = targets[test_index]
-            predict_y = self.clf.predict(truth_x)
-            report = Reporter(truth_y.astype(float), predict_y, self.run, self.epoch)
-            self.util.debug(f'result for fold {g_index}: {report.get_result().get_test_result()} ')
-            results.append(float(report.get_result().test))
-            truths.append(truth_y)
-            preds.append(predict_y)
-            g_index += 1
-
-        # combine speaker folds
-        truth = pd.concat(truths)
-        truth.name = 'truth'
-        pred = pd.Series(
-            np.concatenate(preds),
-            index=truth.index,
-            name='prediction',
-        )
-        self.truths = truth
-        self.preds = pred
-        results = np.asarray(results)
-        self.util.debug(f'KFOLD: {self.xfoldx} folds: mean {results.mean():.3f}, std: {results.std():.3f}')
-    def _do_logo(self):
-        # ignore train and test sets and do a "leave one speaker group out" evaluation
-        self.util.debug(f'ignoring splits and doing LOGO with {self.logo} groups')
-        logo = int(self.logo)
-        feats = pd.concat([self.feats_train, self.feats_test])
-        annos = pd.concat([self.df_train, self.df_test])
-        targets = annos[self.target]
-        _logo = LeaveOneGroupOut()
-        truths, preds, results = [], [], []
-        # get unique list of speakers
-        speakers = annos['speaker'].unique()
-        # check for folds columns
-        if not 'fold' in annos.columns:
-            self.util.debug(f'creating random folds for {logo} groups')
-            # create a random dictionary of groups
-            sdict = {}
-            # randomize the speaker order
-            random.shuffle(speakers)
-            folds = list(range(logo))
-            for i, s in enumerate(speakers):
-                sdict[s] = folds[i % len(folds)]
-            # add this to the annotations
-            annos['fold'] = annos['speaker'].apply(lambda x: str(sdict[x]))
-        else:
-            fold_count = annos['fold'].nunique()
-            self.util.debug(f'using existing folds for {fold_count} groups')
-        g_index = 0
-        # leave-one-group loop
-        for train_index, test_index in _logo.split(
-            feats,
-            targets,
-            groups=annos['fold'],
-        ):
-            train_x = feats.iloc[train_index].to_numpy()
-            train_y = targets[train_index]
-            self.clf.fit(train_x, train_y)
-
-            truth_x = feats.iloc[test_index].to_numpy()
-            truth_y = targets[test_index]
-            predict_y = self.clf.predict(truth_x)
-            report = Reporter(truth_y.astype(float), predict_y, self.run, self.epoch)
-            result = report.get_result().get_test_result()
-            self.util.debug(f'result for speaker group {g_index}: {result} ')
-            results.append(float(report.get_result().test))
-            truths.append(truth_y)
-            preds.append(predict_y)
-            g_index += 1
-
-        # combine speaker folds
-        truth = pd.concat(truths)
-        truth.name = 'truth'
-        pred = pd.Series(
-            np.concatenate(preds),
-            index=truth.index,
-            name='prediction',
-        )
-        self.truths = truth
-        self.preds = pred
-        results = np.asarray(results)
-        self.util.debug(f'LOGO: {self.logo} folds: mean {results.mean():.3f}, std: {results.std():.3f}')
-
-    def train(self):
-        """Train the model"""
-        # # first check if the model already has been trained
-        # if os.path.isfile(self.store_path):
-        #     self.load(self.run, self.epoch)
-        #     self.util.debug(f'reusing model: {self.store_path}')
-        #     return
-
-        # then if leave one speaker group out validation is wanted
-        if self.logo:
-            self._do_logo()
-            return
-        # then if x fold cross validation is wanted
-        if self.xfoldx:
-            self._x_fold_cross()
-            return
-
-        # check for NANs in the features
-        # set up the data_loaders
-        if self.feats_train.isna().to_numpy().any():
-            self.util.debug(f'Model, train: replacing {self.feats_train.isna().sum().sum()} NANs with 0')
-            self.feats_train = self.feats_train.fillna(0)
-        # remove labels from features
-        feats = self.feats_train.to_numpy()
-        # compute class weights
-        if self.util.config_val('MODEL', 'class_weight', False):
-            self.classes_weights = sklearn.utils.class_weight.compute_sample_weight(
-                class_weight='balanced',
-                y=self.df_train[self.target]
-            )
-
-        tuning_params = self.util.config_val('MODEL', 'tuning_params', False)
-        if tuning_params:
-            # tune the model meta parameters
-            tuning_params = ast.literal_eval(tuning_params)
-            tuned_params={}
-            try:
-                scoring = glob_conf.config['MODEL']['scoring']
-            except KeyError:
-                self.util.error('got tuning params but no scoring')
-            for param in tuning_params:
-                values = ast.literal_eval(glob_conf.config['MODEL'][param])
-                tuned_params[param] = values
-            self.util.debug(f'tuning on {tuned_params}')
-            self.clf = GridSearchCV(self.clf, tuned_params, refit = True, verbose = 3, scoring=scoring)
-            try:
-                class_weight = self.util.config_val('MODEL', 'class_weight', False)
-                if class_weight:
-                    self.util.debug('using class weight')
-                    self.clf.fit(feats, self.df_train[self.target], sample_weight=self.classes_weights)
-                else:
-                    self.clf.fit(feats, self.df_train[self.target])
-            except KeyError:
-                self.clf.fit(feats, self.df_train[self.target])
-            self.util.debug(f'winner parameters: {self.clf.best_params_}')
-        else:
-            class_weight = self.util.config_val('MODEL', 'class_weight', False)
-            if class_weight:
-                self.util.debug('using class weight')
-                self.clf.fit(feats, self.df_train[self.target], sample_weight=self.classes_weights)
-            else:
-                labels = self.df_train[self.target]
-                self.clf.fit(feats, labels)
-
-    def get_predictions(self):
-        predictions = self.clf.predict(self.feats_test.to_numpy())
-        return predictions
-
-    def predict(self):
-        if self.feats_test.isna().to_numpy().any():
-            self.util.debug(f'Model, test: replacing {self.feats_test.isna().sum().sum()} NANs with 0')
-            self.feats_test = self.feats_test.fillna(0)
-        if self.logo or self.xfoldx:
-            report = Reporter(self.truths.astype(float), self.preds, self.run, self.epoch)
-            return report
-        """Predict the whole eval feature set"""
-        predictions = self.get_predictions()
-        report = Reporter(self.df_test[self.target]\
-            .to_numpy().astype(float), predictions, self.run, self.epoch)
-        return report
-
-    def predict_sample(self, features):
-        """Predict one sample"""
-        prediction = {}
-        if self.util.exp_is_classification():
-            # get the class probabilities
-            predictions = self.clf.predict_proba(features)
-            # pred = self.clf.predict(features)
-            for i in range(len(self.clf.classes_)):
-                cat = self.clf.classes_[i]
-                prediction[cat] = predictions[0][i]
-        else:
-            predictions = self.clf.predict(features)
-            prediction['result'] = predictions[0]
-        return prediction
-
-
-    def store(self):
-        with open(self.store_path, 'wb') as handle:
-            pickle.dump(self.clf, handle)
-
-
-    def load(self, run, epoch):
-        self.set_id(run, epoch)
-        dir = self.util.get_path('model_dir')
-        name = f'{self.util.get_exp_name(only_train=True)}_{self.run}_{self.epoch:03d}.model'
-        with open(dir+name, 'rb') as handle:
-            self.clf = pickle.load(handle)
-
-    def load_path(self, path, run, epoch):
-        self.set_id(run, epoch)
-        with open(path, 'rb') as handle:
-            self.clf = pickle.load(handle)
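The deleted `_x_fold_cross` above boils down to a standard pattern: pool all data, then score a plug-in sklearn estimator fold by fold with `StratifiedKFold`. A minimal, self-contained sketch of that pattern on synthetic data; everything outside the sklearn calls (the estimator choice, data, and scoring) is illustrative, not nkululeko's API:

```python
# Sketch only: pool all data and run a stratified k-fold with a plug-in
# sklearn classifier, mirroring the deleted _x_fold_cross logic.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# synthetic stand-in for the pooled feature matrix and targets
feats, targets = make_classification(
    n_samples=120, n_features=10, n_informative=5, n_classes=3, random_state=42
)
clf = SVC(kernel='linear')  # any sklearn estimator works here
results = []
skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(feats, targets):
    clf.fit(feats[train_index], targets[train_index])
    predict_y = clf.predict(feats[test_index])
    # score each fold with macro-averaged (unweighted average) recall
    results.append(recall_score(targets[test_index], predict_y, average='macro'))
results = np.asarray(results)
print(f'5 folds: mean {results.mean():.3f}, std: {results.std():.3f}')
```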
nkululeko/model_bayes.py
DELETED
@@ -1,14 +0,0 @@
-# model_bayes.py
-
-from sklearn.naive_bayes import GaussianNB
-from nkululeko.model import Model
-
-class Bayes_model(Model):
-    is_classifier = True
-
-    """An SVM model"""
-    def __init__(self, df_train, df_test, feats_train, feats_test):
-        super().__init__(df_train, df_test, feats_train, feats_test)
-        c = float(self.util.config_val('MODEL', 'C_val', '0.001'))
-        self.clf = GaussianNB() # set up the classifier
-
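`Bayes_model` is the smallest example of the convention the deleted `model_*.py` files share: the base `Model` owns training and evaluation, and each subclass only assigns an sklearn estimator to `self.clf`. A hypothetical reduction of that pattern outside nkululeko (all class and method names here are made up):

```python
# Hypothetical sketch of the plug-in pattern shared by the deleted
# model_*.py files: a base class drives fit/predict, subclasses only
# choose the sklearn estimator.
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


class BaseModel:
    clf = None  # subclasses assign the sklearn estimator

    def train(self, feats_train, labels_train):
        self.clf.fit(feats_train, labels_train)

    def predict(self, feats_test):
        return self.clf.predict(feats_test)


class BayesModel(BaseModel):
    def __init__(self):
        self.clf = GaussianNB()


class KNNModel(BaseModel):
    def __init__(self, k=5, weights='uniform'):
        self.clf = KNeighborsClassifier(n_neighbors=k, weights=weights)
```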
nkululeko/model_cnn.py
DELETED
@@ -1,118 +0,0 @@
-# cnnmodel.py
-
-from sklearn.utils import resample
-from nkululeko.model import Model
-import torch
-from sklearn.metrics import mean_squared_error
-import nkululeko.glob_conf as glob_conf
-from nkululeko.reporter import Reporter
-import numpy as np
-from loss_ccc import ConcordanceCorCoeff
-
-class CNN_model(Model):
-    """A CNN (convolutional neural net) model"""
-    is_classifier = True
-
-    def __init__(self, df_train, df_test, feats_train, feats_test):
-        """Constructor taking the configuration and all dataframes"""
-        Model.__init__(self, df_train, df_test, feats_train, feats_test)
-        self.util.debug(f'initializing model')
-        self.device = glob_conf.config['MODEL']['device']
-        pretrained_state = self.util.config_val('MODEL', 'pre_train', 'gender_state.pth.tar')
-        state = torch.load(pretrained_state)
-        state.pop('out.gender.weight')
-        state.pop('out.gender.bias')
-        state['fc1.weight'] = state.pop('fc1.gender.weight')
-        state['fc1.bias'] = state.pop('fc1.gender.bias')
-        model = audpann.Cnn10(sampling_rate=16000, output_dim=1)
-        model.load_state_dict(state, strict=False)
-        self.model = model.to(self.device)
-        self.loss_func = self.util.config_val('MODEL', 'loss', 'mse')
-        if self.loss_func == 'mse':
-            self.criterion = torch.nn.MSELoss()
-        elif self.loss_func == '1-ccc':
-            self.criterion = ConcordanceCorCoeff()
-        else:
-            self.util.error(f'unknown loss function: {self.loss_func}')
-        self.util.debug(f'training model with {self.loss_func} loss function')
-        self.learning_rate = float(self.util.config_val('MODEL', 'learning_rate', 0.0001))
-        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
-
-    def train(self):
-        """Train the model one epoch"""
-        losses = []
-        # first check if the model already has been trained
-        # if os.path.isfile(self.store_path):
-        #     self.load(self.run, self.epoch)
-        #     self.util.debug(f'reusing model: {self.store_path}')
-        #     return
-
-        self.model.train()
-
-        for features, labels in self.feats_train:
-            logits = self.model(features.to(self.device).float()).squeeze(1)
-            loss = self.criterion(logits, labels.float().to(self.device))
-            losses.append(loss.item())
-            self.optimizer.zero_grad()
-            loss.backward()
-            self.optimizer.step()
-        self.loss = (np.asarray(losses)).mean()
-
-    def predict(self):
-        """Predict the whole eval feature set"""
-        # evaluate on dev set
-        _, truths, predictions = self.evaluate_model(False)
-        # evaluate on train set, if there is one
-        result = 0
-        if self.feats_train != None:
-            result, _, _ = self.evaluate_model(True)
-        report = Reporter(truths.numpy(), predictions.numpy(), self.run, self.epoch)
-        try:
-            report.result.loss = self.loss
-        except AttributeError: # if the model was loaded from disk the loss is unknown
-            pass
-        report.result.train = result
-        return report
-
-
-    def evaluate_model(self, on_train=False):
-        if on_train:
-            loader = self.feats_train
-        else:
-            loader = self.feats_test
-        logits = torch.zeros(len(loader.dataset))
-        targets = torch.zeros(len(loader.dataset))
-        self.model.eval()
-        with torch.no_grad():
-            for index, (features, labels) in enumerate(loader):
-                start_index = index * loader.batch_size
-                end_index = (index + 1) * loader.batch_size
-                if end_index > len(loader.dataset):
-                    end_index = len(loader.dataset)
-                logits[start_index:end_index] = self.model(features.to(self.device).float()).squeeze(1)
-                targets[start_index:end_index] = labels
-
-        measure = self.util.config_val('MODEL', 'measure', 'mse')
-        if measure == 'mse':
-            result = mean_squared_error(targets.numpy(), logits.numpy())
-        elif measure == 'ccc':
-            result = Reporter.ccc(targets.numpy(), logits.numpy())
-        else:
-            self.util.error(f'unknown measure: {measure}')
-        return result, targets, logits
-
-    def store(self):
-        torch.save(self.model.state_dict(), self.store_path)
-        self.device = self.util.config_val('MODEL', 'device', 'cpu')
-        # self.model.to(self.device)
-
-    def load(self, run, epoch):
-        self.set_id(run, epoch)
-        dir = self.util.get_path('model_dir')
-        name = f'{self.util.get_exp_name(only_train=True)}_{run}_{epoch:03d}.model'
-        self.device = self.util.config_val('MODEL', 'device', 'cpu')
-        self.store_path = dir+name
-        self.model = audpann.Cnn10(sampling_rate=16000, output_dim=1)
-        state_dict = torch.load(dir+name, map_location='cpu')
-        self.model.load_state_dict(state_dict, strict=False)
-        self.model.to(self.device)
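`CNN_model` optionally trains with a `1-ccc` loss, i.e. one minus the concordance correlation coefficient; the `ConcordanceCorCoeff` class comes from `loss_ccc.py`, also deleted in this release. For reference, a minimal NumPy version of CCC — a sketch of the standard definition, not the deleted torch implementation:

```python
# Sketch of the standard concordance correlation coefficient (CCC);
# the (also deleted) loss_ccc.py wraps 1 - CCC as a torch loss.
import numpy as np

def ccc(x, y):
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    covariance = np.mean((x - x.mean()) * (y - y.mean()))
    return 2 * covariance / (x.var() + y.var() + (x.mean() - y.mean()) ** 2)

# perfect agreement gives 1.0
print(ccc([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]))  # 1.0
```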
nkululeko/model_gmm.py
DELETED
@@ -1,16 +0,0 @@
-# model_gmm.py
-
-from sklearn import mixture
-from nkululeko.model import Model
-
-class GMM_model(Model):
-    """An GMM model"""
-
-    is_classifier = True
-    def __init__(self, df_train, df_test, feats_train, feats_test):
-        super().__init__(df_train, df_test, feats_train, feats_test)
-        n_components = int(self.util.config_val('MODEL', 'GMM_components', '4'))
-        covariance_type = self.util.config_val('MODEL', 'GMM_covariance_type', 'full')
-        self.clf = mixture.GaussianMixture(n_components=n_components, covariance_type=covariance_type)
-        # set up the classifier
-
nkululeko/model_knn.py
DELETED
@@ -1,16 +0,0 @@
-# model_knn.py
-
-from sklearn.neighbors import KNeighborsClassifier
-from nkululeko.model import Model
-
-class KNN_model(Model):
-    """An KNN model"""
-
-    is_classifier = True
-
-    def __init__(self, df_train, df_test, feats_train, feats_test):
-        super().__init__(df_train, df_test, feats_train, feats_test)
-        method = self.util.config_val('MODEL', 'KNN_weights', 'uniform')
-        k = int(self.util.config_val('MODEL', 'K_val', '5'))
-        self.clf = KNeighborsClassifier(n_neighbors=k, weights=method) # set up the classifier
-
nkululeko/model_knn_reg.py
DELETED
@@ -1,16 +0,0 @@
-# model_knn_reg.py
-
-from sklearn.neighbors import KNeighborsRegressor
-from nkululeko.model import Model
-
-class KNN_reg_model(Model):
-    """An KNN model"""
-
-    is_classifier = False
-
-    def __init__(self, df_train, df_test, feats_train, feats_test):
-        super().__init__(df_train, df_test, feats_train, feats_test)
-        method = self.util.config_val('MODEL', 'KNN_weights', 'uniform')
-        k = int(self.util.config_val('MODEL', 'K_val', '5'))
-        self.clf = KNeighborsRegressor(n_neighbors=k, weights=method) # set up the classifier
-
nkululeko/model_mlp.py
DELETED
@@ -1,175 +0,0 @@
-# model_mlp.py
-from nkululeko.util import Util
-import nkululeko.glob_conf as glob_conf
-from nkululeko.model import Model
-from nkululeko.reporter import Reporter
-import torch
-import ast
-import numpy as np
-from sklearn.metrics import recall_score
-from collections import OrderedDict
-from nkululeko.loss_softf1loss import SoftF1Loss
-
-class MLP_model(Model):
-    """MLP = multi layer perceptron"""
-
-    is_classifier = True
-
-    def __init__(self, df_train, df_test, feats_train, feats_test):
-        """Constructor taking the configuration and all dataframes"""
-        super().__init__(df_train, df_test, feats_train, feats_test)
-        self.target = glob_conf.config['DATA']['target']
-        labels = ast.literal_eval(glob_conf.config['DATA']['labels'])
-        self.class_num = len(labels)
-        # set up loss criterion
-        criterion = self.util.config_val('MODEL', 'loss', 'cross')
-        if criterion == 'cross':
-            self.criterion = torch.nn.CrossEntropyLoss()
-        elif criterion == 'f1':
-            self.criterion = SoftF1Loss(num_classes=self.class_num, weight=None, epsilon=1e-7)
-        else:
-            self.util.error(f'unknown loss function: {criterion}')
-        self.util.debug(f'using model with cross entropy loss function')
-        # set up the model
-        self.device = self.util.config_val('MODEL', 'device', 'cpu')
-        layers_string = glob_conf.config['MODEL']['layers']
-        self.util.debug(f'using layers {layers_string}')
-        layers = ast.literal_eval(layers_string)
-        # with dropout?
-        drop = self.util.config_val('MODEL', 'drop', False)
-        if drop:
-            self.util.debug(f'init: training with dropout: {drop}')
-        self.model = self.MLP(feats_train.shape[1], layers, self.class_num, drop).to(self.device)
-        self.learning_rate = float(self.util.config_val('MODEL', 'learning_rate', 0.0001))
-        # set up regularization
-        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
-        # batch size
-        self.batch_size = int(self.util.config_val('MODEL', 'batch_size', 8))
-        # number of parallel processes
-        self.num_workers = int(self.util.config_val('MODEL', 'num_workers', 5))
-        if feats_train.isna().to_numpy().any():
-            self.util.debug(f'Model, train: replacing {feats_train.isna().sum().sum()} NANs with 0')
-            feats_train = feats_train.fillna(0)
-        if feats_test.isna().to_numpy().any():
-            self.util.debug(f'Model, test: replacing {feats_test.isna().sum().sum()} NANs with 0')
-            feats_test = feats_test.fillna(0)
-        # set up the data_loaders
-        self.trainloader = self.get_loader(feats_train, df_train, True)
-        self.testloader = self.get_loader(feats_test, df_test, False)
-
-    def set_testdata(self, data_df, feats_df):
-        self.testloader = self.get_loader(feats_df, data_df, False)
-
-    def reset_test(self, df_test, feats_test):
-        self.testloader = self.get_loader(feats_test, df_test, False)
-
-    def train(self):
-        self.model.train()
-        losses = []
-        for features, labels in self.trainloader:
-            logits = self.model(features.to(self.device))
-            loss = self.criterion(logits, labels.to(self.device, dtype=torch.int64))
-            losses.append(loss.item())
-            self.optimizer.zero_grad()
-            loss.backward()
-            self.optimizer.step()
-        self.loss = (np.asarray(losses)).mean()
-
-    def predict(self):
-        _, truths, predictions = self.evaluate_model(self.model, self.testloader, self.device)
-        uar, _, _ = self.evaluate_model(self.model, self.trainloader, self.device)
-        report = Reporter(truths, predictions, self.run, self.epoch)
-        try:
-            report.result.loss = self.loss
-        except AttributeError: # if the model was loaded from disk the loss is unknown
-            pass
-        report.result.train = uar
-        return report
-
-    def get_predictions(self):
-        _, truths, predictions = self.evaluate_model(self.model, self.testloader, self.device)
-        return predictions.numpy()
-
-    def get_loader(self, df_x, df_y, shuffle):
-        data=[]
-        for i in range(len(df_x)):
-            data.append([df_x.values[i], df_y[self.target][i]])
-        return torch.utils.data.DataLoader(data, shuffle=shuffle, batch_size=self.batch_size)
-
-    class MLP(torch.nn.Module):
-        def __init__(self, i, layers, o, drop):
-            super().__init__()
-            sorted_layers = sorted(layers.items(), key=lambda x: x[1])
-            layers = OrderedDict()
-            layers['0'] = torch.nn.Linear(i, sorted_layers[0][1])
-            layers['0_r'] = torch.nn.ReLU()
-            for i in range(0, len(sorted_layers)-1):
-                layers[str(i+1)] = torch.nn.Linear(sorted_layers[i][1], sorted_layers[i+1][1])
-                if drop:
-                    layers[str(i)+'_d'] = torch.nn.Dropout(float(drop))
-                layers[str(i)+'_r'] = torch.nn.ReLU()
-            layers[str(len(sorted_layers)+1)] = torch.nn.Linear(sorted_layers[-1][1], o)
-            self.linear = torch.nn.Sequential(layers)
-        def forward(self, x):
-            # x: (batch_size, channels, samples)
-            x = x.squeeze(dim=1).float()
-            return self.linear(x)
-
-
-    def evaluate_model(self, model, loader, device):
-        logits = torch.zeros(len(loader.dataset), self.class_num)
-        targets = torch.zeros(len(loader.dataset))
-        model.eval()
-        with torch.no_grad():
-            for index, (features, labels) in enumerate(loader):
-                start_index = index * loader.batch_size
-                end_index = (index + 1) * loader.batch_size
-                if end_index > len(loader.dataset):
-                    end_index = len(loader.dataset)
-                logits[start_index:end_index, :] = model(features.to(device))
-                targets[start_index:end_index] = labels
-
-        predictions = logits.argmax(dim=1)
-        uar = recall_score(targets.numpy(), predictions.numpy(), average='macro')
-        return uar, targets, predictions
-
-    def predict_sample(self, features):
-        """Predict one sample"""
-        with torch.no_grad():
-            logits = self.model(torch.from_numpy(features).to(self.device))
-        a = logits.numpy()
-        res = {}
-        for i in range(len(a[0])):
-            res[i] = a[0][i]
-        return res
-
-    def store(self):
-        torch.save(self.model.state_dict(), self.store_path)
-
-    def load(self, run, epoch):
-        self.set_id(run, epoch)
-        dir = self.util.get_path('model_dir')
-        # name = f'{self.util.get_exp_name()}_{run}_{epoch:03d}.model'
-        name = f'{self.util.get_exp_name(only_train=True)}_{self.run}_{self.epoch:03d}.model'
-        self.device = self.util.config_val('MODEL', 'device', 'cpu')
-        layers = ast.literal_eval(glob_conf.config['MODEL']['layers'])
-        self.store_path = dir+name
-        drop = self.util.config_val('MODEL', 'drop', False)
-        if drop:
-            self.util.debug(f'loading: dropout set to: {drop}')
-        self.model = self.MLP(self.feats_train.shape[1], layers, self.class_num, drop).to(self.device)
-        self.model.load_state_dict(torch.load(self.store_path))
-        self.model.eval()
-
-    def load_path(self, path, run, epoch):
-        self.set_id(run, epoch)
-        with open(path, 'rb') as handle:
-            self.device = self.util.config_val('MODEL', 'device', 'cpu')
-            layers = ast.literal_eval(glob_conf.config['MODEL']['layers'])
-            self.store_path = path
-            drop = self.util.config_val('MODEL', 'drop', False)
-            if drop:
-                self.util.debug(f'dropout set to: {drop}')
-            self.model = self.MLP(self.feats_train.shape[1], layers, self.class_num, drop).to(self.device)
-            self.model.load_state_dict(torch.load(self.store_path))
-            self.model.eval()
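The nested `MLP` module above builds its network from the `[MODEL][layers]` config entry: a `{name: width}` dict is sorted by width (ascending, as in the original) and assembled into an `OrderedDict` of `Linear`/`ReLU` blocks for `torch.nn.Sequential`. A standalone sketch of that construction; `build_mlp` and the sizes below are made up for illustration, and the dropout branch is omitted:

```python
# Standalone sketch of the layer construction in the nested MLP class:
# a {name: width} dict is sorted by width and turned into an OrderedDict
# of Linear/ReLU blocks for torch.nn.Sequential.
from collections import OrderedDict

import torch


def build_mlp(n_inputs, layers, n_outputs):
    sorted_layers = sorted(layers.items(), key=lambda x: x[1])
    modules = OrderedDict()
    modules['0'] = torch.nn.Linear(n_inputs, sorted_layers[0][1])
    modules['0_r'] = torch.nn.ReLU()
    for i in range(len(sorted_layers) - 1):
        modules[str(i + 1)] = torch.nn.Linear(sorted_layers[i][1], sorted_layers[i + 1][1])
        modules[f'{i + 1}_r'] = torch.nn.ReLU()
    modules['out'] = torch.nn.Linear(sorted_layers[-1][1], n_outputs)
    return torch.nn.Sequential(modules)


model = build_mlp(64, {'l1': 32, 'l2': 16}, n_outputs=4)
print(model(torch.zeros(1, 64)).shape)  # torch.Size([1, 4])
```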