nkululeko 0.93.15__py3-none-any.whl → 0.94.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/aug_train.py +13 -2
- nkululeko/constants.py +1 -1
- nkululeko/data/dataset.py +287 -36
- nkululeko/experiment.py +121 -17
- nkululeko/feat_extract/feats_opensmile copy.py +93 -0
- nkululeko/feat_extract/feats_opensmile.py +207 -60
- nkululeko/feat_extract/feats_trill.py +2 -2
- nkululeko/filter_data.py +3 -1
- nkululeko/modelrunner.py +23 -10
- nkululeko/models/model_mlp.py +2 -0
- nkululeko/nkululeko.py +0 -1
- nkululeko/plots.py +11 -2
- nkululeko/reporting/reporter.py +27 -39
- nkululeko/runmanager.py +53 -33
- nkululeko/scaler.py +41 -24
- nkululeko/utils/util.py +1 -1
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/METADATA +3 -2
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/RECORD +22 -21
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/WHEEL +1 -1
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info/licenses}/LICENSE +0 -0
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/top_level.txt +0 -0
nkululeko/experiment.py
CHANGED
@@ -43,6 +43,7 @@ class Experiment:
|
|
43
43
|
audeer.mkdir(self.data_dir) # create the experiment directory
|
44
44
|
self.util = Util("experiment")
|
45
45
|
glob_conf.set_util(self.util)
|
46
|
+
self.split3 = eval(self.util.config_val("EXP", "traindevtest", "False"))
|
46
47
|
fresh_report = eval(self.util.config_val("REPORT", "fresh", "False"))
|
47
48
|
if not fresh_report:
|
48
49
|
try:
|
@@ -75,11 +76,11 @@ class Experiment:
|
|
75
76
|
# return self.util.get_exp_name()
|
76
77
|
|
77
78
|
def set_globals(self, config_obj):
|
78
|
-
"""
|
79
|
+
"""Install a config object in the global space."""
|
79
80
|
glob_conf.init_config(config_obj)
|
80
81
|
|
81
82
|
def load_datasets(self):
|
82
|
-
"""Load all databases specified in the configuration and map the labels"""
|
83
|
+
"""Load all databases specified in the configuration and map the labels."""
|
83
84
|
ds = ast.literal_eval(glob_conf.config["DATA"]["databases"])
|
84
85
|
self.datasets = {}
|
85
86
|
self.got_speaker, self.got_gender, self.got_age = False, False, False
|
@@ -186,6 +187,10 @@ class Experiment:
|
|
186
187
|
store = self.util.get_path("store")
|
187
188
|
storage_test = f"{store}testdf.csv"
|
188
189
|
storage_train = f"{store}traindf.csv"
|
190
|
+
self.df_dev = None
|
191
|
+
self.feats_dev = None
|
192
|
+
if self.split3:
|
193
|
+
storage_dev = f"{store}devdf.csv"
|
189
194
|
start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
|
190
195
|
if (
|
191
196
|
os.path.isfile(storage_train)
|
@@ -199,10 +204,20 @@ class Experiment:
|
|
199
204
|
self.df_train = self._import_csv(storage_train)
|
200
205
|
self.train_empty = True if self.df_train.shape[0] == 0 else False
|
201
206
|
self.test_empty = True if self.df_test.shape[0] == 0 else False
|
207
|
+
if self.split3:
|
208
|
+
self.df_dev = self._import_csv(storage_dev)
|
209
|
+
self.dev_empty = True if self.df_dev.shape[0] == 0 else False
|
202
210
|
else:
|
203
211
|
self.df_train, self.df_test = pd.DataFrame(), pd.DataFrame()
|
212
|
+
if self.split3:
|
213
|
+
self.df_dev = pd.DataFrame()
|
214
|
+
else:
|
215
|
+
self.df_dev = None
|
204
216
|
for d in self.datasets.values():
|
205
|
-
|
217
|
+
if self.split3:
|
218
|
+
d.split_3()
|
219
|
+
else:
|
220
|
+
d.split()
|
206
221
|
if self.target != "none":
|
207
222
|
d.prepare_labels()
|
208
223
|
if d.df_train.shape[0] == 0:
|
@@ -214,23 +229,38 @@ class Experiment:
|
|
214
229
|
self.util.debug(f"warn: {d.name} test empty")
|
215
230
|
self.df_test = pd.concat([self.df_test, d.df_test])
|
216
231
|
self.util.copy_flags(d, self.df_test)
|
232
|
+
if self.split3:
|
233
|
+
if d.df_dev.shape[0] == 0:
|
234
|
+
self.util.debug(f"warn: {d.name} dev empty")
|
235
|
+
self.df_dev = pd.concat([self.df_dev, d.df_dev])
|
236
|
+
self.util.copy_flags(d, self.df_dev)
|
217
237
|
self.train_empty = True if self.df_train.shape[0] == 0 else False
|
218
238
|
self.test_empty = True if self.df_test.shape[0] == 0 else False
|
239
|
+
if self.split3:
|
240
|
+
self.dev_empty = True if self.df_dev.shape[0] == 0 else False
|
219
241
|
store = self.util.get_path("store")
|
220
242
|
storage_test = f"{store}testdf.csv"
|
221
243
|
storage_train = f"{store}traindf.csv"
|
222
244
|
self.df_test.to_csv(storage_test)
|
223
245
|
self.df_train.to_csv(storage_train)
|
246
|
+
if self.split3:
|
247
|
+
storage_dev = f"{store}devdf.csv"
|
248
|
+
self.df_dev.to_csv(storage_dev)
|
224
249
|
|
225
250
|
if self.target == "none":
|
226
251
|
return
|
227
252
|
self.util.copy_flags(self, self.df_test)
|
228
253
|
self.util.copy_flags(self, self.df_train)
|
254
|
+
if self.split3:
|
255
|
+
self.util.copy_flags(self, self.df_dev)
|
229
256
|
# Try data checks
|
230
257
|
datachecker = FileChecker(self.df_train)
|
231
258
|
self.df_train = datachecker.all_checks()
|
232
259
|
datachecker.set_data(self.df_test)
|
233
260
|
self.df_test = datachecker.all_checks()
|
261
|
+
if self.split3:
|
262
|
+
datachecker.set_data(self.df_dev)
|
263
|
+
self.df_dev = datachecker.all_checks()
|
234
264
|
|
235
265
|
# Check for filters
|
236
266
|
filter_sample_selection = self.util.config_val(
|
@@ -241,6 +271,9 @@ class Experiment:
|
|
241
271
|
self.df_train = datafilter.all_filters()
|
242
272
|
datafilter = DataFilter(self.df_test)
|
243
273
|
self.df_test = datafilter.all_filters()
|
274
|
+
if self.split3:
|
275
|
+
datafilter = DataFilter(self.df_dev)
|
276
|
+
self.df_dev = datafilter.all_filters()
|
244
277
|
elif filter_sample_selection == "train":
|
245
278
|
datafilter = DataFilter(self.df_train)
|
246
279
|
self.df_train = datafilter.all_filters()
|
@@ -248,10 +281,11 @@ class Experiment:
|
|
248
281
|
datafilter = DataFilter(self.df_test)
|
249
282
|
self.df_test = datafilter.all_filters()
|
250
283
|
else:
|
251
|
-
|
284
|
+
msg = (
|
252
285
|
"unkown filter sample selection specifier"
|
253
286
|
f" {filter_sample_selection}, should be [all | train | test]"
|
254
287
|
)
|
288
|
+
self.util.error(msg)
|
255
289
|
|
256
290
|
# encode the labels
|
257
291
|
if self.util.exp_is_classification():
|
@@ -261,6 +295,8 @@ class Experiment:
|
|
261
295
|
test_cats = self.df_test["class_label"].unique()
|
262
296
|
if not self.train_empty:
|
263
297
|
train_cats = self.df_train["class_label"].unique()
|
298
|
+
if self.split3 and not self.dev_empty:
|
299
|
+
dev_cats = self.df_dev["class_label"].unique()
|
264
300
|
else:
|
265
301
|
if not self.test_empty:
|
266
302
|
if self.df_test.is_labeled:
|
@@ -272,11 +308,13 @@ class Experiment:
|
|
272
308
|
)
|
273
309
|
if not self.train_empty:
|
274
310
|
train_cats = self.df_train[self.target].unique()
|
311
|
+
if self.split3 and not self.dev_empty:
|
312
|
+
dev_cats = self.df_dev[self.target].unique()
|
275
313
|
# encode the labels as numbers
|
276
314
|
self.label_encoder = LabelEncoder()
|
277
315
|
glob_conf.set_label_encoder(self.label_encoder)
|
278
316
|
if not self.train_empty:
|
279
|
-
if
|
317
|
+
if isinstance(train_cats, np.ndarray):
|
280
318
|
self.util.debug(f"Categories train (nd.array): {train_cats}")
|
281
319
|
else:
|
282
320
|
self.util.debug(f"Categories train (list): {list(train_cats)}")
|
@@ -286,7 +324,7 @@ class Experiment:
|
|
286
324
|
)
|
287
325
|
if not self.test_empty:
|
288
326
|
if self.df_test.is_labeled:
|
289
|
-
if
|
327
|
+
if isinstance(test_cats, np.ndarray):
|
290
328
|
self.util.debug(f"Categories test (nd.array): {test_cats}")
|
291
329
|
else:
|
292
330
|
self.util.debug(f"Categories test (list): {list(test_cats)}")
|
@@ -294,6 +332,15 @@ class Experiment:
|
|
294
332
|
self.df_test[self.target] = self.label_encoder.transform(
|
295
333
|
self.df_test[self.target]
|
296
334
|
)
|
335
|
+
if self.split3 and not self.dev_empty:
|
336
|
+
if isinstance(dev_cats, np.ndarray):
|
337
|
+
self.util.debug(f"Categories dev (nd.array): {dev_cats}")
|
338
|
+
else:
|
339
|
+
self.util.debug(f"Categories dev (list): {list(dev_cats)}")
|
340
|
+
if not self.train_empty:
|
341
|
+
self.df_dev[self.target] = self.label_encoder.transform(
|
342
|
+
self.df_dev[self.target]
|
343
|
+
)
|
297
344
|
if self.got_speaker:
|
298
345
|
speakers_train = 0 if self.train_empty else self.df_train.speaker.nunique()
|
299
346
|
speakers_test = 0 if self.test_empty else self.df_test.speaker.nunique()
|
@@ -301,6 +348,9 @@ class Experiment:
|
|
301
348
|
f"{speakers_test} speakers in test and"
|
302
349
|
f" {speakers_train} speakers in train"
|
303
350
|
)
|
351
|
+
if self.split3:
|
352
|
+
speakers_dev = 0 if self.dev_empty else self.df_dev.speaker.nunique()
|
353
|
+
self.util.debug(f"{speakers_dev} speakers in dev")
|
304
354
|
|
305
355
|
target_factor = self.util.config_val("DATA", "target_divide_by", False)
|
306
356
|
if target_factor:
|
@@ -308,6 +358,10 @@ class Experiment:
|
|
308
358
|
self.df_train[self.target] = self.df_train[self.target] / float(
|
309
359
|
target_factor
|
310
360
|
)
|
361
|
+
if self.split3:
|
362
|
+
self.df_dev[self.target] = self.df_dev[self.target] / float(
|
363
|
+
target_factor
|
364
|
+
)
|
311
365
|
if not self.util.exp_is_classification():
|
312
366
|
self.df_test["class_label"] = self.df_test["class_label"] / float(
|
313
367
|
target_factor
|
@@ -315,7 +369,17 @@ class Experiment:
|
|
315
369
|
self.df_train["class_label"] = self.df_train["class_label"] / float(
|
316
370
|
target_factor
|
317
371
|
)
|
318
|
-
|
372
|
+
if self.split3:
|
373
|
+
self.df_dev["class_label"] = self.df_dev["class_label"] / float(
|
374
|
+
target_factor
|
375
|
+
)
|
376
|
+
if self.split3:
|
377
|
+
shapes = f"{self.df_train.shape}/{self.df_dev.shape}/{self.df_test.shape}"
|
378
|
+
self.util.debug(f"train/dev/test shape: {shapes}")
|
379
|
+
else:
|
380
|
+
self.util.debug(
|
381
|
+
f"train/test shape: {self.df_train.shape}/{self.df_test.shape}"
|
382
|
+
)
|
319
383
|
def _add_random_target(self, df):
|
320
384
|
labels = glob_conf.labels
|
321
385
|
a = [None] * len(df)
|
@@ -325,9 +389,11 @@ class Experiment:
|
|
325
389
|
return df
|
326
390
|
|
327
391
|
def plot_distribution(self, df_labels):
|
328
|
-
"""Plot the distribution of samples and
|
392
|
+
"""Plot the distribution of samples and speakers.
|
393
|
+
|
394
|
+
Per target class and biological sex.
|
395
|
+
"""
|
329
396
|
plot = Plots()
|
330
|
-
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
|
331
397
|
plot.plot_distributions(df_labels)
|
332
398
|
if self.got_speaker:
|
333
399
|
plot.plot_distributions_speaker(df_labels)
|
@@ -351,8 +417,16 @@ class Experiment:
|
|
351
417
|
|
352
418
|
"""
|
353
419
|
df_train, df_test = self.df_train, self.df_test
|
420
|
+
if self.split3:
|
421
|
+
df_dev = self.df_dev
|
422
|
+
else:
|
423
|
+
df_dev = None
|
354
424
|
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
|
355
425
|
self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
|
426
|
+
if self.split3:
|
427
|
+
self.feats_dev = pd.DataFrame()
|
428
|
+
else:
|
429
|
+
self.feats_dev = None
|
356
430
|
feats_types = self.util.config_val("FEATS", "type", "os")
|
357
431
|
# Ensure feats_types is always a list of strings
|
358
432
|
if isinstance(feats_types, str):
|
@@ -364,7 +438,6 @@ class Experiment:
|
|
364
438
|
# for some models no features are needed
|
365
439
|
if len(feats_types) == 0:
|
366
440
|
self.util.debug("no feature extractor specified.")
|
367
|
-
self.feats_train, self.feats_test = pd.DataFrame(), pd.DataFrame()
|
368
441
|
return
|
369
442
|
if not self.train_empty:
|
370
443
|
self.feature_extractor = FeatureExtractor(
|
@@ -376,10 +449,19 @@ class Experiment:
|
|
376
449
|
df_test, feats_types, feats_name, "test"
|
377
450
|
)
|
378
451
|
self.feats_test = self.feature_extractor.extract()
|
379
|
-
self.
|
380
|
-
|
381
|
-
|
382
|
-
|
452
|
+
if self.split3:
|
453
|
+
if not self.dev_empty:
|
454
|
+
self.feature_extractor = FeatureExtractor(
|
455
|
+
df_dev, feats_types, feats_name, "dev"
|
456
|
+
)
|
457
|
+
self.feats_dev = self.feature_extractor.extract()
|
458
|
+
shps = f"{self.feats_train.shape}/{self.feats_dev.shape}/{self.feats_test.shape}"
|
459
|
+
self.util.debug(f"Train/dev/test features:{shps}")
|
460
|
+
else:
|
461
|
+
self.util.debug(
|
462
|
+
f"All features: train shape : {self.feats_train.shape}, test"
|
463
|
+
f" shape:{self.feats_test.shape}"
|
464
|
+
)
|
383
465
|
if self.feats_train.shape[0] < self.df_train.shape[0]:
|
384
466
|
self.util.warn(
|
385
467
|
f"train feats ({self.feats_train.shape[0]}) != train labels"
|
@@ -396,6 +478,14 @@ class Experiment:
|
|
396
478
|
)
|
397
479
|
self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
|
398
480
|
self.util.warn(f"new test labels shape: {self.df_test.shape[0]}")
|
481
|
+
if self.split3:
|
482
|
+
if self.feats_dev.shape[0] < self.df_dev.shape[0]:
|
483
|
+
self.util.warn(
|
484
|
+
f"dev feats ({self.feats_dev.shape[0]}) != dev labels"
|
485
|
+
f" ({self.df_dev.shape[0]})"
|
486
|
+
)
|
487
|
+
self.df_dev = self.df_dev[self.df_dev.index.isin(self.feats_dev.index)]
|
488
|
+
self.util.warn(f"new dev labels shape: {self.df_dev.shape[0]}")
|
399
489
|
|
400
490
|
self._check_scale()
|
401
491
|
|
@@ -604,6 +694,8 @@ class Experiment:
|
|
604
694
|
def _check_scale(self):
|
605
695
|
self.util.save_to_store(self.feats_train, "feats_train")
|
606
696
|
self.util.save_to_store(self.feats_test, "feats_test")
|
697
|
+
if self.split3:
|
698
|
+
self.util.save_to_store(self.feats_dev, "feats_dev")
|
607
699
|
scale_feats = self.util.config_val("FEATS", "scale", False)
|
608
700
|
# print the scale
|
609
701
|
self.util.debug(f"scaler: {scale_feats}")
|
@@ -614,6 +706,8 @@ class Experiment:
|
|
614
706
|
self.feats_train,
|
615
707
|
self.feats_test,
|
616
708
|
scale_feats,
|
709
|
+
dev_x=self.df_dev,
|
710
|
+
dev_y=self.feats_dev,
|
617
711
|
)
|
618
712
|
self.feats_train, self.feats_test = self.scaler_feats.scale()
|
619
713
|
# store versions
|
@@ -622,9 +716,19 @@ class Experiment:
|
|
622
716
|
|
623
717
|
def init_runmanager(self):
|
624
718
|
"""Initialize the manager object for the runs."""
|
625
|
-
self.
|
626
|
-
self.
|
627
|
-
|
719
|
+
if self.split3:
|
720
|
+
self.runmgr = Runmanager(
|
721
|
+
self.df_train,
|
722
|
+
self.df_test,
|
723
|
+
self.feats_train,
|
724
|
+
self.feats_test,
|
725
|
+
dev_x=self.df_dev,
|
726
|
+
dev_y=self.feats_dev,
|
727
|
+
)
|
728
|
+
else:
|
729
|
+
self.runmgr = Runmanager(
|
730
|
+
self.df_train, self.df_test, self.feats_train, self.feats_test
|
731
|
+
)
|
628
732
|
|
629
733
|
def run(self):
|
630
734
|
"""Do the runs."""
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# opensmileset.py
|
2
|
+
import os
|
3
|
+
|
4
|
+
import opensmile
|
5
|
+
import pandas as pd
|
6
|
+
|
7
|
+
import nkululeko.glob_conf as glob_conf
|
8
|
+
from nkululeko.feat_extract.featureset import Featureset
|
9
|
+
|
10
|
+
|
11
|
+
class Opensmileset(Featureset):
|
12
|
+
def __init__(self, name, data_df, feats_type=None, config_file=None):
|
13
|
+
super().__init__(name, data_df, feats_type)
|
14
|
+
self.featset = self.util.config_val("FEATS", "set", "eGeMAPSv02")
|
15
|
+
try:
|
16
|
+
self.feature_set = eval(f"opensmile.FeatureSet.{self.featset}")
|
17
|
+
# 'eGeMAPSv02, ComParE_2016, GeMAPSv01a, eGeMAPSv01a':
|
18
|
+
except AttributeError:
|
19
|
+
self.util.error(f"something is wrong with feature set: {self.featset}")
|
20
|
+
self.featlevel = self.util.config_val("FEATS", "level", "functionals")
|
21
|
+
try:
|
22
|
+
self.featlevel = self.featlevel.replace("lld", "LowLevelDescriptors")
|
23
|
+
self.featlevel = self.featlevel.replace("functionals", "Functionals")
|
24
|
+
self.feature_level = eval(f"opensmile.FeatureLevel.{self.featlevel}")
|
25
|
+
except AttributeError:
|
26
|
+
self.util.error(f"something is wrong with feature level: {self.featlevel}")
|
27
|
+
|
28
|
+
def extract(self):
|
29
|
+
"""Extract the features based on the initialized dataset or re-open them when found on disk."""
|
30
|
+
store = self.util.get_path("store")
|
31
|
+
store_format = self.util.config_val("FEATS", "store_format", "pkl")
|
32
|
+
storage = f"{store}{self.name}.{store_format}"
|
33
|
+
extract = eval(
|
34
|
+
self.util.config_val("FEATS", "needs_feature_extraction", "False")
|
35
|
+
)
|
36
|
+
no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
|
37
|
+
if extract or not os.path.isfile(storage) or no_reuse:
|
38
|
+
self.util.debug("extracting openSmile features, this might take a while...")
|
39
|
+
smile = opensmile.Smile(
|
40
|
+
feature_set=self.feature_set,
|
41
|
+
feature_level=self.feature_level,
|
42
|
+
num_workers=self.n_jobs,
|
43
|
+
verbose=True,
|
44
|
+
)
|
45
|
+
if isinstance(self.data_df.index, pd.MultiIndex):
|
46
|
+
self.df = smile.process_index(self.data_df.index)
|
47
|
+
self.df = self.df.set_index(self.data_df.index)
|
48
|
+
else:
|
49
|
+
self.df = smile.process_files(self.data_df.index)
|
50
|
+
self.df.index = self.df.index.droplevel(1)
|
51
|
+
self.df.index = self.df.index.droplevel(1)
|
52
|
+
self.util.write_store(self.df, storage, store_format)
|
53
|
+
try:
|
54
|
+
glob_conf.config["DATA"]["needs_feature_extraction"] = "False"
|
55
|
+
except KeyError:
|
56
|
+
pass
|
57
|
+
else:
|
58
|
+
self.util.debug(f"reusing extracted OS features: {storage}.")
|
59
|
+
self.df = self.util.get_store(storage, store_format)
|
60
|
+
|
61
|
+
def extract_sample(self, signal, sr):
|
62
|
+
smile = opensmile.Smile(
|
63
|
+
feature_set=self.feature_set,
|
64
|
+
feature_level=opensmile.FeatureLevel.Functionals,
|
65
|
+
)
|
66
|
+
feats = smile.process_signal(signal, sr)
|
67
|
+
return feats.to_numpy()
|
68
|
+
|
69
|
+
# def filter(self):
|
70
|
+
# # use only the features that are indexed in the target dataframes
|
71
|
+
# self.df = self.df[self.df.index.isin(self.data_df.index)]
|
72
|
+
# try:
|
73
|
+
# # use only some features
|
74
|
+
# selected_features = ast.literal_eval(
|
75
|
+
# glob_conf.config["FEATS"]["os.features"]
|
76
|
+
# )
|
77
|
+
# self.util.debug(f"selecting features from opensmile: {selected_features}")
|
78
|
+
# sel_feats_df = pd.DataFrame()
|
79
|
+
# hit = False
|
80
|
+
# for feat in selected_features:
|
81
|
+
# try:
|
82
|
+
# sel_feats_df[feat] = self.df[feat]
|
83
|
+
# hit = True
|
84
|
+
# except KeyError:
|
85
|
+
# pass
|
86
|
+
# if hit:
|
87
|
+
# self.df = sel_feats_df
|
88
|
+
# self.util.debug(
|
89
|
+
# "new feats shape after selecting opensmile features:"
|
90
|
+
# f" {self.df.shape}"
|
91
|
+
# )
|
92
|
+
# except KeyError:
|
93
|
+
# pass
|