nkululeko 0.93.15__py3-none-any.whl → 0.94.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/aug_train.py +13 -2
- nkululeko/constants.py +1 -1
- nkululeko/data/dataset.py +287 -36
- nkululeko/experiment.py +121 -17
- nkululeko/feat_extract/feats_opensmile copy.py +93 -0
- nkululeko/feat_extract/feats_opensmile.py +207 -60
- nkululeko/feat_extract/feats_trill.py +2 -2
- nkululeko/filter_data.py +3 -1
- nkululeko/modelrunner.py +23 -10
- nkululeko/models/model_mlp.py +2 -0
- nkululeko/nkululeko.py +0 -1
- nkululeko/plots.py +11 -2
- nkululeko/reporting/reporter.py +27 -39
- nkululeko/runmanager.py +53 -33
- nkululeko/scaler.py +41 -24
- nkululeko/utils/util.py +1 -1
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/METADATA +3 -2
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/RECORD +22 -21
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/WHEEL +1 -1
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info/licenses}/LICENSE +0 -0
- {nkululeko-0.93.15.dist-info → nkululeko-0.94.1.dist-info}/top_level.txt +0 -0
nkululeko/aug_train.py
CHANGED
@@ -7,10 +7,10 @@ import os.path
|
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
|
10
|
-
import nkululeko.experiment as exp
|
11
|
-
import nkululeko.glob_conf as glob_conf
|
12
10
|
from nkululeko.augment import doit as augment
|
13
11
|
from nkululeko.constants import VERSION
|
12
|
+
import nkululeko.experiment as exp
|
13
|
+
import nkululeko.glob_conf as glob_conf
|
14
14
|
from nkululeko.utils.util import Util
|
15
15
|
|
16
16
|
|
@@ -85,6 +85,17 @@ def doit(config_file):
|
|
85
85
|
|
86
86
|
|
87
87
|
def main(src_dir):
|
88
|
+
"""Entrypoint for the nkululeko framework.
|
89
|
+
|
90
|
+
This function parses command line arguments to determine the configuration file to use,
|
91
|
+
and then calls the `doit` function with the specified configuration file.
|
92
|
+
|
93
|
+
Args:
|
94
|
+
src_dir (str): The directory containing the source code.
|
95
|
+
|
96
|
+
Returns:
|
97
|
+
None
|
98
|
+
"""
|
88
99
|
parser = argparse.ArgumentParser(description="Call the nkululeko framework.")
|
89
100
|
parser.add_argument("--config", default="exp.ini", help="The base configuration")
|
90
101
|
args = parser.parse_args()
|
nkululeko/constants.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
VERSION="0.
|
1
|
+
VERSION="0.94.1"
|
2
2
|
SAMPLING_RATE = 16000
|
nkululeko/data/dataset.py
CHANGED
@@ -43,6 +43,7 @@ class Dataset:
|
|
43
43
|
False,
|
44
44
|
False,
|
45
45
|
)
|
46
|
+
self.split3 = eval(self.util.config_val("EXP", "traindevtest", "False"))
|
46
47
|
|
47
48
|
def _get_tables(self):
|
48
49
|
tables = []
|
@@ -463,20 +464,178 @@ class Dataset:
|
|
463
464
|
f" {self.df_train.shape[0]} samples in train"
|
464
465
|
)
|
465
466
|
|
467
|
+
def split_3(self):
    """Split the database into train, dev and test sets.

    The strategy is read from the dataset option ``split_strategy``
    (default ``speaker_split``).  Handled values are ``database``,
    ``train``, ``test``, ``dev``, ``specified``, ``balanced``,
    ``speaker_split``, ``random``, ``reuse`` or a literal list of speakers
    that is handed to ``assign_speakers``.

    The result is stored in ``self.df_train``, ``self.df_dev`` and
    ``self.df_test``.  Every non-empty split is passed through
    ``finish_up`` together with its pickle path below the ``store``
    directory.
    """
    store = self.util.get_path("store")
    storage_test = f"{store}{self.name}_testdf.pkl"
    storage_train = f"{store}{self.name}_traindf.pkl"
    storage_dev = f"{store}{self.name}_devdf.pkl"
    split_strategy = self.util.config_val_data(
        self.name, "split_strategy", "speaker_split"
    )
    self.util.debug(
        f"splitting database {self.name} into train/dev/test with strategy {split_strategy}"
    )

    # Reuse previously pickled splits unless a fresh start was requested
    # (never done for the speaker split).
    if split_strategy != "speaker_split" and not self.start_fresh:
        have_train = os.path.isfile(storage_train)
        have_test = os.path.isfile(storage_test)
        have_dev = os.path.isfile(storage_dev)
        if have_train and have_test and have_dev:
            self.util.debug(
                f"splits: reusing previously stored test file {storage_test}"
            )
            self.df_test = pd.read_pickle(storage_test)
            self.util.debug(
                f"splits: reusing previously stored train file {storage_train}"
            )
            self.df_train = pd.read_pickle(storage_train)
            self.util.debug(
                f"splits: reusing previously stored dev file {storage_dev}"
            )
            # The dev split must come from its own pickle, not the train one.
            self.df_dev = pd.read_pickle(storage_dev)
            return
        # Only one stored split is reused when the set is incomplete;
        # priority is train, then test, then dev.
        if have_train:
            self.util.debug(
                f"splits: reusing previously stored train file {storage_train}"
            )
            self.df_train = pd.read_pickle(storage_train)
            self.df_test = pd.DataFrame()
            self.df_dev = pd.DataFrame()
            return
        if have_test:
            self.util.debug(
                f"splits: reusing previously stored test file {storage_test}"
            )
            self.df_test = pd.read_pickle(storage_test)
            self.df_train = pd.DataFrame()
            self.df_dev = pd.DataFrame()
            return
        if have_dev:
            self.util.debug(
                f"splits: reusing previously stored dev file {storage_dev}"
            )
            self.df_dev = pd.read_pickle(storage_dev)
            self.df_train = pd.DataFrame()
            self.df_test = pd.DataFrame()
            return

    if split_strategy == "database":
        # Use the splits that ship with the database.
        testdf = self.db.tables[self.target + ".test"].df
        traindf = self.db.tables[self.target + ".train"].df
        devdf = self.db.tables[self.target + ".dev"].df
        # Keep only samples that survived earlier filtering steps.
        self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
        self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
        self.df_dev = self.df.loc[self.df.index.intersection(devdf.index)]
    elif split_strategy == "train":
        self.df_train = self.df
        self.df_test = pd.DataFrame()
        self.df_dev = pd.DataFrame()
    elif split_strategy == "test":
        self.df_test = self.df
        self.df_train = pd.DataFrame()
        self.df_dev = pd.DataFrame()
    elif split_strategy == "dev":
        self.df_dev = self.df
        self.df_train = pd.DataFrame()
        self.df_test = pd.DataFrame()
    elif split_strategy == "specified":
        traindf, testdf, devdf = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
        # Collect the tables named in the configuration for each split.
        entry_test_tables = self.util.config_val_data(
            self.name, "test_tables", False
        )
        if entry_test_tables:
            for test_table in ast.literal_eval(entry_test_tables):
                testdf = pd.concat([testdf, self.db.tables[test_table].df])
        entry_train_tables = self.util.config_val_data(
            self.name, "train_tables", False
        )
        if entry_train_tables:
            for train_table in ast.literal_eval(entry_train_tables):
                traindf = pd.concat([traindf, self.db.tables[train_table].df])
        entry_dev_tables = self.util.config_val_data(self.name, "dev_tables", False)
        if entry_dev_tables:
            for dev_table in ast.literal_eval(entry_dev_tables):
                devdf = pd.concat([devdf, self.db.tables[dev_table].df])
        # Convert each index to a segmented index before intersecting with self.df.
        testdf = testdf.set_index(
            audformat.utils.to_segmented_index(testdf.index, allow_nat=False)
        )
        traindf = traindf.set_index(
            audformat.utils.to_segmented_index(traindf.index, allow_nat=False)
        )
        devdf = devdf.set_index(
            audformat.utils.to_segmented_index(devdf.index, allow_nat=False)
        )
        self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
        self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
        self.df_dev = self.df.loc[self.df.index.intersection(devdf.index)]
        # It might be necessary to copy the target values.
        if not self.df_test.empty:
            self.df_test[self.target] = testdf[self.target]
        if not self.df_train.empty:
            self.df_train[self.target] = traindf[self.target]
        if not self.df_dev.empty:
            self.df_dev[self.target] = devdf[self.target]
    elif split_strategy == "balanced":
        self.balanced_split(with_dev=True)
    elif split_strategy == "speaker_split":
        self.split_speakers_3()
    elif split_strategy == "random":
        self.random_split_3()
    elif split_strategy == "reuse":
        self.util.debug(f"{self.name}: trying to reuse data splits")
        self.df_test = pd.read_pickle(storage_test)
        self.df_train = pd.read_pickle(storage_train)
        self.df_dev = pd.read_pickle(storage_dev)
    else:
        # Anything else is only valid if it is a literal list of speakers.
        # A string that is not a Python literal must end up in the error
        # branch instead of raising from literal_eval.
        try:
            speakers = ast.literal_eval(split_strategy)
        except (ValueError, SyntaxError):
            speakers = None
        if isinstance(speakers, list):
            # NOTE(review): df_dev is read below, so assign_speakers (or
            # earlier code) is presumably expected to have set it - confirm.
            self.assign_speakers(speakers)
        else:
            self.util.error(f"unknown split strategy: {split_strategy}")

    # Check if train or test set should be ignored.
    # NOTE(review): eval() runs on configuration text; only use trusted
    # configuration files.
    as_test = eval(self.util.config_val_data(self.name, "as_test", "False"))
    if as_test:
        self.df_train = pd.DataFrame()
        self.df_dev = pd.DataFrame()
    as_train = eval(self.util.config_val_data(self.name, "as_train", "False"))
    if as_train:
        self.df_test = pd.DataFrame()
        self.df_dev = pd.DataFrame()

    if self.df_test.shape[0] > 0:
        self.df_test = self.finish_up(self.df_test, storage_test)
    if self.df_train.shape[0] > 0:
        self.df_train = self.finish_up(self.df_train, storage_train)
    if self.df_dev.shape[0] > 0:
        self.df_dev = self.finish_up(self.df_dev, storage_dev)

    self.util.debug(
        f"{self.name}: {self.df_test.shape[0]} samples in test,"
        f" {self.df_dev.shape[0]} samples in dev and"
        f" {self.df_train.shape[0]} samples in train"
    )
|
625
|
+
|
466
626
|
def finish_up(self, df, storage):
|
467
|
-
"""
|
468
|
-
Bin target values if they are continuous but a classification experiment should be done
|
469
|
-
self.check_continuous_classification(df)
|
470
|
-
remember the splits for future use
|
471
|
-
"""
|
472
627
|
df.is_labeled = self.is_labeled
|
473
|
-
self.df_test.is_labeled = self.is_labeled
|
474
628
|
df.to_pickle(storage)
|
475
629
|
return df
|
476
630
|
|
477
|
-
def balanced_split(self):
|
478
|
-
"""
|
479
|
-
|
631
|
+
def balanced_split(self, with_dev=False):
|
632
|
+
"""Split train and eval sets: Generate dataframes for some balancing criterion."""
|
633
|
+
|
634
|
+
from splitutils import (
|
635
|
+
binning,
|
636
|
+
optimize_traintest_split,
|
637
|
+
optimize_traindevtest_split,
|
638
|
+
)
|
480
639
|
|
481
640
|
seed = 42
|
482
641
|
k = 30
|
@@ -521,24 +680,48 @@ class Dataset:
|
|
521
680
|
# find optimal test indices TEST_I in DF
|
522
681
|
# info: dict with goodness of split information
|
523
682
|
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
683
|
+
if with_dev:
|
684
|
+
train_i, dev_i, test_i, info = optimize_traindevtest_split(
|
685
|
+
X=df,
|
686
|
+
y=targets,
|
687
|
+
split_on=speakers,
|
688
|
+
stratify_on=stratif_vars_array,
|
689
|
+
weight=weights,
|
690
|
+
test_size=test_size,
|
691
|
+
k=k,
|
692
|
+
seed=seed,
|
693
|
+
)
|
694
|
+
self.util.debug(f"stratification info;\n{info}")
|
695
|
+
self.df_train = df.iloc[train_i]
|
696
|
+
self.df_test = df.iloc[test_i]
|
697
|
+
self.df_dev = df.iloc[dev_i]
|
698
|
+
msg = (
|
699
|
+
f"{self.name} (balanced split): "
|
700
|
+
f"[{self.df_train.shape[0]}/{self.df_dev.shape[0]}/{self.df_test.shape[0]}]"
|
701
|
+
" samples in train/dev/test"
|
702
|
+
)
|
703
|
+
self.util.debug(msg)
|
704
|
+
else:
|
705
|
+
train_i, test_i, info = optimize_traintest_split(
|
706
|
+
X=df,
|
707
|
+
y=targets,
|
708
|
+
split_on=speakers,
|
709
|
+
stratify_on=stratif_vars_array,
|
710
|
+
weight=weights,
|
711
|
+
test_size=test_size,
|
712
|
+
k=k,
|
713
|
+
seed=seed,
|
714
|
+
)
|
715
|
+
self.util.debug(f"stratification info;\n{info}")
|
716
|
+
self.df_train = df.iloc[train_i]
|
717
|
+
self.df_test = df.iloc[test_i]
|
718
|
+
msg = (
|
719
|
+
f"{self.name} (balanced split): "
|
720
|
+
f"[{self.df_train.shape[0]}/{self.df_test.shape[0]}] samples in train/test"
|
721
|
+
)
|
722
|
+
self.util.debug(msg)
|
723
|
+
# because this generates new train/test sample quantaties,
|
724
|
+
# the feature extraction has to be done again
|
542
725
|
glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
|
543
726
|
|
544
727
|
def assign_speakers(self, speakers):
|
@@ -548,10 +731,11 @@ class Dataset:
|
|
548
731
|
self.util.error(f"no speakers found in {speakers}")
|
549
732
|
self.df_train = self.df[~self.df.index.isin(self.df_test.index)]
|
550
733
|
self.util.debug(
|
551
|
-
f"{self.name} (speakers assigned):
|
552
|
-
" samples in train/test"
|
734
|
+
f"{self.name} (speakers assigned): "
|
735
|
+
f"[{self.df_train.shape[0]}/{self.df_test.shape[0]}] samples in train/test"
|
553
736
|
)
|
554
|
-
# because this generates new train/test sample quantaties,
|
737
|
+
# because this generates new train/test sample quantaties,
|
738
|
+
# the feature extraction has to be done again
|
555
739
|
glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
|
556
740
|
|
557
741
|
def split_speakers(self):
|
@@ -563,18 +747,49 @@ class Dataset:
|
|
563
747
|
test_spkrs = sample(list(df.speaker.unique()), test_num)
|
564
748
|
self.df_test = df[df.speaker.isin(test_spkrs)]
|
565
749
|
self.df_train = df[~df.index.isin(self.df_test.index)]
|
566
|
-
|
567
|
-
f"{self.name} (speaker
|
568
|
-
"
|
750
|
+
msg = (
|
751
|
+
f"{self.name} (speaker splits): "
|
752
|
+
f"[{self.df_train.shape[0]}/{self.df_test.shape[0]}]"
|
753
|
+
" samples in train/dev/test"
|
569
754
|
)
|
570
|
-
|
755
|
+
self.util.debug(msg)
|
756
|
+
# because this generates new train/test sample quantaties,
|
757
|
+
# the feature extraction has to be done again
|
758
|
+
try:
|
759
|
+
glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
|
760
|
+
except KeyError:
|
761
|
+
pass
|
762
|
+
|
763
|
+
def split_speakers_3(self):
    """Split into train, dev and test sets by randomly chosen speakers.

    ``test_size`` and ``dev_size`` (dataset options, default 20 each) are
    percentages of the number of distinct speakers in ``self.df``.  The
    chosen speakers are disjoint, so no speaker appears in more than one
    of ``self.df_train``, ``self.df_dev`` and ``self.df_test``.
    """
    test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
    dev_percent = int(self.util.config_val_data(self.name, "dev_size", 20))
    df = self.df
    s_num = df.speaker.nunique()
    test_num = int(s_num * (test_percent / 100))
    dev_num = int(s_num * (dev_percent / 100))
    # Draw the held-out speakers once, then carve the test speakers out of
    # that pool; whatever remains of the pool is the dev set.
    testdev_spkrs = sample(list(df.speaker.unique()), test_num + dev_num)
    test_spkrs = sample(testdev_spkrs, test_num)
    test_lookup = set(test_spkrs)  # constant-time membership checks
    dev_spkrs = [spkr for spkr in testdev_spkrs if spkr not in test_lookup]
    self.df_test = df[df.speaker.isin(test_spkrs)]
    self.df_dev = df[df.speaker.isin(dev_spkrs)]
    self.df_train = df[~df.speaker.isin(testdev_spkrs)]
    msg = (
        f"{self.name} (speaker splits): "
        f"[{self.df_train.shape[0]}/{self.df_dev.shape[0]}/{self.df_test.shape[0]}]"
        " samples in train/dev/test"
    )
    self.util.debug(msg)
    # Because this generates new train/dev/test sample quantities,
    # the feature extraction has to be done again.
    try:
        glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
    except KeyError:
        # No FEATS section in the configuration: nothing to flag.
        pass
|
575
790
|
|
576
791
|
def random_split(self):
|
577
|
-
"""One way to split train and eval sets: Specify percentage of random samples"""
|
792
|
+
"""One way to split train and eval sets: Specify percentage of random samples."""
|
578
793
|
test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
|
579
794
|
df = self.df
|
580
795
|
s_num = len(df)
|
@@ -586,7 +801,32 @@ class Dataset:
|
|
586
801
|
f"{self.name}: [{self.df_train.shape[0]}/{self.df_test.shape[0]}]"
|
587
802
|
" samples in train/test"
|
588
803
|
)
|
589
|
-
# because this generates new train/test sample quantaties,
|
804
|
+
# because this generates new train/test sample quantaties,
|
805
|
+
# the feature extraction has to be done again
|
806
|
+
glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
|
807
|
+
|
808
|
+
def random_split_3(self):
    """Split into train, dev and test sets by randomly chosen samples.

    ``test_size`` and ``dev_size`` (dataset options, default 20 each) are
    percentages of the number of rows in ``self.df``.  Rows are selected
    by their index value, and the three resulting frames
    ``self.df_train``, ``self.df_dev`` and ``self.df_test`` are disjoint.
    """
    test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
    dev_percent = int(self.util.config_val_data(self.name, "dev_size", 20))
    df = self.df
    s_num = len(df)
    test_num = int(s_num * (test_percent / 100))
    dev_num = int(s_num * (dev_percent / 100))
    # Draw the held-out index values once, then carve the test samples out
    # of that pool; whatever remains of the pool is the dev set.
    testdev_smpls = sample(list(df.index), test_num + dev_num)
    test_smpls = sample(testdev_smpls, test_num)
    test_lookup = set(test_smpls)  # constant-time membership checks
    dev_smpls = [smpl for smpl in testdev_smpls if smpl not in test_lookup]
    # The drawn values are index entries, so the rows have to be selected
    # via the index (not via the speaker column).
    self.df_test = df[df.index.isin(test_smpls)]
    self.df_dev = df[df.index.isin(dev_smpls)]
    self.df_train = df[~df.index.isin(testdev_smpls)]
    msg = (
        f"{self.name} (sample splits): "
        f"[{self.df_train.shape[0]}/{self.df_dev.shape[0]}/{self.df_test.shape[0]}]"
        " samples in train/dev/test"
    )
    self.util.debug(msg)
    # Because this generates new train/dev/test sample quantities,
    # the feature extraction has to be done again.
    glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
|
591
831
|
|
592
832
|
def _add_labels(self, df):
|
@@ -621,6 +861,10 @@ class Dataset:
|
|
621
861
|
self.map_continuous_classification(self.df_test)
|
622
862
|
self.df_train = self._add_labels(self.df_train)
|
623
863
|
self.df_test = self._add_labels(self.df_test)
|
864
|
+
if self.split3:
|
865
|
+
self.df_dev = self.map_labels(self.df_dev)
|
866
|
+
self.map_continuous_classification(self.df_dev)
|
867
|
+
self.df_dev = self._add_labels(self.df_dev)
|
624
868
|
if self.util.config_val_data(self.name, "value_counts", False):
|
625
869
|
if not self.got_gender or not self.got_speaker:
|
626
870
|
self.util.error(
|
@@ -639,6 +883,13 @@ class Dataset:
|
|
639
883
|
self.target,
|
640
884
|
f"{self.name}_test_distplot",
|
641
885
|
)
|
886
|
+
if self.split3:
|
887
|
+
self.plot.describe_df(
|
888
|
+
self.name,
|
889
|
+
self.df_dev,
|
890
|
+
self.target,
|
891
|
+
f"{self.name}_dev_distplot",
|
892
|
+
)
|
642
893
|
|
643
894
|
def map_labels(self, df):
|
644
895
|
pd.options.mode.chained_assignment = None
|