nkululeko 0.93.15__py3-none-any.whl → 0.94.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nkululeko/aug_train.py CHANGED
@@ -7,10 +7,10 @@ import os.path
7
7
 
8
8
  import numpy as np
9
9
 
10
- import nkululeko.experiment as exp
11
- import nkululeko.glob_conf as glob_conf
12
10
  from nkululeko.augment import doit as augment
13
11
  from nkululeko.constants import VERSION
12
+ import nkululeko.experiment as exp
13
+ import nkululeko.glob_conf as glob_conf
14
14
  from nkululeko.utils.util import Util
15
15
 
16
16
 
@@ -85,6 +85,17 @@ def doit(config_file):
85
85
 
86
86
 
87
87
  def main(src_dir):
88
+ """Entrypoint for the nkululeko framework.
89
+
90
+ This function parses command line arguments to determine the configuration file to use,
91
+ and then calls the `doit` function with the specified configuration file.
92
+
93
+ Args:
94
+ src_dir (str): The directory containing the source code.
95
+
96
+ Returns:
97
+ None
98
+ """
88
99
  parser = argparse.ArgumentParser(description="Call the nkululeko framework.")
89
100
  parser.add_argument("--config", default="exp.ini", help="The base configuration")
90
101
  args = parser.parse_args()
nkululeko/constants.py CHANGED
@@ -1,2 +1,2 @@
1
- VERSION="0.93.15"
1
+ VERSION="0.94.0"
2
2
  SAMPLING_RATE = 16000
nkululeko/data/dataset.py CHANGED
@@ -43,6 +43,7 @@ class Dataset:
43
43
  False,
44
44
  False,
45
45
  )
46
+ self.split3 = eval(self.util.config_val("EXP", "traindevtest", "False"))
46
47
 
47
48
  def _get_tables(self):
48
49
  tables = []
@@ -463,20 +464,178 @@ class Dataset:
463
464
  f" {self.df_train.shape[0]} samples in train"
464
465
  )
465
466
 
467
+ def split_3(self):
468
+ """Split the database into train, test and dev set."""
469
+ store = self.util.get_path("store")
470
+ storage_test = f"{store}{self.name}_testdf.pkl"
471
+ storage_train = f"{store}{self.name}_traindf.pkl"
472
+ storage_dev = f"{store}{self.name}_devdf.pkl"
473
+ split_strategy = self.util.config_val_data(
474
+ self.name, "split_strategy", "speaker_split"
475
+ )
476
+ self.util.debug(
477
+ f"splitting database {self.name} into train/dev/test with strategy {split_strategy}"
478
+ )
479
+ # 'database' (default), 'speaker_split', 'specified', 'reuse'
480
+ if split_strategy != "speaker_split" and not self.start_fresh:
481
+ # check if the splits have been computed previously (not for speaker split)
482
+ if (
483
+ os.path.isfile(storage_train)
484
+ and os.path.isfile(storage_test)
485
+ and os.path.isfile(storage_dev)
486
+ ):
487
+ # if self.util.config_val_data(self.name, 'test_tables', False):
488
+ self.util.debug(
489
+ "splits: reusing previously stored test file" f" {storage_test}"
490
+ )
491
+ self.df_test = pd.read_pickle(storage_test)
492
+ self.util.debug(
493
+ "splits: reusing previously stored train file" f" {storage_train}"
494
+ )
495
+ self.df_train = pd.read_pickle(storage_train)
496
+ self.util.debug(
497
+ "splits: reusing previously stored dev file" f" {storage_dev}"
498
+ )
499
+ self.df_dev = pd.read_pickle(storage_train)
500
+ return
501
+ elif os.path.isfile(storage_train):
502
+ self.util.debug(
503
+ "splits: reusing previously stored train file" f" {storage_train}"
504
+ )
505
+ self.df_train = pd.read_pickle(storage_train)
506
+ self.df_test = pd.DataFrame()
507
+ self.df_dev = pd.DataFrame()
508
+ return
509
+ elif os.path.isfile(storage_test):
510
+ self.util.debug(
511
+ "splits: reusing previously stored test file" f" {storage_test}"
512
+ )
513
+ self.df_test = pd.read_pickle(storage_test)
514
+ self.df_train = pd.DataFrame()
515
+ self.df_dev = pd.DataFrame()
516
+ return
517
+ elif os.path.isfile(storage_dev):
518
+ self.util.debug(
519
+ "splits: reusing previously stored dev file" f" {storage_dev}"
520
+ )
521
+ self.df_dev = pd.read_pickle(storage_dev)
522
+ self.df_train = pd.DataFrame()
523
+ self.df_test = pd.DataFrame()
524
+ return
525
+ if split_strategy == "database":
526
+ # use the splits from the database
527
+ testdf = self.db.tables[self.target + ".test"].df
528
+ traindf = self.db.tables[self.target + ".train"].df
529
+ devdf = self.db.tables[self.target + ".dev"].df
530
+ # use only the train and test samples that were not perhaps filtered out by an earlier processing step
531
+ self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
532
+ self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
533
+ self.df_dev = self.df.loc[self.df.index.intersection(devdf.index)]
534
+ elif split_strategy == "train":
535
+ self.df_train = self.df
536
+ self.df_test = pd.DataFrame()
537
+ self.df_dev = pd.DataFrame()
538
+ elif split_strategy == "test":
539
+ self.df_test = self.df
540
+ self.df_train = pd.DataFrame()
541
+ self.df_dev = pd.DataFrame()
542
+ elif split_strategy == "dev":
543
+ self.df_dev = self.df
544
+ self.df_train = pd.DataFrame()
545
+ self.df_test = pd.DataFrame()
546
+ elif split_strategy == "specified":
547
+ traindf, testdf, devdf = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
548
+ # try to load some dataframes for testing
549
+ entry_test_tables = self.util.config_val_data(
550
+ self.name, "test_tables", False
551
+ )
552
+ if entry_test_tables:
553
+ test_tables = ast.literal_eval(entry_test_tables)
554
+ for test_table in test_tables:
555
+ testdf = pd.concat([testdf, self.db.tables[test_table].df])
556
+ entry_train_tables = self.util.config_val_data(
557
+ self.name, "train_tables", False
558
+ )
559
+ if entry_train_tables:
560
+ train_tables = ast.literal_eval(entry_train_tables)
561
+ for train_table in train_tables:
562
+ traindf = pd.concat([traindf, self.db.tables[train_table].df])
563
+ entry_dev_tables = self.util.config_val_data(self.name, "dev_tables", False)
564
+ if entry_dev_tables:
565
+ dev_tables = ast.literal_eval(entry_dev_tables)
566
+ for dev_table in dev_tables:
567
+ devdf = pd.concat([devdf, self.db.tables[dev_table].df])
568
+ testdf = testdf.set_index(
569
+ audformat.utils.to_segmented_index(testdf.index, allow_nat=False)
570
+ )
571
+ traindf = traindf.set_index(
572
+ audformat.utils.to_segmented_index(traindf.index, allow_nat=False)
573
+ )
574
+ devdf = devdf.set_index(
575
+ audformat.utils.to_segmented_index(devdf.index, allow_nat=False)
576
+ )
577
+ self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
578
+ self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
579
+ self.df_dev = self.df.loc[self.df.index.intersection(devdf.index)]
580
+ # it might be necessary to copy the target values
581
+ if not self.df_test.empty:
582
+ self.df_test[self.target] = testdf[self.target]
583
+ if not self.df_train.empty:
584
+ self.df_train[self.target] = traindf[self.target]
585
+ if not self.df_dev.empty:
586
+ self.df_dev[self.target] = devdf[self.target]
587
+ elif split_strategy == "balanced":
588
+ self.balanced_split(with_dev=True)
589
+ elif split_strategy == "speaker_split":
590
+ self.split_speakers_3()
591
+ elif split_strategy == "random":
592
+ self.random_split_3()
593
+ elif split_strategy == "reuse":
594
+ self.util.debug(f"{self.name}: trying to reuse data splits")
595
+ self.df_test = pd.read_pickle(storage_test)
596
+ self.df_train = pd.read_pickle(storage_train)
597
+ self.df_dev = pd.read_pickle(storage_dev)
598
+ elif isinstance(ast.literal_eval(split_strategy), list):
599
+ # treat this as a list of test speakers
600
+ self.assign_speakers(ast.literal_eval(split_strategy))
601
+ else:
602
+ self.util.error(f"unknown split strategy: {split_strategy}")
603
+
604
+ # check if train or test set should be ignored
605
+ as_test = eval(self.util.config_val_data(self.name, "as_test", "False"))
606
+ if as_test:
607
+ self.df_train = pd.DataFrame()
608
+ self.df_dev = pd.DataFrame()
609
+ as_train = eval(self.util.config_val_data(self.name, "as_train", "False"))
610
+ if as_train:
611
+ self.df_test = pd.DataFrame()
612
+ self.df_dev = pd.DataFrame()
613
+
614
+ if self.df_test.shape[0] > 0:
615
+ self.df_test = self.finish_up(self.df_test, storage_test)
616
+ if self.df_train.shape[0] > 0:
617
+ self.df_train = self.finish_up(self.df_train, storage_train)
618
+ if self.df_dev.shape[0] > 0:
619
+ self.df_dev = self.finish_up(self.df_dev, storage_dev)
620
+
621
+ self.util.debug(
622
+ f"{self.name}: {self.df_test.shape[0]} samples in test and"
623
+ f" {self.df_train.shape[0]} samples in train"
624
+ )
625
+
466
626
  def finish_up(self, df, storage):
467
- """
468
- Bin target values if they are continuous but a classification experiment should be done
469
- self.check_continuous_classification(df)
470
- remember the splits for future use
471
- """
472
627
  df.is_labeled = self.is_labeled
473
- self.df_test.is_labeled = self.is_labeled
474
628
  df.to_pickle(storage)
475
629
  return df
476
630
 
477
- def balanced_split(self):
478
- """One way to split train and eval sets: Generate split dataframes for some balancing criterion"""
479
- from splitutils import binning, optimize_traintest_split
631
+ def balanced_split(self, with_dev=False):
632
+ """Split train and eval sets: Generate dataframes for some balancing criterion."""
633
+
634
+ from splitutils import (
635
+ binning,
636
+ optimize_traintest_split,
637
+ optimize_traindevtest_split,
638
+ )
480
639
 
481
640
  seed = 42
482
641
  k = 30
@@ -521,24 +680,48 @@ class Dataset:
521
680
  # find optimal test indices TEST_I in DF
522
681
  # info: dict with goodness of split information
523
682
 
524
- train_i, test_i, info = optimize_traintest_split(
525
- X=df,
526
- y=targets,
527
- split_on=speakers,
528
- stratify_on=stratif_vars_array,
529
- weight=weights,
530
- test_size=test_size,
531
- k=k,
532
- seed=seed,
533
- )
534
- self.util.debug(f"stratification info;\n{info}")
535
- self.df_train = df.iloc[train_i]
536
- self.df_test = df.iloc[test_i]
537
- self.util.debug(
538
- f"{self.name} (balanced split): [{self.df_train.shape[0]}/{self.df_test.shape[0]}]"
539
- " samples in train/test"
540
- )
541
- # because this generates new train/test sample quantaties, the feature extraction has to be done again
683
+ if with_dev:
684
+ train_i, dev_i, test_i, info = optimize_traindevtest_split(
685
+ X=df,
686
+ y=targets,
687
+ split_on=speakers,
688
+ stratify_on=stratif_vars_array,
689
+ weight=weights,
690
+ test_size=test_size,
691
+ k=k,
692
+ seed=seed,
693
+ )
694
+ self.util.debug(f"stratification info;\n{info}")
695
+ self.df_train = df.iloc[train_i]
696
+ self.df_test = df.iloc[test_i]
697
+ self.df_dev = df.iloc[dev_i]
698
+ msg = (
699
+ f"{self.name} (balanced split): "
700
+ f"[{self.df_train.shape[0]}/{self.df_dev.shape[0]}/{self.df_test.shape[0]}]"
701
+ " samples in train/dev/test"
702
+ )
703
+ self.util.debug(msg)
704
+ else:
705
+ train_i, test_i, info = optimize_traintest_split(
706
+ X=df,
707
+ y=targets,
708
+ split_on=speakers,
709
+ stratify_on=stratif_vars_array,
710
+ weight=weights,
711
+ test_size=test_size,
712
+ k=k,
713
+ seed=seed,
714
+ )
715
+ self.util.debug(f"stratification info;\n{info}")
716
+ self.df_train = df.iloc[train_i]
717
+ self.df_test = df.iloc[test_i]
718
+ msg = (
719
+ f"{self.name} (balanced split): "
720
+ f"[{self.df_train.shape[0]}/{self.df_test.shape[0]}] samples in train/test"
721
+ )
722
+ self.util.debug(msg)
723
+ # because this generates new train/test sample quantaties,
724
+ # the feature extraction has to be done again
542
725
  glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
543
726
 
544
727
  def assign_speakers(self, speakers):
@@ -548,10 +731,11 @@ class Dataset:
548
731
  self.util.error(f"no speakers found in {speakers}")
549
732
  self.df_train = self.df[~self.df.index.isin(self.df_test.index)]
550
733
  self.util.debug(
551
- f"{self.name} (speakers assigned): [{self.df_train.shape[0]}/{self.df_test.shape[0]}]"
552
- " samples in train/test"
734
+ f"{self.name} (speakers assigned): "
735
+ f"[{self.df_train.shape[0]}/{self.df_test.shape[0]}] samples in train/test"
553
736
  )
554
- # because this generates new train/test sample quantaties, the feature extraction has to be done again
737
+ # because this generates new train/test sample quantaties,
738
+ # the feature extraction has to be done again
555
739
  glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
556
740
 
557
741
  def split_speakers(self):
@@ -563,18 +747,49 @@ class Dataset:
563
747
  test_spkrs = sample(list(df.speaker.unique()), test_num)
564
748
  self.df_test = df[df.speaker.isin(test_spkrs)]
565
749
  self.df_train = df[~df.index.isin(self.df_test.index)]
566
- self.util.debug(
567
- f"{self.name} (speaker split): [{self.df_train.shape[0]}/{self.df_test.shape[0]}]"
568
- " samples in train/test"
750
+ msg = (
751
+ f"{self.name} (speaker splits): "
752
+ f"[{self.df_train.shape[0]}/{self.df_test.shape[0]}]"
753
+ " samples in train/dev/test"
569
754
  )
570
- # because this generates new train/test sample quantaties, the feature extraction has to be done again
755
+ self.util.debug(msg)
756
+ # because this generates new train/test sample quantaties,
757
+ # the feature extraction has to be done again
758
+ try:
759
+ glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
760
+ except KeyError:
761
+ pass
762
+
763
+ def split_speakers_3(self):
764
+ """One way to split train, dev and test sets: Specify percentage speakers."""
765
+ test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
766
+ dev_percent = int(self.util.config_val_data(self.name, "dev_size", 20))
767
+ df = self.df
768
+ s_num = df.speaker.nunique()
769
+ test_num = int(s_num * (test_percent / 100))
770
+ dev_num = int(s_num * (dev_percent / 100))
771
+ testdev_spkrs = sample(list(df.speaker.unique()), test_num + dev_num)
772
+ # sample from testdev speakers for test and dev
773
+ test_spkrs = sample(testdev_spkrs, test_num)
774
+ dev_spkrs = [spkr for spkr in testdev_spkrs if spkr not in test_spkrs]
775
+ self.df_test = df[df.speaker.isin(test_spkrs)]
776
+ self.df_dev = df[df.speaker.isin(dev_spkrs)]
777
+ self.df_train = df[~df.speaker.isin(testdev_spkrs)]
778
+ msg = (
779
+ f"{self.name} (speaker splits): "
780
+ f"[{self.df_train.shape[0]}/{self.df_dev.shape[0]}/{self.df_test.shape[0]}]"
781
+ " samples in train/dev/test"
782
+ )
783
+ self.util.debug(msg)
784
+ # because this generates new train/test sample quantaties,
785
+ # the feature extraction has to be done again
571
786
  try:
572
787
  glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
573
788
  except KeyError:
574
789
  pass
575
790
 
576
791
  def random_split(self):
577
- """One way to split train and eval sets: Specify percentage of random samples"""
792
+ """One way to split train and eval sets: Specify percentage of random samples."""
578
793
  test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
579
794
  df = self.df
580
795
  s_num = len(df)
@@ -586,7 +801,32 @@ class Dataset:
586
801
  f"{self.name}: [{self.df_train.shape[0]}/{self.df_test.shape[0]}]"
587
802
  " samples in train/test"
588
803
  )
589
- # because this generates new train/test sample quantaties, the feature extraction has to be done again
804
+ # because this generates new train/test sample quantaties,
805
+ # the feature extraction has to be done again
806
+ glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
807
+
808
+ def random_split_3(self):
809
+ """One way to split train, dev and test sets: Specify random samples."""
810
+ test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
811
+ dev_percent = int(self.util.config_val_data(self.name, "dev_size", 20))
812
+ df = self.df
813
+ s_num = len(df)
814
+ test_num = int(s_num * (test_percent / 100))
815
+ dev_num = int(s_num * (dev_percent / 100))
816
+ testdev_smpls = sample(list(df.index), test_num + dev_num)
817
+ test_smpls = sample(testdev_smpls, test_num)
818
+ dev_smpls = [spkr for spkr in testdev_smpls if spkr not in test_smpls]
819
+ self.df_test = df[df.speaker.isin(test_smpls)]
820
+ self.df_dev = df[df.speaker.isin(dev_smpls)]
821
+ self.df_train = df[~df.speaker.isin(testdev_smpls)]
822
+ msg = (
823
+ f"{self.name} (sample splits): "
824
+ f"[{self.df_train.shape[0]}/{self.df_dev.shape[0]}/{self.df_test.shape[0]}]"
825
+ " samples in train/dev/test"
826
+ )
827
+ self.util.debug(msg)
828
+ # because this generates new train/test sample quantaties,
829
+ # the feature extraction has to be done again
590
830
  glob_conf.config["FEATS"]["needs_feature_extraction"] = "True"
591
831
 
592
832
  def _add_labels(self, df):
@@ -621,6 +861,10 @@ class Dataset:
621
861
  self.map_continuous_classification(self.df_test)
622
862
  self.df_train = self._add_labels(self.df_train)
623
863
  self.df_test = self._add_labels(self.df_test)
864
+ if self.split3:
865
+ self.df_dev = self.map_labels(self.df_dev)
866
+ self.map_continuous_classification(self.df_dev)
867
+ self.df_dev = self._add_labels(self.df_dev)
624
868
  if self.util.config_val_data(self.name, "value_counts", False):
625
869
  if not self.got_gender or not self.got_speaker:
626
870
  self.util.error(
@@ -639,6 +883,13 @@ class Dataset:
639
883
  self.target,
640
884
  f"{self.name}_test_distplot",
641
885
  )
886
+ if self.split3:
887
+ self.plot.describe_df(
888
+ self.name,
889
+ self.df_dev,
890
+ self.target,
891
+ f"{self.name}_dev_distplot",
892
+ )
642
893
 
643
894
  def map_labels(self, df):
644
895
  pd.options.mode.chained_assignment = None