nkululeko 0.93.15__py3-none-any.whl → 0.94.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nkululeko/experiment.py CHANGED
@@ -43,6 +43,7 @@ class Experiment:
43
43
  audeer.mkdir(self.data_dir) # create the experiment directory
44
44
  self.util = Util("experiment")
45
45
  glob_conf.set_util(self.util)
46
+ self.split3 = eval(self.util.config_val("EXP", "traindevtest", "False"))
46
47
  fresh_report = eval(self.util.config_val("REPORT", "fresh", "False"))
47
48
  if not fresh_report:
48
49
  try:
@@ -75,11 +76,11 @@ class Experiment:
75
76
  # return self.util.get_exp_name()
76
77
 
77
78
  def set_globals(self, config_obj):
78
- """install a config object in the global space"""
79
+ """Install a config object in the global space."""
79
80
  glob_conf.init_config(config_obj)
80
81
 
81
82
  def load_datasets(self):
82
- """Load all databases specified in the configuration and map the labels"""
83
+ """Load all databases specified in the configuration and map the labels."""
83
84
  ds = ast.literal_eval(glob_conf.config["DATA"]["databases"])
84
85
  self.datasets = {}
85
86
  self.got_speaker, self.got_gender, self.got_age = False, False, False
@@ -186,6 +187,10 @@ class Experiment:
186
187
  store = self.util.get_path("store")
187
188
  storage_test = f"{store}testdf.csv"
188
189
  storage_train = f"{store}traindf.csv"
190
+ self.df_dev = None
191
+ self.feats_dev = None
192
+ if self.split3:
193
+ storage_dev = f"{store}devdf.csv"
189
194
  start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
190
195
  if (
191
196
  os.path.isfile(storage_train)
@@ -199,10 +204,20 @@ class Experiment:
199
204
  self.df_train = self._import_csv(storage_train)
200
205
  self.train_empty = True if self.df_train.shape[0] == 0 else False
201
206
  self.test_empty = True if self.df_test.shape[0] == 0 else False
207
+ if self.split3:
208
+ self.df_dev = self._import_csv(storage_dev)
209
+ self.dev_empty = True if self.df_dev.shape[0] == 0 else False
202
210
  else:
203
211
  self.df_train, self.df_test = pd.DataFrame(), pd.DataFrame()
212
+ if self.split3:
213
+ self.df_dev = pd.DataFrame()
214
+ else:
215
+ self.df_dev = None
204
216
  for d in self.datasets.values():
205
- d.split()
217
+ if self.split3:
218
+ d.split_3()
219
+ else:
220
+ d.split()
206
221
  if self.target != "none":
207
222
  d.prepare_labels()
208
223
  if d.df_train.shape[0] == 0:
@@ -214,23 +229,38 @@ class Experiment:
214
229
  self.util.debug(f"warn: {d.name} test empty")
215
230
  self.df_test = pd.concat([self.df_test, d.df_test])
216
231
  self.util.copy_flags(d, self.df_test)
232
+ if self.split3:
233
+ if d.df_dev.shape[0] == 0:
234
+ self.util.debug(f"warn: {d.name} dev empty")
235
+ self.df_dev = pd.concat([self.df_dev, d.df_dev])
236
+ self.util.copy_flags(d, self.df_dev)
217
237
  self.train_empty = True if self.df_train.shape[0] == 0 else False
218
238
  self.test_empty = True if self.df_test.shape[0] == 0 else False
239
+ if self.split3:
240
+ self.dev_empty = True if self.df_dev.shape[0] == 0 else False
219
241
  store = self.util.get_path("store")
220
242
  storage_test = f"{store}testdf.csv"
221
243
  storage_train = f"{store}traindf.csv"
222
244
  self.df_test.to_csv(storage_test)
223
245
  self.df_train.to_csv(storage_train)
246
+ if self.split3:
247
+ storage_dev = f"{store}devdf.csv"
248
+ self.df_dev.to_csv(storage_dev)
224
249
 
225
250
  if self.target == "none":
226
251
  return
227
252
  self.util.copy_flags(self, self.df_test)
228
253
  self.util.copy_flags(self, self.df_train)
254
+ if self.split3:
255
+ self.util.copy_flags(self, self.df_dev)
229
256
  # Try data checks
230
257
  datachecker = FileChecker(self.df_train)
231
258
  self.df_train = datachecker.all_checks()
232
259
  datachecker.set_data(self.df_test)
233
260
  self.df_test = datachecker.all_checks()
261
+ if self.split3:
262
+ datachecker.set_data(self.df_dev)
263
+ self.df_dev = datachecker.all_checks()
234
264
 
235
265
  # Check for filters
236
266
  filter_sample_selection = self.util.config_val(
@@ -241,6 +271,9 @@ class Experiment:
241
271
  self.df_train = datafilter.all_filters()
242
272
  datafilter = DataFilter(self.df_test)
243
273
  self.df_test = datafilter.all_filters()
274
+ if self.split3:
275
+ datafilter = DataFilter(self.df_dev)
276
+ self.df_dev = datafilter.all_filters()
244
277
  elif filter_sample_selection == "train":
245
278
  datafilter = DataFilter(self.df_train)
246
279
  self.df_train = datafilter.all_filters()
@@ -248,10 +281,11 @@ class Experiment:
248
281
  datafilter = DataFilter(self.df_test)
249
282
  self.df_test = datafilter.all_filters()
250
283
  else:
251
- self.util.error(
284
+ msg = (
252
285
  "unkown filter sample selection specifier"
253
286
  f" {filter_sample_selection}, should be [all | train | test]"
254
287
  )
288
+ self.util.error(msg)
255
289
 
256
290
  # encode the labels
257
291
  if self.util.exp_is_classification():
@@ -261,6 +295,8 @@ class Experiment:
261
295
  test_cats = self.df_test["class_label"].unique()
262
296
  if not self.train_empty:
263
297
  train_cats = self.df_train["class_label"].unique()
298
+ if self.split3 and not self.dev_empty:
299
+ dev_cats = self.df_dev["class_label"].unique()
264
300
  else:
265
301
  if not self.test_empty:
266
302
  if self.df_test.is_labeled:
@@ -272,11 +308,13 @@ class Experiment:
272
308
  )
273
309
  if not self.train_empty:
274
310
  train_cats = self.df_train[self.target].unique()
311
+ if self.split3 and not self.dev_empty:
312
+ dev_cats = self.df_dev[self.target].unique()
275
313
  # encode the labels as numbers
276
314
  self.label_encoder = LabelEncoder()
277
315
  glob_conf.set_label_encoder(self.label_encoder)
278
316
  if not self.train_empty:
279
- if type(train_cats) == np.ndarray:
317
+ if isinstance(train_cats, np.ndarray):
280
318
  self.util.debug(f"Categories train (nd.array): {train_cats}")
281
319
  else:
282
320
  self.util.debug(f"Categories train (list): {list(train_cats)}")
@@ -286,7 +324,7 @@ class Experiment:
286
324
  )
287
325
  if not self.test_empty:
288
326
  if self.df_test.is_labeled:
289
- if type(test_cats) == np.ndarray:
327
+ if isinstance(test_cats, np.ndarray):
290
328
  self.util.debug(f"Categories test (nd.array): {test_cats}")
291
329
  else:
292
330
  self.util.debug(f"Categories test (list): {list(test_cats)}")
@@ -294,6 +332,15 @@ class Experiment:
294
332
  self.df_test[self.target] = self.label_encoder.transform(
295
333
  self.df_test[self.target]
296
334
  )
335
+ if self.split3 and not self.dev_empty:
336
+ if isinstance(dev_cats, np.ndarray):
337
+ self.util.debug(f"Categories dev (nd.array): {dev_cats}")
338
+ else:
339
+ self.util.debug(f"Categories dev (list): {list(dev_cats)}")
340
+ if not self.train_empty:
341
+ self.df_dev[self.target] = self.label_encoder.transform(
342
+ self.df_dev[self.target]
343
+ )
297
344
  if self.got_speaker:
298
345
  speakers_train = 0 if self.train_empty else self.df_train.speaker.nunique()
299
346
  speakers_test = 0 if self.test_empty else self.df_test.speaker.nunique()
@@ -301,6 +348,9 @@ class Experiment:
301
348
  f"{speakers_test} speakers in test and"
302
349
  f" {speakers_train} speakers in train"
303
350
  )
351
+ if self.split3:
352
+ speakers_dev = 0 if self.dev_empty else self.df_dev.speaker.nunique()
353
+ self.util.debug(f"{speakers_dev} speakers in dev")
304
354
 
305
355
  target_factor = self.util.config_val("DATA", "target_divide_by", False)
306
356
  if target_factor:
@@ -308,6 +358,10 @@ class Experiment:
308
358
  self.df_train[self.target] = self.df_train[self.target] / float(
309
359
  target_factor
310
360
  )
361
+ if self.split3:
362
+ self.df_dev[self.target] = self.df_dev[self.target] / float(
363
+ target_factor
364
+ )
311
365
  if not self.util.exp_is_classification():
312
366
  self.df_test["class_label"] = self.df_test["class_label"] / float(
313
367
  target_factor
@@ -315,7 +369,17 @@ class Experiment:
315
369
  self.df_train["class_label"] = self.df_train["class_label"] / float(
316
370
  target_factor
317
371
  )
318
-
372
+ if self.split3:
373
+ self.df_dev["class_label"] = self.df_dev["class_label"] / float(
374
+ target_factor
375
+ )
376
+ if self.split3:
377
+ shapes = f"{self.df_train.shape}/{self.df_dev.shape}/{self.df_test.shape}"
378
+ self.util.debug(f"train/dev/test shape: {shapes}")
379
+ else:
380
+ self.util.debug(
381
+ f"train/test shape: {self.df_train.shape}/{self.df_test.shape}"
382
+ )
319
383
  def _add_random_target(self, df):
320
384
  labels = glob_conf.labels
321
385
  a = [None] * len(df)
@@ -325,9 +389,11 @@ class Experiment:
325
389
  return df
326
390
 
327
391
  def plot_distribution(self, df_labels):
328
- """Plot the distribution of samples and speaker per target class and biological sex"""
392
+ """Plot the distribution of samples and speakers.
393
+
394
+ Per target class and biological sex.
395
+ """
329
396
  plot = Plots()
330
- sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
331
397
  plot.plot_distributions(df_labels)
332
398
  if self.got_speaker:
333
399
  plot.plot_distributions_speaker(df_labels)
@@ -351,8 +417,16 @@ class Experiment:
351
417
 
352
418
  """
353
419
  df_train, df_test = self.df_train, self.df_test
420
+ if self.split3:
421
+ df_dev = self.df_dev
422
+ else:
423
+ df_dev = None
354
424
  feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
355
425
  self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
426
+ if self.split3:
427
+ self.feats_dev = pd.DataFrame()
428
+ else:
429
+ self.feats_dev = None
356
430
  feats_types = self.util.config_val("FEATS", "type", "os")
357
431
  # Ensure feats_types is always a list of strings
358
432
  if isinstance(feats_types, str):
@@ -364,7 +438,6 @@ class Experiment:
364
438
  # for some models no features are needed
365
439
  if len(feats_types) == 0:
366
440
  self.util.debug("no feature extractor specified.")
367
- self.feats_train, self.feats_test = pd.DataFrame(), pd.DataFrame()
368
441
  return
369
442
  if not self.train_empty:
370
443
  self.feature_extractor = FeatureExtractor(
@@ -376,10 +449,19 @@ class Experiment:
376
449
  df_test, feats_types, feats_name, "test"
377
450
  )
378
451
  self.feats_test = self.feature_extractor.extract()
379
- self.util.debug(
380
- f"All features: train shape : {self.feats_train.shape}, test"
381
- f" shape:{self.feats_test.shape}"
382
- )
452
+ if self.split3:
453
+ if not self.dev_empty:
454
+ self.feature_extractor = FeatureExtractor(
455
+ df_dev, feats_types, feats_name, "dev"
456
+ )
457
+ self.feats_dev = self.feature_extractor.extract()
458
+ shps = f"{self.feats_train.shape}/{self.feats_dev.shape}/{self.feats_test.shape}"
459
+ self.util.debug(f"Train/dev/test features:{shps}")
460
+ else:
461
+ self.util.debug(
462
+ f"All features: train shape : {self.feats_train.shape}, test"
463
+ f" shape:{self.feats_test.shape}"
464
+ )
383
465
  if self.feats_train.shape[0] < self.df_train.shape[0]:
384
466
  self.util.warn(
385
467
  f"train feats ({self.feats_train.shape[0]}) != train labels"
@@ -396,6 +478,14 @@ class Experiment:
396
478
  )
397
479
  self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
398
480
  self.util.warn(f"new test labels shape: {self.df_test.shape[0]}")
481
+ if self.split3:
482
+ if self.feats_dev.shape[0] < self.df_dev.shape[0]:
483
+ self.util.warn(
484
+ f"dev feats ({self.feats_dev.shape[0]}) != dev labels"
485
+ f" ({self.df_dev.shape[0]})"
486
+ )
487
+ self.df_dev = self.df_dev[self.df_dev.index.isin(self.feats_dev.index)]
488
+ self.util.warn(f"new dev labels shape: {self.df_dev.shape[0]}")
399
489
 
400
490
  self._check_scale()
401
491
 
@@ -604,6 +694,8 @@ class Experiment:
604
694
  def _check_scale(self):
605
695
  self.util.save_to_store(self.feats_train, "feats_train")
606
696
  self.util.save_to_store(self.feats_test, "feats_test")
697
+ if self.split3:
698
+ self.util.save_to_store(self.feats_dev, "feats_dev")
607
699
  scale_feats = self.util.config_val("FEATS", "scale", False)
608
700
  # print the scale
609
701
  self.util.debug(f"scaler: {scale_feats}")
@@ -614,6 +706,8 @@ class Experiment:
614
706
  self.feats_train,
615
707
  self.feats_test,
616
708
  scale_feats,
709
+ dev_x=self.df_dev,
710
+ dev_y=self.feats_dev,
617
711
  )
618
712
  self.feats_train, self.feats_test = self.scaler_feats.scale()
619
713
  # store versions
@@ -622,9 +716,19 @@ class Experiment:
622
716
 
623
717
  def init_runmanager(self):
624
718
  """Initialize the manager object for the runs."""
625
- self.runmgr = Runmanager(
626
- self.df_train, self.df_test, self.feats_train, self.feats_test
627
- )
719
+ if self.split3:
720
+ self.runmgr = Runmanager(
721
+ self.df_train,
722
+ self.df_test,
723
+ self.feats_train,
724
+ self.feats_test,
725
+ dev_x=self.df_dev,
726
+ dev_y=self.feats_dev,
727
+ )
728
+ else:
729
+ self.runmgr = Runmanager(
730
+ self.df_train, self.df_test, self.feats_train, self.feats_test
731
+ )
628
732
 
629
733
  def run(self):
630
734
  """Do the runs."""
@@ -0,0 +1,93 @@
1
+ # opensmileset.py
2
+ import os
3
+
4
+ import opensmile
5
+ import pandas as pd
6
+
7
+ import nkululeko.glob_conf as glob_conf
8
+ from nkululeko.feat_extract.featureset import Featureset
9
+
10
+
11
class Opensmileset(Featureset):
    """Extract acoustic features with openSMILE (eGeMAPS, ComParE, ...).

    Reads the feature set and feature level from the [FEATS] section of the
    global configuration and extracts (or reloads from the store) a feature
    dataframe indexed like ``data_df``.
    """

    def __init__(self, name, data_df, feats_type=None, config_file=None):
        """Initialize the extractor.

        Args:
            name: name used for the on-disk feature store file.
            data_df: dataframe whose index addresses the audio segments.
            feats_type: optional feature-type tag, forwarded to Featureset.
            config_file: unused; kept for interface compatibility.
        """
        super().__init__(name, data_df, feats_type)
        self.featset = self.util.config_val("FEATS", "set", "eGeMAPSv02")
        try:
            # getattr avoids eval() on a config-supplied string; valid values
            # include eGeMAPSv02, ComParE_2016, GeMAPSv01a, eGeMAPSv01a.
            self.feature_set = getattr(opensmile.FeatureSet, self.featset)
        except AttributeError:
            self.util.error(f"something is wrong with feature set: {self.featset}")
        self.featlevel = self.util.config_val("FEATS", "level", "functionals")
        try:
            # Map the short config names to the openSMILE enum member names.
            self.featlevel = self.featlevel.replace("lld", "LowLevelDescriptors")
            self.featlevel = self.featlevel.replace("functionals", "Functionals")
            self.feature_level = getattr(opensmile.FeatureLevel, self.featlevel)
        except AttributeError:
            self.util.error(f"something is wrong with feature level: {self.featlevel}")

    def extract(self):
        """Extract the features based on the initialized dataset or re-open them when found on disk.

        Populates ``self.df`` with one row of features per entry in
        ``self.data_df``; results are cached in the experiment store.
        """
        store = self.util.get_path("store")
        store_format = self.util.config_val("FEATS", "store_format", "pkl")
        storage = f"{store}{self.name}.{store_format}"
        extract = eval(
            self.util.config_val("FEATS", "needs_feature_extraction", "False")
        )
        no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
        if extract or not os.path.isfile(storage) or no_reuse:
            self.util.debug("extracting openSmile features, this might take a while...")
            smile = opensmile.Smile(
                feature_set=self.feature_set,
                feature_level=self.feature_level,
                num_workers=self.n_jobs,
                verbose=True,
            )
            if isinstance(self.data_df.index, pd.MultiIndex):
                # segmented index (file, start, end): process and re-attach it
                self.df = smile.process_index(self.data_df.index)
                self.df = self.df.set_index(self.data_df.index)
            else:
                # plain file index: drop the start/end levels openSMILE adds
                self.df = smile.process_files(self.data_df.index)
                self.df.index = self.df.index.droplevel(1)
                self.df.index = self.df.index.droplevel(1)
            self.util.write_store(self.df, storage, store_format)
            try:
                glob_conf.config["DATA"]["needs_feature_extraction"] = "False"
            except KeyError:
                # config has no DATA section here; nothing to reset
                pass
        else:
            self.util.debug(f"reusing extracted OS features: {storage}.")
            self.df = self.util.get_store(storage, store_format)

    def extract_sample(self, signal, sr):
        """Extract functionals for a single in-memory signal.

        Args:
            signal: audio samples.
            sr: sampling rate in Hz.

        Returns:
            numpy array of functional feature values.
        """
        smile = opensmile.Smile(
            feature_set=self.feature_set,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
        feats = smile.process_signal(signal, sr)
        return feats.to_numpy()

    # def filter(self):
    #     # use only the features that are indexed in the target dataframes
    #     self.df = self.df[self.df.index.isin(self.data_df.index)]
    #     try:
    #         # use only some features
    #         selected_features = ast.literal_eval(
    #             glob_conf.config["FEATS"]["os.features"]
    #         )
    #         self.util.debug(f"selecting features from opensmile: {selected_features}")
    #         sel_feats_df = pd.DataFrame()
    #         hit = False
    #         for feat in selected_features:
    #             try:
    #                 sel_feats_df[feat] = self.df[feat]
    #                 hit = True
    #             except KeyError:
    #                 pass
    #         if hit:
    #             self.df = sel_feats_df
    #             self.util.debug(
    #                 "new feats shape after selecting opensmile features:"
    #                 f" {self.df.shape}"
    #             )
    #     except KeyError:
    #         pass