nkululeko 0.93.10__py3-none-any.whl → 0.93.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nkululeko/constants.py CHANGED
@@ -1,2 +1,2 @@
1
- VERSION="0.93.10"
1
+ VERSION="0.93.12"
2
2
  SAMPLING_RATE = 16000
nkululeko/data/dataset.py CHANGED
@@ -34,9 +34,11 @@ class Dataset:
34
34
  self.plot = Plots()
35
35
  self.limit = int(self.util.config_val_data(self.name, "limit", 0))
36
36
  self.target_tables_append = eval(
37
- self.util.config_val_data(self.name, "target_tables_append", "False")
37
+ self.util.config_val_data(
38
+ self.name, "target_tables_append", "False")
38
39
  )
39
- self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
40
+ self.start_fresh = eval(
41
+ self.util.config_val("DATA", "no_reuse", "False"))
40
42
  self.is_labeled, self.got_speaker, self.got_gender, self.got_age = (
41
43
  False,
42
44
  False,
@@ -70,7 +72,8 @@ class Dataset:
70
72
  try:
71
73
  self.db = audformat.Database.load(root)
72
74
  except FileNotFoundError:
73
- self.util.error(f"{self.name}: no audformat database found at {root}")
75
+ self.util.error(
76
+ f"{self.name}: no audformat database found at {root}")
74
77
  return root
75
78
 
76
79
  def _check_cols(self, df):
@@ -92,7 +95,8 @@ class Dataset:
92
95
  )
93
96
  self.util.debug(r_string)
94
97
  if glob_conf.report.initial:
95
- glob_conf.report.add_item(ReportItem("Data", "Load report", r_string))
98
+ glob_conf.report.add_item(ReportItem(
99
+ "Data", "Load report", r_string))
96
100
  glob_conf.report.initial = False
97
101
 
98
102
  def load(self):
@@ -103,7 +107,8 @@ class Dataset:
103
107
  store_file = f"{store}{self.name}.{store_format}"
104
108
  self.root = self._load_db()
105
109
  if not self.start_fresh and os.path.isfile(store_file):
106
- self.util.debug(f"{self.name}: reusing previously stored file {store_file}")
110
+ self.util.debug(
111
+ f"{self.name}: reusing previously stored file {store_file}")
107
112
  self.df = self.util.get_store(store_file, store_format)
108
113
  self.is_labeled = self.target in self.df
109
114
  self.got_gender = "gender" in self.df
@@ -118,10 +123,12 @@ class Dataset:
118
123
  # map the audio file paths
119
124
  self.db.map_files(lambda x: os.path.join(self.root, x))
120
125
  # the dataframes (potentially more than one) with at least the file names
121
- df_files = self.util.config_val_data(self.name, "files_tables", "['files']")
126
+ df_files = self.util.config_val_data(
127
+ self.name, "files_tables", "['files']")
122
128
  df_files_tables = ast.literal_eval(df_files)
123
129
  # The label for the target column
124
- self.col_label = self.util.config_val_data(self.name, "label", self.target)
130
+ self.col_label = self.util.config_val_data(
131
+ self.name, "label", self.target)
125
132
  (
126
133
  df,
127
134
  self.is_labeled,
@@ -157,7 +164,8 @@ class Dataset:
157
164
  self.got_age = got_age2 or self.got_age
158
165
  if audformat.is_filewise_index(df_target.index):
159
166
  try:
160
- df_target = df_target.loc[df.index.get_level_values("file")]
167
+ df_target = df_target.loc[df.index.get_level_values(
168
+ "file")]
161
169
  df_target = df_target.set_index(df.index)
162
170
  except KeyError:
163
171
  # just a try...
@@ -206,7 +214,8 @@ class Dataset:
206
214
  end = self.df.index.get_level_values(2)
207
215
  self.df["duration"] = (end - start).total_seconds()
208
216
  elif self.df.duration.dtype == "timedelta64[ns]":
209
- self.df["duration"] = self.df["duration"].map(lambda x: x.total_seconds())
217
+ self.df["duration"] = self.df["duration"].map(
218
+ lambda x: x.total_seconds())
210
219
  # Perform some filtering if desired
211
220
  required = self.util.config_val_data(self.name, "required", False)
212
221
  if required:
@@ -236,15 +245,18 @@ class Dataset:
236
245
  res.append(abs(n - max))
237
246
  return res
238
247
 
239
- reverse = eval(self.util.config_val_data(self.name, "reverse", "False"))
248
+ reverse = eval(self.util.config_val_data(
249
+ self.name, "reverse", "False"))
240
250
  if reverse:
241
- max = eval(self.util.config_val_data(self.name, "reverse.max", "False"))
251
+ max = eval(self.util.config_val_data(
252
+ self.name, "reverse.max", "False"))
242
253
  if max:
243
254
  max = float(max)
244
255
  else:
245
256
  max = self.df[self.target].values.max()
246
257
  self.util.debug(f"reversing target numbers with max values: {max}")
247
- self.df[self.target] = reverse_array(self.df[self.target].values, max)
258
+ self.df[self.target] = reverse_array(
259
+ self.df[self.target].values, max)
248
260
 
249
261
  # check if the target variable should be scaled (z-transformed)
250
262
  scale = self.util.config_val_data(self.name, "scale", False)
@@ -317,13 +329,15 @@ class Dataset:
317
329
  pass
318
330
  try:
319
331
  # also it might be possible that the age is part of the speaker description
320
- df_local["age"] = db[table]["speaker"].get(map="age").astype(int)
332
+ df_local["age"] = db[table]["speaker"].get(
333
+ map="age").astype(int)
321
334
  got_age = True
322
335
  except (ValueError, audformat.errors.BadKeyError):
323
336
  pass
324
337
  try:
325
338
  # same for the target, e.g. "age"
326
- df_local[self.target] = db[table]["speaker"].get(map=self.target)
339
+ df_local[self.target] = db[table]["speaker"].get(
340
+ map=self.target)
327
341
  is_labeled = True
328
342
  except (ValueError, audformat.core.errors.BadKeyError):
329
343
  pass
@@ -384,8 +398,10 @@ class Dataset:
384
398
  testdf = self.db.tables[self.target + ".test"].df
385
399
  traindf = self.db.tables[self.target + ".train"].df
386
400
  # use only the train and test samples that were not perhaps filtered out by an earlier processing step
387
- self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
388
- self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
401
+ self.df_test = self.df.loc[self.df.index.intersection(
402
+ testdf.index)]
403
+ self.df_train = self.df.loc[self.df.index.intersection(
404
+ traindf.index)]
389
405
  elif split_strategy == "train":
390
406
  self.df_train = self.df
391
407
  self.df_test = pd.DataFrame()
@@ -408,18 +424,23 @@ class Dataset:
408
424
  if entry_train_tables:
409
425
  train_tables = ast.literal_eval(entry_train_tables)
410
426
  for train_table in train_tables:
411
- traindf = pd.concat([traindf, self.db.tables[train_table].df])
427
+ traindf = pd.concat(
428
+ [traindf, self.db.tables[train_table].df])
412
429
  # use only the train and test samples that were not perhaps filtered out by an earlier processing step
413
430
  # testdf.index.map(lambda x: os.path.join(self.root, x))
414
431
  # testdf.index = testdf.index.to_series().apply(lambda x: self.root+x)
415
432
  testdf = testdf.set_index(
416
- audformat.utils.to_segmented_index(testdf.index, allow_nat=False)
433
+ audformat.utils.to_segmented_index(
434
+ testdf.index, allow_nat=False)
417
435
  )
418
436
  traindf = traindf.set_index(
419
- audformat.utils.to_segmented_index(traindf.index, allow_nat=False)
437
+ audformat.utils.to_segmented_index(
438
+ traindf.index, allow_nat=False)
420
439
  )
421
- self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
422
- self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
440
+ self.df_test = self.df.loc[self.df.index.intersection(
441
+ testdf.index)]
442
+ self.df_train = self.df.loc[self.df.index.intersection(
443
+ traindf.index)]
423
444
  # it might be necessary to copy the target values
424
445
  try:
425
446
  self.df_test[self.target] = testdf[self.target]
@@ -446,10 +467,12 @@ class Dataset:
446
467
  self.util.error(f"unknown split strategy: {split_strategy}")
447
468
 
448
469
  # check if train or test set should be ignored
449
- as_test = eval(self.util.config_val_data(self.name, "as_test", "False"))
470
+ as_test = eval(self.util.config_val_data(
471
+ self.name, "as_test", "False"))
450
472
  if as_test:
451
473
  self.df_train = pd.DataFrame()
452
- as_train = eval(self.util.config_val_data(self.name, "as_train", "False"))
474
+ as_train = eval(self.util.config_val_data(
475
+ self.name, "as_train", "False"))
453
476
  if as_train:
454
477
  self.df_test = pd.DataFrame()
455
478
 
@@ -480,7 +503,8 @@ class Dataset:
480
503
 
481
504
  seed = 42
482
505
  k = 30
483
- test_size = int(self.util.config_val_data(self.name, "test_size", 20)) / 100.0
506
+ test_size = int(self.util.config_val_data(
507
+ self.name, "test_size", 20)) / 100.0
484
508
  df = self.df
485
509
  # split target
486
510
  targets = df[self.target].to_numpy()
@@ -496,7 +520,8 @@ class Dataset:
496
520
  stratif_vars = self.util.config_val("DATA", "balance", False)
497
521
  stratif_vars_array = {}
498
522
  if not stratif_vars:
499
- self.util.error("balanced split needs stratif_vars to stratify the splits")
523
+ self.util.error(
524
+ "balanced split needs stratif_vars to stratify the splits")
500
525
  else:
501
526
  stratif_vars = ast.literal_eval(stratif_vars)
502
527
  for stratif_var in stratif_vars.keys():
@@ -505,7 +530,8 @@ class Dataset:
505
530
  continue
506
531
  else:
507
532
  data = df[stratif_var].to_numpy()
508
- bins = self.util.config_val("DATA", f"{stratif_var}_bins", False)
533
+ bins = self.util.config_val(
534
+ "DATA", f"{stratif_var}_bins", False)
509
535
  if bins:
510
536
  data = binning(data, nbins=int(bins))
511
537
  stratif_vars_array[stratif_var] = data
@@ -556,7 +582,8 @@ class Dataset:
556
582
 
557
583
  def split_speakers(self):
558
584
  """One way to split train and eval sets: Specify percentage of evaluation speakers"""
559
- test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
585
+ test_percent = int(self.util.config_val_data(
586
+ self.name, "test_size", 20))
560
587
  df = self.df
561
588
  s_num = df.speaker.nunique()
562
589
  test_num = int(s_num * (test_percent / 100))
@@ -575,7 +602,8 @@ class Dataset:
575
602
 
576
603
  def random_split(self):
577
604
  """One way to split train and eval sets: Specify percentage of random samples"""
578
- test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
605
+ test_percent = int(self.util.config_val_data(
606
+ self.name, "test_size", 20))
579
607
  df = self.df
580
608
  s_num = len(df)
581
609
  test_num = int(s_num * (test_percent / 100))
@@ -676,8 +704,11 @@ class Dataset:
676
704
 
677
705
  def map_continuous_classification(self, df):
678
706
  """Map labels to bins for continuous data that should be classified"""
707
+ if df.empty:
708
+ return
679
709
  if self.check_continuous_classification():
680
- self.util.debug(f"{self.name}: binning continuous variable to categories")
710
+ self.util.debug(
711
+ f"{self.name}: binning continuous variable to categories")
681
712
  cat_vals = self.util.continuous_to_categorical(df[self.target])
682
713
  df[self.target] = cat_vals.values
683
714
  labels = ast.literal_eval(glob_conf.config["DATA"]["labels"])
nkululeko/plots.py CHANGED
@@ -628,8 +628,7 @@ class Plots:
628
628
  # one up because of the runs
629
629
  fig_dir = self.util.get_path("fig_dir") + "../"
630
630
  exp_name = self.util.get_exp_name(only_data=True)
631
- format = self.util.config_val("PLOT", "format", "png")
632
- filename = f"{fig_dir}{exp_name}EXPL_tree-plot.{format}"
631
+ filename = f"{fig_dir}{exp_name}EXPL_tree-plot.{self.format}"
633
632
  fig = ax.figure
634
633
  fig.savefig(filename)
635
634
  fig.clear()
nkululeko/segment.py CHANGED
@@ -62,6 +62,11 @@ def main():
62
62
  expr.fill_train_and_tests()
63
63
  util.debug(f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")
64
64
 
65
+ def calc_dur(x):
66
+ starts = x[1]
67
+ ends = x[2]
68
+ return (ends - starts).total_seconds()
69
+
65
70
  # segment
66
71
  segmented_file = util.config_val("SEGMENT", "result", "segmented.csv")
67
72
 
@@ -104,16 +109,11 @@ def main():
104
109
  df_seg = df_seg.drop(columns=[target])
105
110
  df_seg = df_seg.rename(columns={"class_label": target})
106
111
  # save file
112
+ df_seg["duration"] = df_seg.index.to_series().map(lambda x: calc_dur(x))
107
113
  df_seg.to_csv(f"{expr.data_dir}/{segmented_file}")
108
114
 
109
- def calc_dur(x):
110
- starts = x[1]
111
- ends = x[2]
112
- return (ends - starts).total_seconds()
113
-
114
115
  if "duration" not in df.columns:
115
116
  df["duration"] = df.index.to_series().map(lambda x: calc_dur(x))
116
- df_seg["duration"] = df_seg.index.to_series().map(lambda x: calc_dur(x))
117
117
  num_before = df.shape[0]
118
118
  num_after = df_seg.shape[0]
119
119
  util.debug(
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: nkululeko
3
- Version: 0.93.10
3
+ Version: 0.93.12
4
4
  Summary: Machine learning audio prediction experiments based on templates
5
5
  Home-page: https://github.com/felixbur/nkululeko
6
6
  Author: Felix Burkhardt
@@ -63,7 +63,7 @@ Requires-Dist: pylatex
63
63
  ## Overview
64
64
  A project to detect speaker characteristics by machine learning experiments with a high-level interface.
65
65
 
66
- The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
66
+ The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
67
67
 
68
68
  * NEW with nkululeko: [Ensemble learning](http://blog.syntheticspeech.de/2024/06/25/nkululeko-ensemble-classifiers-with-late-fusion/)
69
69
  * NEW: [Finetune transformer-models](http://blog.syntheticspeech.de/2024/05/29/nkululeko-how-to-finetune-a-transformer-model/)
@@ -353,8 +353,16 @@ If you use it, please mention the Nkululeko paper:
353
353
  Changelog
354
354
  =========
355
355
 
356
+ Version 0.93.12
357
+ ---------------
358
+ * bugfix: map_continuous_to_cat crashed on empty data
359
+
360
+ Version 0.93.11
361
+ ---------------
362
+ * bugfix: silero segmenter assigned file duration values
363
+
356
364
  Version 0.93.10
357
- --------------
365
+ ---------------
358
366
  * added nan check for imported features
359
367
  * added LOGO result output
360
368
 
@@ -2,7 +2,7 @@ nkululeko/__init__.py,sha256=62f8HiEzJ8rG2QlTFJXUCMpvuH3fKI33DoJSj33mscc,63
2
2
  nkululeko/aug_train.py,sha256=FoMbBrfyOZd4QAw7oIHl3X6-UpsqAKWVDIolCA7qOWs,3196
3
3
  nkululeko/augment.py,sha256=3RzaxB3gRxovgJVjHXi0glprW01J7RaHhUkqotW2T3U,2955
4
4
  nkululeko/cacheddataset.py,sha256=XFpWZmbJRg0pvhnIgYf0TkclxllD-Fctu-Ol0PF_00c,969
5
- nkululeko/constants.py,sha256=0grSx0I2K13N--2KpgQU90VQf94GJLPJXHFLbHVPDjI,40
5
+ nkululeko/constants.py,sha256=T9YZzqdg_ltKpuIf2XZdrqFmmlZQmhak97DpM5GQQhI,40
6
6
  nkululeko/demo-ft.py,sha256=iD9Pzp9QjyAv31q1cDZ75vPez7Ve8A4Cfukv5yfZdrQ,770
7
7
  nkululeko/demo.py,sha256=4Yzhg6pCPBYPGJrP7JX2TysVosl_R1llpVDKc2P_gUA,4955
8
8
  nkululeko/demo_feats.py,sha256=BvZjeNFTlERIRlq34OHM4Z96jdDQAhB01BGQAUcX9dM,2026
@@ -20,12 +20,12 @@ nkululeko/modelrunner.py,sha256=lJy-xM4QfDDWeL0dLTE_VIb4sYrnd_Z_yJRK3wwohQA,1119
20
20
  nkululeko/multidb.py,sha256=sO6OwJn8sn1-C-ig3thsIL8QMWHdV9SnJhDodKjeKrI,6876
21
21
  nkululeko/nkuluflag.py,sha256=PGWSmZz-PiiHLgcZJAoGOI_Y-sZDVI1ksB8p5r7riWM,3725
22
22
  nkululeko/nkululeko.py,sha256=M7baIq2nAoi6dEoBL4ATEuqAs5U1fvl_hyqAl5DybAQ,2040
23
- nkululeko/plots.py,sha256=zHWZ8Ns_0SLOEdbDVulObpRPoXRw_qqPXJv2dM08EeE,26049
23
+ nkululeko/plots.py,sha256=Mm30pDLBb55iE9SYaSg76KFBKnebZTlypFQIBo26wuY,25991
24
24
  nkululeko/predict.py,sha256=MLnHEyFmSiHLLs-HDczag8Vu3zKF5T1rXLKdZZJ6py8,2083
25
25
  nkululeko/resample.py,sha256=rn3-M1A-iwVGibfQNGyeYNa7briD24lIN9Szq_1uTJo,5194
26
26
  nkululeko/runmanager.py,sha256=AswmORVUkCIH0gTx6zEyufvFATQBS8C5TXo2erSNdVg,7611
27
27
  nkululeko/scaler.py,sha256=7VOZ4sREMoQtahfETt9RyuR29Fb7PCwxlYVjBbdCVFc,4101
28
- nkululeko/segment.py,sha256=DRjC6b7SeInYgwBcDPXpTXPvXPS-J8kFQO7H095bK80,4945
28
+ nkululeko/segment.py,sha256=7UrJEwdLmh9wDL5iBwpdJyJm9dwSxidHrHt-_D2qtxw,4949
29
29
  nkululeko/syllable_nuclei.py,sha256=5w_naKxNxz66a_qLkraemi2fggM-gWesiiBPS47iFcE,9931
30
30
  nkululeko/test.py,sha256=1w624vo5KTzmFC8BUStGlLDmIEAFuJUz7J0W-gp7AxI,1677
31
31
  nkululeko/test_predictor.py,sha256=DEHE_D3A6m6KJTrpDKceA1n655t_UZV3WQd57K4a3Ho,2863
@@ -49,7 +49,7 @@ nkululeko/autopredict/ap_stoi.py,sha256=UEQg1ZV0meAsxgdWB8ieRs9GPXHqArmsaOyCGRwp
49
49
  nkululeko/autopredict/ap_valence.py,sha256=WrW4Ltqi_odW49_4QEVKkfnrcztLIVZ4cXIEHu4dBN8,1026
50
50
  nkululeko/autopredict/estimate_snr.py,sha256=1k9-XadABudnsNOeFZD_Fg0E64-GUQVS7JEp82MLQS4,4995
51
51
  nkululeko/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
- nkululeko/data/dataset.py,sha256=G4jzD2MvzB7d6Oja_pUIdShFr7Qsbs0ogGzuTcyQfLo,30041
52
+ nkululeko/data/dataset.py,sha256=H65rvQ8sPwEwv_T-FqOEa7FeQ3JBn88v3xOzBOjARe4,30582
53
53
  nkululeko/data/dataset_csv.py,sha256=p2b4eS5R2Q5zdOIc56NRRU2PTFXSRt0qrdHGafHkWKo,4830
54
54
  nkululeko/feat_extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
55
  nkululeko/feat_extract/feats_agender.py,sha256=onfAQ6-xx_mFMJXEF1IX8cHBmGtGeX6weJmxbkfh1_o,3184
@@ -112,9 +112,9 @@ nkululeko/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
112
  nkululeko/utils/files.py,sha256=SrrYaU7AB80MZHiV1jcB0h_zigvYLYgSVNTXV4ao38g,4593
113
113
  nkululeko/utils/stats.py,sha256=vCRzhCR0Gx5SiJyAGbj1TIto8ocGz58CM5Pr3LltagA,2948
114
114
  nkululeko/utils/util.py,sha256=wFDslqxpCVDwi6LBakIFDDy1kYsxt5G7ykE38CocmtA,16880
115
- nkululeko-0.93.10.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
116
- nkululeko-0.93.10.dist-info/METADATA,sha256=vO975smqhzWtD-pm5SqiEqJV6RBxPy4UTHxGz4d6Ta4,42644
117
- nkululeko-0.93.10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
118
- nkululeko-0.93.10.dist-info/entry_points.txt,sha256=lNTkFEdh6Kjo5o95ZAWf_0Lq-4ztGoAoMVSDuPtuyS0,442
119
- nkululeko-0.93.10.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
120
- nkululeko-0.93.10.dist-info/RECORD,,
115
+ nkululeko-0.93.12.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
116
+ nkululeko-0.93.12.dist-info/METADATA,sha256=jjO-vG0POWF3v_v3QfliA4uT8jzZPikwmEj2O6v6rhg,42823
117
+ nkululeko-0.93.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
118
+ nkululeko-0.93.12.dist-info/entry_points.txt,sha256=lNTkFEdh6Kjo5o95ZAWf_0Lq-4ztGoAoMVSDuPtuyS0,442
119
+ nkululeko-0.93.12.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
120
+ nkululeko-0.93.12.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5