nkululeko 0.65.2__py3-none-any.whl → 0.65.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nkululeko/constants.py CHANGED
@@ -1,2 +1,2 @@
1
- VERSION="0.65.2"
2
- SAMPLING_RATE=16000
1
+ VERSION="0.65.5"
2
+ SAMPLING_RATE=16000
nkululeko/data/dataset.py CHANGED
@@ -159,7 +159,7 @@ class Dataset:
159
159
  self.df["duration"] = (end - start).total_seconds()
160
160
 
161
161
  # Perform some filtering if desired
162
- required = eval(self.util.config_val_data(self.name, "required", "False"))
162
+ required = self.util.config_val_data(self.name, "required", False)
163
163
  if required:
164
164
  pre = self.df.shape[0]
165
165
  self.df = self.df[self.df[required].notna()]
@@ -204,7 +204,10 @@ class Dataset:
204
204
  pass
205
205
  try:
206
206
  # try to get the gender values
207
- df_local["gender"] = source_df["gender"]
207
+ if "gender" in source_df:
208
+ df_local["gender"] = source_df["gender"]
209
+ else:
210
+ df_local["gender"] = source_df["sex"]
208
211
  got_gender = True
209
212
  except (KeyError, ValueError, audformat.errors.BadKeyError) as e:
210
213
  pass
@@ -48,21 +48,23 @@ class Dataset_CSV(Dataset):
48
48
  self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
49
49
  if self.is_labeled and not "class_label" in self.df.columns:
50
50
  self.df["class_label"] = self.df[self.target]
51
- if "gender" in df.columns:
51
+ if "gender" in self.df.columns:
52
52
  self.got_gender = True
53
- if "age" in df.columns:
53
+ elif "sex" in self.df.columns:
54
+ self.df = self.df.rename(columns={'sex':'gender'})
55
+ self.got_gender = True
56
+ if "age" in self.df.columns:
54
57
  self.got_age = True
55
- if "speaker" in df.columns:
58
+ if "speaker" in self.df.columns:
56
59
  self.got_speaker = True
57
- ns = df["speaker"].nunique()
58
- self.util.debug(f"num of speakers: {ns}")
59
60
  speaker_num = 0
60
61
  if self.got_speaker:
61
62
  speaker_num = self.df.speaker.nunique()
62
63
  self.util.debug(
63
64
  f"Loaded database {self.name} with {df.shape[0]} "
64
65
  f"samples: got targets: {self.got_target}, got speakers: {self.got_speaker} ({speaker_num}), "
65
- f"got sexes: {self.got_gender}"
66
+ f"got sexes: {self.got_gender}, "
67
+ f"got age: {self.got_age}"
66
68
  )
67
69
 
68
70
  def prepare(self):
nkululeko/export.py CHANGED
@@ -10,7 +10,7 @@ import audiofile
10
10
  from nkululeko.experiment import Experiment
11
11
  from nkululeko.util import Util
12
12
  from nkululeko.constants import VERSION
13
-
13
+ import shutil
14
14
 
15
15
  def main(src_dir):
16
16
  parser = argparse.ArgumentParser(description="Call the nkululeko framework.")
@@ -54,6 +54,7 @@ def main(src_dir):
54
54
  target_root = util.config_val("EXPORT", "root", "./exported_data/")
55
55
  orig_root = util.config_val("EXPORT", "orig_root", None)
56
56
  data_name = util.config_val("EXPORT", "data_name", "export")
57
+ segments_as_files = eval(util.config_val("EXPORT", "segments_as_files", "False"))
57
58
  audeer.mkdir(target_root)
58
59
  splits = {"train": df_train, "test": df_test}
59
60
  df_all = pd.DataFrame()
@@ -61,25 +62,40 @@ def main(src_dir):
61
62
  files = []
62
63
  df = splits[split]
63
64
  for idx, (file, start, end) in enumerate(df.index.to_list()):
64
- signal, sampling_rate = audiofile.read(
65
- file,
66
- offset=start.total_seconds(),
67
- duration=(end - start).total_seconds(),
68
- always_2d=True,
69
- )
70
65
  file_dir = os.path.dirname(file)
71
- file_name = os.path.basename(file)
72
- wav_folder = (
73
- f"{target_root}/{os.path.basename(os.path.normpath(orig_root))}"
74
- )
75
- audeer.mkdir(wav_folder)
76
- new_rel_path = file_dir[file_dir.index(orig_root)+1+len(orig_root):]
77
- new_file_path = f"{wav_folder}/{new_rel_path}"
78
- audeer.mkdir(new_file_path)
79
- new_file_name = f"{new_file_path}/{file_name}"
80
- audiofile.write(new_file_name, signal, sampling_rate)
81
- new_file_name = os.path.relpath(new_file_name, target_root)
82
- files.append(new_file_name)
66
+ if segments_as_files:
67
+ signal, sampling_rate = audiofile.read(
68
+ file,
69
+ offset=start.total_seconds(),
70
+ duration=(end - start).total_seconds(),
71
+ always_2d=True,
72
+ )
73
+ file_name = os.path.basename(file)[:-3]+'_'+start.total_seconds()+'.wav'
74
+ wav_folder = (
75
+ f"{target_root}/{os.path.basename(os.path.normpath(orig_root))}"
76
+ )
77
+ audeer.mkdir(wav_folder)
78
+ new_rel_path = file_dir[file_dir.index(orig_root)+1+len(orig_root):]
79
+ new_file_path = f"{wav_folder}/{new_rel_path}"
80
+ audeer.mkdir(new_file_path)
81
+ new_file_name = f"{new_file_path}/{file_name}"
82
+ audiofile.write(new_file_name, signal, sampling_rate)
83
+ new_file_name = os.path.relpath(new_file_name, target_root)
84
+ files.append(new_file_name)
85
+ else:
86
+ file_name = os.path.basename(file)[:-4]+'_'+str(start.total_seconds())+'.wav'
87
+ wav_folder = (
88
+ f"{target_root}/{os.path.basename(os.path.normpath(orig_root))}"
89
+ )
90
+ audeer.mkdir(wav_folder)
91
+ new_rel_path = file_dir[file_dir.index(orig_root)+1+len(orig_root):]
92
+ new_file_path = f"{wav_folder}/{new_rel_path}"
93
+ audeer.mkdir(new_file_path)
94
+ new_file_name = f"{new_file_path}/{file_name}"
95
+ if not os.path.exists(new_file_name):
96
+ shutil.copyfile(file, new_file_name)
97
+ new_file_name = os.path.relpath(new_file_name, target_root)
98
+ files.append(new_file_name)
83
99
  df = df.set_index(df.index.set_levels(files, level="file"))
84
100
  df["split"] = split
85
101
  df_all = pd.concat([df_all, df])
@@ -54,7 +54,7 @@ class Wav2vec2(Featureset):
54
54
  signal, sampling_rate = torchaudio.load(file,
55
55
  frame_offset=int(start.total_seconds()*16000),
56
56
  num_frames=int((end - start).total_seconds()*16000))
57
- assert sampling_rate == 16000
57
+ assert sampling_rate == 16000, f"got {sampling_rate} instead of 16000"
58
58
  emb = self.get_embeddings(signal, sampling_rate, file)
59
59
  emb_series[idx] = emb
60
60
  if idx % 10 == 0:
nkululeko/plots.py CHANGED
@@ -57,6 +57,8 @@ class Plots():
57
57
  bin_reals = eval(self.util.config_val('EXPL', 'bin_reals', 'True'))
58
58
  for att in attributes:
59
59
  if len(att) == 1:
60
+ if att[0] not in df:
61
+ self.util.error(f'unknown feature: {att[0]}')
60
62
  self.util.debug(f'plotting {att[0]}')
61
63
  filename = f'{self.target}-{att[0]}'
62
64
  if self.util.is_categorical(df[att[0]]):
@@ -90,6 +92,10 @@ class Plots():
90
92
  plt.close(fig)
91
93
  # fig.clear() # avoid error
92
94
  elif len(att) == 2:
95
+ if att[0] not in df:
96
+ self.util.error(f'unknown feature: {att[0]}')
97
+ if att[1] not in df:
98
+ self.util.error(f'unknown feature: {att[1]}')
93
99
  self.util.debug(f'plotting {att}')
94
100
  att1 = att[0]
95
101
  att2 = att[1]
nkululeko/utils/stats.py CHANGED
@@ -3,6 +3,14 @@ import math
3
3
  import numpy as np
4
4
  import pandas as pd
5
5
 
6
+ def check_na(a):
7
+ if np.isnan(a).any():
8
+ count = np.count_nonzero(np.isnan(a))
9
+ print(f'WARNING: got {count} Nans (of {len(a)}), setting to 0')
10
+ a[np.isnan(a)] = 0
11
+ return a
12
+ else:
13
+ return a
6
14
 
7
15
  def cohen_d(d1, d2):
8
16
  """
@@ -13,6 +21,9 @@ def cohen_d(d1, d2):
13
21
  Returns:
14
22
  Cohen's d with precision 3
15
23
  """
24
+ # Checks:
25
+ d1 = check_na(d1)
26
+ d2 = check_na(d2)
16
27
  # calculate the size of samples
17
28
  n1, n2 = len(d1), len(d2)
18
29
  # calculate the variance of the samples
@@ -22,8 +33,8 @@ def cohen_d(d1, d2):
22
33
  # calculate the means of the samples
23
34
  u1, u2 = np.mean(d1), np.mean(d2)
24
35
  # calculate the effect size
25
- if math.isnan(s):
26
- return 0
36
+ if math.isnan(s) or s == 0:
37
+ return -1
27
38
  return (int(1000 * np.abs((u1 - u2)) / s)) / 1000
28
39
 
29
40
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nkululeko
3
- Version: 0.65.2
3
+ Version: 0.65.5
4
4
  Summary: Machine learning audio prediction experiments based on templates
5
5
  Home-page: https://github.com/felixbur/nkululeko
6
6
  Author: Felix Burkhardt
@@ -255,6 +255,19 @@ Nkululeko can be used under the [MIT license](https://choosealicense.com/license
255
255
  Changelog
256
256
  =========
257
257
 
258
+ Version 0.65.5
259
+ --------------
260
+ * added fill_na in plot effect size
261
+
262
+ Version 0.65.4
263
+ --------------
264
+ * added datasets to distribution
265
+ * changes in wav2vec2
266
+
267
+ Version 0.65.3
268
+ --------------
269
+ * various bugfixes
270
+
258
271
  Version 0.65.2
259
272
  --------------
260
273
  * fixed bug in dataset.csv that prevented correct paths for relative files
@@ -2,19 +2,19 @@ nkululeko/__init__.py,sha256=62f8HiEzJ8rG2QlTFJXUCMpvuH3fKI33DoJSj33mscc,63
2
2
  nkululeko/augment.py,sha256=paq-vEf02XyaPsLjnCbDJbuZind6M6mm0NWAnK5_PKU,1751
3
3
  nkululeko/balancer.py,sha256=64ftZN68sMDfkvuovCDHpAHmSJgCO6Kdk9bwmpSisec,12
4
4
  nkululeko/cacheddataset.py,sha256=lIJ6hUo5LoxSrzXtWV8mzwO7wRtUETWnOQ4ws2XfL1E,969
5
- nkululeko/constants.py,sha256=Nf4gAGehioLVSeUIbSCUwTBi57qq1XZFekfbMmt65n4,37
5
+ nkululeko/constants.py,sha256=KGYcdgRK9DOOWiYr-901PUaOgmccfRuUhfuYAEyFvZM,36
6
6
  nkululeko/demo.py,sha256=1JFayJBUqOwj8fDtNF6siuB_5aG6dkb_TeIFPhYv1YI,1858
7
7
  nkululeko/demo_predictor.py,sha256=f3tghu3KxrLFPSrMMqT3E_Owy8alAuHEbkoZuahikEE,2310
8
8
  nkululeko/experiment.py,sha256=fV-TdDPlFhS8DdALuzme0n9-3HnGwi_GxiTak3u_ZAY,24779
9
9
  nkululeko/explore.py,sha256=Ok4Cr1llMQhsP4zCJYWfr5I5HNVuWocGcDnE5QfE9a0,2093
10
- nkululeko/export.py,sha256=NXSjTLfeUk_T6h9zcIFc63scvSc2vvXDmp1EEUpcyGY,3509
10
+ nkululeko/export.py,sha256=p3EalSrFIJCcZVv_630gY-7QW_WDInkrQzsMpYCL5Fs,4525
11
11
  nkululeko/feature_extractor.py,sha256=YRzFJjsE9omCFfr8DNsHj0uiKPd3_NCjHrWpRGmuhmA,6474
12
12
  nkululeko/file_checker.py,sha256=Jo3TN6sc_XJNF8x1UYHqfPyaMTues0iuj6_Sal4x0so,3428
13
13
  nkululeko/filter_data.py,sha256=LD46OLYfA6UuLvFgaA1LvCVSqmi6JXN2xd_fTZdr-ag,7029
14
14
  nkululeko/glob_conf.py,sha256=KOsmB2_9AVoKNyhtCkaamj-ZYCme6-NjTYF-4wlLgOY,241
15
15
  nkululeko/modelrunner.py,sha256=NYdV4z9TKhtC9LGhOxXa7aBC2AxRjnFJ42Ah01rsbqg,6025
16
16
  nkululeko/nkululeko.py,sha256=SVOY3CPvlmG-16kqV8YOvR2HYCgHkaiVo3GBiwu38W0,1681
17
- nkululeko/plots.py,sha256=p2J9dOHJxbo0_SSh_TtKBLxD5e96DpQ_TmUwn8GDTHo,12714
17
+ nkululeko/plots.py,sha256=wwUvHIlpP6wYlukYNWtgqhxh6avqo_okslNgoT4S6u0,13023
18
18
  nkululeko/predict.py,sha256=3ei4wn2by0p9Vkv7cllMcszmEjSM2vX0T6x_5rlgT28,1851
19
19
  nkululeko/reporter.py,sha256=359aeQWt0ZGLseaJnOfafYG8BrwumiM2Q58DWiaoyWQ,10177
20
20
  nkululeko/resample.py,sha256=NDZLIhmGPcxSZwzIZul9oeNfbaHfAmzjTvRWs6isIr4,2120
@@ -44,8 +44,8 @@ nkululeko/autopredict/ap_stoi.py,sha256=sx2KSGSjCsjgUvZPFlKALhbZlVg2aTdGmvil-iNS
44
44
  nkululeko/autopredict/ap_valence.py,sha256=cyFrKRy68EU5gDiVg0_HxiwaKGbAJ9UuwuiXgVTalAU,997
45
45
  nkululeko/autopredict/estimate_snr.py,sha256=m3JZSRGXr2M9yVgovTSZ63rpho9hIUfHOXfAVRAHf1k,4186
46
46
  nkululeko/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
- nkululeko/data/dataset.py,sha256=2brz4yKjUogRy2z7S-VkJpb1G9iG6OtfdiO1yyVEiaw,21127
48
- nkululeko/data/dataset_csv.py,sha256=RElfRxKrI1IqdPp5pvq2lmL8NIzBsyLEaWEzqjz_zBg,2708
47
+ nkululeko/data/dataset.py,sha256=3h3gL_zxWjkYdujETxJRJhqbFPNlEbG6iaPyr8CQ-oc,21245
48
+ nkululeko/data/dataset_csv.py,sha256=YUAeTqCK-aaoRMTzYpMRB00J5E5P8BRH7Jve4RfEzvU,2818
49
49
  nkululeko/feat_extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
50
  nkululeko/feat_extract/feats_agender.py,sha256=ZlfYbpL65SOTLsoqkcNsREa-EyHpigNf8LlFErQH9Ck,2818
51
51
  nkululeko/feat_extract/feats_agender_agender.py,sha256=zMY3GucOOx-aBav_9bUSdU63rc2VlFxVZHuVqUKQ6Dk,2904
@@ -63,7 +63,7 @@ nkululeko/feat_extract/feats_praat.py,sha256=VU9hwL0_1XJQW32CYaW0bqTbl9MryQ1hME9
63
63
  nkululeko/feat_extract/feats_snr.py,sha256=UHCNgdZj8g69rj-EyujC1gX-BgYkuIoIm63ZR7Xb78w,2790
64
64
  nkululeko/feat_extract/feats_squim.py,sha256=8AUNfjWf2MHCDIH0by543fwTVFSybBTYWdQpIlZIMi0,4359
65
65
  nkululeko/feat_extract/feats_trill.py,sha256=8BI_wL7YIwaKWTZ8DnLC5RUZ5HVXdLzLXs3QgNexXV8,2958
66
- nkululeko/feat_extract/feats_wav2vec2.py,sha256=n7eLvt8Zv_GF-w5CkMkts6NU-kh4bEOb-aDgnd2TfvQ,4421
66
+ nkululeko/feat_extract/feats_wav2vec2.py,sha256=48_WK_JmUq0Gda800fJh3RadTxkHeCpOSTBsPIykEjw,4462
67
67
  nkululeko/feat_extract/feats_wavlm.py,sha256=06vRILRLG2-OzsLx-tuwOdtMhNtoPSxJwAE5B82upjk,4466
68
68
  nkululeko/feat_extract/featureset.py,sha256=7l1qC_nVGyyuYOdKJoS5hY673Fkm3Wpqtdu5mxtBP0E,1369
69
69
  nkululeko/feat_extract/feinberg_praat.py,sha256=fcp2863g5NLk7p27HlRISpTOVTaoxcrSGOST4mK5VXk,20496
@@ -90,9 +90,9 @@ nkululeko/segmenting/seg_inaspeechsegmenter.py,sha256=pmLHuXsaqvcdYxB4PSW9l1mbQW
90
90
  nkululeko/segmenting/seg_silero.py,sha256=bbLQxUste1lKEwNRHG4wOTCaFMevNt8TOe2DfmLtu5w,3306
91
91
  nkululeko/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
92
  nkululeko/utils/files.py,sha256=82EsGvTN85lDdSzetm1ZHTVOUm0AcfQhTeurwsfHN_Q,3547
93
- nkululeko/utils/stats.py,sha256=EM2qN1HBh34wZ5FlK3fOyRhBig4W56_DPYY9wudx5aw,1939
94
- nkululeko-0.65.2.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
95
- nkululeko-0.65.2.dist-info/METADATA,sha256=4jY2iz2vnyDnJaiS7VE077x3VwKwRBv24dbdvDF8744,23255
96
- nkululeko-0.65.2.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
97
- nkululeko-0.65.2.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
98
- nkululeko-0.65.2.dist-info/RECORD,,
93
+ nkululeko/utils/stats.py,sha256=nRuZEbHpeRTAcvW8b12ZWi_kDnpC5Ztrmvy0_J621cE,2241
94
+ nkululeko-0.65.5.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
95
+ nkululeko-0.65.5.dist-info/METADATA,sha256=y2ttA1LEwXgflc2MLb1krzgq3zazj780wZ3fIWR8fAg,23458
96
+ nkululeko-0.65.5.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
97
+ nkululeko-0.65.5.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
98
+ nkululeko-0.65.5.dist-info/RECORD,,