nkululeko 0.77.0__py3-none-any.whl → 0.77.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. nkululeko/augment.py +1 -1
  2. nkululeko/augmenting/augmenter.py +1 -1
  3. nkululeko/augmenting/randomsplicer.py +1 -1
  4. nkululeko/augmenting/resampler.py +4 -9
  5. nkululeko/autopredict/ap_age.py +2 -4
  6. nkululeko/autopredict/ap_arousal.py +2 -4
  7. nkululeko/autopredict/ap_dominance.py +2 -4
  8. nkululeko/autopredict/ap_gender.py +2 -4
  9. nkululeko/autopredict/ap_mos.py +2 -4
  10. nkululeko/autopredict/ap_pesq.py +2 -4
  11. nkululeko/autopredict/ap_sdr.py +2 -4
  12. nkululeko/autopredict/ap_snr.py +2 -4
  13. nkululeko/autopredict/ap_stoi.py +2 -4
  14. nkululeko/autopredict/ap_valence.py +2 -4
  15. nkululeko/constants.py +1 -1
  16. nkululeko/data/dataset.py +1 -1
  17. nkululeko/demo.py +4 -10
  18. nkululeko/demo_predictor.py +1 -1
  19. nkululeko/experiment.py +1 -1
  20. nkululeko/explore.py +6 -13
  21. nkululeko/export.py +14 -25
  22. nkululeko/feat_extract/feats_analyser.py +110 -18
  23. nkululeko/feat_extract/feats_clap.py +4 -10
  24. nkululeko/feat_extract/feats_import.py +2 -4
  25. nkululeko/feat_extract/feats_mld.py +4 -9
  26. nkululeko/feat_extract/feats_mos.py +5 -13
  27. nkululeko/feat_extract/feats_oxbow.py +5 -12
  28. nkululeko/feat_extract/feats_snr.py +3 -7
  29. nkululeko/feat_extract/feats_squim.py +5 -13
  30. nkululeko/feat_extract/feats_trill.py +5 -13
  31. nkululeko/feat_extract/featureset.py +2 -4
  32. nkululeko/feat_extract/feinberg_praat.py +1 -1
  33. nkululeko/feature_extractor.py +1 -1
  34. nkululeko/file_checker.py +5 -5
  35. nkululeko/filter_data.py +6 -16
  36. nkululeko/modelrunner.py +1 -1
  37. nkululeko/models/model.py +1 -1
  38. nkululeko/models/model_cnn.py +1 -1
  39. nkululeko/models/model_mlp.py +1 -1
  40. nkululeko/models/model_mlp_regression.py +1 -1
  41. nkululeko/nkululeko.py +5 -13
  42. nkululeko/plots.py +8 -4
  43. nkululeko/predict.py +5 -13
  44. nkululeko/reporter.py +1 -1
  45. nkululeko/reporting/latex_writer.py +7 -2
  46. nkululeko/reporting/report.py +2 -1
  47. nkululeko/resample.py +5 -13
  48. nkululeko/runmanager.py +1 -1
  49. nkululeko/scaler.py +1 -1
  50. nkululeko/segment.py +1 -1
  51. nkululeko/segmenting/seg_silero.py +3 -5
  52. nkululeko/test.py +4 -10
  53. nkululeko/test_predictor.py +1 -1
  54. nkululeko/utils/stats.py +8 -0
  55. {nkululeko-0.77.0.dist-info → nkululeko-0.77.1.dist-info}/METADATA +6 -1
  56. nkululeko-0.77.1.dist-info/RECORD +104 -0
  57. nkululeko/balancer.py +0 -1
  58. nkululeko-0.77.0.dist-info/RECORD +0 -105
  59. /nkululeko/{util.py → utils/util.py} +0 -0
  60. {nkululeko-0.77.0.dist-info → nkululeko-0.77.1.dist-info}/LICENSE +0 -0
  61. {nkululeko-0.77.0.dist-info → nkululeko-0.77.1.dist-info}/WHEEL +0 -0
  62. {nkululeko-0.77.0.dist-info → nkululeko-0.77.1.dist-info}/top_level.txt +0 -0
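Most of the one-line edits below stem from entry 59, the move of nkululeko/util.py to nkululeko/utils/util.py. A minimal sketch of the corresponding import change for any code that imported Util from the old location (the class name Util is taken from the diff itself):

    # before this release:
    # from nkululeko.util import Util
    # from 0.77.1 on:
    from nkululeko.utils.util import Util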
nkululeko/feat_extract/feats_analyser.py CHANGED
@@ -1,13 +1,15 @@
  # feats_analyser.py
  import ast
  import pandas as pd
+ from sklearn.inspection import permutation_importance
  from sklearn.linear_model import LogisticRegression
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.linear_model import LinearRegression
  from sklearn.tree import DecisionTreeRegressor
  import matplotlib.pyplot as plt
  from xgboost import XGBClassifier, XGBRegressor
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
+ from nkululeko.utils.stats import normalize
  from nkululeko.plots import Plots
  import nkululeko.glob_conf as glob_conf
  from nkululeko.reporting.report_item import ReportItem
@@ -26,21 +28,48 @@ class FeatureAnalyser
 
      def analyse(self):
          models = ast.literal_eval(self.util.config_val("EXPL", "model", "[log_reg]"))
+         model_name = "_".join(models)
          max_feat_num = int(self.util.config_val("EXPL", "max_feats", "10"))
+         # https://scikit-learn.org/stable/modules/permutation_importance.html
+         permutation = eval(self.util.config_val("EXPL", "permutation", "False"))
          importance = None
          self.util.debug("analysing features...")
          result_importances = {}
          if self.util.exp_is_classification():
              for model_s in models:
+                 if permutation:
+                     self.util.debug(
+                         f"computing feature importance via permutation for {model_s}, might take longer..."
+                     )
                  if model_s == "log_reg":
                      model = LogisticRegression()
                      model.fit(self.features, self.labels)
-                     importance = model.coef_[0]
+                     if permutation:
+                         r = permutation_importance(
+                             model,
+                             self.features,
+                             self.labels,
+                             n_repeats=30,
+                             random_state=0,
+                         )
+                         importance = r["importances_mean"]
+                     else:
+                         importance = model.coef_[0]
                      result_importances[model_s] = importance
                  elif model_s == "tree":
                      model = DecisionTreeClassifier()
                      model.fit(self.features, self.labels)
-                     importance = model.feature_importances_
+                     if permutation:
+                         r = permutation_importance(
+                             model,
+                             self.features,
+                             self.labels,
+                             n_repeats=30,
+                             random_state=0,
+                         )
+                         importance = r["importances_mean"]
+                     else:
+                         importance = model.feature_importances_
                      result_importances[model_s] = importance
                      plot_tree = eval(self.util.config_val("EXPL", "plot_tree", "False"))
                      if plot_tree:
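The new code path above wraps scikit-learn's model-agnostic permutation importance. A self-contained sketch of the same call pattern, using synthetic data in place of the experiment's features and labels (n_repeats=30 and random_state=0 mirror the values hard-coded in the diff):

    from sklearn.datasets import make_classification
    from sklearn.inspection import permutation_importance
    from sklearn.linear_model import LogisticRegression

    # synthetic stand-in for self.features / self.labels
    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    model = LogisticRegression().fit(X, y)

    r = permutation_importance(model, X, y, n_repeats=30, random_state=0)
    # the result is a Bunch: r["importances_mean"] (as used in the diff)
    # and r.importances_mean are equivalent
    print(r.importances_mean)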
@@ -50,26 +79,70 @@ class FeatureAnalyser
                      model = XGBClassifier(enable_categorical=True, tree_method="hist")
                      self.labels = self.labels.astype("category")
                      model.fit(self.features, self.labels)
-                     importance = model.feature_importances_
+                     if permutation:
+                         r = permutation_importance(
+                             model,
+                             self.features,
+                             self.labels,
+                             n_repeats=30,
+                             random_state=0,
+                         )
+                         importance = r["importances_mean"]
+                     else:
+                         importance = model.feature_importances_
                      result_importances[model_s] = importance
                  else:
                      self.util.error(f"invalid analysis method: {model}")
          else:  # regression experiment
              for model_s in models:
+                 if permutation:
+                     self.util.debug(
+                         f"computing feature importance via permutation for {model_s}, might take longer..."
+                     )
                  if model_s == "lin_reg":
                      model = LinearRegression()
                      model.fit(self.features, self.labels)
-                     importance = model.coef_
+                     if permutation:
+                         r = permutation_importance(
+                             model,
+                             self.features,
+                             self.labels,
+                             n_repeats=30,
+                             random_state=0,
+                         )
+                         importance = r["importances_mean"]
+                     else:
+                         importance = model.coef_
                      result_importances[model_s] = importance
                  elif model_s == "tree":
                      model = DecisionTreeRegressor()
                      model.fit(self.features, self.labels)
-                     importance = model.feature_importances_
+                     if permutation:
+                         r = permutation_importance(
+                             model,
+                             self.features,
+                             self.labels,
+                             n_repeats=30,
+                             random_state=0,
+                         )
+                         importance = r["importances_mean"]
+                     else:
+                         importance = model.feature_importances_
                      result_importances[model_s] = importance
                  elif model_s == "xgb":
                      model = XGBRegressor()
                      model.fit(self.features, self.labels)
-                     importance = model.feature_importances_
+                     if permutation:
+                         r = permutation_importance(
+                             model,
+                             self.features,
+                             self.labels,
+                             n_repeats=30,
+                             random_state=0,
+                         )
+                         importance = r["importances_mean"]
+                     else:
+                         importance = model.feature_importances_
                      result_importances[model_s] = importance
                  else:
                      self.util.error(f"invalid analysis method: {model_s}")
@@ -79,7 +152,15 @@ class FeatureAnalyser
              }
          )
          for model_s in result_importances:
-             df_imp[f"{model_s}_importance"] = result_importances[model_s]
+             if len(result_importances) == 1:
+                 df_imp[f"{model_s}_importance"] = result_importances[model_s]
+             else:
+                 # normalize the distributions because they might be different
+                 self.util.debug(f"scaling importance values for {model_s}")
+                 importance = result_importances[model_s]
+                 importance = normalize(importance.reshape(-1, 1))
+                 df_imp[f"{model_s}_importance"] = importance
+
          df_imp["importance"] = df_imp.iloc[:, 1:].mean(axis=1).values
          df_imp = df_imp.sort_values(by="importance", ascending=False).iloc[
              :max_feat_num
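When more than one model is requested, the importances now get normalized before averaging, since regression coefficients and tree importances live on different scales. The implementation of normalize ships in nkululeko/utils/stats.py (entry 54, +8 lines), whose body is not part of this excerpt; the sketch below assumes min-max scaling via scikit-learn as a plausible stand-in:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    def normalize(values: np.ndarray) -> np.ndarray:
        # expects a column vector, matching the reshape(-1, 1) call above
        return MinMaxScaler().fit_transform(values)

    tree_imp = np.array([0.7, 0.1, 0.2])     # tree importances, sum to 1
    logreg_imp = np.array([12.0, 3.0, 5.0])  # coefficients, arbitrary scale
    # after scaling, both vectors live on [0, 1] and averaging is meaningful
    combined = np.mean(
        [normalize(tree_imp.reshape(-1, 1)), normalize(logreg_imp.reshape(-1, 1))],
        axis=0,
    )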
@@ -92,21 +173,31 @@ class FeatureAnalyser
              ax.annotate(
                  str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005)
              )
-         ax.set(title=f"{self.label} samples")
+         title = (
+             f"Feature importance for {self.label} samples with model(s) {model_name}"
+         )
+         if permutation:
+             title += "\n based on feature permutation"
+         ax.set(title=title)
          plt.tight_layout()
          fig_dir = self.util.get_path("fig_dir") + "../"  # one up because of the runs
          exp_name = self.util.get_exp_name(only_data=True)
          format = self.util.config_val("PLOT", "format", "png")
-         model_name = "_".join(result_importances.keys())
-         filename = f"{fig_dir}{exp_name}_EXPL_{model_name}.{format}"
+         filename = f"_EXPL_{model_name}"
+         if permutation:
+             filename += "_perm"
+         filename = f"{fig_dir}{exp_name}{filename}.{format}"
          plt.savefig(filename)
          fig = ax.figure
          fig.clear()
          plt.close(fig)
+         caption = f"Feature importance"
+         if permutation:
+             caption += " based on permutation of features."
          glob_conf.report.add_item(
              ReportItem(
                  Header.HEADER_EXPLORE,
-                 f"Feature importance",
+                 caption,
                  f"using {model_name} models",
                  filename,
              )
@@ -114,16 +205,17 @@ class FeatureAnalyser
 
          # result file
          res_dir = self.util.get_path("res_dir")
-         file_name = (
-             f"{res_dir}{self.util.get_exp_name(only_data=True)}EXPL_{model_s}.txt"
-         )
-         with open(file_name, "w") as text_file:
+         filename = f"_EXPL_{model_name}"
+         if permutation:
+             filename += "_perm"
+         filename = f"{res_dir}{self.util.get_exp_name(only_data=True)}{filename}_{model_name}.txt"
+         with open(filename, "w") as text_file:
              text_file.write(
                  "features in order of decreasing importance according to model"
-                 f" {model_s}:\n" + f"{str(df_imp.feats.values)}\n"
+                 f" {model_name}:\n" + f"{str(df_imp.feats.values)}\n"
              )
 
-         df_imp.to_csv(file_name, mode="a")
+         df_imp.to_csv(filename, mode="a")
 
          # check if feature distributions should be plotted
          plot_feats = self.util.config_val("EXPL", "feature_distributions", False)
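All of the feats_analyser.py changes hang off a single new EXPL option. A hypothetical config fragment that would exercise them; the keys are the ones read via config_val() above, but the values are illustrative only:

    [EXPL]
    model = ['log_reg', 'tree']
    max_feats = 10
    # new in 0.77.1: compute importance by feature permutation
    permutation = True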
nkululeko/feat_extract/feats_clap.py CHANGED
@@ -1,6 +1,6 @@
  # feats_clap.py
 
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  from nkululeko.feat_extract.featureset import Featureset
  import os
  import pandas as pd
@@ -32,16 +32,12 @@ class Clap(Featureset):
          store = self.util.get_path("store")
          store_format = self.util.config_val("FEATS", "store_format", "pkl")
          storage = f"{store}{self.name}.{store_format}"
-         extract = self.util.config_val(
-             "FEATS", "needs_feature_extraction", False
-         )
+         extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          if extract or no_reuse or not os.path.isfile(storage):
              if not self.model_initialized:
                  self.init_model()
-             self.util.debug(
-                 "extracting clap embeddings, this might take a while..."
-             )
+             self.util.debug("extracting clap embeddings, this might take a while...")
              emb_series = pd.Series(index=self.data_df.index, dtype=object)
              length = len(self.data_df.index)
              for idx, (file, start, end) in enumerate(
@@ -55,9 +51,7 @@ class Clap(Featureset):
                  )
                  emb = self.get_embeddings(signal, sampling_rate)
                  emb_series[idx] = emb
-             self.df = pd.DataFrame(
-                 emb_series.values.tolist(), index=self.data_df.index
-             )
+             self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
              self.util.write_store(self.df, storage, store_format)
              try:
                  glob_conf.config["DATA"]["needs_feature_extraction"] = "false"
nkululeko/feat_extract/feats_import.py CHANGED
@@ -1,6 +1,6 @@
  # feats_import.py
 
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  from nkululeko.feat_extract.featureset import Featureset
  import os
  import pandas as pd
@@ -17,9 +17,7 @@ class Importset(Featureset):
          """Import the features or load them from disk if present."""
          store = self.util.get_path("store")
          storage = f"{store}{self.name}.pkl"
-         extract = eval(
-             self.util.config_val("FEATS", "needs_feature_extraction", False)
-         )
+         extract = eval(self.util.config_val("FEATS", "needs_feature_extraction", False))
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          feat_import_file = self.util.config_val("FEATS", "import_file", False)
          if not os.path.isfile(feat_import_file):
nkululeko/feat_extract/feats_mld.py CHANGED
@@ -4,7 +4,7 @@ import sys
  import os
  import pandas as pd
  import numpy as np
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  import nkululeko.glob_conf as glob_conf
 
 
@@ -22,19 +22,14 @@ class MLD_set(Featureset):
              os.remove(storage)
          if not os.path.isfile(storage):
              self.util.debug(
-                 "extracting midleveldescriptor features, this might take a"
-                 " while..."
+                 "extracting midleveldescriptor features, this might take a" " while..."
              )
          else:
-             self.util.debug(
-                 "reusing previously extracted midleveldescriptor features"
-             )
+             self.util.debug("reusing previously extracted midleveldescriptor features")
          import midlevel_descriptors as mld
 
          fex_mld = mld.MLD()
-         self.df = fex_mld.extract_from_index(
-             index=self.data_df, cache_path=storage
-         )
+         self.df = fex_mld.extract_from_index(index=self.data_df, cache_path=storage)
          self.util.debug(f"MLD feats shape: {self.df.shape}")
          # shouldn't happen
          # replace NANa with column means values
nkululeko/feat_extract/feats_mos.py CHANGED
@@ -19,7 +19,7 @@ from torchaudio.pipelines import SQUIM_SUBJECTIVE
  from torchaudio.utils import download_asset
  import audiofile
  import nkululeko.glob_conf as glob_conf
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  from nkululeko.feat_extract.featureset import Featureset
 
 
@@ -36,9 +36,7 @@ class MOSSet(Featureset):
          # load model
          self.util.debug("loading MOS model...")
          self.subjective_model = SQUIM_SUBJECTIVE.get_model()
-         NMR_SPEECH = download_asset(
-             "tutorial-assets/ctc-decoding/1688-142285-0007.wav"
-         )
+         NMR_SPEECH = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
          self.WAVEFORM_NMR, SAMPLE_RATE_NMR = torchaudio.load(NMR_SPEECH)
          self.model_initialized = True
 
@@ -47,9 +45,7 @@ class MOSSet(Featureset):
          store = self.util.get_path("store")
          store_format = self.util.config_val("FEATS", "store_format", "pkl")
          storage = f"{store}{self.name}.{store_format}"
-         extract = self.util.config_val(
-             "FEATS", "needs_feature_extraction", False
-         )
+         extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          if extract or no_reuse or not os.path.isfile(storage):
              if not self.model_initialized:
@@ -68,9 +64,7 @@ class MOSSet(Featureset):
              )
              emb = self.get_embeddings(signal, sampling_rate, file)
              emb_series[idx] = emb
-         self.df = pd.DataFrame(
-             emb_series.values.tolist(), index=self.data_df.index
-         )
+         self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
          self.df.columns = ["mos"]
          self.util.write_store(self.df, storage, store_format)
          try:
@@ -91,9 +85,7 @@ class MOSSet(Featureset):
          tmp_audio_name = "mos_audio_tmp.wav"
          try:
              audiofile.write(tmp_audio_name, signal, sampling_rate)
-             WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(
-                 tmp_audio_name
-             )
+             WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(tmp_audio_name)
              with torch.no_grad():
                  mos = self.subjective_model(WAVEFORM_SPEECH, self.WAVEFORM_NMR)
          except RuntimeError as re:
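The feats_mos.py hunks are pure reformatting, but they show the whole torchaudio SQUIM subjective-MOS pipeline the class wraps. A minimal sketch of that pipeline outside nkululeko ("speech.wav" is a placeholder path; SQUIM models expect 16 kHz audio):

    import torch
    import torchaudio
    from torchaudio.pipelines import SQUIM_SUBJECTIVE
    from torchaudio.utils import download_asset

    model = SQUIM_SUBJECTIVE.get_model()
    # the same non-matching-reference clip the diff downloads
    nmr_wave, _ = torchaudio.load(
        download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
    )
    speech_wave, _ = torchaudio.load("speech.wav")
    with torch.no_grad():
        mos = model(speech_wave, nmr_wave)  # predicted mean opinion score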
nkululeko/feat_extract/feats_oxbow.py CHANGED
@@ -1,6 +1,6 @@
  # feats_oxbow.py
 
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  from nkululeko.feat_extract.featureset import Featureset
  import os
  import pandas as pd
@@ -21,15 +21,11 @@ class Openxbow(Featureset):
          self.feature_set = eval(f"opensmile.FeatureSet.{self.featset}")
          store = self.util.get_path("store")
          storage = f"{store}{self.name}_{self.featset}.pkl"
-         extract = self.util.config_val(
-             "FEATS", "needs_feature_extraction", False
-         )
+         extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          if extract or no_reuse or not os.path.isfile(storage):
              # extract smile features first
-             self.util.debug(
-                 "extracting openSmile features, this might take a while..."
-             )
+             self.util.debug("extracting openSmile features, this might take a while...")
              smile = opensmile.Smile(
                  feature_set=self.feature_set,
                  feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
@@ -52,9 +48,7 @@ class Openxbow(Featureset):
              # save the smile features
              smile_df.to_csv(lld_name, sep=";", header=False)
              # get the path of the xbow java jar file
-             xbow_path = self.util.config_val(
-                 "FEATS", "xbow.model", "../openXBOW/"
-             )
+             xbow_path = self.util.config_val("FEATS", "xbow.model", "../openXBOW/")
              # get the size of the codebook
              size = self.util.config_val("FEATS", "size", 500)
              # get the number of assignements
@@ -83,8 +77,7 @@ class Openxbow(Featureset):
          if with_os:
              # extract smile functionals
              self.util.debug(
-                 "extracting openSmile functionals, this might take a"
-                 " while..."
+                 "extracting openSmile functionals, this might take a" " while..."
              )
              smile = opensmile.Smile(
                  feature_set=opensmile.FeatureSet.eGeMAPSv02,  # always use eGemaps for this
nkululeko/feat_extract/feats_snr.py CHANGED
@@ -6,7 +6,7 @@ from tqdm import tqdm
  import pandas as pd
  import audiofile
  import nkululeko.glob_conf as glob_conf
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  from nkululeko.feat_extract.featureset import Featureset
  from nkululeko.autopredict.estimate_snr import SNREstimator
 
@@ -23,9 +23,7 @@ class SNRSet(Featureset):
          store = self.util.get_path("store")
          store_format = self.util.config_val("FEATS", "store_format", "pkl")
          storage = f"{store}{self.name}.{store_format}"
-         extract = self.util.config_val(
-             "FEATS", "needs_feature_extraction", False
-         )
+         extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          if extract or no_reuse or not os.path.isfile(storage):
              self.util.debug("estimating SNR, this might take a while...")
@@ -42,9 +40,7 @@ class SNRSet(Featureset):
              snr = self.get_snr(signal[0], sampling_rate)
              snr_series[idx] = snr
          print("")
-         self.df = pd.DataFrame(
-             snr_series.values.tolist(), index=self.data_df.index
-         )
+         self.df = pd.DataFrame(snr_series.values.tolist(), index=self.data_df.index)
          self.df.columns = ["snr"]
          self.util.write_store(self.df, storage, store_format)
          try:
nkululeko/feat_extract/feats_squim.py CHANGED
@@ -25,7 +25,7 @@ import torchaudio
  from torchaudio.pipelines import SQUIM_OBJECTIVE
  import audiofile
  import nkululeko.glob_conf as glob_conf
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  from nkululeko.feat_extract.featureset import Featureset
 
 
@@ -49,9 +49,7 @@ class SQUIMSet(Featureset):
          store = self.util.get_path("store")
          store_format = self.util.config_val("FEATS", "store_format", "pkl")
          storage = f"{store}{self.name}.{store_format}"
-         extract = self.util.config_val(
-             "FEATS", "needs_feature_extraction", False
-         )
+         extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          if extract or no_reuse or not os.path.isfile(storage):
              if not self.model_initialized:
@@ -70,9 +68,7 @@ class SQUIMSet(Featureset):
              )
              emb = self.get_embeddings(signal, sampling_rate, file)
              emb_series[idx] = emb
-         self.df = pd.DataFrame(
-             emb_series.values.tolist(), index=self.data_df.index
-         )
+         self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
          self.df.columns = ["pesq", "sdr", "stoi"]
          self.util.write_store(self.df, storage, store_format)
          try:
@@ -93,13 +89,9 @@ class SQUIMSet(Featureset):
          tmp_audio_name = "squim_audio_tmp.wav"
          try:
              audiofile.write(tmp_audio_name, signal, sampling_rate)
-             WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(
-                 tmp_audio_name
-             )
+             WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(tmp_audio_name)
              with torch.no_grad():
-                 stoi_hyp, pesq_hyp, si_sdr_hyp = self.objective_model(
-                     WAVEFORM_SPEECH
-                 )
+                 stoi_hyp, pesq_hyp, si_sdr_hyp = self.objective_model(WAVEFORM_SPEECH)
              pesq = float(pesq_hyp[0].numpy())
              stoi = float(stoi_hyp[0].numpy())
              sdr = float(si_sdr_hyp[0].numpy())
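For comparison with the subjective model above, the objective SQUIM pipeline used here needs no reference clip and returns (stoi, pesq, si_sdr) in that order, matching the unpacking in the diff. A minimal sketch ("speech.wav" is a placeholder; 16 kHz audio assumed):

    import torch
    import torchaudio
    from torchaudio.pipelines import SQUIM_OBJECTIVE

    model = SQUIM_OBJECTIVE.get_model()
    wave, _ = torchaudio.load("speech.wav")
    with torch.no_grad():
        stoi_hyp, pesq_hyp, si_sdr_hyp = model(wave)
    print(float(stoi_hyp[0]), float(pesq_hyp[0]), float(si_sdr_hyp[0]))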
nkululeko/feat_extract/feats_trill.py CHANGED
@@ -5,7 +5,7 @@ from numpy.core.numeric import tensordot
  from tqdm import tqdm
  import pandas as pd
  import audiofile as af
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  import nkululeko.glob_conf as glob_conf
  from nkululeko.feat_extract.featureset import Featureset
 
@@ -43,24 +43,16 @@ class TRILLset(Featureset):
      def extract(self):
          store = self.util.get_path("store")
          storage = f"{store}{self.name}.pkl"
-         extract = self.util.config_val(
-             "FEATS", "needs_feature_extraction", False
-         )
+         extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
          no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
          if extract or no_reuse or not os.path.isfile(storage):
-             self.util.debug(
-                 "extracting TRILL embeddings, this might take a while..."
-             )
+             self.util.debug("extracting TRILL embeddings, this might take a while...")
              emb_series = pd.Series(index=self.data_df.index, dtype=object)
              length = len(self.data_df.index)
-             for idx, file in enumerate(
-                 tqdm(self.data_df.index.get_level_values(0))
-             ):
+             for idx, file in enumerate(tqdm(self.data_df.index.get_level_values(0))):
                  emb = self.getEmbeddings(file)
                  emb_series[idx] = emb
-             self.df = pd.DataFrame(
-                 emb_series.values.tolist(), index=self.data_df.index
-             )
+             self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
              self.df.to_pickle(storage)
              try:
                  glob_conf.config["DATA"]["needs_feature_extraction"] = "false"
nkululeko/feat_extract/featureset.py CHANGED
@@ -1,6 +1,6 @@
  # featureset.py
  import pandas as pd
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  import nkululeko.glob_conf as glob_conf
  import ast
 
@@ -23,9 +23,7 @@ class Featureset:
          self.df = self.df[self.df.index.isin(self.data_df.index)]
          try:
              # use only some features
-             selected_features = ast.literal_eval(
-                 glob_conf.config["FEATS"]["features"]
-             )
+             selected_features = ast.literal_eval(glob_conf.config["FEATS"]["features"])
              self.util.debug(f"selecting features: {selected_features}")
              sel_feats_df = pd.DataFrame()
              hit = False
nkululeko/feat_extract/feinberg_praat.py CHANGED
@@ -11,7 +11,7 @@ import math
  from tqdm import tqdm
  import parselmouth
  import statistics
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  import audiofile
  from parselmouth.praat import call
  from scipy.stats.mstats import zscore
nkululeko/feature_extractor.py CHANGED
@@ -6,7 +6,7 @@ Helper class to encapsulate feature extraction methods
  """
  import pandas as pd
 
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
 
 
  class FeatureExtractor:
nkululeko/file_checker.py CHANGED
@@ -1,5 +1,5 @@
  import pandas as pd
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  import os
 
 
@@ -39,7 +39,9 @@ class FileChecker:
          min = self.util.config_val_data(data_name, "check_size", False)
          if min:
              if min == "True":
-                 min = 1000  # 1000 bytes would be a reasonable minimal size for 16 kHz sr
+                 min = (
+                     1000  # 1000 bytes would be a reasonable minimal size for 16 kHz sr
+                 )
              old_samples = self.df.shape[0]
              df = self.df.copy()
              for i in self.df.index:
@@ -66,9 +68,7 @@ class FileChecker:
          else:
              check = self.util.config_val_data(data_name, "check_vad", False)
              if check:
-                 self.util.debug(
-                     f"{data_name}: checking for samples without speech."
-                 )
+                 self.util.debug(f"{data_name}: checking for samples without speech.")
                  SAMPLING_RATE = 16000
                  (
                      get_speech_timestamps,
nkululeko/filter_data.py CHANGED
@@ -1,7 +1,7 @@
  import audformat
  import pandas as pd
  import nkululeko.glob_conf as glob_conf
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  import ast
 
 
@@ -45,9 +45,7 @@ class DataFilter:
          the samples are selected randomly
          """
          if data_name == "":
-             max = self.util.config_val(
-                 "DATA", "limit_samples_per_speaker", False
-             )
+             max = self.util.config_val("DATA", "limit_samples_per_speaker", False)
          else:
              max = self.util.config_val_data(
                  data_name, "limit_samples_per_speaker", False
@@ -73,12 +71,8 @@ class DataFilter:
      def filter_duration(self, data_name=""):
          """remove all samples less than min_dur duration"""
          if data_name == "":
-             min_dur = self.util.config_val(
-                 "DATA", "min_duration_of_sample", False
-             )
-             max_dur = self.util.config_val(
-                 "DATA", "max_duration_of_sample", False
-             )
+             min_dur = self.util.config_val("DATA", "min_duration_of_sample", False)
+             max_dur = self.util.config_val("DATA", "max_duration_of_sample", False)
          else:
              min_dur = self.util.config_val_data(
                  data_name, "min_duration_of_sample", False
@@ -175,9 +169,7 @@ def filter_min_dur(df, min_dur):
      glob_conf.util.debug(
          "converting file index to multi index, this might take a while..."
      )
-     df_ret.index = audformat.utils.to_segmented_index(
-         df.index, allow_nat=False
-     )
+     df_ret.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
      for i in df_ret.index:
          start = i[1]
          end = i[2]
@@ -197,9 +189,7 @@ def filter_max_dur(df, max_dur):
      glob_conf.util.debug(
          "converting file index to multi index, this might take a while..."
      )
-     df_ret.index = audformat.utils.to_segmented_index(
-         df.index, allow_nat=False
-     )
+     df_ret.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
      for i in df_ret.index:
          start = i[1]
          end = i[2]
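filter_min_dur and filter_max_dur rely on audformat's segmented index, a pandas MultiIndex of (file, start, end), so a sample's duration is simply end - start at positions i[1] and i[2]. A small self-contained illustration of that filtering idea with synthetic data:

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [
            ("a.wav", pd.Timedelta("0s"), pd.Timedelta("2s")),
            ("b.wav", pd.Timedelta("0s"), pd.Timedelta("0.3s")),
        ],
        names=["file", "start", "end"],
    )
    df = pd.DataFrame({"label": ["x", "y"]}, index=idx)

    min_dur = pd.Timedelta("0.5s")
    keep = [(i[2] - i[1]) >= min_dur for i in df.index]
    print(df[keep])  # drops b.wav, which is shorter than min_dur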
nkululeko/modelrunner.py CHANGED
@@ -2,7 +2,7 @@
 
  import pandas as pd
 
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  from nkululeko import glob_conf
  import nkululeko.glob_conf as glob_conf
 
nkululeko/models/model.py CHANGED
@@ -1,5 +1,5 @@
  # model.py
- from nkululeko.util import Util
+ from nkululeko.utils.util import Util
  import pandas as pd
  import numpy as np
  import nkululeko.glob_conf as glob_conf