nkululeko 0.59.1__py3-none-any.whl → 0.61.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/constants.py +1 -1
- nkululeko/experiment.py +43 -43
- nkululeko/feature_extractor.py +101 -58
- nkululeko/modelrunner.py +14 -14
- nkululeko/plots.py +11 -0
- nkululeko/segment.py +23 -27
- nkululeko/test_predictor.py +1 -1
- {nkululeko-0.59.1.dist-info → nkululeko-0.61.0.dist-info}/METADATA +13 -1
- nkululeko-0.61.0.dist-info/RECORD +31 -0
- {nkululeko-0.59.1.dist-info → nkululeko-0.61.0.dist-info}/WHEEL +1 -1
- nkululeko/ap_age.py +0 -31
- nkululeko/ap_arousal.py +0 -30
- nkululeko/ap_dominance.py +0 -29
- nkululeko/ap_gender.py +0 -29
- nkululeko/ap_mos.py +0 -35
- nkululeko/ap_pesq.py +0 -35
- nkululeko/ap_sdr.py +0 -36
- nkululeko/ap_snr.py +0 -35
- nkululeko/ap_stoi.py +0 -34
- nkululeko/ap_valence.py +0 -30
- nkululeko/augmenter.py +0 -64
- nkululeko/dataset.py +0 -415
- nkululeko/dataset_csv.py +0 -49
- nkululeko/dataset_ravdess.py +0 -19
- nkululeko/estimate_snr.py +0 -89
- nkululeko/feats_agender.py +0 -63
- nkululeko/feats_agender_agender.py +0 -65
- nkululeko/feats_analyser.py +0 -87
- nkululeko/feats_audmodel.py +0 -63
- nkululeko/feats_audmodel_dim.py +0 -63
- nkululeko/feats_clap.py +0 -74
- nkululeko/feats_import.py +0 -44
- nkululeko/feats_mld.py +0 -47
- nkululeko/feats_mos.py +0 -92
- nkululeko/feats_opensmile.py +0 -84
- nkululeko/feats_oxbow.py +0 -87
- nkululeko/feats_praat.py +0 -72
- nkululeko/feats_snr.py +0 -63
- nkululeko/feats_squim.py +0 -99
- nkululeko/feats_trill.py +0 -74
- nkululeko/feats_wav2vec2.py +0 -94
- nkululeko/featureset.py +0 -41
- nkululeko/feinberg_praat.py +0 -430
- nkululeko/loss_ccc.py +0 -28
- nkululeko/loss_softf1loss.py +0 -40
- nkululeko/model.py +0 -256
- nkululeko/model_bayes.py +0 -14
- nkululeko/model_cnn.py +0 -118
- nkululeko/model_gmm.py +0 -16
- nkululeko/model_knn.py +0 -16
- nkululeko/model_knn_reg.py +0 -16
- nkululeko/model_mlp.py +0 -175
- nkululeko/model_mlp_regression.py +0 -197
- nkululeko/model_svm.py +0 -18
- nkululeko/model_svr.py +0 -18
- nkululeko/model_tree.py +0 -14
- nkululeko/model_tree_reg.py +0 -14
- nkululeko/model_xgb.py +0 -12
- nkululeko/model_xgr.py +0 -12
- nkululeko/randomsplicer.py +0 -76
- nkululeko/randomsplicing.py +0 -74
- nkululeko-0.59.1.dist-info/RECORD +0 -82
- {nkululeko-0.59.1.dist-info → nkululeko-0.61.0.dist-info}/LICENSE +0 -0
- {nkululeko-0.59.1.dist-info → nkululeko-0.61.0.dist-info}/top_level.txt +0 -0
nkululeko/feats_oxbow.py
DELETED
@@ -1,87 +0,0 @@
|
|
1
|
-
# feats_oxbow.py
|
2
|
-
|
3
|
-
from nkululeko.util import Util
|
4
|
-
from nkululeko.featureset import Featureset
|
5
|
-
import os
|
6
|
-
import pandas as pd
|
7
|
-
import opensmile
|
8
|
-
|
9
|
-
class Openxbow(Featureset):
    """Class to extract openXBOW processed opensmile features (https://github.com/openXBOW)

    openSMILE low-level descriptors are written to a CSV file, handed to the
    external openXBOW Java tool, and the resulting vectors are read back and
    cached as a pickle in the experiment's store directory.
    """

    def __init__(self, name, data_df, is_train=False):
        """Constructor.

        Args:
            name: designation of the set, used in the cache file name.
            data_df: dataframe whose index names the audio to process.
            is_train: True for the training split, which creates and stores
                the codebook; other splits use the codebook from the training.
        """
        super().__init__(name, data_df)
        self.is_train = is_train

    def _run_smile(self, feature_set, feature_level):
        """Run openSMILE over the index of ``data_df`` and return its dataframe."""
        smile = opensmile.Smile(
            feature_set=feature_set,
            feature_level=feature_level,
            num_workers=5,
        )
        if isinstance(self.data_df.index, pd.MultiIndex):
            return smile.process_index(self.data_df.index)
        return smile.process_files(self.data_df.index)

    def _run_openxbow(self, lld_name, xbow_name, codebook_name):
        """Call the openXBOW jar on ``lld_name`` and write ``xbow_name``."""
        # Function-scope import: subprocess is not among the module's imports.
        import subprocess

        # get the path of the xbow java jar file
        xbow_path = self.util.config_val('FEATS', 'xbow.model', '../openXBOW/')
        # get the size of the codebook
        size = self.util.config_val('FEATS', 'size', 500)
        # get the number of assignments
        assignments = self.util.config_val('FEATS', 'assignments', 10)

        # expanduser keeps "~" usable now that no shell expands the path.
        jar = os.path.join(os.path.expanduser(str(xbow_path)), 'openXBOW.jar')
        command = ['java', '-jar', jar, '-i', lld_name]
        if self.is_train:
            # store the codebook
            command += [
                '-standardizeInput', '-log',
                '-o', xbow_name,
                '-size', str(size),
                '-a', str(assignments),
                '-B', codebook_name,
            ]
        else:
            # use the codebook
            command += ['-o', xbow_name, '-b', codebook_name]

        # An argument list (no shell) keeps config values from being
        # interpreted as shell syntax.
        completed = subprocess.run(command, check=False)
        if completed.returncode != 0:
            # Previously a failed run went unnoticed and a stale or missing
            # output file was read afterwards.
            self.util.error(
                f'openXBOW exited with code {completed.returncode}')

    def extract(self):
        """Extract the features or load them from disk if present."""
        # Function-scope import: ast is not among the module's imports.
        import ast

        self.featset = self.util.config_val('FEATS', 'set', 'eGeMAPSv02')
        # Attribute lookup instead of eval(): same result for valid set
        # names, but a config value can no longer execute code.
        self.feature_set = getattr(opensmile.FeatureSet, self.featset)
        store = self.util.get_path('store')
        storage = f'{store}{self.name}_{self.featset}.pkl'
        extract = self.util.config_val('FEATS', 'needs_feature_extraction', False)
        no_reuse = ast.literal_eval(
            self.util.config_val('FEATS', 'no_reuse', 'False'))

        if not (extract or no_reuse) and os.path.isfile(storage):
            self.util.debug('reusing extracted OS features.')
            self.df = pd.read_pickle(storage)
            return

        # extract smile features first
        self.util.debug('extracting openSmile features, this might take a while...')
        smile_df = self._run_smile(
            self.feature_set, opensmile.FeatureLevel.LowLevelDescriptors)
        # Drop two index levels so only the first level remains.
        # NOTE(review): applied for both index kinds here — confirm against
        # the upstream file, whose indentation was not available.
        smile_df.index = smile_df.index.droplevel(1)
        smile_df.index = smile_df.index.droplevel(1)

        # compute xbow features
        # set some file names on disk
        lld_name, xbow_name, codebook_name = 'llds.csv', 'xbow.csv', 'xbow_codebook'
        # save the smile features
        smile_df.to_csv(lld_name, sep=';', header=False)
        # differentiate between train and test inside the helper
        self._run_openxbow(lld_name, xbow_name, codebook_name)
        # read in the result from disk
        xbow_df = pd.read_csv(xbow_name, sep=';', header=None)
        # set the index
        xbow_df = xbow_df.set_index(self.data_df.index)

        # check if smile features should be added
        with_os = self.util.config_val('FEATS', 'with_os', False)
        if with_os:
            # extract smile functionals
            self.util.debug('extracting openSmile functionals, this might take a while...')
            smile_df = self._run_smile(
                opensmile.FeatureSet.eGeMAPSv02,  # always use eGemaps for this
                opensmile.FeatureLevel.Functionals,
            )
            if not isinstance(self.data_df.index, pd.MultiIndex):
                # drop the multi index
                # NOTE(review): assumed to apply only to the file-based
                # branch — confirm against the upstream file.
                smile_df.index = smile_df.index.droplevel(1)
                smile_df.index = smile_df.index.droplevel(1)
            xbow_df = xbow_df.join(smile_df)

        # in any case, store to disk for later use
        xbow_df.to_pickle(storage)
        # and assign to be the "official" feature set
        self.df = xbow_df
|
nkululeko/feats_praat.py
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# feats_praat.py
|
2
|
-
from nkululeko.featureset import Featureset
|
3
|
-
import os
|
4
|
-
import pandas as pd
|
5
|
-
import nkululeko.glob_conf as glob_conf
|
6
|
-
from nkululeko import feinberg_praat
|
7
|
-
import ast
|
8
|
-
|
9
|
-
class Praatset(Featureset):
    """
    a feature extractor for the Praat software, based on
    David R. Feinberg's Praat scripts for the parselmouth python interface.
    https://osf.io/6dwr3/

    """

    def __init__(self, name, data_df):
        """Store the set name and the dataframe whose index names the audio."""
        super().__init__(name, data_df)

    def extract(self):
        """Extract the features based on the initialized dataset or re-open them when found on disk."""
        store = self.util.get_path('store')
        store_format = self.util.config_val('FEATS', 'store_format', 'pkl')
        storage = f'{store}{self.name}.{store_format}'
        extract = self.util.config_val('FEATS', 'needs_feature_extraction', False)
        # literal_eval instead of eval(): parses 'True'/'False' identically
        # but cannot execute arbitrary code from the config file.
        no_reuse = ast.literal_eval(
            self.util.config_val('FEATS', 'no_reuse', 'False'))
        if extract or no_reuse or not os.path.isfile(storage):
            self.util.debug('extracting Praat features, this might take a while...')
            self.df = feinberg_praat.compute_features(self.data_df.index)
            self.df = self.df.set_index(self.data_df.index)
            # Replace missing values column by column with that column's mean.
            for col in self.df.columns:
                if self.df[col].isnull().values.any():
                    self.util.debug(f'{col} includes {self.df[col].isnull().sum()} nan, inserting mean values')
                    self.df[col] = self.df[col].fillna(self.df[col].mean())

            self.util.write_store(self.df, storage, store_format)
            try:
                # The flag is written to the DATA section although it is read
                # from FEATS above; kept as found.
                glob_conf.config['DATA']['needs_feature_extraction'] = 'false'
            except KeyError:
                pass
        else:
            self.util.debug(f'reusing extracted Praat features: {storage}.')
            self.df = self.util.get_store(storage, store_format)
        self.util.debug(f'praat feature names: {self.df.columns}')
        self.df = self.df.astype(float)

    def extract_sample(self, signal, sr):
        """Not implemented: report an error and return None."""
        self.util.error('feats_praat: extracting single samples not implemented yet')
        feats = None
        return feats

    def filter(self):
        """Restrict ``self.df`` to the rows of ``data_df`` and, if configured, to selected columns.

        The column selection is read from ``FEATS`` / ``praat.features``;
        when that key is absent only the row filter is applied. Requested
        names that are not columns of ``self.df`` are skipped.
        """
        # use only the features that are indexed in the target dataframes
        self.df = self.df[self.df.index.isin(self.data_df.index)]
        try:
            # Only the config lookup is guarded, so a KeyError raised by
            # anything else is no longer silently discarded.
            raw_selection = glob_conf.config['FEATS']['praat.features']
        except KeyError:
            return
        # use only some features
        selected_features = ast.literal_eval(raw_selection)
        self.util.debug(f'selecting features from Praat: {selected_features}')
        sel_feats_df = pd.DataFrame()
        for feat in selected_features:
            try:
                sel_feats_df[feat] = self.df[feat]
            except KeyError:
                # unknown feature name: ignore it
                continue
        # Keep the unfiltered frame when none of the requested names matched.
        if len(sel_feats_df.columns) > 0:
            self.df = sel_feats_df
            self.util.debug(f'new feats shape after selecting Praat features: {self.df.shape}')
|
nkululeko/feats_snr.py
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
""" feats_snr.py
|
2
|
-
Estimate snr (signal to noise ratio as acoustic features)
|
3
|
-
"""
|
4
|
-
from nkululeko.util import Util
|
5
|
-
from nkululeko.featureset import Featureset
|
6
|
-
from nkululeko.estimate_snr import SNREstimator
|
7
|
-
import os
|
8
|
-
import pandas as pd
|
9
|
-
import os
|
10
|
-
import nkululeko.glob_conf as glob_conf
|
11
|
-
import audiofile
|
12
|
-
|
13
|
-
class SNRSet(Featureset):
    """Class to estimate snr"""

    def __init__(self, name, data_df):
        """Store the set name and the dataframe whose index names the audio segments."""
        super().__init__(name, data_df)

    def extract(self):
        """Estimate the features or load them from disk if present.

        The result is stored in ``self.df`` with a single column ``snr``.
        """
        # Function-scope import: ast is not among the module's imports.
        import ast

        store = self.util.get_path('store')
        store_format = self.util.config_val('FEATS', 'store_format', 'pkl')
        storage = f'{store}{self.name}.{store_format}'
        extract = self.util.config_val('FEATS', 'needs_feature_extraction', False)
        # literal_eval instead of eval(): parses 'True'/'False' identically
        # but cannot execute arbitrary code from the config file.
        no_reuse = ast.literal_eval(
            self.util.config_val('FEATS', 'no_reuse', 'False'))
        if extract or no_reuse or not os.path.isfile(storage):
            self.util.debug('estimating SNR, this might take a while...')
            # Collect into a list; assigning series[int] on a non-integer
            # index relies on a deprecated positional fallback in pandas.
            snr_values = []
            for idx, (file, start, end) in enumerate(self.data_df.index.to_list()):
                signal, sampling_rate = audiofile.read(
                    file,
                    offset=start.total_seconds(),
                    duration=(end - start).total_seconds(),
                    always_2d=True,
                )
                # only the first row of the 2-d signal is analysed
                snr_values.append(self.get_snr(signal[0], sampling_rate))
                if idx % 10 == 0:
                    print('.', end='')
            print('')
            self.df = pd.DataFrame(
                snr_values, index=self.data_df.index, columns=['snr'])
            self.util.write_store(self.df, storage, store_format)
            try:
                glob_conf.config['DATA']['needs_feature_extraction'] = 'false'
            except KeyError:
                pass
        else:
            self.util.debug('reusing estimated SNR values')
            self.df = self.util.get_store(storage, store_format)

    def get_snr(self, signal, sampling_rate):
        r"""Estimate SNR from raw audio signal.

        Args:
            signal: audio signal
            sampling_rate: sample rate
        Returns
            snr: estimated signal to noise ratio
        """
        snr_estimator = SNREstimator(signal, sampling_rate)
        # estimate_snr() returns four values; only the first is used here.
        estimated_snr, _, _, _ = snr_estimator.estimate_snr()
        return estimated_snr

    def extract_sample(self, signal, sr):
        """Estimate the SNR of a single signal.

        No model has to be loaded for this estimator; the former call to
        ``self.init_model()`` referred to a method this class does not
        define and has been removed.
        """
        feats = self.get_snr(signal, sr)
        return feats
|
nkululeko/feats_squim.py
DELETED
@@ -1,99 +0,0 @@
|
|
1
|
-
""" feats_squim.py
|
2
|
-
predict SQUIM ( SPEECH QUALITY AND INTELLIGIBILITY
|
3
|
-
MEASURES) features
|
4
|
-
|
5
|
-
|
6
|
-
Wideband Perceptual Estimation of Speech Quality (PESQ) [2]
|
7
|
-
Short-Time Objective Intelligibility (STOI) [3]
|
8
|
-
Scale-Invariant Signal-to-Distortion Ratio (SI-SDR) [4]
|
9
|
-
|
10
|
-
|
11
|
-
adapted from
|
12
|
-
https://pytorch.org/audio/main/tutorials/squim_tutorial.html#sphx-glr-tutorials-squim-tutorial-py
|
13
|
-
paper: https://arxiv.org/pdf/2304.01448.pdf
|
14
|
-
|
15
|
-
needs
|
16
|
-
pip uninstall -y torch torchvision torchaudio
|
17
|
-
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
18
|
-
|
19
|
-
"""
|
20
|
-
from nkululeko.util import Util
|
21
|
-
from nkululeko.featureset import Featureset
|
22
|
-
import os
|
23
|
-
import pandas as pd
|
24
|
-
import os
|
25
|
-
import nkululeko.glob_conf as glob_conf
|
26
|
-
import audiofile
|
27
|
-
import torch
|
28
|
-
import torchaudio
|
29
|
-
from torchaudio.pipelines import SQUIM_OBJECTIVE
|
30
|
-
|
31
|
-
class SQUIMSet(Featureset):
    """Class to predict SQUIM features

    Produces three columns per segment: ``pesq``, ``sdr`` and ``stoi``.
    """

    def __init__(self, name, data_df):
        """Constructor. The model itself is loaded lazily by init_model()."""
        super().__init__(name, data_df)
        self.device = self.util.config_val('MODEL', 'device', 'cpu')
        self.model_initialized = False

    def init_model(self):
        """Load the torchaudio SQUIM objective model and mark it as initialized."""
        # load model
        self.util.debug('loading model...')
        self.objective_model = SQUIM_OBJECTIVE.get_model()
        self.model_initialized = True

    def extract(self):
        """Extract the features or load them from disk if present."""
        # Function-scope import: ast is not among the module's imports.
        import ast

        store = self.util.get_path('store')
        store_format = self.util.config_val('FEATS', 'store_format', 'pkl')
        storage = f'{store}{self.name}.{store_format}'
        extract = self.util.config_val('FEATS', 'needs_feature_extraction', False)
        # literal_eval instead of eval(): parses 'True'/'False' identically
        # but cannot execute arbitrary code from the config file.
        no_reuse = ast.literal_eval(
            self.util.config_val('FEATS', 'no_reuse', 'False'))
        if extract or no_reuse or not os.path.isfile(storage):
            if not self.model_initialized:
                self.init_model()
            self.util.debug('predicting SQUIM, this might take a while...')
            # Collect into a list; assigning series[int] on a non-integer
            # index relies on a deprecated positional fallback in pandas.
            predictions = []
            length = len(self.data_df.index)
            for idx, (file, start, end) in enumerate(self.data_df.index.to_list()):
                signal, sampling_rate = audiofile.read(
                    file,
                    offset=start.total_seconds(),
                    duration=(end - start).total_seconds(),
                    always_2d=True,
                )
                predictions.append(self.get_embeddings(signal, sampling_rate))
                if idx % 10 == 0:
                    self.util.debug(f'SQUIM: {idx} of {length} done')
            # Column order matches the tuple returned by get_embeddings().
            self.df = pd.DataFrame(
                predictions,
                index=self.data_df.index,
                columns=['pesq', 'sdr', 'stoi'],
            )
            self.util.write_store(self.df, storage, store_format)
            try:
                glob_conf.config['DATA']['needs_feature_extraction'] = 'false'
            except KeyError:
                pass
        else:
            self.util.debug('reusing predicted SQUIM values')
            self.df = self.util.get_store(storage, store_format)
            if self.df.isnull().values.any():
                # names of the columns that contain at least one NaN
                nan_columns = self.df.columns[self.df.isna().any()].tolist()
                print(nan_columns)
                self.util.error(f'got nan: {self.df.shape} {self.df.isnull().sum().sum()}')

    def get_embeddings(self, signal, sampling_rate):
        """Predict (pesq, sdr, stoi) for one signal.

        The signal is written to a fixed-name wav file in the working
        directory and re-loaded through torchaudio before inference.
        """
        tmp_audio_name = 'squim_audio_tmp.wav'
        audiofile.write(tmp_audio_name, signal, sampling_rate)
        # The sample rate reported by torchaudio is not used.
        # NOTE(review): no resampling happens here — confirm the model's
        # expected input rate matches the data.
        waveform, _ = torchaudio.load(tmp_audio_name)
        with torch.no_grad():
            stoi_hyp, pesq_hyp, si_sdr_hyp = self.objective_model(waveform)
        pesq = float(pesq_hyp[0].numpy())
        stoi = float(stoi_hyp[0].numpy())
        sdr = float(si_sdr_hyp[0].numpy())
        return pesq, sdr, stoi

    def extract_sample(self, signal, sr):
        """Predict the SQUIM values for a single signal."""
        # Load the model once instead of on every call.
        if not self.model_initialized:
            self.init_model()
        feats = self.get_embeddings(signal, sr)
        return feats
|
nkululeko/feats_trill.py
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
# feats_trill.py
|
2
|
-
from numpy.core.numeric import tensordot
|
3
|
-
from nkululeko.featureset import Featureset
|
4
|
-
import pandas as pd
|
5
|
-
from nkululeko.util import Util
|
6
|
-
import nkululeko.glob_conf as glob_conf
|
7
|
-
import audiofile as af
|
8
|
-
import os
|
9
|
-
import tensorflow as tf
|
10
|
-
# Import TF 2.X and make sure we're running eager.
|
11
|
-
assert tf.executing_eagerly()
|
12
|
-
import tensorflow_hub as hub
|
13
|
-
|
14
|
-
class TRILLset(Featureset):
    """A feature extractor for the Google TRILL embeddings

    https://ai.googleblog.com/2020/06/improving-speech-representations-and.html
    """

    # Initialization of the class
    def __init__(self, name, data_df):
        """
        Initialize the class with name, data and Util instance
        Also loads the model from hub

        :param name: Name of the class
        :type name: str
        :param data_df: Data of the class
        :type data_df: DataFrame
        :return: None
        """
        super().__init__(name, data_df)
        # Load the model from the configured path
        model_path = self.util.config_val(
            'FEATS', 'trill.model',
            'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/3')
        self.module = hub.load(model_path)

    def extract(self):
        """Compute one embedding per index entry, or load the cached pickle."""
        # Function-scope import: ast is not among the module's imports.
        import ast

        store = self.util.get_path('store')
        storage = f'{store}{self.name}.pkl'
        extract = self.util.config_val('FEATS', 'needs_feature_extraction', False)
        # literal_eval instead of eval(): parses 'True'/'False' identically
        # but cannot execute arbitrary code from the config file.
        no_reuse = ast.literal_eval(
            self.util.config_val('FEATS', 'no_reuse', 'False'))
        if extract or no_reuse or not os.path.isfile(storage):
            self.util.debug('extracting TRILL embeddings, this might take a while...')
            # Collect into a list; assigning series[int] on a non-integer
            # index relies on a deprecated positional fallback in pandas.
            embeddings = []
            length = len(self.data_df.index)
            # Only the first index level (the file) is used, so every entry
            # is embedded from its whole file.
            for idx, file in enumerate(self.data_df.index.get_level_values(0)):
                embeddings.append(self.getEmbeddings(file))
                if idx % 10 == 0:
                    self.util.debug(f'TRILL: {idx} of {length} done')
            self.df = pd.DataFrame(embeddings, index=self.data_df.index)
            self.df.to_pickle(storage)
            try:
                glob_conf.config['DATA']['needs_feature_extraction'] = 'false'
            except KeyError:
                pass
        else:
            self.util.debug('reusing extracted TRILL embeddings')
            self.df = pd.read_pickle(storage)

    def embed_wav(self, wav):
        """Run the TRILL module on a waveform tensor and return its 'embedding' output."""
        if len(wav.shape) > 1:
            # average over the first axis to get a 1-d signal
            wav = tf.reduce_mean(wav, axis=0)

        # The sample rate passed to the module is fixed at 16000; the rate of
        # the audio file is not checked.
        emb_dict = self.module(samples=wav, sample_rate=tf.constant(16000))
        return emb_dict['embedding']

    def getEmbeddings(self, file):
        """Read ``file`` and return the mean TRILL embedding over all frames."""
        wav = af.read(file)[0]
        wav = tf.convert_to_tensor(wav)
        emb_short = self.embed_wav(wav)
        # you get one embedding per frame, we use the mean for all the frames
        emb_short = emb_short.numpy().mean(axis=0)
        return emb_short
|
nkululeko/feats_wav2vec2.py
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
# feats_wav2vec2.py
|
2
|
-
|
3
|
-
from nkululeko.util import Util
|
4
|
-
from nkululeko.featureset import Featureset
|
5
|
-
import os
|
6
|
-
import pandas as pd
|
7
|
-
import os
|
8
|
-
import nkululeko.glob_conf as glob_conf
|
9
|
-
import transformers
|
10
|
-
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
|
11
|
-
import torch
|
12
|
-
|
13
|
-
import audiofile
|
14
|
-
|
15
|
-
class Wav2vec2(Featureset):
    """Class to extract wav2vec2 embeddings (https://huggingface.co/facebook/wav2vec2-large-robust-ft-swbd-300h)"""

    def __init__(self, name, data_df):
        """Constructor. The model itself is loaded lazily by init_model()."""
        super().__init__(name, data_df)
        self.device = self.util.config_val('MODEL', 'device', 'cpu')
        self.model_initialized = False

    def init_model(self):
        """Load processor and model from the configured path and switch to eval mode."""
        # load model
        self.util.debug('loading wav2vec model...')
        model_path = self.util.config_val('FEATS', 'wav2vec.model', 'wav2vec2-large-robust-ft-swbd-300h')
        self.processor = transformers.Wav2Vec2Processor.from_pretrained(model_path)
        self.model = Wav2Vec2Model.from_pretrained(model_path).to(self.device)
        print(f'intialized vec model on {self.device}')
        self.model.eval()
        self.model_initialized = True

    def extract(self):
        """Extract the features or load them from disk if present."""
        # Function-scope import: ast is not among the module's imports.
        import ast

        store = self.util.get_path('store')
        storage = f'{store}{self.name}.pkl'
        extract = self.util.config_val('FEATS', 'needs_feature_extraction', False)
        # literal_eval instead of eval(): parses 'True'/'False' identically
        # but cannot execute arbitrary code from the config file.
        no_reuse = ast.literal_eval(
            self.util.config_val('FEATS', 'no_reuse', 'False'))
        if extract or no_reuse or not os.path.isfile(storage):
            if not self.model_initialized:
                self.init_model()
            self.util.debug('extracting wav2vec2 embeddings, this might take a while...')
            # Collect into a list; assigning series[int] on a non-integer
            # index relies on a deprecated positional fallback in pandas.
            embeddings = []
            length = len(self.data_df.index)
            for idx, (file, start, end) in enumerate(self.data_df.index.to_list()):
                signal, sampling_rate = audiofile.read(
                    file,
                    offset=start.total_seconds(),
                    duration=(end - start).total_seconds(),
                    always_2d=True,
                )
                embeddings.append(self.get_embeddings(signal, sampling_rate))
                if idx % 10 == 0:
                    self.util.debug(f'Wav2vec2: {idx} of {length} done')
            self.df = pd.DataFrame(embeddings, index=self.data_df.index)
            self.df.to_pickle(storage)
            try:
                glob_conf.config['DATA']['needs_feature_extraction'] = 'false'
            except KeyError:
                pass
        else:
            self.util.debug('reusing extracted wav2vec2 embeddings')
            self.df = pd.read_pickle(storage)
            if self.df.isnull().values.any():
                # names of the columns that contain at least one NaN
                nan_columns = self.df.columns[self.df.isna().any()].tolist()
                print(nan_columns)
                self.util.error(f'got nan: {self.df.shape} {self.df.isnull().sum().sum()}')

    def get_embeddings(self, signal, sampling_rate):
        r"""Extract embeddings from raw audio signal."""
        with torch.no_grad():
            # run through processor to normalize signal
            # always returns a batch, so we just get the first entry
            # then we put it on the device
            y = self.processor(signal, sampling_rate=sampling_rate)
            y = y['input_values'][0]
            y = torch.from_numpy(y).to(self.device)

            # run through model
            # first entry contains hidden state
            y = self.model(y)[0]

            # pool result and convert to numpy
            y = torch.mean(y, dim=1)
            y = y.detach().cpu().numpy()

        return y.flatten()

    def extract_sample(self, signal, sr):
        """Extract the embedding for a single signal."""
        # Load the model once instead of on every call.
        if not self.model_initialized:
            self.init_model()
        feats = self.get_embeddings(signal, sr)
        return feats
|
nkululeko/featureset.py
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
# featureset.py
|
2
|
-
import pandas as pd
|
3
|
-
from nkululeko.util import Util
|
4
|
-
import nkululeko.glob_conf as glob_conf
|
5
|
-
import ast
|
6
|
-
|
7
|
-
class Featureset:
    """Base class for feature sets.

    Subclasses override extract() to fill ``self.df``; filter() then trims
    that dataframe to the rows of ``data_df`` and to configured columns.
    """

    name = ''  # designation
    df = None  # pandas dataframe to store the features (and indexed with the data from the sets)
    data_df = None  # dataframe to get audio paths

    def __init__(self, name, data_df):
        """Store the designation and the data dataframe, and create a Util helper."""
        self.name = name
        self.data_df = data_df
        self.util = Util('featureset')

    def extract(self):
        """Do nothing; concrete feature sets implement the extraction."""
        pass

    def filter(self):
        """Restrict ``self.df`` to the rows of ``data_df`` and, if configured, to selected columns.

        The column selection is read from ``FEATS`` / ``features``; when that
        key is absent only the row filter is applied. Requested names that
        are not columns of ``self.df`` are skipped.
        """
        # use only the features that are indexed in the target dataframes
        self.df = self.df[self.df.index.isin(self.data_df.index)]
        try:
            # Only the config lookup is guarded, so a KeyError raised by
            # anything else is no longer silently discarded.
            raw_selection = glob_conf.config['FEATS']['features']
        except KeyError:
            return
        # use only some features
        selected_features = ast.literal_eval(raw_selection)
        self.util.debug(f'selecting features: {selected_features}')
        sel_feats_df = pd.DataFrame()
        for feat in selected_features:
            try:
                sel_feats_df[feat] = self.df[feat]
            except KeyError:
                # unknown feature name: ignore it
                continue
        # Keep the unfiltered frame when none of the requested names matched.
        if len(sel_feats_df.columns) > 0:
            self.df = sel_feats_df
            self.util.debug(f'new feats shape after selecting features: {self.df.shape}')
|
41
|
-
|