nkululeko 0.84.1__py3-none-any.whl → 0.85.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/constants.py +1 -1
- nkululeko/experiment.py +6 -1
- nkululeko/feat_extract/feats_whisper.py +3 -6
- nkululeko/modelrunner.py +56 -33
- nkululeko/models/finetune_model.py +9 -0
- nkululeko/models/model.py +1 -1
- nkululeko/models/model_tuned.py +479 -0
- nkululeko/test_pretrain.py +16 -4
- {nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/METADATA +9 -1
- {nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/RECORD +13 -12
- {nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/LICENSE +0 -0
- {nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/WHEEL +0 -0
- {nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/top_level.txt +0 -0
nkululeko/constants.py
CHANGED
@@ -1,2 +1,2 @@
-VERSION="0.84.1"
+VERSION="0.85.1"
 SAMPLING_RATE = 16000
nkululeko/experiment.py
CHANGED
@@ -340,7 +340,12 @@ class Experiment:
|
|
340
340
|
df_train, df_test = self.df_train, self.df_test
|
341
341
|
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
|
342
342
|
self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
|
343
|
-
feats_types = self.util.config_val_list("FEATS", "type", [
|
343
|
+
feats_types = self.util.config_val_list("FEATS", "type", [])
|
344
|
+
# for some models no features are needed
|
345
|
+
if len(feats_types) == 0:
|
346
|
+
self.util.debug("no feature extractor specified.")
|
347
|
+
self.feats_train, self.feats_test = pd.DataFrame(), pd.DataFrame()
|
348
|
+
return
|
344
349
|
self.feature_extractor = FeatureExtractor(
|
345
350
|
df_train, feats_types, feats_name, "train"
|
346
351
|
)
|
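With this change a configuration may omit `[FEATS] type` entirely, which matters for the new finetuning model that consumes raw audio instead of precomputed features. The following is a minimal sketch of the fallback behaviour; the `config_val_list` helper shown here is a hypothetical stand-in for the one in `nkululeko.utils.util`:

import ast
import configparser

config = configparser.ConfigParser()
config.read_string("[EXP]\nname = finetune_demo\n[FEATS]\n")  # no "type" key

def config_val_list(section, key, default):
    # hypothetical stand-in for Util.config_val_list
    try:
        return ast.literal_eval(config[section][key])
    except KeyError:
        return default

feats_types = config_val_list("FEATS", "type", [])
if len(feats_types) == 0:
    print("no feature extractor specified.")  # mirrors the new debug message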
nkululeko/feat_extract/feats_whisper.py
CHANGED
@@ -32,22 +32,19 @@ class Whisper(Featureset):
         model_name = f"openai/{self.feat_type}"
         self.model = WhisperModel.from_pretrained(model_name).to(self.device)
         print(f"intialized Whisper model on {self.device}")
-        self.feature_extractor = AutoFeatureExtractor.from_pretrained(
-            model_name)
+        self.feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
         self.model_initialized = True

     def extract(self):
         """Extract the features or load them from disk if present."""
         store = self.util.get_path("store")
         storage = f"{store}{self.name}.pkl"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False)
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             if not self.model_initialized:
                 self.init_model()
-            self.util.debug(
-                "extracting whisper embeddings, this might take a while...")
+            self.util.debug("extracting whisper embeddings, this might take a while...")
             emb_series = []
             for (file, start, end), _ in audeer.progress_bar(
                 self.data_df.iterrows(),
nkululeko/modelrunner.py
CHANGED
@@ -47,16 +47,12 @@ class Modelrunner:
             highest = 0
         else:
             highest = 100000
-
-
-
-            self.model.load(self.run, epoch)
-            self.util.debug(f"reusing model: {self.model.store_path}")
-            self.model.reset_test(self.df_test, self.feats_test)
-        else:
-            self.model.set_id(self.run, epoch)
-            self.model.train()
+        if self.model.model_type == "finetuned":
+            # epochs are handled by Huggingface API
+            self.model.train()
             report = self.model.predict()
+            # todo: findout the best epoch
+            epoch = epoch_num
             report.set_id(self.run, epoch)
             plot_name = self.util.get_plot_name() + f"_{self.run}_{epoch:03d}_cnf"
             reports.append(report)
@@ -67,32 +63,53 @@ class Modelrunner:
             if plot_epochs:
                 self.util.debug(f"plotting conf matrix to {plot_name}")
                 report.plot_confmatrix(plot_name, epoch)
-
-
-
-
-
-
-
-            patience = int(patience)
-            result = report.result.get_result()
-            if self.util.high_is_good():
-                if result > highest:
-                    highest = result
-                    patience_counter = 0
-                else:
-                    patience_counter += 1
+        else:
+            # for all epochs
+            for epoch in range(epoch_num):
+                if only_test:
+                    self.model.load(self.run, epoch)
+                    self.util.debug(f"reusing model: {self.model.store_path}")
+                    self.model.reset_test(self.df_test, self.feats_test)
                 else:
-
-
-
+                    self.model.set_id(self.run, epoch)
+                    self.model.train()
+                report = self.model.predict()
+                report.set_id(self.run, epoch)
+                plot_name = self.util.get_plot_name() + f"_{self.run}_{epoch:03d}_cnf"
+                reports.append(report)
+                self.util.debug(
+                    f"run: {self.run} epoch: {epoch}: result: "
+                    f"{reports[-1].get_result().get_test_result()}"
+                )
+                if plot_epochs:
+                    self.util.debug(f"plotting conf matrix to {plot_name}")
+                    report.plot_confmatrix(plot_name, epoch)
+                store_models = self.util.config_val("EXP", "save", False)
+                plot_best_model = self.util.config_val("PLOT", "best_model", False)
+                if (store_models or plot_best_model) and (
+                    not only_test
+                ):  # in any case the model needs to be stored to disk.
+                    self.model.store()
+                if patience:
+                    patience = int(patience)
+                    result = report.result.get_result()
+                    if self.util.high_is_good():
+                        if result > highest:
+                            highest = result
+                            patience_counter = 0
+                        else:
+                            patience_counter += 1
                     else:
-
-
-
-
-
+                        if result < highest:
+                            highest = result
+                            patience_counter = 0
+                        else:
+                            patience_counter += 1
+                    if patience_counter >= patience:
+                        self.util.debug(
+                            f"reached patience ({str(patience)}): early stopping"
+                        )
+                        break

         if not plot_epochs:
             # Do at least one confusion matrix plot
@@ -133,6 +150,12 @@ class Modelrunner:
             self.model = Bayes_model(
                 self.df_train, self.df_test, self.feats_train, self.feats_test
             )
+        elif model_type == "finetune":
+            from nkululeko.models.model_tuned import TunedModel
+
+            self.model = TunedModel(
+                self.df_train, self.df_test, self.feats_train, self.feats_test
+            )
         elif model_type == "gmm":
             from nkululeko.models.model_gmm import GMM_model

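The restored epoch loop for non-finetuned models ends with a patience check. Reduced to a self-contained sketch (the per-epoch scores are made up), the early-stopping rule works like this:

# stop once the result has not improved for `patience` consecutive epochs
results = [0.50, 0.55, 0.54, 0.53, 0.52]  # hypothetical per-epoch scores
patience, patience_counter, highest = 3, 0, 0
for epoch, result in enumerate(results):
    if result > highest:  # assumes higher is better
        highest = result
        patience_counter = 0
    else:
        patience_counter += 1
    if patience_counter >= patience:
        print(f"reached patience ({patience}): early stopping at epoch {epoch}")
        break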
nkululeko/models/finetune_model.py
CHANGED
@@ -1,3 +1,7 @@
+"""
+Code based on @jwagner
+"""
+
 import dataclasses
 import typing

@@ -148,6 +152,11 @@ class Model(Wav2Vec2PreTrainedModel):
             logits_cat=logits_cat,
         )

+    def predict(self, signal):
+        result = self(torch.from_numpy(signal))
+        result = result[0].detach().numpy()[0]
+        return result
+

 class ModelWithPreProcessing(Model):

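The new `predict()` helper wraps a forward pass over a raw numpy signal. A usage sketch, mirroring the updated test_pretrain.py further below; the checkpoint directory and wav file are placeholders:

import audiofile
import transformers

import nkululeko.models.finetune_model as fm

# assumptions: "model/torch" holds a checkpoint written by trainer.save_model(),
# and "./test.wav" is a 16 kHz mono recording
config = transformers.AutoConfig.from_pretrained("model/torch")
model = fm.Model.from_pretrained("model/torch", config=config)
model.eval()  # in eval mode predict() returns softmax scores

signal, _ = audiofile.read("./test.wav", always_2d=True)
print(model.predict(signal))  # one score per class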
nkululeko/models/model_tuned.py
ADDED
@@ -0,0 +1,479 @@
+"""
+Code based on @jwagner.
+"""
+
+import dataclasses
+import json
+import os
+import pickle
+import typing
+
+import datasets
+import numpy as np
+import pandas as pd
+import torch
+import transformers
+from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
+from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
+
+import audeer
+import audiofile
+import audmetric
+
+import nkululeko.glob_conf as glob_conf
+from nkululeko.models.model import Model as BaseModel
+from nkululeko.reporting.reporter import Reporter
+
+
+class TunedModel(BaseModel):
+
+    is_classifier = True
+
+    def __init__(self, df_train, df_test, feats_train, feats_test):
+        """Constructor taking the configuration and all dataframes."""
+        super().__init__(df_train, df_test, feats_train, feats_test)
+        super().set_model_type("finetuned")
+        self.name = "finetuned_wav2vec2"
+        self.target = glob_conf.config["DATA"]["target"]
+        labels = glob_conf.labels
+        self.class_num = len(labels)
+        device = self.util.config_val("MODEL", "device", "cpu")
+        self.batch_size = int(self.util.config_val("MODEL", "batch_size", "8"))
+        if device != "cpu":
+            self.util.debug(f"running on device {device}")
+            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+            os.environ["CUDA_VISIBLE_DEVICES"] = device
+        self.df_train, self.df_test = df_train, df_test
+        self.epoch_num = int(self.util.config_val("EXP", "epochs", 1))
+
+        self._init_model()
+
+    def _init_model(self):
+        model_path = "facebook/wav2vec2-large-robust-ft-swbd-300h"
+        self.num_layers = None
+        self.sampling_rate = 16000
+        self.max_duration_sec = 8.0
+        self.accumulation_steps = 4
+        # create dataset
+
+        dataset = {}
+        target_name = glob_conf.target
+        data_sources = {
+            "train": pd.DataFrame(self.df_train[target_name]),
+            "dev": pd.DataFrame(self.df_test[target_name]),
+        }
+
+        for split in ["train", "dev"]:
+            df = data_sources[split]
+            y = df[target_name].astype("float")
+            y.name = "targets"
+            df = y.reset_index()
+            df.start = df.start.dt.total_seconds()
+            df.end = df.end.dt.total_seconds()
+            ds = datasets.Dataset.from_pandas(df)
+            dataset[split] = ds
+
+        self.dataset = datasets.DatasetDict(dataset)
+
+        # load pre-trained model
+        le = glob_conf.label_encoder
+        mapping = dict(zip(le.classes_, range(len(le.classes_))))
+        target_mapping = {k: int(v) for k, v in mapping.items()}
+        target_mapping_reverse = {value: key for key, value in target_mapping.items()}
+
+        self.config = transformers.AutoConfig.from_pretrained(
+            model_path,
+            num_labels=len(target_mapping),
+            label2id=target_mapping,
+            id2label=target_mapping_reverse,
+            finetuning_task=target_name,
+        )
+        if self.num_layers is not None:
+            self.config.num_hidden_layers = self.num_layers
+        setattr(self.config, "sampling_rate", self.sampling_rate)
+        setattr(self.config, "data", self.util.get_data_name())
+
+        vocab_dict = {}
+        with open("vocab.json", "w") as vocab_file:
+            json.dump(vocab_dict, vocab_file)
+        tokenizer = transformers.Wav2Vec2CTCTokenizer("./vocab.json")
+        tokenizer.save_pretrained(".")
+
+        feature_extractor = transformers.Wav2Vec2FeatureExtractor(
+            feature_size=1,
+            sampling_rate=16000,
+            padding_value=0.0,
+            do_normalize=True,
+            return_attention_mask=True,
+        )
+        self.processor = transformers.Wav2Vec2Processor(
+            feature_extractor=feature_extractor,
+            tokenizer=tokenizer,
+        )
+        assert self.processor.feature_extractor.sampling_rate == self.sampling_rate
+
+        self.model = Model.from_pretrained(
+            model_path,
+            config=self.config,
+        )
+        self.model.freeze_feature_extractor()
+        self.model.train()
+        self.model_initialized = True
+
+    def set_model_type(self, type):
+        self.model_type = type
+
+    def set_testdata(self, data_df, feats_df):
+        self.df_test, self.feats_test = data_df, feats_df
+
+    def reset_test(self, df_test, feats_test):
+        self.df_test, self.feats_test = df_test, feats_test
+
+    def set_id(self, run, epoch):
+        self.run = run
+        self.epoch = epoch
+        dir = self.util.get_path("model_dir")
+        name = f"{self.util.get_exp_name(only_train=True)}_{self.run}_{self.epoch:03d}.model"
+        self.store_path = dir + name
+
+    def data_collator(self, data):
+        files = [d["file"] for d in data]
+        starts = [d["start"] for d in data]
+        ends = [d["end"] for d in data]
+        targets = [d["targets"] for d in data]
+
+        signals = []
+        for file, start, end in zip(
+            files,
+            starts,
+            ends,
+        ):
+            offset = start
+            duration = end - offset
+            if self.max_duration_sec is not None:
+                duration = min(duration, self.max_duration_sec)
+            signal, _ = audiofile.read(
+                file,
+                offset=offset,
+                duration=duration,
+            )
+            signals.append(signal.squeeze())
+
+        input_values = self.processor(
+            signals,
+            sampling_rate=self.sampling_rate,
+            padding=True,
+        )
+        batch = self.processor.pad(
+            input_values,
+            padding=True,
+            return_tensors="pt",
+        )
+
+        batch["labels"] = torch.tensor(targets)
+
+        return batch
+
+    def compute_metrics(self, p: transformers.EvalPrediction):
+
+        metrics = {
+            "UAR": audmetric.unweighted_average_recall,
+            "ACC": audmetric.accuracy,
+        }
+
+        # truth = p.label_ids[:, 0].astype(int)
+        truth = p.label_ids
+        preds = p.predictions
+        preds = np.argmax(preds, axis=1)
+        scores = {}
+        for name, metric in metrics.items():
+            scores[f"{name}"] = metric(truth, preds)
+        return scores
+
+    def train(self):
+        """Train the model."""
+        model_root = self.util.get_path("model_dir")
+        log_root = os.path.join(self.util.get_exp_dir(), "log")
+        audeer.mkdir(log_root)
+        self.torch_root = audeer.path(model_root, "torch")
+        conf_file = os.path.join(self.torch_root, "config.json")
+        if os.path.isfile(conf_file):
+            self.util.debug(f"reusing finetuned model: {conf_file}")
+            self.load(self.run, self.epoch_num)
+            return
+        targets = pd.DataFrame(self.dataset["train"]["targets"])
+        counts = targets[0].value_counts().sort_index()
+        train_weights = 1 / counts
+        train_weights /= train_weights.sum()
+        self.util.debug("train weights: {train_weights}")
+        criterion = torch.nn.CrossEntropyLoss(
+            weight=torch.Tensor(train_weights).to("cuda"),
+        )
+        # criterion = torch.nn.CrossEntropyLoss()
+
+        class Trainer(transformers.Trainer):
+
+            def compute_loss(
+                self,
+                model,
+                inputs,
+                return_outputs=False,
+            ):
+
+                targets = inputs.pop("labels").squeeze()
+                targets = targets.type(torch.long)
+
+                outputs = model(**inputs)
+                logits = outputs[0].squeeze()
+
+                loss = criterion(logits, targets)
+
+                return (loss, outputs) if return_outputs else loss
+
+        num_steps = (
+            len(self.dataset["train"])
+            // (self.batch_size * self.accumulation_steps)
+            // 5
+        )
+        num_steps = max(1, num_steps)
+        # print(num_steps)
+
+        training_args = transformers.TrainingArguments(
+            output_dir=model_root,
+            logging_dir=log_root,
+            per_device_train_batch_size=self.batch_size,
+            per_device_eval_batch_size=self.batch_size,
+            gradient_accumulation_steps=self.accumulation_steps,
+            evaluation_strategy="steps",
+            num_train_epochs=self.epoch_num,
+            fp16=True,
+            save_steps=num_steps,
+            eval_steps=num_steps,
+            logging_steps=num_steps,
+            learning_rate=1e-4,
+            save_total_limit=2,
+            metric_for_best_model="UAR",
+            greater_is_better=True,
+            load_best_model_at_end=True,
+            remove_unused_columns=False,
+            report_to="none",
+        )
+
+        trainer = Trainer(
+            model=self.model,
+            data_collator=self.data_collator,
+            args=training_args,
+            compute_metrics=self.compute_metrics,
+            train_dataset=self.dataset["train"],
+            eval_dataset=self.dataset["dev"],
+            tokenizer=self.processor.feature_extractor,
+            callbacks=[transformers.integrations.TensorBoardCallback()],
+        )
+        trainer.train()
+        trainer.save_model(self.torch_root)
+        self.load(self.run, self.epoch)
+
+    def get_predictions(self):
+        results = []
+        for (file, start, end), _ in audeer.progress_bar(
+            self.df_test.iterrows(),
+            total=len(self.df_test),
+            desc=f"Predicting {len(self.df_test)} audiofiles",
+        ):
+            if end == pd.NaT:
+                signal, sr = audiofile.read(file, offset=start)
+            else:
+                signal, sr = audiofile.read(
+                    file, duration=end - start, offset=start, always_2d=True
+                )
+            assert sr == self.sampling_rate
+            predictions = self.model.predict(signal)
+            results.append(predictions.argmax())
+        return results
+
+    def predict(self):
+        """Predict the whole eval feature set"""
+        predictions = self.get_predictions()
+        report = Reporter(
+            self.df_test[self.target].to_numpy().astype(float),
+            predictions,
+            self.run,
+            self.epoch_num,
+        )
+        return report
+
+    def predict_sample(self, signal):
+        """Predict one sample"""
+        prediction = {}
+        if self.util.exp_is_classification():
+            # get the class probabilities
+            predictions = self.model.predict(signal)
+            # pred = self.clf.predict(features)
+            for i in range(len(self.labels)):
+                cat = self.labels[i]
+                prediction[cat] = predictions[i]
+        else:
+            predictions = self.model.predict(signal)
+            prediction = predictions
+        return prediction
+
+    def store(self):
+        self.util.debug("stored: ")
+
+    def load(self, run, epoch):
+        self.set_id(run, epoch)
+        self.model = Model.from_pretrained(
+            self.torch_root,
+            config=self.config,
+        )
+        # print(f"loaded model type {type(self.model)}")
+
+    def load_path(self, path, run, epoch):
+        self.set_id(run, epoch)
+        with open(path, "rb") as handle:
+            self.clf = pickle.load(handle)
+
+
+@dataclasses.dataclass
+class ModelOutput(transformers.file_utils.ModelOutput):
+
+    logits_cat: torch.FloatTensor = None
+    hidden_states: typing.Tuple[torch.FloatTensor] = None
+    cnn_features: torch.FloatTensor = None
+
+
+class ModelHead(torch.nn.Module):
+
+    def __init__(self, config):
+
+        super().__init__()
+
+        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = torch.nn.Dropout(config.final_dropout)
+        self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+
+        x = features
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+
+        return x
+
+
+class Model(Wav2Vec2PreTrainedModel):
+
+    def __init__(self, config):
+
+        super().__init__(config)
+
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.cat = ModelHead(config)
+        self.init_weights()
+
+    def freeze_feature_extractor(self):
+        self.wav2vec2.feature_extractor._freeze_parameters()
+
+    def pooling(
+        self,
+        hidden_states,
+        attention_mask,
+    ):
+
+        if attention_mask is None:  # For evaluation with batch_size==1
+            outputs = torch.mean(hidden_states, dim=1)
+        else:
+            attention_mask = self._get_feature_vector_attention_mask(
+                hidden_states.shape[1],
+                attention_mask,
+            )
+            hidden_states = hidden_states * torch.reshape(
+                attention_mask,
+                (-1, attention_mask.shape[-1], 1),
+            )
+            outputs = torch.sum(hidden_states, dim=1)
+            attention_sum = torch.sum(attention_mask, dim=1)
+            outputs = outputs / torch.reshape(attention_sum, (-1, 1))
+
+        return outputs
+
+    def forward(
+        self,
+        input_values,
+        attention_mask=None,
+        labels=None,
+        return_hidden=False,
+    ):
+
+        outputs = self.wav2vec2(
+            input_values,
+            attention_mask=attention_mask,
+        )
+
+        cnn_features = outputs.extract_features
+        hidden_states_framewise = outputs.last_hidden_state
+        hidden_states = self.pooling(
+            hidden_states_framewise,
+            attention_mask,
+        )
+        logits_cat = self.cat(hidden_states)
+
+        if not self.training:
+            logits_cat = torch.softmax(logits_cat, dim=1)
+
+        if return_hidden:
+
+            # make time last axis
+            cnn_features = torch.transpose(cnn_features, 1, 2)
+
+            return ModelOutput(
+                logits_cat=logits_cat,
+                hidden_states=hidden_states,
+                cnn_features=cnn_features,
+            )
+
+        else:
+
+            return ModelOutput(
+                logits_cat=logits_cat,
+            )
+
+    def predict(self, signal):
+        result = self(torch.from_numpy(signal))
+        result = result[0].detach().numpy()[0]
+        return result
+
+
+class ModelWithPreProcessing(Model):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+    def forward(
+        self,
+        input_values,
+    ):
+        # Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm():
+        # normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
+
+        mean = input_values.mean()
+
+        # var = input_values.var()
+        # raises: onnxruntime.capi.onnxruntime_pybind11_state.NotImplemented: [ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for the node ReduceProd_3:ReduceProd(11)
+
+        var = torch.square(input_values - mean).mean()
+        input_values = (input_values - mean) / torch.sqrt(var + 1e-7)
+
+        output = super().forward(
+            input_values,
+            return_hidden=True,
+        )
+
+        return (
+            output.hidden_states,
+            output.logits_cat,
+            output.cnn_features,
+        )
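One step in `train()` worth unpacking is the inverse-frequency class weighting passed to the cross-entropy loss: rarer classes get proportionally larger weights. A worked example with made-up labels:

import pandas as pd

targets = pd.DataFrame([0, 0, 0, 1, 2, 2])  # hypothetical label column
counts = targets[0].value_counts().sort_index()  # class 0: 3, class 1: 1, class 2: 2
train_weights = 1 / counts
train_weights /= train_weights.sum()
print(train_weights.tolist())  # [0.1818..., 0.5454..., 0.2727...]; rare classes weigh more

Note that the debug call in `train()`, `self.util.debug("train weights: {train_weights}")`, lacks the `f` prefix, so it logs the literal placeholder rather than the computed weights.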
nkululeko/test_pretrain.py
CHANGED
@@ -53,8 +53,8 @@ def doit(config_file):
     expr.fill_train_and_tests()
     util.debug(f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")

+    model_root = util.get_path("model_dir")
     log_root = audeer.mkdir("log")
-    model_root = audeer.mkdir("model")
     torch_root = audeer.path(model_root, "torch")

     metrics_gender = {
@@ -69,7 +69,7 @@ def doit(config_file):
     num_layers = None

     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-    os.environ["CUDA_VISIBLE_DEVICES"] = "
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

     batch_size = 16
     accumulation_steps = 4
@@ -259,6 +259,7 @@ def doit(config_file):
         greater_is_better=True,
         load_best_model_at_end=True,
         remove_unused_columns=False,
+        report_to="none",
     )

     trainer = Trainer(
@@ -271,9 +272,20 @@ def doit(config_file):
         tokenizer=processor.feature_extractor,
         callbacks=[transformers.integrations.TensorBoardCallback()],
     )
+    if False:
+        trainer.train()
+        trainer.save_model(torch_root)

-
-
+    modelnew = fm.Model.from_pretrained(
+        torch_root,
+        config=config,
+    )
+    print(f"loaded new model type{type(modelnew)}")
+    import audiofile
+
+    signal, _ = audiofile.read("./test.wav", always_2d=True)
+    result = modelnew.predict(signal)
+    print(result)

     print("DONE")

{nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nkululeko
-Version: 0.84.1
+Version: 0.85.1
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt
@@ -333,6 +333,14 @@ F. Burkhardt, Johannes Wagner, Hagen Wierstorf, Florian Eyben and Björn Schulle
 Changelog
 =========

+Version 0.85.1
+--------------
+* fixed bug in model_finetuned that label_num was constant 2
+
+Version 0.85.0
+--------------
+* first version with finetuning wav2vec2 layers
+
 Version 0.84.1
 --------------
 * made resample independent of config file
{nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/RECORD
CHANGED
@@ -2,18 +2,18 @@ nkululeko/__init__.py,sha256=62f8HiEzJ8rG2QlTFJXUCMpvuH3fKI33DoJSj33mscc,63
 nkululeko/aug_train.py,sha256=YhuZnS_WVWnun9G-M6g5n6rbRxoVREz6Zh7k6qprFNQ,3194
 nkululeko/augment.py,sha256=4MG0apTAG5RgkuJrYEjGgDdbodZWi_HweSPNI1JJ5QA,3051
 nkululeko/cacheddataset.py,sha256=lIJ6hUo5LoxSrzXtWV8mzwO7wRtUETWnOQ4ws2XfL1E,969
-nkululeko/constants.py,sha256=
+nkululeko/constants.py,sha256=WnTSXQjJmWE-IrXcNSEa5FFV_83-z0EOGXa9trq00uE,39
 nkululeko/demo.py,sha256=8bl15Kitoesnz8oa8yrs52T6YCSOhWbbq9PnZ8Hj6D0,3232
 nkululeko/demo_feats.py,sha256=sAeGFojhEj9WEDFtG3SzPBmyYJWLF2rkbpp65m8Ujo4,2025
 nkululeko/demo_predictor.py,sha256=es56xbT8ifkS_vnrlb5NTZT54gNmeUtNlA4zVA_gnN8,4757
-nkululeko/experiment.py,sha256=
+nkululeko/experiment.py,sha256=9Nw23b7sVOciH8IaOuAAKbY7otXYSsPrj_rQCA_U9cc,30465
 nkululeko/explore.py,sha256=lDzRoW_Taa5u4BBABZLD89BcQWnYlrftJR4jgt1yyj0,2609
 nkululeko/export.py,sha256=mHeEAAmtZuxdyebLlbSzPrHSi9OMgJHbk35d3DTxRBc,4632
 nkululeko/feature_extractor.py,sha256=8mssYKmo4LclVI-hiLmJEDZ0ZPyDavFG2YwtXcrGzwM,3976
 nkululeko/file_checker.py,sha256=LoLnL8aHpW-axMQ46qbqrManTs5otG9ShpEZuz9iRSk,3474
 nkululeko/filter_data.py,sha256=w-X2mhKdYr5DxDIz50E5yzO6Jmzk4jjDBoXsgOOVtcA,7222
 nkululeko/glob_conf.py,sha256=KL9YJQTHvTztxo1vr25qRRgaPnx4NTg0XrdbovKGMmw,525
-nkululeko/modelrunner.py,sha256=
+nkululeko/modelrunner.py,sha256=iCmfJxsS2UafcikjRdUqPQuqQMOYA-Ctr3et3HeNR3c,10452
 nkululeko/multidb.py,sha256=fG3VukEWP1vreVN4gB1IRXxwwg4jLftsSEYtu0o1f78,5634
 nkululeko/nkuluflag.py,sha256=PGWSmZz-PiiHLgcZJAoGOI_Y-sZDVI1ksB8p5r7riWM,3725
 nkululeko/nkululeko.py,sha256=Kn3s2E3yyH8cJ7z6lkMxrnqtCxTu7-qfe9Zr_ONTD5g,1968
@@ -26,7 +26,7 @@ nkululeko/segment.py,sha256=YLKckX44tbvTb3LrdgYw9X4guzuF27sutl92z9DkpZU,4835
 nkululeko/syllable_nuclei.py,sha256=Sky-C__MeUDaxqHnDl2TGLLYOYvsahD35TUjWGeG31k,10047
 nkululeko/test.py,sha256=1w624vo5KTzmFC8BUStGlLDmIEAFuJUz7J0W-gp7AxI,1677
 nkululeko/test_predictor.py,sha256=_w5J8CxH6hmW3mLTKbdfmywl5QpdNAnW1Y8TE5GtlfE,3237
-nkululeko/test_pretrain.py,sha256=
+nkululeko/test_pretrain.py,sha256=ZWl-bR6nmeSmXkGAIE6zyfQEjN8Zg0rIxfaS-O6Zbas,8465
 nkululeko/augmenting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/augmenting/augmenter.py,sha256=XAt0dpmlnKxqyysqCgV3rcz-pRIvOz7rU7dmGDCVAzs,2905
 nkululeko/augmenting/randomsplicer.py,sha256=Z5rxdKKUpuncLWuTS6xVfVKUeVbeiYU_dLRHQ5fcg4Y,2669
@@ -68,15 +68,15 @@ nkululeko/feat_extract/feats_squim.py,sha256=Y31YmDmscuG0YozvxyBZIutO3id8t7IZJWC
 nkululeko/feat_extract/feats_trill.py,sha256=K2ahhdpwpjgg3WZS1POg3UMP2U44i8cLZZvn5Rq7fUI,3228
 nkululeko/feat_extract/feats_wav2vec2.py,sha256=9WUMfyddB_3nx79g7mZoQrRynhM1uEBWuOotRq8bxoU,5268
 nkululeko/feat_extract/feats_wavlm.py,sha256=ulxpGjifUFx2ZgGmY32SmBJGIuvkYHoLb2n1LZ8KMwA,4703
-nkululeko/feat_extract/feats_whisper.py,sha256=
+nkululeko/feat_extract/feats_whisper.py,sha256=0N7Vj65OVi2PNoB_NrDjWT5lP6xZNKxFOZZIoxkJvcA,4533
 nkululeko/feat_extract/featureset.py,sha256=HtgW2389rmlRAgFP3F1sSFzq2_iUVr2NhOfIXG9omt0,1448
 nkululeko/feat_extract/feinberg_praat.py,sha256=EP9pMALjlKdiYInLQdrZ7MmE499Mq-ISRCgqbqL3Rxc,21304
 nkululeko/losses/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/losses/loss_ccc.py,sha256=NOK0y0fxKUnU161B5geap6Fmn8QzoPl2MqtPiV8IuJE,976
 nkululeko/losses/loss_softf1loss.py,sha256=5gW-PuiqeAZcRgfwjueIOQtMokOjZWgQnVIv59HKTCo,1309
 nkululeko/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nkululeko/models/finetune_model.py,sha256=
-nkululeko/models/model.py,sha256=
+nkululeko/models/finetune_model.py,sha256=OMlzDyUFNXZ2xSiqqH8tbzey_KzPJ4jsoYT-4KrWFKM,5091
+nkululeko/models/model.py,sha256=PUCqF2r_dEfmFsZn6Cgr1UIzYvxziLH6nSqZ5-vuN1o,11639
 nkululeko/models/model_bayes.py,sha256=WJFZ8wFKwWATz6MhmjeZIi1Pal1viU549WL_PjXDSy8,406
 nkululeko/models/model_cnn.py,sha256=bJxqwe6FnVR2hFeqN6EXexYGgvKYFED1VOhBXVlLWaE,9954
 nkululeko/models/model_gmm.py,sha256=hZ9UO36KNf48qa3J-xkWIicIj9-TApmt21zNES2vEOs,649
@@ -89,6 +89,7 @@ nkululeko/models/model_svm.py,sha256=rsME3KvKvNG7bdE5lbvYUu85WZhaASZxxmdNDIVJRZ4
 nkululeko/models/model_svr.py,sha256=_YZeksqB3eBENGlg3g9RwYFlk9rQQ-XCeNBKLlGGVoE,725
 nkululeko/models/model_tree.py,sha256=rf16faUm4o2LJgkoYpeY998b8DQIvXZ73_m1IS3TnnE,417
 nkululeko/models/model_tree_reg.py,sha256=IgQcPTE-304HQLYSKPF8Z4ot_Ur9dH01fZjS0nXke_M,428
+nkululeko/models/model_tuned.py,sha256=WJplfUK3CGLSd2mahUrPSjMvqjPfxLp99KFeZaz2AbU,15098
 nkululeko/models/model_xgb.py,sha256=Thgx5ESdIok4v72mKh4plxpo4smGcKALWNCJTDScY0M,447
 nkululeko/models/model_xgr.py,sha256=aGBtNGLWjOE_2rICGYGFxmT8DtnHYsIl1lIpMtghHsY,418
 nkululeko/reporting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -105,8 +106,8 @@ nkululeko/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/utils/files.py,sha256=UiGAtZRWYjHSvlmPaTMtzyNNGE6qaLaxQkybctS7iRM,4021
 nkululeko/utils/stats.py,sha256=1yUq0FTOyqkU8TwUocJRYdJaqMU5SlOBBRUun9STo2M,2829
 nkululeko/utils/util.py,sha256=b1IHFucRNuF9Iyv5IJeK4AEg0Rga0xKG80UM5GWWdHA,13816
-nkululeko-0.
-nkululeko-0.
-nkululeko-0.
-nkululeko-0.
-nkululeko-0.
+nkululeko-0.85.1.dist-info/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
+nkululeko-0.85.1.dist-info/METADATA,sha256=RonY9PdKyHjwYsZ3T9TgEs1JNnY1qbMdDr-Sp6kcCW8,36591
+nkululeko-0.85.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+nkululeko-0.85.1.dist-info/top_level.txt,sha256=DPFNNSHPjUeVKj44dVANAjuVGRCC3MusJ08lc2a8xFA,10
+nkululeko-0.85.1.dist-info/RECORD,,
{nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/LICENSE
File without changes

{nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/WHEEL
File without changes

{nkululeko-0.84.1.dist-info → nkululeko-0.85.1.dist-info}/top_level.txt
File without changes