nkululeko 0.94.3__py3-none-any.whl → 0.95.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/augmenting/resampler.py +5 -2
- nkululeko/autopredict/ap_emotion.py +36 -0
- nkululeko/autopredict/ap_text.py +45 -0
- nkululeko/autopredict/tests/__init__.py +0 -0
- nkululeko/autopredict/tests/test_whisper_transcriber.py +122 -0
- nkululeko/autopredict/whisper_transcriber.py +81 -0
- nkululeko/balance.py +222 -0
- nkululeko/constants.py +1 -1
- nkululeko/experiment.py +53 -3
- nkululeko/explore.py +32 -13
- nkululeko/feat_extract/feats_analyser.py +45 -17
- nkululeko/feat_extract/feats_emotion2vec.py +51 -26
- nkululeko/feat_extract/feats_praat.py +3 -3
- nkululeko/feat_extract/feats_praat_core.py +769 -0
- nkululeko/feat_extract/tests/__init__.py +1 -0
- nkululeko/feat_extract/tests/test_feats_opensmile.py +162 -0
- nkululeko/feat_extract/tests/test_feats_praat_core.py +507 -0
- nkululeko/glob_conf.py +9 -0
- nkululeko/modelrunner.py +15 -39
- nkululeko/models/model.py +4 -42
- nkululeko/models/model_tuned.py +416 -84
- nkululeko/models/model_xgb.py +148 -2
- nkululeko/models/tests/test_model_knn.py +49 -0
- nkululeko/models/tests/test_model_mlp.py +153 -0
- nkululeko/models/tests/test_model_xgb.py +33 -0
- nkululeko/nkululeko.py +0 -9
- nkululeko/plots.py +25 -19
- nkululeko/predict.py +8 -6
- nkululeko/reporting/report.py +7 -5
- nkululeko/reporting/reporter.py +20 -5
- nkululeko/test_predictor.py +7 -1
- nkululeko/tests/__init__.py +1 -0
- nkululeko/tests/test_balancing.py +270 -0
- nkululeko/utils/util.py +38 -6
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/METADATA +1 -1
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/RECORD +40 -27
- nkululeko/feat_extract/feats_opensmile copy.py +0 -93
- nkululeko/feat_extract/feinberg_praat.py +0 -628
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/WHEEL +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/entry_points.txt +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/licenses/LICENSE +0 -0
- {nkululeko-0.94.3.dist-info → nkululeko-0.95.1.dist-info}/top_level.txt +0 -0
nkululeko/models/model_tuned.py
CHANGED
@@ -84,6 +84,17 @@ class TunedModel(BaseModel):
         # print finetuning information via debug
         self.util.debug(f"Finetuning from model: {pretrained_model}")
 
+        if any(
+            emotion_model in pretrained_model
+            for emotion_model in ["emotion2vec", "iic/emotion2vec"]
+        ):
+            self._init_emotion2vec_model(pretrained_model)
+            return
+
+        self._init_huggingface_model(pretrained_model)
+
+    def _init_huggingface_model(self, pretrained_model):
+        """Initialize HuggingFace transformer model for finetuning."""
         # create dataset
         dataset = {}
         target_name = glob_conf.target
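One observation on the routing check above: "emotion2vec" is a substring of "iic/emotion2vec", so the second marker is redundant and the predicate reduces to a single substring test. A standalone sketch of the rule (function name hypothetical):

```python
# Sketch of the new dispatch rule: model names containing "emotion2vec"
# take the FunASR path; everything else keeps the HuggingFace path.
def uses_emotion2vec(pretrained_model: str) -> bool:
    return any(
        marker in pretrained_model
        for marker in ["emotion2vec", "iic/emotion2vec"]
    )

assert uses_emotion2vec("emotion2vec-large")
assert uses_emotion2vec("iic/emotion2vec_plus_seed")
assert not uses_emotion2vec("facebook/wav2vec2-large-robust")
```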
@@ -99,41 +110,9 @@ class TunedModel(BaseModel):
             df = y.reset_index()
             df.start = df.start.dt.total_seconds()
             df.end = df.end.dt.total_seconds()
-            # ds = datasets.Dataset.from_pandas(df)
-            # dataset[split] = ds
 
-        # self.dataset = datasets.DatasetDict(dataset)
             if split == "train" and self.balancing:
-                if self.balancing == "ros":
-                    from imblearn.over_sampling import RandomOverSampler
-
-                    sampler = RandomOverSampler(random_state=42)
-                elif self.balancing == "smote":
-                    from imblearn.over_sampling import SMOTE
-
-                    sampler = SMOTE(random_state=42)
-                elif self.balancing == "adasyn":
-                    from imblearn.over_sampling import ADASYN
-
-                    sampler = ADASYN(random_state=42)
-                else:
-                    self.util.error(f"Unknown balancing algorithm: {self.balancing}")
-
-                X_resampled, y_resampled = sampler.fit_resample(
-                    df[["start", "end"]], df["targets"]
-                )
-                df = pd.DataFrame(
-                    {
-                        "start": X_resampled["start"],
-                        "end": X_resampled["end"],
-                        "targets": y_resampled,
-                    }
-                )
-
-                # print the before and after class distribution
-                self.util.debug(
-                    f"balanced with: {self.balancing}, new size: {len(df)}, was {len(data_sources[split])}"
-                )
+                df = self._apply_balancing(df, data_sources[split])
 
             ds = datasets.Dataset.from_pandas(df)
             dataset[split] = ds
@@ -144,6 +123,13 @@ class TunedModel(BaseModel):
         if self.is_classifier:
             self.util.debug("Task is classification.")
             le = glob_conf.label_encoder
+            if le is None:
+                self.util.error(
+                    "Label encoder is not available. Make sure to set up data loading properly."
+                )
+                raise ValueError(
+                    "Label encoder is missing. Initialization cannot proceed. Ensure data loading is correctly configured."
+                )
             mapping = dict(zip(le.classes_, range(len(le.classes_))))
             target_mapping = {k: int(v) for k, v in mapping.items()}
             target_mapping_reverse = {
@@ -191,16 +177,136 @@ class TunedModel(BaseModel):
             feature_extractor=feature_extractor,
             tokenizer=tokenizer,
         )
-        assert self.processor.feature_extractor.sampling_rate == self.sampling_rate
+        assert self.processor.feature_extractor.sampling_rate == self.sampling_rate  # type: ignore
 
-        self.model = Model.from_pretrained(
+        self.model = Model.from_pretrained(  # type: ignore
             pretrained_model,
             config=self.config,
         )
-        self.model.freeze_feature_extractor()
+        self.model.freeze_feature_extractor()  # type: ignore
+        self.model.train()  # type: ignore
+        self.model_initialized = True
+
+    def _init_emotion2vec_model(self, pretrained_model):
+        """Initialize emotion2vec model for finetuning."""
+        try:
+            from funasr import AutoModel
+        except ImportError:
+            self.util.error(
+                "FunASR is required for emotion2vec finetuning. "
+                "Please install with: pip install funasr"
+            )
+            return
+
+        model_mapping = {
+            "emotion2vec": "emotion2vec/emotion2vec_base",
+            "emotion2vec-base": "emotion2vec/emotion2vec_base",
+            "emotion2vec-seed": "emotion2vec/emotion2vec_plus_seed",
+            "emotion2vec-large": "emotion2vec/emotion2vec_plus_large",
+        }
+
+        if pretrained_model in model_mapping:
+            model_path = model_mapping[pretrained_model]
+        else:
+            model_path = pretrained_model
+
+        self._create_emotion2vec_dataset()
+
+        self.emotion2vec_backbone = AutoModel(
+            model=model_path,
+            hub="hf"  # Use HuggingFace Hub instead of ModelScope
+        )
+
+        if self.is_classifier:
+            le = glob_conf.label_encoder
+            if le is None:
+                self.util.error("Label encoder not available for classification")
+                return
+            num_labels = len(le.classes_)
+            label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
+            self.config = EmotionVecConfig(
+                num_labels=num_labels,
+                label2id=label_mapping,
+                id2label={v: k for k, v in label_mapping.items()},
+                is_classifier=True,
+                sampling_rate=self.sampling_rate,
+                final_dropout=self.drop,
+                model_name=pretrained_model,
+            )
+        else:
+            self.config = EmotionVecConfig(
+                num_labels=1,
+                is_classifier=False,
+                sampling_rate=self.sampling_rate,
+                final_dropout=self.drop,
+                model_name=pretrained_model,
+            )
+
+        self.model = Emotion2vecModel(self.emotion2vec_backbone, self.config)
         self.model.train()
         self.model_initialized = True
 
+        self.processor = None
+
+    def _create_emotion2vec_dataset(self):
+        """Create dataset for emotion2vec training."""
+        dataset = {}
+        target_name = glob_conf.target
+        data_sources = {
+            "train": pd.DataFrame(self.df_train[target_name]),
+            "dev": pd.DataFrame(self.df_test[target_name]),
+        }
+
+        for split in ["train", "dev"]:
+            df = data_sources[split]
+            y = df[target_name].astype("float")
+            y.name = "targets"
+            df = y.reset_index()
+            df.start = df.start.dt.total_seconds()
+            df.end = df.end.dt.total_seconds()
+
+            if split == "train" and self.balancing:
+                df = self._apply_balancing(df, data_sources[split])
+
+            ds = datasets.Dataset.from_pandas(df)
+            dataset[split] = ds
+
+        self.dataset = datasets.DatasetDict(dataset)
+
+    def _apply_balancing(self, df, original_df):
+        """Apply data balancing to training dataset."""
+        if self.balancing == "ros":
+            from imblearn.over_sampling import RandomOverSampler
+
+            sampler = RandomOverSampler(random_state=42)
+        elif self.balancing == "smote":
+            from imblearn.over_sampling import SMOTE
+
+            sampler = SMOTE(random_state=42)
+        elif self.balancing == "adasyn":
+            from imblearn.over_sampling import ADASYN
+
+            sampler = ADASYN(random_state=42)
+        else:
+            self.util.error(f"Unknown balancing algorithm: {self.balancing}")
+            return df
+
+        X_resampled, y_resampled = sampler.fit_resample(
+            df[["start", "end"]], df["targets"]
+        )
+        df = pd.DataFrame(
+            {
+                "start": X_resampled["start"],
+                "end": X_resampled["end"],
+                "targets": y_resampled,
+            }
+        )
+
+        self.util.debug(
+            f"balanced with: {self.balancing}, new size: {len(df)}, was {len(original_df)}"
+        )
+        return df
+
     def set_model_type(self, type):
         self.model_type = type
 
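The extracted `_apply_balancing` helper resamples only the `start`/`end` index columns together with the targets; note that SMOTE and ADASYN would then interpolate segment timestamps, which arguably only makes sense for the row-duplicating `ros` option. A standalone sketch of the `ros` path (hypothetical data; requires imbalanced-learn):

```python
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

# Hypothetical imbalanced segment table: three samples of class 0, one of class 1.
df = pd.DataFrame(
    {
        "start": [0.0, 1.0, 2.0, 3.0],
        "end": [1.0, 2.0, 3.0, 4.0],
        "targets": [0, 0, 0, 1],
    }
)

sampler = RandomOverSampler(random_state=42)
X_res, y_res = sampler.fit_resample(df[["start", "end"]], df["targets"])
balanced = pd.DataFrame(
    {"start": X_res["start"], "end": X_res["end"], "targets": y_res}
)
print(f"{len(df)} -> {len(balanced)}")  # 4 -> 6: minority rows duplicated
```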
@@ -224,11 +330,7 @@ class TunedModel(BaseModel):
         targets = [d["targets"] for d in data]
 
         signals = []
-        for file, start, end in zip(
-            files,
-            starts,
-            ends,
-        ):
+        for file, start, end in zip(files, starts, ends):
             offset = start
             duration = end - offset
             if self.max_duration_sec is not None:
@@ -240,18 +342,37 @@ class TunedModel(BaseModel):
             )
             signals.append(signal.squeeze())
 
-
-        input_values = self.processor(
-            signals,
-            sampling_rate=self.sampling_rate,
-            padding=True,
-        )
-        batch = self.processor.pad(
-            input_values,
-            padding=True,
-            return_tensors="pt",
-        )
-        batch["labels"] = torch.Tensor(targets)
+        if hasattr(self, "emotion2vec_backbone"):
+            max_length = max(len(s) for s in signals)
+            padded_signals = []
+            for s in signals:
+                if len(s) < max_length:
+                    padded = np.pad(s, (0, max_length - len(s)), mode="constant")
+                else:
+                    padded = s[:max_length]
+                padded_signals.append(padded)
+
+            batch = {
+                "input_values": torch.stack(
+                    [torch.tensor(s, dtype=torch.float32) for s in padded_signals]
+                ),
+                "labels": torch.tensor(
+                    targets,
+                    dtype=torch.float32 if not self.is_classifier else torch.long,
+                ),
+            }
+        else:
+            input_values = self.processor(
+                signals,
+                sampling_rate=self.sampling_rate,
+                padding=True,
+            )
+            batch = self.processor.pad(
+                input_values,
+                padding=True,
+                return_tensors="pt",
+            )
+            batch["labels"] = torch.Tensor(targets)
 
         return batch
 
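In the collator, the emotion2vec branch pads raw waveforms by hand since that path has no processor (`self.processor = None` above). The padding step in isolation (function name hypothetical):

```python
import numpy as np
import torch

def collate_signals(signals, targets, is_classifier=True):
    """Zero-pad variable-length signals to the batch maximum and stack them."""
    max_length = max(len(s) for s in signals)
    padded = [np.pad(s, (0, max_length - len(s)), mode="constant") for s in signals]
    return {
        "input_values": torch.stack(
            [torch.tensor(p, dtype=torch.float32) for p in padded]
        ),
        "labels": torch.tensor(
            targets, dtype=torch.long if is_classifier else torch.float32
        ),
    }

batch = collate_signals([np.zeros(16000), np.zeros(8000)], [0, 1])
print(batch["input_values"].shape)  # torch.Size([2, 16000])
```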
@@ -270,7 +391,22 @@ class TunedModel(BaseModel):
         # truth = p.label_ids[:, 0].astype(int)
         truth = p.label_ids
         preds = p.predictions
-
+
+        if isinstance(preds, tuple):
+            if len(preds) > 0:
+                preds = preds[0]  # Extract logits from tuple
+            else:
+                raise ValueError(f"Empty predictions tuple received: {preds}")
+
+        if hasattr(preds, 'numpy'):
+            preds = preds.numpy()
+        elif hasattr(preds, 'detach'):
+            preds = preds.detach().numpy()
+
+        if len(preds.shape) > 1 and preds.shape[1] > 1:
+            preds = np.argmax(preds, axis=1)
+        else:
+            preds = preds.flatten()
         scores = {}
         if self.is_classifier:
             for name, metric in metrics.items():
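The added guards normalize `EvalPrediction.predictions`, which may arrive as a tuple of arrays, a torch tensor, or a plain ndarray depending on the model. The same normalization as a standalone helper (name hypothetical):

```python
import numpy as np
import torch

def normalize_predictions(preds):
    if isinstance(preds, tuple):  # some models return (logits, hidden_states, ...)
        preds = preds[0]
    if isinstance(preds, torch.Tensor):  # tensors -> numpy
        preds = preds.detach().cpu().numpy()
    if preds.ndim > 1 and preds.shape[1] > 1:  # class logits -> indices
        return np.argmax(preds, axis=1)
    return preds.flatten()  # regression / single-logit case

print(normalize_predictions(np.array([[0.1, 0.9], [0.8, 0.2]])))  # [1 0]
```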
@@ -329,12 +465,16 @@ class TunedModel(BaseModel):
                 model,
                 inputs,
                 return_outputs=False,
+                num_items_in_batch=None,
             ):
                 targets = inputs.pop("labels").squeeze()
                 targets = targets.type(torch.long)
 
                 outputs = model(**inputs)
-                logits = outputs[0].squeeze()
+                if hasattr(outputs, 'logits'):
+                    logits = outputs.logits.squeeze()
+                else:
+                    logits = outputs[0].squeeze()
 
                 loss = criterion(logits, targets)
 
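The new `num_items_in_batch=None` parameter tracks a signature change in recent transformers releases, which pass that argument to `compute_loss`; accepting it keeps the override compatible across versions. A self-contained sketch of a compatible override (class name and loss choice are illustrative, not the package's code):

```python
import torch
import transformers

class PatchedTrainer(transformers.Trainer):
    # Newer transformers versions pass num_items_in_batch to compute_loss;
    # accepting (and ignoring) it keeps the override working on old and new.
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Tolerate both dataclass-style and tuple-style model outputs.
        logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
        loss = torch.nn.functional.cross_entropy(logits, labels)
        return (loss, outputs) if return_outputs else loss
```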
@@ -365,7 +505,7 @@ class TunedModel(BaseModel):
             per_device_train_batch_size=self.batch_size,
             per_device_eval_batch_size=self.batch_size,
             gradient_accumulation_steps=self.accumulation_steps,
-            evaluation_strategy="steps",
+            eval_strategy="steps",
             num_train_epochs=self.epoch_num,
             fp16=self.device != "cpu",
             use_cpu=self.device == "cpu",
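This follows the transformers rename of `evaluation_strategy` to `eval_strategy` (the removed keyword appears to be the pre-rename spelling, which is deprecated in recent releases). Assuming that rename, the minimal usage is:

```python
import transformers

# eval_strategy replaced evaluation_strategy in recent transformers releases;
# the old name now emits a deprecation warning. Output dir is hypothetical.
args = transformers.TrainingArguments(
    output_dir="/tmp/out",
    eval_strategy="steps",
)
```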
@@ -385,16 +525,20 @@ class TunedModel(BaseModel):
             overwrite_output_dir=True,
         )
 
-        trainer = Trainer(
-            model=self.model,
-            data_collator=self.data_collator,
-            args=training_args,
-            compute_metrics=self.compute_metrics,
-            train_dataset=self.dataset["train"],
-            eval_dataset=self.dataset["dev"],
-            tokenizer=self.processor.feature_extractor,
-            callbacks=[transformers.integrations.TensorBoardCallback()],
-        )
+        trainer_kwargs = {
+            "model": self.model,
+            "data_collator": self.data_collator,
+            "args": training_args,
+            "compute_metrics": self.compute_metrics,
+            "train_dataset": self.dataset["train"],
+            "eval_dataset": self.dataset["dev"],
+            "callbacks": [transformers.integrations.TensorBoardCallback()],
+        }
+
+        if self.processor is not None:
+            trainer_kwargs["tokenizer"] = self.processor.feature_extractor
+
+        trainer = Trainer(**trainer_kwargs)
 
         trainer.train()
         trainer.save_model(self.torch_root)
@@ -421,7 +565,7 @@ class TunedModel(BaseModel):
                 file, duration=end - start, offset=start, always_2d=True
             )
             assert sr == self.sampling_rate
-            prediction = self.model.predict(signal)
+            prediction = self.model.predict(signal)  # type: ignore
             results.append(prediction)
             # results.append(predictions.argmax())
         predictions = np.asarray(results)
@@ -483,13 +627,13 @@ class TunedModel(BaseModel):
         prediction = {}
         if self.is_classifier:
             # get the class probabilities
-            predictions = self.model.predict(signal)
+            predictions = self.model.predict(signal)  # type: ignore
             # pred = self.clf.predict(features)
             for i in range(len(self.labels)):
                 cat = self.labels[i]
                 prediction[cat] = predictions[i]
         else:
-            predictions = self.model.predict(signal)
+            predictions = self.model.predict(signal)  # type: ignore
             prediction = predictions
         return prediction
 
@@ -498,10 +642,16 @@ class TunedModel(BaseModel):
 
     def load(self, run, epoch):
         self.set_id(run, epoch)
-        self.model = Model.from_pretrained(
-            self.torch_root,
-            config=self.config,
-        )
+        if hasattr(self, "emotion2vec_backbone"):
+            model_path = os.path.join(self.torch_root, "pytorch_model.bin")
+            if os.path.exists(model_path):
+                self.model.load_state_dict(torch.load(model_path))
+            self.model.eval()
+        else:
+            self.model = Model.from_pretrained(
+                self.torch_root,
+                config=self.config,
+            )
         # print(f"loaded model type {type(self.model)}")
 
     def load_path(self, path, run, epoch):
@@ -511,20 +661,70 @@ class TunedModel(BaseModel):
 
 
 @dataclasses.dataclass
-class ModelOutput(transformers.file_utils.ModelOutput):
-    logits: torch.Tensor = None
-    hidden_states: typing.Tuple[torch.FloatTensor] = None
-    cnn_features: torch.Tensor = None
+class ModelOutput:
+    logits: typing.Optional[torch.Tensor] = None
+    hidden_states: typing.Optional[torch.Tensor] = None
+    cnn_features: typing.Optional[torch.Tensor] = None
+
+    def __getitem__(self, index):
+        """Make ModelOutput subscriptable for HuggingFace compatibility."""
+        if isinstance(index, slice):
+            items = [self.logits, self.hidden_states, self.cnn_features]
+            result = items[index]
+            filtered_result = [item for item in result if item is not None]
+
+            if not filtered_result and self.logits is not None:
+                return (self.logits,)
+
+            return tuple(filtered_result)
+        elif index == 0:
+            return self.logits
+        elif index == 1:
+            return self.hidden_states
+        elif index == 2:
+            return self.cnn_features
+        else:
+            raise IndexError(f"Index {index} out of range for ModelOutput")
+
+    def __len__(self):
+        """Return the number of available outputs."""
+        return 3
 
 
 @dataclasses.dataclass
-class ModelOutputReg(transformers.file_utils.ModelOutput):
-    logits: torch.Tensor
-    hidden_states: typing.Tuple[torch.FloatTensor] = None
-    attentions: typing.Tuple[torch.FloatTensor] = None
-    logits_framewise: torch.Tensor = None
-    hidden_states_framewise: torch.Tensor = None
-    cnn_features: torch.Tensor = None
+class ModelOutputReg:
+    logits: torch.Tensor
+    hidden_states: typing.Optional[torch.Tensor] = None
+    attentions: typing.Optional[torch.Tensor] = None
+    logits_framewise: typing.Optional[torch.Tensor] = None
+    hidden_states_framewise: typing.Optional[torch.Tensor] = None
+    cnn_features: typing.Optional[torch.Tensor] = None
+
+    def __getitem__(self, index):
+        """Make ModelOutputReg subscriptable for HuggingFace compatibility."""
+        if isinstance(index, slice):
+            items = [self.logits, self.hidden_states, self.attentions,
+                     self.logits_framewise, self.hidden_states_framewise, self.cnn_features]
+            result = items[index]
+            return tuple(item for item in result if item is not None)
+        elif index == 0:
+            return self.logits
+        elif index == 1:
+            return self.hidden_states
+        elif index == 2:
+            return self.attentions
+        elif index == 3:
+            return self.logits_framewise
+        elif index == 4:
+            return self.hidden_states_framewise
+        elif index == 5:
+            return self.cnn_features
+        else:
+            raise IndexError(f"Index {index} out of range for ModelOutputReg")
+
+    def __len__(self):
+        """Return the number of available outputs."""
+        return 6
 
 
 class ModelHead(torch.nn.Module):
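The rewritten output dataclasses add `__getitem__` and `__len__` so HuggingFace code that indexes model outputs positionally (for example the `outputs[0]` fallback in `compute_loss` above) keeps working. A minimal, self-contained illustration of the pattern (`TinyOutput` is a hypothetical stand-in, not the package's class):

```python
import dataclasses
import typing

import torch

@dataclasses.dataclass
class TinyOutput:
    """Minimal stand-in showing the tuple-like protocol."""
    logits: typing.Optional[torch.Tensor] = None
    hidden_states: typing.Optional[torch.Tensor] = None

    def __getitem__(self, index):
        # Works for both integer indices and slices.
        return (self.logits, self.hidden_states)[index]

    def __len__(self):
        return 2

out = TinyOutput(logits=torch.zeros(3))
assert out[0] is out.logits  # positional access, Trainer-style
assert len(out) == 2
```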
@@ -637,6 +837,138 @@ class Model(Wav2Vec2PreTrainedModel):
         return result
 
 
+class EmotionVecConfig:
+    """Configuration class for emotion2vec models."""
+
+    def __init__(
+        self,
+        num_labels,
+        is_classifier=True,
+        sampling_rate=16000,
+        final_dropout=0.1,
+        model_name=None,
+        **kwargs,
+    ):
+        self.num_labels = num_labels
+        self.is_classifier = is_classifier
+        self.sampling_rate = sampling_rate
+        self.final_dropout = final_dropout
+        self.model_name = model_name
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    def to_json_string(self):
+        """Convert config to JSON string for HuggingFace compatibility."""
+        import json
+        config_dict = {
+            "num_labels": self.num_labels,
+            "is_classifier": self.is_classifier,
+            "sampling_rate": self.sampling_rate,
+            "final_dropout": self.final_dropout,
+        }
+        for key, value in self.__dict__.items():
+            if key not in config_dict:
+                config_dict[key] = value
+        return json.dumps(config_dict, indent=2)
+
+
+class Emotion2vecModel(torch.nn.Module):
+    """Wrapper class for emotion2vec finetuning."""
+
+    def __init__(self, emotion2vec_backbone, config):
+        super().__init__()
+        self.emotion2vec_backbone = emotion2vec_backbone
+        self.config = config
+        self.is_classifier = config.is_classifier
+
+        # Determine embedding dimension based on model variant (hardcoded)
+        embedding_dim = self._get_embedding_dim_by_model()
+        self.head = torch.nn.Sequential(
+            torch.nn.Dropout(config.final_dropout),
+            torch.nn.Linear(embedding_dim, config.num_labels),
+        )
+
+    def _get_embedding_dim_by_model(self):
+        """Get embedding dimension based on model variant."""
+        model_name = getattr(self.config, 'model_name', '')
+
+        # Large models have 1024 dimensions
+        if 'large' in model_name.lower():
+            return 1024
+        # Base, seed, and other models have 768 dimensions
+        else:
+            return 768
+
+    def forward(self, input_values, labels=None, **kwargs):
+        embeddings = self._extract_embeddings(input_values)
+
+        logits = self.head(embeddings)
+
+        if not self.training and self.is_classifier:
+            logits = torch.softmax(logits, dim=1)
+
+        if self.is_classifier:
+            return ModelOutput(logits=logits)
+        else:
+            return ModelOutputReg(logits=logits)
+
+    def _extract_embeddings(self, input_values):
+        batch_embeddings = []
+        device = next(self.parameters()).device  # Get the device of the model
+        for audio_tensor in input_values:
+            embedding = self._process_single_audio(audio_tensor)
+            # Ensure embedding is on the same device as the model
+            embedding = embedding.to(device)
+            batch_embeddings.append(embedding)
+        return torch.stack(batch_embeddings)
+
+    def _process_single_audio(self, audio_tensor):
+        import tempfile
+        import soundfile as sf
+
+        signal_np = audio_tensor.squeeze().cpu().numpy()
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            sf.write(tmp_file.name, signal_np, self.config.sampling_rate)
+
+            try:
+                res = self.emotion2vec_backbone.generate(
+                    tmp_file.name, granularity="utterance", extract_embedding=True
+                )
+
+                if isinstance(res, list) and len(res) > 0:
+                    embeddings = res[0].get("feats", None)
+                    if embeddings is not None:
+                        if isinstance(embeddings, list):
+                            embeddings = np.array(embeddings)
+                        return torch.tensor(embeddings.flatten(), dtype=torch.float32)
+
+                # Fallback based on model type
+                model_name = getattr(self.config, 'model_name', '')
+                if 'large' in model_name.lower():
+                    return torch.zeros(1024, dtype=torch.float32)
+                else:
+                    return torch.zeros(768, dtype=torch.float32)
+            finally:
+                os.unlink(tmp_file.name)
+
+    def predict(self, signal):
+        """Predict method for compatibility with nkululeko prediction pipeline."""
+        if isinstance(signal, np.ndarray):
+            signal_tensor = torch.from_numpy(signal).unsqueeze(0)
+        else:
+            signal_tensor = signal.unsqueeze(0) if signal.dim() == 1 else signal
+
+        with torch.no_grad():
+            result = self(signal_tensor)
+
+        if self.is_classifier:
+            logits = result.logits
+        else:
+            logits = result.logits
+
+        return logits.detach().cpu().numpy()[0]
+
+
 class ConcordanceCorCoeff(torch.nn.Module):
     def __init__(self):
         super().__init__()
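For orientation, `_process_single_audio` round-trips each waveform through a temporary wav file because `generate` is invoked with a file path here; the call shape matches FunASR's emotion2vec usage as shown in the hunk above. A minimal sketch of the same call outside the wrapper (requires `pip install funasr`; the wav path is hypothetical):

```python
from funasr import AutoModel

# hub="hf" fetches weights from HuggingFace instead of ModelScope.
model = AutoModel(model="emotion2vec/emotion2vec_base", hub="hf")
res = model.generate(
    "sample_16k.wav", granularity="utterance", extract_embedding=True
)
feats = res[0]["feats"]  # utterance-level embedding; 768-dim for base models
print(len(feats))
```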