nkululeko 0.90.0__py3-none-any.whl → 0.90.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/aug_train.py +6 -4
- nkululeko/augment.py +6 -4
- nkululeko/augmenting/augmenter.py +4 -4
- nkululeko/augmenting/randomsplicer.py +6 -6
- nkululeko/augmenting/randomsplicing.py +2 -3
- nkululeko/augmenting/resampler.py +9 -6
- nkululeko/autopredict/ap_age.py +4 -2
- nkululeko/autopredict/ap_arousal.py +4 -2
- nkululeko/autopredict/ap_dominance.py +3 -2
- nkululeko/autopredict/ap_gender.py +4 -2
- nkululeko/autopredict/ap_mos.py +5 -2
- nkululeko/autopredict/ap_pesq.py +5 -2
- nkululeko/autopredict/ap_sdr.py +5 -2
- nkululeko/autopredict/ap_snr.py +5 -2
- nkululeko/autopredict/ap_stoi.py +5 -2
- nkululeko/autopredict/ap_valence.py +4 -2
- nkululeko/autopredict/estimate_snr.py +10 -14
- nkululeko/cacheddataset.py +1 -1
- nkululeko/constants.py +1 -1
- nkululeko/data/dataset.py +11 -14
- nkululeko/data/dataset_csv.py +5 -3
- nkululeko/demo-ft.py +29 -0
- nkululeko/demo_feats.py +5 -4
- nkululeko/demo_predictor.py +3 -4
- nkululeko/ensemble.py +27 -28
- nkululeko/experiment.py +3 -5
- nkululeko/experiment_felix.py +728 -0
- nkululeko/explore.py +1 -0
- nkululeko/export.py +7 -5
- nkululeko/feat_extract/feats_agender.py +5 -4
- nkululeko/feat_extract/feats_agender_agender.py +7 -6
- nkululeko/feat_extract/feats_analyser.py +18 -16
- nkululeko/feat_extract/feats_ast.py +9 -8
- nkululeko/feat_extract/feats_auddim.py +3 -5
- nkululeko/feat_extract/feats_audmodel.py +2 -2
- nkululeko/feat_extract/feats_clap.py +9 -12
- nkululeko/feat_extract/feats_hubert.py +2 -3
- nkululeko/feat_extract/feats_import.py +5 -4
- nkululeko/feat_extract/feats_mld.py +3 -5
- nkululeko/feat_extract/feats_mos.py +4 -3
- nkululeko/feat_extract/feats_opensmile.py +4 -3
- nkululeko/feat_extract/feats_oxbow.py +5 -4
- nkululeko/feat_extract/feats_praat.py +4 -7
- nkululeko/feat_extract/feats_snr.py +3 -5
- nkululeko/feat_extract/feats_spectra.py +8 -9
- nkululeko/feat_extract/feats_spkrec.py +6 -11
- nkululeko/feat_extract/feats_squim.py +2 -4
- nkululeko/feat_extract/feats_trill.py +2 -5
- nkululeko/feat_extract/feats_wav2vec2.py +8 -4
- nkululeko/feat_extract/feats_wavlm.py +2 -3
- nkululeko/feat_extract/feats_whisper.py +4 -6
- nkululeko/feat_extract/featureset.py +4 -2
- nkululeko/feat_extract/feinberg_praat.py +1 -3
- nkululeko/feat_extract/transformer_feature_extractor.py +147 -0
- nkululeko/file_checker.py +3 -3
- nkululeko/filter_data.py +3 -1
- nkululeko/fixedsegment.py +83 -0
- nkululeko/models/model.py +3 -5
- nkululeko/models/model_bayes.py +1 -0
- nkululeko/models/model_cnn.py +4 -6
- nkululeko/models/model_gmm.py +13 -9
- nkululeko/models/model_knn.py +1 -0
- nkululeko/models/model_knn_reg.py +1 -0
- nkululeko/models/model_lin_reg.py +1 -0
- nkululeko/models/model_mlp.py +2 -3
- nkululeko/models/model_mlp_regression.py +1 -6
- nkululeko/models/model_svm.py +2 -2
- nkululeko/models/model_svr.py +1 -0
- nkululeko/models/model_tree.py +2 -3
- nkululeko/models/model_tree_reg.py +1 -0
- nkululeko/models/model_tuned.py +54 -33
- nkululeko/models/model_xgb.py +1 -0
- nkululeko/models/model_xgr.py +1 -0
- nkululeko/multidb.py +1 -0
- nkululeko/nkululeko.py +1 -1
- nkululeko/predict.py +4 -5
- nkululeko/reporting/defines.py +6 -8
- nkululeko/reporting/latex_writer.py +3 -3
- nkululeko/reporting/report.py +2 -2
- nkululeko/reporting/report_item.py +1 -0
- nkululeko/reporting/reporter.py +20 -19
- nkululeko/resample.py +8 -12
- nkululeko/resample_cli.py +99 -0
- nkululeko/runmanager.py +3 -1
- nkululeko/scaler.py +1 -1
- nkululeko/segment.py +6 -5
- nkululeko/segmenting/seg_inaspeechsegmenter.py +3 -3
- nkululeko/segmenting/seg_silero.py +4 -4
- nkululeko/syllable_nuclei.py +9 -22
- nkululeko/test_pretrain.py +6 -7
- nkululeko/utils/stats.py +0 -1
- nkululeko/utils/util.py +2 -3
- {nkululeko-0.90.0.dist-info → nkululeko-0.90.1.dist-info}/METADATA +6 -2
- nkululeko-0.90.1.dist-info/RECORD +119 -0
- {nkululeko-0.90.0.dist-info → nkululeko-0.90.1.dist-info}/WHEEL +1 -1
- nkululeko-0.90.0.dist-info/RECORD +0 -114
- {nkululeko-0.90.0.dist-info → nkululeko-0.90.1.dist-info}/LICENSE +0 -0
- {nkululeko-0.90.0.dist-info → nkululeko-0.90.1.dist-info}/top_level.txt +0 -0
@@ -8,11 +8,10 @@ import pandas as pd
|
|
8
8
|
import torch
|
9
9
|
import torchaudio
|
10
10
|
from tqdm import tqdm
|
11
|
-
from transformers import Wav2Vec2FeatureExtractor
|
12
|
-
from transformers import WavLMModel
|
11
|
+
from transformers import Wav2Vec2FeatureExtractor, WavLMModel
|
13
12
|
|
14
|
-
from nkululeko.feat_extract.featureset import Featureset
|
15
13
|
import nkululeko.glob_conf as glob_conf
|
14
|
+
from nkululeko.feat_extract.featureset import Featureset
|
16
15
|
|
17
16
|
|
18
17
|
class Wavlm(Featureset):
|
@@ -1,16 +1,14 @@
|
|
1
1
|
# feats_whisper.py
|
2
2
|
import os
|
3
3
|
|
4
|
-
import pandas as pd
|
5
|
-
import torch
|
6
|
-
from transformers import AutoFeatureExtractor
|
7
|
-
from transformers import WhisperModel
|
8
|
-
|
9
4
|
import audeer
|
10
5
|
import audiofile
|
6
|
+
import pandas as pd
|
7
|
+
import torch
|
8
|
+
from transformers import AutoFeatureExtractor, WhisperModel
|
11
9
|
|
12
|
-
from nkululeko.feat_extract.featureset import Featureset
|
13
10
|
import nkululeko.glob_conf as glob_conf
|
11
|
+
from nkululeko.feat_extract.featureset import Featureset
|
14
12
|
|
15
13
|
|
16
14
|
class Whisper(Featureset):
|
@@ -7,6 +7,7 @@ taken June 23rd 2022.
|
|
7
7
|
import math
|
8
8
|
import statistics
|
9
9
|
|
10
|
+
import audiofile
|
10
11
|
import numpy as np
|
11
12
|
import pandas as pd
|
12
13
|
import parselmouth
|
@@ -15,9 +16,6 @@ from scipy.stats.mstats import zscore
|
|
15
16
|
from sklearn.decomposition import PCA
|
16
17
|
from tqdm import tqdm
|
17
18
|
|
18
|
-
import audiofile
|
19
|
-
|
20
|
-
|
21
19
|
# This is the function to measure source acoustics using default male parameters.
|
22
20
|
|
23
21
|
|
@@ -0,0 +1,147 @@
|
|
1
|
+
# transformer_feature_extractor.py
|
2
|
+
|
3
|
+
import os
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
import torch
|
7
|
+
import torchaudio
|
8
|
+
from tqdm import tqdm
|
9
|
+
|
10
|
+
import nkululeko.glob_conf as glob_conf
|
11
|
+
from nkululeko.feat_extract.featureset import Featureset
|
12
|
+
|
13
|
+
|
14
|
+
class TransformerFeatureExtractor(Featureset):
|
15
|
+
def __init__(self, name, data_df, feat_type):
|
16
|
+
super().__init__(name, data_df, feat_type)
|
17
|
+
cuda = "cuda" if torch.cuda.is_available() else "cpu"
|
18
|
+
self.device = self.util.config_val("MODEL", "device", cuda)
|
19
|
+
self.model_initialized = False
|
20
|
+
self.feat_type = feat_type
|
21
|
+
|
22
|
+
def init_model(self):
|
23
|
+
raise NotImplementedError("Subclasses must implement init_model method")
|
24
|
+
|
25
|
+
def get_embeddings(self, signal, sampling_rate, file):
|
26
|
+
try:
|
27
|
+
with torch.no_grad():
|
28
|
+
# Preprocess the input
|
29
|
+
inputs = self.preprocess_input(signal, sampling_rate)
|
30
|
+
|
31
|
+
# Get model outputs
|
32
|
+
outputs = self.model(**inputs)
|
33
|
+
|
34
|
+
# Extract the relevant hidden states
|
35
|
+
hidden_states = self.extract_hidden_states(outputs)
|
36
|
+
|
37
|
+
# Pool the hidden states
|
38
|
+
embeddings = self.pool_hidden_states(hidden_states)
|
39
|
+
|
40
|
+
# Convert to numpy and flatten
|
41
|
+
embeddings = embeddings.cpu().numpy().ravel()
|
42
|
+
|
43
|
+
return embeddings
|
44
|
+
|
45
|
+
except Exception as e:
|
46
|
+
self.util.error(f"Error extracting embeddings for file {file}: {str(e)}")
|
47
|
+
return np.zeros(self.get_embedding_dim()) # Return zero vector on error
|
48
|
+
|
49
|
+
def preprocess_input(self, signal, sampling_rate):
|
50
|
+
# This method should be implemented by subclasses
|
51
|
+
raise NotImplementedError("Subclasses must implement preprocess_input method")
|
52
|
+
|
53
|
+
def extract_hidden_states(self, outputs):
|
54
|
+
# This method should be implemented by subclasses
|
55
|
+
raise NotImplementedError(
|
56
|
+
"Subclasses must implement extract_hidden_states method"
|
57
|
+
)
|
58
|
+
|
59
|
+
def pool_hidden_states(self, hidden_states):
|
60
|
+
# Default implementation: mean pooling over time dimension
|
61
|
+
return torch.mean(hidden_states, dim=1)
|
62
|
+
|
63
|
+
def get_embedding_dim(self):
|
64
|
+
# This method should be implemented by subclasses
|
65
|
+
raise NotImplementedError("Subclasses must implement get_embedding_dim method")
|
66
|
+
|
67
|
+
def extract(self):
|
68
|
+
store = self.util.get_path("store")
|
69
|
+
storage = f"{store}{self.name}.pkl"
|
70
|
+
extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
|
71
|
+
no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
|
72
|
+
if extract or no_reuse or not os.path.isfile(storage):
|
73
|
+
if not self.model_initialized:
|
74
|
+
self.init_model()
|
75
|
+
self.util.debug(
|
76
|
+
f"extracting {self.feat_type} embeddings, this might take a while..."
|
77
|
+
)
|
78
|
+
emb_series = pd.Series(index=self.data_df.index, dtype=object)
|
79
|
+
for idx, (file, start, end) in enumerate(
|
80
|
+
tqdm(self.data_df.index.to_list())
|
81
|
+
):
|
82
|
+
signal, sampling_rate = torchaudio.load(
|
83
|
+
file,
|
84
|
+
frame_offset=int(start.total_seconds() * 16000),
|
85
|
+
num_frames=int((end - start).total_seconds() * 16000),
|
86
|
+
)
|
87
|
+
assert sampling_rate == 16000, f"got {sampling_rate} instead of 16000"
|
88
|
+
emb = self.get_embeddings(signal, sampling_rate, file)
|
89
|
+
emb_series[idx] = emb
|
90
|
+
self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
|
91
|
+
self.df.to_pickle(storage)
|
92
|
+
try:
|
93
|
+
glob_conf.config["DATA"]["needs_feature_extraction"] = "false"
|
94
|
+
except KeyError:
|
95
|
+
pass
|
96
|
+
else:
|
97
|
+
self.util.debug(f"reusing extracted {self.feat_type} embeddings")
|
98
|
+
self.df = pd.read_pickle(storage)
|
99
|
+
if self.df.isnull().values.any():
|
100
|
+
self.util.error(
|
101
|
+
f"got nan: {self.df.shape} {self.df.isnull().sum().sum()}"
|
102
|
+
)
|
103
|
+
|
104
|
+
def extract_sample(self, signal, sr):
|
105
|
+
self.init_model()
|
106
|
+
feats = self.get_embeddings(signal, sr, "no file")
|
107
|
+
return feats
|
108
|
+
|
109
|
+
# for each feature extractor
|
110
|
+
# feats_ast.py
|
111
|
+
|
112
|
+
# class Ast(TransformerFeatureExtractor):
|
113
|
+
# def preprocess_input(self, signal, sampling_rate):
|
114
|
+
# inputs = self.processor(signal.numpy(), sampling_rate=sampling_rate, return_tensors="pt")
|
115
|
+
# return {k: v.to(self.device) for k, v in inputs.items()}
|
116
|
+
|
117
|
+
# def extract_hidden_states(self, outputs):
|
118
|
+
# return outputs.last_hidden_state
|
119
|
+
|
120
|
+
# def get_embedding_dim(self):
|
121
|
+
# return self.model.config.hidden_size
|
122
|
+
|
123
|
+
# # feats_wav2vec2.py
|
124
|
+
|
125
|
+
# class Wav2vec2(TransformerFeatureExtractor):
|
126
|
+
# def preprocess_input(self, signal, sampling_rate):
|
127
|
+
# inputs = self.processor(signal, sampling_rate=sampling_rate, return_tensors="pt")
|
128
|
+
# return {k: v.to(self.device) for k, v in inputs.items()}
|
129
|
+
|
130
|
+
# def extract_hidden_states(self, outputs):
|
131
|
+
# return outputs.last_hidden_state
|
132
|
+
|
133
|
+
# def get_embedding_dim(self):
|
134
|
+
# return self.model.config.hidden_size
|
135
|
+
|
136
|
+
# # feats_wavlm.py
|
137
|
+
|
138
|
+
# class Wavlm(TransformerFeatureExtractor):
|
139
|
+
# def preprocess_input(self, signal, sampling_rate):
|
140
|
+
# inputs = self.processor(signal, sampling_rate=sampling_rate, return_tensors="pt")
|
141
|
+
# return {k: v.to(self.device) for k, v in inputs.items()}
|
142
|
+
|
143
|
+
# def extract_hidden_states(self, outputs):
|
144
|
+
return outputs.last_hidden_state
|
145
|
+
|
146
|
+
# def get_embedding_dim(self):
|
147
|
+
# return self.model.config.hidden_size
|
nkululeko/file_checker.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
from nkululeko.utils.util import Util
|
3
1
|
import os
|
4
2
|
|
3
|
+
from nkululeko.utils.util import Util
|
4
|
+
|
5
5
|
|
6
6
|
class FileChecker:
|
7
7
|
def __init__(self, df):
|
@@ -10,7 +10,7 @@ class FileChecker:
|
|
10
10
|
self.util.copy_flags(df, self.df)
|
11
11
|
check_vad = self.util.config_val("DATA", "check_vad", False)
|
12
12
|
if check_vad:
|
13
|
-
self.util.debug(
|
13
|
+
self.util.debug("This may take a while downloading the VAD model")
|
14
14
|
import torch
|
15
15
|
|
16
16
|
torch.set_num_threads(1)
|
nkululeko/filter_data.py
CHANGED
@@ -0,0 +1,83 @@
|
|
1
|
+
"""
|
2
|
+
Segment audio files in a given directory into smaller chunks with a specified length and overlap. This requires the PyDub library to be installed.
|
3
|
+
|
4
|
+
Args:
|
5
|
+
input_dir (str): The directory containing the audio files to be segmented.
|
6
|
+
output_dir (str): The directory where the segmented audio files will be saved.
|
7
|
+
segment_length (int): The length of each audio segment in milliseconds.
|
8
|
+
overlap (int): The overlap between adjacent audio segments in milliseconds.
|
9
|
+
|
10
|
+
This function will recursively search the input directory for all .wav audio files, and then segment each file into smaller chunks with the specified length and overlap. The segmented audio files will be saved in the output directory, preserving the relative directory structure from the input directory.
|
11
|
+
"""
|
12
|
+
|
13
|
+
import argparse
|
14
|
+
import glob
|
15
|
+
from pathlib import Path
|
16
|
+
|
17
|
+
from pydub import AudioSegment
|
18
|
+
|
19
|
+
|
20
|
+
# list audio files given a directory
|
21
|
+
def segment_audio(input_dir, output_dir, segment_length, overlap):
|
22
|
+
# check if input dir exist
|
23
|
+
if not Path(input_dir).exists():
|
24
|
+
print(f"Directory {input_dir} does not exist.")
|
25
|
+
return
|
26
|
+
|
27
|
+
# check if output dir exist, create if not
|
28
|
+
if not Path(output_dir).exists():
|
29
|
+
Path(output_dir).mkdir(parents=True)
|
30
|
+
|
31
|
+
audio_files = glob.glob(str(Path(input_dir) / "**" / "*.wav"), recursive=True)
|
32
|
+
|
33
|
+
for audio_file in audio_files:
|
34
|
+
# segment into 2 seconds with 1 second overlap (default values)
|
35
|
+
audio = AudioSegment.from_file(audio_file)
|
36
|
+
|
37
|
+
segments = []
|
38
|
+
|
39
|
+
for i in range(0, len(audio), segment_length - overlap):
|
40
|
+
segment = audio[i : i + segment_length]
|
41
|
+
segments.append(segment)
|
42
|
+
|
43
|
+
# Path(output_dir).mkdir(exist_ok=True)
|
44
|
+
for i, segment in enumerate(segments):
|
45
|
+
# get relative path from input_dir
|
46
|
+
relative_path = Path(audio_file).relative_to(input_dir)
|
47
|
+
# make output directory if not exist
|
48
|
+
output_subdir = Path(output_dir) / relative_path.parent
|
49
|
+
output_subdir.mkdir(parents=True, exist_ok=True)
|
50
|
+
segment.export(
|
51
|
+
str(output_subdir / f"{Path(audio_file).stem}_{i}.wav"),
|
52
|
+
format="wav",
|
53
|
+
)
|
54
|
+
|
55
|
+
print("DONE.")
|
56
|
+
|
57
|
+
|
58
|
+
if __name__ == "__main__":
|
59
|
+
parser = argparse.ArgumentParser()
|
60
|
+
parser.add_argument("--input_dir", type=str, default="./")
|
61
|
+
# add argument for output_dir
|
62
|
+
parser.add_argument(
|
63
|
+
"--output_dir",
|
64
|
+
type=str,
|
65
|
+
default="./segmented_data/",
|
66
|
+
)
|
67
|
+
parser.add_argument(
|
68
|
+
"--segment_length",
|
69
|
+
type=int,
|
70
|
+
default=2000,
|
71
|
+
)
|
72
|
+
parser.add_argument(
|
73
|
+
"--overlap",
|
74
|
+
type=int,
|
75
|
+
default=1000,
|
76
|
+
)
|
77
|
+
args = parser.parse_args()
|
78
|
+
segment_audio(
|
79
|
+
args.input_dir,
|
80
|
+
args.output_dir,
|
81
|
+
segment_length=args.segment_length,
|
82
|
+
overlap=args.overlap,
|
83
|
+
)
|
nkululeko/models/model.py
CHANGED
@@ -3,13 +3,11 @@ import ast
|
|
3
3
|
import pickle
|
4
4
|
import random
|
5
5
|
|
6
|
-
from joblib import parallel_backend
|
7
6
|
import numpy as np
|
8
7
|
import pandas as pd
|
9
|
-
from sklearn.model_selection import GridSearchCV
|
10
|
-
from sklearn.model_selection import LeaveOneGroupOut
|
11
|
-
from sklearn.model_selection import StratifiedKFold
|
12
8
|
import sklearn.utils
|
9
|
+
from joblib import parallel_backend
|
10
|
+
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, StratifiedKFold
|
13
11
|
|
14
12
|
import nkululeko.glob_conf as glob_conf
|
15
13
|
from nkululeko.reporting.reporter import Reporter
|
@@ -119,7 +117,7 @@ class Model:
|
|
119
117
|
# get unique list of speakers
|
120
118
|
speakers = annos["speaker"].unique()
|
121
119
|
# check for folds columns
|
122
|
-
if
|
120
|
+
if "fold" not in annos.columns:
|
123
121
|
self.util.debug(f"creating random folds for {logo} groups")
|
124
122
|
# create a random dictionary of groups
|
125
123
|
sdict = {}
|
nkululeko/models/model_bayes.py
CHANGED
nkululeko/models/model_cnn.py
CHANGED
@@ -6,23 +6,21 @@ Inspired by code from Su Lei
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
import ast
|
9
|
-
from collections import OrderedDict
|
10
9
|
|
11
10
|
import numpy as np
|
12
11
|
import pandas as pd
|
13
|
-
from PIL import Image
|
14
|
-
from sklearn.metrics import recall_score
|
15
12
|
import torch
|
16
13
|
import torch.nn as nn
|
17
14
|
import torch.nn.functional as F
|
18
|
-
from torch.utils.data import Dataset
|
19
15
|
import torchvision.transforms as transforms
|
16
|
+
from PIL import Image
|
17
|
+
from sklearn.metrics import recall_score
|
18
|
+
from torch.utils.data import Dataset
|
20
19
|
|
21
20
|
import nkululeko.glob_conf as glob_conf
|
22
21
|
from nkululeko.losses.loss_softf1loss import SoftF1Loss
|
23
22
|
from nkululeko.models.model import Model
|
24
23
|
from nkululeko.reporting.reporter import Reporter
|
25
|
-
from nkululeko.utils.util import Util
|
26
24
|
|
27
25
|
|
28
26
|
class CNNModel(Model):
|
@@ -55,7 +53,7 @@ class CNNModel(Model):
|
|
55
53
|
)
|
56
54
|
else:
|
57
55
|
self.util.error(f"unknown loss function: {criterion}")
|
58
|
-
self.util.debug(
|
56
|
+
self.util.debug("using model with cross entropy loss function")
|
59
57
|
# set up the model
|
60
58
|
# cuda = "cuda" if torch.cuda.is_available() else "cpu"
|
61
59
|
self.device = self.util.config_val("MODEL", "device", "cpu")
|
nkululeko/models/model_gmm.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
# model_gmm.py
|
2
2
|
|
3
|
+
import pandas as pd
|
3
4
|
from sklearn import mixture
|
5
|
+
|
4
6
|
from nkululeko.models.model import Model
|
5
|
-
|
7
|
+
|
6
8
|
|
7
9
|
class GMM_model(Model):
|
8
10
|
"""An GMM model"""
|
@@ -15,23 +17,25 @@ class GMM_model(Model):
|
|
15
17
|
self.n_components = int(self.util.config_val("MODEL", "GMM_components", "4"))
|
16
18
|
covariance_type = self.util.config_val("MODEL", "GMM_covariance_type", "full")
|
17
19
|
self.clf = mixture.GaussianMixture(
|
18
|
-
n_components=self.n_components,
|
20
|
+
n_components=self.n_components,
|
19
21
|
covariance_type=covariance_type,
|
20
|
-
random_state
|
22
|
+
random_state=42,
|
21
23
|
)
|
22
24
|
# set up the classifier
|
23
25
|
|
24
26
|
def get_predictions(self):
|
25
|
-
"""Use the predict_proba method of the GaussianMixture model to get
|
26
|
-
probabilities. Create a DataFrame with these probabilities and return
|
27
|
+
"""Use the predict_proba method of the GaussianMixture model to get
|
28
|
+
probabilities. Create a DataFrame with these probabilities and return
|
27
29
|
it along with the predictions."""
|
28
30
|
probs = self.clf.predict_proba(self.feats_test)
|
29
31
|
preds = self.clf.predict(self.feats_test)
|
30
|
-
|
32
|
+
|
31
33
|
# Convert predictions to a list
|
32
34
|
preds = preds.tolist()
|
33
|
-
|
35
|
+
|
34
36
|
# Create a DataFrame for probabilities
|
35
|
-
proba_df = pd.DataFrame(
|
36
|
-
|
37
|
+
proba_df = pd.DataFrame(
|
38
|
+
probs, index=self.feats_test.index, columns=range(self.n_components)
|
39
|
+
)
|
40
|
+
|
37
41
|
return preds, proba_df
|
nkululeko/models/model_knn.py
CHANGED
nkululeko/models/model_mlp.py
CHANGED
@@ -4,14 +4,13 @@ from collections import OrderedDict
|
|
4
4
|
|
5
5
|
import numpy as np
|
6
6
|
import pandas as pd
|
7
|
-
from sklearn.metrics import recall_score
|
8
7
|
import torch
|
8
|
+
from sklearn.metrics import recall_score
|
9
9
|
|
10
10
|
import nkululeko.glob_conf as glob_conf
|
11
11
|
from nkululeko.losses.loss_softf1loss import SoftF1Loss
|
12
12
|
from nkululeko.models.model import Model
|
13
13
|
from nkululeko.reporting.reporter import Reporter
|
14
|
-
from nkululeko.utils.util import Util
|
15
14
|
|
16
15
|
|
17
16
|
class MLPModel(Model):
|
@@ -44,7 +43,7 @@ class MLPModel(Model):
|
|
44
43
|
)
|
45
44
|
else:
|
46
45
|
self.util.error(f"unknown loss function: {criterion}")
|
47
|
-
self.util.debug(
|
46
|
+
self.util.debug("using model with cross entropy loss function")
|
48
47
|
# set up the model, use GPU if availabe
|
49
48
|
cuda = "cuda" if torch.cuda.is_available() else "cpu"
|
50
49
|
self.device = self.util.config_val("MODEL", "device", cuda)
|
@@ -1,15 +1,10 @@
|
|
1
1
|
# model_mlp.py
|
2
2
|
import ast
|
3
3
|
from collections import OrderedDict
|
4
|
-
import os
|
5
4
|
|
6
5
|
import numpy as np
|
7
6
|
import torch
|
8
|
-
|
9
|
-
from audmetric import concordance_cc
|
10
|
-
from audmetric import mean_absolute_error
|
11
|
-
from audmetric import mean_squared_error
|
12
|
-
from traitlets import default
|
7
|
+
from audmetric import concordance_cc, mean_absolute_error, mean_squared_error
|
13
8
|
|
14
9
|
import nkululeko.glob_conf as glob_conf
|
15
10
|
from nkululeko.losses.loss_ccc import ConcordanceCorCoeff
|
nkululeko/models/model_svm.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# model_svm.py
|
2
2
|
|
3
|
-
import random
|
4
3
|
from sklearn import svm
|
4
|
+
|
5
5
|
from nkululeko.models.model import Model
|
6
6
|
|
7
7
|
|
@@ -25,7 +25,7 @@ class SVM_model(Model):
|
|
25
25
|
gamma="scale",
|
26
26
|
probability=True,
|
27
27
|
class_weight=class_weight,
|
28
|
-
random_state=42,
|
28
|
+
random_state=42, # for consistent result
|
29
29
|
) # set up the classifier
|
30
30
|
|
31
31
|
def set_c(self, c):
|
nkululeko/models/model_svr.py
CHANGED
nkululeko/models/model_tree.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# model_tree.py
|
2
2
|
|
3
3
|
from sklearn.tree import DecisionTreeClassifier
|
4
|
+
|
4
5
|
from nkululeko.models.model import Model
|
5
6
|
|
6
7
|
|
@@ -12,6 +13,4 @@ class Tree_model(Model):
|
|
12
13
|
def __init__(self, df_train, df_test, feats_train, feats_test):
|
13
14
|
super().__init__(df_train, df_test, feats_train, feats_test)
|
14
15
|
self.name = "tree"
|
15
|
-
self.clf = DecisionTreeClassifier(
|
16
|
-
random_state=42
|
17
|
-
) # set up the classifier
|
16
|
+
self.clf = DecisionTreeClassifier(random_state=42) # set up the classifier
|