PyPI - nkululeko - Versions diffs - 0.95.9__py3-none-any.whl → 0.96.1__py3-none-any.whl - Mend

nkululeko 0.95.9__py3-none-any.whl → 0.96.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

nkululeko/constants.py CHANGED Viewed

@@ -1,2 +1,2 @@
-VERSION="0.95.9"
+VERSION="0.96.1"
 SAMPLING_RATE = 16000

nkululeko/feat_extract/feats_bert.py ADDED Viewed

@@ -0,0 +1,105 @@
+import os
+import pandas as pd
+from tqdm import tqdm
+import transformers
+import torch
+from transformers import BertTokenizer, BertModel
+from nkululeko.feat_extract.featureset import Featureset
+import nkululeko.glob_conf as glob_conf
+class Bert(Featureset):
+    """Class to extract bert embeddings"""
+    def __init__(self, name, data_df, feat_type):
+        """Constructor.
+        If_train is needed to distinguish from test/dev sets,
+        because they use the codebook from the training
+        """
+        super().__init__(name, data_df, feat_type)
+        cuda = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = self.util.config_val("MODEL", "device", cuda)
+        self.model_initialized = False
+        if feat_type == "bert":
+            self.feat_type = "bert-base-uncased"
+        else:
+            self.feat_type = feat_type
+    def init_model(self):
+        # load model
+        self.util.debug(f"loading {self.feat_type} model...")
+        model_path = self.util.config_val(
+            "FEATS", "bert.model", f"google-bert/{self.feat_type}"
+        )
+        config = transformers.AutoConfig.from_pretrained(model_path)
+        layer_num = config.num_hidden_layers
+        hidden_layer = int(self.util.config_val("FEATS", "bert.layer", "0"))
+        config.num_hidden_layers = layer_num - hidden_layer
+        self.util.debug(f"using hidden layer #{config.num_hidden_layers}")
+        self.tokenizer = BertTokenizer.from_pretrained(model_path)
+        self.model = BertModel.from_pretrained(model_path, config=config).to(
+            self.device
+        )
+        print(f"initialized {self.feat_type} model on {self.device}")
+        self.model.eval()
+        self.model_initialized = True
+    def extract(self):
+        """Extract the features or load them from disk if present."""
+        store = self.util.get_path("store")
+        storage = os.path.join(store, f"{self.name}.pkl")
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
+        no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
+        if extract or no_reuse or not os.path.isfile(storage):
+            if not self.model_initialized:
+                self.init_model()
+            self.util.debug(
+                f"extracting {self.feat_type} embeddings, this might take a while..."
+            )
+            emb_series = pd.Series(index=self.data_df.index, dtype=object)
+            for idx, row in tqdm(self.data_df.iterrows(), total=len(self.data_df)):
+                file = idx[0]
+                text = row['text']
+                emb = self.get_embeddings(text, file)
+                emb_series[idx] = emb
+            # print(f"emb_series shape: {emb_series.shape}")
+            self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
+            # print(f"df shape: {self.df.shape}")
+            self.df.to_pickle(storage)
+            try:
+                glob_conf.config["DATA"]["needs_feature_extraction"] = "false"
+            except KeyError:
+                pass
+        else:
+            self.util.debug(f"reusing extracted {self.feat_type} embeddings")
+            self.df = pd.read_pickle(storage)
+            if self.df.isnull().values.any():
+                self.util.error(
+                    f"got nan: {self.df.shape} {self.df.isnull().sum().sum()}"
+                )
+    def get_embeddings(self, text, file):
+        r"""Extract embeddings from raw audio signal."""
+        try:
+            with torch.no_grad():
+                inputs = self.tokenizer(text, return_tensors="pt")
+                outputs = self.model(**inputs)
+                # mean pooling
+                y = torch.mean(outputs[0], dim=1)
+                y = y.ravel()
+        except RuntimeError as re:
+            print(str(re))
+            self.util.error(f"couldn't extract file: {file}")
+            y = None
+        if y is None:
+            return None
+        return y.detach().cpu().numpy()
+    def extract_sample(self, text):
+        self.init_model()
+        feats = self.get_embeddings(text, "no file")
+        return feats

nkululeko/feature_extractor.py CHANGED Viewed

@@ -80,7 +80,7 @@ class FeatureExtractor:
             return MLD_set
         elif feats_type.startswith(
-            ("wav2vec2", "hubert", "wavlm", "spkrec", "whisper", "ast", "emotion2vec")
+            ("bert", "wav2vec2", "hubert", "wavlm", "spkrec", "whisper", "ast", "emotion2vec")
         ):
             return self._get_feat_extractor_by_prefix(feats_type)

nkululeko/reporting/reporter.py CHANGED Viewed

@@ -198,7 +198,9 @@ class Reporter:
             )
     def plot_proba_conf(self):
-        uncertainty_threshold = self.util.config_val("PLOT", "uncertainty_threshold", False)
+        uncertainty_threshold = self.util.config_val(
+            "PLOT", "uncertainty_threshold", False
+        )
         if uncertainty_threshold:
             uncertainty_threshold = float(uncertainty_threshold)
             old_size = self.probas.shape[0]
@@ -210,9 +212,13 @@ class Reporter:
             )
             truths = df["truth"].values
             preds = df["predicted"].values
-            self._plot_confmat(truths, preds, f"uncertainty_less_than_{uncertainty_threshold}_cnf",
-                            epoch=None, test_result=None)
+            self._plot_confmat(
+                truths,
+                preds,
+                f"uncertainty_less_than_{uncertainty_threshold}_cnf",
+                epoch=None,
+                test_result=None,
+            )
     def set_id(self, run, epoch):
         """Make the report identifiable with run and epoch index."""
@@ -434,7 +440,10 @@ class Reporter:
             self.util.debug(f"####->{file_name}<-####")
             file_name = f"{res_dir}{file_name}{self.filenameadd}.txt"
         if self.util.exp_is_classification():
-            labels = glob_conf.labels
+            if glob_conf.label_encoder is not None:
+                labels = glob_conf.label_encoder.classes_
+            else:
+                labels = glob_conf.labels
             try:
                 rpt = classification_report(
                     self.truths,
@@ -451,9 +460,7 @@ class Reporter:
                     target_names=s_labels,
                     digits=4,
                 )
-                self.util.debug(
-                    f"\n {class_report_str}"
-                )
+                self.util.debug(f"\n {class_report_str}")
             except ValueError as e:
                 self.util.debug(
                     "Reporter: caught a ValueError when trying to get"

{nkululeko-0.95.9.dist-info → nkululeko-0.96.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nkululeko
-Version: 0.95.9
+Version: 0.96.1
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt

{nkululeko-0.95.9.dist-info → nkululeko-0.96.1.dist-info}/RECORD RENAMED Viewed

@@ -4,7 +4,7 @@ nkululeko/aug_train.py,sha256=wpiHCJ7zsW38kumg3ypwXZe2HQrhUblAnv7P2QeJnAc,3525
 nkululeko/augment.py,sha256=3RzaxB3gRxovgJVjHXi0glprW01J7RaHhUkqotW2T3U,2955
 nkululeko/balance.py,sha256=r7opXbrqAipm2euPPaOmLlA5J10p2bHQgO5kWk2x9ro,8702
 nkululeko/cacheddataset.py,sha256=XFpWZmbJRg0pvhnIgYf0TkclxllD-Fctu-Ol0PF_00c,969
-nkululeko/constants.py,sha256=t_C_hQqVC1idXJB6HHr1m7ZtCYC5JVvqhYrVLRhzwIw,39
+nkululeko/constants.py,sha256=SGFx3gfUvHqg9Qr8X6a1NxL8ovndS4SAEy6Seh65maE,39
 nkululeko/demo-ft.py,sha256=iD9Pzp9QjyAv31q1cDZ75vPez7Ve8A4Cfukv5yfZdrQ,770
 nkululeko/demo.py,sha256=tu7Al2l5MCLVegkDC-NE2wcuc_YE7NRbgOlPW3yhGEs,4940
 nkululeko/demo_feats.py,sha256=BvZjeNFTlERIRlq34OHM4Z96jdDQAhB01BGQAUcX9dM,2026
@@ -13,7 +13,7 @@ nkululeko/ensemble.py,sha256=71V-rre61H3J4sh7lu-OTo4I2_g7mm_rQxwW1ARDHgY,12782
 nkululeko/experiment.py,sha256=TG9G9kSETT_R8d92aRKMMsb0HRGyM_GBFHBsU9A6ppw,38633
 nkululeko/explore.py,sha256=PjNcLuPdvWqCqYXUvGhd0hBijIhzdyi3ED1RF6o5Gjk,4212
 nkululeko/export.py,sha256=U-V4acxtuL6qKt6oAsVcM5TTeWogYUJ3GU-lA6rq6d4,4336
-nkululeko/feature_extractor.py,sha256=CsKmBoxwNClRGu20ox_eCxMG4u_1OH8Y83FYw7GfUwA,4230
+nkululeko/feature_extractor.py,sha256=d3G42OOh315Aho-yLaFT739U0UI8otiB1I4ZksK8kfg,4238
 nkululeko/file_checker.py,sha256=xJY0Q6w47pnmgJVK5rcAKPYBrCpV7eBT4_3YBzTx-H8,3454
 nkululeko/filter_data.py,sha256=4sGrKvMZ_hLnJPrHm_CqjDPKIRV8REWoT7nfSYGXbwo,7305
 nkululeko/fixedsegment.py,sha256=Tb92QiuiyMsOO3WRWwuGjZGibS8hbHHCrcWAXGk7g04,2868
@@ -69,6 +69,7 @@ nkululeko/feat_extract/feats_analyser.py,sha256=lodim7qQ8M7c3iMeJ5bHQ-nCy9Cehx1X
 nkululeko/feat_extract/feats_ast.py,sha256=w62xEoLiFtU-rj6SXkqXAktmoFaXcAcAWpUyEjp8JWo,4652
 nkululeko/feat_extract/feats_auddim.py,sha256=CGLp_aYhudfwoU5522vjrvjPxfZcyw593A8xLjYefV8,3134
 nkululeko/feat_extract/feats_audmodel.py,sha256=OsZyB1rdcG0Fai2gAwBlbuubmWor1_-P4IDkZLqgPKE,3161
+nkululeko/feat_extract/feats_bert.py,sha256=KgWLYLA11e86ubY1KtUk74QGOrZaiUongn-2LKWyf1M,4114
 nkululeko/feat_extract/feats_clap.py,sha256=1tttpfm2SJmQgYm2u8eUVpDiDOpWdKqFChpY3ZZokNs,3395
 nkululeko/feat_extract/feats_emotion2vec.py,sha256=LnV8xEg7L7HIDqz0ulqUNoaAHBU0d5gyQPb2_32T_18,9694
 nkululeko/feat_extract/feats_hubert.py,sha256=F3vrPCkx8EimJjFWYCZ7Yg9uo1G3NjYt4UKrGIUev8k,5172
@@ -122,7 +123,7 @@ nkululeko/reporting/defines.py,sha256=0vh-Tlx4fAPpk1o6mP_4x3EkIoqzYMr38IZnj-JM5z
 nkululeko/reporting/latex_writer.py,sha256=NGwSIfd4nfslDkNUOSZSdqY_VDLA8634thyhe-vj1bY,1824
 nkululeko/reporting/report.py,sha256=B5eoIKMz46VKDBsi7M9u_iegzAD-E3eGCmolzSFjZ3c,1118
 nkululeko/reporting/report_item.py,sha256=drkknsyFhGviaPJNmPQtCXJmRhTSSfjNcJt0Bls6JAA,533
-nkululeko/reporting/reporter.py,sha256=ITxM5O9Hoe_1z_59g-GF4b9vciR4shokZxeFzCrDaag,21869
+nkululeko/reporting/reporter.py,sha256=usmc2GsqGua40p8AbV08oAcZdvoelP72vsG1xS1NzOQ,22051
 nkululeko/reporting/result.py,sha256=G63a2tHCwHhM6NBJgYzsWKWJm4Yu3r4hsCHA2Km7eHU,1073
 nkululeko/segmenting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nkululeko/segmenting/seg_inaspeechsegmenter.py,sha256=b3t0zdpJYofKWMyKRMtMMX91xeR-k8d5pbnNaQHcsOE,1902
@@ -136,9 +137,9 @@ nkululeko/utils/files.py,sha256=SrrYaU7AB80MZHiV1jcB0h_zigvYLYgSVNTXV4ao38g,4593
 nkululeko/utils/stats.py,sha256=3Fyx8q8BSKYmiufT6OkRug9RATWmGrr9BaX_y8jziWo,3074
 nkululeko/utils/unzip.py,sha256=G68f5120TjwACZC3bQcneMniddnwubPbBdMc2L5KBOo,1206
 nkululeko/utils/util.py,sha256=s7Hd7Ju1r3_WCw8gLD9YK4O6k3S_WhFcN2-XZBSctSM,18705
-nkululeko-0.95.9.dist-info/licenses/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
-nkululeko-0.95.9.dist-info/METADATA,sha256=WhITXnJHYD5GhyATjEb7kJhmMecWRu-BeMBw7pSWNdc,21998
-nkululeko-0.95.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nkululeko-0.95.9.dist-info/entry_points.txt,sha256=lNTkFEdh6Kjo5o95ZAWf_0Lq-4ztGoAoMVSDuPtuyS0,442
-nkululeko-0.95.9.dist-info/top_level.txt,sha256=bf1k1YKkqcXemNX_cUgoyKqQ3_GVErPqAY-53J36jkM,19
-nkululeko-0.95.9.dist-info/RECORD,,
+nkululeko-0.96.1.dist-info/licenses/LICENSE,sha256=0zGP5B_W35yAcGfHPS18Q2B8UhvLRY3dQq1MhpsJU_U,1076
+nkululeko-0.96.1.dist-info/METADATA,sha256=y6Yr1jPBXvOEyS8VZiFa_as56oS8_h0Z-fYAkoBdky8,21998
+nkululeko-0.96.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nkululeko-0.96.1.dist-info/entry_points.txt,sha256=lNTkFEdh6Kjo5o95ZAWf_0Lq-4ztGoAoMVSDuPtuyS0,442
+nkululeko-0.96.1.dist-info/top_level.txt,sha256=bf1k1YKkqcXemNX_cUgoyKqQ3_GVErPqAY-53J36jkM,19
+nkululeko-0.96.1.dist-info/RECORD,,

{nkululeko-0.95.9.dist-info → nkululeko-0.96.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{nkululeko-0.95.9.dist-info → nkululeko-0.96.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nkululeko-0.95.9.dist-info → nkululeko-0.96.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{nkululeko-0.95.9.dist-info → nkululeko-0.96.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

nkululeko 0.95.9__py3-none-any.whl → 0.96.1__py3-none-any.whl

nkululeko 0.95.9__py3-none-any.whl → 0.96.1__py3-none-any.whl