PyPI - nkululeko - Versions diffs - 0.93.10__tar.gz → 0.93.12__tar.gz - Mend

nkululeko 0.93.10__tar.gz → 0.93.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (173) hide show

{nkululeko-0.93.10 → nkululeko-0.93.12}/CHANGELOG.md RENAMED Viewed

@@ -1,8 +1,16 @@
 Changelog
 =========
+Version 0.93.12
+---------------
+* bugfix: map_continuous_to_cat crashed on empty data
+Version 0.93.11
+---------------
+* bugfix: silero segmenter assigned file duration values
 Version 0.93.10
---------------
+---------------
 * added nan check for imported features
 * added LOGO result output

{nkululeko-0.93.10/nkululeko.egg-info → nkululeko-0.93.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: nkululeko
-Version: 0.93.10
+Version: 0.93.12
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt
@@ -63,7 +63,7 @@ Requires-Dist: pylatex
 ## Overview
 A project to detect speaker characteristics by machine learning experiments with a high-level interface.
-The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
+The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
 * NEW with nkululeko: [Ensemble learning](http://blog.syntheticspeech.de/2024/06/25/nkululeko-ensemble-classifiers-with-late-fusion/)
 * NEW: [Finetune transformer-models](http://blog.syntheticspeech.de/2024/05/29/nkululeko-how-to-finetune-a-transformer-model/)
@@ -353,8 +353,16 @@ If you use it, please mention the Nkululeko paper:
 Changelog
 =========
+Version 0.93.12
+---------------
+* bugfix: map_continuous_to_cat crashed on empty data
+Version 0.93.11
+---------------
+* bugfix: silero segmenter assigned file duration values
 Version 0.93.10
---------------
+---------------
 * added nan check for imported features
 * added LOGO result output

{nkululeko-0.93.10 → nkululeko-0.93.12}/README.md RENAMED Viewed

@@ -22,7 +22,7 @@
 ## Overview
 A project to detect speaker characteristics by machine learning experiments with a high-level interface.
-The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
+The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
 * NEW with nkululeko: [Ensemble learning](http://blog.syntheticspeech.de/2024/06/25/nkululeko-ensemble-classifiers-with-late-fusion/)
 * NEW: [Finetune transformer-models](http://blog.syntheticspeech.de/2024/05/29/nkululeko-how-to-finetune-a-transformer-model/)

{nkululeko-0.93.10 → nkululeko-0.93.12}/nkululeko/constants.py RENAMED Viewed

@@ -1,2 +1,2 @@
-VERSION="0.93.10"
+VERSION="0.93.12"
 SAMPLING_RATE = 16000

{nkululeko-0.93.10 → nkululeko-0.93.12}/nkululeko/data/dataset.py RENAMED Viewed

@@ -34,9 +34,11 @@ class Dataset:
         self.plot = Plots()
         self.limit = int(self.util.config_val_data(self.name, "limit", 0))
         self.target_tables_append = eval(
-            self.util.config_val_data(self.name, "target_tables_append", "False")
+            self.util.config_val_data(
+                self.name, "target_tables_append", "False")
         )
-        self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
+        self.start_fresh = eval(
+            self.util.config_val("DATA", "no_reuse", "False"))
         self.is_labeled, self.got_speaker, self.got_gender, self.got_age = (
             False,
             False,
@@ -70,7 +72,8 @@ class Dataset:
         try:
             self.db = audformat.Database.load(root)
         except FileNotFoundError:
-            self.util.error(f"{self.name}: no audformat database found at {root}")
+            self.util.error(
+                f"{self.name}: no audformat database found at {root}")
         return root
     def _check_cols(self, df):
@@ -92,7 +95,8 @@ class Dataset:
         )
         self.util.debug(r_string)
         if glob_conf.report.initial:
-            glob_conf.report.add_item(ReportItem("Data", "Load report", r_string))
+            glob_conf.report.add_item(ReportItem(
+                "Data", "Load report", r_string))
             glob_conf.report.initial = False
     def load(self):
@@ -103,7 +107,8 @@ class Dataset:
         store_file = f"{store}{self.name}.{store_format}"
         self.root = self._load_db()
         if not self.start_fresh and os.path.isfile(store_file):
-            self.util.debug(f"{self.name}: reusing previously stored file {store_file}")
+            self.util.debug(
+                f"{self.name}: reusing previously stored file {store_file}")
             self.df = self.util.get_store(store_file, store_format)
             self.is_labeled = self.target in self.df
             self.got_gender = "gender" in self.df
@@ -118,10 +123,12 @@ class Dataset:
         # map the audio file paths
         self.db.map_files(lambda x: os.path.join(self.root, x))
         # the dataframes (potentially more than one) with at least the file names
-        df_files = self.util.config_val_data(self.name, "files_tables", "['files']")
+        df_files = self.util.config_val_data(
+            self.name, "files_tables", "['files']")
         df_files_tables = ast.literal_eval(df_files)
         # The label for the target column
-        self.col_label = self.util.config_val_data(self.name, "label", self.target)
+        self.col_label = self.util.config_val_data(
+            self.name, "label", self.target)
         (
             df,
             self.is_labeled,
@@ -157,7 +164,8 @@ class Dataset:
                 self.got_age = got_age2 or self.got_age
                 if audformat.is_filewise_index(df_target.index):
                     try:
-                        df_target = df_target.loc[df.index.get_level_values("file")]
+                        df_target = df_target.loc[df.index.get_level_values(
+                            "file")]
                         df_target = df_target.set_index(df.index)
                     except KeyError:
                         # just a try...
@@ -206,7 +214,8 @@ class Dataset:
             end = self.df.index.get_level_values(2)
             self.df["duration"] = (end - start).total_seconds()
         elif self.df.duration.dtype == "timedelta64[ns]":
-            self.df["duration"] = self.df["duration"].map(lambda x: x.total_seconds())
+            self.df["duration"] = self.df["duration"].map(
+                lambda x: x.total_seconds())
         # Perform some filtering if desired
         required = self.util.config_val_data(self.name, "required", False)
         if required:
@@ -236,15 +245,18 @@ class Dataset:
                 res.append(abs(n - max))
             return res
-        reverse = eval(self.util.config_val_data(self.name, "reverse", "False"))
+        reverse = eval(self.util.config_val_data(
+            self.name, "reverse", "False"))
         if reverse:
-            max = eval(self.util.config_val_data(self.name, "reverse.max", "False"))
+            max = eval(self.util.config_val_data(
+                self.name, "reverse.max", "False"))
             if max:
                 max = float(max)
             else:
                 max = self.df[self.target].values.max()
             self.util.debug(f"reversing target numbers with max values: {max}")
-            self.df[self.target] = reverse_array(self.df[self.target].values, max)
+            self.df[self.target] = reverse_array(
+                self.df[self.target].values, max)
         # check if the target variable should be scaled (z-transformed)
         scale = self.util.config_val_data(self.name, "scale", False)
@@ -317,13 +329,15 @@ class Dataset:
                 pass
             try:
                 # also it might be possible that the age is part of the speaker description
-                df_local["age"] = db[table]["speaker"].get(map="age").astype(int)
+                df_local["age"] = db[table]["speaker"].get(
+                    map="age").astype(int)
                 got_age = True
             except (ValueError, audformat.errors.BadKeyError):
                 pass
             try:
                 # same for the target, e.g. "age"
-                df_local[self.target] = db[table]["speaker"].get(map=self.target)
+                df_local[self.target] = db[table]["speaker"].get(
+                    map=self.target)
                 is_labeled = True
             except (ValueError, audformat.core.errors.BadKeyError):
                 pass
@@ -384,8 +398,10 @@ class Dataset:
             testdf = self.db.tables[self.target + ".test"].df
             traindf = self.db.tables[self.target + ".train"].df
             # use only the train and test samples that were not perhaps filtered out by an earlier processing step
-            self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
-            self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
+            self.df_test = self.df.loc[self.df.index.intersection(
+                testdf.index)]
+            self.df_train = self.df.loc[self.df.index.intersection(
+                traindf.index)]
         elif split_strategy == "train":
             self.df_train = self.df
             self.df_test = pd.DataFrame()
@@ -408,18 +424,23 @@ class Dataset:
             if entry_train_tables:
                 train_tables = ast.literal_eval(entry_train_tables)
                 for train_table in train_tables:
-                    traindf = pd.concat([traindf, self.db.tables[train_table].df])
+                    traindf = pd.concat(
+                        [traindf, self.db.tables[train_table].df])
             # use only the train and test samples that were not perhaps filtered out by an earlier processing step
             # testdf.index.map(lambda x: os.path.join(self.root, x))
             #            testdf.index = testdf.index.to_series().apply(lambda x: self.root+x)
             testdf = testdf.set_index(
-                audformat.utils.to_segmented_index(testdf.index, allow_nat=False)
+                audformat.utils.to_segmented_index(
+                    testdf.index, allow_nat=False)
             )
             traindf = traindf.set_index(
-                audformat.utils.to_segmented_index(traindf.index, allow_nat=False)
+                audformat.utils.to_segmented_index(
+                    traindf.index, allow_nat=False)
             )
-            self.df_test = self.df.loc[self.df.index.intersection(testdf.index)]
-            self.df_train = self.df.loc[self.df.index.intersection(traindf.index)]
+            self.df_test = self.df.loc[self.df.index.intersection(
+                testdf.index)]
+            self.df_train = self.df.loc[self.df.index.intersection(
+                traindf.index)]
             # it might be necessary to copy the target values
             try:
                 self.df_test[self.target] = testdf[self.target]
@@ -446,10 +467,12 @@ class Dataset:
             self.util.error(f"unknown split strategy: {split_strategy}")
         # check if train or test set should be ignored
-        as_test = eval(self.util.config_val_data(self.name, "as_test", "False"))
+        as_test = eval(self.util.config_val_data(
+            self.name, "as_test", "False"))
         if as_test:
             self.df_train = pd.DataFrame()
-        as_train = eval(self.util.config_val_data(self.name, "as_train", "False"))
+        as_train = eval(self.util.config_val_data(
+            self.name, "as_train", "False"))
         if as_train:
             self.df_test = pd.DataFrame()
@@ -480,7 +503,8 @@ class Dataset:
         seed = 42
         k = 30
-        test_size = int(self.util.config_val_data(self.name, "test_size", 20)) / 100.0
+        test_size = int(self.util.config_val_data(
+            self.name, "test_size", 20)) / 100.0
         df = self.df
         # split target
         targets = df[self.target].to_numpy()
@@ -496,7 +520,8 @@ class Dataset:
         stratif_vars = self.util.config_val("DATA", "balance", False)
         stratif_vars_array = {}
         if not stratif_vars:
-            self.util.error("balanced split needs stratif_vars to stratify the splits")
+            self.util.error(
+                "balanced split needs stratif_vars to stratify the splits")
         else:
             stratif_vars = ast.literal_eval(stratif_vars)
             for stratif_var in stratif_vars.keys():
@@ -505,7 +530,8 @@ class Dataset:
                     continue
                 else:
                     data = df[stratif_var].to_numpy()
-                    bins = self.util.config_val("DATA", f"{stratif_var}_bins", False)
+                    bins = self.util.config_val(
+                        "DATA", f"{stratif_var}_bins", False)
                     if bins:
                         data = binning(data, nbins=int(bins))
                     stratif_vars_array[stratif_var] = data
@@ -556,7 +582,8 @@ class Dataset:
     def split_speakers(self):
         """One way to split train and eval sets: Specify percentage of evaluation speakers"""
-        test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
+        test_percent = int(self.util.config_val_data(
+            self.name, "test_size", 20))
         df = self.df
         s_num = df.speaker.nunique()
         test_num = int(s_num * (test_percent / 100))
@@ -575,7 +602,8 @@ class Dataset:
     def random_split(self):
         """One way to split train and eval sets: Specify percentage of random samples"""
-        test_percent = int(self.util.config_val_data(self.name, "test_size", 20))
+        test_percent = int(self.util.config_val_data(
+            self.name, "test_size", 20))
         df = self.df
         s_num = len(df)
         test_num = int(s_num * (test_percent / 100))
@@ -676,8 +704,11 @@ class Dataset:
     def map_continuous_classification(self, df):
         """Map labels to bins for continuous data that should be classified"""
+        if df.empty:
+            return
         if self.check_continuous_classification():
-            self.util.debug(f"{self.name}: binning continuous variable to categories")
+            self.util.debug(
+                f"{self.name}: binning continuous variable to categories")
             cat_vals = self.util.continuous_to_categorical(df[self.target])
             df[self.target] = cat_vals.values
             labels = ast.literal_eval(glob_conf.config["DATA"]["labels"])

{nkululeko-0.93.10 → nkululeko-0.93.12}/nkululeko/plots.py RENAMED Viewed

@@ -628,8 +628,7 @@ class Plots:
         # one up because of the runs
         fig_dir = self.util.get_path("fig_dir") + "../"
         exp_name = self.util.get_exp_name(only_data=True)
-        format = self.util.config_val("PLOT", "format", "png")
-        filename = f"{fig_dir}{exp_name}EXPL_tree-plot.{format}"
+        filename = f"{fig_dir}{exp_name}EXPL_tree-plot.{self.format}"
         fig = ax.figure
         fig.savefig(filename)
         fig.clear()

{nkululeko-0.93.10 → nkululeko-0.93.12}/nkululeko/segment.py RENAMED Viewed

@@ -62,6 +62,11 @@ def main():
     expr.fill_train_and_tests()
     util.debug(f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")
+    def calc_dur(x):
+        starts = x[1]
+        ends = x[2]
+        return (ends - starts).total_seconds()
     # segment
     segmented_file = util.config_val("SEGMENT", "result", "segmented.csv")
@@ -104,16 +109,11 @@ def main():
             df_seg = df_seg.drop(columns=[target])
             df_seg = df_seg.rename(columns={"class_label": target})
         # save file
+        df_seg["duration"] = df_seg.index.to_series().map(lambda x: calc_dur(x))
         df_seg.to_csv(f"{expr.data_dir}/{segmented_file}")
-    def calc_dur(x):
-        starts = x[1]
-        ends = x[2]
-        return (ends - starts).total_seconds()
     if "duration" not in df.columns:
         df["duration"] = df.index.to_series().map(lambda x: calc_dur(x))
-    df_seg["duration"] = df_seg.index.to_series().map(lambda x: calc_dur(x))
     num_before = df.shape[0]
     num_after = df_seg.shape[0]
     util.debug(

{nkululeko-0.93.10 → nkululeko-0.93.12/nkululeko.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: nkululeko
-Version: 0.93.10
+Version: 0.93.12
 Summary: Machine learning audio prediction experiments based on templates
 Home-page: https://github.com/felixbur/nkululeko
 Author: Felix Burkhardt
@@ -63,7 +63,7 @@ Requires-Dist: pylatex
 ## Overview
 A project to detect speaker characteristics by machine learning experiments with a high-level interface.
-The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
+The idea is to have a framework (based on e.g. sklearn and torch) that can be used to rapidly and automatically analyse audio data and explore machine learning models based on that data.
 * NEW with nkululeko: [Ensemble learning](http://blog.syntheticspeech.de/2024/06/25/nkululeko-ensemble-classifiers-with-late-fusion/)
 * NEW: [Finetune transformer-models](http://blog.syntheticspeech.de/2024/05/29/nkululeko-how-to-finetune-a-transformer-model/)
@@ -353,8 +353,16 @@ If you use it, please mention the Nkululeko paper:
 Changelog
 =========
+Version 0.93.12
+---------------
+* bugfix: map_continuous_to_cat crashed on empty data
+Version 0.93.11
+---------------
+* bugfix: silero segmenter assigned file duration values
 Version 0.93.10
---------------
+---------------
 * added nan check for imported features
 * added LOGO result output

{nkululeko-0.93.10 → nkululeko-0.93.12}/nkululeko.egg-info/SOURCES.txt RENAMED Viewed

@@ -4,51 +4,6 @@ README.md
 pyproject.toml
 setup.cfg
 setup.py
-data/aesdd/process_database.py
-data/androids/process_database.py
-data/ased/process_database.py
-data/asvp-esd/process_database.py
-data/baved/process_database.py
-data/cafe/process_database.py
-data/clac/process_database.py
-data/cmu-mosei/process_database.py
-data/demos/process_database.py
-data/ekorpus/process_database.py
-data/emns/process_database.py
-data/emofilm/convert_to_16k.py
-data/emofilm/process_database.py
-data/emorynlp/process_database.py
-data/emov-db/process_database.py
-data/emovo/process_database.py
-data/emozionalmente/create.py
-data/enterface/process_database.py
-data/esd/process_database.py
-data/gerparas/process_database.py
-data/iemocap/process_database.py
-data/jl/process_database.py
-data/jtes/process_database.py
-data/meld/process_database.py
-data/mesd/process_database.py
-data/mess/process_database.py
-data/mlendsnd/process_database.py
-data/msp-improv/process_database2.py
-data/msp-podcast/process_database.py
-data/oreau2/process_database.py
-data/portuguese/process_database.py
-data/ravdess/process_database.py
-data/ravdess/process_database_speaker.py
-data/savee/process_database.py
-data/shemo/process_database.py
-data/subesco/process_database.py
-data/tess/process_database.py
-data/thorsten-emotional/process_database.py
-data/urdu/process_database.py
-data/vivae/process_database.py
-docs/source/conf.py
-meta/demos/demo_best_model.py
-meta/demos/my_experiment.py
-meta/demos/my_experiment_local.py
-meta/demos/plot_faster_anim.py
 nkululeko/__init__.py
 nkululeko/aug_train.py
 nkululeko/augment.py
@@ -168,5 +123,4 @@ nkululeko/segmenting/seg_silero.py
 nkululeko/utils/__init__.py
 nkululeko/utils/files.py
 nkululeko/utils/stats.py
-nkululeko/utils/util.py
-venv/bin/activate_this.py
+nkululeko/utils/util.py

nkululeko-0.93.10/data/aesdd/process_database.py DELETED Viewed

@@ -1,50 +0,0 @@
-import argparse
-from pathlib import Path
-import pandas as pd
-emotion_map = {
-    "a": "anger",
-    "d": "disgust",
-    "h": "happiness",
-    "f": "fear",
-    "s": "sadness",
-}
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--data_dir", type=str, default="AESDD", help="Path of AESDD directory")
-    parser.add_argument("--out_dir", type=str, default=".")
-    args = parser.parse_args()
-    data_dir = Path(args.data_dir)
-    out_dir = Path(args.out_dir)
-    paths = list(data_dir.glob("**/*.wav"))
-    files = [file for file in paths  if file.stem != "s05 (3)"]
-    names = [file.stem for file in files]
-    emotion = [emotion_map[file.stem[0]] for file in files]
-    speaker = [str(int(x[x.find("(") + 1: x.find(")")])) for x in names]
-    gender = [["female", "male"][int(v) % 2] for v in speaker]
-    language =["greek" for file in files]
-    # convert to df
-    df = pd.DataFrame({"file": files, "speaker": speaker, "emotion": emotion, "gender": gender})
-    # print distribution per emotion
-    # print(df.groupby("emotion").count()['file'])
-    # allocate speaker 5 for test set
-    train_df = df[df["speaker"] != "5"]
-    test_df = df.drop(train_df.index)
-    # save to CSV
-    df.to_csv(out_dir / "aesdd.csv", index=False)
-    train_df.to_csv(out_dir / "aesdd_train.csv", index=False)
-    test_df.to_csv(out_dir / "aesdd_test.csv", index=False)
-    print(f"Total: {len(df)}, Train: {len(train_df)}, Test: {len(test_df)}")
-if __name__ == "__main__":
-    main()

nkululeko-0.93.10/data/androids/process_database.py DELETED Viewed

@@ -1,94 +0,0 @@
-"""
-This folder is to import the
-Androids-corpus depression
-database to nkululeko.
-I used the version downloadable from [github](https://github.com/CheyneyComputerScience/CREMA-D)
-downloaded April 27th 2023
-I used the version downloadable from [Dropbox, mentioned in this github page](https://github.com/androidscorpus/data)
-Download and unzip the file Androids-corpus.zip to the current folder
-Usage: `python process_database.py`
-"""
-import os
-import audeer
-import pandas as pd
-dataset_name = 'androids'
-data_root = './Androids-Corpus/'
-# read in the fold list
-fold_dict = {}
-df_fold = pd.read_csv(f'{data_root}fold-lists.csv')
-for i in range(df_fold.shape[0]):
-    for j in range(df_fold.shape[1]):
-        try:
-            value = df_fold.iloc[i, j].replace('\'', '')
-        except AttributeError:
-            value = 'na'
-        if j in range(7, 12):
-            fold_dict[value] = j - 7
-        else:
-            fold_dict[value] = j
-directory_list = audeer.list_file_names(data_root, filetype='wav', recursive=True, basenames=True)
-depressions, speakers, educations, genders, ages, tasks, folds = [], [], [], [], [], [], []
-file_paths = []
-print(len(directory_list))
-gender_map = {'F':'female', 'M':'male'}
-depression_map = {'P':'depressed', 'C':'control'}
-for file in directory_list:
-    # storing file paths
-    # file = file.replace(os.getcwd(), '.')
-    file_paths.append(data_root+file)
-    # storing file emotions
-    fn = audeer.basename_wo_ext(file)
-    # The naming convention of the audio files is as follows:
-    # nn_XGmm_t.wav
-    # where nn is a unique integer identifier such that, in a given group, files with the same nn contain the voice of the same speaker (there is a trailing 0 for numbers lower than 10), X is an alphabetic character corresponding to the speaker’s condition (P for depression patient and C for control), G is an alphabetic character that stands for the speaker’s gender (M for male and F for female), mm is a two-digits integer number corresponding to the speaker’s age, and t is an integer number between 1 and 4 accounting for the education level (1 corresponds to primary school and 4 corresponds to university). The letter X was used for the 2 participants who did not provide information about this aspect. There is no indication of the task because recordings corresponding to RT and IT are stored in different directories.
-    if 'Reading-Task'in file:
-        task = 'reading'
-    elif 'Interview-Task' in file:
-        task = 'interview'
-    else:
-        print('ERROR: task undefined')
-        exit(-1)
-    part = fn.split('_')
-    dir_name = f'{part[0]}_{part[1]}_{part[2]}'
-    depression = part[1][0]
-    speaker = f'{depression}_{part[0]}'
-    gender = part[1][1]
-    age = part[1][2:4]
-    education = part[2]
-    depressions.append(depression_map[depression])
-    speakers.append(speaker)
-    genders.append(gender_map[gender])
-    ages.append(age)
-    tasks.append(task)
-    educations.append(education)
-    folds.append(fold_dict[dir_name])
-#    print(f'{file} {speaker}')
-# dataframe for emotion of files
-df = pd.DataFrame({'file':file_paths,
-                   'speaker':speakers,
-                   'gender':genders,
-                   'age':ages,
-                   'task':tasks,
-                   'depression':depressions,
-                   'education':educations,
-                   'fold':folds})
-df = df.set_index('file')
-df.head()
-df.to_csv(f'{dataset_name}.csv')

nkululeko-0.93.10/data/ased/process_database.py DELETED Viewed

@@ -1,58 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# @filename: process_database.py
-# @description: Process the ASED database
-import argparse
-import shutil
-from importlib.resources import path
-from pathlib import Path
-import pandas as pd
-from joblib import delayed
-emotion_map = {
-    "a": "anger",
-    "h": "happiness",
-    "n": "neutral",
-    "f": "fear",
-    "s": "sadness",
-}
-def main():
-    parser = argparse.ArgumentParser(
-        usage="python3 process_database.py database output"
-    )
-    parser.add_argument("--data_dir", type=str, default="ASED_V1", help="Path to the ASED database")
-    parser.add_argument("--out_dir", type=str, default=".", help="Path to the output directory")
-    args = parser.parse_args()
-    data_dir = Path(args.data_dir)
-    out_dir = Path(args.out_dir)
-    paths = list(data_dir.glob("**/*.wav"))
-    emotions = [emotion_map[path.stem[0].lower()] for path in paths]
-    genders = [["female", "male"][int(p.stem[9:11]) - 1] for p in paths]
-    spekaers = [p.stem[-2:] for p in paths]
-    languages = ["amharic" for p in paths]
-    # convert to df
-    df = pd.DataFrame({"file": paths, "emotion": emotions, "gender": genders, "speaker": spekaers, "language": languages})
-    # allocate speakers >= 55 for test
-    df_test = df[df["speaker"] > "55"]
-    df_train = df.drop(df_test.index)
-    # save to csv
-    df_train.to_csv(out_dir / "ased_train.csv", index=False)
-    df_test.to_csv(out_dir / "ased_test.csv", index=False)
-    df.to_csv(out_dir / "ased.csv", index=False)
-if __name__ == "__main__":
-    main()

nkululeko 0.93.10__tar.gz → 0.93.12__tar.gz

nkululeko 0.93.10__tar.gz → 0.93.12__tar.gz