PyPI - crfm-helm - Versions diffs - 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (606) hide show

helm/benchmark/scenarios/audio_language/covost2_scenario.py ADDED Viewed

@@ -0,0 +1,163 @@
+from typing import Dict, List
+import os
+from datasets import load_dataset
+from tqdm import tqdm
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+class CoVoST2Scenario(Scenario):
+    """
+    CoVost-2 is a large-scale multilingual speech translation corpus covering translations from 21 languages
+    into English and from English into 15 languages.
+    The dataset contains the audio, transcriptions, and translations in the following languages:
+    French, German, Dutch, Russian, Spanish, Italian, Turkish, Persian, Swedish, Mongolian, Chinese,
+    Welsh, Catalan, Slovenian, Estonian, Indonesian, Arabic, Tamil, Portuguese, Latvian, and Japanese.
+    Paper: https://arxiv.org/abs/2007.10310
+    Dataset: https://huggingface.co/datasets/facebook/covost2
+    Requires downloading Common Voice Corpus 4 from https://commonvoice.mozilla.org/en/datasets
+    Citation:
+        @misc{wang2020covost2massivelymultilingual,
+            title={CoVoST 2 and Massively Multilingual Speech-to-Text Translation},
+            author={Changhan Wang and Anne Wu and Juan Pino},
+            year={2020},
+            eprint={2007.10310},
+            archivePrefix={arXiv},
+            primaryClass={cs.CL},
+            url={https://arxiv.org/abs/2007.10310},
+        }
+    """
+    LANGUAGE_TO_CODE: Dict[str, str] = {
+        "English": "en",
+        "German": "de",
+        "French": "fr",
+        "Spanish": "es",
+        "Italian": "it",
+        "Portuguese": "pt",
+        "Russian": "ru",
+        "Chinese": "zh-CN",
+        "Japanese": "ja",
+        "Turkish": "tr",
+        "Persian": "fa",
+        "Arabic": "ar",
+        "Dutch": "nl",
+        "Swedish": "sv-SE",
+        "Indonesian": "id",
+        "Tamil": "ta",
+        "Latvian": "lv",
+        "Slovenian": "sl",
+        "Welsh": "cy",
+        "Mongolian": "mn",
+        "Estonian": "et",
+    }
+    VALID_SUBSETS: List[str] = [
+        "en_de",
+        "en_tr",
+        "en_fa",
+        "en_sv-SE",
+        "en_mn",
+        "en_zh-CN",
+        "en_cy",
+        "en_ca",
+        "en_sl",
+        "en_et",
+        "en_id",
+        "en_ar",
+        "en_ta",
+        "en_lv",
+        "en_ja",
+        "fr_en",
+        "de_en",
+        "es_en",
+        "ca_en",
+        "it_en",
+        "ru_en",
+        "zh-CN_en",
+        "pt_en",
+        "fa_en",
+        "et_en",
+        "mn_en",
+        "nl_en",
+        "tr_en",
+        "ar_en",
+        "sv-SE_en",
+        "lv_en",
+        "sl_en",
+        "ta_en",
+        "ja_en",
+        "id_en",
+        "cy_en",
+    ]
+    name = "covost2"
+    description = (
+        "A large scale multilingual speech translation corpus ([Wang et al., 2017](https://arxiv.org/abs/2007.10310))."
+    )
+    tags = ["audio", "translation", "multilinguality"]
+    def __init__(self, source_language: str, target_language: str) -> None:
+        super().__init__()
+        if (
+            source_language not in CoVoST2Scenario.LANGUAGE_TO_CODE
+            or target_language not in CoVoST2Scenario.LANGUAGE_TO_CODE
+        ):
+            raise ValueError(f"Invalid language. Valid languages are: {list(CoVoST2Scenario.LANGUAGE_TO_CODE.keys())}")
+        # Get the corresponding language codes
+        source_language_code: str = self.LANGUAGE_TO_CODE[source_language]
+        target_language_code: str = self.LANGUAGE_TO_CODE[target_language]
+        subset: str = f"{source_language_code}_{target_language_code}"
+        if subset not in CoVoST2Scenario.VALID_SUBSETS:
+            raise ValueError(f"Invalid subset: {subset}. Valid subsets are: {CoVoST2Scenario.VALID_SUBSETS}")
+        self._subset: str = subset
+        self._source_language: str = source_language
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_dir: str = os.path.join(output_path, self._source_language)
+        assert os.path.exists(data_dir), (
+            f"Download the {self._source_language} subset from Common Voice Corpus 4 "
+            f"(https://commonvoice.mozilla.org/en/datasets) and unzip and place at {data_dir}."
+        )
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+        for row in tqdm(
+            load_dataset(
+                "facebook/covost2",
+                self._subset,
+                cache_dir=output_path,
+                data_dir=data_dir,
+                split=split,
+                trust_remote_code=True,
+                revision="369b47c4c20aff1193b8edeeedc37d14ae28226b",
+            )
+        ):
+            audio_path: str = row["file"]
+            assert os.path.exists(audio_path), f"Audio file does not exist at path: {audio_path}"
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=audio_path)])
+            )
+            references = [Reference(Output(text=row["translation"]), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=split))
+        return instances

helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""Scenarios for audio models"""
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from collections import OrderedDict
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+class FLEURSFairnessScenario(Scenario):
+    """FLEURS Fairness Scenario
+    The FLEURS (Conneau et al, 2022) dataset is an n-way parallel speech dataset in 102 languages
+    built on top of the machine translation FLoRes-101 benchmark, with approximately 12 hours of speech
+    supervision per language. The task is to identify the language used from the audio sample
+    (the Speech Language Identification task).
+    Paper: https://arxiv.org/abs/2205.12446
+    Code: https://tensorflow.org/datasets/catalog/xtreme_s
+    Citation:
+    @inproceedings{conneau2023fleurs,
+        title={Fleurs: Few-shot learning evaluation of universal representations of speech},
+        author={Conneau, Alexis and Ma, Min and Khanuja, Simran and Zhang, Yu and Axelrod,
+        Vera and Dalmia, Siddharth and Riesa, Jason and Rivera, Clara and Bapna, Ankur},
+        booktitle={2022 IEEE Spoken Language Technology Workshop (SLT)},
+        pages={798--805},
+        year={2023},
+        organization={IEEE}
+        }
+    """
+    HF_DATASET_NAME = "google/xtreme_s"
+    GENDERS = {"male": 0, "female": 1}
+    name = "fleurs_fairness"
+    description = "Language identification for seven languages from seven different language groups \
+        ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446))."
+    tags: List[str] = ["audio", "recognition", "multilinguality"]
+    def __init__(self, gender: str) -> None:
+        super().__init__()
+        if gender.lower() not in FLEURSFairnessScenario.GENDERS.keys():
+            raise ValueError(
+                f"Invalid gender input: {gender}. Valid languages are: {FLEURSFairnessScenario.GENDERS.keys()}"
+            )
+        self._gender: str = gender
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        loading_cases: List[OrderedDict] = []
+        overall_dataset = load_dataset(
+            FLEURSFairnessScenario.HF_DATASET_NAME,
+            name="fleurs.en_us",
+            cache_dir=output_path,
+            split=TEST_SPLIT,
+            trust_remote_code=True,
+        )
+        for row in tqdm(overall_dataset):
+            if row["gender"] == self.GENDERS[self._gender]:
+                loading_cases.append(row)
+        for row in tqdm(loading_cases):
+            local_audio_path = row["path"]
+            answer = row["transcription"]
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances

helm/benchmark/scenarios/audio_language/fleurs_scenario.py ADDED Viewed

@@ -0,0 +1,312 @@
+"""Scenarios for audio models"""
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from collections import OrderedDict
+from tqdm import tqdm
+from datasets import load_dataset
+from helm.common.media_object import MediaObject, MultimediaObject
+class FLEURSScenario(Scenario):
+    """FLEURS Scenario
+    The FLEURS (Conneau et al, 2022) dataset is an n-way parallel speech dataset in 102 languages
+    built on top of the machine translation FLoRes-101 benchmark, with approximately 12 hours of speech
+    supervision per language. The task is to identify the language used from the audio sample
+    (the Speech Language Identification task).
+    Paper: https://arxiv.org/abs/2205.12446
+    Code: https://tensorflow.org/datasets/catalog/xtreme_s
+    Citation:
+    @inproceedings{conneau2023fleurs,
+        title={Fleurs: Few-shot learning evaluation of universal representations of speech},
+        author={Conneau, Alexis and Ma, Min and Khanuja, Simran and Zhang, Yu and Axelrod,
+        Vera and Dalmia, Siddharth and Riesa, Jason and Rivera, Clara and Bapna, Ankur},
+        booktitle={2022 IEEE Spoken Language Technology Workshop (SLT)},
+        pages={798--805},
+        year={2023},
+        organization={IEEE}
+        }
+    """
+    HF_DATASET_NAME = "google/xtreme_s"
+    _FLEURS_LANG_TO_ID = OrderedDict(
+        [
+            ("Afrikaans", "af"),
+            ("Amharic", "am"),
+            ("Arabic", "ar"),
+            ("Armenian", "hy"),
+            ("Assamese", "as"),
+            ("Asturian", "ast"),
+            ("Azerbaijani", "az"),
+            ("Belarusian", "be"),
+            ("Bengali", "bn"),
+            ("Bosnian", "bs"),
+            ("Bulgarian", "bg"),
+            ("Burmese", "my"),
+            ("Catalan", "ca"),
+            ("Cebuano", "ceb"),
+            ("Mandarin_Chinese", "cmn_hans"),
+            ("Cantonese_Chinese", "yue_hant"),
+            ("Croatian", "hr"),
+            ("Czech", "cs"),
+            ("Danish", "da"),
+            ("Dutch", "nl"),
+            ("English", "en"),
+            ("Estonian", "et"),
+            ("Filipino", "fil"),
+            ("Finnish", "fi"),
+            ("French", "fr"),
+            ("Fula", "ff"),
+            ("Galician", "gl"),
+            ("Ganda", "lg"),
+            ("Georgian", "ka"),
+            ("German", "de"),
+            ("Greek", "el"),
+            ("Gujarati", "gu"),
+            ("Hausa", "ha"),
+            ("Hebrew", "he"),
+            ("Hindi", "hi"),
+            ("Hungarian", "hu"),
+            ("Icelandic", "is"),
+            ("Igbo", "ig"),
+            ("Indonesian", "id"),
+            ("Irish", "ga"),
+            ("Italian", "it"),
+            ("Japanese", "ja"),
+            ("Javanese", "jv"),
+            ("Kabuverdianu", "kea"),
+            ("Kamba", "kam"),
+            ("Kannada", "kn"),
+            ("Kazakh", "kk"),
+            ("Khmer", "km"),
+            ("Korean", "ko"),
+            ("Kyrgyz", "ky"),
+            ("Lao", "lo"),
+            ("Latvian", "lv"),
+            ("Lingala", "ln"),
+            ("Lithuanian", "lt"),
+            ("Luo", "luo"),
+            ("Luxembourgish", "lb"),
+            ("Macedonian", "mk"),
+            ("Malay", "ms"),
+            ("Malayalam", "ml"),
+            ("Maltese", "mt"),
+            ("Maori", "mi"),
+            ("Marathi", "mr"),
+            ("Mongolian", "mn"),
+            ("Nepali", "ne"),
+            ("Northern-Sotho", "nso"),
+            ("Norwegian", "nb"),
+            ("Nyanja", "ny"),
+            ("Occitan", "oc"),
+            ("Oriya", "or"),
+            ("Oromo", "om"),
+            ("Pashto", "ps"),
+            ("Persian", "fa"),
+            ("Polish", "pl"),
+            ("Portuguese", "pt"),
+            ("Punjabi", "pa"),
+            ("Romanian", "ro"),
+            ("Russian", "ru"),
+            ("Serbian", "sr"),
+            ("Shona", "sn"),
+            ("Sindhi", "sd"),
+            ("Slovak", "sk"),
+            ("Slovenian", "sl"),
+            ("Somali", "so"),
+            ("Sorani-Kurdish", "ckb"),
+            ("Spanish", "es"),
+            ("Swahili", "sw"),
+            ("Swedish", "sv"),
+            ("Tajik", "tg"),
+            ("Tamil", "ta"),
+            ("Telugu", "te"),
+            ("Thai", "th"),
+            ("Turkish", "tr"),
+            ("Ukrainian", "uk"),
+            ("Umbundu", "umb"),
+            ("Urdu", "ur"),
+            ("Uzbek", "uz"),
+            ("Vietnamese", "vi"),
+            ("Welsh", "cy"),
+            ("Wolof", "wo"),
+            ("Xhosa", "xh"),
+            ("Yoruba", "yo"),
+            ("Zulu", "zu"),
+        ]
+    )
+    _FLEURS_LANG = sorted(
+        [
+            "af_za",
+            "am_et",
+            "ar_eg",
+            "as_in",
+            "ast_es",
+            "az_az",
+            "be_by",
+            "bn_in",
+            "bs_ba",
+            "ca_es",
+            "ceb_ph",
+            "cmn_hans_cn",
+            "yue_hant_hk",
+            "cs_cz",
+            "cy_gb",
+            "da_dk",
+            "de_de",
+            "el_gr",
+            "en_us",
+            "es_419",
+            "et_ee",
+            "fa_ir",
+            "ff_sn",
+            "fi_fi",
+            "fil_ph",
+            "fr_fr",
+            "ga_ie",
+            "gl_es",
+            "gu_in",
+            "ha_ng",
+            "he_il",
+            "hi_in",
+            "hr_hr",
+            "hu_hu",
+            "hy_am",
+            "id_id",
+            "ig_ng",
+            "is_is",
+            "it_it",
+            "ja_jp",
+            "jv_id",
+            "ka_ge",
+            "kam_ke",
+            "kea_cv",
+            "kk_kz",
+            "km_kh",
+            "kn_in",
+            "ko_kr",
+            "ckb_iq",
+            "ky_kg",
+            "lb_lu",
+            "lg_ug",
+            "ln_cd",
+            "lo_la",
+            "lt_lt",
+            "luo_ke",
+            "lv_lv",
+            "mi_nz",
+            "mk_mk",
+            "ml_in",
+            "mn_mn",
+            "mr_in",
+            "ms_my",
+            "mt_mt",
+            "my_mm",
+            "nb_no",
+            "ne_np",
+            "nl_nl",
+            "nso_za",
+            "ny_mw",
+            "oc_fr",
+            "om_et",
+            "or_in",
+            "pa_in",
+            "pl_pl",
+            "ps_af",
+            "pt_br",
+            "ro_ro",
+            "ru_ru",
+            "bg_bg",
+            "sd_in",
+            "sk_sk",
+            "sl_si",
+            "sn_zw",
+            "so_so",
+            "sr_rs",
+            "sv_se",
+            "sw_ke",
+            "ta_in",
+            "te_in",
+            "tg_tj",
+            "th_th",
+            "tr_tr",
+            "uk_ua",
+            "umb_ao",
+            "ur_pk",
+            "uz_uz",
+            "vi_vn",
+            "wo_sn",
+            "xh_za",
+            "yo_ng",
+            "zu_za",
+        ]
+    )
+    # Randomly selected 7 languages from 7 different groups (western_european_we, eastern_european_ee,
+    # central_asia_middle_north_african_cmn, sub_saharan_african_ssa, south_asian_sa, south_east_asian_sea,
+    # chinese_japanase_korean_cjk) in the FLEURS dataset.
+    _FLEURS_TEST_LANG_TO_ID = OrderedDict(
+        [
+            ("Finnish", "fi"),
+            ("English", "en"),
+            ("Hebrew", "he"),
+            ("Zulu", "zu"),
+            ("Bengali", "bn"),
+            ("Thai", "th"),
+            ("Mandarin_Chinese", "cmn_hans"),
+        ]
+    )
+    name = "fleurs"
+    description = "Language identification for seven languages from seven different language groups \
+        ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446))."
+    tags: List[str] = ["audio", "recognition", "multilinguality"]
+    def __init__(self, language: str) -> None:
+        super().__init__()
+        if language not in FLEURSScenario._FLEURS_TEST_LANG_TO_ID.keys():
+            raise ValueError(
+                f"Invalid language: {language}. Valid languages are: {FLEURSScenario._FLEURS_TEST_LANG_TO_ID.keys()}"
+            )
+        self._fleurs_lang_short_to_long = {v: k for k, v in FLEURSScenario._FLEURS_LANG_TO_ID.items()}
+        self._fleurs_long_to_lang = {
+            self._fleurs_lang_short_to_long["_".join(k.split("_")[:-1]) or k]: k for k in FLEURSScenario._FLEURS_LANG
+        }
+        self._language: str = language
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        language_category = self._fleurs_long_to_lang[self._language]
+        for row in tqdm(
+            load_dataset(
+                FLEURSScenario.HF_DATASET_NAME,
+                name=f"fleurs.{language_category}",
+                cache_dir=output_path,
+                split=TEST_SPLIT,
+                trust_remote_code=True,
+            )
+        ):
+            local_audio_path = row["path"]
+            answer = row["transcription"]
+            input = Input(
+                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
+            )
+            references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+        return instances

helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py ADDED Viewed

@@ -0,0 +1,83 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.audio_utils import ensure_audio_file_exists_from_array
+from helm.common.general import ensure_directory_exists
+from helm.common.media_object import MediaObject, MultimediaObject
+class IEMOCAPAudioScenario(Scenario):
+    """IEMOCAP (Audio)
+    This scenario is a emotion classification scenario based on the
+    "Interactive emotional dyadic motion capture database" (IEMOCAP),
+    collected by the Speech Analysis and Interpretation Laboratory (SAIL)
+    at the University of Southern California (USC). Only the audio data
+    from this dataset is used. The task is to classify the emotion of the
+    speaker(s) in the audio sample as one of angry, happy, neutral or sad.
+    Website: https://sail.usc.edu/iemocap/iemocap_release.htm
+    Paper: https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf
+    Dataset: https://huggingface.co/datasets/Zahra99/IEMOCAP_Audio/blob/main/README.md
+    Citation:
+    @article{article,
+    author = {Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower Provost, Emily and Kim, Samuel and Chang, Jeannette and Lee, Sungbok and Narayanan, Shrikanth},
+    year = {2008},
+    month = {12},
+    pages = {335-359},
+    title = {IEMOCAP: Interactive emotional dyadic motion capture database},
+    volume = {42},
+    journal = {Language Resources and Evaluation},
+    doi = {10.1007/s10579-008-9076-6}
+    }
+    """  # noqa: E501
+    name = "iemocap_audio"
+    description = "A classification scenario based on audio data from the Interactive emotional dyadic motion capture database (IEMOCAP) ([Busso et al, 2008](https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf)). The task is to classify the emotion of the speaker(s) in the audio sample."  # noqa: E501
+    tags = ["audio", "classification"]
+    LABEL_NAMES = ["angry", "happy", "neutral", "sad"]
+    SAMPLE_RATE = 16000
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        wav_dir = os.path.join(output_path, "wav")
+        ensure_directory_exists(wav_dir)
+        dataset = datasets.load_dataset(
+            "Zahra99/IEMOCAP_Audio", revision="4f8539a397ecc0d7185bf941bc1bb7238abc3648", cache_dir=cache_dir
+        )
+        instances: List[Instance] = []
+        for _, split in dataset.items():
+            for row in split:
+                wav_path = os.path.join(wav_dir, row["audio"]["path"])
+                print(len(row["audio"]["array"]))
+                print(list(row["audio"]["array"])[0:10])
+                ensure_audio_file_exists_from_array(
+                    wav_path, row["audio"]["array"], sample_rate=IEMOCAPAudioScenario.SAMPLE_RATE
+                )
+                input = Input(
+                    multimedia_content=MultimediaObject(
+                        media_objects=[MediaObject(location=wav_path, content_type="audio/wav")]
+                    )
+                )
+                references = [
+                    Reference(output=Output(text=IEMOCAPAudioScenario.LABEL_NAMES[row["label"]]), tags=[CORRECT_TAG])
+                ]
+                instance = Instance(input=input, references=references, split=TEST_SPLIT)
+                instances.append(instance)
+                print(row["audio"])
+                break
+        return instances

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl