crfm-helm 0.5.5-py3-none-any.whl → 0.5.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/melt_scenarios.py
@@ -0,0 +1,793 @@
import os
from abc import abstractmethod
from typing import Dict, List, Tuple, Optional

from datasets import load_dataset
from helm.common.general import ensure_directory_exists
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    TRAIN_SPLIT,
    TEST_SPLIT,
    VALID_SPLIT,
    CORRECT_TAG,
    PassageQuestionInput,
    Input,
    Output,
)
from helm.benchmark.scenarios.math_scenario import get_answer


class MELTQAScenario(Scenario):
    name = "melt_question_answering"
    description = "Question answering scenario."
    tags = ["question_answering"]

    def __init__(
        self,
        dataset_name: str,
        revision: str,
        subset: Optional[str] = None,
        passage_prefix: str = "Passage: ",
        question_prefix: str = "Question: ",
        splits: Optional[Dict[str, str]] = None,
    ):
        """
        Initializes the question answering scenario.

        Args:
            dataset_name: The name of the dataset.
            revision: The revision of the dataset to use.
            subset: The subset of the dataset to use. Defaults to "".
            passage_prefix: The prefix to use for the context passage. Defaults to "Passage: ".
            question_prefix: The prefix to use for the question. Defaults to "Question: ".
            splits: The splits to use for the dataset. Defaults to None.
        """
        super().__init__()
        self.dataset_name = dataset_name
        self.subset = subset
        self.revision = revision
        self.passage_prefix = passage_prefix
        self.question_prefix = question_prefix
        self.splits = splits

    def process_example(self, sample: dict) -> Tuple[Input, List[str]]:
        """
        Given an sample from the dataset, create the prompt and the list of
        correct references.
        Each sample is a dictionary with the following keys:
        - context: The passage to be used for the question.
        - question: A questions.
        - answers: A list of answers with dictionary format {'answer_start': [], 'text': []}
        """
        passage = sample["context"]
        question = sample["question"]
        prompt = PassageQuestionInput(
            passage=passage,
            passage_prefix=self.passage_prefix,
            question=question,
            question_prefix=self.question_prefix,
            separator="\n\n",
        )

        answers: List[str] = []
        for answer_text in sample["answers"]["text"]:
            answers.append(answer_text)

        return prompt, answers

    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
        """
        Helper for generating instances for a split.
        Args:
            splits (dict): Which splits to partition the data into.
        Returns:
            List[Instance]: Instances from the file for the specified split.
        """
        instances: List[Instance] = []
        dataset = load_dataset(
            self.dataset_name,
            self.subset,
            revision=self.revision,
            trust_remote_code=True,
        )
        for dataset_split_name, helm_split_name in splits.items():
            if dataset_split_name not in dataset:
                raise ValueError(f"Could not find split {dataset_split_name} in dataset {self.dataset_name}")

            for sample in dataset[dataset_split_name]:
                prompt, answers = self.process_example(sample)
                instance = Instance(
                    input=prompt,
                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in answers],
                    split=helm_split_name,
                )
                instances.append(instance)

        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        if self.splits is None:
            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
        else:
            splits = {}
            if "train" in self.splits:
                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
            if "validation" in self.splits:
                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
            if "test" in self.splits:
                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT

        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
        return instances


class MELTQAMLQAScenario(MELTQAScenario):
    """
    Scenario for MLQA dataset.
    This dataset is a multilingual question answering dataset.
    It contains questions in multiple languages and their corresponding
    answers in the same language.
    In this scenario, we are using the Vietnamese subset of the MLQA dataset.
    """

    name = "melt_question_answering_mlqa"
    description = "MLQA is an open-ended question answering dataset in multiple languages."
    tags = ["question_answering"]

    def __init__(self):
        super().__init__(
            dataset_name="facebook/mlqa",
            revision="397ed406c1a7902140303e7faf60fff35b58d285",
            subset="mlqa.vi.vi",
            passage_prefix="Ngữ cảnh: ",
            question_prefix="Câu hỏi: ",
            splits={
                VALID_SPLIT: "validation",
                TEST_SPLIT: "test",
            },
        )


class MELTQAXQuADScenario(MELTQAScenario):
    """
    Scenario for XQuAD dataset.
    XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset
    for evaluating cross-lingual question answering performance.
    """

    name = "melt_question_answering_xquad"
    description = "XQuAD is a cross-lingual question answering dataset."
    tags = ["question_answering"]

    def __init__(self):
        super().__init__(
            dataset_name="juletxara/xquad_xtreme",
            revision="87646a09233481f6884b6ffcc6795af9ca0b85d7",
            subset="vi",
            passage_prefix="Ngữ cảnh: ",
            question_prefix="Câu hỏi: ",
            splits={
                VALID_SPLIT: "translate_train",
                TEST_SPLIT: "test",
            },
        )


class MELTSummarizationScenario(Scenario):
    """
    Scenario for single document text summarization.
    """

    name = "melt_summarization"
    description = "Scenario for summarization tasks"
    tags = ["summarization"]

    def __init__(
        self,
        dataset_name: str,
        revision: str,
        subset: Optional[str] = None,
        train_min_length: Optional[int] = None,
        train_max_length: Optional[int] = None,
        doc_max_length: Optional[int] = None,
        article_key: str = "source",
        summary_key: str = "target",
        splits: Optional[Dict[str, str]] = None,
    ):
        """
        Initializes summarization scenario.
        Args:
            dataset_name: String identifier for dataset. Currently
                          supported options ["vietnews", "wikilingua"].
            revision: String identifier for dataset version.
            subset: Dataset subset to use. Defaults to "".
            train_min_length: Int indicating minimum length for training
                              documents. Training examples smaller than
                              train_min_length will be filtered out.
                              Useful for preventing the adapter from sampling
                              really small documents.
            train_max_length: Int indicating maximum length for training
                              documents. Training examples larger than
                              train_max_length will be filtered out.
                              Useful for preventing the adapter from
                              sampling really large documents.
            doc_max_length: Int indicating the maximum length to truncate
                            documents. Documents in all splits will be
                            truncated to doc_max_length tokens.
                            NOTE: Currently uses whitespace tokenization.
            article_key: String key for article text in dataset. Defaults to "source".
            summary_key: String key for summary text in dataset. Defaults to "target".
            splits: Dict containing split names and corresponding split. If
                    None, defaults to {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}.
        """
        super().__init__()
        self.dataset_name = dataset_name
        self.revision = revision
        self.subset = subset
        self.train_min_length = train_min_length
        self.train_max_length = train_max_length
        self.doc_max_length = doc_max_length
        self.article_key = article_key
        self.summary_key = summary_key
        self.splits = splits

    def _clean_and_truncate(self, text: str, max_length: Optional[int] = None) -> str:
        return " ".join(text.split()[:max_length])

    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
        """
        Helper for generating instances for a split.
        Args:
            splits (dict): Which splits to partition the data into.
        Returns:
            List[Instance]: Instances from the file for the specified split.
        """
        instances: List[Instance] = []
        dataset = load_dataset(
            self.dataset_name,
            self.subset,
            revision=self.revision,
            trust_remote_code=True,
        )

        for dataset_split_name, helm_split_name in splits.items():
            if dataset_split_name not in dataset:
                raise ValueError(f"Could not find split {dataset_split_name} in dataset {self.dataset_name}")

            for sample in dataset[dataset_split_name]:
                article: str = self._clean_and_truncate(sample[self.article_key], self.doc_max_length)
                summary: str = self._clean_and_truncate(sample[self.summary_key])

                if helm_split_name == "train":
                    art_len = len(article.split())
                    if self.train_max_length and art_len > self.train_max_length:
                        continue
                    if self.train_min_length and art_len < self.train_min_length:
                        continue

                instances.append(
                    Instance(
                        input=Input(text=article),
                        references=[Reference(Output(text=summary), tags=[CORRECT_TAG])],
                        split=helm_split_name,
                    )
                )

        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        if self.splits is None:
            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
        else:
            splits = {}
            if "train" in self.splits:
                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
            if "validation" in self.splits:
                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
            if "test" in self.splits:
                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT

        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
        return instances


class MELTSummarizationVietnewsScenario(MELTSummarizationScenario):
    """
    Scenario for summarization on Vietnews dataset.
    Vietnews includes a collection of news articles in Vietnamese from
    online news such as Tuoi Tre, VnExpress, and Nguoi Dua Tin between 2016 and 2019.
    The topic of the articles is about the world, news, law, and business.
    The dataset also contains the corresponding summary for each article.
    """

    name = "melt_summarization_vietnews"
    description = (
        "Vietnews is a Vietnamese news summarization dataset collected from online news articles between 2016 and 2019."
    )
    tags = ["summarization"]

    def __init__(self, **kwargs):
        super().__init__(
            dataset_name="Yuhthe/vietnews",
            revision="c391150e7541839d0f07d9ea89fe00005618a8f7",
            article_key="article",
            summary_key="abstract",
            splits={
                TRAIN_SPLIT: "train",
                VALID_SPLIT: "validation",
                TEST_SPLIT: "test",
            },
            **kwargs,
        )


class MELTSummarizationWikilinguaScenario(MELTSummarizationScenario):
    """
    Scenario for summarization on Wikilingua dataset.
    Wikilingua is a multilingual summarization dataset.
    In this scenario, we are using the Vietnamese subset of the Wikilingua dataset.
    """

    name = "melt_summarization_wikilingua"
    description = "Wikilingua is a multilingual summarization dataset."
    tags = ["summarization"]

    def __init__(self, **kwargs):
        super().__init__(
            dataset_name="GEM/wiki_lingua",
            revision="af5d0f00b59a6933165c97b384f50d8b563c314d",
            article_key="source",
            summary_key="target",
            splits={
                TRAIN_SPLIT: "train",
                VALID_SPLIT: "validation",
                TEST_SPLIT: "test",
            },
            **kwargs,
        )


class MELTMATHScenario(Scenario):
    """
    The MATH dataset from the paper
    "Measuring Mathematical Problem Solving With the MATH Dataset"
    by Hendrycks et al. (2021):
    https://arxiv.org/pdf/2103.03874.pdf

    Example input, using official examples:

    ```
    Given a mathematics problem, determine the answer. Simplify your answer as much as possible.
    ###
    Problem: What is $\left(\frac{7}{8}\right)^3 \cdot \left(\frac{7}{8}\right)^{-3}$?
    Answer: $1$
    ###
    Problem: In how many ways can 4 books be selected from a shelf of 6 books if the order in which the books are selected does not matter?
    Answer: $15$
    ###
    Problem: Find the distance between the points $(2,1,-4)$ and $(5,8,-3).$
    Answer: $\sqrt{59}$
    ###
    Problem: The faces of an octahedral die are labeled with digits $1$ through $8$. What is the probability, expressed as a common fraction, of rolling a sum of $15$ with a pair of such octahedral dice?
    Answer: $\frac{1}{32}$
    ###
    Problem: The first three terms of an arithmetic sequence are 1, 10 and 19, respectively. What is the value of the 21st term?
    Answer: $181$
    ###
    Problem: Calculate $6 \cdot 8\frac{1}{3}
    Answer: $50$
    ###
    Problem: When the binary number $100101110010_2$ is divided by 4, what is the remainder (give your answer in base 10)?
    Answer: $2$
    ###
    Problem: How many zeros are at the end of the product 25 $\times$ 240?
    Answer: $3$
    ###
    Problem: What is $\dbinom{n}{n}$ for any positive integer $n$?
    Answer: $
    ```

    Example expected output

    ```
    1$
    ```
    """ # noqa

    name = "MATH"
    description = "Mathematical Problem Solving in Vietnamese"
    tags = ["knowledge", "reasoning"]

    subjects_mapping = {
        "number_theory": "Number Theory",
        "intermediate_algebra": "Intermediate Algebra",
        "algebra": "Algebra",
        "prealgebra": "Prealgebra",
        "geometry": "Geometry",
        "counting_and_probability": "Counting & Probability",
        "precalculus": "Precalculus",
    }
    levels = ["1", "2", "3", "4", "5"]

    def __init__(
        self, subject: str, level: str, use_official_examples: bool = False, use_chain_of_thought: bool = False
    ):
        super().__init__()
        self.subject_name: str = MELTMATHScenario.subjects_mapping[subject]
        self.subject: str = subject
        self.level: str = f"Level {level}"
        self.use_official_examples: bool = use_official_examples
        self.use_chain_of_thought: bool = use_chain_of_thought
        if use_chain_of_thought:
            assert not use_official_examples, "Cannot use official examples when use_chain_of_thought is True."

    def get_instances(self, output_path: str) -> List[Instance]:
        dataset = {}
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        dataset = load_dataset(
            "ura-hcmut/Vietnamese-MATH",
            self.subject,
            trust_remote_code=True,
            cache_dir=cache_dir,
            revision="f8edc7f8e2873e8b271391d4489c1eedc5456f40",
        )

        instances = []
        for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
            if split == TRAIN_SPLIT and self.use_official_examples:
                train_instances = [
                    ("Kết quả của $\left(\\frac{7}{8}\\right)^3 \cdot \left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
                    (
                        "Có bao nhiêu cách chọn 4 quyển sách từ một kệ sách có 6 quyển,"
                        + " nếu thứ tự các cuốn sách được chọn không quan trọng?",
                        "15",
                    ),
                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\sqrt{59}"),
                    (
                        "Các mặt của khối xúc xắc bát diện được dán nhãn bằng các số từ $1$ đến $8$."
                        + " Xác suất tung một cặp xúc xắc bát diện để được tổng số bằng $15$ là bao nhiêu?"
                        + " Biểu diễn kết quả dưới dạng phân số tối giản.",
                        "\\frac{1}{32}",
                    ),
                    (
                        "Ba số hạng đầu tiên của một dãy số cộng lần lượt là 1, 10 và 19."
                        + " Giá trị của số hạng thứ 21 là?",
                        "181",
                    ),
                    ("Tính $6 \\cdot 8\\frac{1}{3}", "50"),
                    (
                        "Khi chia số nhị phân $100101110010_2$ cho 4,"
                        + " phần dư của phép chia là bao nhiêu (biểu diễn kết quả với cơ số 10)?",
                        "2",
                    ),
                    ("Có bao nhiêu số 0 ở cuối kết quả của tích 25 $\\times$ 240?", "3"),
                ]
                dataset[TRAIN_SPLIT] = [
                    {"problem_vi": problem, "answer_vi": answer} for problem, answer in train_instances
                ]

            else:
                examples = dataset[split].filter(lambda example: example["level"] == self.level)
                list_answers = []

                for example in examples:
                    # Sanity check that we filtered correctly
                    assert (
                        example["type"] == self.subject_name and example["level"] == self.level
                    ), f"Wrong example was included after filtering: {example}"

                    if self.use_chain_of_thought:
                        answer = example["solution_vi"]
                    else:
                        maybe_answer = get_answer(example["solution_vi"])
                        if maybe_answer is None:
                            maybe_answer = "Không có đáp án"
                        answer = maybe_answer
                    list_answers.append(answer)

                # Add column answer_vi to examples
                dataset[split] = examples.add_column("answer_vi", list_answers)

            for example in dataset[split]:
                instance = Instance(
                    input=Input(text=example["problem_vi"]),
                    references=[Reference(Output(text=example["answer_vi"]), tags=[CORRECT_TAG])],
                    split=split,
                )
                instances.append(instance)

        return instances


class MELTTextClassificationScenario(Scenario):
    name = "melt_text_classification"
    description = "Text Classification scenario."
    tags = ["text_classification"]

    def __init__(
        self,
        dataset_name: str,
        revision: str,
        subset: Optional[str] = None,
        text_key: str = "text",
        label_key: str = "label",
        splits: Optional[Dict[str, str]] = None,
    ):
        """
        Initializes the question answering scenario.

        Args:
            dataset_name: The name of the dataset.
            revision: The revision of the dataset to use.
            subset: The subset of the dataset to use. Defaults to "".
            splits: The splits to use for the dataset. Defaults to None.
        """
        super().__init__()
        self.dataset_name = dataset_name
        self.subset = subset
        self.revision = revision
        self.splits = splits

    @abstractmethod
    def process_example(self, sample: dict) -> Tuple[str, List[str]]:
        """
        Given an sample from the dataset, create the input text and
        list of answers for the instance.
        """
        pass

    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
        """
        Helper for generating instances for a split.
        Args:
            splits (dict): Which splits to partition the data into.
        Returns:
            List[Instance]: Instances from the file for the specified split.
        """
        instances: List[Instance] = []
        dataset = load_dataset(
            self.dataset_name,
            self.subset,
            revision=self.revision,
            trust_remote_code=True,
        )
        for dataset_split_name, helm_split_name in splits.items():
            if dataset_split_name not in dataset:
                raise ValueError(f"Could not find split {dataset_split_name} in dataset {self.dataset_name}")

            for sample in dataset[dataset_split_name]:
                prompt, answers = self.process_example(sample)
                instance = Instance(
                    input=Input(text=prompt),
                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in answers],
                    split=helm_split_name,
                )
                instances.append(instance)

        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        if self.splits is None:
            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
        else:
            splits = {}
            if "train" in self.splits:
                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
            if "validation" in self.splits:
                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
            if "test" in self.splits:
                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT

        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
        return instances


class MELTTextClassificationVSMECScenario(MELTTextClassificationScenario):
    """
    Scenario for the UIT-VSMEC dataset.
    The UIT-VSMEC dataset is a Vietnamese emotion-labeled corpus consisting of
    6,927 human-annotated sentences collected from social media, categorized
    into six emotions: sadness, enjoyment, anger, disgust, fear, and surprise.
    """

    name = "melt_text_classification_vsmec"
    description = "UIT-VSMEC dataset for emotion classification."
    tags = ["text_classification"]

    def __init__(self):
        super().__init__(
            dataset_name="ura-hcmut/UIT-VSMEC",
            revision="ab642b189eff31fdb781cca7c4c34dee3ee0f1de",
            splits={
                TRAIN_SPLIT: "train",
                VALID_SPLIT: "validation",
                TEST_SPLIT: "test",
            },
        )
        self.text_key = "Sentence"
        self.label_key = "Emotion"

    def process_example(self, sample: dict) -> Tuple[str, List[str]]:
        """
        Given an sample from the dataset, create the input text and
        list of answers for the instance.
        """
        return sample[self.text_key], [sample[self.label_key].lower()]


class MELTTextClassificationPhoATISScenario(MELTTextClassificationScenario):
    """
    Scenario for the PhoATIS dataset.
    The PhoATIS dataset is a Vietnamese benchmark for intent detection and slot filling,
    consisting of 5,871 fluent utterances collected from task-oriented dialogue systems.
    It was later extended with manual disfluency annotations to create a disfluent variant,
    enabling research on the impact of disfluencies in spoken language understanding for Vietnamese.
    """

    name = "melt_text_classification_phoatis"
    description = "PhoATIS dataset for intent detection of flight booking."
    tags = ["text_classification"]

    def __init__(self):
        super().__init__(
            dataset_name="ura-hcmut/PhoATIS",
            revision="bd026c9b276d7fb083d19ec3d6870fca90e1834f",
            splits={
                TRAIN_SPLIT: "train",
                VALID_SPLIT: "validation",
                TEST_SPLIT: "test",
            },
        )
        self.text_key = "text"
        self.label_key = "label"

    def process_example(self, sample: dict) -> Tuple[str, List[str]]:
        """
        Given an sample from the dataset, create the input text and
        list of answers for the instance.
        """
        return sample[self.text_key], sample[self.label_key].lower().split("#")


class MELTTSentimentAnalysisVLSPScenario(MELTTextClassificationScenario):
    """
    Scenario for the VLSP 2016 sentiment analysis dataset.
    The VLSP2016 dataset is a Vietnamese sentiment analysis corpus consisting of
    short user-generated reviews from social media, each labeled with an overall
    sentiment of positive, negative, or neutral. It was developed to support polarity
    classification and benchmark Vietnamese sentiment analysis systems through the
    VLSP 2016 evaluation campaign.
    """

    name = "melt_sentiment_analysis_vlsp"
    description = "VLSP 2016 contains public comments from social media, used for sentiment analysis."
    tags = ["sentiment_analysis"]

    def __init__(self):
        super().__init__(
            dataset_name="ura-hcmut/vlsp2016",
            revision="9531ec0ccabcafb7d51020fe69d8f9faebb91953",
            splits={
                TRAIN_SPLIT: "train",
                TEST_SPLIT: "test",
            },
        )
        self.text_key = "Data"
        self.label_key = "Class"

    def process_example(self, sample: dict) -> Tuple[str, List[str]]:
        """
        Given an sample from the dataset, create the input text and
        list of answers for the instance.
        """

        return sample[self.text_key], [sample[self.label_key].lower()]


class MELTTSentimentAnalysisVSFCScenario(MELTTextClassificationScenario):
    """
    Scenario for the UIT-VSFC dataset.
    The UIT-VSFC dataset is a Vietnamese corpus of over 16,000 student feedback sentences,
    annotated for both sentiment-based (positive, negative, neutral) and topic-based classifications.
    It supports interdisciplinary research at the intersection of sentiment analysis and education,
    with high inter-annotator agreement and strong baseline performance using a Maximum Entropy classifier.
    """

    name = "melt_sentiment_analysis_vsfc"
    description = "UIT-VSFC dataset for analyzing sentiment of student feedback."
    tags = ["sentiment_analysis"]

    def __init__(self):
        super().__init__(
            dataset_name="ura-hcmut/UIT-VSFC",
            revision="c572aed01a811a1dbc68e9aed9f9e684980a10a2",
            splits={
                TRAIN_SPLIT: "train",
                VALID_SPLIT: "validation",
                TEST_SPLIT: "test",
            },
        )
        self.text_key = "text"
        self.label_key = "label"

    def process_example(self, sample: dict) -> Tuple[str, List[str]]:
        """
        Given an sample from the dataset, create the input text and
        list of answers for the instance.
        """

        return sample[self.text_key], [sample[self.label_key].lower()]


class MELTToxicityDetectionViHSDScenario(MELTTextClassificationScenario):
    """
    Scenario for the UIT-ViHSD dataset.
    """

    name = "melt_toxicity_detection_vihsd"
    description = "UIT-ViHSD dataset for toxicity detection."
    tags = ["toxicity_detection"]

    def __init__(self):
        super().__init__(
            dataset_name="ura-hcmut/UIT-ViHSD",
            revision="16c4f67cf509d4f9f36ca5b63c5503c7c8830557",
            splits={
                TRAIN_SPLIT: "train",
                VALID_SPLIT: "validation",
                TEST_SPLIT: "test",
            },
        )
        self.label_mapping = {
            0: "clean",
            1: "offensive",
            2: "hate",
        }
        self.text_key = "free_text"
        self.label_key = "label_id"

    def process_example(self, sample: dict) -> Tuple[str, List[str]]:
        """
        Given an sample from the dataset, create the input text and
        list of answers for the instance.
        """
        label = sample[self.label_key]
        return sample[self.text_key], [self.label_mapping[label]]


class MELTToxicityDetectionViCTSDScenario(MELTTextClassificationScenario):
    """
    Scenario for the UIT-ViCTSD dataset.
    """

    name = "melt_toxicity_detection_victsd"
    description = "UIT-ViCTSD dataset for toxicity detection."
    tags = ["toxicity_detection"]

    def __init__(self):
        super().__init__(
            dataset_name="tarudesu/ViCTSD",
            revision="65a073f2c48401410b264213229a6c52417f367a",
            splits={
                TRAIN_SPLIT: "train",
                VALID_SPLIT: "validation",
                TEST_SPLIT: "test",
            },
        )
        self.label_mapping = {
            0: "clean",
            1: "toxic",
        }
        self.text_key = "Comment"
        self.label_key = "Toxicity"

    def process_example(self, sample: dict) -> Tuple[str, List[str]]:
        """
        Given an sample from the dataset, create the input text and
        list of answers for the instance.
        """
        label = sample[self.label_key]
        return sample[self.text_key], [self.label_mapping[label]]